Runner for Ollama engine

This provides integration with the new Ollama engine (5824541 next ollama runner (#7913)) and the rest of the Ollama infrastructure such as the runner and Ollama server. In addition, it also builds out the KV cache infrastructure to support requirements of how Ollama runs models such as: - Parallel processing - Memory management for defragmentation and shifting - Multi-modal models Both old and new engines continue to be supported. By default, only the old engine is used. To enable the new engine: Start the server with the OLLAMA_NEW_ENGINE environment variable set: OLLAMA_NEW_ENGINE=1 ./ollama serve Start a model that is supported by the Ollama engine. This one is Llama 3.1 8b Q4_K_M: ./ollama run jessegross/llama3.1
2025-12-10 07:46:59 +00:00 · 2024-12-17 19:59:41 -08:00
parent 6945617af5
commit ed443a0393
31 changed files with 2952 additions and 244 deletions
--- a/runner/README.md
+++ b/runner/README.md
@@ -0,0 +1,21 @@
+# `runner`
+
+> Note: this is a work in progress
+
+A minimal runner for loading a model and running inference via an HTTP web server.
+
+```shell
+./runner -model <model binary>
+```
+
+### Completion
+
+```shell
+curl -X POST -H "Content-Type: application/json" -d '{"prompt": "hi"}' http://localhost:8080/completion
+```
+
+### Embeddings
+
+```shell
+curl -X POST -H "Content-Type: application/json" -d '{"prompt": "turn me into an embedding"}' http://localhost:8080/embedding
+```
--- a/runner/common/stop.go
+++ b/runner/common/stop.go
@@ -0,0 +1,96 @@
+package common
+
+import (
+	"strings"
+)
+
+// FindStop reports whether sequence contains any of the given stop strings.
+// On a match it returns true and the first entry of stops (in slice order)
+// that occurs in sequence; otherwise it returns false and "".
+func FindStop(sequence string, stops []string) (bool, string) {
+	for i := 0; i < len(stops); i++ {
+		if candidate := stops[i]; strings.Contains(sequence, candidate) {
+			return true, candidate
+		}
+	}
+
+	return false, ""
+}
+
+// ContainsStopSuffix reports whether sequence ends with a non-empty prefix of
+// any stop string, i.e. whether the tail of sequence could be the beginning of
+// a stop string.
+func ContainsStopSuffix(sequence string, stops []string) bool {
+	for _, stop := range stops {
+		// Try every non-empty prefix of stop, longest first.
+		for end := len(stop); end > 0; end-- {
+			if strings.HasSuffix(sequence, stop[:end]) {
+				return true
+			}
+		}
+	}
+
+	return false
+}
+
+// TruncateStop cuts the concatenation of pieces at the first occurrence of
+// stop and re-splits what is left along the original piece boundaries.
+//
+// It returns the surviving pieces and a flag that is true when the last
+// surviving piece had to be shortened (the stop began inside a piece). If stop
+// does not occur at all, pieces is returned unchanged with false.
+func TruncateStop(pieces []string, stop string) ([]string, bool) {
+	joined := strings.Join(pieces, "")
+
+	cut := strings.Index(joined, stop)
+	if cut == -1 {
+		return pieces, false
+	}
+
+	kept := joined[:cut]
+
+	// Walk the original pieces, carving the kept text into chunks of the same
+	// lengths until it is used up.
+	var out []string
+	truncated := false
+	offset := 0
+	for _, piece := range pieces {
+		if offset >= len(kept) {
+			break
+		}
+
+		next := offset + len(piece)
+		if next > len(kept) {
+			// This piece straddles the cut point, so only part of it survives.
+			next = len(kept)
+			truncated = true
+		}
+		out = append(out, kept[offset:next])
+		offset = next
+	}
+
+	return out, truncated
+}
+
+// IncompleteUnicode reports whether token ends in the middle of a multi-byte
+// UTF-8 sequence. It scans at most the last four bytes backwards, skipping
+// continuation bytes until it reaches a lead byte, and then compares the
+// number of bytes present with the length announced by that lead byte.
+func IncompleteUnicode(token string) bool {
+	for back := 1; back <= 4 && back <= len(token); back++ {
+		b := token[len(token)-back]
+
+		switch {
+		case b&0xc0 == 0x80:
+			// continuation byte (10xxxxxx): keep looking for the lead byte
+			continue
+		case b&0xe0 == 0xc0:
+			// lead byte of a 2-byte character (110xxxxx)
+			return back < 2
+		case b&0xf0 == 0xe0:
+			// lead byte of a 3-byte character (1110xxxx)
+			return back < 3
+		case b&0xf8 == 0xf0:
+			// lead byte of a 4-byte character (11110xxx)
+			return back < 4
+		default:
+			// 1-byte character or invalid byte
+			return false
+		}
+	}
+
+	return false
+}
--- a/runner/common/stop_test.go
+++ b/runner/common/stop_test.go
@@ -0,0 +1,129 @@
+package common
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestTruncateStop(t *testing.T) {
+	tests := []struct {
+		name          string
+		pieces        []string
+		stop          string
+		expected      []string
+		expectedTrunc bool
+	}{
+		{
+			name:          "Single word",
+			pieces:        []string{"hello", "world"},
+			stop:          "world",
+			expected:      []string{"hello"},
+			expectedTrunc: false,
+		},
+		{
+			name:          "Partial",
+			pieces:        []string{"hello", "wor"},
+			stop:          "or",
+			expected:      []string{"hello", "w"},
+			expectedTrunc: true,
+		},
+		{
+			name:          "Suffix",
+			pieces:        []string{"Hello", " there", "!"},
+			stop:          "!",
+			expected:      []string{"Hello", " there"},
+			expectedTrunc: false,
+		},
+		{
+			name:          "Suffix partial",
+			pieces:        []string{"Hello", " the", "re!"},
+			stop:          "there!",
+			expected:      []string{"Hello", " "},
+			expectedTrunc: true,
+		},
+		{
+			name:          "Middle",
+			pieces:        []string{"hello", " wor"},
+			stop:          "llo w",
+			expected:      []string{"he"},
+			expectedTrunc: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, resultTrunc := TruncateStop(tt.pieces, tt.stop)
+			if !reflect.DeepEqual(result, tt.expected) || resultTrunc != tt.expectedTrunc {
+				t.Errorf("truncateStop(%v, %s): have %v (%v); want %v (%v)", tt.pieces, tt.stop, result, resultTrunc, tt.expected, tt.expectedTrunc)
+			}
+		})
+	}
+}
+
+func TestIncompleteUnicode(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		expected bool
+	}{
+		{
+			name:     "Basic",
+			input:    "hi",
+			expected: false,
+		},
+		{
+			name:     "Two byte",
+			input:    "hi" + string([]byte{0xc2, 0xa3}),
+			expected: false,
+		},
+		{
+			name:     "Two byte - missing last",
+			input:    "hi" + string([]byte{0xc2}),
+			expected: true,
+		},
+		{
+			name:     "Three byte",
+			input:    "hi" + string([]byte{0xe0, 0xA0, 0x80}),
+			expected: false,
+		},
+		{
+			name:     "Three byte - missing last",
+			input:    "hi" + string([]byte{0xe0, 0xA0}),
+			expected: true,
+		},
+		{
+			name:     "Three byte - missing last 2",
+			input:    "hi" + string([]byte{0xe0}),
+			expected: true,
+		},
+		{
+			name:     "Four byte",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a, 0xb7}),
+			expected: false,
+		},
+		{
+			name:     "Four byte - missing last",
+			input:    "hi" + string([]byte{0xf0, 0x92, 0x8a}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 2",
+			input:    "hi" + string([]byte{0xf0, 0x92}),
+			expected: true,
+		},
+		{
+			name:     "Four byte - missing last 3",
+			input:    "hi" + string([]byte{0xf0}),
+			expected: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := IncompleteUnicode(tt.input)
+			if result != tt.expected {
+				t.Errorf("incompleteUnicode(%s): have %v; want %v", tt.input, result, tt.expected)
+			}
+		})
+	}
+}
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -0,0 +1,246 @@
+package llamarunner
+
+import (
+	"errors"
+	"fmt"
+	"log/slog"
+	"reflect"
+	"time"
+
+	"github.com/ollama/ollama/llama"
+)
+
+// InputCache holds the set of KV cache slots together with the settings used
+// to choose a slot for an incoming prompt.
+type InputCache struct {
+	// context window size (per slot)
+	numCtx int
+
+	// individual KV caches
+	slots []InputCacheSlot
+
+	// optimize cache eviction for multiple users
+	multiUserCache bool
+
+	// llama context whose KV cache the slots refer to; nil in unit tests
+	lc *llama.Context
+}
+
+// NewInputCache creates an InputCache with numSlots empty slots, dividing the
+// kvSize KV cache entries evenly between them.
+//
+// It returns an error when numSlots is not positive or when the division would
+// leave a slot with no KV cache entries.
+func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache bool) (*InputCache, error) {
+	// Check numSlots first: dividing by zero below would panic, and a negative
+	// count would panic in make.
+	if numSlots < 1 || kvSize/numSlots < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
+	slots := make([]InputCacheSlot, numSlots)
+
+	for i := range slots {
+		slots[i] = InputCacheSlot{
+			Id:     i,
+			Inputs: make([]input, 0),
+		}
+	}
+
+	return &InputCache{
+		numCtx:         kvSize / numSlots,
+		slots:          slots,
+		multiUserCache: multiUserCache,
+		lc:             lc,
+	}, nil
+}
+
+// Locking: Operations on InputCacheSlot (including finding one
+// through LoadCacheSlot) require a lock to be held that serializes
+// these operations with each other and llama.Decode
+
+// InputCacheSlot describes one sequence's portion of the KV cache and the
+// inputs currently stored in it.
+type InputCacheSlot struct {
+	// Index in the KV cache
+	Id int
+
+	// Inputs that are stored in the KV cache
+	Inputs []input
+
+	// is this cache actively being processed as part of a sequence?
+	InUse bool
+
+	// last time this cache was used (as of start of processing)
+	lastUsed time.Time
+}
+
+// LoadCacheSlot picks a free cache slot for prompt, marks it in use and trims
+// both the KV cache and slot.Inputs down to the part that can be reused.
+//
+// It returns the chosen slot and the remainder of prompt that still has to be
+// processed (always at least one input, so there is something to sample from).
+// When cachePrompt is false nothing from the slot is reused. An error is
+// returned for an empty prompt or when no slot is available.
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
+	// An empty prompt would drive numPast to -1 below and panic when slicing.
+	if len(prompt) == 0 {
+		return nil, nil, errors.New("no input provided")
+	}
+
+	var slot *InputCacheSlot
+	var numPast int
+	var err error
+
+	// In single-user scenarios, the longest cache slot works fine for getting good input
+	// cache hit rates and it reuses the same VRAM over and over again, which is good for
+	// GPU performance in situations where we miss the input cache.
+	// For multiple users, the "best" cache slot produces better input cache hit rates
+	// at the cost of worse performance when we miss the input cache (because it causes
+	// GPU L2 cache misses due to spreading out accesses across VRAM).
+	if !c.multiUserCache {
+		slot, numPast, err = c.findLongestCacheSlot(prompt)
+	} else {
+		slot, numPast, err = c.findBestCacheSlot(prompt)
+	}
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if !cachePrompt {
+		numPast = 0
+	}
+
+	slot.InUse = true
+	slot.lastUsed = time.Now()
+
+	if numPast == len(prompt) {
+		// Leave one input to sample so we can get a response
+		numPast--
+	}
+
+	if !c.lc.KvCacheSeqRm(slot.Id, numPast, -1) {
+		// Some models don't support partial erasure
+		c.lc.KvCacheSeqRm(slot.Id, 0, -1)
+		numPast = 0
+	}
+
+	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
+		"used", numPast, "remaining", len(prompt)-numPast)
+
+	prompt = prompt[numPast:]
+	slot.Inputs = slot.Inputs[:numPast]
+
+	return slot, prompt, nil
+}
+
+// findLongestCacheSlot returns the slot that is not in use and shares the
+// longest common prefix with prompt, along with the length of that prefix.
+// Ties go to the slot with the lowest index. It fails when every slot is in
+// use.
+func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
+	var best *InputCacheSlot
+	bestLen := -1
+
+	for i := range c.slots {
+		candidate := &c.slots[i]
+		if candidate.InUse {
+			continue
+		}
+
+		if n := countCommonPrefix(candidate.Inputs, prompt); n > bestLen {
+			best, bestLen = candidate, n
+		}
+	}
+
+	if best == nil {
+		return nil, 0, errors.New("no available cache slots")
+	}
+
+	return best, bestLen, nil
+}
+
+// findBestCacheSlot chooses a slot for prompt in the multi-user case.
+//
+// If the slot with the longest common prefix is free and the prompt extends
+// its whole contents, that slot is returned as is. Otherwise the least
+// recently used free slot is taken, and when another slot shares a prefix with
+// prompt, that prefix is copied ("forked") into it. The returned int is the
+// length of the common prefix. It fails when no slot is free.
+func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int, error) {
+	oldest := time.Now()
+	var oldestSlot *InputCacheSlot
+
+	longest := -1
+	var longestSlot *InputCacheSlot
+
+	for i, s := range c.slots {
+		count := countCommonPrefix(s.Inputs, prompt)
+		if count > longest {
+			longest = count
+			longestSlot = &c.slots[i]
+		}
+
+		if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
+			oldest = s.lastUsed
+			oldestSlot = &c.slots[i]
+		}
+	}
+
+	// longestSlot is nil only when there are no slots at all
+	if longestSlot != nil && longest == len(longestSlot.Inputs) && !longestSlot.InUse {
+		return longestSlot, longest, nil
+	}
+
+	// oldestSlot is only ever set to a free slot, so nil means that every
+	// slot is in use. Checking for nil avoids dereferencing a nil pointer.
+	if oldestSlot == nil {
+		return nil, 0, errors.New("no available cache slots")
+	}
+
+	if len(oldestSlot.Inputs) != 0 {
+		slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
+			"used", oldestSlot.lastUsed)
+	}
+
+	if longest > 0 && longestSlot != oldestSlot {
+		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
+			len(longestSlot.Inputs))
+		oldestSlot.Inputs = make([]input, longest)
+		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
+		// This is only nil for unit tests
+		if c.lc != nil {
+			c.lc.KvCacheSeqRm(oldestSlot.Id, 0, -1)
+			c.lc.KvCacheSeqCp(longestSlot.Id, oldestSlot.Id, 0, longest)
+		}
+	}
+
+	return oldestSlot, longest, nil
+}
+
+func countCommonPrefix(a []input, b []input) int {
+	var count int
+
+	for i := range a {
+		if i >= len(b) {
+			break
+		}
+
+		if !reflect.DeepEqual(a[i], b[i]) {
+			break
+		}
+
+		count++
+	}
+
+	return count
+}
+
+// ShiftDiscard computes how many inputs must be dropped from a sequence of
+// inputLen inputs so that half of the context beyond numKeep (but at least one
+// entry) is free. The result is never negative.
+func (c *InputCache) ShiftDiscard(inputLen int, numKeep int) int {
+	wantFree := max((c.numCtx-numKeep)/2, 1)
+	haveFree := c.numCtx - inputLen
+
+	return max(wantFree-haveFree, 0)
+}
+
+// ShiftCacheSlot frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+//
+// It returns an error when numKeep is not smaller than the context size or
+// when the KV cache entries cannot be removed. If ShiftDiscard reports that
+// nothing needs to be discarded, the slot is left untouched.
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int) error {
+	if numKeep >= c.numCtx {
+		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
+	}
+
+	discard := c.ShiftDiscard(len(slot.Inputs), numKeep)
+
+	if discard <= 0 {
+		return nil
+	}
+
+	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
+	// TODO (jessegross): KV cache removal can fail for certain types of models
+	if !c.lc.KvCacheSeqRm(slot.Id, numKeep, numKeep+discard) {
+		return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v)", slot.Id, numKeep, discard)
+	}
+	// Move the surviving entries after the removed range back by discard positions
+	c.lc.KvCacheSeqAdd(slot.Id, numKeep+discard, len(slot.Inputs), -discard)
+
+	// Mirror the same shift in the slot's record of stored inputs
+	for i := numKeep + discard; i < len(slot.Inputs); i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
+	}
+	slot.Inputs = slot.Inputs[:len(slot.Inputs)-discard]
+
+	return nil
+}
--- a/runner/llamarunner/cache_test.go
+++ b/runner/llamarunner/cache_test.go
@@ -0,0 +1,292 @@
+package llamarunner
+
+import (
+	"testing"
+	"time"
+)
+
+func TestCountCommon(t *testing.T) {
+	tests := []struct {
+		name     string
+		t1       []input
+		t2       []input
+		expected int
+	}{
+		{
+			name:     "Equal",
+			t1:       []input{{token: 1}, {token: 2}, {token: 3}},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 3,
+		},
+		{
+			name:     "Prefix",
+			t1:       []input{{token: 1}},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 1,
+		},
+		{
+			name:     "Embeddings Prefix",
+			t1:       []input{{embed: []float32{0.1, 0.2, 0.3}}},
+			t2:       []input{{embed: []float32{0.1, 0.2, 0.3}}, {embed: []float32{0.4, 0.5, 0.6}}, {embed: []float32{0.7}}},
+			expected: 1,
+		},
+		{
+			name:     "Embeddings Prefix Partial",
+			t1:       []input{{embed: []float32{0.1, 0.2, 0.3}}},
+			t2:       []input{{embed: []float32{0.1, 0.2}}, {embed: []float32{0.4, 0.5, 0.6}}, {embed: []float32{0.7}}},
+			expected: 0,
+		},
+		{
+			name:     "Mixed",
+			t1:       []input{{token: 1}, {embed: []float32{0.2, 0.3, 0.4}}},
+			t2:       []input{{token: 1}, {embed: []float32{0.2, 0.3, 0.4}}, {token: 5}},
+			expected: 2,
+		},
+		{
+			name:     "Empty",
+			t1:       []input{},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 0,
+		},
+		{
+			name:     "Both Empty",
+			t1:       []input{},
+			t2:       []input{},
+			expected: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := countCommonPrefix(tt.t1, tt.t2)
+			if result != tt.expected {
+				t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tt.t1, tt.t2, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestFindCacheSlot(t *testing.T) {
+	type expected struct {
+		result int
+		len    int
+	}
+
+	tests := []struct {
+		name    string
+		cache   InputCache
+		prompt  []input
+		longest expected
+		best    expected
+	}{
+		{
+			name: "Empty",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+				{
+					Id:       1,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+			}},
+			prompt:  []input{{token: 1}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 0, len: 0},
+		},
+		{
+			name: "Extend",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 1}, {token: 2}},
+			longest: expected{result: 1, len: 2},
+			best:    expected{result: 1, len: 2},
+		},
+		{
+			name: "New",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+			}},
+			prompt:  []input{{token: 2}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 1, len: 0},
+		},
+		{
+			name: "Fork",
+			cache: InputCache{
+				slots: []InputCacheSlot{
+					{
+						Id:       0,
+						Inputs:   []input{{token: 1}, {token: 2}},
+						InUse:    false,
+						lastUsed: time.Now().Add(-time.Second),
+					},
+					{
+						Id:       1,
+						Inputs:   []input{},
+						InUse:    false,
+						lastUsed: time.Time{},
+					},
+				},
+			},
+			prompt:  []input{{token: 1}},
+			longest: expected{result: 0, len: 1},
+			best:    expected{result: 1, len: 1},
+		},
+		{
+			name: "Evict",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 2}, {token: 3}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 1, len: 0},
+		},
+		{
+			name: "In use",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    true,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 1}, {token: 2}},
+			longest: expected{result: 1, len: 1},
+			best:    expected{result: 1, len: 2},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run("Longest-"+tt.name, func(t *testing.T) {
+			result, resultLen, err := tt.cache.findLongestCacheSlot(tt.prompt)
+			if err != nil {
+				t.Errorf("findLongestCacheSlot: err %v", err)
+			} else if result.Id != tt.longest.result || resultLen != tt.longest.len {
+				t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
+					result.Id, tt.longest.result, resultLen, tt.longest.len)
+			}
+		})
+	}
+
+	for _, tt := range tests {
+		t.Run("Best-"+tt.name, func(t *testing.T) {
+			result, resultLen, err := tt.cache.findBestCacheSlot(tt.prompt)
+			if err != nil {
+				t.Errorf("findBestCacheSlot: err %v", err)
+			} else if result.Id != tt.best.result || resultLen != tt.best.len {
+				t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
+					result.Id, tt.best.result, resultLen, tt.best.len)
+			}
+		})
+	}
+}
+
+func TestShiftDiscard(t *testing.T) {
+	tests := []struct {
+		name     string
+		numCtx   int
+		numKeep  int
+		inputLen int
+		expected int
+	}{
+		{
+			name:     "Shift",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 2048,
+			expected: 1021,
+		},
+		{
+			name:     "Max Keep",
+			numCtx:   2048,
+			numKeep:  2047,
+			inputLen: 2048,
+			expected: 1,
+		},
+		{
+			name:     "No Keep",
+			numCtx:   2048,
+			numKeep:  0,
+			inputLen: 2048,
+			expected: 1024,
+		},
+		{
+			name:     "Truncate",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 5000,
+			expected: 3973,
+		},
+		{
+			name:     "Truncate Keep",
+			numCtx:   2048,
+			numKeep:  2047,
+			inputLen: 5000,
+			expected: 2953,
+		},
+		{
+			name:     "No Op",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 512,
+			expected: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			c := InputCache{numCtx: tt.numCtx}
+			result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
+			if result != tt.expected {
+				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
+			}
+		})
+	}
+}
--- a/runner/llamarunner/image.go
+++ b/runner/llamarunner/image.go
@@ -0,0 +1,183 @@
+package llamarunner
+
+import (
+	"errors"
+	"fmt"
+	"hash/maphash"
+	"log/slog"
+	"slices"
+	"sync"
+	"time"
+
+	"github.com/ollama/ollama/llama"
+)
+
+// imageCacheSize is the number of entries allocated for the image embedding cache.
+const imageCacheSize = 4
+
+// ImageContext bundles the vision model context (clip or mllama) with a small
+// cache that maps image hashes to previously computed embeddings.
+type ImageContext struct {
+	// mu is required to be held when generating embeddings or accessing the cache
+	mu sync.Mutex
+
+	// at most one of these is set by NewImageContext, depending on the model architecture
+	clip   *llama.ClipContext
+	mllama *llama.MllamaContext
+
+	// cache of images to embeddings
+	images    []imageCache
+	imageHash maphash.Hash
+}
+
+func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageContext, error) {
+	arch, err := llama.GetModelArch(modelPath)
+	if err != nil {
+		return nil, fmt.Errorf("unable to determine vision architecture: %w (%s)", err, modelPath)
+	}
+
+	var c ImageContext
+	if arch == "clip" {
+		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
+	} else if arch == "mllama" {
+		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
+	} else {
+		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
+	}
+
+	if err != nil {
+		return nil, err
+	}
+
+	c.images = make([]imageCache, imageCacheSize)
+
+	return &c, nil
+}
+
+// Free releases whichever vision model contexts are loaded. It is a no-op on a
+// nil receiver. The modelPath argument is not used.
+func (c *ImageContext) Free(modelPath string) {
+	if c != nil {
+		if c.clip != nil {
+			c.clip.Free()
+		}
+		if c.mllama != nil {
+			c.mllama.Free()
+		}
+	}
+}
+
+// NewEmbed returns the embeddings for the image in data, computing them with
+// the loaded vision model on a cache miss and storing the result in the cache.
+//
+// A nil receiver (no vision support) yields nil, nil. An error is returned for
+// empty image data, when no vision model is loaded, or when the model fails to
+// produce embeddings. aspectRatioId is only passed to the mllama model.
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+	if c == nil {
+		return nil, nil
+	}
+
+	if len(data) == 0 {
+		return nil, errors.New("received zero length image")
+	}
+
+	// Take the lock before hashing: hashImage resets and writes the shared
+	// imageHash, so hashing outside the lock races between concurrent calls.
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	hash := c.hashImage(data)
+
+	embed, err := c.findImage(hash)
+	if err == nil {
+		return embed, nil
+	}
+
+	switch {
+	case c.mllama != nil:
+		embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
+	case c.clip != nil:
+		embed, err = c.clip.NewEmbed(llamaContext, data)
+	default:
+		return nil, errors.New("received image but vision model not loaded")
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	c.addImage(hash, embed)
+
+	return embed, nil
+}
+
+// BatchSize returns the embedding batch size to allocate: 0 without image
+// support, 1 for mllama, and configuredBatchSize otherwise.
+func (c *ImageContext) BatchSize(configuredBatchSize int) int {
+	switch {
+	case c == nil:
+		// If images are not supported, we don't need to allocate embedding batches
+		return 0
+	case c.mllama != nil:
+		// Mllama maps an image to 1 embedding token (llava creates many tokens)
+		// and doesn't support more than a single image per request.
+		// The embeddings are large (100 MB), so allocating a big batch can fail
+		// on some systems
+		return 1
+	default:
+		return configuredBatchSize
+	}
+}
+
+// EmbedSize returns the embedding size reported by the mllama context when one
+// is loaded, and the model's NEmbd value otherwise (including for a nil
+// receiver).
+func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
+	if c == nil || c.mllama == nil {
+		return llamaContext.Model().NEmbd()
+	}
+
+	return c.mllama.EmbedSize(llamaContext)
+}
+
+// NeedCrossAttention reports whether an mllama model is loaded and at least
+// one of inputs carries an embedding.
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	for i := range inputs {
+		if inputs[i].embed != nil {
+			return true
+		}
+	}
+
+	return false
+}
+
+// imageCache is one entry of the image embedding cache.
+type imageCache struct {
+	// hash of the image the entry belongs to
+	key uint64
+	// embeddings stored for that image
+	val [][]float32
+	// time the entry was last stored or returned by a lookup
+	lastUsed time.Time
+}
+
+// hashImage returns the hash of image as computed by the context's shared
+// imageHash. The shared hash state is reset and rewritten on every call, so
+// calls must not run concurrently.
+func (c *ImageContext) hashImage(image []byte) uint64 {
+	h := &c.imageHash
+	h.Reset()
+	_, _ = h.Write(image)
+
+	return h.Sum64()
+}
+
+// errImageNotFound is returned by findImage when no cache entry matches the hash.
+var errImageNotFound = errors.New("image not found in cache")
+
+// findImage looks up the cache entry whose key equals hash. On a hit it
+// refreshes the entry's lastUsed time and returns the stored embeddings; on a
+// miss it returns errImageNotFound.
+func (c *ImageContext) findImage(hash uint64) ([][]float32, error) {
+	for i := range c.images {
+		entry := &c.images[i]
+		if entry.key != hash {
+			continue
+		}
+
+		slog.Debug("loading image embeddings from cache", "entry", i)
+		entry.lastUsed = time.Now()
+		return entry.val, nil
+	}
+
+	return nil, errImageNotFound
+}
+
+func (c *ImageContext) addImage(hash uint64, embed [][]float32) {
+	best := time.Now()
+	var bestImage int
+
+	for i := range c.images {
+		if c.images[i].key == hash {
+			bestImage = i
+			break
+		}
+
+		if c.images[i].lastUsed.Compare(best) < 0 {
+			best = c.images[i].lastUsed
+			bestImage = i
+		}
+	}
+
+	slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
+	c.images[bestImage].key = hash
+	c.images[bestImage].val = embed
+	c.images[bestImage].lastUsed = time.Now()
+}
--- a/runner/llamarunner/image_test.go
+++ b/runner/llamarunner/image_test.go
@@ -0,0 +1,80 @@
+package llamarunner
+
+import (
+	"reflect"
+	"testing"
+)
+
+// TestImageCache exercises the image embedding cache: lookups in an empty
+// cache, insertion, replacement of an existing key and eviction of the least
+// recently used entry once all four entries are taken.
+func TestImageCache(t *testing.T) {
+	cache := ImageContext{images: make([]imageCache, 4)}
+
+	valA := [][]float32{{0.1, 0.2}, {0.3}}
+	valB := [][]float32{{0.4}, {0.5}, {0.6}}
+	valC := [][]float32{{0.7}}
+	valD := [][]float32{{0.8}}
+	valE := [][]float32{{0.9}}
+
+	// expectHit checks that hash is cached with exactly the value want.
+	expectHit := func(hash uint64, want [][]float32) {
+		t.Helper()
+		result, err := cache.findImage(hash)
+		if err != nil || !reflect.DeepEqual(result, want) {
+			t.Errorf("failed to find expected value: result %v, err %v", result, err)
+		}
+	}
+
+	// expectMiss checks that hash is not present in the cache.
+	expectMiss := func(hash uint64, context string) {
+		t.Helper()
+		result, err := cache.findImage(hash)
+		if err != errImageNotFound {
+			t.Errorf("found result in %s: result %v, err %v", context, result, err)
+		}
+	}
+
+	// Empty cache
+	expectMiss(0x5adb61d31933a946, "empty cache")
+
+	// Insert A
+	cache.addImage(0x5adb61d31933a946, valA)
+
+	expectHit(0x5adb61d31933a946, valA)
+
+	// Insert B
+	cache.addImage(0x011551369a34a901, valB)
+
+	expectHit(0x5adb61d31933a946, valA)
+	expectHit(0x011551369a34a901, valB)
+
+	// Replace B with C
+	cache.addImage(0x011551369a34a901, valC)
+
+	expectHit(0x5adb61d31933a946, valA)
+	expectHit(0x011551369a34a901, valC)
+
+	// Evict A
+	cache.addImage(0x756b218a517e7353, valB)
+	cache.addImage(0x75e5e8d35d7e3967, valD)
+	cache.addImage(0xd96f7f268ca0646e, valE)
+
+	// The lookups below refresh lastUsed, so their order is kept as is.
+	expectMiss(0x5adb61d31933a946, "cache after eviction")
+	expectHit(0x756b218a517e7353, valB)
+	expectHit(0x011551369a34a901, valC)
+	expectHit(0x75e5e8d35d7e3967, valD)
+	expectHit(0xd96f7f268ca0646e, valE)
+}
--- a/runner/llamarunner/runner.go
+++ b/runner/llamarunner/runner.go
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -0,0 +1,280 @@
+package ollamarunner
+
+import (
+	"errors"
+	"fmt"
+	"log/slog"
+	"math"
+	"reflect"
+	"time"
+
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+)
+
+// InputCache holds the set of KV cache slots, the underlying kvcache.Cache
+// (if any) and the settings used to choose a slot for an incoming prompt.
+type InputCache struct {
+	// context window size (per slot)
+	numCtx int32
+
+	// does the cache store data or do we need to always send the full input?
+	// note that when enabled is false the underlying cache may either be nil
+	// or a non-nil dummy that doesn't actually store anything
+	enabled bool
+
+	// individual KV caches
+	slots []InputCacheSlot
+
+	// optimize cache eviction for multiple users
+	multiUserCache bool
+
+	// underlying KV cache taken from the model config; may be nil
+	cache kvcache.Cache
+}
+
+// NewInputCache creates an InputCache with numSlots empty slots, dividing the
+// kvSize KV cache entries evenly between them. If the model's config provides
+// a cache, it is initialized with the model backend, the requested KV cache
+// type and kvSize.
+//
+// It returns an error when numSlots is not positive or when the division would
+// leave a slot with no KV cache entries.
+func NewInputCache(model model.Model, kvCacheType string, kvSize int32, numSlots int, multiUserCache bool) (*InputCache, error) {
+	// Check numSlots first: dividing by zero below would panic, and a negative
+	// count would panic in make.
+	if numSlots < 1 || kvSize/int32(numSlots) < 1 {
+		return nil, fmt.Errorf("must have at least one kv cache entry per parallel sequence (kv: %v parallel: %v)", kvSize, numSlots)
+	}
+
+	slots := make([]InputCacheSlot, numSlots)
+
+	for i := range slots {
+		slots[i] = InputCacheSlot{
+			Id:     i,
+			Inputs: make([]input, 0),
+		}
+	}
+
+	cache := model.Config().Cache
+	if cache != nil {
+		cache.Init(model.Backend(), kvCacheTypeFromStr(kvCacheType), kvSize)
+	}
+
+	return &InputCache{
+		numCtx:         kvSize / int32(numSlots),
+		enabled:        cache != nil,
+		slots:          slots,
+		multiUserCache: multiUserCache,
+		cache:          cache,
+	}, nil
+}
+
+// kvCacheTypeFromStr maps a KV cache type name to its ml.DType. The quantized
+// types "q8_0" and "q4_0" are not implemented and panic; every other value
+// yields ml.DTypeF16.
+func kvCacheTypeFromStr(s string) ml.DType {
+	if s == "q8_0" || s == "q4_0" {
+		panic("kv cache quantization not yet implemented")
+	}
+
+	return ml.DTypeF16
+}
+
+// Close releases the underlying KV cache. It is a no-op on a nil receiver or
+// when the model provided no cache, a state that NewInputCache allows.
+func (c *InputCache) Close() {
+	if c == nil || c.cache == nil {
+		return
+	}
+
+	c.cache.Close()
+}
+
+// Locking: Operations on InputCacheSlot (including finding one
+// through LoadCacheSlot) require a lock to be held that serializes
+// these operations with each other and processBatch
+
+// InputCacheSlot describes one sequence's portion of the KV cache and the
+// inputs currently stored in it.
+type InputCacheSlot struct {
+	// Index in the KV cache
+	Id int
+
+	// Inputs that are stored in the KV cache
+	Inputs []input
+
+	// is this cache actively being processed as part of a sequence?
+	InUse bool
+
+	// last time this cache was used (as of start of processing)
+	lastUsed time.Time
+}
+
+// LoadCacheSlot picks a free cache slot for prompt, trims the KV cache and
+// slot.Inputs down to the part that can be reused and marks the slot in use.
+//
+// It returns the chosen slot and the remainder of prompt that still has to be
+// processed (always at least one input, so there is something to sample from).
+// When cachePrompt is false nothing from the slot is reused. An error is
+// returned for an empty prompt, when no slot is available, or when the cache
+// entries cannot be removed; in the error case the slot is not marked in use.
+func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCacheSlot, []input, error) {
+	// An empty prompt would drive numPast to -1 below and panic when slicing.
+	if len(prompt) == 0 {
+		return nil, nil, errors.New("no input provided")
+	}
+
+	var slot *InputCacheSlot
+	var numPast int32
+	var err error
+
+	// In single-user scenarios, the longest cache slot works fine for getting good input
+	// cache hit rates and it keeps the footprint of the cache small, which improves throughput.
+	// For multiple users, the "best" cache slot produces better input cache hit rates
+	// at the cost of worse performance when we miss the input cache.
+	if !c.multiUserCache {
+		slot, numPast, err = c.findLongestCacheSlot(prompt)
+	} else {
+		slot, numPast, err = c.findBestCacheSlot(prompt)
+	}
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if !cachePrompt {
+		numPast = 0
+	}
+
+	if numPast == int32(len(prompt)) {
+		// Leave one input to sample so we can get a response
+		numPast--
+	}
+
+	if c.cache != nil {
+		err = c.cache.Remove(slot.Id, numPast, math.MaxInt32)
+		if err != nil {
+			// Some models don't support partial erasure
+			err = c.cache.Remove(slot.Id, 0, math.MaxInt32)
+			if err != nil {
+				return nil, nil, err
+			}
+			numPast = 0
+		}
+	}
+
+	// Claim the slot only after the cache removal succeeded. Marking it
+	// earlier would leave it stuck in use when the error path above is taken.
+	slot.InUse = true
+	slot.lastUsed = time.Now()
+
+	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
+		"used", numPast, "remaining", int32(len(prompt))-numPast)
+
+	prompt = prompt[numPast:]
+	slot.Inputs = slot.Inputs[:numPast]
+
+	return slot, prompt, nil
+}
+
+// findLongestCacheSlot returns the slot that is not in use and shares the
+// longest common prefix with prompt, along with the length of that prefix.
+// Ties go to the slot with the lowest index. It fails when every slot is in
+// use.
+func (c *InputCache) findLongestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
+	var best *InputCacheSlot
+	bestLen := int32(-1)
+
+	for i := range c.slots {
+		candidate := &c.slots[i]
+		if candidate.InUse {
+			continue
+		}
+
+		if n := countCommonPrefix(candidate.Inputs, prompt); n > bestLen {
+			best, bestLen = candidate, n
+		}
+	}
+
+	if best == nil {
+		return nil, 0, errors.New("no available cache slots")
+	}
+
+	return best, bestLen, nil
+}
+
+// findBestCacheSlot chooses a slot for prompt in the multi-user case.
+//
+// If the slot with the longest common prefix is free and the prompt extends
+// its whole contents, that slot is returned as is. Otherwise the least
+// recently used free slot is taken, and when another slot shares a prefix with
+// prompt, that prefix is copied ("forked") into it. The returned int32 is the
+// length of the common prefix. It fails when no slot is free.
+func (c *InputCache) findBestCacheSlot(prompt []input) (*InputCacheSlot, int32, error) {
+	oldest := time.Now()
+	var oldestSlot *InputCacheSlot
+
+	longest := int32(-1)
+	var longestSlot *InputCacheSlot
+
+	for i, s := range c.slots {
+		count := countCommonPrefix(s.Inputs, prompt)
+		if count > longest {
+			longest = count
+			longestSlot = &c.slots[i]
+		}
+
+		if s.lastUsed.Compare(oldest) < 0 && !s.InUse {
+			oldest = s.lastUsed
+			oldestSlot = &c.slots[i]
+		}
+	}
+
+	// longestSlot is nil only when there are no slots at all
+	if longestSlot != nil && longest == int32(len(longestSlot.Inputs)) && !longestSlot.InUse {
+		return longestSlot, longest, nil
+	}
+
+	// oldestSlot is only ever set to a free slot, so nil means that every
+	// slot is in use. Checking for nil avoids dereferencing a nil pointer.
+	if oldestSlot == nil {
+		return nil, 0, errors.New("no available cache slots")
+	}
+
+	if len(oldestSlot.Inputs) != 0 {
+		slog.Debug("evicting cache slot", "id", oldestSlot.Id, "inputs", len(oldestSlot.Inputs),
+			"used", oldestSlot.lastUsed)
+	}
+
+	if longest > 0 && longestSlot != oldestSlot {
+		slog.Debug("forking cache slot", "src", longestSlot.Id, "dst", oldestSlot.Id, "inputs", longest, "total",
+			len(longestSlot.Inputs))
+		oldestSlot.Inputs = make([]input, longest)
+		copy(oldestSlot.Inputs, longestSlot.Inputs[:longest])
+		if c.cache != nil {
+			c.cache.CopyPrefix(longestSlot.Id, oldestSlot.Id, longest)
+		}
+	}
+
+	return oldestSlot, longest, nil
+}
+
+// countCommonPrefix returns how many leading elements of a and b are deeply
+// equal, stopping at the first mismatch or at the end of the shorter slice.
+func countCommonPrefix(a []input, b []input) int32 {
+	var n int32
+	for int(n) < len(a) && int(n) < len(b) && reflect.DeepEqual(a[n], b[n]) {
+		n++
+	}
+
+	return n
+}
+
+// ShiftDiscard computes how many inputs must be dropped from a sequence of
+// inputLen inputs so that half of the context beyond numKeep (but at least one
+// entry) is free. The result is never negative.
+func (c *InputCache) ShiftDiscard(inputLen int32, numKeep int32) int32 {
+	wantFree := max((c.numCtx-numKeep)/2, 1)
+	haveFree := c.numCtx - inputLen
+
+	return max(wantFree-haveFree, 0)
+}
+
+// ShiftCacheSlot frees up space in the KV cache by deleting the oldest half of history and shifting
+// the newest half into that space (saving numKeep inputs at the beginning).
+//
+// Assumes that at least 1 entry can be freed up by shifting (i.e. numKeep < numCtx)
+//
+// It returns an error when numKeep is not smaller than the context size or
+// when the cache fails to remove the discarded range. If ShiftDiscard reports
+// that nothing needs to be discarded, the slot is left untouched.
+func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int32) error {
+	if numKeep >= c.numCtx {
+		return fmt.Errorf("unable to shift context - keep exceeds context (keep: %v context: %v)", numKeep, c.numCtx)
+	}
+
+	inputLen := int32(len(slot.Inputs))
+	discard := c.ShiftDiscard(inputLen, numKeep)
+
+	if discard <= 0 {
+		return nil
+	}
+
+	slog.Debug("context limit hit - shifting", "id", slot.Id, "limit", c.numCtx, "input", len(slot.Inputs),
+		"keep", numKeep, "discard", discard)
+
+	// TODO (jessegross): KV cache removal can fail for certain types of models
+	if c.cache != nil {
+		err := c.cache.Remove(slot.Id, numKeep, numKeep+discard)
+		if err != nil {
+			return fmt.Errorf("unable to remove old kv cache entries (id: %v, keep: %v discard: %v): %w", slot.Id, numKeep, discard, err)
+		}
+	}
+
+	// Mirror the removal in the slot's record of stored inputs by moving the
+	// surviving tail down over the discarded range
+	for i := numKeep + discard; i < inputLen; i++ {
+		slot.Inputs[i-discard] = slot.Inputs[i]
+	}
+	slot.Inputs = slot.Inputs[:inputLen-discard]
+
+	return nil
+}
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -0,0 +1,291 @@
+package ollamarunner
+
+import (
+	"image"
+	"testing"
+	"time"
+)
+
+func TestCountCommon(t *testing.T) {
+	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
+	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
+	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
+
+	tests := []struct {
+		name     string
+		t1       []input
+		t2       []input
+		expected int32
+	}{
+		{
+			name:     "Equal",
+			t1:       []input{{token: 1}, {token: 2}, {token: 3}},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 3,
+		},
+		{
+			name:     "Prefix",
+			t1:       []input{{token: 1}},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 1,
+		},
+		{
+			name:     "Image Prefix",
+			t1:       []input{{image: imgA}},
+			t2:       []input{{image: imgA}, {image: imgB}, {image: imgC}},
+			expected: 1,
+		},
+		{
+			name:     "Mixed",
+			t1:       []input{{token: 1}, {image: imgA}},
+			t2:       []input{{token: 1}, {image: imgA}, {token: 5}},
+			expected: 2,
+		},
+		{
+			name:     "Empty",
+			t1:       []input{},
+			t2:       []input{{token: 1}, {token: 2}, {token: 3}},
+			expected: 0,
+		},
+		{
+			name:     "Both Empty",
+			t1:       []input{},
+			t2:       []input{},
+			expected: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := countCommonPrefix(tt.t1, tt.t2)
+			if result != tt.expected {
+				t.Errorf("countCommonPrefix(%v, %v): have %v; want %v", tt.t1, tt.t2, result, tt.expected)
+			}
+		})
+	}
+}
+
+// TestFindCacheSlot runs findLongestCacheSlot and findBestCacheSlot over the
+// same table of cache layouts and checks the slot id and prefix length each
+// one reports.
+//
+// All "Longest-" subtests run before any "Best-" subtest, and both passes use
+// the same fixture slices.
+func TestFindCacheSlot(t *testing.T) {
+	// expected is the slot id and matched prefix length a lookup should report
+	type expected struct {
+		result int
+		len    int32
+	}
+
+	tests := []struct {
+		name    string
+		cache   InputCache
+		prompt  []input
+		longest expected
+		best    expected
+	}{
+		{
+			// no slot holds any inputs
+			name: "Empty",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+				{
+					Id:       1,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+			}},
+			prompt:  []input{{token: 1}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 0, len: 0},
+		},
+		{
+			// the prompt exactly matches the contents of slot 1
+			name: "Extend",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 1}, {token: 2}},
+			longest: expected{result: 1, len: 2},
+			best:    expected{result: 1, len: 2},
+		},
+		{
+			// the prompt shares no prefix with the populated slot
+			name: "New",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{},
+					InUse:    false,
+					lastUsed: time.Time{},
+				},
+			}},
+			prompt:  []input{{token: 2}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 1, len: 0},
+		},
+		{
+			// the prompt is a strict prefix of slot 0 and an empty slot is available
+			name: "Fork",
+			cache: InputCache{
+				slots: []InputCacheSlot{
+					{
+						Id:       0,
+						Inputs:   []input{{token: 1}, {token: 2}},
+						InUse:    false,
+						lastUsed: time.Now().Add(-time.Second),
+					},
+					{
+						Id:       1,
+						Inputs:   []input{},
+						InUse:    false,
+						lastUsed: time.Time{},
+					},
+				},
+			},
+			prompt:  []input{{token: 1}},
+			longest: expected{result: 0, len: 1},
+			best:    expected{result: 1, len: 1},
+		},
+		{
+			// both slots are populated and neither shares a prefix with the prompt
+			name: "Evict",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 2}, {token: 3}},
+			longest: expected{result: 0, len: 0},
+			best:    expected{result: 1, len: 0},
+		},
+		{
+			// the slot that matches the whole prompt is marked InUse
+			name: "In use",
+			cache: InputCache{slots: []InputCacheSlot{
+				{
+					Id:       0,
+					Inputs:   []input{{token: 1}, {token: 2}},
+					InUse:    true,
+					lastUsed: time.Now().Add(-time.Second),
+				},
+				{
+					Id:       1,
+					Inputs:   []input{{token: 1}},
+					InUse:    false,
+					lastUsed: time.Now().Add(-2 * time.Second),
+				},
+			}},
+			prompt:  []input{{token: 1}, {token: 2}},
+			longest: expected{result: 1, len: 1},
+			best:    expected{result: 1, len: 2},
+		},
+	}
+
+	// First pass: longest-prefix lookup
+	for _, tt := range tests {
+		t.Run("Longest-"+tt.name, func(t *testing.T) {
+			result, resultLen, err := tt.cache.findLongestCacheSlot(tt.prompt)
+			if err != nil {
+				t.Errorf("findLongestCacheSlot: err %v", err)
+			} else if result.Id != tt.longest.result || resultLen != tt.longest.len {
+				t.Errorf("findLongestCacheSlot: slot have %v, want %v len have %v, want %v",
+					result.Id, tt.longest.result, resultLen, tt.longest.len)
+			}
+		})
+	}
+
+	// Second pass: best-slot lookup over the same fixtures
+	for _, tt := range tests {
+		t.Run("Best-"+tt.name, func(t *testing.T) {
+			result, resultLen, err := tt.cache.findBestCacheSlot(tt.prompt)
+			if err != nil {
+				t.Errorf("findBestCacheSlot: err %v", err)
+			} else if result.Id != tt.best.result || resultLen != tt.best.len {
+				t.Errorf("findBestCacheSlot: slot have %v, want %v len have %v, want %v",
+					result.Id, tt.best.result, resultLen, tt.best.len)
+			}
+		})
+	}
+}
+
+func TestShiftDiscard(t *testing.T) {
+	tests := []struct {
+		name     string
+		numCtx   int32
+		numKeep  int32
+		inputLen int32
+		expected int32
+	}{
+		{
+			name:     "Shift",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 2048,
+			expected: 1021,
+		},
+		{
+			name:     "Max Keep",
+			numCtx:   2048,
+			numKeep:  2047,
+			inputLen: 2048,
+			expected: 1,
+		},
+		{
+			name:     "No Keep",
+			numCtx:   2048,
+			numKeep:  0,
+			inputLen: 2048,
+			expected: 1024,
+		},
+		{
+			name:     "Truncate",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 5000,
+			expected: 3973,
+		},
+		{
+			name:     "Truncate Keep",
+			numCtx:   2048,
+			numKeep:  2047,
+			inputLen: 5000,
+			expected: 2953,
+		},
+		{
+			name:     "No Op",
+			numCtx:   2048,
+			numKeep:  5,
+			inputLen: 512,
+			expected: 0,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			c := InputCache{numCtx: tt.numCtx}
+			result := c.ShiftDiscard(tt.inputLen, tt.numKeep)
+			if result != tt.expected {
+				t.Errorf("shiftDiscard(ctx: %v, keep: %v input: %v): have %v; want %v", tt.numCtx, tt.numKeep, tt.inputLen, result, tt.expected)
+			}
+		})
+	}
+}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -0,0 +1,945 @@
+package ollamarunner
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"image"
+	"log"
+	"log/slog"
+	"net"
+	"net/http"
+	"os"
+	"path/filepath"
+	"regexp"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+	"unicode/utf8"
+
+	"golang.org/x/sync/semaphore"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/runner/common"
+	"github.com/ollama/ollama/sample"
+
+	_ "github.com/ollama/ollama/model/models"
+)
+
+// input is an element of the prompt to process, either a token or an image.
+// The batching code treats any input whose image is non-nil as an image
+// input and otherwise uses token.
+type input struct {
+	// token id, as produced by the text processor or by sampling
+	token int32
+
+	// decoded image; nil for token inputs
+	image image.Image
+}
+
+// Sequence holds the state of a single in-flight completion or embedding
+// request: the inputs still to be evaluated, the cache slot backing it, the
+// channels used to talk to the HTTP handler, and per-request metrics.
+type Sequence struct {
+	// batch index
+	iBatch int
+
+	// prompt inputs left to evaluate
+	inputs []input
+
+	// inputs that have been added to a batch but not yet submitted to Forward
+	pendingInputs []input
+
+	// tokens that have been generated but not returned yet (e.g. for stop sequences)
+	pendingResponses []string
+
+	// input cache being used by this sequence
+	cache *InputCacheSlot
+
+	// channel to send responses over
+	responses chan string
+
+	// channel to stop decoding (such as if the remote connection is closed)
+	quit chan bool
+
+	// number of tokens to predict
+	numPredict int
+
+	// set of samplers to run on generated logits
+	samplers []sample.Sampler
+
+	// channel to send back the embedding if embedding only
+	embedding chan []float32
+
+	// stop sequences
+	stop []string
+
+	// number of inputs to keep at the beginning when shifting context window
+	numKeep int32
+
+	// true if an embedding is to be returned instead of text generation
+	embeddingOnly bool
+
+	// why the sequence ended; set when the sequence is removed
+	// (values used in this file: "limit", "stop", "connection", or "")
+	doneReason string
+
+	// Metrics
+	startProcessingTime time.Time
+	startGenerationTime time.Time
+	numPredicted        int
+	numPromptInputs     int
+}
+
+// NewSequenceParams carries the per-request settings that NewSequence copies
+// into the Sequence it creates.
+type NewSequenceParams struct {
+	// number of tokens to predict
+	numPredict int
+	// stop sequences
+	stop []string
+	// number of inputs to keep when shifting; a negative value means
+	// "keep the whole prompt" (NewSequence replaces it with the prompt length)
+	numKeep int32
+	// samplers to run on generated logits
+	samplers []sample.Sampler
+	// true to request an embedding instead of text generation
+	embedding bool
+}
+
+// NewSequence builds a Sequence for the given prompt and images.
+//
+// It blocks until the model has finished loading, converts the prompt into
+// inputs, clamps numKeep so that at least one input can always be discarded
+// when the context is shifted, and truncates prompts longer than the context
+// by dropping inputs immediately after the kept prefix.
+//
+// It returns an error when the prompt cannot be processed or yields no
+// inputs. The returned sequence has no cache slot assigned yet.
+func (s *Server) NewSequence(prompt string, images []ImageData, params NewSequenceParams) (*Sequence, error) {
+	s.ready.Wait()
+
+	began := time.Now()
+
+	inputs, err := s.inputs(prompt, images)
+	if err != nil {
+		return nil, fmt.Errorf("failed to process inputs: %w", err)
+	}
+	if len(inputs) == 0 {
+		return nil, errors.New("no input provided")
+	}
+
+	// A negative numKeep means "keep the entire prompt"
+	keep := params.numKeep
+	if keep < 0 {
+		keep = int32(len(inputs))
+	}
+
+	// Ensure that at least 1 input can be discarded during shift
+	keep = min(keep, s.cache.numCtx-1)
+
+	if excess := int32(len(inputs)) - s.cache.numCtx; excess > 0 {
+		// Keep the first `keep` inputs and the newest inputs that still fit
+		truncated := append(inputs[:keep], inputs[keep+excess:]...)
+
+		slog.Warn("truncating input prompt", "limit", s.cache.numCtx, "prompt", len(inputs), "keep", keep, "new", len(truncated))
+		inputs = truncated
+	}
+
+	// TODO(jessegross): Ingest cached history for grammar
+
+	seq := &Sequence{
+		inputs:              inputs,
+		numPromptInputs:     len(inputs),
+		startProcessingTime: began,
+		numPredict:          params.numPredict,
+		pendingResponses:    make([]string, 0),
+		responses:           make(chan string, 100),
+		quit:                make(chan bool, 1),
+		embedding:           make(chan []float32, 1),
+		samplers:            params.samplers,
+		embeddingOnly:       params.embedding,
+		stop:                params.stop,
+		numKeep:             keep,
+	}
+
+	return seq, nil
+}
+
+// imageTagPattern matches the [img-<n>] placeholders that mark where an image
+// belongs in a prompt. It is compiled once rather than on every request.
+//
+// TODO(jessegross): This can sometimes trigger for matching text in the
+// user's prompt. We previously tried to avoid it by only looking for images
+// on image models. We don't have a clear indication now but it would be better
+// to properly escape it in any case.
+var imageTagPattern = regexp.MustCompile(`\[img-(\d+)\]`)
+
+// inputs processes the prompt and images into a list of inputs
+// by splitting the prompt on [img-<n>] tags, tokenizing text and
+// decoding images.
+//
+// Each text fragment is encoded with the model's text processor and each tag
+// is replaced by the decoded image whose ID equals <n>. An error is returned
+// if the model is not a text processor, tokenization fails, a tag number
+// cannot be parsed, a tag refers to an ID that is not in images, or the image
+// data cannot be decoded.
+func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
+	textProcessor, ok := s.model.(model.TextProcessor)
+	if !ok {
+		return nil, errors.New("model does not support text processing")
+	}
+
+	var inputs []input
+
+	// parts[i] is the text preceding matches[i]; parts has one more element
+	// than matches (the text after the final tag).
+	parts := imageTagPattern.Split(prompt, -1)
+	matches := imageTagPattern.FindAllStringSubmatch(prompt, -1)
+
+	for i, part := range parts {
+		// text - tokenize
+		tokens, err := textProcessor.Encode(part)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, t := range tokens {
+			inputs = append(inputs, input{token: t})
+		}
+
+		// image - decode and store
+		if i < len(matches) {
+			// The pattern guarantees digits, so this only fails on overflow
+			n, err := strconv.Atoi(matches[i][1])
+			if err != nil {
+				return nil, fmt.Errorf("invalid image index %q: %w", matches[i][1], err)
+			}
+
+			imageIndex := -1
+			for j := range images {
+				if images[j].ID == n {
+					imageIndex = j
+					break
+				}
+			}
+
+			if imageIndex < 0 {
+				return nil, fmt.Errorf("invalid image index: %d", n)
+			}
+
+			img, _, err := image.Decode(bytes.NewReader(images[imageIndex].Data))
+			if err != nil {
+				return nil, err
+			}
+
+			inputs = append(inputs, input{image: img})
+		}
+	}
+
+	return inputs, nil
+}
+
+// Server holds the loaded model, its input cache and the set of sequences
+// currently being evaluated, and serves them over the HTTP handlers defined
+// in this file.
+type Server struct {
+	// is the server ready to process requests?
+	// protects access to model and image
+	ready sync.WaitGroup
+
+	// loaded model
+	model model.Model
+
+	// status for external health reporting - loading, ready to serve, etc.
+	status ServerStatus
+
+	// current progress on loading the model
+	progress float32
+
+	// number of simultaneous requests to handle
+	parallel int
+
+	// maximum number of elements in a batch (per sequence)
+	// TODO (jmorganca): make this n_batch
+	batchSize int
+
+	// protects access to everything below this line
+	// this is context state needed for decoding
+	mu sync.Mutex
+
+	// indicates that data is ready for processing
+	cond *sync.Cond
+
+	// the list of simultaneous sequences being evaluated
+	seqs []*Sequence
+
+	// seqs can have a maximum of parallel entries, which
+	// is enforced by seqsSem
+	seqsSem *semaphore.Weighted
+
+	// KV cache
+	cache *InputCache
+
+	// next sequence for prompt processing to avoid starvation
+	nextSeq int
+}
+
+// allNil reports whether every entry of s.seqs is nil, i.e. there is no
+// sequence to process. It also returns true when s.seqs is empty.
+func (s *Server) allNil() bool {
+	for i := range s.seqs {
+		if s.seqs[i] != nil {
+			return false
+		}
+	}
+
+	return true
+}
+
+// flushPending joins the sequence's pending response pieces, clears the
+// pending list and sends the joined text on seq.responses.
+//
+// It returns false only if seq.quit fires before the text can be delivered;
+// it returns true when the text was sent or when there was nothing to send.
+func flushPending(seq *Sequence) bool {
+	text := strings.Join(seq.pendingResponses, "")
+	seq.pendingResponses = []string{}
+
+	// Check if there are any partial UTF-8 characters remaining.
+	// We already check and queue as we are generating but some may
+	// still make it here:
+	// - Sequence is ending, e.g. generation limit has been hit
+	// - Invalid characters in the middle of a string
+	// This is a stricter check to ensure we never output invalid Unicode:
+	// bytes are dropped from the end until what remains is valid.
+	for !utf8.ValidString(text) {
+		text = text[:len(text)-1]
+	}
+
+	if text == "" {
+		return true
+	}
+
+	select {
+	case <-seq.quit:
+		return false
+	case seq.responses <- text:
+		return true
+	}
+}
+
+// removeSequence finishes the sequence stored at s.seqs[seqIndex]: it flushes
+// any pending output, records reason as the done reason, closes the response
+// and embedding channels, marks the cache slot as no longer in use, clears
+// the entry in s.seqs and releases one unit of seqsSem.
+//
+// The entry at seqIndex must be non-nil.
+func (s *Server) removeSequence(seqIndex int, reason string) {
+	seq := s.seqs[seqIndex]
+
+	// best effort: the result is ignored because the sequence ends either way
+	flushPending(seq)
+	// set before closing responses, since the completion handler reads
+	// doneReason once it observes the closed channel
+	seq.doneReason = reason
+	close(seq.responses)
+	close(seq.embedding)
+	seq.cache.InUse = false
+	s.seqs[seqIndex] = nil
+	s.seqsSem.Release(1)
+}
+
+// run is the decode loop. It waits for the model to finish loading and then
+// calls processBatch repeatedly until ctx is cancelled. Cancellation is only
+// checked between batches. An error from processBatch causes a panic.
+func (s *Server) run(ctx context.Context) {
+	s.ready.Wait()
+
+	for {
+		// non-blocking cancellation check
+		select {
+		case <-ctx.Done():
+			return
+		default:
+		}
+
+		if err := s.processBatch(); err != nil {
+			panic(err)
+		}
+	}
+}
+
+// processBatch assembles one batch from the active sequences, runs it through
+// the model and handles the output for every sequence whose prompt has been
+// fully consumed.
+//
+// It blocks on s.cond until at least one sequence exists and holds s.mu for
+// the rest of the call. A non-nil error is returned when shifting the cache,
+// the forward pass, sampling or decoding fails, or when caching is disabled
+// and a prompt does not fit in a single batch.
+func (s *Server) processBatch() error {
+	s.mu.Lock()
+	for s.allNil() {
+		s.cond.Wait() // Wait until an item is added
+	}
+	defer s.mu.Unlock()
+
+	var options model.Options
+	// index of the sequence that contributed images to this batch, -1 if none
+	imgSeq := -1
+
+	// Phase 1: gather inputs, visiting sequences round-robin starting at s.nextSeq
+	seqIdx := s.nextSeq - 1
+	for range s.seqs {
+		seqIdx = (seqIdx + 1) % len(s.seqs)
+		seq := s.seqs[seqIdx]
+
+		if seq == nil {
+			continue
+		}
+
+		// if past the num predict limit
+		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
+			s.removeSequence(seqIdx, "limit")
+			continue
+		}
+
+		// without caching, everything previously evaluated is resubmitted
+		if !s.cache.enabled {
+			seq.inputs = append(seq.cache.Inputs, seq.inputs...)
+			seq.cache.Inputs = []input{}
+		}
+
+		for i, input := range seq.inputs {
+			// context is full: shift if nothing is queued for this sequence yet,
+			// otherwise stop here and let the queued inputs be evaluated first
+			if int32(len(seq.cache.Inputs)+len(seq.pendingInputs)+1) > s.cache.numCtx {
+				if len(seq.pendingInputs) == 0 {
+					err := s.cache.ShiftCacheSlot(seq.cache, seq.numKeep)
+					if err != nil {
+						return err
+					}
+				} else {
+					break
+				}
+			}
+
+			if i >= s.batchSize {
+				break
+			}
+
+			// TODO(jessegross): Image inputs need to be rethought - it
+			// doesn't work well for different types of models or multiple sequences
+			if input.image != nil {
+				// only accept an image if all inputs queued so far for this
+				// sequence are images as well
+				if len(seq.pendingInputs) != len(options.Images) {
+					break
+				}
+
+				// only one sequence may contribute images per batch; start
+				// the next batch with this sequence instead
+				if imgSeq != seqIdx && imgSeq != -1 {
+					s.nextSeq = seqIdx
+					break
+				}
+
+				imgSeq = seqIdx
+				options.Images = append(options.Images, input.image)
+				seq.pendingInputs = append(seq.pendingInputs, input)
+				continue
+			}
+
+			options.Inputs = append(options.Inputs, input.token)
+			options.Positions = append(options.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
+			options.Sequences = append(options.Sequences, seq.cache.Id)
+
+			// remember which output row belongs to this sequence; an output is
+			// only requested for the last remaining input of the sequence
+			seq.iBatch = len(options.Outputs)
+			if i+1 == len(seq.inputs) {
+				options.Outputs = append(options.Outputs, int32(len(options.Inputs)-1))
+			}
+			seq.pendingInputs = append(seq.pendingInputs, input)
+		}
+
+		// drop the inputs that were queued into this batch
+		seq.inputs = seq.inputs[len(seq.pendingInputs):]
+	}
+
+	if len(options.Inputs) == 0 {
+		return nil
+	}
+
+	// Phase 2: forward pass
+	ctx := s.model.Backend().NewContext()
+	defer ctx.Close()
+
+	modelOutput, err := model.Forward(ctx, s.model, options)
+	if err != nil {
+		return fmt.Errorf("failed to decode batch: %w", err)
+	}
+
+	f32s := modelOutput.Floats()
+
+	// TODO(jessegross): This will no longer be necessary once the sampling interface takes f32s
+	logits := make([]float64, len(f32s))
+	for i, f32 := range f32s {
+		logits[i] = float64(f32)
+	}
+
+	// Phase 3: per-sequence bookkeeping and sampling
+	for i, seq := range s.seqs {
+		if seq == nil {
+			continue
+		}
+
+		// After calling Forward, pending inputs are now in the cache
+		if len(seq.pendingInputs) > 0 {
+			seq.cache.Inputs = append(seq.cache.Inputs, seq.pendingInputs...)
+			seq.pendingInputs = []input{}
+		}
+
+		// don't sample prompt processing
+		if len(seq.inputs) != 0 {
+			if !s.cache.enabled {
+				return errors.New("caching disabled but unable to fit entire input in a batch")
+			}
+			continue
+		}
+
+		seq.numPredicted++
+		if seq.numPredicted == 1 {
+			seq.startGenerationTime = time.Now()
+		}
+
+		// if done processing the prompt, generate an embedding and return
+		if seq.embeddingOnly {
+			// TODO(jessegross): Embedding support
+			s.removeSequence(i, "")
+			continue
+		}
+
+		// sample a token
+		// the output is split evenly into one row of logits per requested output
+		vocabSize := len(f32s) / len(options.Outputs)
+		tokens, err := sample.Sample(logits[seq.iBatch*vocabSize:(seq.iBatch+1)*vocabSize], seq.samplers...)
+		if err != nil {
+			return err
+		}
+
+		// TODO(jessegross): Sampler will output a single int32 in the future
+		token := int32(tokens[0])
+
+		// if it's an end of sequence token, break
+		if s.model.(model.TextProcessor).Is(token, model.SpecialEOS) {
+			// TODO (jmorganca): we should send this back
+			// as it's important for the /api/generate context
+			// seq.responses <- piece
+
+			s.removeSequence(i, "stop")
+			continue
+		}
+
+		piece, err := s.model.(model.TextProcessor).Decode([]int32{token})
+		if err != nil {
+			return err
+		}
+
+		// the sampled token becomes the only input for the next batch
+		seq.inputs = []input{{token: token}}
+
+		seq.pendingResponses = append(seq.pendingResponses, piece)
+		sequence := strings.Join(seq.pendingResponses, "")
+
+		if ok, stop := common.FindStop(sequence, seq.stop); ok {
+			slog.Debug("hit stop token", "pending", seq.pendingResponses, "stop", stop)
+
+			var tokenTruncated bool
+			origLen := len(seq.pendingResponses)
+			seq.pendingResponses, tokenTruncated = common.TruncateStop(seq.pendingResponses, stop)
+			newLen := len(seq.pendingResponses)
+
+			// Update the cache based on the tokens that will be returned:
+			// - We have 1 token more than is currently in the cache because
+			// the last one generated wasn't submitted to Decode
+			// - Remove any stop sequences that we stripped out
+			// - If truncateStop removed a portion of a token, drop that
+			// - As defense-in-depth, if truncatedToken didn't find a stop token
+			// remove the extra one that we added to the cache len
+			tokenLen := len(seq.cache.Inputs) + 1
+			tokenLen -= origLen - newLen
+			if tokenTruncated || origLen == newLen {
+				tokenLen--
+			}
+			seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
+
+			s.removeSequence(i, "stop")
+			continue
+		}
+
+		// hold the text back while it could still grow into a stop sequence
+		if common.ContainsStopSuffix(sequence, seq.stop) {
+			continue
+		}
+
+		// hold the text back until the pending bytes form complete characters
+		if common.IncompleteUnicode(sequence) {
+			continue
+		}
+
+		// flushPending fails only when the sequence's quit channel fired
+		if !flushPending(seq) {
+			s.removeSequence(i, "connection")
+		}
+	}
+
+	return nil
+}
+
+// Options are the generation options accepted by the runner's completion
+// endpoint. Of the fields declared here, only NumKeep, NumPredict and Stop are
+// read by the code in this file; the sampling fields are not applied yet
+// (see getSamplers).
+//
+// TODO (jmorganca): use structs from the api package to avoid duplication
+// this way the api acts as a proxy instead of using a different api for the
+// runner
+type Options struct {
+	api.Runner
+
+	NumKeep          int      `json:"n_keep"`
+	Seed             int      `json:"seed"`
+	NumPredict       int      `json:"n_predict"`
+	TopK             int      `json:"top_k"`
+	TopP             float32  `json:"top_p"`
+	MinP             float32  `json:"min_p"`
+	TypicalP         float32  `json:"typical_p"`
+	RepeatLastN      int      `json:"repeat_last_n"`
+	Temperature      float32  `json:"temperature"`
+	RepeatPenalty    float32  `json:"repeat_penalty"`
+	PresencePenalty  float32  `json:"presence_penalty"`
+	FrequencyPenalty float32  `json:"frequency_penalty"`
+	Mirostat         int      `json:"mirostat"`
+	MirostatTau      float32  `json:"mirostat_tau"`
+	MirostatEta      float32  `json:"mirostat_eta"`
+	Stop             []string `json:"stop"`
+}
+
+// ImageData is an image attached to a completion request.
+type ImageData struct {
+	// encoded image bytes, decoded with image.Decode when the prompt is processed
+	Data []byte `json:"data"`
+	// number matched against the <n> of an [img-<n>] tag in the prompt
+	ID int `json:"id"`
+	// not read by the code in this file
+	AspectRatioID int `json:"aspect_ratio_id"`
+}
+
+// CompletionRequest is the JSON body accepted by the completion handler.
+type CompletionRequest struct {
+	// prompt text; may contain [img-<n>] tags referring to Images by ID
+	Prompt string `json:"prompt"`
+	// images referenced from the prompt
+	Images []ImageData `json:"image_data"`
+	// not applied yet (see getSamplers)
+	Grammar string `json:"grammar"`
+	// passed through to the input cache when a cache slot is loaded
+	CachePrompt bool `json:"cache_prompt"`
+
+	Options
+}
+
+// Timings reports counts and durations for a finished completion. The
+// completion handler fills it in for the final response.
+type Timings struct {
+	// number of predictions made for the sequence
+	PredictedN int `json:"predicted_n"`
+	// milliseconds elapsed since generation started
+	PredictedMS float64 `json:"predicted_ms"`
+	// number of prompt inputs
+	PromptN int `json:"prompt_n"`
+	// milliseconds between the start of processing and the start of generation
+	PromptMS float64 `json:"prompt_ms"`
+}
+
+// CompletionResponse is one JSON object in the stream written by the
+// completion handler. Intermediate responses carry only Content; the final
+// response sets Stop, StoppedLimit and Timings. The remaining fields are not
+// populated by the code in this file.
+type CompletionResponse struct {
+	// generated text for this chunk
+	Content string `json:"content"`
+	// true on the final response of the stream
+	Stop bool `json:"stop"`
+
+	Model  string `json:"model,omitempty"`
+	Prompt string `json:"prompt,omitempty"`
+	// true when generation ended because the prediction limit was reached
+	StoppedLimit bool    `json:"stopped_limit,omitempty"`
+	PredictedN   int     `json:"predicted_n,omitempty"`
+	PredictedMS  float64 `json:"predicted_ms,omitempty"`
+	PromptN      int     `json:"prompt_n,omitempty"`
+	PromptMS     float64 `json:"prompt_ms,omitempty"`
+
+	Timings Timings `json:"timings"`
+}
+
+// getSamplers returns the samplers to use for a completion request.
+//
+// The request is currently ignored and a single greedy sampler is always
+// returned; the commented-out block below lists the request fields that are
+// intended to be honoured once the sampling code is available.
+func getSamplers(_ CompletionRequest) []sample.Sampler {
+	// TODO(jessegross): Waiting for sampling code
+
+	/*samplingParams.TopK = req.TopK
+	samplingParams.TopP = req.TopP
+	samplingParams.MinP = req.MinP
+	samplingParams.TypicalP = req.TypicalP
+	samplingParams.Temp = req.Temperature
+	samplingParams.RepeatLastN = req.RepeatLastN
+	samplingParams.PenaltyRepeat = req.RepeatPenalty
+	samplingParams.PenaltyFreq = req.FrequencyPenalty
+	samplingParams.PenaltyPresent = req.PresencePenalty
+	samplingParams.Mirostat = req.Mirostat
+	samplingParams.MirostatTau = req.MirostatTau
+	samplingParams.MirostatEta = req.MirostatEta
+	samplingParams.Seed = uint32(req.Seed)
+	samplingParams.Grammar = req.Grammar*/
+
+	return []sample.Sampler{sample.Greedy()}
+}
+
+// completion handles a streaming text completion request.
+//
+// The JSON body is decoded over the default options, a Sequence is created
+// and placed in a free entry of s.seqs, and every piece of generated text is
+// written to the client as its own JSON object and flushed immediately. When
+// the sequence ends, a final object with Stop set and the timings is written.
+//
+// A unit of s.seqsSem is acquired before looking for a free entry. It is
+// normally released when the sequence is removed from s.seqs, so it is
+// released here explicitly on the paths where the sequence never gets
+// installed; otherwise every such failure would permanently reduce the number
+// of requests the server can handle.
+func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
+	var req CompletionRequest
+	req.Options = Options(api.DefaultOptions())
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		http.Error(w, "Bad request", http.StatusBadRequest)
+		return
+	}
+
+	// Set the headers to indicate streaming
+	w.Header().Set("Content-Type", "application/json")
+	w.Header().Set("Transfer-Encoding", "chunked")
+
+	flusher, ok := w.(http.Flusher)
+	if !ok {
+		http.Error(w, "Streaming not supported", http.StatusInternalServerError)
+		return
+	}
+
+	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
+		numPredict: req.NumPredict,
+		stop:       req.Stop,
+		numKeep:    int32(req.NumKeep),
+		samplers:   getSamplers(req),
+		embedding:  false,
+	})
+	if err != nil {
+		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
+		return
+	}
+
+	// Ensure there is a place to put the sequence, released when removed from s.seqs
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		if errors.Is(err, context.Canceled) {
+			slog.Info("aborting completion request due to client closing the connection")
+		} else {
+			slog.Error("Failed to acquire semaphore", "error", err)
+		}
+		return
+	}
+
+	s.mu.Lock()
+	found := false
+	for i, sq := range s.seqs {
+		if sq == nil {
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			if err != nil {
+				s.mu.Unlock()
+				// the sequence was never installed, so nothing else will release this
+				s.seqsSem.Release(1)
+				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
+				return
+			}
+
+			s.seqs[i] = seq
+			s.cond.Signal()
+			found = true
+			break
+		}
+	}
+	s.mu.Unlock()
+
+	if !found {
+		// the sequence was never installed, so nothing else will release this
+		s.seqsSem.Release(1)
+		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
+		return
+	}
+
+	for {
+		select {
+		case <-r.Context().Done():
+			// client went away: tell the decode loop to stop this sequence
+			close(seq.quit)
+			return
+		case content, ok := <-seq.responses:
+			if ok {
+				if err := json.NewEncoder(w).Encode(&CompletionResponse{
+					Content: content,
+				}); err != nil {
+					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+					close(seq.quit)
+					return
+				}
+
+				flusher.Flush()
+			} else {
+				// Send the final response
+				if err := json.NewEncoder(w).Encode(&CompletionResponse{
+					Stop:         true,
+					StoppedLimit: seq.doneReason == "limit",
+					Timings: Timings{
+						PromptN:     seq.numPromptInputs,
+						PromptMS:    float64(seq.startGenerationTime.Sub(seq.startProcessingTime).Milliseconds()),
+						PredictedN:  seq.numPredicted,
+						PredictedMS: float64(time.Since(seq.startGenerationTime).Milliseconds()),
+					},
+				}); err != nil {
+					http.Error(w, fmt.Sprintf("failed to encode final response: %v", err), http.StatusInternalServerError)
+				}
+
+				return
+			}
+		}
+	}
+}
+
+// EmbeddingRequest is the JSON body accepted by the embeddings handler.
+type EmbeddingRequest struct {
+	// text to embed
+	Content string `json:"content"`
+	// passed through to the input cache when a cache slot is loaded
+	CachePrompt bool `json:"cache_prompt"`
+}
+
+// EmbeddingResponse is the JSON body written by the embeddings handler.
+type EmbeddingResponse struct {
+	// value received from the sequence's embedding channel
+	Embedding []float32 `json:"embedding"`
+}
+
+// embeddings handles an embedding request.
+//
+// The JSON body is decoded, an embedding-only Sequence is created and placed
+// in a free entry of s.seqs, and the handler then waits for a value on the
+// sequence's embedding channel and writes it back as JSON.
+//
+// A unit of s.seqsSem is acquired before looking for a free entry. It is
+// normally released when the sequence is removed from s.seqs, so it is
+// released here explicitly on the paths where the sequence never gets
+// installed; otherwise every such failure would permanently reduce the number
+// of requests the server can handle.
+func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
+	var req EmbeddingRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		http.Error(w, fmt.Sprintf("bad request: %s", err), http.StatusBadRequest)
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+
+	slog.Debug("embedding request", "content", req.Content)
+
+	seq, err := s.NewSequence(req.Content, nil, NewSequenceParams{embedding: true})
+	if err != nil {
+		http.Error(w, fmt.Sprintf("Failed to create new sequence: %v", err), http.StatusInternalServerError)
+		return
+	}
+
+	// Ensure there is a place to put the sequence, released when removed from s.seqs
+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
+		if errors.Is(err, context.Canceled) {
+			slog.Info("aborting embeddings request due to client closing the connection")
+		} else {
+			slog.Error("Failed to acquire semaphore", "error", err)
+		}
+		return
+	}
+
+	s.mu.Lock()
+	found := false
+	for i, sq := range s.seqs {
+		if sq == nil {
+			seq.cache, seq.inputs, err = s.cache.LoadCacheSlot(seq.inputs, req.CachePrompt)
+			if err != nil {
+				s.mu.Unlock()
+				// the sequence was never installed, so nothing else will release this
+				s.seqsSem.Release(1)
+				http.Error(w, fmt.Sprintf("Failed to load cache: %v", err), http.StatusInternalServerError)
+				return
+			}
+			s.seqs[i] = seq
+			s.cond.Signal()
+			found = true
+			break
+		}
+	}
+	s.mu.Unlock()
+
+	if !found {
+		// the sequence was never installed, so nothing else will release this
+		s.seqsSem.Release(1)
+		http.Error(w, "could not find an available sequence", http.StatusInternalServerError)
+		return
+	}
+
+	// Blocks until the decode loop sends a value or closes the channel;
+	// a closed channel yields a nil embedding.
+	embedding := <-seq.embedding
+
+	if err := json.NewEncoder(w).Encode(&EmbeddingResponse{
+		Embedding: embedding,
+	}); err != nil {
+		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+	}
+}
+
+// HealthResponse is the JSON body written by the health handler.
+type HealthResponse struct {
+	// textual server status as produced by ServerStatus.ToString
+	Status string `json:"status"`
+	// current progress on loading the model
+	Progress float32 `json:"progress"`
+}
+
+// ServerStatus is the state of the server as reported by the health endpoint.
+type ServerStatus int
+
+const (
+	// ServerStatusReady means the server can handle requests.
+	ServerStatusReady ServerStatus = iota
+	// ServerStatusLoadingModel means the model is still being loaded.
+	ServerStatusLoadingModel
+	// ServerStatusError means the server is in an error state.
+	ServerStatusError
+)
+
+// ToString returns the text reported for the status: "ok" when ready,
+// "loading model" while loading, and "server error" for any other value.
+func (s ServerStatus) ToString() string {
+	if s == ServerStatusReady {
+		return "ok"
+	}
+
+	if s == ServerStatusLoadingModel {
+		return "loading model"
+	}
+
+	return "server error"
+}
+
+// health writes the server's current status and model loading progress as a
+// JSON HealthResponse.
+func (s *Server) health(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+
+	resp := HealthResponse{
+		Status:   s.status.ToString(),
+		Progress: s.progress,
+	}
+
+	err := json.NewEncoder(w).Encode(&resp)
+	if err != nil {
+		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+	}
+}
+
+// multiLPath collects the values of a flag that may be given several times.
+// Its Set and String methods have the shape expected by the flag package.
+type multiLPath []string
+
+// Set appends value to the list. It never fails.
+func (m *multiLPath) Set(value string) error {
+	paths := append(*m, value)
+	*m = paths
+
+	return nil
+}
+
+// String returns the collected values joined with ", ".
+func (m *multiLPath) String() string {
+	return strings.Join([]string(*m), ", ")
+}
+
+// loadModel loads the model at mpath, creates the input cache and sizes the
+// sequence table and its semaphore for the requested parallelism. When it
+// finishes it sets the status to ready and releases everything waiting on
+// s.ready.
+//
+// It panics if the model or the cache cannot be created, or if any LoRA path
+// was supplied (not implemented yet). If the cache is disabled, parallel is
+// forced down to 1.
+func (s *Server) loadModel(
+	mpath string,
+	lpath multiLPath,
+	parallel int,
+	kvCacheType string,
+	kvSize int,
+	multiUserCache bool,
+) {
+	var err error
+
+	if s.model, err = model.New(mpath); err != nil {
+		panic(err)
+	}
+
+	// TODO(jessegross): LoRA loading
+	if loras := lpath.String(); loras != "" {
+		panic("loras are not yet implemented")
+	}
+
+	if s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, multiUserCache); err != nil {
+		panic(err)
+	}
+
+	if parallel > 1 && !s.cache.enabled {
+		parallel = 1
+		slog.Warn("model does not support caching, disabling parallel processing")
+	}
+
+	// one entry and one semaphore unit per simultaneous request
+	s.parallel = parallel
+	s.seqs = make([]*Sequence, parallel)
+	s.seqsSem = semaphore.NewWeighted(int64(parallel))
+
+	s.status = ServerStatusReady
+	s.ready.Done()
+}
+
+// Execute parses the runner's command-line flags from args, starts loading
+// the model in the background, starts the decode loop and serves the
+// /embedding, /completion and /health endpoints on 127.0.0.1 at the requested
+// port.
+//
+// Several flags are accepted but not yet implemented; their values are
+// discarded. Execute returns an error if the flags cannot be parsed or the
+// listener cannot be created. A failure of the HTTP server itself is reported
+// with log.Fatal.
+func Execute(args []string) error {
+	fs := flag.NewFlagSet("runner", flag.ExitOnError)
+	mpath := fs.String("model", "", "Path to model binary file")
+	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := fs.Int("batch-size", 512, "Batch size")
+	_ = fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	_ = fs.Int("main-gpu", 0, "Main GPU")
+	_ = fs.Bool("flash-attn", false, "Enable flash attention")
+	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
+	port := fs.Int("port", 8080, "Port to expose the server on")
+	_ = fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+	_ = fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
+	_ = fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
+
+	var lpaths multiLPath
+	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
+	fs.Usage = func() {
+		fmt.Fprintf(fs.Output(), "Runner usage\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
+	}
+	// Log to stderr; -verbose lowers the level to debug
+	level := slog.LevelInfo
+	if *verbose {
+		level = slog.LevelDebug
+	}
+	handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			// shorten the source location to the file's base name
+			if attr.Key == slog.SourceKey {
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+			return attr
+		},
+	})
+	slog.SetDefault(slog.New(handler))
+	slog.Info("starting ollama engine")
+	// TODO(jessegross): Some system info would be useful
+
+	server := &Server{
+		batchSize: *batchSize,
+		status:    ServerStatusLoadingModel,
+	}
+
+	// TODO(jessegross): Parameters that need to be implemented:
+	//	n-gpu-layers
+	//	main-gpu
+	//	flash-attn
+	//	threads
+	//	no-mmap
+	//	mlock
+	//	tensor-split
+
+	/*var tensorSplitFloats []float32
+	if *tensorSplit != "" {
+		stringFloats := regexp.MustCompile(",").Split(*tensorSplit, -1)
+
+		tensorSplitFloats = make([]float32, 0, len(stringFloats))
+		for _, s := range stringFloats {
+			f, _ := strconv.ParseFloat(s, 32)
+			tensorSplitFloats = append(tensorSplitFloats, float32(f))
+		}
+	}*/
+
+	// ready is released by loadModel once the model and cache are set up;
+	// run and NewSequence wait on it before touching them
+	server.ready.Add(1)
+	go server.loadModel(*mpath, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+
+	server.cond = sync.NewCond(&server.mu)
+
+	// the decode loop runs until cancel is called
+	ctx, cancel := context.WithCancel(context.Background())
+	go server.run(ctx)
+
+	addr := "127.0.0.1:" + strconv.Itoa(*port)
+	listener, err := net.Listen("tcp", addr)
+	if err != nil {
+		fmt.Println("Listen error:", err)
+		cancel()
+		return err
+	}
+	defer listener.Close()
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/embedding", server.embeddings)
+	mux.HandleFunc("/completion", server.completion)
+	mux.HandleFunc("/health", server.health)
+
+	httpServer := http.Server{
+		Handler: mux,
+	}
+
+	log.Println("Server listening on", addr)
+	if err := httpServer.Serve(listener); err != nil {
+		// NOTE(review): log.Fatal exits the process, so the return below
+		// (and the deferred listener.Close) is never reached on this path
+		log.Fatal("server error:", err)
+		return err
+	}
+
+	cancel()
+	return nil
+}
--- a/runner/runner.go
+++ b/runner/runner.go
@@ -0,0 +1,24 @@
+package runner
+
+import (
+	"github.com/ollama/ollama/runner/llamarunner"
+	"github.com/ollama/ollama/runner/ollamarunner"
+)
+
+func Execute(args []string) error {
+	if args[0] == "runner" {
+		args = args[1:]
+	}
+
+	var newRunner bool
+	if args[0] == "--ollama-engine" {
+		args = args[1:]
+		newRunner = true
+	}
+
+	if newRunner {
+		return ollamarunner.Execute(args)
+	} else {
+		return llamarunner.Execute(args)
+	}
+}