runner.go: Only allocate 1 element embedding batches for mllama
Mllama has large embeddings (100 MB per image) and each embedding is represented as 1 token when passed to llama.cpp. Batches are pre-allocated for the size of the tokens times the batch size, so this results in allocations of over 50 GB at the default batch size. On some systems, these mallocs will fail.

Since an image is represented as a single token and mllama doesn't support more than 1 image per request, we only need to allocate a batch size of 1, which is much more reasonable. In addition, for non-multimodal models, we don't need to allocate the embedding batches at all.

Fixes #7464
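To make the arithmetic in the message concrete, here is a minimal Go sketch of the before/after embedding-batch sizes. The 100 MB-per-image figure comes from the message; the batch size of 512 is an assumption, chosen because it is consistent with the "over 50 GB" figure:

package main

import "fmt"

func main() {
	const embedBytesPerImage = 100 * 1024 * 1024 // ~100 MB per image embedding (from the commit message)
	const assumedBatchSize = 512                 // assumed default n_batch; consistent with "over 50 GB"

	// Before: the embedding batch was pre-allocated for a full batch of
	// tokens, even though mllama emits one embedding token per image.
	before := embedBytesPerImage * assumedBatchSize
	fmt.Printf("before: %.1f GB\n", float64(before)/1e9) // ~53.7 GB

	// After: mllama supports at most one image per request, so a
	// 1-element embedding batch is enough.
	after := embedBytesPerImage * 1
	fmt.Printf("after:  %.1f GB\n", float64(after)/1e9) // ~0.1 GB
}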
@@ -211,6 +211,7 @@ type Server struct {
 	// required for image embeddings
 	image *ImageContext
 
+	// TODO (jmorganca): make this n_batch
 	batchSize int
 
 	// parallel is the number of parallel requests to handle
@@ -302,13 +303,19 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
 
 func (s *Server) run(ctx context.Context) {
 	s.ready.Wait()
 
-	// logically these batches are used only within the context of processBatch
+	// Logically these batches are used only within the context of processBatch
 	// but it is better for performance to allocate them once here
-	tokenBatch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
+	tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
 	defer tokenBatch.Free()
 
-	embedBatch := llama.NewBatch(s.batchSize*len(s.seqs), s.image.EmbedSize(s.lc), len(s.seqs))
-	defer embedBatch.Free()
+	var embedBatch *llama.Batch
+	embedBatchSize := s.image.BatchSize(s.batchSize)
+	if embedBatchSize != 0 {
+		embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+		defer embedBatch.Free()
+	} else {
+		embedBatch = &llama.Batch{}
+	}
 
 	for {
 		select {
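The new s.image.BatchSize helper is introduced elsewhere in this commit and is not part of this excerpt. Based on the commit message, its behavior is presumably along these lines (a sketch, not the actual source; the mllama field name is a guess):

// BatchSize reports how many elements the embedding batch needs.
// Sketch only: inferred from the commit message, not copied from the commit.
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
	// Non-multimodal models never receive image embeddings, so no
	// embedding batch needs to be allocated at all.
	if c == nil {
		return 0
	}
	// Mllama represents an image as a single (very large) embedding token
	// and supports at most one image per request, so one element suffices.
	if c.mllama != nil { // hypothetical field marking mllama models
		return 1
	}
	return configuredBatchSize
}

The else branch above (embedBatch = &llama.Batch{}) then hands processBatch a zero-capacity placeholder instead of a nil pointer, so the text-only path needs no special casing.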
@@ -378,13 +385,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			break
 		}
 
-		// todo: make this n_batch
-		if i >= s.batchSize {
+		if i >= batch.Size() {
 			break
 		}
 
 		crossAttention = seq.crossAttention
-		batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
+		batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
 		seq.numPast++
 		numInputsProcessed++
 	}
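Two smaller API changes ride along in this hunk and the one above: llama.NewBatch's parameter order becomes (batchSize, maxSeq, embedSize), and processBatch bounds its loop with batch.Size() rather than s.batchSize, since the token and embedding batches can now have different capacities. A rough stand-in for the batch side, inferred from the call sites only (the real Batch wraps a llama.cpp C struct):

// Batch bookkeeping as implied by the new call sites. Sketch only.
type Batch struct {
	batchSize int // per-sequence capacity passed to NewBatch
	maxSeq    int // maximum number of parallel sequences
	embedSize int // 0 for token batches, embedding width otherwise
}

// NewBatch matches the argument order at the new call sites:
// NewBatch(batchSize, maxSeq, embedSize).
func NewBatch(batchSize, maxSeq, embedSize int) *Batch {
	return &Batch{batchSize: batchSize, maxSeq: maxSeq, embedSize: embedSize}
}

// Size reports the capacity, which lets processBatch respect a 1-element
// embedding batch instead of always comparing against s.batchSize.
func (b *Batch) Size() int {
	return b.batchSize
}

The batch.Add call also changes shape: the logits flag moves before the sequence ids, which appear to become variadic (seq.cache.Id replaces []int{seq.cache.Id}).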