chore: update mllama to use ollama engine (#10637)

Michael Yang
2025-05-13 17:36:02 -07:00
committed by GitHub
parent 0478d440f0
commit 23125648b8
67 changed files with 785 additions and 4354 deletions

View File

@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
-	"slices"
 	"sync"
 	"time"
@@ -18,8 +17,7 @@ type ImageContext struct {
 	// mu is required to be held when generating embeddings or accessing the cache
 	mu sync.Mutex
 
-	clip   *llama.ClipContext
-	mllama *llama.MllamaContext
+	clip *llama.ClipContext
 
 	// cache of images to embeddings
 	images []imageCache
@@ -35,8 +33,6 @@ func NewImageContext(llamaContext *llama.Context, modelPath string) (*ImageConte
 	var c ImageContext
 	if arch == "clip" {
 		c.clip, err = llama.NewClipContext(llamaContext, modelPath)
-	} else if arch == "mllama" {
-		c.mllama, err = llama.NewMllamaContext(llamaContext, modelPath)
 	} else {
 		return nil, fmt.Errorf("unknown vision model architecture: %s", arch)
 	}
@@ -58,12 +54,9 @@ func (c *ImageContext) Free(modelPath string) {
 	if c.clip != nil {
 		c.clip.Free()
 	}
-	if c.mllama != nil {
-		c.mllama.Free()
-	}
 }
 
-func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspectRatioId int) ([][]float32, error) {
+func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte) ([][]float32, error) {
 	if c == nil {
 		return nil, nil
 	}
@@ -79,12 +72,7 @@ func (c *ImageContext) NewEmbed(llamaContext *llama.Context, data []byte, aspect
 	embed, err := c.findImage(hash)
 	if err != nil {
-		if c.mllama != nil {
-			embed, err = c.mllama.NewEmbed(llamaContext, data, aspectRatioId)
-			if err != nil {
-				return nil, err
-			}
-		} else if c.clip != nil {
+		if c.clip != nil {
 			embed, err = c.clip.NewEmbed(llamaContext, data)
 			if err != nil {
 				return nil, err
@@ -105,33 +93,11 @@ func (c *ImageContext) BatchSize(configuredBatchSize int) int {
 		return 0
 	}
 
-	// Mllama maps an image to 1 embedding token (llava creates many tokens)
-	// and doesn't support more than a single image per request.
-	// The embeddings are large (100 MB), so allocating a big batch can fail
-	// on some systems
-	if c.mllama != nil {
-		return 1
-	}
-
 	return configuredBatchSize
 }
 
 func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
-	if c != nil && c.mllama != nil {
-		return c.mllama.EmbedSize(llamaContext)
-	} else {
-		return llamaContext.Model().NEmbd()
-	}
-}
-
-func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
-	if c == nil || c.mllama == nil {
-		return false
-	}
-
-	return slices.ContainsFunc(inputs, func(input input) bool {
-		return input.embed != nil
-	})
+	return llamaContext.Model().NEmbd()
 }
 
 type imageCache struct {
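
Taken together, the hunks above reduce ImageContext to a clip-only wrapper around a hash-keyed embedding cache: NewEmbed hashes the image bytes, returns a cached embedding on a hit, and otherwise calls the clip encoder. A minimal, self-contained sketch of that cache-then-encode flow follows; stubEncode, embedCache, and newEmbed are hypothetical stand-ins, not the runner's API (the real code calls llama.ClipContext.NewEmbed on a miss and stores imageCache entries under a mutex):

package main

import (
	"fmt"
	"hash/maphash"
)

// Hypothetical stand-ins for the cache flow sketched above.
var (
	seed       = maphash.MakeSeed()
	embedCache = map[uint64][]float32{}
)

// stubEncode pretends to be the expensive vision-encoder call.
func stubEncode(data []byte) []float32 {
	return []float32{float32(len(data))}
}

// newEmbed hashes the image bytes, serves a cached embedding if
// present, and otherwise encodes and caches the result.
func newEmbed(data []byte) []float32 {
	h := maphash.Bytes(seed, data)
	if e, ok := embedCache[h]; ok {
		return e // cache hit: skip the encoder
	}
	e := stubEncode(data)
	embedCache[h] = e
	return e
}

func main() {
	img := []byte("fake image bytes")
	fmt.Println(newEmbed(img)) // miss: runs the encoder
	fmt.Println(newEmbed(img)) // hit: served from the cache
}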

View File

@@ -57,10 +57,6 @@ type Sequence struct {
 	// input cache being used by this sequence
 	cache *InputCacheSlot
 
-	// does this sequence require cross-attention layers to be processed? - if we have seen
-	// an image for certain multi-modal models
-	crossAttention bool
-
 	// channel to send responses over
 	responses chan string
@@ -205,7 +201,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input, error)
 			return nil, fmt.Errorf("invalid image index: %d", n)
 		}
 
-		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data, images[imageIndex].AspectRatioID)
+		embed, err := s.image.NewEmbed(s.lc, images[imageIndex].Data)
 		if err != nil {
 			return nil, err
 		}
@@ -368,7 +364,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 	defer s.mu.Unlock()
 
 	var batch *llama.Batch
-	crossAttention := false
 
 	seqIdx := s.nextSeq - 1
 	for range s.seqs {
@@ -416,9 +411,8 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				batch = tokenBatch
 			} else {
 				batch = embedBatch
-				seq.crossAttention = s.image.NeedCrossAttention(input)
 			}
-		} else if embedding != batch.IsEmbedding() || crossAttention != seq.crossAttention {
+		} else if embedding != batch.IsEmbedding() {
 			s.nextSeq = seqIdx
 			break
 		}
@@ -427,7 +421,6 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 				break
 			}
 
-			crossAttention = seq.crossAttention
 			batch.Add(input.token, input.embed, len(seq.cache.Inputs)+len(seq.pendingInputs), i+1 == len(seq.inputs), seq.cache.Id)
 			seq.pendingInputs = append(seq.pendingInputs, input)
 			seq.iBatch = batch.NumTokens() - 1
@@ -440,20 +433,11 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		return nil
 	}
 
-	s.lc.SetCrossAttention(crossAttention)
-
 	err := s.lc.Decode(batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
 	}
 
-	if crossAttention {
-		// synchronize state to ensure the cross attention batch is complete.
-		// needed specifically for multi-GPU systems otherwise an inflight
-		// task may be incorrectly invalidated causing a crash
-		s.lc.Synchronize()
-	}
-
 	for i, seq := range s.seqs {
 		if seq == nil {
 			continue
@@ -622,8 +606,6 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 			return
 		}
 
-		seq.crossAttention = s.image.NeedCrossAttention(seq.cache.Inputs...)
-
 		s.seqs[i] = seq
 		s.cond.Signal()
 		found = true
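
With the crossAttention flag removed, the batch-compatibility rule in processBatch collapses to a single check: an input can join the in-flight batch only if its kind (token vs. embedding) matches the batch's kind. A small self-contained sketch of that rule; the input struct and canJoin below are simplified illustrations, not the runner's actual types:

package main

import "fmt"

// Simplified stand-in for the runner's input type: an entry carries
// either a token or an image embedding.
type input struct {
	token int
	embed []float32
}

// canJoin mirrors the simplified check left after this commit:
// embedding != batch.IsEmbedding() forces a sequence to wait for a
// batch of the matching kind.
func canJoin(batchIsEmbedding bool, in input) bool {
	embedding := in.embed != nil
	return embedding == batchIsEmbedding
}

func main() {
	tok := input{token: 42}
	img := input{embed: []float32{0.1, 0.2}}
	fmt.Println(canJoin(false, tok)) // true: token input joins a token batch
	fmt.Println(canJoin(false, img)) // false: embedding input must wait
}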