Multimodal support (#1216)

---------

Co-authored-by: Matt Apperson <mattapperson@Matts-MacBook-Pro.local>
This commit is contained in:
Patrick Devine
2023-12-11 13:56:22 -08:00
committed by GitHub
parent 7a1b37ac64
commit 910e9401d0
6 changed files with 235 additions and 28 deletions

View File

@@ -223,8 +223,14 @@ type Running struct {
*StatusWriter // captures error messages from the llama runner process
}
// ImageData is one image payload forwarded to the llama runner as part of a
// multimodal request (serialized under the "image_data" key — see Predict).
type ImageData struct {
	Data []byte `json:"data"` // raw image bytes
	ID   int    `json:"id"`   // identifier for this image; Predict assigns sequential loop indices
}
// llama bundles the state for a llama runner instance: the generation
// options, any images preset on the instance, and the embedded Running
// handle for the runner process.
type llama struct {
	api.Options
	ImageData []ImageData // images included in every Predict request, ahead of per-request images
	Running
}
@@ -547,6 +553,7 @@ const maxBufferSize = 512 * format.KiloByte
// PredictOpts carries the per-request inputs for a single Predict call.
type PredictOpts struct {
	Prompt string          // prompt text sent to the runner as "prompt"
	Format string          // output format; "json" switches the runner into JSON mode
	Images []api.ImageData // raw per-request images; each is wrapped in ImageData with a sequential ID
	// Checkpoint timestamps — semantics not visible in this diff;
	// presumably mark request start and model-loaded times. TODO confirm.
	CheckpointStart  time.Time
	CheckpointLoaded time.Time
}
@@ -564,6 +571,14 @@ type PredictResult struct {
}
func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
imageData := llm.ImageData
if len(predict.Images) > 0 {
for cnt, i := range predict.Images {
imageData = append(imageData, ImageData{Data: i, ID: cnt})
}
}
log.Printf("loaded %d images", len(imageData))
request := map[string]any{
"prompt": predict.Prompt,
"stream": true,
@@ -585,6 +600,7 @@ func (llm *llama) Predict(ctx context.Context, predict PredictOpts, fn func(Pred
"penalize_nl": llm.PenalizeNewline,
"seed": llm.Seed,
"stop": llm.Stop,
"image_data": imageData,
}
if predict.Format == "json" {