runner.go: Better abstract vision model integration

-Update mllama to take the cross attention state as embeddings in a batch, more similar to how Llava handles it. This improves integration with the input cache. -Pass locations in a prompt for embeddings using tags similar to Llava. -Abstract interface to vision models so the main runner accesses Clip and Mllama similarly Co-authored-by: Michael Yang <mxyng@pm.me>
2025-12-11 08:17:03 +00:00 · 2024-10-11 15:34:01 -07:00
parent 712e99d477
commit c826e57475
13 changed files with 534 additions and 454 deletions
--- a/server/prompt_test.go
+++ b/server/prompt_test.go
@@ -249,7 +249,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
 			},
 			expect: expect{
-				prompt:        "<|image|>How many hotdogs are in this image? ",
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},
@@ -264,7 +264,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
 			},
 			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
+				prompt:        "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},
@@ -279,8 +279,8 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
 			},
 			expect: expect{
-				prompt:        "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
-				images:        [][]byte{imgBuf2},
+				prompt:        "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
+				images:        [][]byte{imgBuf, imgBuf2},
 				aspectRatioID: 1,
 			},
 		},
@@ -294,7 +294,7 @@ func TestChatPrompt(t *testing.T) {
 				{Role: "user", Content: "Which ones have mustard?"},
 			},
 			expect: expect{
-				prompt:        "<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
+				prompt:        "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
 				images:        [][]byte{imgBuf},
 				aspectRatioID: 1,
 			},