runner.go: Better abstract vision model integration

-Update mllama to take the cross attention state as embeddings in
a batch, more similar to how Llava handles it. This improves
integration with the input cache.
-Pass locations in a prompt for embeddings using tags similar to Llava.
-Abstract interface to vision models so the main runner accesses Clip
and Mllama similarly

Co-authored-by: Michael Yang <mxyng@pm.me>
This commit is contained in:
Jesse Gross
2024-10-11 15:34:01 -07:00
committed by Jesse Gross
parent 712e99d477
commit c826e57475
13 changed files with 534 additions and 454 deletions

View File

@@ -249,7 +249,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "How many hotdogs are in this image?", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "<|image|>How many hotdogs are in this image? ",
prompt: "[img-0]<|image|>How many hotdogs are in this image? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
@@ -264,7 +264,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf}},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
prompt: "You're a test, Harry! I-I'm a what? [img-0]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},
@@ -279,8 +279,8 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "A test. And a thumping good one at that, I'd wager.", Images: []api.ImageData{imgBuf2}},
},
expect: expect{
prompt: "You're a test, Harry! I-I'm a what? <|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf2},
prompt: "[img-0]<|image|>You're a test, Harry! I-I'm a what? [img-1]<|image|>A test. And a thumping good one at that, I'd wager. ",
images: [][]byte{imgBuf, imgBuf2},
aspectRatioID: 1,
},
},
@@ -294,7 +294,7 @@ func TestChatPrompt(t *testing.T) {
{Role: "user", Content: "Which ones have mustard?"},
},
expect: expect{
prompt: "<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
prompt: "[img-0]<|image|>How many hotdogs are in this image? There are four hotdogs. Which ones have mustard? ",
images: [][]byte{imgBuf},
aspectRatioID: 1,
},