runner.go: Better abstract vision model integration

-Update mllama to take the cross attention state as embeddings in
a batch, more similar to how Llava handles it. This improves
integration with the input cache.
-Pass locations in a prompt for embeddings using tags similar to Llava.
-Abstract interface to vision models so the main runner accesses Clip
and Mllama similarly

Co-authored-by: Michael Yang <mxyng@pm.me>
This commit is contained in:
Jesse Gross
2024-10-11 15:34:01 -07:00
committed by Jesse Gross
parent 712e99d477
commit c826e57475
13 changed files with 534 additions and 454 deletions

View File

@@ -2,7 +2,6 @@ package main
import (
"errors"
"hash/maphash"
"log/slog"
"reflect"
"time"
@@ -20,10 +19,6 @@ type InputCache struct {
// optimize cache eviction for multiple users
multiUserCache bool
// cache of images to embeddings
images []imageCache
imageHash maphash.Hash
lc *llama.Context
}
@@ -41,7 +36,6 @@ func NewInputCache(lc *llama.Context, kvSize int, numSlots int, multiUserCache b
numCtx: kvSize / numSlots,
slots: slots,
multiUserCache: multiUserCache,
images: make([]imageCache, numSlots),
lc: lc,
}
}
@@ -211,55 +205,3 @@ func (c *InputCache) ShiftCacheSlot(slot *InputCacheSlot, numKeep int, numDiscar
}
slot.Inputs = slot.Inputs[:len(slot.Inputs)-numDiscard]
}
// Locking: Lookup and store operations on imageCache require a lock
// to be held that serializes these with each other. Hash does not
// require a lock nor they need to be serialized with InputCacheSlot.
type imageCache struct {
key uint64
val [][]float32
lastUsed time.Time
}
func (c *InputCache) HashImage(image []byte) uint64 {
c.imageHash.Reset()
_, _ = c.imageHash.Write(image)
return c.imageHash.Sum64()
}
var ErrImageNotFound = errors.New("image not found in cache")
func (c *InputCache) FindImage(hash uint64) ([][]float32, error) {
for i := range c.images {
if c.images[i].key == hash {
slog.Debug("loading image embeddings from cache", "entry", i)
c.images[i].lastUsed = time.Now()
return c.images[i].val, nil
}
}
return nil, ErrImageNotFound
}
func (c *InputCache) AddImage(hash uint64, embed [][]float32) {
best := time.Now()
var bestImage int
for i := range c.images {
if c.images[i].key == hash {
bestImage = i
break
}
if c.images[i].lastUsed.Compare(best) < 0 {
best = c.images[i].lastUsed
bestImage = i
}
}
slog.Debug("storing image embeddings in cache", "entry", bestImage, "used", c.images[bestImage].lastUsed)
c.images[bestImage].key = hash
c.images[bestImage].val = embed
c.images[bestImage].lastUsed = time.Now()
}