runner.go: Don't set cross attention before sending embeddings

Currently, if an input has embeddings at any point, we set cross
attention to true from the beginning. This means that any tokens
sent before the embeddings will incorrectly have cross attention
layers applied.

This change sets cross attention only when we have an embedding,
either from earlier in this sequence or from the cache. It also makes
cross attention capable of supporting parallelism at the runner level,
though the mllama implementation doesn't support that yet.
This commit is contained in:
Jesse Gross
2024-10-31 10:55:31 -07:00
committed by Jesse Gross
parent 921779bb10
commit 26acdcf44e
2 changed files with 23 additions and 9 deletions

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"hash/maphash"
"log/slog"
"slices"
"sync"
"time"
@@ -96,6 +97,16 @@ func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
}
}
// NeedCrossAttention reports whether cross attention is needed for the
// given inputs. It returns true only when c is non-nil, c.mllama is set,
// and at least one of inputs has a non-nil embed. Calling it on a nil
// receiver is safe and returns false.
func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
	// Check c before touching c.mllama so a nil receiver never dereferences.
	mllamaLoaded := c != nil && c.mllama != nil
	if !mllamaLoaded {
		return false
	}

	hasEmbed := func(in input) bool { return in.embed != nil }
	return slices.ContainsFunc(inputs, hasEmbed)
}
type imageCache struct {
key uint64
val [][]float32