runner.go: Don't set cross attention before sending embeddings
Currently, if an input has embeddings at any point, we set cross attention to true from the beginning. This means that any tokens sent before the embeddings will incorrectly have cross attention layers applied. This change only sets cross attention once we have an embedding, either earlier in this sequence or in the cache. It also makes cross attention capable of supporting parallelism at the runner level, though the mllama implementation doesn't support that yet.
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"hash/maphash"
 	"log/slog"
+	"slices"
 	"sync"
 	"time"
 
@@ -96,6 +97,16 @@ func (c *ImageContext) EmbedSize(llamaContext *llama.Context) int {
 	}
 }
 
+func (c *ImageContext) NeedCrossAttention(inputs ...input) bool {
+	if c == nil || c.mllama == nil {
+		return false
+	}
+
+	return slices.ContainsFunc(inputs, func(input input) bool {
+		return input.embed != nil
+	})
+}
+
 type imageCache struct {
 	key uint64
 	val [][]float32
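To illustrate the check the new helper performs, here is a minimal, self-contained sketch. The `input` struct below is a simplified stand-in for the runner's type (only the `embed` field matters here); everything outside the diff above is an assumption, not the actual runner code.

package main

import (
	"fmt"
	"slices"
)

// input is a simplified stand-in for the runner's per-token unit:
// either a text token or an image embedding.
type input struct {
	token int
	embed []float32
}

// needCrossAttention reports whether any pending input carries an image
// embedding, mirroring ImageContext.NeedCrossAttention from the diff.
func needCrossAttention(inputs ...input) bool {
	return slices.ContainsFunc(inputs, func(in input) bool {
		return in.embed != nil
	})
}

func main() {
	textOnly := []input{{token: 1}, {token: 2}}
	withImage := []input{{token: 1}, {embed: []float32{0.1, 0.2}}}

	// Before any embedding is seen, cross attention stays off.
	fmt.Println(needCrossAttention(textOnly...)) // false

	// Once a batch (or, in the runner, the cache) contains an embedding, it turns on.
	fmt.Println(needCrossAttention(withImage...)) // true
}

In the runner, this per-batch check would be combined with whatever embedding state is already in the cache, so cross attention only activates once an embedding has actually been seen.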