runner.go: Only allocate 1 element embedding batches for mllama
Mllama has large embeddings (100 MB per image) and each embedding is represented as 1 token when passed to llama.cpp. Batches are pre-allocated for the size of the tokens times the batch size, so this results in allocations of over 50 GB at the default batch size. On some systems, these mallocs will fail.

Since an image is represented as a single token and mllama doesn't support more than 1 image per request, we only need to allocate a batch size of 1, which is much more reasonable. In addition, for non-multimodal models, we don't need to allocate the embedding batches at all.

Fixes #7464
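To make the arithmetic in the message concrete, here is a minimal Go sketch of the before/after embedding-batch sizes. The 100 MB-per-image figure comes from the message; the batch size of 512 is an assumption, chosen because it is consistent with the "over 50 GB" figure:

package main

import "fmt"

func main() {
	const embedBytesPerImage = 100 * 1024 * 1024 // ~100 MB per image embedding (from the commit message)
	const assumedBatchSize = 512                 // assumed default n_batch; consistent with "over 50 GB"

	// Before: the embedding batch was pre-allocated for a full batch of
	// tokens, even though mllama emits one embedding token per image.
	before := embedBytesPerImage * assumedBatchSize
	fmt.Printf("before: %.1f GB\n", float64(before)/1e9) // ~53.7 GB

	// After: mllama supports at most one image per request, so a
	// 1-element embedding batch is enough.
	after := embedBytesPerImage * 1
	fmt.Printf("after:  %.1f GB\n", float64(after)/1e9) // ~0.1 GB
}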
@@ -211,6 +211,7 @@ type Server struct {
 	// required for image embeddings
 	image *ImageContext
 
+	// TODO (jmorganca): make this n_batch
 	batchSize int
 
 	// parallel is the number of parallel requests to handle
@@ -302,13 +303,19 @@ func (s *Server) removeSequence(seqIndex int, reason string) {
 
 func (s *Server) run(ctx context.Context) {
 	s.ready.Wait()
 
-	// logically these batches are used only within the context of processBatch
+	// Logically these batches are used only within the context of processBatch
 	// but it is better for performance to allocate them once here
-	tokenBatch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
+	tokenBatch := llama.NewBatch(s.batchSize, len(s.seqs), 0)
 	defer tokenBatch.Free()
 
-	embedBatch := llama.NewBatch(s.batchSize*len(s.seqs), s.image.EmbedSize(s.lc), len(s.seqs))
-	defer embedBatch.Free()
+	var embedBatch *llama.Batch
+	embedBatchSize := s.image.BatchSize(s.batchSize)
+	if embedBatchSize != 0 {
+		embedBatch = llama.NewBatch(embedBatchSize, len(s.seqs), s.image.EmbedSize(s.lc))
+		defer embedBatch.Free()
+	} else {
+		embedBatch = &llama.Batch{}
+	}
 
 	for {
 		select {
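The new s.image.BatchSize helper is introduced elsewhere in this commit and is not part of this excerpt. Based on the commit message, its behavior is presumably along these lines (a sketch, not the actual source; the mllama field name is a guess):

// BatchSize reports how many elements the embedding batch needs.
// Sketch only: inferred from the commit message, not copied from the commit.
func (c *ImageContext) BatchSize(configuredBatchSize int) int {
	// Non-multimodal models never receive image embeddings, so no
	// embedding batch needs to be allocated at all.
	if c == nil {
		return 0
	}
	// Mllama represents an image as a single (very large) embedding token
	// and supports at most one image per request, so one element suffices.
	if c.mllama != nil { // hypothetical field marking mllama models
		return 1
	}
	return configuredBatchSize
}

The else branch above (embedBatch = &llama.Batch{}) then hands processBatch a zero-capacity placeholder instead of a nil pointer, so the text-only path needs no special casing.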
@@ -378,13 +385,12 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			break
 		}
 
-		// todo: make this n_batch
-		if i >= s.batchSize {
+		if i >= batch.Size() {
 			break
 		}
 
 		crossAttention = seq.crossAttention
-		batch.Add(input.token, input.embed, seq.numPast, []int{seq.cache.Id}, numInputsProcessed+1 == len(seq.inputs))
+		batch.Add(input.token, input.embed, seq.numPast, numInputsProcessed+1 == len(seq.inputs), seq.cache.Id)
 		seq.numPast++
 		numInputsProcessed++
 	}
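Two smaller API changes ride along in this hunk and the one above: llama.NewBatch's parameter order becomes (batchSize, maxSeq, embedSize), and processBatch bounds its loop with batch.Size() rather than s.batchSize, since the token and embedding batches can now have different capacities. A rough stand-in for the batch side, inferred from the call sites only (the real Batch wraps a llama.cpp C struct):

// Batch bookkeeping as implied by the new call sites. Sketch only.
type Batch struct {
	batchSize int // per-sequence capacity passed to NewBatch
	maxSeq    int // maximum number of parallel sequences
	embedSize int // 0 for token batches, embedding width otherwise
}

// NewBatch matches the argument order at the new call sites:
// NewBatch(batchSize, maxSeq, embedSize).
func NewBatch(batchSize, maxSeq, embedSize int) *Batch {
	return &Batch{batchSize: batchSize, maxSeq: maxSeq, embedSize: embedSize}
}

// Size reports the capacity, which lets processBatch respect a 1-element
// embedding batch instead of always comparing against s.batchSize.
func (b *Batch) Size() int {
	return b.batchSize
}

The batch.Add call also changes shape: the logits flag moves before the sequence ids, which appear to become variadic (seq.cache.Id replaces []int{seq.cache.Id}).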