model: add Qwen2.5-VL support (#10385)

This commit is contained in:
Bruce MacDonald
2025-05-13 20:58:02 -07:00
committed by GitHub
parent 23125648b8
commit 0aa8b371dd
16 changed files with 1619 additions and 10 deletions

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"log/slog"
"math"
"slices"
"strings"
@@ -126,6 +127,7 @@ func (kv KV) OllamaEngineRequired() bool {
"mistral3",
"llama4",
"mllama",
"qwen25vl",
}, kv.Architecture())
}
@@ -649,6 +651,29 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
graphSize = 4 * (imageSize*imageSize*numChannels +
embeddingLength*patchSize +
numPatches*numPatches*headCount)
case "qwen25vl":
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
temporalPatchSize := uint64(2)
// Calculate max possible patches based on max_pixels
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
maxWidth := maxPixels / maxHeight
maxGridHeight := maxHeight / patchSize
maxGridWidth := maxWidth / patchSize
// Account for merged patches (2x2 grid)
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
// Calculate graph size based on typical operations in ProcessImage and createPatches
graphSize = 4 * (maxPixels*numChannels + // Original image storage
// Normalized pixels
maxPixels*numChannels +
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
// Self-attention calculations (similar to other architectures)
numPatches*numPatches*headCount +
// Additional buffer for processing
embeddingLength*numPatches)
case "llama4":
// vision graph is computed independently in the same schedule
// and is negligible compared to the worst case text graph