Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support

This commit represents a complete rework after pulling the latest changes from the
official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in the "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Due to the GCC 10.5 limitation, newer CPU optimizations are sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (AVX_VNNI requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: roughly 3-7% on newer CPUs (an acceptable trade-off for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress nvcc warnings about the deprecated compute 3.7 target
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged latest llama.cpp changes including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which official Ollama dropped
because of its legacy driver/CUDA requirements. The toolchain forms a chain of
constraints:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → no AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to enable running modern
LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Shang Chieh Tseng
2025-11-05 14:03:05 +08:00
commit ef14fb5b26 (parent fabe2c5cb7)
817 changed files with 241634 additions and 70888 deletions


@@ -0,0 +1,196 @@
package qwen3vl
import (
"fmt"
"image"
"math"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/imageproc"
)
// ImageProcessor contains configuration for the Qwen 3 VL image processing
type ImageProcessor struct {
numChannels int
patchSize int
temporalPatchSize int
mergeSize int
shortestEdge int
longestEdge int
factor int
rescaleFactor float32
imageMean []float32
imageStd []float32
}
// newImageProcessor creates a new image processor with default values
func newImageProcessor(c fs.Config) ImageProcessor {
patchSize := int(c.Uint("vision.patch_size", 14))
mergeSize := int(c.Uint("vision.spatial_merge_size", 2))
return ImageProcessor{
numChannels: int(c.Uint("vision.num_channels", 3)), // not set in the model config; defaults to 3
patchSize: patchSize,
temporalPatchSize: 2,
mergeSize: mergeSize,
shortestEdge: int(c.Uint("vision.shortest_edge", 64<<10)),
// FIXME(mxyng): the model defined longest edge (16M) is too large for the default
// context length of 8K and will panic. Adjusting to 2M for now.
// longestEdge: int(c.Uint("vision.longest_edge", 16<<20)),
longestEdge: 2 << 20,
factor: patchSize * mergeSize,
rescaleFactor: 1.0 / 255.0,
imageMean: c.Floats("vision.image_mean", imageproc.ImageNetStandardMean[:]),
imageStd: c.Floats("vision.image_std", imageproc.ImageNetStandardSTD[:]),
}
}
// SmartResize implements the smart resize algorithm
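// It rounds height and width to the nearest multiples of factor (patchSize*mergeSize),
// then rescales so the total pixel count falls within [shortestEdge, longestEdge]
// while approximately preserving the aspect ratio.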
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
factor := p.factor
if height < factor || width < factor {
panic(fmt.Sprintf("height:%d or width:%d must be larger than factor:%d", height, width, factor))
} else if aspectRatio := max(height, width) / min(height, width); aspectRatio > 200 {
panic(fmt.Sprintf("absolute aspect ratio must be smaller than 200, got %v", aspectRatio))
}
round := func(x float64) int { return int(math.RoundToEven(x)) }
hBar := round(float64(height)/float64(factor)) * factor
wBar := round(float64(width)/float64(factor)) * factor
if hBar*wBar > p.longestEdge {
beta := math.Sqrt(float64(height*width) / float64(p.longestEdge))
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
} else if hBar*wBar < p.shortestEdge {
beta := math.Sqrt(float64(p.shortestEdge) / float64(height*width))
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
}
return hBar, wBar
}
type Grid struct {
Height int
Width int
Temporal int
}
func (p *ImageProcessor) ProcessImage(ctx ml.Context, img image.Image) (ml.Tensor, *Grid, error) {
img = imageproc.Composite(img)
origWidth := img.Bounds().Dx()
origHeight := img.Bounds().Dy()
// Calculate smart resize dimensions
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
// Resize image using existing functions
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeBilinear)
normalizedPixels := imageproc.Normalize(
resizedImg,
[3]float32{p.imageMean[0], p.imageMean[1], p.imageMean[2]},
[3]float32{p.imageStd[0], p.imageStd[1], p.imageStd[2]},
true, // rescale
true, // channelFirst
)
// Calculate grid dimensions
grid := &Grid{
Height: resizedHeight / p.patchSize,
Width: resizedWidth / p.patchSize,
Temporal: 1, // For single images, temporal dimension is 1
}
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
if err != nil {
return nil, nil, fmt.Errorf("failed to create patches: %v", err)
}
patchDim := p.numChannels * p.temporalPatchSize *
p.patchSize * p.patchSize
numPatches := grid.Temporal * grid.Height * grid.Width
pixelValues := ctx.Input().FromFloats(patches, patchDim, numPatches)
// Return patches and grid dimensions
return pixelValues, grid, nil
}
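// createPatches rearranges the normalized CHW pixel data into flat patch vectors.
// Patches are emitted in mergeSize x mergeSize groups so spatially adjacent patches
// stay contiguous for the patch merger; each patch is laid out as
// [channels][temporalPatchSize][patchSize][patchSize], with the single input frame
// duplicated across the temporal dimension.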
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
channels := p.numChannels
patchSize := p.patchSize
mergeSize := p.mergeSize
temporalPatchSize := p.temporalPatchSize
// Calculate output dimensions
numPatches := grid.Temporal * grid.Height * grid.Width
patchDim := channels * temporalPatchSize * patchSize * patchSize
result := make([]float32, numPatches*patchDim)
patchIndex := 0
// Single temporal frame handling (copies to all frames)
for range grid.Temporal {
for h := 0; h < grid.Height; h += mergeSize {
for w := 0; w < grid.Width; w += mergeSize {
// Handle the 2x2 merged patches
for mh := range mergeSize {
for mw := range mergeSize {
baseOffset := patchIndex * patchDim
// Extract patch data for first temporal frame
for c := range channels {
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
for py := range patchSize {
for px := range patchSize {
// Calculate source pixel coordinates
y := (h+mh)*patchSize + py
x := (w+mw)*patchSize + px
// Source index in input tensor (CHW format)
srcIdx := c*height*width + y*width + x
// Destination index in first temporal frame
dstIdx := channelOffset + (py * patchSize) + px
if srcIdx < len(pixels) && dstIdx < len(result) {
result[dstIdx] = pixels[srcIdx]
}
}
}
}
// Copy first temporal frame to all other frames
if temporalPatchSize > 1 {
for c := range channels {
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
firstFrameOffset := channelOffset
frameSize := patchSize * patchSize
// Copy first frame to all other frames
for tp := 1; tp < temporalPatchSize; tp++ {
currentFrameOffset := channelOffset + (tp * frameSize)
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
result[firstFrameOffset:firstFrameOffset+frameSize])
}
}
}
patchIndex++
}
}
}
}
}
return result, nil
}


@@ -0,0 +1,206 @@
package qwen3vl
import (
"bytes"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
type Model struct {
model.Base
model.TextProcessor
*TextModel
*VisionModel `gguf:"v"`
ImageProcessor
positionCache []int32
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
img, _, err := image.Decode(bytes.NewReader(multimodalData))
if err != nil {
return nil, err
}
pixelValues, grid, err := m.ProcessImage(ctx, img)
if err != nil {
return nil, err
}
// Calculate tensor dimensions
visionOutputs, deepstackVisualEmbeds := m.VisionModel.Forward(ctx, pixelValues, grid)
mm := []input.Multimodal{{Tensor: visionOutputs, Data: grid}}
for i := range deepstackVisualEmbeds {
mm = append(mm, input.Multimodal{Tensor: deepstackVisualEmbeds[i]})
}
return mm, nil
}
var (
tokenVision int32 = 151655
tokenVisionStart int32 = 151652
tokenVisionEnd int32 = 151653
)
type modelInput struct {
*input.Input
position int32
}
// PostTokenize arranges Qwen 3 VL's inputs for the forward pass
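// For each image it emits a vision start token, one vision placeholder token per
// column of the vision embedding tensor (the first of these carries the tensor and
// marks the batch span via SameBatch), and a vision end token. The positions chosen
// here are cached so Forward can build mrope positions; plain text tokens continue
// sequentially from the last cached position.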
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
m.positionCache = m.positionCache[:0]
return slices.Collect(func(yield func(*input.Input) bool) {
for i := range inputs {
s := []modelInput{{Input: inputs[i]}}
if mm := inputs[i].Multimodal; mm != nil {
t := mm[0].Tensor
s = slices.Repeat([]modelInput{
{
position: int32(i + 1),
Input: &input.Input{Token: tokenVision},
},
}, t.Dim(1)+1+1)
s[0] = modelInput{
Input: &input.Input{Token: tokenVisionStart},
position: int32(i),
}
s[len(s)-1] = modelInput{
Input: &input.Input{Token: tokenVisionEnd},
position: int32(i + mm[0].Data.(*Grid).Width/m.spatialMergeSize + 1),
}
s[1] = modelInput{
Input: &input.Input{
Token: tokenVision,
Multimodal: inputs[i].Multimodal,
MultimodalHash: inputs[i].MultimodalHash,
SameBatch: t.Dim(1),
},
position: int32(i + 1),
}
}
for _, e := range s {
position := e.position
if position == 0 && len(m.positionCache) > 0 {
position = m.positionCache[len(m.positionCache)-1] + 1
}
m.positionCache = append(m.positionCache, position)
if !yield(e.Input) {
return
}
}
}
}), nil
}
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
// ggml mrope requires 4 positions per token: [time, height, width, extra]
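// Text tokens use the same cached position for the time, height, and width channels;
// image tokens get per-row and per-column offsets added to the height and width
// channels below. Positions beyond the cache continue sequentially from its last entry.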
positionSlice := slices.Collect(makeSlice2D[int32](4, len(batch.Positions)))
for i, id := range batch.Positions {
if id < int32(len(m.positionCache)) {
id = m.positionCache[id]
} else if len(m.positionCache) > 0 {
id = id - int32(len(m.positionCache)) + m.positionCache[len(m.positionCache)-1] + 1
}
positionSlice[0][i] = id
positionSlice[1][i] = id
positionSlice[2][i] = id
// positionSlice[3] is intentionally left as zeros
}
hiddenStates := m.TextModel.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
var deepstackVisualEmbeds []ml.Tensor
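// Copy each image's vision embeddings over its placeholder token embeddings at the
// image's batch offset, and stage the deepstack embeddings in zero tensors so they
// can be added to the hidden states after the corresponding early decoder layers.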
for _, mi := range batch.Multimodal {
visionOutputs := mi.Multimodal[0].Tensor
ctx.Forward(visionOutputs.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
for i := range visionOutputs.Dim(1) {
w := grid.Width / m.spatialMergeSize
positionSlice[1][mi.Index+i] += int32(i / w)
positionSlice[2][mi.Index+i] += int32(i % w)
}
}
deepstackVisualEmbeds = make([]ml.Tensor, len(mi.Multimodal[1:]))
for i, mm := range mi.Multimodal[1:] {
deepstackVisualEmbeds[i] = ctx.Input().Zeros(mm.Tensor.DType(), hiddenStates.Shape()...)
ctx.Forward(mm.Tensor.Copy(ctx, deepstackVisualEmbeds[i].View(ctx, mi.Index*deepstackVisualEmbeds[i].Stride(1), mm.Tensor.Dim(0)*mm.Tensor.Dim(1))))
}
}
positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
for i, layer := range m.TextModel.Layers {
if m.Cache != nil {
m.Cache.SetLayer(i)
}
var outputs ml.Tensor
if i == len(m.TextModel.Layers)-1 {
outputs = batch.Outputs
}
hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
if i < len(deepstackVisualEmbeds) {
hiddenStates = hiddenStates.Add(ctx, deepstackVisualEmbeds[i])
}
}
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, 1e-06)
return m.Output.Forward(ctx, hiddenStates), nil
}
func New(c fs.Config) (model.Model, error) {
m := Model{
TextProcessor: model.NewBytePairEncoding(
&model.Vocabulary{
Values: c.Strings("tokenizer.ggml.tokens"),
Types: c.Ints("tokenizer.ggml.token_type"),
Merges: c.Strings("tokenizer.ggml.merges"),
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
EOS: append(
[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
c.Ints("tokenizer.ggml.eos_token_ids")...,
),
},
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
),
TextModel: newTextModel(c),
VisionModel: newVisionModel(c),
ImageProcessor: newImageProcessor(c),
}
m.Cache = kvcache.NewCausalCache(func(ctx ml.Context, layer int, key, positions ml.Tensor) (ml.Tensor, error) {
m.positionCache = nil
positions = positions.Repeat(ctx, 1, 4).Reshape(ctx, -1)
return m.Options.applyRotaryPositionalEmbedding(ctx, key, positions), nil
})
return &m, nil
}
func init() {
model.Register("qwen3vl", New)
model.Register("qwen3vlmoe", New)
}


@@ -0,0 +1,199 @@
package qwen3vl
import (
"cmp"
"math"
"slices"
"strings"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/ml/nn/fast"
"github.com/ollama/ollama/ml/nn/rope"
"github.com/ollama/ollama/model"
)
type TextOptions struct {
hiddenSize,
numHeads,
numKVHeads,
keyLength,
valueLength int
eps,
ropeBase,
ropeScale float32
mropeSections []int
numExperts, numExpertsUsed int
normTopKProb bool
}
func (o TextOptions) headDim() int {
return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
}
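// applyRotaryPositionalEmbedding applies multimodal RoPE (mrope): the rotary
// dimensions are partitioned by mropeSections across the time, height, and width
// position channels supplied in p.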
func (o TextOptions) applyRotaryPositionalEmbedding(ctx ml.Context, t, p ml.Tensor) ml.Tensor {
return fast.RoPE(ctx, t, p, o.headDim(), o.ropeBase, 1/float32(math.Sqrt(float64(o.ropeScale))),
rope.WithMRoPESections(o.mropeSections),
)
}
type TextAttention struct {
Query *nn.Linear `gguf:"attn_q"`
QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
Key *nn.Linear `gguf:"attn_k"`
KeyNorm *nn.RMSNorm `gguf:"attn_k_norm"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_output"`
}
func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
batchSize := hiddenStates.Dim(1)
query := sa.Query.Forward(ctx, hiddenStates)
key := sa.Key.Forward(ctx, hiddenStates)
value := sa.Value.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
query = sa.QueryNorm.Forward(ctx, query, opts.eps)
key = sa.KeyNorm.Forward(ctx, key, opts.eps)
query = opts.applyRotaryPositionalEmbedding(ctx, query, positions)
key = opts.applyRotaryPositionalEmbedding(ctx, key, positions)
attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
return sa.Output.Forward(ctx, attention)
}
type TextMLP interface {
Forward(ml.Context, ml.Tensor, *TextOptions) ml.Tensor
}
type sparse struct {
Router *nn.Linear `gguf:"ffn_gate_inp"`
Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
}
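// Forward implements the sparse MoE MLP: the router scores every expert, the top-k
// experts run as a batched SILU-gated MLP, and their outputs are combined using the
// (optionally renormalized) routing weights.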
func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextOptions) ml.Tensor {
hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
routerLogits := mlp.Router.Forward(ctx, hiddenStates)
routingWeights := routerLogits.Softmax(ctx)
selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
if opts.normTopKProb {
routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
}
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates, selectedExperts).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates, selectedExperts))
experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
experts = experts.Mul(ctx, routingWeights)
nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
for i := 1; i < opts.numExpertsUsed; i++ {
nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
}
return nextStates
}
type dense struct {
Gate *nn.Linear `gguf:"ffn_gate"`
Up *nn.Linear `gguf:"ffn_up"`
Down *nn.Linear `gguf:"ffn_down"`
}
func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *TextOptions) ml.Tensor {
hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
return mlp.Down.Forward(ctx, hiddenStates)
}
type TextLayer struct {
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
*TextAttention
MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
TextMLP
}
func (d *TextLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
residual := hiddenStates
hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.TextAttention.Forward(ctx, hiddenStates, positions, cache, opts)
if outputs != nil {
hiddenStates = hiddenStates.Rows(ctx, outputs)
residual = residual.Rows(ctx, outputs)
}
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = d.TextMLP.Forward(ctx, hiddenStates, opts)
return hiddenStates.Add(ctx, residual)
}
type TextModel struct {
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
Output *nn.Linear `gguf:"output,alt:token_embd"`
Layers []TextLayer `gguf:"blk"`
Options *TextOptions
}
var _ model.Model = (*Model)(nil)
func newTextModel(c fs.Config) *TextModel {
layers := make([]TextLayer, c.Uint("block_count"))
for i := range layers {
if strings.HasSuffix(c.String("general.architecture"), "moe") {
layers[i].TextMLP = &sparse{}
} else {
layers[i].TextMLP = &dense{}
}
}
m := TextModel{
Layers: layers,
Options: &TextOptions{
hiddenSize: int(c.Uint("embedding_length")),
numHeads: int(c.Uint("attention.head_count")),
numKVHeads: int(c.Uint("attention.head_count_kv")),
keyLength: int(c.Uint("attention.key_length")),
valueLength: int(c.Uint("attention.value_length")),
eps: c.Float("attention.layer_norm_rms_epsilon"),
ropeBase: c.Float("rope.freq_base"),
ropeScale: c.Float("rope.scaling.factor", 1),
numExperts: int(c.Uint("expert_count")),
numExpertsUsed: int(c.Uint("expert_used_count")),
normTopKProb: c.Bool("norm_top_k_prob", true),
mropeSections: slices.Collect(func(yield func(int) bool) {
for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
if !yield(int(section)) {
return
}
}
}),
},
}
return &m
}


@@ -0,0 +1,268 @@
package qwen3vl
import (
"iter"
"math"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
)
type VisionAttention struct {
Query *nn.Linear `gguf:"attn_q"`
Key *nn.Linear `gguf:"attn_k"`
Value *nn.Linear `gguf:"attn_v"`
Output *nn.Linear `gguf:"attn_out"`
}
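// rotateHalf and applyRotaryPositionalEmbedding implement the rotate-half form of
// RoPE used by the vision tower: x*cos + rotateHalf(x)*sin, where rotateHalf negates
// the second half of the head dimension and swaps it with the first half.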
func rotateHalf(ctx ml.Context, t ml.Tensor) ml.Tensor {
x1 := t.View(ctx, 0, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3))
x2 := t.View(ctx, t.Stride(0)*t.Dim(0)/2, t.Dim(0)/2, t.Stride(1), t.Dim(1), t.Stride(2), t.Dim(2), t.Stride(3), t.Dim(3)).Contiguous(ctx)
return x2.Scale(ctx, -1).Concat(ctx, x1, 0)
}
func applyRotaryPositionalEmbedding(ctx ml.Context, t, cos, sin ml.Tensor) ml.Tensor {
return t.Mul(ctx, cos).Add(ctx, rotateHalf(ctx, t).Mul(ctx, sin))
}
func (sa *VisionAttention) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
query := sa.Query.Forward(ctx, hiddenStates)
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, query.Dim(1))
query = applyRotaryPositionalEmbedding(ctx, query, cos, sin)
key := sa.Key.Forward(ctx, hiddenStates)
key = key.Reshape(ctx, opts.headDim(), opts.numHeads, key.Dim(1))
key = applyRotaryPositionalEmbedding(ctx, key, cos, sin)
value := sa.Value.Forward(ctx, hiddenStates)
value = value.Reshape(ctx, opts.headDim(), opts.numHeads, value.Dim(1))
attention := nn.Attention(ctx, query, key, value, math.Pow(float64(opts.headDim()), -0.5), nil)
attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2))
return sa.Output.Forward(ctx, attention)
}
type VisionMLP struct {
FC1 *nn.Linear `gguf:"linear_fc1"`
FC2 *nn.Linear `gguf:"linear_fc2"`
}
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts VisionOptions) ml.Tensor {
return mlp.FC2.Forward(ctx, mlp.FC1.Forward(ctx, hiddenStates).GELU(ctx))
}
type VisionEncoderLayer struct {
Norm1 *nn.LayerNorm `gguf:"norm1"`
Attention *VisionAttention
Norm2 *nn.LayerNorm `gguf:"norm2"`
MLP *VisionMLP `gguf:"mlp"`
}
func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenStates, cos, sin ml.Tensor, opts VisionOptions) ml.Tensor {
residual := hiddenStates
hiddenStates = e.Norm1.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.Attention.Forward(ctx, hiddenStates, cos, sin, opts)
hiddenStates = hiddenStates.Add(ctx, residual)
residual = hiddenStates
hiddenStates = e.Norm2.Forward(ctx, hiddenStates, opts.eps)
hiddenStates = e.MLP.Forward(ctx, hiddenStates, opts)
return hiddenStates.Add(ctx, residual)
}
type VisionOptions struct {
hiddenSize,
numHeads,
patchSize,
numChannels,
spatialMergeSize,
temporalPatchSize,
gridPerSide int
eps,
ropeTheta float32
deepstackVisualIndexes []int32
mropeSections []int
}
func (o VisionOptions) headDim() int {
return o.hiddenSize / o.numHeads
}
type VisionPatchMerger struct {
Norm *nn.LayerNorm `gguf:"norm"`
FC1 *nn.Linear `gguf:"linear_fc1"`
FC2 *nn.Linear `gguf:"linear_fc2"`
}
func (m *VisionPatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor, postshuffleNorm bool, opts VisionOptions) ml.Tensor {
hiddenSize := opts.hiddenSize * opts.spatialMergeSize * opts.spatialMergeSize
if postshuffleNorm {
visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
}
visionOutputs = m.Norm.Forward(ctx, visionOutputs, opts.eps)
visionOutputs = visionOutputs.Reshape(ctx, hiddenSize, -1)
return m.FC2.Forward(ctx, m.FC1.Forward(ctx, visionOutputs).GELU(ctx))
}
type VisionPositionEmbedding struct {
PositionEmbedding *nn.Embedding `gguf:"pos_embed"`
}
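// makeSlice2D yields n0 freshly allocated slices of length n1; collected with
// slices.Collect it produces an n0 x n1 buffer.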
func makeSlice2D[T int32 | float32](n0, n1 int) iter.Seq[[]T] {
return func(yield func([]T) bool) {
for range n0 {
if !yield(make([]T, n1)) {
return
}
}
}
}
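// Forward bilinearly interpolates the learned gridPerSide x gridPerSide position
// embedding table onto the image's patch grid: for each patch it gathers the four
// nearest table entries, weights them by their fractional distances, sums them, and
// reorders the result into spatial-merge order before adding it to the hidden states.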
func (m *VisionPositionEmbedding) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts VisionOptions) ml.Tensor {
indexSlice := slices.Collect(makeSlice2D[int32](4, grid.Height*grid.Width))
weightSlice := slices.Collect(makeSlice2D[float32](4, grid.Height*grid.Width))
stepHeight := float32(opts.gridPerSide-1) / float32(grid.Height-1)
stepWidth := float32(opts.gridPerSide-1) / float32(grid.Width-1)
var i int
for h := range grid.Height {
for w := range grid.Width {
y, x := float32(h)*stepHeight, float32(w)*stepWidth
floorY, floorX := int32(y), int32(x)
ceilY, ceilX := min(floorY+1, int32(opts.gridPerSide-1)), min(floorX+1, int32(opts.gridPerSide-1))
indexSlice[0][i] = floorY*int32(opts.gridPerSide) + floorX
indexSlice[1][i] = floorY*int32(opts.gridPerSide) + ceilX
indexSlice[2][i] = ceilY*int32(opts.gridPerSide) + floorX
indexSlice[3][i] = ceilY*int32(opts.gridPerSide) + ceilX
weightSlice[0][i] = (1 - (y - float32(floorY))) * (1 - (x - float32(floorX)))
weightSlice[1][i] = (1 - (y - float32(floorY))) * (x - float32(floorX))
weightSlice[2][i] = (y - float32(floorY)) * (1 - (x - float32(floorX)))
weightSlice[3][i] = (y - float32(floorY)) * (x - float32(floorX))
i++
}
}
indices := ctx.Input().FromInts(slices.Concat(indexSlice...), grid.Height*grid.Width*4)
weights := ctx.Input().FromFloats(slices.Concat(weightSlice...), 1, grid.Height*grid.Width*4)
n := hiddenStates.Dim(0)
positionEmbeds := m.PositionEmbedding.Forward(ctx, indices)
positionEmbeds = positionEmbeds.Mul(ctx, weights)
positionEmbeds = positionEmbeds.Reshape(ctx, n, -1, 4)
positionEmbeds = positionEmbeds.View(ctx, 0, n, positionEmbeds.Stride(1), grid.Height*grid.Width).
Add(ctx, positionEmbeds.View(ctx, 1*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
Add(ctx, positionEmbeds.View(ctx, 2*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width)).
Add(ctx, positionEmbeds.View(ctx, 3*positionEmbeds.Stride(2), n, positionEmbeds.Stride(1), grid.Height*grid.Width))
positionEmbeds = positionEmbeds.Reshape(ctx, -1, grid.Width/opts.spatialMergeSize, opts.spatialMergeSize, grid.Height/opts.spatialMergeSize)
positionEmbeds = positionEmbeds.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx, n, -1)
return hiddenStates.Add(ctx, positionEmbeds)
}
type VisionModel struct {
PatchEmbedding *nn.Conv3D `gguf:"patch_embed"`
PositionEmbedding *VisionPositionEmbedding
Layers []VisionEncoderLayer `gguf:"blk"`
PatchMerger *VisionPatchMerger `gguf:"merger"`
DeepstackMerger []*VisionPatchMerger `gguf:"deepstack_merger"`
VisionOptions
}
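// positions builds the 2D rotary cos/sin tables for the vision tower: each patch's
// (row, column) index pair, reordered into spatial-merge order, selects rows of an
// inverse-frequency table so that half of each head rotates with the row index and
// half with the column index.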
func (m *VisionModel) positions(ctx ml.Context, grid *Grid) (_, _ ml.Tensor) {
indices := ctx.Input().FromInts(slices.Collect(func(yield func(int32) bool) {
for y := range grid.Height {
for x := range grid.Width {
if !yield(int32(y)) {
return
}
if !yield(int32(x)) {
return
}
}
}
}), grid.Width*grid.Height*2)
indices = indices.Reshape(ctx, -1, grid.Width/m.spatialMergeSize, m.spatialMergeSize, grid.Height/m.spatialMergeSize)
indices = indices.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
indices = indices.Reshape(ctx, -1)
halfDim := m.headDim() / 2
maxGrid := max(grid.Height, grid.Width)
frequencies := ctx.Input().FromFloats(slices.Collect(func(yield func(float32) bool) {
ropeTheta := float64(m.ropeTheta)
for i := range maxGrid {
for j := range halfDim / 2 {
if !yield(float32(i) / float32(math.Pow(ropeTheta, float64(j*2)/float64(halfDim)))) {
return
}
}
}
}), halfDim/2, maxGrid)
embeds := frequencies.Rows(ctx, indices)
embeds = embeds.Reshape(ctx, halfDim, 1, -1)
embeds = embeds.Concat(ctx, embeds, 0)
return embeds.Cos(ctx), embeds.Sin(ctx)
}
// Forward computes the vision model for an input tensor
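// Patches are embedded with a 3D convolution, augmented with interpolated position
// embeddings, and passed through the encoder layers using 2D RoPE attention. Layers
// listed in deepstackVisualIndexes additionally feed their hidden states through a
// deepstack merger; the final hidden states go through the main patch merger.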
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) (ml.Tensor, []ml.Tensor) {
pixelValues = pixelValues.Reshape(ctx, m.patchSize, m.patchSize, m.temporalPatchSize, -1)
hiddenStates := m.PatchEmbedding.Forward(ctx, pixelValues, m.numChannels, m.patchSize, m.patchSize, m.temporalPatchSize, 0, 0, 0, 1, 1, 1)
hiddenStates = m.PositionEmbedding.Forward(ctx, hiddenStates, grid, m.VisionOptions)
cos, sin := m.positions(ctx, grid)
deepstackStates := make([]ml.Tensor, len(m.deepstackVisualIndexes))
for i, layer := range m.Layers {
hiddenStates = layer.Forward(ctx, hiddenStates, cos, sin, m.VisionOptions)
if i := slices.Index(m.deepstackVisualIndexes, int32(i)); i >= 0 {
deepstackStates[i] = m.DeepstackMerger[i].Forward(ctx, hiddenStates, true, m.VisionOptions)
}
}
hiddenStates = m.PatchMerger.Forward(ctx, hiddenStates, false, m.VisionOptions)
return hiddenStates, deepstackStates
}
// newVisionModel creates a new instance of the Qwen vision model
func newVisionModel(c fs.Config) *VisionModel {
deepstackVisualIndexes := c.Ints("vision.deepstack_visual_indexes")
model := &VisionModel{
Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 32)),
DeepstackMerger: make([]*VisionPatchMerger, len(deepstackVisualIndexes)),
VisionOptions: VisionOptions{
hiddenSize: int(c.Uint("vision.embedding_length", 1280)),
numHeads: int(c.Uint("vision.attention.head_count", 16)),
patchSize: int(c.Uint("vision.patch_size", 14)),
numChannels: int(c.Uint("vision.num_channels", 3)),
eps: c.Float("vision.attention.layer_norm_epsilon", 1e-6),
ropeTheta: c.Float("vision.rope.freq_base", 10000.0),
spatialMergeSize: int(c.Uint("vision.spatial_merge_size", 2)),
temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
gridPerSide: int(math.Sqrt(float64(c.Uint("vision.num_positional_embeddings", 2304)))),
mropeSections: slices.Collect(func(yield func(int) bool) {
for _, section := range c.Ints("mrope_sections", []int32{24, 20, 20}) {
if !yield(int(section)) {
return
}
}
}),
deepstackVisualIndexes: deepstackVisualIndexes,
},
}
return model
}