mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-09 23:37:06 +00:00
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
117 lines
3.4 KiB
Go
117 lines
3.4 KiB
Go
package convert
|
|
|
|
import (
|
|
"cmp"
|
|
"encoding/json"
|
|
"io/fs"
|
|
"slices"
|
|
"strings"
|
|
|
|
"github.com/ollama/ollama/fs/ggml"
|
|
)
|
|
|
|
type qwen3VLModel struct {
|
|
qwen3Model `json:"text_config"`
|
|
|
|
VisionModel struct {
|
|
Depth uint32 `json:"depth"`
|
|
HiddenSize uint32 `json:"hidden_size"`
|
|
NumHeads uint32 `json:"num_heads"`
|
|
InChannels uint32 `json:"in_channels"`
|
|
PatchSize uint32 `json:"patch_size"`
|
|
SpatialMergeSize uint32 `json:"spatial_merge_size"`
|
|
WindowSize uint32 `json:"window_size"`
|
|
RMSNormEps float32 `json:"layer_norm_epsilon"`
|
|
RopeTheta float32 `json:"rope_theta"`
|
|
TemporalPatchSize uint32 `json:"temporal_patch_size"`
|
|
DeepstackVisualIndexes []int32 `json:"deepstack_visual_indexes"`
|
|
|
|
Size struct {
|
|
ShortestEdge uint32 `json:"shortest_edge"`
|
|
LongestEdge uint32 `json:"longest_edge"`
|
|
} `json:"size"`
|
|
|
|
ImageMean []float32 `json:"image_mean"`
|
|
ImageStd []float32 `json:"image_std"`
|
|
} `json:"vision_config"`
|
|
}
|
|
|
|
func (m *qwen3VLModel) parseMore(fsys fs.FS) error {
|
|
bts, err := fs.ReadFile(fsys, "preprocessor_config.json")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return json.Unmarshal(bts, &m.VisionModel)
|
|
}
|
|
|
|
func (m *qwen3VLModel) KV(t *Tokenizer) ggml.KV {
|
|
kv := m.qwen3Model.KV(t)
|
|
|
|
arch := "qwen3vl"
|
|
if m.NumExperts > 0 {
|
|
arch += "moe"
|
|
}
|
|
// override architecture
|
|
kv["general.architecture"] = arch
|
|
|
|
kv["vision.block_count"] = cmp.Or(m.VisionModel.Depth, 32)
|
|
kv["vision.embedding_length"] = m.VisionModel.HiddenSize
|
|
kv["vision.attention.head_count"] = cmp.Or(m.VisionModel.NumHeads, 16)
|
|
kv["vision.num_channels"] = m.VisionModel.InChannels
|
|
kv["vision.patch_size"] = cmp.Or(m.VisionModel.PatchSize, 14)
|
|
kv["vision.spatial_merge_size"] = cmp.Or(m.VisionModel.SpatialMergeSize, 2)
|
|
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(m.VisionModel.RMSNormEps, 1e-6)
|
|
kv["vision.rope.freq_base"] = cmp.Or(m.VisionModel.RopeTheta, 1e4)
|
|
kv["vision.temporal_patch_size"] = cmp.Or(m.VisionModel.TemporalPatchSize, 2)
|
|
kv["vision.deepstack_visual_indexes"] = m.VisionModel.DeepstackVisualIndexes
|
|
|
|
kv["vision.shortest_edge"] = m.VisionModel.Size.ShortestEdge
|
|
kv["vision.longest_edge"] = m.VisionModel.Size.LongestEdge
|
|
|
|
kv["vision.image_mean"] = m.VisionModel.ImageMean
|
|
kv["vision.image_std"] = m.VisionModel.ImageStd
|
|
|
|
return kv
|
|
}
|
|
|
|
func (m *qwen3VLModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
|
var rest []Tensor
|
|
var out []*ggml.Tensor
|
|
for _, t := range ts {
|
|
switch {
|
|
case strings.Contains(t.Name(), "attn_qkv"):
|
|
out = append(out, slices.Collect(splitDim(t, 0,
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_q")},
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_k")},
|
|
split{Replacer: strings.NewReplacer("attn_qkv", "attn_v")},
|
|
))...)
|
|
case strings.Contains(t.Name(), "patch_embed") && strings.HasSuffix(t.Name(), "weight"):
|
|
shape := t.Shape()
|
|
out = append(out, &ggml.Tensor{
|
|
Name: t.Name(),
|
|
Kind: t.Kind(),
|
|
Shape: append([]uint64{shape[0] * shape[1]}, shape[2:]...),
|
|
WriterTo: t,
|
|
})
|
|
default:
|
|
rest = append(rest, t)
|
|
}
|
|
}
|
|
|
|
return append(m.qwen3Model.Tensors(rest), out...)
|
|
}
|
|
|
|
func (m *qwen3VLModel) Replacements() []string {
|
|
return append(
|
|
m.qwen3Model.Replacements(),
|
|
"model.language_", "",
|
|
"model.visual", "v",
|
|
"patch_embed.proj", "patch_embed",
|
|
"blocks", "blk",
|
|
"attn.qkv", "attn_qkv",
|
|
"attn.proj", "attn_out",
|
|
"deepstack_merger_list", "deepstack_merger",
|
|
)
|
|
}
|