This commit represents a complete rework after pulling the latest changes from the official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)

- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in the "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility (see the sketch after this message)

### Legacy Toolchain Compatibility

- **NVIDIA Driver**: 470.256.02 (the last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (the last CUDA 11.x release supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4's host_config.h)

### CPU Architecture Trade-offs

Because of the GCC 10.5 ceiling, some newer CPU optimizations had to be sacrificed:

- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility)

### Build System Updates

- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added the -Wno-deprecated-gpu-targets flag to suppress deprecation warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync

Merged the latest llama.cpp changes, including:

- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation

- Updated CLAUDE.md with comprehensive build instructions
- Documented the toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which official Ollama dropped because of its legacy driver and CUDA requirements. The constraint is a rigid dependency chain rather than a free choice:

- K80 → Driver 470 → CUDA 11.4 → GCC 10 → no AVX_VNNI

We accept the loss of cutting-edge CPU optimizations to keep modern LLMs running on legacy but still capable Tesla K80 hardware (12 GB of VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
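To make the compute 3.7 change concrete, here is a minimal CMake sketch of the kind of edit the commit describes. It is illustrative only: the surrounding architecture list and placement are assumptions, not a copy of the actual diff.

```cmake
# Sketch (not the actual diff): add Tesla K80 alongside newer targets.
# "37-virtual" emits PTX instead of an sm_37 cubin, so the driver JIT-compiles
# kernels for the K80 at load time, which maximizes forward compatibility.
set(CMAKE_CUDA_ARCHITECTURES "37-virtual;50;60;70;75;80;86")

# Kepler is a deprecated target under CUDA 11.x; silence nvcc's warning.
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>")
```

The trade-off of 37-virtual is a one-time JIT compilation cost the first time a kernel loads on the K80, in exchange for not shipping sm_37 binaries that newer toolchains refuse to build.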
package model

import (
	"log/slog"
	"os"
	"path/filepath"
	"slices"
	"testing"

	"google.golang.org/protobuf/proto"

	"github.com/ollama/ollama/convert/sentencepiece"
)

// loadSentencePieceVocab reads the gemma2 tokenizer.model protobuf from
// testdata and converts its pieces into a Vocabulary-backed SentencePiece.
func loadSentencePieceVocab(t *testing.T) SentencePiece {
	t.Helper()

	bts, err := os.ReadFile(filepath.Join("testdata", "gemma2", "tokenizer.model"))
	if err != nil {
		t.Fatal(err)
	}

	var spm sentencepiece.ModelProto
	if err := proto.Unmarshal(bts, &spm); err != nil {
		t.Fatal(err)
	}

	var v Vocabulary

	for _, piece := range spm.GetPieces() {
		v.Values = append(v.Values, piece.GetPiece())
		v.Scores = append(v.Scores, piece.GetScore())
		switch t := piece.GetType(); t {
		case sentencepiece.ModelProto_SentencePiece_UNKNOWN,
			sentencepiece.ModelProto_SentencePiece_CONTROL,
			sentencepiece.ModelProto_SentencePiece_UNUSED,
			sentencepiece.ModelProto_SentencePiece_BYTE:
			v.Types = append(v.Types, int32(t))
		default:
			tt := int32(sentencepiece.ModelProto_SentencePiece_NORMAL)
			// TODO: parse the special tokens file
			// - this will roundtrip correctly but the <start_of_turn> and
			//   <end_of_turn> tokens aren't processed
			v.Types = append(v.Types, tt)
		}
	}

	return NewSentencePiece(&v)
}

func TestSentencePieceEncode(t *testing.T) {
	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))
	slog.SetDefault(logger)

	tokenizer := loadSentencePieceVocab(t)

	t.Run("basic roundtrip", func(t *testing.T) {
		t.Parallel()

		cases := []string{
"hello",
|
||
"hello ",
|
||
"hello ",
|
||
" hello",
|
||
" hello ",
|
||
" hello ",
|
||
"hello world",
|
||
"请考试我的软件!12345",
|
||
"你好",
|
||
"Hello 你好 world!",
|
||
"Special characters: !@#$%^&*()_+-=[]{}|;':\",./<>?",
|
||
"Multilingual: 你好 こんにちは Привет Hola مرحبا",
|
||
"Numbers and symbols: 123456789 +- */",
|
||
"Special tokens: <bos> text <eos>",
|
||
"Code snippets: func main() { fmt.Println(\"Hello World\") }",
|
||
"Long text: " + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " +
|
||
"Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. " +
|
||
"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.",
|
||
}
|
||
|
||
for _, want := range cases {
|
||
ids, err := tokenizer.Encode(want, true)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
|
||
if got, err := tokenizer.Decode(ids); err != nil {
|
||
t.Fatal(err)
|
||
} else if got != want {
|
||
t.Errorf("got %q, want %q [%#v]", got, want, ids)
|
||
}
|
||
}
|
||
})
|
||
|
||
t.Run("special tokens", func(t *testing.T) {
|
||
type candidate struct {
|
||
token string
|
||
ids []int32
|
||
}
|
||
|
||
cases := []candidate{
|
||
{"<bos>", []int32{2}},
|
||
{"<eos>", []int32{1}},
|
||
}
|
||
|
||
for _, want := range cases {
|
||
ids, err := tokenizer.Encode(want.token, true)
|
||
if err != nil {
|
||
t.Fatal(err)
|
||
}
|
||
if !slices.Equal(ids, want.ids) {
|
||
t.Errorf("got %#v, want %#v", ids, want.ids)
|
||
}
|
||
}
|
||
})
|
||
}
|
||
|
||
func TestSentencePieceDecodeByteTokens(t *testing.T) {
|
||
vocab := &Vocabulary{
|
||
Values: []string{
|
||
"normal",
|
||
"<0xEA>",
|
||
"<0x41>",
|
||
"<0xC3>",
|
||
"<0xA3>",
|
||
},
|
||
Types: []int32{
|
||
TOKEN_TYPE_NORMAL,
|
||
TOKEN_TYPE_BYTE,
|
||
TOKEN_TYPE_BYTE,
|
||
TOKEN_TYPE_BYTE,
|
||
TOKEN_TYPE_BYTE,
|
||
},
|
||
Scores: []float32{0, 0, 0, 0, 0},
|
||
}
|
||
|
||
spm := NewSentencePiece(vocab)
|
||
|
||
tests := []struct {
|
||
name string
|
||
ids []int32
|
||
expected string
|
||
}{
|
||
{
|
||
name: "single byte token",
|
||
ids: []int32{1},
|
||
expected: "\xea",
|
||
},
|
||
{
|
||
name: "ASCII byte token",
|
||
ids: []int32{2},
|
||
expected: "A",
|
||
},
|
||
{
|
||
name: "multiple byte tokens forming UTF-8 character",
|
||
ids: []int32{3, 4},
|
||
expected: "ã",
|
||
},
|
||
}
|
||
|
||
for _, tt := range tests {
|
||
t.Run(tt.name, func(t *testing.T) {
|
||
result, err := spm.Decode(tt.ids)
|
||
if err != nil {
|
||
t.Errorf("failed to decode token IDs %v: %v", tt.ids, err)
|
||
}
|
||
if result != tt.expected {
|
||
t.Errorf("got %q, want %q", result, tt.expected)
|
||
}
|
||
})
|
||
}
|
||
}
|