Mirror of https://github.com/dogkeeper886/ollama37.git, synced 2025-12-09 23:37:06 +00:00.
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. 
The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
168 lines
3.5 KiB
Go
168 lines
3.5 KiB
Go
package model
|
|
|
|
import (
|
|
"fmt"
|
|
"iter"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/ollama/ollama/logutil"
|
|
)
|
|
|
|
// WordPiece is a TextProcessor implementing WordPiece tokenization
// (as used by BERT-style models) over a GGML vocabulary.
type WordPiece struct {
	// vocab maps between token strings and ids and handles
	// special tokens; see Encode/Decode/Is below.
	vocab *Vocabulary
}
|
|
|
|
// ggmlPrefix is the prefix used by GGML vocabularies to indicate word boundaries.
// This differs from original WordPiece, which instead marks subword (non-initial)
// pieces with a "##" prefix.
const ggmlPrefix = "▁"
|
|
|
|
// wordPieceReplacer cleans up decoded text by removing the spaces that
// tokenization inserted before punctuation and contractions (the common
// BERT "clean up tokenization spaces" step). Pair order is significant
// to strings.NewReplacer when patterns overlap — do not reorder casually.
var wordPieceReplacer = strings.NewReplacer(
	" .", ".",
	" ?", "?",
	" !", "!",
	" ,", ",",
	" ' ", "'",
	" n't", "n't",
	" 'm", "'m",
	" do not", " don't",
	" 's", "'s",
	" 've", "'ve",
	" 're", "'re",
)
|
|
|
|
// Decode implements TextProcessor.
|
|
func (wpm WordPiece) Decode(ids []int32) (string, error) {
|
|
var sb strings.Builder
|
|
for i, id := range ids {
|
|
if id < 0 || int(id) >= len(wpm.vocab.Values) {
|
|
return "", fmt.Errorf("invalid token id: %d", id)
|
|
}
|
|
|
|
var separator string
|
|
piece := wpm.vocab.Values[id]
|
|
if i > 0 &&
|
|
(strings.HasPrefix(piece, ggmlPrefix) ||
|
|
(strings.HasPrefix(piece, "[") && strings.HasSuffix(piece, "]"))) {
|
|
separator = " "
|
|
}
|
|
|
|
sb.WriteString(wordPieceReplacer.Replace(separator + strings.TrimPrefix(piece, ggmlPrefix)))
|
|
}
|
|
|
|
return sb.String(), nil
|
|
}
|
|
|
|
// words splits a string into words, treating CJK characters as separate words.
|
|
// TODO: this is specifically for BERT and may need to be adjusted or refactored for other models.
|
|
func (wpm WordPiece) words(s string) iter.Seq[string] {
|
|
return func(yield func(string) bool) {
|
|
runes := make([]rune, 0, len(s)*3)
|
|
for _, r := range s {
|
|
switch {
|
|
case r >= 0x4E00 && r <= 0x9FFF,
|
|
r >= 0x3400 && r <= 0x4DBF,
|
|
r >= 0x20000 && r <= 0x2A6DF,
|
|
r >= 0x2A700 && r <= 0x2B73F,
|
|
r >= 0x2B740 && r <= 0x2B81F,
|
|
r >= 0x2B820 && r <= 0x2CEAF,
|
|
r >= 0xF900 && r <= 0xFAFF,
|
|
r >= 0x2F800 && r <= 0x2FA1F:
|
|
runes = append(runes, ' ', r, ' ')
|
|
default:
|
|
runes = append(runes, r)
|
|
}
|
|
}
|
|
|
|
for w := range strings.FieldsFuncSeq(string(runes), unicode.IsSpace) {
|
|
// split on but keep punctuation
|
|
var start int
|
|
for start < len(w) {
|
|
end := strings.IndexFunc(w[start:], unicode.IsPunct)
|
|
if end < 0 {
|
|
end = len(w) - start
|
|
} else if end == 0 {
|
|
end = 1
|
|
}
|
|
|
|
if !yield(w[start : start+end]) {
|
|
return
|
|
}
|
|
|
|
start += end
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Encode implements TextProcessor.
//
// Each word produced by words is segmented with a greedy
// longest-match-first scan against the vocabulary; a word with no
// complete segmentation collapses to the single [UNK] token. When
// addSpecial is true and anything was encoded, the vocabulary's
// special tokens are added around the result.
func (wpm WordPiece) Encode(s string, addSpecial bool) ([]int32, error) {
	var ids []int32

	// TODO: use [UNK] from config
	unk := wpm.vocab.Encode("[UNK]")
	for word := range wpm.words(s) {
		var start int
		var pieces []int32
		for start < len(word) {
			end := len(word)

			// Shrink the candidate subword from the right until the
			// longest vocabulary match is found (negative id = no match).
			var piece int32
			for start < end {
				subword := word[start:end]
				if start == 0 {
					// GGML marks word-initial pieces with ggmlPrefix.
					subword = ggmlPrefix + subword
				}

				// TODO: some models might not want [ToLower]
				piece = wpm.vocab.Encode(strings.ToLower(subword))
				if piece >= 0 {
					break
				}

				end--
			}

			if piece < 0 {
				// Unknown token: discard any partial pieces so the
				// whole word is emitted as [UNK] below.
				pieces = pieces[:0]
				break
			}

			pieces = append(pieces, piece)
			start = end
		}

		if len(pieces) > 0 {
			ids = append(ids, pieces...)
		} else {
			ids = append(ids, unk)
		}
	}

	if addSpecial && len(ids) > 0 {
		ids = wpm.vocab.addSpecials(ids)
	}

	logutil.Trace("encoded", "string", s, "ids", ids)
	return ids, nil
}
|
|
|
|
// Is implements TextProcessor.
|
|
func (wpm WordPiece) Is(id int32, special Special) bool {
|
|
return wpm.vocab.Is(id, special)
|
|
}
|
|
|
|
// Vocabulary implements TextProcessor.
|
|
func (wpm WordPiece) Vocabulary() *Vocabulary {
|
|
return wpm.vocab
|
|
}
|
|
|
|
// Compile-time assertion that WordPiece satisfies TextProcessor.
var _ TextProcessor = (*WordPiece)(nil)
|
|
|
|
func NewWordPiece(vocab *Vocabulary) WordPiece {
|
|
return WordPiece{
|
|
vocab: vocab,
|
|
}
|
|
}
|