ollama37/thinking/parser.go
Shang Chieh Tseng ef14fb5b26 Sync with upstream ollama/ollama and restore Tesla K80 (compute 3.7) support
This commit represents a complete rework after pulling the latest changes from the
official ollama/ollama repository and re-applying the Tesla K80 compatibility patches.

## Key Changes

### CUDA Compute Capability 3.7 Support (Tesla K80)
- Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt
- Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset
- Using 37-virtual (PTX with JIT compilation) for maximum compatibility

### Legacy Toolchain Compatibility
- **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80)
- **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7)
- **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h)

### CPU Architecture Trade-offs
Due to the GCC 10.5 limitation, newer CPU optimizations had to be sacrificed:
- Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+)
- Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA
- Performance impact: roughly 3-7% slower on newer CPUs (an acceptable trade-off for K80 compatibility)

### Build System Updates
- Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7
- Added -Wno-deprecated-gpu-targets flag to suppress warnings
- Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI

### Upstream Sync
Merged the latest llama.cpp changes, including:
- Enhanced KV cache management with ISWA and hybrid memory support
- Improved multi-modal support (mtmd framework)
- New model architectures (Gemma3, Llama4, Qwen3, etc.)
- GPU backend improvements for CUDA, Metal, and ROCm
- Updated quantization support and GGUF format handling

### Documentation
- Updated CLAUDE.md with comprehensive build instructions
- Documented toolchain constraints and CPU architecture trade-offs
- Removed outdated CI/CD workflows (tesla-k80-*.yml)
- Cleaned up temporary development artifacts

## Rationale

This fork maintains Tesla K80 GPU support (compute 3.7), which was dropped from
official Ollama because of the card's legacy driver/CUDA requirements. The
toolchain forms a rigid chain of constraints:
- K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI

We accept the loss of cutting-edge CPU optimizations in order to run modern
LLMs on legacy but still capable Tesla K80 hardware (12 GB of VRAM per GPU).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 14:03:05 +08:00


package thinking

import (
	"strings"
	"unicode"
)

type thinkingState int

const (
	// We're looking for the opening tag, but we haven't seen any non-whitespace
	// characters yet
	thinkingState_LookingForOpening thinkingState = iota
	// We've seen the opening tag, but we haven't seen any non-whitespace
	// characters yet (we want to eat any whitespace between the opening tag and
	// the thinking content)
	thinkingState_ThinkingStartedEatingWhitespace
	// We've seen non-whitespace characters after the opening tag, but we haven't
	// seen the closing tag yet
	thinkingState_Thinking
	// We've seen the closing tag, but we haven't seen any non-whitespace
	// characters after the closing tag yet (we want to eat any whitespace between
	// the closing tag and the content)
	thinkingState_ThinkingDoneEatingWhitespace
	// We've seen the closing tag and seen at least one non-whitespace character
	// after it
	thinkingState_ThinkingDone
)
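
// Illustrative walkthrough (added for clarity, not from the original source):
// for a streamed response like "<think> plan the reply </think> Hello!" with
// OpeningTag "<think>" and ClosingTag "</think>" (assumed tag strings), the
// parser moves through the states roughly as follows:
//
//	LookingForOpening               -> until "<think>" has been seen
//	ThinkingStartedEatingWhitespace -> whitespace right after the opening tag
//	Thinking                        -> "plan the reply"
//	ThinkingDoneEatingWhitespace    -> whitespace right after "</think>"
//	ThinkingDone                    -> "Hello!" and everything after it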

func (s thinkingState) String() string {
	switch s {
	case thinkingState_LookingForOpening:
		return "LookingForOpening"
	case thinkingState_ThinkingStartedEatingWhitespace:
		return "ThinkingStartedEatingWhitespace"
	case thinkingState_Thinking:
		return "Thinking"
	case thinkingState_ThinkingDoneEatingWhitespace:
		return "ThinkingDoneEatingWhitespace"
	case thinkingState_ThinkingDone:
		return "ThinkingDone"
	default:
		return "Unknown"
	}
}

type Parser struct {
	state      thinkingState
	OpeningTag string
	ClosingTag string
	acc        strings.Builder
}

// AddContent returns the thinking content and the non-thinking content that
// should be immediately sent to the user. It will internally buffer if it needs
// to see more raw content to disambiguate
func (s *Parser) AddContent(content string) (string, string) {
	s.acc.WriteString(content)

	var thinkingSb, remainingSb strings.Builder

	var thinking, remaining string
	keepLooping := true
	// we loop because we might pass through multiple parsing states in a single
	// call to addContent, and we want to make sure callers don't have to wait for
	// data that's already unambiguous
	for keepLooping {
		thinking, remaining, keepLooping = eat(s)
		thinkingSb.WriteString(thinking)
		remainingSb.WriteString(remaining)
	}

	return thinkingSb.String(), remainingSb.String()
}
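
// exampleAddContentUsage is a hypothetical usage sketch, not part of the
// original file: it shows how streamed chunks could be fed through AddContent.
// The tag strings and chunk boundaries below are assumptions for illustration.
func exampleAddContentUsage() (thinking, content string) {
	p := &Parser{OpeningTag: "<think>", ClosingTag: "</think>"}
	chunks := []string{"<think>I should", " greet the user.</think>", "Hello!"}
	for _, chunk := range chunks {
		t, c := p.AddContent(chunk)
		thinking += t
		content += c
	}
	// Expected accumulation: thinking == "I should greet the user."
	// and content == "Hello!" (whitespace around the tags is eaten).
	return thinking, content
}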

// the additional bool return is true iff we should continue eating
func eat(s *Parser) (string, string, bool) {
	switch s.state {
	case thinkingState_LookingForOpening:
		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
		if strings.HasPrefix(trimmed, s.OpeningTag) {
			after := strings.Join(strings.Split(trimmed, s.OpeningTag)[1:], s.OpeningTag)
			after = strings.TrimLeftFunc(after, unicode.IsSpace)
			// after might contain more than just thinking tokens, so we continue
			// parsing instead of returning it as thinking tokens here
			s.acc.Reset()
			s.acc.WriteString(after)
			if after == "" {
				s.state = thinkingState_ThinkingStartedEatingWhitespace
			} else {
				s.state = thinkingState_Thinking
			}
			return "", "", true
		} else if strings.HasPrefix(s.OpeningTag, trimmed) {
			// partial opening seen, so let's keep accumulating
			return "", "", false
		} else if trimmed == "" {
			// saw whitespace only, so let's keep accumulating
			return "", "", false
		} else {
			// didn't see an opening tag, but we have content, so thinking was skipped
			s.state = thinkingState_ThinkingDone
			// note that we use the original content, not the trimmed one because we
			// don't want to eat any whitespace in the real content if there were no
			// thinking tags
			untrimmed := s.acc.String()
			s.acc.Reset()
			return "", untrimmed, false
		}
	case thinkingState_ThinkingStartedEatingWhitespace:
		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
		s.acc.Reset()
		if trimmed == "" {
			return "", "", false
		} else {
			s.state = thinkingState_Thinking
			s.acc.WriteString(trimmed)
			return "", "", true
		}
	case thinkingState_Thinking:
		acc := s.acc.String()
		if strings.Contains(acc, s.ClosingTag) {
			split := strings.Split(acc, s.ClosingTag)
			thinking := split[0]
			remaining := strings.Join(split[1:], s.ClosingTag)
			remaining = strings.TrimLeftFunc(remaining, unicode.IsSpace)
			s.acc.Reset()
			if remaining == "" {
				s.state = thinkingState_ThinkingDoneEatingWhitespace
			} else {
				s.state = thinkingState_ThinkingDone
			}
			return thinking, remaining, false
		} else if overlapLen := overlap(acc, s.ClosingTag); overlapLen > 0 {
			thinking := acc[:len(acc)-overlapLen]
			remaining := acc[len(acc)-overlapLen:]
			s.acc.Reset()
			// keep track of the candidate closing tag. We have to buffer it until it
			// becomes disambiguated
			s.acc.WriteString(remaining)
			return thinking, "", false
		} else {
			// purely just thinking tokens, so we can return them
			s.acc.Reset()
			return acc, "", false
		}
	case thinkingState_ThinkingDoneEatingWhitespace:
		trimmed := strings.TrimLeftFunc(s.acc.String(), unicode.IsSpace)
		s.acc.Reset()
		// if we see non-whitespace, we're done eating the leading whitespace of the content
		if trimmed != "" {
			s.state = thinkingState_ThinkingDone
		}
		return "", trimmed, false
	case thinkingState_ThinkingDone:
		acc := s.acc.String()
		s.acc.Reset()
		return "", acc, false
	default:
		panic("unknown state")
	}
}
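
// Worked example (added for illustration, assuming OpeningTag "<think>" and
// ClosingTag "</think>"): when a closing tag is split across chunks, the
// Thinking case above buffers the candidate tag until it is disambiguated:
//
//	AddContent("<think>deep thought</thi") -> ("deep thought", "")  // "</thi" kept buffered
//	AddContent("nk>Answer")                -> ("", "Answer")        // closing tag completed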

// longest overlap between suffix of s and prefix of delim
func overlap(s, delim string) int {
	max := min(len(delim), len(s))
	for i := max; i > 0; i-- {
		if strings.HasSuffix(s, delim[:i]) {
			return i
		}
	}
	return 0
}
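
// A few concrete values for overlap, added for illustration (the delimiter
// string is an assumption):
//
//	overlap("deep thought</thi", "</think>") == 5 // suffix "</thi" matches a prefix of the tag
//	overlap("deep thought", "</think>")      == 0 // no suffix/prefix overlap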