mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-17 19:27:00 +00:00
This commit represents a complete rework after pulling the latest changes from official ollama/ollama repository and re-applying Tesla K80 compatibility patches. ## Key Changes ### CUDA Compute Capability 3.7 Support (Tesla K80) - Added sm_37 (compute 3.7) to CMAKE_CUDA_ARCHITECTURES in CMakeLists.txt - Updated CMakePresets.json to include compute 3.7 in "CUDA 11" preset - Using 37-virtual (PTX with JIT compilation) for maximum compatibility ### Legacy Toolchain Compatibility - **NVIDIA Driver**: 470.256.02 (last version supporting Kepler/K80) - **CUDA Version**: 11.4.4 (last CUDA 11.x supporting compute 3.7) - **GCC Version**: 10.5.0 (required by CUDA 11.4 host_config.h) ### CPU Architecture Trade-offs Due to GCC 10.5 limitation, sacrificed newer CPU optimizations: - Alderlake CPU variant enabled WITHOUT AVX_VNNI (requires GCC 11+) - Still supports: SSE4.2, AVX, F16C, AVX2, BMI2, FMA - Performance impact: ~3-7% on newer CPUs (acceptable for K80 compatibility) ### Build System Updates - Modified ml/backend/ggml/ggml/src/ggml-cuda/CMakeLists.txt for compute 3.7 - Added -Wno-deprecated-gpu-targets flag to suppress warnings - Updated ml/backend/ggml/ggml/src/CMakeLists.txt for Alderlake without AVX_VNNI ### Upstream Sync Merged latest llama.cpp changes including: - Enhanced KV cache management with ISWA and hybrid memory support - Improved multi-modal support (mtmd framework) - New model architectures (Gemma3, Llama4, Qwen3, etc.) - GPU backend improvements for CUDA, Metal, and ROCm - Updated quantization support and GGUF format handling ### Documentation - Updated CLAUDE.md with comprehensive build instructions - Documented toolchain constraints and CPU architecture trade-offs - Removed outdated CI/CD workflows (tesla-k80-*.yml) - Cleaned up temporary development artifacts ## Rationale This fork maintains Tesla K80 GPU support (compute 3.7) which was dropped in official Ollama due to legacy driver/CUDA requirements. The toolchain constraint creates a deadlock: - K80 → Driver 470 → CUDA 11.4 → GCC 10 → No AVX_VNNI We accept the loss of cutting-edge CPU optimizations to enable running modern LLMs on legacy but still capable Tesla K80 hardware (12GB VRAM per GPU). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
254 lines
7.1 KiB
Go
254 lines
7.1 KiB
Go
package parsers
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"log/slog"
|
|
"strings"
|
|
"unicode"
|
|
|
|
"github.com/ollama/ollama/api"
|
|
"github.com/ollama/ollama/logutil"
|
|
)
|
|
|
|
// TODO: call the init function
|
|
const (
|
|
CollectingThinkingContent qwenParserState = iota
|
|
CollectingContent
|
|
CollectingToolContent
|
|
ThinkingDoneEatingWhitespace
|
|
ToolCallDoneEatingWhitespace
|
|
)
|
|
|
|
const (
|
|
thinkingCloseTag = "</think>"
|
|
)
|
|
|
|
type Qwen3VLParser struct {
|
|
state qwenParserState
|
|
buffer strings.Builder
|
|
tools []api.Tool
|
|
hasThinkingSupport bool
|
|
}
|
|
|
|
func (p *Qwen3VLParser) HasToolSupport() bool {
|
|
return true
|
|
}
|
|
|
|
func (p *Qwen3VLParser) HasThinkingSupport() bool {
|
|
return p.hasThinkingSupport
|
|
}
|
|
|
|
func (p *Qwen3VLParser) setInitialState(lastMessage *api.Message) {
|
|
prefill := lastMessage != nil && lastMessage.Role == "assistant"
|
|
if !p.HasThinkingSupport() {
|
|
p.state = CollectingContent
|
|
return
|
|
}
|
|
|
|
if prefill && lastMessage.Content != "" {
|
|
p.state = CollectingContent
|
|
return
|
|
}
|
|
|
|
p.state = CollectingThinkingContent
|
|
}
|
|
|
|
func (p *Qwen3VLParser) Init(tools []api.Tool, lastMessage *api.Message) []api.Tool {
|
|
p.tools = tools
|
|
p.setInitialState(lastMessage)
|
|
return tools
|
|
}
|
|
|
|
type qwenEventThinkingContent struct {
|
|
content string
|
|
}
|
|
|
|
func (qwenEventThinkingContent) isQwenEvent() {}
|
|
|
|
func (p *Qwen3VLParser) Add(s string, done bool) (content string, thinking string, calls []api.ToolCall, err error) {
|
|
p.buffer.WriteString(s)
|
|
events := p.parseEvents()
|
|
|
|
var toolCalls []api.ToolCall
|
|
var contentSb strings.Builder
|
|
var thinkingSb strings.Builder
|
|
for _, event := range events {
|
|
switch event := event.(type) {
|
|
case qwenEventRawToolCall:
|
|
toolCall, err := parseJSONToolCall(event, p.tools)
|
|
if err != nil {
|
|
slog.Warn("qwen tool call parsing failed", "error", err)
|
|
return "", "", nil, err
|
|
}
|
|
toolCalls = append(toolCalls, toolCall)
|
|
case qwenEventThinkingContent:
|
|
thinkingSb.WriteString(event.content)
|
|
case qwenEventContent:
|
|
// TODO(drifkin): if the same turn contains multiple interleaved content
|
|
// events, we naively append them together here.
|
|
contentSb.WriteString(event.content)
|
|
}
|
|
}
|
|
|
|
return contentSb.String(), thinkingSb.String(), toolCalls, nil
|
|
}
|
|
|
|
func (p *Qwen3VLParser) parseEvents() []qwenEvent {
|
|
var all []qwenEvent
|
|
|
|
keepLooping := true
|
|
for keepLooping {
|
|
var events []qwenEvent
|
|
events, keepLooping = p.eat()
|
|
if len(events) > 0 {
|
|
all = append(all, events...)
|
|
}
|
|
}
|
|
|
|
if len(all) > 0 {
|
|
slog.Log(context.TODO(), logutil.LevelTrace, "qwen events parsed", "events", all, "state", p.state, "buffer", p.buffer.String())
|
|
}
|
|
|
|
return all
|
|
}
|
|
|
|
func splitAtTag(p *Qwen3VLParser, tag string, trimAfter bool) (string, string) {
|
|
split := strings.SplitN(p.buffer.String(), tag, 2)
|
|
before := split[0]
|
|
before = strings.TrimRightFunc(before, unicode.IsSpace)
|
|
after := split[1]
|
|
if trimAfter {
|
|
after = strings.TrimLeftFunc(after, unicode.IsSpace)
|
|
}
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(after)
|
|
return before, after // return events
|
|
}
|
|
|
|
func (p *Qwen3VLParser) eatLeadingWhitespaceAndTransitionTo(nextState qwenParserState) ([]qwenEvent, bool) {
|
|
trimmed := strings.TrimLeftFunc(p.buffer.String(), unicode.IsSpace)
|
|
p.buffer.Reset()
|
|
if trimmed == "" {
|
|
return nil, false
|
|
}
|
|
p.state = nextState
|
|
p.buffer.WriteString(trimmed)
|
|
return nil, true
|
|
}
|
|
|
|
func (p *Qwen3VLParser) eat() ([]qwenEvent, bool) {
|
|
var events []qwenEvent
|
|
|
|
switch p.state {
|
|
case CollectingContent:
|
|
if strings.Contains(p.buffer.String(), toolOpenTag) {
|
|
// events = emitContentBeforeTag(p, events, toolOpenTag)
|
|
before, _ := splitAtTag(p, toolOpenTag, false)
|
|
if len(before) > 0 {
|
|
events = append(events, qwenEventContent{content: before})
|
|
}
|
|
p.state = CollectingToolContent
|
|
return events, true
|
|
} else if overlapLen := overlap(p.buffer.String(), toolOpenTag); overlapLen > 0 {
|
|
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
|
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
|
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
|
|
|
unambiguous := p.buffer.String()[:ambiguousStart]
|
|
ambiguous := p.buffer.String()[ambiguousStart:]
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(ambiguous)
|
|
if len(unambiguous) > 0 {
|
|
events = append(events, qwenEventContent{content: unambiguous})
|
|
}
|
|
return events, false
|
|
} else {
|
|
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
|
|
ambiguousStart := len(p.buffer.String()) - whitespaceLen
|
|
|
|
unambiguous := p.buffer.String()[:ambiguousStart]
|
|
ambiguous := p.buffer.String()[ambiguousStart:]
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(ambiguous)
|
|
if len(unambiguous) > 0 {
|
|
events = append(events, qwenEventContent{content: unambiguous})
|
|
}
|
|
return events, false
|
|
}
|
|
case CollectingToolContent:
|
|
if strings.Contains(p.buffer.String(), toolCloseTag) {
|
|
split := strings.SplitN(p.buffer.String(), toolCloseTag, 2)
|
|
before := split[0] // do we also need to do it to tool calls?
|
|
if len(before) == 0 {
|
|
slog.Warn("qwen tool call closing tag found but no content before it")
|
|
}
|
|
|
|
after := split[1]
|
|
events = append(events, qwenEventRawToolCall{raw: before})
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(after)
|
|
p.state = ToolCallDoneEatingWhitespace
|
|
return events, true
|
|
} else {
|
|
return events, false
|
|
}
|
|
case CollectingThinkingContent:
|
|
if strings.Contains(p.buffer.String(), thinkingCloseTag) {
|
|
thinking, remaining := splitAtTag(p, thinkingCloseTag, true)
|
|
if len(thinking) > 0 {
|
|
events = append(events, qwenEventThinkingContent{content: thinking})
|
|
}
|
|
if remaining == "" {
|
|
p.state = ThinkingDoneEatingWhitespace
|
|
} else {
|
|
p.state = CollectingContent
|
|
}
|
|
return events, true
|
|
} else if overlapLen := overlap(p.buffer.String(), thinkingCloseTag); overlapLen > 0 {
|
|
beforePartialTag := p.buffer.String()[:len(p.buffer.String())-overlapLen]
|
|
trailingWhitespaceLen := trailingWhitespaceLen(beforePartialTag)
|
|
ambiguousStart := len(beforePartialTag) - trailingWhitespaceLen
|
|
|
|
unambiguous := p.buffer.String()[:ambiguousStart]
|
|
ambiguous := p.buffer.String()[ambiguousStart:]
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(ambiguous)
|
|
if len(unambiguous) > 0 {
|
|
events = append(events, qwenEventThinkingContent{content: unambiguous})
|
|
}
|
|
return events, false
|
|
} else {
|
|
whitespaceLen := trailingWhitespaceLen(p.buffer.String())
|
|
ambiguousStart := len(p.buffer.String()) - whitespaceLen
|
|
|
|
unambiguous := p.buffer.String()[:ambiguousStart]
|
|
ambiguous := p.buffer.String()[ambiguousStart:]
|
|
p.buffer.Reset()
|
|
p.buffer.WriteString(ambiguous)
|
|
if len(unambiguous) > 0 {
|
|
events = append(events, qwenEventThinkingContent{content: unambiguous})
|
|
}
|
|
return events, false
|
|
}
|
|
case ThinkingDoneEatingWhitespace:
|
|
return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent)
|
|
case ToolCallDoneEatingWhitespace:
|
|
return p.eatLeadingWhitespaceAndTransitionTo(CollectingContent)
|
|
default:
|
|
panic("unreachable")
|
|
}
|
|
}
|
|
|
|
func parseJSONToolCall(raw qwenEventRawToolCall, tools []api.Tool) (api.ToolCall, error) {
|
|
var toolCallFunction api.ToolCallFunction
|
|
if err := json.Unmarshal([]byte(raw.raw), &toolCallFunction); err != nil {
|
|
return api.ToolCall{}, err
|
|
}
|
|
|
|
toolCall := api.ToolCall{}
|
|
toolCall.Function = toolCallFunction
|
|
|
|
return toolCall, nil
|
|
}
|