add thinking support to the api and cli (#10584)
- Both `/api/generate` and `/api/chat` now accept a `"think"` option that allows specifying whether thinking mode should be on or not — see the client sketch after this list
- Templates get passed this new option so, e.g., qwen3's template can put `/think` or `/no_think` in the system prompt depending on the value of the setting
- Models' thinking support is inferred by inspecting model templates. The prefix and suffix the parser uses to identify thinking support are also automatically inferred from templates
- Thinking control & parsing is opt-in via the API to prevent breaking existing API consumers. If the `"think"` option is not specified, the behavior is unchanged from previous versions of ollama
- Add parsing for thinking blocks in both streaming/non-streaming mode in both `/generate` and `/chat`
- Update the CLI to make use of these changes. Users can pass `--think` or `--think=false` to control thinking, or during an interactive session they can use the commands `/set think` or `/set nothink`
- A `--hidethinking` option has also been added to the CLI. This makes it easy to use thinking in scripting scenarios like `ollama run qwen3 --think --hidethinking "my question here"` where you just want to see the answer but still want the benefits of thinking models
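The diff below covers the server-side handling in GenerateHandler and ChatHandler. For orientation, here is a minimal client sketch of the opt-in behavior described above, assuming only what this commit states: a boolean "think" field on the request, and per-chunk thinking/response fields in the streamed output. The JSON key names are an assumption based on the Thinking/Response fields set in the handlers below; the endpoint and model name are illustrative.

package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Sketch of a /api/generate request with thinking enabled.
	body, err := json.Marshal(map[string]any{
		"model":  "qwen3",
		"prompt": "why is the sky blue?",
		"think":  true, // opt-in; leaving it unset preserves the old behavior
	})
	if err != nil {
		panic(err)
	}
	resp, err := http.Post("http://localhost:11434/api/generate", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Streaming responses arrive as one JSON object per line; with thinking
	// enabled, each chunk may carry thinking text, response text, or both.
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var chunk struct {
			Thinking string `json:"thinking"`
			Response string `json:"response"`
			Done     bool   `json:"done"`
		}
		if err := json.Unmarshal(scanner.Bytes(), &chunk); err != nil {
			panic(err)
		}
		fmt.Print(chunk.Thinking) // reasoning trace, split out by the server
		fmt.Print(chunk.Response) // answer content
		if chunk.Done {
			break
		}
	}
}

Omitting "think" entirely (rather than sending false) is what keeps existing consumers on the unchanged code path.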
@@ -17,7 +17,6 @@ import (
 	"net/netip"
 	"os"
 	"os/signal"
-	"regexp"
 	"slices"
 	"strings"
 	"syscall"
@@ -186,6 +185,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+		// TODO(drifkin): consider adding a warning if it's false and the model
+		// doesn't support thinking. It's not strictly required, but it can be a
+		// hint that the user is on an older qwen3/r1 model that doesn't have an
+		// updated template supporting thinking
+	}
 
 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
@@ -254,6 +260,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 	}
 
+	values.Think = req.Think != nil && *req.Think
+	values.IsThinkSet = req.Think != nil
+
 	var b bytes.Buffer
 	if req.Context != nil {
 		slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
@@ -273,6 +282,15 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	ch := make(chan any)
 	go func() {
 		// TODO (jmorganca): avoid building the response twice both here and below
@@ -297,6 +315,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinking, content := thinkingState.addContent(cr.Content)
+			res.Thinking = thinking
+			res.Response = content
+		}
+
 		if _, err := sb.WriteString(cr.Content); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
@@ -324,11 +348,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 
 	if req.Stream != nil && !*req.Stream {
 		var r api.GenerateResponse
-		var sb strings.Builder
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.GenerateResponse:
-				sb.WriteString(t.Response)
+				sbThinking.WriteString(t.Thinking)
+				sbContent.WriteString(t.Response)
 				r = t
 			case gin.H:
 				msg, ok := t["error"].(string)
@@ -344,7 +370,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}
 
-		r.Response = sb.String()
+		r.Thinking = sbThinking.String()
+		r.Response = sbContent.String()
+
 		c.JSON(http.StatusOK, r)
 		return
 	}
@@ -1436,6 +1464,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+	}
 
 	name := model.ParseName(req.Model)
 	if !name.IsValid() {
@@ -1476,13 +1507,22 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 	msgs = filterThinkTags(msgs, m)
 
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	var toolParser *tools.Parser
 	if len(req.Tools) > 0 {
 		toolParser, err = tools.NewParser(m.Template.Template)
@@ -1516,6 +1556,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content)
+			if thinkingContent == "" && remainingContent == "" && !r.Done {
+				// need to accumulate more to decide what to send
+				return
+			}
+			res.Message.Content = remainingContent
+			res.Message.Thinking = thinkingContent
+		}
+
 		if r.Done {
 			res.DoneReason = r.DoneReason.String()
 			res.TotalDuration = time.Since(checkpointStart)
@@ -1523,12 +1573,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 
 		if len(req.Tools) > 0 {
-			toolCalls, content := toolParser.Add(r.Content)
+			toolCalls, content := toolParser.Add(res.Message.Content)
 			if len(content) > 0 {
 				res.Message.Content = content
 			} else if len(toolCalls) > 0 {
 				res.Message.ToolCalls = toolCalls
 				res.Message.Content = ""
+			} else if res.Message.Thinking != "" {
+				// don't return
 			} else {
 				if r.Done {
 					ch <- res
@@ -1536,6 +1588,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				return
 			}
 		}
+
 		ch <- res
 	}); err != nil {
 		ch <- gin.H{"error": err.Error()}
@@ -1544,12 +1597,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 
 	if req.Stream != nil && !*req.Stream {
 		var resp api.ChatResponse
-		var sb strings.Builder
 		var toolCalls []api.ToolCall
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.ChatResponse:
-				sb.WriteString(t.Message.Content)
+				sbThinking.WriteString(t.Message.Thinking)
+				sbContent.WriteString(t.Message.Content)
 				resp = t
 				if len(req.Tools) > 0 {
 					toolCalls = append(toolCalls, t.Message.ToolCalls...)
@@ -1568,7 +1623,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}
 		}
 
-		resp.Message.Content = sb.String()
+		resp.Message.Content = sbContent.String()
+		resp.Message.Thinking = sbThinking.String()
+
 		if len(toolCalls) > 0 {
 			resp.Message.ToolCalls = toolCalls
 		}
@@ -1595,8 +1652,6 @@ func handleScheduleError(c *gin.Context, name string, err error) {
 	}
 }
 
-var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`)
-
 func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
 		finalUserIndex := -1
@@ -1608,7 +1663,17 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 
 		for i, msg := range msgs {
 			if msg.Role == "assistant" && i < finalUserIndex {
-				msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
+				// TODO(drifkin): this is from before we added proper thinking support.
+				// However, even if thinking is not enabled (and therefore we shouldn't
+				// change the user output), we should probably perform this filtering
+				// for all thinking models (not just qwen3 & deepseek-r1) since it tends
+				// to save tokens and improve quality.
+				thinkingState := &thinkingParser{
+					openingTag: "<think>",
+					closingTag: "</think>",
+				}
+				_, content := thinkingState.addContent(msg.Content)
+				msgs[i].Content = content
 			}
 		}
 	}
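thinkingParser and inferThinkingTags are defined elsewhere in this commit and are not shown in this file. As a rough sketch of the contract the handlers above rely on — addContent buffers streamed text and splits it into a thinking part and a content part, returning two empty strings while it still needs more input — something like the following; the type and field names here are placeholders, not the actual implementation, and the real parser handles more edge cases (e.g. partial tags at chunk boundaries in non-thinking text):

package main

import (
	"fmt"
	"strings"
)

// sketchParser is an illustrative stand-in for the commit's thinkingParser.
type sketchParser struct {
	openingTag string
	closingTag string
	buf        string // unconsumed text carried across chunks
	opened     bool   // opening tag has been seen
	inThinking bool   // currently inside the thinking block
}

// addContent splits streamed text into (thinking, content), buffering across
// chunk boundaries so a tag split over two chunks is still matched.
func (p *sketchParser) addContent(s string) (thinking, content string) {
	p.buf += s
	if !p.opened {
		if i := strings.Index(p.buf, p.openingTag); i >= 0 {
			content = p.buf[:i]
			p.buf = p.buf[i+len(p.openingTag):]
			p.opened, p.inThinking = true, true
		} else {
			// might still be a partial opening tag; keep buffering
			return "", ""
		}
	}
	if p.inThinking {
		if i := strings.Index(p.buf, p.closingTag); i >= 0 {
			thinking = p.buf[:i]
			p.buf = p.buf[i+len(p.closingTag):]
			p.inThinking = false
		} else {
			thinking = p.buf
			p.buf = ""
			return thinking, content
		}
	}
	content += p.buf
	p.buf = ""
	return thinking, content
}

func main() {
	p := &sketchParser{openingTag: "<think>", closingTag: "</think>"}
	for _, chunk := range []string{"<think>let me rea", "son</think>the sky is blue"} {
		th, c := p.addContent(chunk)
		fmt.Printf("thinking=%q content=%q\n", th, c)
	}
}

The `thinkingContent == "" && remainingContent == "" && !r.Done` early return in ChatHandler corresponds to the buffering case: the parser has not yet seen enough text to decide whether it is inside the thinking block.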