add thinking support to the api and cli (#10584)

- Both `/api/generate` and `/api/chat` now accept a `"think"`
  option that controls whether thinking mode is enabled (see the
  example sketched after this list)
- Templates get passed this new option so, e.g., qwen3's template can
  put `/think` or `/no_think` in the system prompt depending on the
  value of the setting
- Models' thinking support is inferred by inspecting model templates.
  The prefix and suffix the parser uses to identify thinking output
  are also inferred automatically from templates
- Thinking control & parsing is opt-in via the API to prevent breaking
  existing API consumers. If the `"think"` option is not specified, the
  behavior is unchanged from previous versions of ollama
- Add parsing for thinking blocks in both streaming and non-streaming
  modes in both `/generate` and `/chat`
- Update the CLI to make use of these changes. Users can pass `--think`
  or `--think=false` to control thinking, or during an interactive
  session they can use the commands `/set think` or `/set nothink`
- A `--hidethinking` option has also been added to the CLI. This makes
  it easy to use thinking in scripting scenarios like
  `ollama run qwen3 --think --hidethinking "my question here"` where you
  just want to see the answer but still want the benefits of thinking
  models
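
A minimal sketch of opting in from Go, assuming the `api` package gains a
`Think *bool` field on its request types and a `Thinking` field on `Message`,
as the server changes below imply (field names inferred from the diff, not
shown in it):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	think := true // leaving Think nil preserves the old, unparsed behavior
	req := &api.ChatRequest{
		Model:    "qwen3",
		Messages: []api.Message{{Role: "user", Content: "my question here"}},
		Think:    &think,
	}

	// With thinking enabled, each streamed response splits the model output
	// into separate Thinking and Content fields.
	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		fmt.Print(resp.Message.Thinking)
		fmt.Print(resp.Message.Content)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```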
Author: Devon Rifkin
Date: 2025-05-28 19:38:52 -07:00
Committed by: GitHub
Parent: aa25aff10d
Commit: 5f57b0ef42
17 changed files with 1195 additions and 49 deletions

server/routes.go

@@ -17,7 +17,6 @@ import (
"net/netip"
"os"
"os/signal"
"regexp"
"slices"
"strings"
"syscall"
@@ -186,6 +185,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+		// TODO(drifkin): consider adding a warning if it's false and the model
+		// doesn't support thinking. It's not strictly required, but it can be a
+		// hint that the user is on an older qwen3/r1 model that doesn't have an
+		// updated template supporting thinking
+	}
 
 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
@@ -254,6 +260,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}
 
+		values.Think = req.Think != nil && *req.Think
+		values.IsThinkSet = req.Think != nil
+
 		var b bytes.Buffer
 		if req.Context != nil {
 			slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
@@ -273,6 +282,15 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	ch := make(chan any)
 	go func() {
 		// TODO (jmorganca): avoid building the response twice both here and below
@@ -297,6 +315,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinking, content := thinkingState.addContent(cr.Content)
+			res.Thinking = thinking
+			res.Response = content
+		}
+
 		if _, err := sb.WriteString(cr.Content); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
@@ -324,11 +348,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Stream != nil && !*req.Stream {
 		var r api.GenerateResponse
-		var sb strings.Builder
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.GenerateResponse:
-				sb.WriteString(t.Response)
+				sbThinking.WriteString(t.Thinking)
+				sbContent.WriteString(t.Response)
 				r = t
 			case gin.H:
 				msg, ok := t["error"].(string)
@@ -344,7 +370,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}
 
-		r.Response = sb.String()
+		r.Thinking = sbThinking.String()
+		r.Response = sbContent.String()
+
 		c.JSON(http.StatusOK, r)
 		return
 	}
@@ -1436,6 +1464,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+	}
 
 	name := model.ParseName(req.Model)
 	if !name.IsValid() {
@@ -1476,13 +1507,22 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 
 	msgs = filterThinkTags(msgs, m)
 
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	var toolParser *tools.Parser
 	if len(req.Tools) > 0 {
 		toolParser, err = tools.NewParser(m.Template.Template)
@@ -1516,6 +1556,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content)
+			if thinkingContent == "" && remainingContent == "" && !r.Done {
+				// need to accumulate more to decide what to send
+				return
+			}
+			res.Message.Content = remainingContent
+			res.Message.Thinking = thinkingContent
+		}
+
 		if r.Done {
 			res.DoneReason = r.DoneReason.String()
 			res.TotalDuration = time.Since(checkpointStart)
@@ -1523,12 +1573,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 
 		if len(req.Tools) > 0 {
-			toolCalls, content := toolParser.Add(r.Content)
+			toolCalls, content := toolParser.Add(res.Message.Content)
 			if len(content) > 0 {
 				res.Message.Content = content
 			} else if len(toolCalls) > 0 {
 				res.Message.ToolCalls = toolCalls
 				res.Message.Content = ""
+			} else if res.Message.Thinking != "" {
+				// don't return
 			} else {
 				if r.Done {
 					ch <- res
@@ -1536,6 +1588,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				return
 			}
 		}
+
 		ch <- res
 	}); err != nil {
 		ch <- gin.H{"error": err.Error()}
@@ -1544,12 +1597,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if req.Stream != nil && !*req.Stream {
 		var resp api.ChatResponse
-		var sb strings.Builder
 		var toolCalls []api.ToolCall
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.ChatResponse:
-				sb.WriteString(t.Message.Content)
+				sbThinking.WriteString(t.Message.Thinking)
+				sbContent.WriteString(t.Message.Content)
 				resp = t
 				if len(req.Tools) > 0 {
 					toolCalls = append(toolCalls, t.Message.ToolCalls...)
@@ -1568,7 +1623,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}
 		}
 
-		resp.Message.Content = sb.String()
+		resp.Message.Content = sbContent.String()
+		resp.Message.Thinking = sbThinking.String()
+
 		if len(toolCalls) > 0 {
 			resp.Message.ToolCalls = toolCalls
 		}
@@ -1595,8 +1652,6 @@ func handleScheduleError(c *gin.Context, name string, err error) {
 	}
 }
 
-var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`)
-
 func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
 		finalUserIndex := -1
@@ -1608,7 +1663,17 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 		for i, msg := range msgs {
 			if msg.Role == "assistant" && i < finalUserIndex {
-				msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
+				// TODO(drifkin): this is from before we added proper thinking support.
+				// However, even if thinking is not enabled (and therefore we shouldn't
+				// change the user output), we should probably perform this filtering
+				// for all thinking models (not just qwen3 & deepseek-r1) since it tends
+				// to save tokens and improve quality.
+				thinkingState := &thinkingParser{
+					openingTag: "<think>",
+					closingTag: "</think>",
+				}
+				_, content := thinkingState.addContent(msg.Content)
+				msgs[i].Content = content
 			}
 		}
 	}
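
The diff above calls `thinkingState.addContent` and `inferThinkingTags` without
showing their definitions (they live in other files in this change). Below is a
hypothetical sketch of the contract `addContent` appears to satisfy: it buffers
streamed chunks, strips one opening/closing tag pair, and splits each chunk into
a thinking part and a content part, returning two empty strings while a chunk
ends mid-tag (mirroring the "need to accumulate more" case). Names such as
`thinkingSketch` and `overlap` are illustrative; the real `thinkingParser` may
differ.

```go
package main

import (
	"fmt"
	"strings"
)

// thinkingSketch is a hypothetical stand-in for the thinkingParser used above.
type thinkingSketch struct {
	openingTag string
	closingTag string
	buf        strings.Builder
	inThinking bool // currently inside the thinking block
	done       bool // closing tag seen, or no thinking block present
}

// addContent consumes one streamed chunk and returns the portions belonging to
// the thinking block and to the visible response, respectively.
func (p *thinkingSketch) addContent(chunk string) (thinking, content string) {
	p.buf.WriteString(chunk)
	s := p.buf.String()

	if p.done {
		p.buf.Reset()
		return "", s
	}

	if !p.inThinking {
		trimmed := strings.TrimLeft(s, " \t\n")
		switch {
		case strings.HasPrefix(trimmed, p.openingTag):
			// thinking block begins: drop the tag and switch states
			p.inThinking = true
			p.buf.Reset()
			p.buf.WriteString(trimmed[len(p.openingTag):])
		case strings.HasPrefix(p.openingTag, trimmed):
			// could still turn into an opening tag; wait for more input
			return "", ""
		default:
			// no thinking block: everything is plain content
			p.done = true
			p.buf.Reset()
			return "", s
		}
	}

	s = p.buf.String()
	if i := strings.Index(s, p.closingTag); i >= 0 {
		// thinking block ends: whatever follows the tag is content
		p.done = true
		p.buf.Reset()
		return s[:i], s[i+len(p.closingTag):]
	}

	// hold back a possible partial closing tag at the end of the buffer
	keep := overlap(s, p.closingTag)
	p.buf.Reset()
	p.buf.WriteString(s[len(s)-keep:])
	return s[:len(s)-keep], ""
}

// overlap reports the length of the longest suffix of s that is a prefix of tag.
func overlap(s, tag string) int {
	n := len(tag)
	if len(s) < n {
		n = len(s)
	}
	for ; n > 0; n-- {
		if strings.HasSuffix(s, tag[:n]) {
			return n
		}
	}
	return 0
}

func main() {
	p := &thinkingSketch{openingTag: "<think>", closingTag: "</think>"}
	for _, chunk := range []string{"<think>pondering...", "</thi", "nk>the answer is 4"} {
		thinking, content := p.addContent(chunk)
		fmt.Printf("thinking=%q content=%q\n", thinking, content)
	}
}
```

The middle chunk ends in a split `</think>` tag, so the sketch emits nothing for
it and only releases "the answer is 4" as content once the tag completes, which
is the same buffering behavior the streaming handlers above rely on.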