add thinking support to the api and cli (#10584)

- Both `/api/generate` and `/api/chat` now accept a `"think"`
  option that controls whether thinking mode is enabled (see the
  example sketched after this list)
- Templates get passed this new option so, e.g., qwen3's template can
  put `/think` or `/no_think` in the system prompt depending on the
  value of the setting
- Models' thinking support is inferred by inspecting model templates.
  The prefix and suffix the parser uses to identify thinking output
  are also inferred automatically from templates
- Thinking control & parsing is opt-in via the API to prevent breaking
  existing API consumers. If the `"think"` option is not specified, the
  behavior is unchanged from previous versions of ollama
- Add parsing for thinking blocks in both streaming and non-streaming
  modes in both `/generate` and `/chat`
- Update the CLI to make use of these changes. Users can pass `--think`
  or `--think=false` to control thinking, or during an interactive
  session they can use the commands `/set think` or `/set nothink`
- A `--hidethinking` option has also been added to the CLI. This makes
  it easy to use thinking in scripting scenarios like
  `ollama run qwen3 --think --hidethinking "my question here"` where you
  just want to see the answer but still want the benefits of thinking
  models
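
A minimal sketch of opting in from Go, assuming the `api` package gains a
`Think *bool` field on its request types and a `Thinking` field on `Message`,
as the server changes below imply (field names inferred from the diff, not
shown in it):

```go
package main

import (
	"context"
	"fmt"
	"log"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	think := true // leaving Think nil preserves the old, unparsed behavior
	req := &api.ChatRequest{
		Model:    "qwen3",
		Messages: []api.Message{{Role: "user", Content: "my question here"}},
		Think:    &think,
	}

	// With thinking enabled, each streamed response splits the model output
	// into separate Thinking and Content fields.
	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		fmt.Print(resp.Message.Thinking)
		fmt.Print(resp.Message.Content)
		return nil
	})
	if err != nil {
		log.Fatal(err)
	}
}
```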
Author: Devon Rifkin
Date: 2025-05-28 19:38:52 -07:00
Committed by: GitHub
Parent: aa25aff10d
Commit: 5f57b0ef42
17 changed files with 1195 additions and 49 deletions

server/routes.go

@@ -17,7 +17,6 @@ import (
"net/netip"
"os"
"os/signal"
"regexp"
"slices"
"strings"
"syscall"
@@ -186,6 +185,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+		// TODO(drifkin): consider adding a warning if it's false and the model
+		// doesn't support thinking. It's not strictly required, but it can be a
+		// hint that the user is on an older qwen3/r1 model that doesn't have an
+		// updated template supporting thinking
+	}
 
 	r, m, opts, err := s.scheduleRunner(c.Request.Context(), name.String(), caps, req.Options, req.KeepAlive)
 	if errors.Is(err, errCapabilityCompletion) {
@@ -254,6 +260,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
 		}
 
+		values.Think = req.Think != nil && *req.Think
+		values.IsThinkSet = req.Think != nil
+
 		var b bytes.Buffer
 		if req.Context != nil {
 			slog.Warn("the context field is deprecated and will be removed in a future version of Ollama")
@@ -273,6 +282,15 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		prompt = b.String()
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	ch := make(chan any)
 	go func() {
 		// TODO (jmorganca): avoid building the response twice both here and below
@@ -297,6 +315,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinking, content := thinkingState.addContent(cr.Content)
+			res.Thinking = thinking
+			res.Response = content
+		}
+
 		if _, err := sb.WriteString(cr.Content); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
@@ -324,11 +348,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Stream != nil && !*req.Stream {
 		var r api.GenerateResponse
-		var sb strings.Builder
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.GenerateResponse:
-				sb.WriteString(t.Response)
+				sbThinking.WriteString(t.Thinking)
+				sbContent.WriteString(t.Response)
 				r = t
 			case gin.H:
 				msg, ok := t["error"].(string)
@@ -344,7 +370,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 			}
 		}
 
-		r.Response = sb.String()
+		r.Thinking = sbThinking.String()
+		r.Response = sbContent.String()
+
 		c.JSON(http.StatusOK, r)
 		return
 	}
@@ -1436,6 +1464,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if len(req.Tools) > 0 {
 		caps = append(caps, model.CapabilityTools)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+	}
 
 	name := model.ParseName(req.Model)
 	if !name.IsValid() {
@@ -1476,13 +1507,22 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	}
 
 	msgs = filterThinkTags(msgs, m)
 
-	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools)
+	prompt, images, err := chatPrompt(c.Request.Context(), m, r.Tokenize, opts, msgs, req.Tools, req.Think)
 	if err != nil {
 		slog.Error("chat prompt error", "error", err)
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
 
+	var thinkingState *thinkingParser
+	openingTag, closingTag := inferThinkingTags(m.Template.Template)
+	if req.Think != nil && *req.Think && openingTag != "" && closingTag != "" {
+		thinkingState = &thinkingParser{
+			openingTag: openingTag,
+			closingTag: closingTag,
+		}
+	}
+
 	var toolParser *tools.Parser
 	if len(req.Tools) > 0 {
 		toolParser, err = tools.NewParser(m.Template.Template)
@@ -1516,6 +1556,16 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			},
 		}
 
+		if thinkingState != nil {
+			thinkingContent, remainingContent := thinkingState.addContent(res.Message.Content)
+			if thinkingContent == "" && remainingContent == "" && !r.Done {
+				// need to accumulate more to decide what to send
+				return
+			}
+			res.Message.Content = remainingContent
+			res.Message.Thinking = thinkingContent
+		}
+
 		if r.Done {
 			res.DoneReason = r.DoneReason.String()
 			res.TotalDuration = time.Since(checkpointStart)
@@ -1523,12 +1573,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 		}
 
 		if len(req.Tools) > 0 {
-			toolCalls, content := toolParser.Add(r.Content)
+			toolCalls, content := toolParser.Add(res.Message.Content)
 			if len(content) > 0 {
 				res.Message.Content = content
 			} else if len(toolCalls) > 0 {
 				res.Message.ToolCalls = toolCalls
 				res.Message.Content = ""
+			} else if res.Message.Thinking != "" {
+				// don't return
 			} else {
 				if r.Done {
 					ch <- res
@@ -1536,6 +1588,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
 				return
 			}
 		}
+
 		ch <- res
 	}); err != nil {
 		ch <- gin.H{"error": err.Error()}
@@ -1544,12 +1597,14 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	if req.Stream != nil && !*req.Stream {
 		var resp api.ChatResponse
-		var sb strings.Builder
 		var toolCalls []api.ToolCall
+		var sbThinking strings.Builder
+		var sbContent strings.Builder
 		for rr := range ch {
 			switch t := rr.(type) {
 			case api.ChatResponse:
-				sb.WriteString(t.Message.Content)
+				sbThinking.WriteString(t.Message.Thinking)
+				sbContent.WriteString(t.Message.Content)
 				resp = t
 				if len(req.Tools) > 0 {
 					toolCalls = append(toolCalls, t.Message.ToolCalls...)
@@ -1568,7 +1623,9 @@ func (s *Server) ChatHandler(c *gin.Context) {
 			}
 		}
 
-		resp.Message.Content = sb.String()
+		resp.Message.Content = sbContent.String()
+		resp.Message.Thinking = sbThinking.String()
+
 		if len(toolCalls) > 0 {
 			resp.Message.ToolCalls = toolCalls
 		}
@@ -1595,8 +1652,6 @@ func handleScheduleError(c *gin.Context, name string, err error) {
 	}
 }
 
-var thinkTagRegexp = regexp.MustCompile(`<think>(?s).*?</think>(\n)*`)
-
 func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 	if m.Config.ModelFamily == "qwen3" || model.ParseName(m.Name).Model == "deepseek-r1" {
 		finalUserIndex := -1
@@ -1608,7 +1663,17 @@ func filterThinkTags(msgs []api.Message, m *Model) []api.Message {
 		for i, msg := range msgs {
 			if msg.Role == "assistant" && i < finalUserIndex {
-				msgs[i].Content = thinkTagRegexp.ReplaceAllString(msg.Content, "")
+				// TODO(drifkin): this is from before we added proper thinking support.
+				// However, even if thinking is not enabled (and therefore we shouldn't
+				// change the user output), we should probably perform this filtering
+				// for all thinking models (not just qwen3 & deepseek-r1) since it tends
+				// to save tokens and improve quality.
+				thinkingState := &thinkingParser{
+					openingTag: "<think>",
+					closingTag: "</think>",
+				}
+				_, content := thinkingState.addContent(msg.Content)
+				msgs[i].Content = content
 			}
 		}
 	}
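
The diff above calls `thinkingState.addContent` and `inferThinkingTags` without
showing their definitions (they live in other files in this change). Below is a
hypothetical sketch of the contract `addContent` appears to satisfy: it buffers
streamed chunks, strips one opening/closing tag pair, and splits each chunk into
a thinking part and a content part, returning two empty strings while a chunk
ends mid-tag (mirroring the "need to accumulate more" case). Names such as
`thinkingSketch` and `overlap` are illustrative; the real `thinkingParser` may
differ.

```go
package main

import (
	"fmt"
	"strings"
)

// thinkingSketch is a hypothetical stand-in for the thinkingParser used above.
type thinkingSketch struct {
	openingTag string
	closingTag string
	buf        strings.Builder
	inThinking bool // currently inside the thinking block
	done       bool // closing tag seen, or no thinking block present
}

// addContent consumes one streamed chunk and returns the portions belonging to
// the thinking block and to the visible response, respectively.
func (p *thinkingSketch) addContent(chunk string) (thinking, content string) {
	p.buf.WriteString(chunk)
	s := p.buf.String()

	if p.done {
		p.buf.Reset()
		return "", s
	}

	if !p.inThinking {
		trimmed := strings.TrimLeft(s, " \t\n")
		switch {
		case strings.HasPrefix(trimmed, p.openingTag):
			// thinking block begins: drop the tag and switch states
			p.inThinking = true
			p.buf.Reset()
			p.buf.WriteString(trimmed[len(p.openingTag):])
		case strings.HasPrefix(p.openingTag, trimmed):
			// could still turn into an opening tag; wait for more input
			return "", ""
		default:
			// no thinking block: everything is plain content
			p.done = true
			p.buf.Reset()
			return "", s
		}
	}

	s = p.buf.String()
	if i := strings.Index(s, p.closingTag); i >= 0 {
		// thinking block ends: whatever follows the tag is content
		p.done = true
		p.buf.Reset()
		return s[:i], s[i+len(p.closingTag):]
	}

	// hold back a possible partial closing tag at the end of the buffer
	keep := overlap(s, p.closingTag)
	p.buf.Reset()
	p.buf.WriteString(s[len(s)-keep:])
	return s[:len(s)-keep], ""
}

// overlap reports the length of the longest suffix of s that is a prefix of tag.
func overlap(s, tag string) int {
	n := len(tag)
	if len(s) < n {
		n = len(s)
	}
	for ; n > 0; n-- {
		if strings.HasSuffix(s, tag[:n]) {
			return n
		}
	}
	return 0
}

func main() {
	p := &thinkingSketch{openingTag: "<think>", closingTag: "</think>"}
	for _, chunk := range []string{"<think>pondering...", "</thi", "nk>the answer is 4"} {
		thinking, content := p.addContent(chunk)
		fmt.Printf("thinking=%q content=%q\n", thinking, content)
	}
}
```

The middle chunk ends in a split `</think>` tag, so the sketch emits nothing for
it and only releases "the answer is 4" as content once the tag completes, which
is the same buffering behavior the streaming handlers above rely on.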