llm: set done reason at server level (#9830)
No functional change. Many different done reasons can be set at the runner level, so rather than obscuring them, we should return them to the server process and let it choose what to do with the done reason. This separates the API concerns from the runner.
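The hunks below replace string done reasons with an llm.DoneReason type carrying DoneReasonStop, DoneReasonLength, and DoneReasonConnectionClosed. The type's definition is not part of this diff; the following is a minimal sketch of what it could look like, assuming a plain integer enum (names beyond those used in the hunks are not taken from this commit):

package llm

// DoneReason is a typed completion reason reported by the runner; a sketch,
// assuming an integer enum rather than the previous bare strings.
type DoneReason int

const (
	// DoneReasonStop: generation ended normally (stop sequence, end of stream).
	DoneReasonStop DoneReason = iota
	// DoneReasonLength: the num_predict limit was reached.
	DoneReasonLength
	// DoneReasonConnectionClosed: the client connection went away mid-generation.
	DoneReasonConnectionClosed
)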
@@ -83,7 +83,7 @@ type Sequence struct {
 	// true if an embedding are to be returned instead of text generation
 	embeddingOnly bool
 
-	doneReason string
+	doneReason llm.DoneReason
 
 	// Metrics
 	startProcessingTime time.Time
@@ -301,7 +301,7 @@ func flushPending(seq *Sequence) bool {
 	}
 }
 
-func (s *Server) removeSequence(seqIndex int, reason string) {
+func (s *Server) removeSequence(seqIndex int, reason llm.DoneReason) {
 	seq := s.seqs[seqIndex]
 
 	flushPending(seq)
@@ -380,7 +380,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 
 		// if past the num predict limit
 		if seq.numPredict > 0 && seq.numPredicted >= seq.numPredict {
-			s.removeSequence(seqIdx, "limit")
+			s.removeSequence(seqIdx, llm.DoneReasonLength)
			continue
		}
 
@@ -482,7 +482,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}
 
 			seq.embedding <- embed
-			s.removeSequence(i, "")
+			s.removeSequence(i, llm.DoneReasonStop)
 			continue
 		}
 
@@ -499,7 +499,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			// as it's important for the /api/generate context
 			// seq.responses <- piece
 
-			s.removeSequence(i, "stop")
+			s.removeSequence(i, llm.DoneReasonStop)
 			continue
 		}
 
@@ -530,7 +530,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 			}
 			seq.cache.Inputs = seq.cache.Inputs[:tokenLen]
 
-			s.removeSequence(i, "stop")
+			s.removeSequence(i, llm.DoneReasonStop)
 			continue
 		}
 
@@ -543,7 +543,7 @@ func (s *Server) processBatch(tokenBatch *llama.Batch, embedBatch *llama.Batch)
 		}
 
 		if !flushPending(seq) {
-			s.removeSequence(i, "connection")
+			s.removeSequence(i, llm.DoneReasonConnectionClosed)
 		}
 	}
 
@@ -657,14 +657,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 
 				flusher.Flush()
 			} else {
 				// Send the final response
-				doneReason := "stop"
-				if seq.doneReason == "limit" {
-					doneReason = "length"
-				}
 				if err := json.NewEncoder(w).Encode(&llm.CompletionResponse{
 					Done:               true,
-					DoneReason:         doneReason,
+					DoneReason:         seq.doneReason,
 					PromptEvalCount:    seq.numPromptInputs,
 					PromptEvalDuration: seq.startGenerationTime.Sub(seq.startProcessingTime),
 					EvalCount:          seq.numDecoded,
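Per the commit message, the runner now only reports the typed reason and the server process decides how to present it. That server-side choice is not part of this diff; the function below is an assumed sketch, for illustration only, of how the server might translate the runner's DoneReason into the done_reason string exposed by the public API:

// apiDoneReason sketches how the server process could map the runner's typed
// reason onto the strings the HTTP API has historically returned.
func apiDoneReason(r llm.DoneReason) string {
	switch r {
	case llm.DoneReasonLength:
		return "length" // num_predict limit reached
	case llm.DoneReasonConnectionClosed:
		return "" // client disconnected; nothing meaningful to report
	default:
		return "stop"
	}
}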