sample: improve ollama engine sampler performance (#9374)

This change brings in various interface cleanups and greatly improves the performance of the sampler.

Tested with llama3.2 on local machine.
Improves performance from ~ 70 tokens/s -> 135 tokens/s with topK(40) enabled.
Without topK performance is ~ 110 tokens/s
This commit is contained in:
Parth Sareen
2025-03-07 12:37:48 -08:00
committed by GitHub
parent 1f6986e919
commit 0682dae027
7 changed files with 572 additions and 331 deletions

View File

@@ -589,11 +589,19 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
sampler := sample.NewSampler(
req.Temperature,
req.TopK,
req.TopP,
req.MinP,
req.Seed,
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
numKeep: int32(req.NumKeep),
sampler: sample.Greedy(), // TODO: add support for different samplers when performance is optimized
sampler: sampler,
embedding: false,
})
if err != nil {