sample: improve ollama engine sampler performance (#9374)

This change brings in various interface cleanups and greatly improves the performance of the sampler.

Tested with llama3.2 on local machine.
Improves performance from ~ 70 tokens/s -> 135 tokens/s with topK(40) enabled.
Without topK performance is ~ 110 tokens/s
This commit is contained in:
Parth Sareen
2025-03-07 12:37:48 -08:00
committed by GitHub
parent 1f6986e919
commit 0682dae027
7 changed files with 572 additions and 331 deletions

View File

@@ -589,11 +589,19 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
sampler := sample.NewSampler(
req.Temperature,
req.TopK,
req.TopP,
req.MinP,
req.Seed,
)
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
numKeep: int32(req.NumKeep),
sampler: sample.Greedy(), // TODO: add support for different samplers when performance is optimized
sampler: sampler,
embedding: false,
})
if err != nil {