sample: temporarily use grammars for constrained generation in new engine (#9586)

2025-12-10 15:57:04 +00:00 · 2025-03-10 16:17:39 +01:00
parent a1cda80bcb
commit e093db92c4
10 changed files with 301 additions and 213 deletions
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -254,6 +254,12 @@ type Server struct {
 	// multimodalHash generates hashes for comparing equality
 	// of non-text data
 	multimodalHash maphash.Hash
+
+	// vocab is a llama.cpp vocab required for gammar-based
+	// constrained generation (json mode, structured outputs)
+	// TODO: this is temporary until Ollama sampling supports
+	// constrained generation
+	vocab *sample.Vocab
 }

 func (s *Server) allNil() bool {
@@ -574,18 +580,25 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}

+	var grammar *sample.Grammar
+	var err error
+	if req.Grammar != "" {
+		grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
+		if err != nil {
+			http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
+			return
+		}
+	}
+
 	sampler := sample.NewSampler(
 		req.Temperature,
 		req.TopK,
 		req.TopP,
 		req.MinP,
 		req.Seed,
+		grammar,
 	)

-	if req.Grammar != "" {
-		panic("grammars are not yet supported")
-	}
-
 	seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
 		numPredict: req.NumPredict,
 		stop:       req.Stop,
@@ -797,6 +810,8 @@ func (s *Server) loadModel(
 		panic(err)
 	}

+	s.vocab = sample.NewVocab(mpath)
+
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")