llama: remove model loading for grammar (#10096)

Author: Parth Sareen
Date: 2025-04-24 11:51:19 -07:00
Committed by: GitHub
Parent: 40b10eee6d
Commit: a53d744b01
13 changed files with 521 additions and 107 deletions

```diff
@@ -298,12 +298,6 @@ type Server struct {
 	// multimodalHash generates hashes for comparing equality
 	// of non-text data
 	multimodalHash maphash.Hash
-
-	// vocab is a llama.cpp vocab required for gammar-based
-	// constrained generation (json mode, structured outputs)
-	// TODO: this is temporary until Ollama sampling supports
-	// constrained generation
-	vocab *sample.Vocab
 }
 
 func (s *Server) allNil() bool {
```
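
The field removed above was the last piece of server state tied to a second vocabulary load. As a rough illustration of the before/after shape (toy types, not Ollama's actual `sample` or `model` packages), the server can ask the model it already holds for its vocabulary instead of caching a duplicate copy:

```go
package main

import "fmt"

// TextProcessor stands in for a loaded model's tokenizer/vocabulary.
type TextProcessor interface {
	Vocabulary() []string
}

type toyModel struct{ vocab []string }

func (m toyModel) Vocabulary() []string { return m.vocab }

// Before: the server carried a separate vocab field, populated by a
// second read of the model file and kept alongside the model itself.
// After: only the model is stored, and the vocabulary is fetched on
// demand from the object that is already in memory.
type server struct {
	model TextProcessor
}

func (s server) vocabulary() []string { return s.model.Vocabulary() }

func main() {
	s := server{model: toyModel{vocab: []string{"<s>", "hello", "world"}}}
	fmt.Println(len(s.vocabulary()), "tokens, no second load")
}
```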
```diff
@@ -606,14 +600,15 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	var grammar *sample.Grammar
+	var grammar *sample.GrammarSampler
 	var err error
 	if req.Grammar != "" {
-		grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
+		grammar, err = sample.NewGrammarSampler(s.model.(model.TextProcessor), req.Grammar)
 		if err != nil {
 			http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
 			return
 		}
+		defer grammar.Free()
 	}
 
 	sampler := sample.NewSampler(
```
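
The new call site also gains an explicit lifecycle: the sampler is built per request from the loaded model's `TextProcessor` and released with `defer grammar.Free()` when the handler returns, which matters if it wraps native llama.cpp state. A minimal, self-contained sketch of that construct/use/free pattern, with a stand-in `GrammarSampler` type rather than the real `sample` package:

```go
package main

import (
	"errors"
	"fmt"
)

type GrammarSampler struct{ grammar string }

// NewGrammarSampler mimics building a sampler from a vocabulary source
// and a grammar string; the real version can fail on a bad grammar.
func NewGrammarSampler(vocabSource any, grammar string) (*GrammarSampler, error) {
	if grammar == "" {
		return nil, errors.New("empty grammar")
	}
	return &GrammarSampler{grammar: grammar}, nil
}

// Free releases native state; a no-op in this toy version.
func (g *GrammarSampler) Free() {}

func handle(grammarText string) error {
	var grammar *GrammarSampler
	if grammarText != "" {
		var err error
		grammar, err = NewGrammarSampler(struct{}{}, grammarText)
		if err != nil {
			return fmt.Errorf("failed to load grammar: %w", err)
		}
		// Mirrors the defer added in this hunk: Free runs when the
		// handler returns, and only after construction succeeded.
		defer grammar.Free()
	}
	fmt.Println("sampling with grammar:", grammar != nil)
	return nil
}

func main() { _ = handle(`root ::= "yes" | "no"`) }
```

Placing the `defer` after the error check keeps the cleanup path simple: `Free` is never scheduled for a sampler that failed to construct.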
```diff
@@ -789,8 +784,6 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	s.vocab = sample.NewVocab(mpath)
-
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
```