llama: remove model loading for grammar (#10096)

Author: Parth Sareen
Date: 2025-04-24 11:51:19 -07:00
Committed by: GitHub
Parent: 40b10eee6d
Commit: a53d744b01
13 changed files with 521 additions and 107 deletions

```diff
@@ -298,12 +298,6 @@ type Server struct {
 	// multimodalHash generates hashes for comparing equality
 	// of non-text data
 	multimodalHash maphash.Hash
-
-	// vocab is a llama.cpp vocab required for gammar-based
-	// constrained generation (json mode, structured outputs)
-	// TODO: this is temporary until Ollama sampling supports
-	// constrained generation
-	vocab *sample.Vocab
 }
 
 func (s *Server) allNil() bool {
```
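
The field removed above was the last piece of server state tied to a second vocabulary load. As a rough illustration of the before/after shape (toy types, not Ollama's actual `sample` or `model` packages), the server can ask the model it already holds for its vocabulary instead of caching a duplicate copy:

```go
package main

import "fmt"

// TextProcessor stands in for a loaded model's tokenizer/vocabulary.
type TextProcessor interface {
	Vocabulary() []string
}

type toyModel struct{ vocab []string }

func (m toyModel) Vocabulary() []string { return m.vocab }

// Before: the server carried a separate vocab field, populated by a
// second read of the model file and kept alongside the model itself.
// After: only the model is stored, and the vocabulary is fetched on
// demand from the object that is already in memory.
type server struct {
	model TextProcessor
}

func (s server) vocabulary() []string { return s.model.Vocabulary() }

func main() {
	s := server{model: toyModel{vocab: []string{"<s>", "hello", "world"}}}
	fmt.Println(len(s.vocabulary()), "tokens, no second load")
}
```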
```diff
@@ -606,14 +600,15 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	var grammar *sample.Grammar
+	var grammar *sample.GrammarSampler
 	var err error
 	if req.Grammar != "" {
-		grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
+		grammar, err = sample.NewGrammarSampler(s.model.(model.TextProcessor), req.Grammar)
 		if err != nil {
 			http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
 			return
 		}
+		defer grammar.Free()
 	}
 
 	sampler := sample.NewSampler(
```
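
The new call site also gains an explicit lifecycle: the sampler is built per request from the loaded model's `TextProcessor` and released with `defer grammar.Free()` when the handler returns, which matters if it wraps native llama.cpp state. A minimal, self-contained sketch of that construct/use/free pattern, with a stand-in `GrammarSampler` type rather than the real `sample` package:

```go
package main

import (
	"errors"
	"fmt"
)

type GrammarSampler struct{ grammar string }

// NewGrammarSampler mimics building a sampler from a vocabulary source
// and a grammar string; the real version can fail on a bad grammar.
func NewGrammarSampler(vocabSource any, grammar string) (*GrammarSampler, error) {
	if grammar == "" {
		return nil, errors.New("empty grammar")
	}
	return &GrammarSampler{grammar: grammar}, nil
}

// Free releases native state; a no-op in this toy version.
func (g *GrammarSampler) Free() {}

func handle(grammarText string) error {
	var grammar *GrammarSampler
	if grammarText != "" {
		var err error
		grammar, err = NewGrammarSampler(struct{}{}, grammarText)
		if err != nil {
			return fmt.Errorf("failed to load grammar: %w", err)
		}
		// Mirrors the defer added in this hunk: Free runs when the
		// handler returns, and only after construction succeeded.
		defer grammar.Free()
	}
	fmt.Println("sampling with grammar:", grammar != nil)
	return nil
}

func main() { _ = handle(`root ::= "yes" | "no"`) }
```

Placing the `defer` after the error check keeps the cleanup path simple: `Free` is never scheduled for a sampler that failed to construct.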
```diff
@@ -789,8 +784,6 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	s.vocab = sample.NewVocab(mpath)
-
 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
 		panic("loras are not yet implemented")
```