sample: temporarily use grammars for constrained generation in new engine (#9586)

This commit is contained in:
Jeffrey Morgan
2025-03-10 16:17:39 +01:00
committed by GitHub
parent a1cda80bcb
commit e093db92c4
10 changed files with 301 additions and 213 deletions

View File

@@ -254,6 +254,12 @@ type Server struct {
// multimodalHash generates hashes for comparing equality
// of non-text data
multimodalHash maphash.Hash
// vocab is a llama.cpp vocab required for grammar-based
// constrained generation (json mode, structured outputs)
// TODO: this is temporary until Ollama sampling supports
// constrained generation
vocab *sample.Vocab
}
func (s *Server) allNil() bool {
@@ -574,18 +580,25 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
return
}
var grammar *sample.Grammar
var err error
if req.Grammar != "" {
grammar, err = sample.NewGrammar(s.vocab, req.Grammar)
if err != nil {
http.Error(w, "failed to load model vocabulary required for format", http.StatusInternalServerError)
return
}
}
sampler := sample.NewSampler(
req.Temperature,
req.TopK,
req.TopP,
req.MinP,
req.Seed,
grammar,
)
if req.Grammar != "" {
panic("grammars are not yet supported")
}
seq, err := s.NewSequence(req.Prompt, req.Images, NewSequenceParams{
numPredict: req.NumPredict,
stop: req.Stop,
@@ -797,6 +810,8 @@ func (s *Server) loadModel(
panic(err)
}
s.vocab = sample.NewVocab(mpath)
// TODO(jessegross): LoRA loading
if lpath.String() != "" {
panic("loras are not yet implemented")