subprocess llama.cpp server (#401)

* remove c code
* pack llama.cpp
* use request context for llama_cpp
* let llama_cpp decide the number of threads to use
* stop llama runner when app stops
* remove sample count and duration metrics
* use go generate to get libraries
* tmp dir for running llm
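
As a rough illustration of the "use go generate to get libraries" item above (the actual directives and build commands in this commit may differ), a package-level Go file can carry go:generate directives that fetch and build the llama.cpp sources before `go build`:

```go
package llm

// Illustrative sketch only; the real commands in this commit may differ.
// Running `go generate ./...` executes these directives in order, fetching
// and building llama.cpp so the resulting runner can be packed alongside
// the Go code.

//go:generate git clone https://github.com/ggerganov/llama.cpp llama.cpp
//go:generate cmake -S llama.cpp -B llama.cpp/build
//go:generate cmake --build llama.cpp/build
```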
Author: Bruce MacDonald
Date: 2023-08-30 16:35:03 -04:00
Committed by: GitHub
Parent: f4432e1dba
Commit: 42998d797d
37 changed files with 958 additions and 43928 deletions


@@ -1,6 +1,7 @@
 package llm
 
 import (
+	"context"
 	"fmt"
 	"log"
 	"os"
@@ -11,12 +12,13 @@ import (
 )
 
 type LLM interface {
-	Predict([]int, string, func(api.GenerateResponse)) error
-	Embedding(string) ([]float64, error)
-	Encode(string) []int
-	Decode(...int) string
+	Predict(context.Context, []int, string, func(api.GenerateResponse)) error
+	Embedding(context.Context, string) ([]float64, error)
+	Encode(context.Context, string) ([]int, error)
+	Decode(context.Context, []int) (string, error)
 	SetOptions(api.Options)
 	Close()
+	Ping(context.Context) error
 }
 
 func New(model string, adapters []string, opts api.Options) (LLM, error) {
@@ -75,7 +77,7 @@ func New(model string, adapters []string, opts api.Options) (LLM, error) {
 	switch ggml.ModelFamily() {
 	case ModelFamilyLlama:
-		return newLlama(model, adapters, opts)
+		return newLlama(model, adapters, ggmlRunner(), opts)
 	default:
 		return nil, fmt.Errorf("unknown ggml type: %s", ggml.ModelFamily())
 	}
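
The diff above threads a context.Context through every method of the LLM interface. A minimal sketch of how a caller might use that (the function below and its names are illustrative, not part of this commit, and the []int parameter is assumed to carry prior context tokens): cancelling the request context lets the llama.cpp subprocess stop work when the client disconnects or the app shuts down.

```go
// Illustrative caller, not from this commit: the incoming request context is
// passed straight through to the runner, so cancelling it (client disconnect,
// app shutdown) can abort the in-flight prediction.
func generate(ctx context.Context, llm LLM, prev []int, prompt string, send func(api.GenerateResponse)) error {
	// Check that the subprocess runner is still responding before doing any work.
	if err := llm.Ping(ctx); err != nil {
		return fmt.Errorf("llama runner not responding: %w", err)
	}
	return llm.Predict(ctx, prev, prompt, send)
}
```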