Add cgo implementation for llama.cpp
Run llama.cpp's server.cpp directly inside the Go runtime via cgo while retaining the LLM Go abstractions.
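
As a minimal illustration of the mechanism described above (the C function below is a hypothetical stand-in, not the actual server.cpp binding added by this commit), a Go program can call C code in-process via cgo like this:

package main

/*
// Hypothetical stand-in for a llama.cpp/server.cpp entry point; the real
// commit compiles the C++ sources into the binary and calls them the same way.
static const char* llama_server_status(void) { return "ok"; }
*/
import "C"

import "fmt"

func main() {
	// The C code runs inside the Go process: no child process and no HTTP
	// hop between the Go server and the llama.cpp runner.
	fmt.Println(C.GoString(C.llama_server_status()))
}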
server/llm_test.go (new file, 103 lines)
@@ -0,0 +1,103 @@
package server

import (
	"context"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"

	"github.com/jmorganca/ollama/api"
)

// TODO - this would ideally be in the llm package, but that would require some refactoring of interfaces in the server
// package to avoid circular dependencies

// WARNING - these tests will fail on mac if you don't manually copy ggml-metal.metal to this dir (./server)
//
// TODO - Fix this ^^

var (
	req = [2]api.GenerateRequest{
		{
			Model:   "orca-mini",
			Prompt:  "tell me a short story about agi?",
			Options: map[string]interface{}{},
		}, {
			Model:   "orca-mini",
			Prompt:  "what is the origin of the us thanksgiving holiday?",
			Options: map[string]interface{}{},
		},
	}
	resp = [2]string{
		"once upon a time",
		"fourth thursday",
	}
)

func TestIntegrationSimpleOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	opts := api.DefaultOptions()
	opts.Seed = 42
	opts.Temperature = 0.0
	model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
	defer llmRunner.Close()
	response := OneShotPromptResponse(t, ctx, req[0], model, llmRunner)
	assert.Contains(t, strings.ToLower(response), resp[0])
}

// TODO
// The server always loads a new runner and closes the old one, which forces serial execution.
// At present this test case fails with concurrency problems. Eventually we should try to
// get true concurrency working with n_parallel support in the backend.
func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)
	t.Skip("concurrent prediction on single runner not currently supported")
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	opts := api.DefaultOptions()
	opts.Seed = 42
	opts.Temperature = 0.0
	var wg sync.WaitGroup
	wg.Add(len(req))
	model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
	defer llmRunner.Close()
	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
		}(i)
	}
	wg.Wait()
}

func TestIntegrationConcurrentRunnersOrcaMini(t *testing.T) {
	SkipIFNoTestData(t)
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*60)
	defer cancel()
	opts := api.DefaultOptions()
	opts.Seed = 42
	opts.Temperature = 0.0
	var wg sync.WaitGroup
	wg.Add(len(req))

	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
			model, llmRunner := PrepareModelForPrompts(t, req[0].Model, opts)
			defer llmRunner.Close()
			response := OneShotPromptResponse(t, ctx, req[i], model, llmRunner)
			t.Logf("Prompt: %s\nResponse: %s", req[i].Prompt, response)
			assert.Contains(t, strings.ToLower(response), resp[i], "error in thread %d (%s)", i, req[i].Prompt)
		}(i)
	}
	wg.Wait()
}

// TODO - create a parallel test with 2 different models once we support concurrency
server/llm_utils_test.go (new file, 76 lines)
@@ -0,0 +1,76 @@
package server

import (
	"context"
	"errors"
	"os"
	"path"
	"runtime"
	"testing"
	"time"

	"github.com/jmorganca/ollama/api"
	"github.com/jmorganca/ollama/llm"
	"github.com/stretchr/testify/require"
)

func SkipIFNoTestData(t *testing.T) {
	modelDir := getModelDir()
	if _, err := os.Stat(modelDir); errors.Is(err, os.ErrNotExist) {
		t.Skipf("%s does not exist - skipping integration tests", modelDir)
	}
}

func getModelDir() string {
	_, filename, _, _ := runtime.Caller(0)
	return path.Dir(path.Dir(filename) + "/../test_data/models/.")
}

func PrepareModelForPrompts(t *testing.T, modelName string, opts api.Options) (*Model, llm.LLM) {
	modelDir := getModelDir()
	os.Setenv("OLLAMA_MODELS", modelDir)
	model, err := GetModel(modelName)
	require.NoError(t, err, "GetModel ")
	err = opts.FromMap(model.Options)
	require.NoError(t, err, "opts from model ")
	runner, err := llm.New("unused", model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
	require.NoError(t, err, "llm.New failed")
	return model, runner
}

func OneShotPromptResponse(t *testing.T, ctx context.Context, req api.GenerateRequest, model *Model, runner llm.LLM) string {
	checkpointStart := time.Now()
	prompt, err := model.Prompt(PromptVars{
		System: req.System,
		Prompt: req.Prompt,
		First:  len(req.Context) == 0,
	})
	require.NoError(t, err, "prompt generation failed")
	success := make(chan bool, 1)
	response := ""
	cb := func(r llm.PredictResult) {
		if !r.Done {
			response += r.Content
		} else {
			success <- true
		}
	}
	checkpointLoaded := time.Now()
	predictReq := llm.PredictOpts{
		Prompt:           prompt,
		Format:           req.Format,
		CheckpointStart:  checkpointStart,
		CheckpointLoaded: checkpointLoaded,
	}
	err = runner.Predict(ctx, predictReq, cb)
	require.NoError(t, err, "predict call failed")

	select {
	case <-ctx.Done():
		t.Errorf("failed to complete before timeout: \n%s", response)
		return ""
	case <-success:
		return response
	}
}
@@ -126,10 +126,6 @@ func load(c *gin.Context, modelName string, reqOpts map[string]interface{}, sess
		loaded.Options = &opts
	}

	// update options for the loaded llm
	// TODO(mxyng): this isn't thread safe, but it should be fine for now
	loaded.runner.SetOptions(opts)

	loaded.expireAt = time.Now().Add(sessionDuration)

	if loaded.expireTimer == nil {
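
The context lines in this hunk come from the runner idle-expiry logic in load(): the session deadline (expireAt) is pushed out on each use and expireTimer is armed when absent, so an unused runner is eventually torn down. A rough standalone sketch of that pattern, with simplified stand-in types rather than the server's actual loaded struct:

package main

import (
	"fmt"
	"sync"
	"time"
)

type loadedRunner struct {
	mu          sync.Mutex
	expireAt    time.Time
	expireTimer *time.Timer
}

// touch marks the runner as recently used and (re)arms the expiry timer.
func (l *loadedRunner) touch(sessionDuration time.Duration) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.expireAt = time.Now().Add(sessionDuration)
	if l.expireTimer == nil {
		l.expireTimer = time.AfterFunc(sessionDuration, func() {
			fmt.Println("session expired: close the runner here")
		})
		return
	}
	// Later uses just push the deadline out.
	l.expireTimer.Reset(sessionDuration)
}

func main() {
	r := &loadedRunner{}
	r.touch(50 * time.Millisecond)
	time.Sleep(100 * time.Millisecond) // let the timer fire for the demo
}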