deprecate ggml

- remove the ggml runner
- automatically pull the gguf version of a model when a ggml model is detected (see the sketch below)
- tell users to update to gguf when the automatic pull fails
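The detect-and-fallback flow described above is not part of the hunks shown below; a minimal sketch of the idea in Go follows. The helper names (detectFormat, ensureGGUF) and the pull callback are hypothetical, not taken from this commit; only the magic-number values follow the llama.cpp file headers.

    package model

    import (
        "encoding/binary"
        "fmt"
        "os"
    )

    // detectFormat reads the four-byte magic at the start of a model blob.
    // GGUF files begin with the bytes "GGUF" (0x46554747 when read as a
    // little-endian uint32); unversioned ggml files carry the legacy
    // llama.cpp magic 0x67676d6c.
    func detectFormat(path string) (string, error) {
        f, err := os.Open(path)
        if err != nil {
            return "", err
        }
        defer f.Close()

        var magic uint32
        if err := binary.Read(f, binary.LittleEndian, &magic); err != nil {
            return "", err
        }
        switch magic {
        case 0x46554747:
            return "gguf", nil
        case 0x67676d6c:
            return "ggml", nil
        default:
            return "", fmt.Errorf("unknown model format %#x", magic)
        }
    }

    // ensureGGUF re-pulls a model whose blob is still in the deprecated
    // ggml format, and tells the user to update the model when the
    // automatic pull fails.
    func ensureGGUF(name, blobPath string, pull func(string) error) error {
        format, err := detectFormat(blobPath)
        if err != nil {
            return err
        }
        if format == "gguf" {
            return nil // already up to date
        }
        if err := pull(name); err != nil {
            return fmt.Errorf("%s uses the deprecated ggml format and could not be re-pulled automatically: %w; please update the model to gguf", name, err)
        }
        return nil
    }

In the real server the pull callback would be the existing registry pull path; the sketch only shows where the format check happens and where the deprecation message is surfaced.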

Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>
commit 811b1f03c8
parent ed195f3562
Author: Bruce MacDonald
Date:   2023-11-24 13:58:09 -05:00
Committed by: Daniel Hiltgen

19 changed files with 74 additions and 393 deletions


@@ -59,13 +59,12 @@ ws ::= ([ \t\n] ws)?
 var llamaCppEmbed embed.FS
 
 type ModelRunner struct {
-	Type        string // "gguf" or "ggml"
 	Path        string // path to the model runner executable
 	Accelerated bool
 }
 
-func chooseRunners(workDir, runnerType string) []ModelRunner {
-	buildPath := path.Join("llama.cpp", runnerType, "build")
+func chooseRunners(workDir string) []ModelRunner {
+	buildPath := path.Join("llama.cpp", "gguf", "build")
 	var runners []ModelRunner
 
 	// set the runners based on the OS
@@ -73,25 +72,25 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	switch runtime.GOOS {
 	case "darwin":
 		if runtime.GOARCH == "arm64" {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "metal", "bin", "ollama-runner")}}
 		} else {
-			runners = []ModelRunner{{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
+			runners = []ModelRunner{{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")}}
 		}
 	case "linux":
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cuda", "bin", "ollama-runner"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
		}
 	case "windows":
 		// TODO: select windows GPU runner here when available
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
+			{Path: path.Join(buildPath, "cuda", "bin", "Release", "ollama-runner.exe"), Accelerated: true},
+			{Path: path.Join(buildPath, "cpu", "bin", "Release", "ollama-runner.exe")},
 		}
 	default:
 		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
 		runners = []ModelRunner{
-			{Type: runnerType, Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
+			{Path: path.Join(buildPath, "cpu", "bin", "ollama-runner")},
 		}
 	}
@@ -141,7 +140,7 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 		}
 	}
 	if !runnerAvailable {
-		log.Fatalf("%s runner not found", runnerType)
+		log.Fatalf("gguf runner not found")
 	}
 
 	// return the runners to try in priority order
@@ -149,7 +148,6 @@ func chooseRunners(workDir, runnerType string) []ModelRunner {
 	for _, r := range runners {
 		// clean the ModelRunner paths so that they match the OS we are running on
 		localRunnersByPriority = append(localRunnersByPriority, ModelRunner{
-			Type:        r.Type,
 			Path:        filepath.Clean(path.Join(workDir, r.Path)),
 			Accelerated: r.Accelerated,
 		})
@@ -350,6 +348,7 @@ func newLlama(model string, adapters, projectors []string, runners []ModelRunner
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--n-gpu-layers", fmt.Sprintf("%d", numGPU),
 		"--embedding",
+		"--parallel", "2",
 	}
 
 	if opts.MainGPU > 0 {