Centralize server config handling

This moves all the env var reading into one central module and logs the loaded config once at startup which should help in troubleshooting user server logs
2025-12-10 15:57:04 +00:00 · 2024-05-04 11:46:01 -07:00
parent 6707768ebd
commit f56aa20014
12 changed files with 235 additions and 162 deletions
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -3,12 +3,11 @@ package llm
 import (
 	"fmt"
 	"log/slog"
-	"os"
-	"strconv"

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )

 // This algorithm looks for a complete fit to determine if we need to unload other models
@@ -50,15 +49,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
 	}
-	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
-	if userLimit != "" {
-		avail, err := strconv.ParseUint(userLimit, 10, 64)
-		if err != nil {
-			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
-		} else {
-			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
-			memoryAvailable = avail
-		}
+	if envconfig.MaxVRAM > 0 {
+		memoryAvailable = envconfig.MaxVRAM
 	}

 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
--- a/llm/server.go
+++ b/llm/server.go
@@ -26,6 +26,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/server/envconfig"
 )

 type LlamaServer interface {
@@ -124,7 +125,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	} else {
 		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
-	demandLib := strings.Trim(os.Getenv("OLLAMA_LLM_LIBRARY"), "\"' ")
+	demandLib := envconfig.LLMLibrary
 	if demandLib != "" {
 		serverPath := availableServers[demandLib]
 		if serverPath == "" {
@@ -145,7 +146,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--log-format", "json")
 	} else {
 		params = append(params, "--log-disable")
@@ -155,7 +156,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
 	}

-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+	if envconfig.Debug {
 		params = append(params, "--verbose")
 	}

@@ -194,15 +195,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	}

 	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
-	numParallel := 1
-	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
-		numParallel, err = strconv.Atoi(onp)
-		if err != nil || numParallel <= 0 {
-			err = fmt.Errorf("invalid OLLAMA_NUM_PARALLEL=%s must be greater than zero - %w", onp, err)
-			slog.Error("misconfiguration", "error", err)
-			return nil, err
-		}
-	}
+	numParallel := envconfig.NumParallel
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

 	for i := 0; i < len(servers); i++ {