Merge branch 'main' into drifkin/array-head-count-simple

Devon Rifkin
2025-05-08 11:46:52 -07:00
committed by GitHub
156 changed files with 6327 additions and 3282 deletions

View File

@@ -7,6 +7,7 @@ import (
const (
CREATE_DEFAULT_ERROR_MODE = 0x04000000
ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
CREATE_NO_WINDOW = 0x08000000
)
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
@@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
//
// Setting the Above Normal priority class ensures that, when running as a "background service"
// with "programs" given best priority, we aren't starved of CPU cycles
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
}
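
For context, a minimal Windows-only sketch of how these creation flags combine on an exec.Cmd. The constants mirror the ones in this file; startHidden is a hypothetical helper, not code from this commit:

```go
//go:build windows

package example

import (
	"os/exec"
	"syscall"
)

const (
	CREATE_DEFAULT_ERROR_MODE   = 0x04000000
	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
	CREATE_NO_WINDOW            = 0x08000000
)

// startHidden launches a command with the same creation flags the diff sets:
// keep the default error mode, run at above-normal priority, and suppress the
// console window Windows would otherwise open for a console subprocess.
func startHidden(cmd *exec.Cmd) error {
	cmd.SysProcAttr = &syscall.SysProcAttr{
		CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
	}
	return cmd.Start()
}
```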

View File

@@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
inputLayerCount := 5
tensors := []ggml.Tensor{
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},

View File

@@ -44,6 +44,7 @@ type LlamaServer interface {
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
Pid() int
}
// llmServer is an instance of the llama.cpp server
@@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--no-mmap")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
// TODO - NUMA support currently doesn't work properly
params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
@@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
pathEnv = "LD_LIBRARY_PATH"
}
var libraryPaths []string
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
ggmlPaths := []string{discover.LibOllamaPath}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
}
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
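
The reordering above amounts to a precedence rule for the dynamic loader path. A rough, standalone sketch of that ordering, assuming only what this hunk shows (buildLibraryPaths is a hypothetical helper; the diff inlines this logic and also tracks a separate ggmlPaths list):

```go
package example

import (
	"os"
	"path/filepath"
)

// buildLibraryPaths returns the search path in the order the comments describe:
// Ollama's bundled dependency directory first (the exact versions it was
// compiled/linked against), then whatever the user already had in the path
// variable, with a matched GPU library directory pushed to the very front so
// its runtime takes precedence.
func buildLibraryPaths(pathEnv, libOllamaPath, gpuLibPath string) []string {
	paths := []string{libOllamaPath}
	if existing, ok := os.LookupEnv(pathEnv); ok {
		paths = append(paths, filepath.SplitList(existing)...)
	}
	if gpuLibPath != "" {
		paths = append([]string{gpuLibPath}, paths...)
	}
	return paths
}
```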
@@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
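
A small sketch of the new environment plumbing, assuming only what this hunk shows: the collected ggml paths are handed to the child process as OLLAMA_LIBRARY_PATH, joined with the platform's list separator (':' on Unix, ';' on Windows). exportLibraryPath is a hypothetical wrapper:

```go
package example

import (
	"os/exec"
	"path/filepath"
	"strings"
)

// exportLibraryPath passes the collected library directories to the subprocess
// via OLLAMA_LIBRARY_PATH, using the OS-specific list separator.
func exportLibraryPath(cmd *exec.Cmd, ggmlPaths []string) {
	cmd.Env = append(cmd.Env,
		"OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
}
```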
@@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
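
The debug block now also surfaces OLLAMA_* variables. A standalone sketch of that filter, using only the prefixes visible in this hunk (the real list may include more):

```go
package example

import "strings"

// filterEnv keeps only the environment variables worth logging at debug level:
// anything whose name starts with one of the runtime-related prefixes below.
func filterEnv(env []string) []string {
	prefixes := []string{"OLLAMA_", "CUDA_", "ROCR_", "ROCM_", "HIP_"}
	filtered := []string{}
	for _, ev := range env {
		for _, prefix := range prefixes {
			if strings.HasPrefix(ev, prefix) {
				filtered = append(filtered, ev)
				break
			}
		}
	}
	return filtered
}
```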
@@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, errors.New("server not responding")
}
if strings.Contains(err.Error(), "connection refused") {
return ServerStatusNotResponding, errors.New("connection refused")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
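
The added branch treats a refused connection the same way as a timeout: the server simply isn't up yet, rather than a hard failure. A self-contained sketch of that mapping (the status type here is a stand-in for the package's own ServerStatus values):

```go
package example

import (
	"context"
	"errors"
	"fmt"
	"strings"
)

type status int

const (
	statusNotResponding status = iota
	statusError
)

// classifyHealthErr mirrors the error handling above: deadline exceeded and
// connection refused both mean "not responding yet"; anything else is an error.
func classifyHealthErr(err error) (status, error) {
	if errors.Is(err, context.DeadlineExceeded) {
		return statusNotResponding, errors.New("server not responding")
	}
	if strings.Contains(err.Error(), "connection refused") {
		return statusNotResponding, errors.New("connection refused")
	}
	return statusError, fmt.Errorf("health resp: %w", err)
}
```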
@@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
}
func (s *llmServer) Pid() int {
if s.cmd != nil && s.cmd.Process != nil {
return s.cmd.Process.Pid
}
return -1
}
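
The new accessor guards against a nil cmd or a process that never started, returning -1 in that case. A hypothetical caller-side sketch of how Pid() might be used for logging (not code from this commit):

```go
package example

import (
	"log/slog"
	"os/exec"
)

// pidOf applies the same nil guards as the accessor above to any exec.Cmd:
// -1 means there is no OS process to report.
func pidOf(cmd *exec.Cmd) int {
	if cmd != nil && cmd.Process != nil {
		return cmd.Process.Pid
	}
	return -1
}

func logServerPid(cmd *exec.Cmd) {
	if pid := pidOf(cmd); pid != -1 {
		slog.Debug("llama server running", "pid", pid)
	}
}
```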
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -998,17 +1010,17 @@ func (s *llmServer) Close() error {
s.llamaModelLock.Unlock()
if s.cmd != nil {
slog.Debug("stopping llama server")
slog.Debug("stopping llama server", "pid", s.Pid())
if err := s.cmd.Process.Kill(); err != nil {
return err
}
// if ProcessState is already populated, Wait already completed, no need to wait again
if s.cmd.ProcessState == nil {
slog.Debug("waiting for llama server to exit")
slog.Debug("waiting for llama server to exit", "pid", s.Pid())
<-s.done
}
slog.Debug("llama server stopped")
slog.Debug("llama server stopped", "pid", s.Pid())
}
return nil
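
The Close() path now tags each log line with the pid. A rough sketch of the shutdown sequence shown above, assuming a done channel that is signaled by the goroutine that called cmd.Wait():

```go
package example

import (
	"log/slog"
	"os/exec"
)

// stop kills the subprocess and, if Wait hasn't already completed
// (ProcessState is still nil), blocks until the waiting goroutine signals done.
func stop(cmd *exec.Cmd, done <-chan error, pid int) error {
	if cmd == nil {
		return nil
	}
	slog.Debug("stopping llama server", "pid", pid)
	if err := cmd.Process.Kill(); err != nil {
		return err
	}
	if cmd.ProcessState == nil {
		slog.Debug("waiting for llama server to exit", "pid", pid)
		<-done
	}
	slog.Debug("llama server stopped", "pid", pid)
	return nil
}
```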

View File

@@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
// of a mess, but it's good enough until we can refactor the
// Completion method to be more testable.
ctx, cancel := context.WithCancel(context.Background())
ctx, cancel := context.WithCancel(t.Context())
s := &llmServer{
sem: semaphore.NewWeighted(1), // required to prevent nil panic
}
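
The test now derives its context from t.Context() (available since Go 1.24), which is canceled automatically when the test finishes, while still keeping an explicit cancel. A minimal sketch of the pattern:

```go
package example

import (
	"context"
	"testing"
)

// TestContextSketch shows the pattern the hunk switches to: derive the test's
// context from t.Context() so it is tied to the test's lifetime, while still
// allowing an early, explicit cancel.
func TestContextSketch(t *testing.T) {
	ctx, cancel := context.WithCancel(t.Context())
	defer cancel()
	_ = ctx // pass ctx to the code under test
}
```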