Merge branch 'main' into drifkin/array-head-count-simple

Devon Rifkin
2025-05-08 11:46:52 -07:00
committed by GitHub
156 changed files with 6327 additions and 3282 deletions

View File

@@ -7,6 +7,7 @@ import (
const (
CREATE_DEFAULT_ERROR_MODE = 0x04000000
ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
CREATE_NO_WINDOW = 0x08000000
)
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
@@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
//
// Setting the Above Normal priority class ensures that, when running as a "background service"
// with "programs" given best priority, we aren't starved of CPU cycles
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
}
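
For context, a minimal Windows-only sketch of how these creation flags combine on an exec.Cmd. The constants mirror the ones in this file; startHidden is a hypothetical helper, not code from this commit:

```go
//go:build windows

package example

import (
	"os/exec"
	"syscall"
)

const (
	CREATE_DEFAULT_ERROR_MODE   = 0x04000000
	ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
	CREATE_NO_WINDOW            = 0x08000000
)

// startHidden launches a command with the same creation flags the diff sets:
// keep the default error mode, run at above-normal priority, and suppress the
// console window Windows would otherwise open for a console subprocess.
func startHidden(cmd *exec.Cmd) error {
	cmd.SysProcAttr = &syscall.SysProcAttr{
		CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
	}
	return cmd.Start()
}
```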

View File

@@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
defer f.Close()
inputLayerCount := 5
tensors := []ggml.Tensor{
tensors := []*ggml.Tensor{
{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},

View File

@@ -44,6 +44,7 @@ type LlamaServer interface {
EstimatedVRAM() uint64 // Total VRAM across all GPUs
EstimatedTotal() uint64
EstimatedVRAMByGPU(gpuID string) uint64
Pid() int
}
// llmServer is an instance of the llama.cpp server
@@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--no-mmap")
}
if opts.UseMLock {
params = append(params, "--mlock")
}
// TODO - NUMA support currently doesn't work properly
params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
params = append(params, "--mmproj", projectors[0])
}
// iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
// iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
// adding each library's respective path to the LD_LIBRARY_PATH, until finally running
// without any LD_LIBRARY_PATH flags
for {
@@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
pathEnv = "LD_LIBRARY_PATH"
}
var libraryPaths []string
// Note: we always put our dependency paths first
// since these are the exact version we compiled/linked against
libraryPaths := []string{discover.LibOllamaPath}
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
}
ggmlPaths := []string{discover.LibOllamaPath}
if len(compatible) > 0 {
c := compatible[0]
if libpath, ok := libs[c]; ok {
slog.Debug("adding gpu library", "path", libpath)
libraryPaths = append(libraryPaths, libpath)
libraryPaths = append([]string{libpath}, libraryPaths...)
ggmlPaths = append(ggmlPaths, libpath)
}
}
// Note: we always put the dependency path first
// since this was the exact version we compiled/linked against
if gpus[0].DependencyPath != nil {
slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
// assume gpus from the same library have the same dependency path
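
The reordering above amounts to a precedence rule for the dynamic loader path. A rough, standalone sketch of that ordering, assuming only what this hunk shows (buildLibraryPaths is a hypothetical helper; the diff inlines this logic and also tracks a separate ggmlPaths list):

```go
package example

import (
	"os"
	"path/filepath"
)

// buildLibraryPaths returns the search path in the order the comments describe:
// Ollama's bundled dependency directory first (the exact versions it was
// compiled/linked against), then whatever the user already had in the path
// variable, with a matched GPU library directory pushed to the very front so
// its runtime takes precedence.
func buildLibraryPaths(pathEnv, libOllamaPath, gpuLibPath string) []string {
	paths := []string{libOllamaPath}
	if existing, ok := os.LookupEnv(pathEnv); ok {
		paths = append(paths, filepath.SplitList(existing)...)
	}
	if gpuLibPath != "" {
		paths = append([]string{gpuLibPath}, paths...)
	}
	return paths
}
```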
@@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
envWorkarounds := [][2]string{}
for _, gpu := range gpus {
envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
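
A small sketch of the new environment plumbing, assuming only what this hunk shows: the collected ggml paths are handed to the child process as OLLAMA_LIBRARY_PATH, joined with the platform's list separator (':' on Unix, ';' on Windows). exportLibraryPath is a hypothetical wrapper:

```go
package example

import (
	"os/exec"
	"path/filepath"
	"strings"
)

// exportLibraryPath passes the collected library directories to the subprocess
// via OLLAMA_LIBRARY_PATH, using the OS-specific list separator.
func exportLibraryPath(cmd *exec.Cmd, ggmlPaths []string) {
	cmd.Env = append(cmd.Env,
		"OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
}
```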
@@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
if envconfig.Debug() {
filteredEnv := []string{}
for _, ev := range s.cmd.Env {
if strings.HasPrefix(ev, "CUDA_") ||
if strings.HasPrefix(ev, "OLLAMA_") ||
strings.HasPrefix(ev, "CUDA_") ||
strings.HasPrefix(ev, "ROCR_") ||
strings.HasPrefix(ev, "ROCM_") ||
strings.HasPrefix(ev, "HIP_") ||
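
The debug block now also surfaces OLLAMA_* variables. A standalone sketch of that filter, using only the prefixes visible in this hunk (the real list may include more):

```go
package example

import "strings"

// filterEnv keeps only the environment variables worth logging at debug level:
// anything whose name starts with one of the runtime-related prefixes below.
func filterEnv(env []string) []string {
	prefixes := []string{"OLLAMA_", "CUDA_", "ROCR_", "ROCM_", "HIP_"}
	filtered := []string{}
	for _, ev := range env {
		for _, prefix := range prefixes {
			if strings.HasPrefix(ev, prefix) {
				filtered = append(filtered, ev)
				break
			}
		}
	}
	return filtered
}
```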
@@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
if errors.Is(err, context.DeadlineExceeded) {
return ServerStatusNotResponding, errors.New("server not responding")
}
if strings.Contains(err.Error(), "connection refused") {
return ServerStatusNotResponding, errors.New("connection refused")
}
return ServerStatusError, fmt.Errorf("health resp: %w", err)
}
defer resp.Body.Close()
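
The added branch treats a refused connection the same way as a timeout: the server simply isn't up yet, rather than a hard failure. A self-contained sketch of that mapping (the status type here is a stand-in for the package's own ServerStatus values):

```go
package example

import (
	"context"
	"errors"
	"fmt"
	"strings"
)

type status int

const (
	statusNotResponding status = iota
	statusError
)

// classifyHealthErr mirrors the error handling above: deadline exceeded and
// connection refused both mean "not responding yet"; anything else is an error.
func classifyHealthErr(err error) (status, error) {
	if errors.Is(err, context.DeadlineExceeded) {
		return statusNotResponding, errors.New("server not responding")
	}
	if strings.Contains(err.Error(), "connection refused") {
		return statusNotResponding, errors.New("connection refused")
	}
	return statusError, fmt.Errorf("health resp: %w", err)
}
```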
@@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
}
}
func (s *llmServer) Pid() int {
if s.cmd != nil && s.cmd.Process != nil {
return s.cmd.Process.Pid
}
return -1
}
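
The new accessor guards against a nil cmd or a process that never started, returning -1 in that case. A hypothetical caller-side sketch of how Pid() might be used for logging (not code from this commit):

```go
package example

import (
	"log/slog"
	"os/exec"
)

// pidOf applies the same nil guards as the accessor above to any exec.Cmd:
// -1 means there is no OS process to report.
func pidOf(cmd *exec.Cmd) int {
	if cmd != nil && cmd.Process != nil {
		return cmd.Process.Pid
	}
	return -1
}

func logServerPid(cmd *exec.Cmd) {
	if pid := pidOf(cmd); pid != -1 {
		slog.Debug("llama server running", "pid", pid)
	}
}
```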
var grammarJSON = `
root ::= object
value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -998,17 +1010,17 @@ func (s *llmServer) Close() error {
s.llamaModelLock.Unlock()
if s.cmd != nil {
slog.Debug("stopping llama server")
slog.Debug("stopping llama server", "pid", s.Pid())
if err := s.cmd.Process.Kill(); err != nil {
return err
}
// if ProcessState is already populated, Wait already completed, no need to wait again
if s.cmd.ProcessState == nil {
slog.Debug("waiting for llama server to exit")
slog.Debug("waiting for llama server to exit", "pid", s.Pid())
<-s.done
}
slog.Debug("llama server stopped")
slog.Debug("llama server stopped", "pid", s.Pid())
}
return nil
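
The Close() path now tags each log line with the pid. A rough sketch of the shutdown sequence shown above, assuming a done channel that is signaled by the goroutine that called cmd.Wait():

```go
package example

import (
	"log/slog"
	"os/exec"
)

// stop kills the subprocess and, if Wait hasn't already completed
// (ProcessState is still nil), blocks until the waiting goroutine signals done.
func stop(cmd *exec.Cmd, done <-chan error, pid int) error {
	if cmd == nil {
		return nil
	}
	slog.Debug("stopping llama server", "pid", pid)
	if err := cmd.Process.Kill(); err != nil {
		return err
	}
	if cmd.ProcessState == nil {
		slog.Debug("waiting for llama server to exit", "pid", pid)
		<-done
	}
	slog.Debug("llama server stopped", "pid", pid)
	return nil
}
```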

View File

@@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
// of a mess, but it's good enough until we can refactor the
// Completion method to be more testable.
ctx, cancel := context.WithCancel(context.Background())
ctx, cancel := context.WithCancel(t.Context())
s := &llmServer{
sem: semaphore.NewWeighted(1), // required to prevent nil panic
}
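
The test now derives its context from t.Context() (available since Go 1.24), which is canceled automatically when the test finishes, while still keeping an explicit cancel. A minimal sketch of the pattern:

```go
package example

import (
	"context"
	"testing"
)

// TestContextSketch shows the pattern the hunk switches to: derive the test's
// context from t.Context() so it is tied to the test's lifetime, while still
// allowing an early, explicit cancel.
func TestContextSketch(t *testing.T) {
	ctx, cancel := context.WithCancel(t.Context())
	defer cancel()
	_ = ctx // pass ctx to the code under test
}
```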