Mirror of https://github.com/dogkeeper886/ollama37.git
Merge branch 'main' into drifkin/array-head-count-simple
@@ -7,6 +7,7 @@ import (
 const (
     CREATE_DEFAULT_ERROR_MODE = 0x04000000
     ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
+    CREATE_NO_WINDOW = 0x08000000
 )

 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
@@ -18,5 +19,5 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{
     //
     // Setting Above Normal priority class ensures when running as a "background service"
     // with "programs" given best priority, we aren't starved of cpu cycles
-    CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS,
+    CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
 }
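Aside: the three constants combined above are standard Win32 process-creation flags; the newly added CREATE_NO_WINDOW keeps the spawned runner from flashing a console window. A minimal, self-contained sketch (not part of this diff; the helper name runQuiet is made up) of how the same SysProcAttr would be used when launching a child process on Windows:

    //go:build windows

    package main

    import (
        "log"
        "os/exec"
        "syscall"
    )

    const (
        CREATE_DEFAULT_ERROR_MODE   = 0x04000000
        ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000
        CREATE_NO_WINDOW            = 0x08000000
    )

    // runQuiet starts a child process that inherits the default error mode,
    // runs at above-normal priority, and never opens a console window.
    func runQuiet(name string, args ...string) error {
        cmd := exec.Command(name, args...)
        cmd.SysProcAttr = &syscall.SysProcAttr{
            CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS | CREATE_NO_WINDOW,
        }
        return cmd.Run()
    }

    func main() {
        if err := runQuiet("cmd.exe", "/c", "echo hello"); err != nil {
            log.Fatal(err)
        }
    }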
@@ -25,7 +25,7 @@ func TestEstimateGPULayers(t *testing.T) {
     defer f.Close()
     inputLayerCount := 5

-    tensors := []ggml.Tensor{
+    tensors := []*ggml.Tensor{
         {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
         {Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
         {Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
@@ -44,6 +44,7 @@ type LlamaServer interface {
     EstimatedVRAM() uint64 // Total VRAM across all GPUs
     EstimatedTotal() uint64
     EstimatedVRAMByGPU(gpuID string) uint64
+    Pid() int
 }

 // llmServer is an instance of the llama.cpp server
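Aside: Pid() is the one new method in this interface. Its implementation appears further down in this diff (it returns -1 when there is no running process), and the Close() hunk below uses it to tag the shutdown log lines with the runner's process id.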
@@ -216,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         params = append(params, "--no-mmap")
     }

-    if opts.UseMLock {
-        params = append(params, "--mlock")
-    }
-
     // TODO - NUMA support currently doesn't work properly

     params = append(params, "--parallel", strconv.Itoa(numParallel))
@@ -289,7 +286,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         params = append(params, "--mmproj", projectors[0])
     }

-    // iterate through compatible GPU libraries such as 'cuda_v12', 'cuda_v11', 'rocm', etc.
+    // iterate through compatible GPU libraries such as 'cuda_v12', 'rocm', etc.
     // adding each library's respective path to the LD_LIBRARY_PATH, until finally running
     // without any LD_LIBRARY_PATH flags
     for {
@@ -324,21 +321,23 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
         pathEnv = "LD_LIBRARY_PATH"
     }

-    var libraryPaths []string
+    // Note: we always put our dependency paths first
+    // since these are the exact version we compiled/linked against
+    libraryPaths := []string{discover.LibOllamaPath}
     if libraryPath, ok := os.LookupEnv(pathEnv); ok {
         libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
     }

+    ggmlPaths := []string{discover.LibOllamaPath}
     if len(compatible) > 0 {
         c := compatible[0]
         if libpath, ok := libs[c]; ok {
             slog.Debug("adding gpu library", "path", libpath)
-            libraryPaths = append(libraryPaths, libpath)
+            libraryPaths = append([]string{libpath}, libraryPaths...)
+            ggmlPaths = append(ggmlPaths, libpath)
         }
     }

-    // Note: we always put the dependency path first
-    // since this was the exact version we compiled/linked against
     if gpus[0].DependencyPath != nil {
         slog.Debug("adding gpu dependency paths", "paths", gpus[0].DependencyPath)
         // assume gpus from the same library have the same dependency path
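Aside: two things change in this hunk: libraryPaths now starts from discover.LibOllamaPath instead of empty, and the selected GPU library directory is prepended rather than appended, so the exact versions Ollama was compiled and linked against are resolved before anything already on the user's search path. A standalone illustration of that ordering difference (all paths below are made up, not from the repo):

    package main

    import (
        "fmt"
        "path/filepath"
        "strings"
    )

    func main() {
        // Start with the bundled library directory, then whatever the user already had.
        libraryPaths := []string{"/usr/local/lib/ollama"}
        libraryPaths = append(libraryPaths, filepath.SplitList("/opt/other/lib:/usr/lib")...)

        // Appending puts the GPU library last; prepending (the new behaviour) puts it
        // first, so its compiled-against versions shadow anything else on the path.
        gpuLib := "/usr/local/lib/ollama/cuda_v12"
        appended := append(append([]string{}, libraryPaths...), gpuLib)
        prepended := append([]string{gpuLib}, libraryPaths...)

        fmt.Println("appended: ", strings.Join(appended, string(filepath.ListSeparator)))
        fmt.Println("prepended:", strings.Join(prepended, string(filepath.ListSeparator)))
    }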
@@ -369,6 +368,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
     s.cmd.Stderr = s.status
     s.cmd.SysProcAttr = LlamaServerSysProcAttr

+    s.cmd.Env = append(s.cmd.Env, "OLLAMA_LIBRARY_PATH="+strings.Join(ggmlPaths, string(filepath.ListSeparator)))
+
     envWorkarounds := [][2]string{}
     for _, gpu := range gpus {
         envWorkarounds = append(envWorkarounds, gpu.EnvWorkarounds...)
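Aside: the new OLLAMA_LIBRARY_PATH entry hands the ggml search directories to the child process as a single variable, joined with the platform's list separator. A small round-trip sketch of that encoding (paths and variable names are illustrative only):

    package main

    import (
        "fmt"
        "path/filepath"
        "strings"
    )

    func main() {
        ggmlPaths := []string{"/usr/local/lib/ollama", "/usr/local/lib/ollama/cuda_v12"}

        // Encode: join with the OS-specific list separator (':' on Unix, ';' on Windows).
        value := strings.Join(ggmlPaths, string(filepath.ListSeparator))
        fmt.Println("OLLAMA_LIBRARY_PATH=" + value)

        // Decode: the receiving process can split the value back into a path list.
        fmt.Println(filepath.SplitList(value))
    }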
@@ -406,7 +407,8 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
     if envconfig.Debug() {
         filteredEnv := []string{}
         for _, ev := range s.cmd.Env {
-            if strings.HasPrefix(ev, "CUDA_") ||
+            if strings.HasPrefix(ev, "OLLAMA_") ||
+                strings.HasPrefix(ev, "CUDA_") ||
                 strings.HasPrefix(ev, "ROCR_") ||
                 strings.HasPrefix(ev, "ROCM_") ||
                 strings.HasPrefix(ev, "HIP_") ||
@@ -515,6 +517,9 @@ func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
         if errors.Is(err, context.DeadlineExceeded) {
             return ServerStatusNotResponding, errors.New("server not responding")
         }
+        if strings.Contains(err.Error(), "connection refused") {
+            return ServerStatusNotResponding, errors.New("connection refused")
+        }
         return ServerStatusError, fmt.Errorf("health resp: %w", err)
     }
     defer resp.Body.Close()
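Aside: the added branch maps a refused connection to the same "not responding" status as a timeout instead of a generic error. A reduced, standalone sketch of that classification pattern (the ServerStatus values mirror the names in the hunk, but the probe function and status ordering are assumptions, not the actual ollama code):

    package main

    import (
        "context"
        "errors"
        "fmt"
        "net/http"
        "strings"
        "time"
    )

    type ServerStatus int

    const (
        ServerStatusReady ServerStatus = iota
        ServerStatusNotResponding
        ServerStatusError
    )

    // probe classifies a health-check failure the same way the hunk above does:
    // timeouts and refused connections mean "not responding"; anything else is an error.
    func probe(ctx context.Context, url string) (ServerStatus, error) {
        req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
        if err != nil {
            return ServerStatusError, err
        }
        resp, err := http.DefaultClient.Do(req)
        if err != nil {
            if errors.Is(err, context.DeadlineExceeded) {
                return ServerStatusNotResponding, errors.New("server not responding")
            }
            if strings.Contains(err.Error(), "connection refused") {
                return ServerStatusNotResponding, errors.New("connection refused")
            }
            return ServerStatusError, fmt.Errorf("health resp: %w", err)
        }
        defer resp.Body.Close()
        return ServerStatusReady, nil
    }

    func main() {
        ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
        defer cancel()
        status, err := probe(ctx, "http://127.0.0.1:1") // port 1 is almost certainly closed
        fmt.Println(status, err)
    }

Matching on the error string is not elegant, but a refused TCP connect does not surface a portable sentinel error across platforms, which is presumably why the hunk takes the same approach.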
@@ -635,6 +640,13 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
     }
 }

+func (s *llmServer) Pid() int {
+    if s.cmd != nil && s.cmd.Process != nil {
+        return s.cmd.Process.Pid
+    }
+    return -1
+}
+
 var grammarJSON = `
 root ::= object
 value ::= object | array | string | number | ("true" | "false" | "null") ws
@@ -998,17 +1010,17 @@ func (s *llmServer) Close() error {
     s.llamaModelLock.Unlock()

     if s.cmd != nil {
-        slog.Debug("stopping llama server")
+        slog.Debug("stopping llama server", "pid", s.Pid())
         if err := s.cmd.Process.Kill(); err != nil {
             return err
         }
         // if ProcessState is already populated, Wait already completed, no need to wait again
         if s.cmd.ProcessState == nil {
-            slog.Debug("waiting for llama server to exit")
+            slog.Debug("waiting for llama server to exit", "pid", s.Pid())
             <-s.done
         }

-        slog.Debug("llama server stopped")
+        slog.Debug("llama server stopped", "pid", s.Pid())
     }

     return nil
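Aside: the <-s.done receive above only works because, elsewhere in the server code (not shown in this diff), a goroutine waits on the process and signals that channel. A generic, self-contained sketch of the kill-then-wait pattern it relies on; all names and the "sleep" command here are illustrative, not the actual ollama fields:

    package main

    import (
        "log/slog"
        "os/exec"
        "time"
    )

    type runner struct {
        cmd  *exec.Cmd
        done chan error
    }

    func start(name string, args ...string) (*runner, error) {
        r := &runner{cmd: exec.Command(name, args...), done: make(chan error, 1)}
        if err := r.cmd.Start(); err != nil {
            return nil, err
        }
        // Reaper goroutine: Wait must be called exactly once; its result is
        // delivered on done so Close can block until the process has exited.
        go func() { r.done <- r.cmd.Wait() }()
        return r, nil
    }

    func (r *runner) Close() error {
        if r.cmd == nil || r.cmd.Process == nil {
            return nil
        }
        slog.Debug("stopping child", "pid", r.cmd.Process.Pid)
        if err := r.cmd.Process.Kill(); err != nil {
            return err
        }
        // If Wait has not finished yet, block until the reaper goroutine reports exit,
        // mirroring the ProcessState check in the hunk above.
        if r.cmd.ProcessState == nil {
            <-r.done
        }
        return nil
    }

    func main() {
        r, err := start("sleep", "60") // Unix-only example command
        if err != nil {
            slog.Error("start failed", "error", err)
            return
        }
        time.Sleep(100 * time.Millisecond)
        _ = r.Close()
        slog.Info("child stopped", "pid", r.cmd.Process.Pid)
    }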
@@ -16,7 +16,7 @@ func TestLLMServerCompletionFormat(t *testing.T) {
     // of a mess, and but it's good enough, until we can refactoring the
     // Completion method to be more testable.

-    ctx, cancel := context.WithCancel(context.Background())
+    ctx, cancel := context.WithCancel(t.Context())
     s := &llmServer{
         sem: semaphore.NewWeighted(1), // required to prevent nil panic
     }
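Aside: t.Context() (added in Go 1.24) returns a context that is canceled automatically just before the test's cleanup functions run, so the test no longer needs to start from context.Background(); the explicit cancel is presumably kept so the test can still cancel earlier on its own. A minimal usage sketch, independent of this repo:

    package example

    import (
        "context"
        "testing"
        "time"
    )

    func TestWithTestContext(t *testing.T) {
        // Derive a cancellable context from the test's own context (Go 1.24+).
        // It is canceled automatically shortly before the test finishes.
        ctx, cancel := context.WithCancel(t.Context())
        defer cancel()

        select {
        case <-ctx.Done():
            t.Fatal("context canceled too early")
        case <-time.After(10 * time.Millisecond):
            // still running, as expected
        }
    }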