mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 16:26:59 +00:00
Optimize container images for startup (#6547)
* Optimize container images for startup This change adjusts how to handle runner payloads to support container builds where we keep them extracted in the filesystem. This makes it easier to optimize the cpu/cuda vs cpu/rocm images for size, and should result in faster startup times for container images. * Refactor payload logic and add buildx support for faster builds * Move payloads around * Review comments * Converge to buildx based helper scripts * Use docker buildx action for release
This commit is contained in:
@@ -24,9 +24,11 @@ import (
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/build"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/runners"
|
||||
)
|
||||
|
||||
type LlamaServer interface {
|
||||
@@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
gpus = gpu.GetCPUInfo()
|
||||
}
|
||||
if len(gpus) == 1 && gpus[0].Library == "cpu" {
|
||||
cpuRunner = serverForCpu()
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
} else {
|
||||
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
@@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
opts.NumGPU = 0
|
||||
case gpus[0].Library != "metal" && estimate.Layers == 0:
|
||||
// Don't bother loading into the GPU if no layers can fit
|
||||
cpuRunner = serverForCpu()
|
||||
cpuRunner = runners.ServerForCpu()
|
||||
gpus = gpu.GetCPUInfo()
|
||||
case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
|
||||
opts.NumGPU = estimate.Layers
|
||||
@@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||
}
|
||||
|
||||
availableServers := getAvailableServers()
|
||||
rDir, err := runners.Refresh(build.EmbedFS)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
availableServers := runners.GetAvailableServers(rDir)
|
||||
if len(availableServers) == 0 {
|
||||
if runtime.GOOS != "windows" {
|
||||
slog.Warn("llama server binary disappeared, reinitializing payloads")
|
||||
err = Init()
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
availableServers = getAvailableServers()
|
||||
} else {
|
||||
return nil, finalErr
|
||||
}
|
||||
return nil, finalErr
|
||||
}
|
||||
var servers []string
|
||||
if cpuRunner != "" {
|
||||
servers = []string{cpuRunner}
|
||||
} else {
|
||||
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
}
|
||||
demandLib := envconfig.LLMLibrary()
|
||||
if demandLib != "" {
|
||||
@@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
_, err := os.Stat(server)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
||||
err = Init()
|
||||
_, err = runners.Refresh(build.EmbedFS)
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
|
||||
Reference in New Issue
Block a user