llm: make load time stall duration configurable via OLLAMA_LOAD_TIMEOUT

With the new very large parameter models, some users are willing to wait a
very long time for a model to load.
commit 6719097649 (parent b05c9e83d9)
Author: Daniel Hiltgen (committed by GitHub)
Date: 2024-09-05 14:00:08 -07:00
4 changed files with 60 additions and 7 deletions

@@ -112,6 +112,26 @@ func KeepAlive() (keepAlive time.Duration) {
 	return keepAlive
 }
 
+// LoadTimeout returns the duration for stall detection during model loads.
+// LoadTimeout can be configured via the OLLAMA_LOAD_TIMEOUT environment variable.
+// Zero or Negative values are treated as infinite.
+// Default is 5 minutes.
+func LoadTimeout() (loadTimeout time.Duration) {
+	loadTimeout = 5 * time.Minute
+	if s := Var("OLLAMA_LOAD_TIMEOUT"); s != "" {
+		if d, err := time.ParseDuration(s); err == nil {
+			loadTimeout = d
+		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
+			loadTimeout = time.Duration(n) * time.Second
+		}
+	}
+
+	if loadTimeout <= 0 {
+		return time.Duration(math.MaxInt64)
+	}
+
+	return loadTimeout
+}
+
 func Bool(k string) func() bool {
 	return func() bool {
 		if s := Var(k); s != "" {
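To make the accepted formats concrete, here is a minimal standalone sketch that mirrors the parsing rules above (the helper name parseLoadTimeout and the sample values are illustrative; the logic is duplicated here rather than imported from the envconfig package):

package main

import (
	"fmt"
	"math"
	"strconv"
	"time"
)

// parseLoadTimeout mirrors LoadTimeout above: accept a Go duration string
// ("10m", "1h30m") or a bare integer interpreted as seconds; zero or
// negative values mean "wait forever"; anything unparsable keeps the
// five-minute default.
func parseLoadTimeout(s string) time.Duration {
	timeout := 5 * time.Minute
	if s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			timeout = d
		} else if n, err := strconv.ParseInt(s, 10, 64); err == nil {
			timeout = time.Duration(n) * time.Second
		}
	}
	if timeout <= 0 {
		return time.Duration(math.MaxInt64) // effectively infinite
	}
	return timeout
}

func main() {
	for _, v := range []string{"", "10m", "600", "0", "-1s", "bogus"} {
		fmt.Printf("OLLAMA_LOAD_TIMEOUT=%q -> %v\n", v, parseLoadTimeout(v))
	}
}

Note that "0" parses as a valid zero duration, so both "0" and negative values select the infinite timeout, while an unparsable string such as "bogus" silently falls back to the default.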
@@ -245,10 +265,8 @@ func Uint64(key string, defaultValue uint64) func() uint64 {
 	}
 }
 
-var (
-	// Set aside VRAM per GPU
-	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
-)
+// Set aside VRAM per GPU
+var GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
 
 type EnvVar struct {
 	Name string
@@ -264,6 +282,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
"OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"},
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"},