Request and model concurrency

This change adds support for multiple concurrent requests, as well as
loading multiple models by spawning multiple runners. The defaults are
currently 1 concurrent request per model and only 1 loaded model at a
time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and
OLLAMA_MAX_LOADED_MODELS.
Daniel Hiltgen
2024-03-30 09:50:05 -07:00
parent ee448deaba
commit 34b9db5afc
30 changed files with 2572 additions and 1387 deletions
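
OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS are plain integer
environment variables read by the server process. A minimal sketch of
how such settings might be picked up, where the envInt helper, its
positive-integer check, and the defaults of 1 are illustrative rather
than the actual server code:

package main

import (
	"fmt"
	"os"
	"strconv"
)

// envInt reads an integer environment variable, falling back to def when
// the variable is unset or not a positive integer (hypothetical helper).
func envInt(key string, def int) int {
	if v := os.Getenv(key); v != "" {
		if n, err := strconv.Atoi(v); err == nil && n > 0 {
			return n
		}
	}
	return def
}

func main() {
	numParallel := envInt("OLLAMA_NUM_PARALLEL", 1)    // concurrent requests per model
	maxLoaded := envInt("OLLAMA_MAX_LOADED_MODELS", 1) // models kept loaded at once
	fmt.Println("parallel:", numParallel, "max loaded:", maxLoaded)
}

Because the variables are read by the server, they are set when launching
it, e.g. OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_LOADED_MODELS=2 ollama serve.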

@@ -80,7 +80,7 @@ func cleanupTmpDirs() {
 		}
 		err = os.RemoveAll(d)
 		if err != nil {
-			slog.Debug(fmt.Sprintf("unable to cleanup stale tmpdir %s: %s", d, err))
+			slog.Debug("unable to cleanup stale tmpdir", "path", d, "error", err)
 		}
 	}
 }
@@ -120,7 +120,7 @@ func UpdatePath(dir string) {
 			}
 		}
 		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-		slog.Info(fmt.Sprintf("Updating PATH to %s", newPath))
+		slog.Info("updating", "PATH", newPath)
 		os.Setenv("PATH", newPath)
 	}
 	// linux and darwin rely on rpath
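
Both hunks follow the log/slog convention of keeping the message constant
and passing the variable parts as alternating key/value arguments, rather
than formatting them into the string with fmt.Sprintf. A small standalone
illustration of that argument shape (the values here are made up):

package main

import "log/slog"

func main() {
	// Key/value pairs after the message become structured attributes,
	// which handlers can render as text or JSON.
	slog.Info("updating", "PATH", `C:\new\dir;C:\Windows\system32`)

	// Debug records are dropped by the default handler's Info level;
	// shown only for the call shape.
	slog.Debug("unable to cleanup stale tmpdir", "path", "/tmp/ollama12345", "error", "permission denied")
}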