Enable concurrency by default

This adjusts our default settings to enable multiple models and parallel
requests to a single model.  Users can still override these by the same
env var settings as before.  Parallel has a direct impact on
num_ctx, which in turn can have a significant impact on small VRAM GPUs
so this change also refines the algorithm so that when parallel is not
explicitly set by the user, we try to find a reasonable default that fits
the model on their GPU(s).  As before, multiple models will only load
concurrently if they fully fit in VRAM.
This commit is contained in:
Daniel Hiltgen
2024-05-06 17:47:52 -07:00
parent c7c2f3bc22
commit 17b7186cd7
4 changed files with 135 additions and 72 deletions

View File

@@ -77,7 +77,7 @@ func LoadModel(model string) (*GGML, error) {
// NewLlamaServer will run a server for the given GPUs
// The gpu list must be a single family.
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
var err error
var cpuRunner string
var estimate MemoryEstimate
@@ -213,8 +213,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
// Windows CUDA should not use mmap for best performance
// Linux with a model larger than free space, mmap leads to thrashing
// For CPU loads we want the memory to be allocated, not FS cache
if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
opts.UseMMap == api.TriStateFalse {
params = append(params, "--no-mmap")
}
@@ -227,15 +229,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(projectors) > 0 {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
if estimate.TensorSplit != "" {