Adjust mmap logic for CUDA on Windows for faster model load
On Windows, recent llama.cpp changes make mmap slower in most cases, so it now defaults to off. This commit also introduces a tri-state for use_mmap so we can distinguish an explicit user-provided true/false from an unspecified value.
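
For context on the tri-state mentioned above, here is a minimal sketch of what such an option type could look like in Go. Only api.TriStateFalse appears in the diff below; TriStateUndefined, TriStateTrue, and the underlying values are assumptions for illustration.

package api

// TriState lets the server tell an unset option apart from an explicit
// user-provided true or false. Only TriStateFalse is confirmed by the
// diff; the other names and values here are assumptions.
type TriState int

const (
	TriStateUndefined TriState = -1 // user did not specify a value
	TriStateFalse     TriState = 0  // user explicitly set false
	TriStateTrue      TriState = 1  // user explicitly set true
)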
@@ -200,7 +200,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-			opts.UseMMap = false
+			opts.UseMMap = api.TriStateFalse
 		}
 	}
 
@@ -208,7 +208,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--flash-attn")
 	}
 
-	if !opts.UseMMap {
+	// Windows CUDA should not use mmap for best performance
+	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
 
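Taken together, the two hunks amount to the following decision, pulled out into a standalone sketch. The helper name and parameter list are hypothetical; the condition itself mirrors the second hunk.

// mmapParams is a hypothetical helper illustrating the post-change
// logic: Windows CUDA defaults to no mmap, and an explicit user
// "false" disables mmap on every platform.
func mmapParams(goos, gpuLibrary string, useMMap TriState) []string {
	if (goos == "windows" && gpuLibrary == "cuda") || useMMap == TriStateFalse {
		return []string{"--no-mmap"}
	}
	return nil
}

Note that, as written in this hunk, the Windows CUDA branch fires regardless of the user's setting; the tri-state only changes behavior for an explicit false on other platforms, while an unspecified value leaves mmap enabled there.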