mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Reduce default parallelism to 1 (#11330)
The current scheduler algorithm of picking the parallelism based on available VRAM complicates the upcoming dynamic layer memory allocation algorithm. This changes the default to 1, with the intent going forward that parallelism is explicit and will no longer be dynamically determined. Removal of the dynamic logic will come in a follow-up.
This commit is contained in:
@@ -219,7 +219,7 @@ func Uint(key string, defaultValue uint) func() uint {
|
||||
|
||||
var (
|
||||
// NumParallel sets the number of parallel model requests. NumParallel can be configured via the OLLAMA_NUM_PARALLEL environment variable.
|
||||
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 0)
|
||||
NumParallel = Uint("OLLAMA_NUM_PARALLEL", 1)
|
||||
// MaxRunners sets the maximum number of loaded models. MaxRunners can be configured via the OLLAMA_MAX_LOADED_MODELS environment variable.
|
||||
MaxRunners = Uint("OLLAMA_MAX_LOADED_MODELS", 0)
|
||||
// MaxQueue sets the maximum number of queued requests. MaxQueue can be configured via the OLLAMA_MAX_QUEUE environment variable.
|
||||
|
||||
Reference in New Issue
Block a user