mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Request and model concurrency
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
This commit is contained in:
49
gpu/types.go
49
gpu/types.go
@@ -3,7 +3,6 @@ package gpu
|
||||
type memInfo struct {
|
||||
TotalMemory uint64 `json:"total_memory,omitempty"`
|
||||
FreeMemory uint64 `json:"free_memory,omitempty"`
|
||||
DeviceCount uint32 `json:"device_count,omitempty"`
|
||||
}
|
||||
|
||||
// Beginning of an `ollama info` command
|
||||
@@ -17,11 +16,49 @@ type GpuInfo struct {
|
||||
// MinimumMemory represents the minimum memory required to use the GPU
|
||||
MinimumMemory uint64 `json:"-"`
|
||||
|
||||
// TODO add other useful attributes about the card here for discovery information
|
||||
// Any extra PATH/LD_LIBRARY_PATH dependencies required for the Library to operate properly
|
||||
DependencyPath string `json:"lib_path,omitempty"`
|
||||
|
||||
// GPU information
|
||||
ID string `json:"gpu_id"` // string to use for selection of this specific GPU
|
||||
Name string `json:"name"` // user friendly name if available
|
||||
Major int `json:"major,omitempty"` // Major compatibility version (CC or gfx)
|
||||
Minor int `json:"minor,omitempty"` // Minor compatibility version (CC or gfx)
|
||||
Patch int `json:"patch,omitempty"` // Patch compatibility only matters on AMD
|
||||
|
||||
// TODO other performance capability info to help in scheduling decisions
|
||||
}
|
||||
|
||||
type Version struct {
|
||||
Major uint
|
||||
Minor uint
|
||||
Patch uint
|
||||
type GpuInfoList []GpuInfo
|
||||
|
||||
// Split up the set of gpu info's by Library and variant
|
||||
func (l GpuInfoList) ByLibrary() []GpuInfoList {
|
||||
resp := []GpuInfoList{}
|
||||
libs := []string{}
|
||||
for _, info := range l {
|
||||
found := false
|
||||
requested := info.Library
|
||||
if info.Variant != "" {
|
||||
requested += "_" + info.Variant
|
||||
}
|
||||
for i, lib := range libs {
|
||||
if lib == requested {
|
||||
resp[i] = append(resp[i], info)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
libs = append(libs, info.Library)
|
||||
resp = append(resp, []GpuInfo{info})
|
||||
}
|
||||
}
|
||||
return resp
|
||||
}
|
||||
|
||||
// Sort by Free Space
|
||||
type ByFreeMemory []GpuInfo
|
||||
|
||||
func (a ByFreeMemory) Len() int { return len(a) }
|
||||
func (a ByFreeMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
|
||||
|
||||
Reference in New Issue
Block a user