Request and model concurrency

This change adds support for multiple concurrent requests, as well as for loading multiple models by spawning multiple runners. The defaults are currently 1 concurrent request per model and only 1 loaded model at a time, but both can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS, respectively.
2025-12-11 16:26:59 +00:00 · 2024-03-30 09:50:05 -07:00
parent ee448deaba
commit 34b9db5afc
30 changed files with 2572 additions and 1387 deletions
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -38,12 +38,17 @@
 extern "C" {
 #endif

+#define GPU_ID_LEN 64
+
 typedef struct mem_info {
+  char *err;  // If non-NULL, caller is responsible for freeing
+  char gpu_id[GPU_ID_LEN];  // GPU_ID_LEN-byte buffer; presumably holds a GPU identifier string (confirm against callers)
  uint64_t total;
  uint64_t free;
-  unsigned int count;
-  int igpu_index; // If >= 0, we detected an integrated GPU to ignore
-  char *err;  // If non-nill, caller responsible for freeing
+
+  // Compute Capability
+  int major; 
+  int minor;
 } mem_info_t;

 void cpu_check_ram(mem_info_t *resp);
@@ -52,7 +57,6 @@ void cpu_check_ram(mem_info_t *resp);
 }
 #endif

-#include "gpu_info_nvml.h"
 #include "gpu_info_cudart.h"

 #endif  // __GPU_INFO_H__