mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 16:26:59 +00:00
Request and model concurrency
This change adds support for multiple concurrent requests, as well as loading multiple models by spawning multiple runners. The default settings are currently set at 1 concurrent request per model and only 1 loaded model at a time, but these can be adjusted by setting OLLAMA_NUM_PARALLEL and OLLAMA_MAX_LOADED_MODELS.
This commit is contained in:
@@ -38,12 +38,17 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GPU_ID_LEN 64
|
||||
|
||||
typedef struct mem_info {
|
||||
char *err; // If non-nill, caller responsible for freeing
|
||||
char gpu_id[GPU_ID_LEN];
|
||||
uint64_t total;
|
||||
uint64_t free;
|
||||
unsigned int count;
|
||||
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
|
||||
char *err; // If non-nill, caller responsible for freeing
|
||||
|
||||
// Compute Capability
|
||||
int major;
|
||||
int minor;
|
||||
} mem_info_t;
|
||||
|
||||
void cpu_check_ram(mem_info_t *resp);
|
||||
@@ -52,7 +57,6 @@ void cpu_check_ram(mem_info_t *resp);
|
||||
}
|
||||
#endif
|
||||
|
||||
#include "gpu_info_nvml.h"
|
||||
#include "gpu_info_cudart.h"
|
||||
|
||||
#endif // __GPU_INFO_H__
|
||||
|
||||
Reference in New Issue
Block a user