mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Better nvidia GPU discovery logging
Refine the way we log GPU discovery to improve the non-debug output, and report more actionable log messages when possible to help users troubleshoot on their own.
This commit is contained in:
23
gpu/gpu.go
23
gpu/gpu.go
@@ -202,7 +202,7 @@ func GetGPUInfo() GpuInfoList {
|
||||
}()
|
||||
|
||||
if !bootstrapped {
|
||||
slog.Debug("Detecting GPUs")
|
||||
slog.Info("looking for compatible GPUs")
|
||||
needRefresh = false
|
||||
cpuCapability = GetCPUCapability()
|
||||
var memInfo C.mem_info_t
|
||||
@@ -320,6 +320,9 @@ func GetGPUInfo() GpuInfoList {
|
||||
|
||||
rocmGPUs = AMDGetGPUInfo()
|
||||
bootstrapped = true
|
||||
if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
|
||||
slog.Info("no compatible GPUs were discovered")
|
||||
}
|
||||
}
|
||||
|
||||
// For detected GPUs, load library if not loaded
|
||||
@@ -514,7 +517,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
|
||||
defer C.free(unsafe.Pointer(lib))
|
||||
C.nvcuda_init(lib, &resp)
|
||||
if resp.err != nil {
|
||||
slog.Debug("Unable to load nvcuda", "library", libPath, "error", C.GoString(resp.err))
|
||||
// Decide what log level based on the type of error message to help users understand why
|
||||
msg := C.GoString(resp.err)
|
||||
switch resp.cudaErr {
|
||||
case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
|
||||
slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
|
||||
case C.CUDA_ERROR_NO_DEVICE:
|
||||
slog.Info("no nvidia devices detected", "library", libPath)
|
||||
case C.CUDA_ERROR_UNKNOWN:
|
||||
slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
|
||||
slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
|
||||
default:
|
||||
if strings.Contains(msg, "wrong ELF class") {
|
||||
slog.Debug("skipping 32bit library", "library", libPath)
|
||||
} else {
|
||||
slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
|
||||
}
|
||||
}
|
||||
C.free(unsafe.Pointer(resp.err))
|
||||
} else {
|
||||
return int(resp.num_devices), &resp.ch, libPath
|
||||
|
||||
@@ -7,6 +7,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
CUresult ret;
|
||||
resp->err = NULL;
|
||||
resp->num_devices = 0;
|
||||
resp->cudaErr = CUDA_SUCCESS;
|
||||
const int buflen = 256;
|
||||
char buf[buflen + 1];
|
||||
int i;
|
||||
@@ -38,6 +39,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
nvcuda_lib_path, msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
resp->cudaErr = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -52,6 +54,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
msg);
|
||||
free(msg);
|
||||
resp->err = strdup(buf);
|
||||
resp->cudaErr = -1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -61,12 +64,9 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
|
||||
UNLOAD_LIBRARY(resp->ch.handle);
|
||||
resp->ch.handle = NULL;
|
||||
if (ret == CUDA_ERROR_INSUFFICIENT_DRIVER) {
|
||||
resp->err = strdup("your nvidia driver is too old or missing. If you have a CUDA GPU please upgrade to run ollama");
|
||||
return;
|
||||
}
|
||||
snprintf(buf, buflen, "nvcuda init failure: %d", ret);
|
||||
snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
resp->cudaErr = ret;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -91,6 +91,7 @@ void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
|
||||
resp->ch.handle = NULL;
|
||||
snprintf(buf, buflen, "unable to get device count: %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
resp->cudaErr = ret;
|
||||
return;
|
||||
}
|
||||
}
|
||||
@@ -106,13 +107,13 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
||||
CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
|
||||
|
||||
if (h.handle == NULL) {
|
||||
resp->err = strdup("nvcuda handle isn't initialized");
|
||||
resp->err = strdup("cuda driver library handle isn't initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuDeviceGet)(&device, i);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device failed to initialize");
|
||||
snprintf(buf, buflen, "cuda driver library device failed to initialize");
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
@@ -168,14 +169,14 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
||||
// To get memory we have to set (and release) a context
|
||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda failed to get device context %d", ret);
|
||||
snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
snprintf(buf, buflen, "nvcuda device memory info lookup failure %d", ret);
|
||||
snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
|
||||
resp->err = strdup(buf);
|
||||
// Best effort on failure...
|
||||
(*h.cuCtxDestroy)(ctx);
|
||||
@@ -193,7 +194,7 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
|
||||
|
||||
ret = (*h.cuCtxDestroy)(ctx);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda failed to release device context %d", ret);
|
||||
LOG(1, "cuda driver library failed to release device context %d", ret);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,7 +207,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
|
||||
|
||||
ret = (*h.cuDeviceGet)(&device, i);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda device failed to initialize");
|
||||
LOG(1, "cuda driver library device failed to initialize");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -214,13 +215,13 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
|
||||
// To get memory we have to set (and release) a context
|
||||
ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda failed to get device context %d", ret);
|
||||
LOG(1, "cuda driver library failed to get device context %d", ret);
|
||||
return;
|
||||
}
|
||||
|
||||
ret = (*h.cuMemGetInfo_v2)(free, total);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda device memory info lookup failure %d", ret);
|
||||
LOG(1, "cuda driver library device memory info lookup failure %d", ret);
|
||||
// Best effort on failure...
|
||||
(*h.cuCtxDestroy)(ctx);
|
||||
return;
|
||||
@@ -228,12 +229,12 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total)
|
||||
|
||||
ret = (*h.cuCtxDestroy)(ctx);
|
||||
if (ret != CUDA_SUCCESS) {
|
||||
LOG(1, "nvcuda failed to release device context %d", ret);
|
||||
LOG(1, "cuda driver library failed to release device context %d", ret);
|
||||
}
|
||||
}
|
||||
|
||||
void nvcuda_release(nvcuda_handle_t h) {
|
||||
LOG(h.verbose, "releasing nvcuda library\n");
|
||||
LOG(h.verbose, "releasing cuda driver library\n");
|
||||
UNLOAD_LIBRARY(h.handle);
|
||||
// TODO and other context release logic?
|
||||
h.handle = NULL;
|
||||
|
||||
@@ -7,9 +7,12 @@
|
||||
typedef enum cudaError_enum {
|
||||
CUDA_SUCCESS = 0,
|
||||
CUDA_ERROR_INVALID_VALUE = 1,
|
||||
CUDA_ERROR_MEMORY_ALLOCATION = 2,
|
||||
CUDA_ERROR_OUT_OF_MEMORY = 2,
|
||||
CUDA_ERROR_NOT_INITIALIZED = 3,
|
||||
CUDA_ERROR_INSUFFICIENT_DRIVER = 35,
|
||||
CUDA_ERROR_NO_DEVICE = 100,
|
||||
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803,
|
||||
CUDA_ERROR_UNKNOWN = 999,
|
||||
// Other values omitted for now...
|
||||
} CUresult;
|
||||
|
||||
@@ -64,6 +67,7 @@ typedef struct nvcuda_init_resp {
|
||||
char *err; // If err is non-null handle is invalid
|
||||
nvcuda_handle_t ch;
|
||||
int num_devices;
|
||||
CUresult cudaErr;
|
||||
} nvcuda_init_resp_t;
|
||||
|
||||
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
|
||||
|
||||
Reference in New Issue
Block a user