Harden GPU mgmt library lookup

When multiple management libraries are installed on a system, not every
one will be compatible with the current driver.  This change improves our
management library lookup algorithm: it builds up a set of discovered
libraries based on glob patterns, and then tries each of them in turn until
one loads without error.
This commit is contained in:
Daniel Hiltgen
2024-01-10 14:39:51 -08:00
parent 9754ae4c89
commit 3c49c3ab0d
5 changed files with 169 additions and 66 deletions

View File

@@ -4,26 +4,9 @@
#include <string.h>
#ifndef _WIN32
const char *cuda_lib_paths[] = {
"libnvidia-ml.so",
"/usr/local/cuda/lib64/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
"/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
"/usr/lib/wsl/lib/libnvidia-ml.so.1", // TODO Maybe glob?
NULL,
};
#else
const char *cuda_lib_paths[] = {
"nvml.dll",
"",
NULL,
};
#endif
#define CUDA_LOOKUP_SIZE 6
void cuda_init(cuda_init_resp_t *resp) {
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
const int buflen = 256;
@@ -42,16 +25,12 @@ void cuda_init(cuda_init_resp_t *resp) {
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
};
for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
}
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
// TODO improve error message, as the LOAD_ERR will typically have the
// final path that was checked, which might be confusing.
char *msg = LOAD_ERR();
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_paths[0], msg);
cuda_lib_path, msg);
free(msg);
resp->err = strdup(buf);
return;
@@ -73,6 +52,8 @@ void cuda_init(cuda_init_resp_t *resp) {
ret = (*resp->ch.initFn)();
if (ret != NVML_SUCCESS) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
resp->err = strdup(buf);
}