Harden GPU mgmt library lookup

When there are multiple management libraries installed on a system not every one will be compatible with the current driver. This change improves our management library algorithm to build up a set of discovered libraries based on glob patterns, and then try all of them until we're able to load one without error.
2025-12-11 16:26:59 +00:00 · 2024-01-10 14:39:51 -08:00
parent 9754ae4c89
commit 3c49c3ab0d
5 changed files with 169 additions and 66 deletions
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,26 +4,9 @@

 #include <string.h>

-#ifndef _WIN32
-const char *cuda_lib_paths[] = {
-    "libnvidia-ml.so",
-    "/usr/local/cuda/lib64/libnvidia-ml.so",
-    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
-    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
-    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
-    NULL,
-};
-#else
-const char *cuda_lib_paths[] = {
-    "nvml.dll",
-    "",
-    NULL,
-};
-#endif
-
 #define CUDA_LOOKUP_SIZE 6

-void cuda_init(cuda_init_resp_t *resp) {
+void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
@@ -42,16 +25,12 @@ void cuda_init(cuda_init_resp_t *resp) {
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

-  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
-    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
-  }
+  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
-    // TODO improve error message, as the LOAD_ERR will have typically have the
-    // final path that was checked which might be confusing.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
-             cuda_lib_paths[0], msg);
+             cuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
@@ -73,6 +52,8 @@ void cuda_init(cuda_init_resp_t *resp) {

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }