More logging for GPU management

Fix an ordering glitch between dlerr and dlclose, and add more logging
to help root-cause some crashes users are hitting. This also renames the
function pointers to match the underlying function names, instead of
simplified names, for readability.
This commit is contained in:
Daniel Hiltgen
2024-01-24 10:32:00 -08:00
parent f63dc2db5c
commit 013fd07139
5 changed files with 61 additions and 44 deletions

View File

@@ -4,8 +4,6 @@
#include <string.h>
#define CUDA_LOOKUP_SIZE 12
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
nvmlReturn_t ret;
resp->err = NULL;
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[CUDA_LOOKUP_SIZE] = {
{"nvmlInit_v2", (void *)&resp->ch.initFn},
{"nvmlShutdown", (void *)&resp->ch.shutdownFn},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
} l[] = {
{"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
{"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
{"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
{"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
{"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
{"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
{"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
{"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
{"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
{"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
{"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
{"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
{NULL, NULL},
};
resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
if (!resp->ch.handle) {
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
snprintf(buf, buflen,
"Unable to load %s library to query for Nvidia GPUs: %s",
cuda_lib_path, msg);
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
return;
}
for (i = 0; i < CUDA_LOOKUP_SIZE; i++) { // TODO - fix this to use a null terminated list
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
for (i = 0; l[i].s != NULL; i++) {
// TODO once we've squashed the remaining corner cases remove this log
LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
*l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
char *msg = LOAD_ERR();
LOG(resp->ch.verbose, "dlerr: %s\n", msg);
UNLOAD_LIBRARY(resp->ch.handle);
snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
msg);
free(msg);
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
}
}
ret = (*resp->ch.initFn)();
ret = (*resp->ch.nvmlInit_v2)();
if (ret != NVML_SUCCESS) {
LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
UNLOAD_LIBRARY(resp->ch.handle);
resp->ch.handle = NULL;
snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return;
}
ret = (*h.getCount)(&resp->count);
ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp->total = 0;
resp->free = 0;
for (i = 0; i < resp->count; i++) {
ret = (*h.getHandle)(i, &device);
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getMemInfo)(device, &memInfo);
ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
resp->err = strdup(buf);
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
}
unsigned int devices;
ret = (*h.getCount)(&devices);
ret = (*h.nvmlDeviceGetCount_v2)(&devices);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device count: %d", ret);
resp->err = strdup(buf);
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
}
for (i = 0; i < devices; i++) {
ret = (*h.getHandle)(i, &device);
ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
resp->err = strdup(buf);
return;
}
ret = (*h.getComputeCapability)(device, &major, &minor);
ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
if (ret != NVML_SUCCESS) {
snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
resp->err = strdup(buf);