Support multiple variants for a given llm lib type

In some cases we may want multiple variants for a given GPU type or CPU.
This adds logic to have an optional Variant which we can use to select
an optimal library, but also allows us to try multiple variants in case
some fail to load.

This can be useful for scenarios such as ROCm v5 vs v6 incompatibility
or potentially CPU features.
This commit is contained in:
Daniel Hiltgen
2024-01-05 12:13:08 -08:00
parent b24e8d17b2
commit 8da7bef05f
16 changed files with 428 additions and 212 deletions

View File

@@ -145,6 +145,15 @@ func GetGPUInfo() GpuInfo {
C.free(unsafe.Pointer(memInfo.err))
} else {
resp.Library = "rocm"
var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version)
verString := C.GoString(version.str)
if version.status == 0 {
resp.Variant = "v" + verString
} else {
log.Printf("failed to look up ROCm version: %s", verString)
}
C.free(unsafe.Pointer(version.str))
}
}
if resp.Library == "" {

View File

@@ -4,6 +4,8 @@
#include <string.h>
#define ROCM_LOOKUP_SIZE 5
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
rsmi_status_t ret;
resp->err = NULL;
@@ -13,11 +15,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct lookup {
char *s;
void **p;
} l[4] = {
} l[ROCM_LOOKUP_SIZE] = {
{"rsmi_init", (void *)&resp->rh.initFn},
{"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
{"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
{"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
{"rsmi_version_get", (void *)&resp->rh.versionGetFn},
// { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
};
@@ -32,7 +35,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
return;
}
for (i = 0; i < 4; i++) {
for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
*l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
if (!l[i].p) {
UNLOAD_LIBRARY(resp->rh.handle);
@@ -103,4 +106,25 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
return;
}
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
const int buflen = 256;
char buf[buflen + 1];
if (h.handle == NULL) {
resp->str = strdup("nvml handle not initialized");
resp->status = 1;
return;
}
rsmi_version_t ver;
rsmi_status_t ret;
ret = h.versionGetFn(&ver);
if (ret != RSMI_STATUS_SUCCESS) {
snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
resp->status = 1;
} else {
snprintf(buf, buflen, "%d", ver.major);
resp->status = 0;
}
resp->str = strdup(buf);
}
#endif // __APPLE__

View File

@@ -15,12 +15,20 @@ typedef enum rsmi_memory_type {
RSMI_MEM_TYPE_GTT,
} rsmi_memory_type_t;
typedef struct {
uint32_t major;
uint32_t minor;
uint32_t patch;
const char *build;
} rsmi_version_t;
typedef struct rocm_handle {
void *handle;
rsmi_status_t (*initFn)(uint64_t);
rsmi_status_t (*shutdownFn)(void);
rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
// rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
} rocm_handle_t;
@@ -29,8 +37,14 @@ typedef struct rocm_init_resp {
rocm_handle_t rh;
} rocm_init_resp_t;
typedef struct rocm_version_resp {
rsmi_status_t status;
char *str; // Contains version or error string if status != 0
} rocm_version_resp_t;
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp);
void rocm_check_vram(rocm_handle_t rh, mem_info_t *resp);
void rocm_get_version(rocm_handle_t rh, rocm_version_resp_t *resp);
#endif // __GPU_INFO_ROCM_H__
#endif // __APPLE__

View File

@@ -11,5 +11,8 @@ type GpuInfo struct {
memInfo
Library string `json:"library,omitempty"`
// Optional variant to select (e.g. versions, cpu feature flags)
Variant string `json:"variant,omitempty"`
// TODO add other useful attributes about the card here for discovery information
}