Refine CPU load behavior with system memory visibility

2025-12-10 07:46:59 +00:00 · 2024-06-03 19:09:23 -07:00
parent 434dfe30c5
commit fc37c192ae
7 changed files with 211 additions and 98 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -11,6 +11,8 @@ package gpu
 */
 import "C"
 import (
+	"bufio"
+	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -246,6 +248,26 @@ func initOneAPIHandles() *oneapiHandles {
 	return oHandles
 }

+// GetCPUInfo returns a one-element list describing the CPU entry cpus[0].
+// If discovery has not been bootstrapped yet, GetGPUInfo is run first; its
+// bootstrap path is what populates cpus.
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	ready := bootstrapped
+	gpuMutex.Unlock()
+	if !ready {
+		// Called without holding gpuMutex, as before; presumably GetGPUInfo
+		// takes the lock itself.
+		GetGPUInfo()
+	}
+
+	// GetGPUInfo's refresh path assigns cpus[0].FreeMemory, so copy the entry
+	// while holding the lock rather than reading it unsynchronized.
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	return GpuInfoList{cpus[0].GpuInfo}
+}
+
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
 		needRefresh = false
 		cpuCapability = getCPUCapability()
 		var memInfo C.mem_info_t
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return []GpuInfo{}
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		cpuInfo := CPUInfo{
+		cpus = []CPUInfo{CPUInfo{
 			GpuInfo: GpuInfo{
+				memInfo: mem,
 				Library: "cpu",
 				Variant: cpuCapability.ToVariant(),
+				ID:      "0",
 			},
-		}
-		cpuInfo.TotalMemory = uint64(memInfo.total)
-		cpuInfo.FreeMemory = uint64(memInfo.free)
-		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		cpus = []CPUInfo{cpuInfo}
+		}}

 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {

 	// Refresh free memory usage
 	if needRefresh {
-		// TODO - CPU system memory tracking/refresh
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
@@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}

-		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
@@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
 }

 func GetCPUMem() (memInfo, error) {
+	if runtime.GOOS == "linux" {
+		return GetLinuxMemInfo()
+	}
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -651,3 +682,58 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
+
+// GetLinuxMemInfo reads /proc/meminfo and returns total and free system memory
+// in bytes. /proc/meminfo reports sizes in kB, hence the multiplication by 1024.
+//
+// MemAvailable is used as the free figure when the kernel reports it; otherwise
+// free memory is approximated as MemFree + Buffers + Cached. An error is
+// returned when the file cannot be opened or read, when a recognized line
+// fails to parse, or when no MemTotal value was found.
+func GetLinuxMemInfo() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		switch {
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
+			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
+			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, fmt.Errorf("parsing /proc/meminfo line %q: %w", s.Text(), err)
+		}
+
+		// Both preferred values are known; no need to scan the rest of the file.
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * 1024
+			mem.FreeMemory = available * 1024
+			return mem, nil
+		}
+	}
+	// Scan returns false on both EOF and read errors; only Err tells them apart.
+	if err := s.Err(); err != nil {
+		return mem, fmt.Errorf("reading /proc/meminfo: %w", err)
+	}
+	if total == 0 {
+		return mem, fmt.Errorf("MemTotal not found in /proc/meminfo")
+	}
+	// Fallback when MemAvailable was not reported (or was zero).
+	mem.TotalMemory = total * 1024
+	mem.FreeMemory = (free + buffers + cached) * 1024
+	return mem, nil
+}
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -42,6 +42,19 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }

+// GetCPUInfo returns a one-element list describing the CPU: library "cpu",
+// the variant from GetCPUVariant, and the memory figures from GetCPUMem.
+func GetCPUInfo() GpuInfoList {
+	// Any error from GetCPUMem is discarded; mem is used exactly as returned.
+	mem, _ := GetCPUMem()
+	cpu := GpuInfo{
+		Library: "cpu",
+		Variant: GetCPUVariant(),
+		memInfo: mem,
+	}
+	return []GpuInfo{cpu}
+}
+
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
--- a/gpu/gpu_info_cpu.c
+++ b/gpu/gpu_info_cpu.c
@@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
 }

 #elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-//   mem_info_t resp = {0, 0, NULL};
-//   return resp;
-// }
+// Unused - see gpu_darwin.go
 #else
 #error "Unsupported platform"
 #endif
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  char buf[buflen + 1];
  int i;

-  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
-
  struct lookup {
    char *s;
    void **p;
@@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
  }

  // TODO once we've squashed the remaining corner cases remove this log
-//   LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
  
-    LOG(1, "XXX wiring functions nvml_init\n");
-
  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!l[i].p) {
@@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
      return;
    }
  }
-    LOG(1, "XXX calling init_v2\n");

  ret = (*resp->ch.nvmlInit_v2)();
  if (ret != NVML_SUCCESS) {
@@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
    resp->err = strdup(buf);
    return;
  }
-      LOG(1, "XXX nvml_init done\n");
-
 }


@@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
    nvmlDevice_t device;
    nvmlMemory_t memInfo = {0};
    nvmlReturn_t ret;
-    LOG(1, "XXX in nvml_get_free\n");
    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
    if (ret != NVML_SUCCESS) {
        LOG(1, "unable to get device handle %d: %d", device_id, ret);