Verify permissions for AMD GPU (#6736)

This adds back a check which was lost many releases back to verify /dev/kfd permissions
which when lacking, can lead to confusing failure modes of:
  "rocBLAS error: Could not initialize Tensile host: No devices found"

This implementation does not hard fail the serve command but instead will fall back to CPU
with an error log.  In the future we can include this in the GPU discovery UX to show
detected but unsupported devices we discovered.
This commit is contained in:
Daniel Hiltgen
2024-09-11 11:38:25 -07:00
committed by GitHub
parent 735a0ca2e4
commit 9246e6dd15
2 changed files with 32 additions and 0 deletions

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"io/fs"
"log/slog"
"os"
"path/filepath"
@@ -359,6 +360,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
if len(resp) == 0 {
slog.Info("no compatible amdgpu devices detected")
}
if err := verifyKFDDriverAccess(); err != nil {
slog.Error("amdgpu devices detected but permission problems block access", "error", err)
return nil
}
return resp
}
@@ -455,3 +460,19 @@ func getFreeMemory(usedFile string) (uint64, error) {
}
return usedMemory, nil
}
func verifyKFDDriverAccess() error {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0o666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("permissions not set up properly. Either run ollama as root, or add you user account to the render group. %w", err)
} else if errors.Is(err, fs.ErrNotExist) {
// Container runtime failure?
return fmt.Errorf("kfd driver not loaded. If running in a container, remember to include '--device /dev/kfd --device /dev/dri'")
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
return nil
}