Files
ollama37/tests/testcases/runtime/TC-RUNTIME-002.yml
Shang Chieh Tseng 2c5094db92 Add LogCollector for precise test log boundaries
Problem: Tests used `docker compose logs --since=5m` which caused:
- Log overlap between tests
- Logs from previous tests included
- Missing logs if test exceeded 5 minutes

Solution:
- New LogCollector class runs `docker compose logs --follow`
- Marks test start/end boundaries
- Writes test-specific logs to /tmp/test-{testId}-logs.txt
- Test steps access them via the TEST_ID environment variable

Changes:
- tests/src/log-collector.ts: New LogCollector class
- tests/src/executor.ts: Integrate LogCollector, set TEST_ID env
- tests/src/cli.ts: Start/stop LogCollector for runtime/inference
- All test cases: Use log collector with fallback to docker compose

Also updated docs/CICD.md with:
- Test runner CLI documentation
- Judge modes (simple, llm, dual)
- Log collector integration
- Updated test case list (12b, 27b models)
- Model unload strategy
- Troubleshooting guide

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 17:46:49 +08:00
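
A minimal TypeScript sketch of the collector described in the commit message, to make the flow concrete. The method names (`start`, `beginTest`, `endTest`, `stop`) and the in-memory buffer are assumptions for illustration; only the long-lived `docker compose logs --follow` process, the per-test boundaries, and the `/tmp/test-{testId}-logs.txt` output path come from the commit message. The actual implementation lives in `tests/src/log-collector.ts`.

```typescript
// Hypothetical sketch, not the contents of tests/src/log-collector.ts.
import { spawn, ChildProcess } from "node:child_process";
import { appendFileSync, writeFileSync } from "node:fs";

class LogCollector {
  private proc: ChildProcess | null = null;
  private buffer: string[] = [];
  private startIndex = 0;

  // Start one long-lived `docker compose logs --follow` for the whole run.
  start(composeDir: string): void {
    this.proc = spawn("docker", ["compose", "logs", "--follow", "--no-color"], {
      cwd: composeDir,
    });
    this.proc.stdout?.on("data", (chunk: Buffer) => {
      this.buffer.push(chunk.toString());
    });
  }

  // Mark where this test's logs begin and reset its output file.
  beginTest(testId: string): void {
    this.startIndex = this.buffer.length;
    writeFileSync(`/tmp/test-${testId}-logs.txt`, "");
  }

  // Write only the lines captured since beginTest() to the per-test file.
  endTest(testId: string): void {
    const slice = this.buffer.slice(this.startIndex).join("");
    appendFileSync(`/tmp/test-${testId}-logs.txt`, slice);
  }

  stop(): void {
    this.proc?.kill("SIGTERM");
    this.proc = null;
  }
}
```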

id: TC-RUNTIME-002
name: GPU Detection
suite: runtime
priority: 2
timeout: 120000
dependencies:
- TC-RUNTIME-001
steps:
- name: Check nvidia-smi inside container
  command: docker exec ollama37 nvidia-smi
- name: Check CUDA libraries
  command: docker exec ollama37 ldconfig -p | grep -i cuda | head -5
- name: Verify UVM device files
  command: |
    if [ ! -e /dev/nvidia-uvm ]; then
      echo "WARNING: UVM device missing, creating with nvidia-modprobe..."
      sudo nvidia-modprobe -u -c=0
      echo "Restarting container to pick up UVM devices..."
      cd docker && docker compose restart
      sleep 15
      echo "UVM device fix applied"
    else
      echo "SUCCESS: UVM device file present"
      ls -l /dev/nvidia-uvm
    fi
- name: Verify GPU detection in Ollama logs
  command: |
    # Use log collector file if available, fallback to docker compose logs
    if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
      LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
    else
      LOGS=$(cd docker && docker compose logs 2>&1)
    fi
    echo "=== GPU Detection Check ==="
    # Check for inference compute with CUDA library
    if echo "$LOGS" | grep -q "inference compute.*library=CUDA"; then
      echo "SUCCESS: GPU detected with CUDA library"
      echo "$LOGS" | grep "inference compute" | head -2
    else
      echo "ERROR: GPU not detected with CUDA library"
      exit 1
    fi
    # Check for Tesla K80 specifically
    if echo "$LOGS" | grep -q 'description="Tesla K80"'; then
      echo "SUCCESS: Tesla K80 GPU identified"
    else
      echo "WARNING: Tesla K80 not explicitly identified"
    fi
    # Check compute capability 3.7
    if echo "$LOGS" | grep -q "compute=3.7"; then
      echo "SUCCESS: Compute capability 3.7 detected"
    else
      echo "WARNING: Compute capability 3.7 not detected"
    fi
- name: Check for GPU-related errors in logs
  command: |
    # Use log collector file if available, fallback to docker compose logs
    if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
      LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
    else
      LOGS=$(cd docker && docker compose logs 2>&1)
    fi
    echo "=== GPU Error Check ==="
    # Check for critical CUDA/CUBLAS errors
    if echo "$LOGS" | grep -qE "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed|out of memory)"; then
      echo "CRITICAL GPU ERRORS FOUND:"
      echo "$LOGS" | grep -E "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed|out of memory)"
      exit 1
    fi
    # Check for CPU fallback (bad!)
    if echo "$LOGS" | grep -q "id=cpu library=cpu"; then
      echo "ERROR: Ollama fell back to CPU-only mode"
      exit 1
    fi
    echo "SUCCESS: No GPU-related errors found"
- name: Display GPU memory status from logs
  command: |
    # Use log collector file if available, fallback to docker compose logs
    if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
      LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
    else
      LOGS=$(cd docker && docker compose logs 2>&1)
    fi
    echo "=== GPU Memory Status ==="
    echo "$LOGS" | grep -E "gpu memory.*library=CUDA" | tail -4
criteria: |
  Tesla K80 GPU should be detected by both nvidia-smi AND Ollama CUDA runtime.
  Expected:
  - nvidia-smi shows Tesla K80 GPU(s) with Driver 470.x
  - CUDA libraries are available (libcuda, libcublas, etc.)
  - /dev/nvidia-uvm device file exists (required for CUDA runtime)
  - Ollama logs show "inference compute" with "library=CUDA"
  - NO "id=cpu library=cpu" (CPU fallback)
  - NO CUBLAS_STATUS_ errors
  - NO CUDA error messages
  NOTE: "ggml_cuda_init" and "load_backend" only appear when a model is loaded,
  so they are checked in inference tests, not here.
  NOTE: If nvidia-smi works but Ollama shows only CPU, the UVM device
  files are missing. The test will auto-fix with nvidia-modprobe -u -c=0.
  The K80 has 12GB VRAM per GPU. Accept variations in reported memory.
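
For context on the `TEST_ID` variable the steps above rely on: the runner has to export it before spawning each step's shell command, otherwise the `-f "/tmp/test-${TEST_ID}-logs.txt"` check always falls back to `docker compose logs`. Below is a rough sketch of that wiring, assuming a Node-based executor; `runStep` and the `bash -c` invocation are illustrative, not the actual `tests/src/executor.ts`.

```typescript
// Hypothetical step runner: shows how TEST_ID could be exposed to the shell
// snippets defined in the YAML steps. Not the real executor.ts.
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const run = promisify(execFile);

async function runStep(testId: string, command: string, timeoutMs: number) {
  // Each step is a shell snippet, so hand it to bash -c with TEST_ID set.
  return run("bash", ["-c", command], {
    env: { ...process.env, TEST_ID: testId },
    timeout: timeoutMs, // e.g. the 120000 ms from this test case's `timeout`
  });
}

// Example: runStep("TC-RUNTIME-002", "docker exec ollama37 nvidia-smi", 120000)
```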