Problem: Tests used `docker compose logs --since=5m`, which caused:
- Log overlap between tests
- Logs from previous tests being included
- Missing logs when a test ran longer than 5 minutes
Solution:
- New LogCollector class runs `docker compose logs --follow` (see the sketch after this list)
- Marks test start/end boundaries
- Writes test-specific logs to /tmp/test-{testId}-logs.txt
- Test steps access them via the TEST_ID environment variable
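For illustration, a minimal sketch of how such a collector can work. The method names (`start`, `markTestStart`, `markTestEnd`, `stop`) and the `--no-color` flag are assumptions for this sketch, not necessarily the exact API in tests/src/log-collector.ts:

```typescript
import { spawn, type ChildProcess } from "node:child_process";
import { appendFileSync, writeFileSync } from "node:fs";

export class LogCollector {
  private proc: ChildProcess | null = null;
  private currentTestId: string | null = null;

  // One long-lived follower for the whole run; no --since time window.
  start(composeDir: string): void {
    this.proc = spawn("docker", ["compose", "logs", "--follow", "--no-color"], {
      cwd: composeDir,
    });
    this.proc.stdout?.on("data", (chunk: Buffer) => {
      // Persist output only between a test's start and end marks.
      if (this.currentTestId !== null) {
        appendFileSync(`/tmp/test-${this.currentTestId}-logs.txt`, chunk);
      }
    });
  }

  // Boundary marker: begin capturing logs for this test.
  markTestStart(testId: string): void {
    writeFileSync(`/tmp/test-${testId}-logs.txt`, ""); // truncate any stale file
    this.currentTestId = testId;
  }

  // Boundary marker: stop attributing new log lines to this test.
  markTestEnd(): void {
    this.currentTestId = null;
  }

  stop(): void {
    this.proc?.kill();
    this.proc = null;
  }
}
```

Because a single follower runs for the whole suite, each test's file contains exactly the log lines emitted between its start and end marks, however long the test runs.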
Changes:
- tests/src/log-collector.ts: New LogCollector class
- tests/src/executor.ts: Integrate LogCollector, set TEST_ID env (wiring sketched below)
- tests/src/cli.ts: Start/stop LogCollector for runtime/inference
- All test cases: Use log collector with fallback to docker compose
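A rough sketch of the executor side, assuming the LogCollector above; `runTest` and the step shape are hypothetical stand-ins for the real executor:

```typescript
const collector = new LogCollector();
collector.start("docker"); // compose project lives in ./docker

async function runTest(testId: string, steps: Array<() => Promise<void>>): Promise<void> {
  collector.markTestStart(testId);
  process.env.TEST_ID = testId; // steps resolve /tmp/test-${TEST_ID}-logs.txt
  try {
    for (const step of steps) {
      await step();
    }
  } finally {
    collector.markTestEnd(); // end boundary even if a step fails
  }
}
```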
Also updated docs/CICD.md with:
- Test runner CLI documentation
- Judge modes (simple, llm, dual)
- Log collector integration
- Updated test case list (12b, 27b models)
- Model unload strategy
- Troubleshooting guide
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
103 lines · 3.4 KiB · YAML
id: TC-INFERENCE-002
name: Basic Inference
suite: inference
priority: 2
timeout: 180000

dependencies:
  - TC-INFERENCE-001

steps:
  - name: Run simple math question
    command: docker exec ollama37 ollama run gemma3:4b "What is 2+2? Answer with just the number." 2>&1
    timeout: 120000

  - name: Check GPU memory usage
    command: docker exec ollama37 nvidia-smi --query-compute-apps=pid,used_memory --format=csv 2>/dev/null || echo "No GPU processes"

  - name: Check for inference errors in logs
    command: |
      # Use log collector file if available, fallback to docker compose logs
      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
      else
        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
      fi

      echo "=== Inference Error Check ==="

      # Check for CUBLAS errors (critical for K80)
      if echo "$LOGS" | grep -qE "CUBLAS_STATUS_"; then
        echo "CRITICAL: CUBLAS error during inference:"
        echo "$LOGS" | grep -E "CUBLAS_STATUS_"
        exit 1
      fi

      # Check for CUDA errors
      if echo "$LOGS" | grep -qE "CUDA error"; then
        echo "CRITICAL: CUDA error during inference:"
        echo "$LOGS" | grep -E "CUDA error"
        exit 1
      fi

      # Check for compute graph errors
      if echo "$LOGS" | grep -qiE "(compute.*failed|graph.*error)"; then
        echo "ERROR: Compute graph error:"
        echo "$LOGS" | grep -iE "(compute.*failed|graph.*error)"
        exit 1
      fi

      echo "SUCCESS: No inference errors in logs"

  - name: Verify inference request in logs
    command: |
      # Use log collector file if available, fallback to docker compose logs
      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
      else
        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
      fi

      echo "=== Inference Request Verification ==="

      # Check for generate API call
      if echo "$LOGS" | grep -qE '\[GIN\].*POST.*/api/generate'; then
        echo "SUCCESS: Generate API request logged"
        echo "$LOGS" | grep -E '\[GIN\].*POST.*/api/generate' | tail -2
      else
        echo "WARNING: Generate API request not found in recent logs"
      fi

      # Check for successful response (200 status)
      if echo "$LOGS" | grep -qE '\[GIN\].*200.*POST'; then
        echo "SUCCESS: Inference returned 200 status"
      else
        echo "WARNING: Could not verify 200 status"
      fi

  - name: Display recent CUDA activity from logs
    command: |
      # Use log collector file if available, fallback to docker compose logs
      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
      else
        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
      fi

      echo "=== Recent CUDA Activity ==="
      echo "$LOGS" | grep -iE "(CUDA|cuda|device=CUDA)" | tail -5 || echo "No recent CUDA activity logged"

criteria: |
  Basic inference should work on Tesla K80.

  Expected:
  - Model responds to the math question
  - Response should indicate "4" (accept variations: "4", "four", "The answer is 4", etc.)
  - GPU memory should be allocated during inference
  - NO CUBLAS_STATUS_ errors in logs (critical for K80 compatibility)
  - NO CUDA error messages in logs
  - Generate API request logged with 200 status

  This is AI-generated output - accept reasonable variations.
  Focus on the model producing a coherent response without GPU errors.