diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d6935e58..71e8b69e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,7 +2,33 @@ name: Build Verification
 on:
   workflow_dispatch:  # Manual trigger
+    inputs:
+      judge_mode:
+        description: 'Test judge mode'
+        required: false
+        default: 'simple'
+        type: choice
+        options:
+          - 'simple'
+          - 'llm'
+          - 'dual'
+      judge_model:
+        description: 'LLM model for judging (if llm/dual mode)'
+        required: false
+        default: 'gemma3:4b'
+        type: string
   workflow_call:  # Called by other workflows
+    inputs:
+      judge_mode:
+        description: 'Test judge mode (simple, llm, dual)'
+        required: false
+        default: 'simple'
+        type: string
+      judge_model:
+        description: 'LLM model for judging'
+        required: false
+        default: 'gemma3:4b'
+        type: string
     outputs:
       result:
         description: "Build test result"
@@ -31,8 +57,23 @@ jobs:
         id: build-tests
         run: |
           cd tests
+
+          # Build judge flags based on input
+          JUDGE_FLAGS=""
+          if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
+            JUDGE_FLAGS="--no-llm"
+          elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
+            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          else
+            # llm mode
+            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          fi
+
+          echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
+          echo "Judge flags: $JUDGE_FLAGS"
+
           # Progress goes to stderr (visible), JSON results go to file
-          npm run --silent dev -- run --suite build --no-llm --output json > /tmp/build-results.json || true
+          npm run --silent dev -- run --suite build $JUDGE_FLAGS --output json > /tmp/build-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/build-results.json
diff --git a/.github/workflows/full-pipeline.yml b/.github/workflows/full-pipeline.yml
index a29dcf13..4de74db9 100644
--- a/.github/workflows/full-pipeline.yml
+++ b/.github/workflows/full-pipeline.yml
@@ -3,10 +3,24 @@ name: Full Pipeline
 on:
   workflow_dispatch:  # Manual trigger
     inputs:
-      skip_llm_judge:
-        description: 'Skip LLM judge evaluation'
+      judge_mode:
+        description: 'Test judge mode'
         required: false
-        default: 'false'
+        default: 'simple'
+        type: choice
+        options:
+          - 'simple'
+          - 'llm'
+          - 'dual'
+      judge_model:
+        description: 'LLM model for judging (if llm/dual mode)'
+        required: false
+        default: 'gemma3:4b'
+        type: string
+      skip_llm_judge_stage:
+        description: 'Skip separate LLM judge evaluation stage'
+        required: false
+        default: 'true'
         type: choice
         options:
           - 'true'
           - 'false'
@@ -19,6 +33,9 @@ jobs:
   build:
     name: Build Verification
     uses: ./.github/workflows/build.yml
+    with:
+      judge_mode: ${{ inputs.judge_mode || 'simple' }}
+      judge_model: ${{ inputs.judge_model || 'gemma3:4b' }}
 
   start-container:
     name: Start Container
@@ -62,7 +79,22 @@ jobs:
         id: runtime-tests
         run: |
           cd tests
-          npm run --silent dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json || true
+
+          # Build judge flags based on input
+          JUDGE_FLAGS=""
+          if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
+            JUDGE_FLAGS="--no-llm"
+          elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
+            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          else
+            # llm mode
+            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          fi
+
+          echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
+          echo "Judge flags: $JUDGE_FLAGS"
+
+          npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/runtime-results.json
@@ -104,7 +136,22 @@ jobs:
         id: inference-tests
         run: |
           cd tests
-          npm run --silent dev -- run --suite inference --no-llm --output json > /tmp/inference-results.json || true
+
+          # Build judge flags based on input
+          JUDGE_FLAGS=""
+          if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
+            JUDGE_FLAGS="--no-llm"
+          elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
+            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          else
+            # llm mode
+            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          fi
+
+          echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
+          echo "Judge flags: $JUDGE_FLAGS"
+
+          npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/inference-results.json
@@ -129,7 +176,7 @@ jobs:
     name: LLM Judge Evaluation
     runs-on: self-hosted
     needs: [build, runtime, inference]
-    if: ${{ inputs.skip_llm_judge != 'true' }}
+    if: ${{ inputs.skip_llm_judge_stage != 'true' }}
 
     steps:
       - name: Checkout
@@ -152,7 +199,9 @@ jobs:
         run: |
           cd tests
           echo "Running LLM judge evaluation..."
-          npm run --silent dev -- run --output json > /tmp/llm-judged-results.json || true
+          echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}"
+
+          npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/llm-judged-results.json
diff --git a/.github/workflows/inference.yml b/.github/workflows/inference.yml
index f1b5de43..4b810806 100644
--- a/.github/workflows/inference.yml
+++ b/.github/workflows/inference.yml
@@ -11,6 +11,20 @@ on:
         options:
           - 'true'
           - 'false'
+      judge_mode:
+        description: 'Test judge mode'
+        required: false
+        default: 'simple'
+        type: choice
+        options:
+          - 'simple'
+          - 'llm'
+          - 'dual'
+      judge_model:
+        description: 'LLM model for judging (if llm/dual mode)'
+        required: false
+        default: 'gemma3:4b'
+        type: string
   workflow_call:  # Called by other workflows
     inputs:
       use_existing_container:
         required: false
         default: false
         type: boolean
+      judge_mode:
+        description: 'Test judge mode (simple, llm, dual)'
+        required: false
+        default: 'simple'
+        type: string
+      judge_model:
+        description: 'LLM model for judging'
+        required: false
+        default: 'gemma3:4b'
+        type: string
     outputs:
       result:
         description: "Inference test result"
@@ -57,8 +81,23 @@
         id: inference-tests
         run: |
           cd tests
+
+          # Build judge flags based on input
+          JUDGE_FLAGS=""
+          if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
+            JUDGE_FLAGS="--no-llm"
+          elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
+            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          else
+            # llm mode
+            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          fi
+
+          echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
+          echo "Judge flags: $JUDGE_FLAGS"
+
           # Progress goes to stderr (visible), JSON results go to file
-          npm run --silent dev -- run --suite inference --no-llm --output json > /tmp/inference-results.json || true
+          npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/inference-results.json
diff --git a/.github/workflows/runtime.yml b/.github/workflows/runtime.yml
index 70b863f0..e239415d 100644
--- a/.github/workflows/runtime.yml
+++ b/.github/workflows/runtime.yml
@@ -11,6 +11,20 @@ on:
         options:
           - 'true'
           - 'false'
+      judge_mode:
+        description: 'Test judge mode'
+        required: false
+        default: 'simple'
+        type: choice
+        options:
+          - 'simple'
+          - 'llm'
+          - 'dual'
+      judge_model:
+        description: 'LLM model for judging (if llm/dual mode)'
+        required: false
+        default: 'gemma3:4b'
+        type: string
   workflow_call:  # Called by other workflows
     inputs:
       keep_container:
         required: false
         default: false
         type: boolean
+      judge_mode:
+        description: 'Test judge mode (simple, llm, dual)'
+        required: false
+        default: 'simple'
+        type: string
+      judge_model:
+        description: 'LLM model for judging'
+        required: false
+        default: 'gemma3:4b'
+        type: string
     outputs:
       result:
         description: "Runtime test result"
@@ -53,8 +77,23 @@
         id: runtime-tests
         run: |
           cd tests
+
+          # Build judge flags based on input
+          JUDGE_FLAGS=""
+          if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
+            JUDGE_FLAGS="--no-llm"
+          elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
+            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          else
+            # llm mode
+            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
+          fi
+
+          echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
+          echo "Judge flags: $JUDGE_FLAGS"
+
           # Progress goes to stderr (visible), JSON results go to file
-          npm run --silent dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json || true
+          npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
 
           echo "--- JSON Results ---"
           cat /tmp/runtime-results.json
diff --git a/tests/testcases/inference/TC-INFERENCE-001.yml b/tests/testcases/inference/TC-INFERENCE-001.yml
index b6858cec..51f212c3 100644
--- a/tests/testcases/inference/TC-INFERENCE-001.yml
+++ b/tests/testcases/inference/TC-INFERENCE-001.yml
@@ -25,6 +25,84 @@ steps:
         | jq -r '.response' | head -c 100
     timeout: 300000
 
+  - name: Verify model loading in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Model Loading Check ==="
+
+      # Check for model loading message
+      if echo "$LOGS" | grep -q 'msg="loading model"'; then
+        echo "SUCCESS: Model loading initiated"
+        echo "$LOGS" | grep 'msg="loading model"' | tail -1
+      else
+        echo "WARNING: Model loading message not found"
+      fi
+
+      # Check for layer offloading to GPU
+      if echo "$LOGS" | grep -q "offloaded.*layers to GPU"; then
+        echo "SUCCESS: Model layers offloaded to GPU"
+        echo "$LOGS" | grep "offloaded.*layers to GPU" | tail -1
+      else
+        echo "ERROR: Model layers not offloaded to GPU"
+        exit 1
+      fi
+
+      # Check model weights loaded
+      if echo "$LOGS" | grep -q 'msg="model weights loaded successfully"'; then
+        echo "SUCCESS: Model weights loaded"
+      else
+        echo "WARNING: Model weights loaded message not found"
+      fi
+
+  - name: Verify llama runner started
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Llama Runner Check ==="
+
+      # Check llama runner started
+      if echo "$LOGS" | grep -q "llama runner started"; then
+        echo "SUCCESS: Llama runner started"
+        echo "$LOGS" | grep "llama runner started" | tail -1
+      else
+        echo "ERROR: Llama runner not started"
+        exit 1
+      fi
+
+  - name: Check for model loading errors
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Model Loading Error Check ==="
+
+      # Check for CUDA/CUBLAS errors during model load
+      if echo "$LOGS" | grep -qE "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"; then
+        echo "CRITICAL CUDA ERRORS during model load:"
+        echo "$LOGS" | grep -E "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"
+        exit 1
+      fi
+
+      # Check for out of memory
+      if echo "$LOGS" | grep -qi "out of memory"; then
+        echo "ERROR: Out of memory during model load"
+        echo "$LOGS" | grep -i "out of memory"
+        exit 1
+      fi
+
+      echo "SUCCESS: No model loading errors"
+
+  - name: Display model memory allocation from logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Model Memory Allocation ==="
+      echo "$LOGS" | grep -E '(model weights|kv cache|compute graph|total memory).*device=' | tail -8
+
 criteria: |
   The gemma3:4b model should be available for inference.
 
@@ -32,8 +110,11 @@ criteria: |
   - Model is either already present or successfully downloaded
   - "ollama list" shows gemma3:4b in the output
   - No download errors
-  - Warmup step loads model into GPU memory (may take up to 3 minutes on Tesla K80)
-  - Warmup returns a response from the model
+  - Logs show "offloaded X/Y layers to GPU"
+  - Logs show "llama runner started"
+  - Logs show model weights on CUDA device (not CPU only)
+  - NO CUBLAS_STATUS_ errors during model load
+  - NO out of memory errors
 
   Accept if model already exists (skip download).
   Model size is ~3GB, download may take time.
diff --git a/tests/testcases/inference/TC-INFERENCE-002.yml b/tests/testcases/inference/TC-INFERENCE-002.yml
index a4d783b0..52d587a7 100644
--- a/tests/testcases/inference/TC-INFERENCE-002.yml
+++ b/tests/testcases/inference/TC-INFERENCE-002.yml
@@ -15,6 +15,66 @@ steps:
   - name: Check GPU memory usage
     command: docker exec ollama37 nvidia-smi --query-compute-apps=pid,used_memory --format=csv 2>/dev/null || echo "No GPU processes"
 
+  - name: Check for inference errors in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== Inference Error Check ==="
+
+      # Check for CUBLAS errors (critical for K80)
+      if echo "$LOGS" | grep -qE "CUBLAS_STATUS_"; then
+        echo "CRITICAL: CUBLAS error during inference:"
+        echo "$LOGS" | grep -E "CUBLAS_STATUS_"
+        exit 1
+      fi
+
+      # Check for CUDA errors
+      if echo "$LOGS" | grep -qE "CUDA error"; then
+        echo "CRITICAL: CUDA error during inference:"
+        echo "$LOGS" | grep -E "CUDA error"
+        exit 1
+      fi
+
+      # Check for compute graph errors
+      if echo "$LOGS" | grep -qiE "(compute.*failed|graph.*error)"; then
+        echo "ERROR: Compute graph error:"
+        echo "$LOGS" | grep -iE "(compute.*failed|graph.*error)"
+        exit 1
+      fi
+
+      echo "SUCCESS: No inference errors in logs"
+
+  - name: Verify inference request in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== Inference Request Verification ==="
+
+      # Check for generate API call
+      if echo "$LOGS" | grep -qE '\[GIN\].*POST.*/api/generate'; then
+        echo "SUCCESS: Generate API request logged"
+        echo "$LOGS" | grep -E '\[GIN\].*POST.*/api/generate' | tail -2
+      else
+        echo "WARNING: Generate API request not found in recent logs"
+      fi
+
+      # Check for successful response (200 status)
+      if echo "$LOGS" | grep -qE '\[GIN\].*200.*POST'; then
+        echo "SUCCESS: Inference returned 200 status"
+      else
+        echo "WARNING: Could not verify 200 status"
+      fi
+
+  - name: Display recent CUDA activity from logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== Recent CUDA Activity ==="
+      echo "$LOGS" | grep -iE "(CUDA|cuda|device=CUDA)" | tail -5 || echo "No recent CUDA activity logged"
+
 criteria: |
   Basic inference should work on Tesla K80.
@@ -22,7 +82,9 @@ criteria: |
   - Model responds to the math question
   - Response should indicate "4" (accept variations: "4", "four", "The answer is 4", etc.)
   - GPU memory should be allocated during inference
-  - No CUDA errors in output
+  - NO CUBLAS_STATUS_ errors in logs (critical for K80 compatibility)
+  - NO CUDA error messages in logs
+  - Generate API request logged with 200 status
 
   This is AI-generated output - accept reasonable variations.
-  Focus on the model producing a coherent response.
+  Focus on the model producing a coherent response without GPU errors.
diff --git a/tests/testcases/inference/TC-INFERENCE-003.yml b/tests/testcases/inference/TC-INFERENCE-003.yml
index 810e0532..30a33faa 100644
--- a/tests/testcases/inference/TC-INFERENCE-003.yml
+++ b/tests/testcases/inference/TC-INFERENCE-003.yml
@@ -20,6 +20,65 @@ steps:
         -d '{"model":"gemma3:4b","prompt":"Count from 1 to 3","stream":true}' \
         | head -5
 
+  - name: Verify API requests logged successfully
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== API Request Log Verification ==="
+
+      # Check for generate requests with 200 status
+      GENERATE_200=$(echo "$LOGS" | grep -c '\[GIN\].*200.*POST.*/api/generate' || true)
+      echo "Generate requests with 200 status: $GENERATE_200"
+
+      if [ "$GENERATE_200" -gt 0 ]; then
+        echo "SUCCESS: API generate requests completed successfully"
+        echo "$LOGS" | grep '\[GIN\].*POST.*/api/generate' | tail -3
+      else
+        echo "WARNING: No successful generate requests found in recent logs"
+      fi
+
+  - name: Check for API errors in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== API Error Check ==="
+
+      # Check for 4xx/5xx errors on generate endpoint
+      if echo "$LOGS" | grep -qE '\[GIN\].*(4[0-9]{2}|5[0-9]{2}).*POST.*/api/generate'; then
+        echo "WARNING: API errors found on generate endpoint:"
+        echo "$LOGS" | grep -E '\[GIN\].*(4[0-9]{2}|5[0-9]{2}).*POST.*/api/generate' | tail -3
+      else
+        echo "SUCCESS: No API errors on generate endpoint"
+      fi
+
+      # Check for any CUDA errors during API processing
+      if echo "$LOGS" | grep -qE "(CUBLAS_STATUS_|CUDA error)"; then
+        echo "CRITICAL: CUDA errors during API processing:"
+        echo "$LOGS" | grep -E "(CUBLAS_STATUS_|CUDA error)"
+        exit 1
+      fi
+
+      echo "SUCCESS: No critical errors during API processing"
+
+  - name: Display API response times from logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs --since=5m 2>&1)
+
+      echo "=== API Response Times ==="
+
+      # Show recent generate request response times
+      echo "$LOGS" | grep -E '\[GIN\].*POST.*/api/generate' | tail -5 | while read line; do
+        # Extract response time from GIN log format
+        echo "$line" | grep -oE '[0-9]+(\.[0-9]+)?(ms|s|m)' | head -1
+      done
+
+      echo ""
+      echo "Recent API requests:"
+      echo "$LOGS" | grep '\[GIN\]' | tail -5
+
 criteria: |
   Ollama REST API should handle inference requests.
 
@@ -31,4 +90,9 @@ criteria: |
   - Returns multiple JSON lines
   - Each line contains partial response
 
+  Log verification:
+  - Generate API requests logged with 200 status
+  - NO 4xx/5xx errors on generate endpoint
+  - NO CUDA/CUBLAS errors during API processing
+
   Accept any valid JSON response. Content may vary.
diff --git a/tests/testcases/runtime/TC-RUNTIME-001.yml b/tests/testcases/runtime/TC-RUNTIME-001.yml
index a8040f24..3d0b389e 100644
--- a/tests/testcases/runtime/TC-RUNTIME-001.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-001.yml
@@ -20,6 +20,30 @@ steps:
   - name: Check container status
     command: cd docker && docker compose ps
 
+  - name: Capture startup logs
+    command: |
+      cd docker && docker compose logs 2>&1 | head -100
+
+  - name: Check for startup errors in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      # Check for critical errors
+      if echo "$LOGS" | grep -qE "(level=ERROR|CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"; then
+        echo "CRITICAL ERRORS FOUND IN STARTUP LOGS:"
+        echo "$LOGS" | grep -E "(level=ERROR|CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"
+        exit 1
+      fi
+
+      # Check for CPU-only fallback (GPU not detected)
+      if echo "$LOGS" | grep -q "id=cpu library=cpu"; then
+        echo "ERROR: Ollama fell back to CPU-only mode"
+        exit 1
+      fi
+
+      echo "SUCCESS: No critical errors in startup logs"
+
 criteria: |
   The ollama37 container should start successfully with GPU access.
 
@@ -27,5 +51,7 @@ criteria: |
   - Container starts without errors
   - docker compose ps shows container in "Up" state
   - No "Exited" or "Restarting" status
+  - No critical errors in logs (level=ERROR, CUBLAS_STATUS_, CUDA error)
+  - No CPU-only fallback (id=cpu library=cpu)
 
-  Accept startup warnings. Container should be running.
+  Accept startup warnings (flash attention not supported is OK). Container should be running.
diff --git a/tests/testcases/runtime/TC-RUNTIME-002.yml b/tests/testcases/runtime/TC-RUNTIME-002.yml
index 48ed1e03..fe473c92 100644
--- a/tests/testcases/runtime/TC-RUNTIME-002.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-002.yml
@@ -28,9 +28,90 @@ steps:
         ls -l /dev/nvidia-uvm
       fi
 
-  - name: Check Ollama GPU detection in logs
+  - name: Verify GPU detection in Ollama logs
    command: |
-      cd docker && docker compose logs 2>&1 | grep -E "(inference compute|GPU detected)" | tail -5
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== GPU Detection Check ==="
+
+      # Check for inference compute with CUDA library
+      if echo "$LOGS" | grep -q "inference compute.*library=CUDA"; then
+        echo "SUCCESS: GPU detected with CUDA library"
+        echo "$LOGS" | grep "inference compute" | head -2
+      else
+        echo "ERROR: GPU not detected with CUDA library"
+        exit 1
+      fi
+
+      # Check for Tesla K80 specifically
+      if echo "$LOGS" | grep -q 'description="Tesla K80"'; then
+        echo "SUCCESS: Tesla K80 GPU identified"
+      else
+        echo "WARNING: Tesla K80 not explicitly identified"
+      fi
+
+      # Check compute capability 3.7
+      if echo "$LOGS" | grep -q "compute=3.7"; then
+        echo "SUCCESS: Compute capability 3.7 detected"
+      else
+        echo "WARNING: Compute capability 3.7 not detected"
+      fi
+
+  - name: Verify CUDA initialization in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== CUDA Initialization Check ==="
+
+      # Check ggml_cuda_init
+      if echo "$LOGS" | grep -q "ggml_cuda_init: found"; then
+        echo "SUCCESS: CUDA initialized"
+        echo "$LOGS" | grep "ggml_cuda_init: found" | head -1
+      else
+        echo "ERROR: CUDA not initialized"
+        exit 1
+      fi
+
+      # Check CUDA backend loaded
+      if echo "$LOGS" | grep -q "load_backend: loaded CUDA backend"; then
+        echo "SUCCESS: CUDA backend loaded"
+        echo "$LOGS" | grep "load_backend: loaded CUDA backend" | head -1
+      else
+        echo "ERROR: CUDA backend not loaded"
+        exit 1
+      fi
+
+  - name: Check for GPU-related errors in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== GPU Error Check ==="
+
+      # Check for critical CUDA/CUBLAS errors
+      if echo "$LOGS" | grep -qE "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed|out of memory)"; then
+        echo "CRITICAL GPU ERRORS FOUND:"
+        echo "$LOGS" | grep -E "(CUBLAS_STATUS_|CUDA error|cudaMalloc failed|out of memory)"
+        exit 1
+      fi
+
+      # Check for CPU fallback (bad!)
+      if echo "$LOGS" | grep -q "id=cpu library=cpu"; then
+        echo "ERROR: Ollama fell back to CPU-only mode"
+        exit 1
+      fi
+
+      echo "SUCCESS: No GPU-related errors found"
+
+  - name: Display GPU memory status from logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== GPU Memory Status ==="
+      echo "$LOGS" | grep -E "gpu memory.*library=CUDA" | tail -4
 
 criteria: |
   Tesla K80 GPU should be detected by both nvidia-smi AND Ollama CUDA runtime.
 
@@ -39,7 +120,12 @@ criteria: |
   - nvidia-smi shows Tesla K80 GPU(s) with Driver 470.x
   - CUDA libraries are available (libcuda, libcublas, etc.)
   - /dev/nvidia-uvm device file exists (required for CUDA runtime)
-  - Ollama logs show GPU detection, NOT "id=cpu library=cpu"
+  - Ollama logs show "inference compute" with "library=CUDA"
+  - Ollama logs show "ggml_cuda_init: found N CUDA devices"
+  - Ollama logs show "load_backend: loaded CUDA backend"
+  - NO "id=cpu library=cpu" (CPU fallback)
+  - NO CUBLAS_STATUS_ errors
+  - NO CUDA error messages
 
   NOTE: If nvidia-smi works but Ollama shows only CPU, the UVM device files
   are missing. The test will auto-fix with nvidia-modprobe -u -c=0.
diff --git a/tests/testcases/runtime/TC-RUNTIME-003.yml b/tests/testcases/runtime/TC-RUNTIME-003.yml
index ac2bfc1f..3080c797 100644
--- a/tests/testcases/runtime/TC-RUNTIME-003.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-003.yml
@@ -28,6 +28,62 @@ steps:
   - name: Check Ollama version
     command: docker exec ollama37 ollama --version
 
+  - name: Verify server listening in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Server Status Check ==="
+
+      # Check server is listening
+      if echo "$LOGS" | grep -q "Listening on"; then
+        echo "SUCCESS: Server is listening"
+        echo "$LOGS" | grep "Listening on" | head -1
+      else
+        echo "ERROR: Server not listening"
+        exit 1
+      fi
+
+  - name: Check for runtime errors in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== Runtime Error Check ==="
+
+      # Check for any ERROR level logs
+      ERROR_COUNT=$(echo "$LOGS" | grep -c "level=ERROR" || true)
+      if [ "$ERROR_COUNT" -gt 0 ]; then
+        echo "WARNING: Found $ERROR_COUNT ERROR level log entries:"
+        echo "$LOGS" | grep "level=ERROR" | tail -5
+      else
+        echo "SUCCESS: No ERROR level logs found"
+      fi
+
+      # Check for panic/fatal
+      if echo "$LOGS" | grep -qiE "(panic|fatal)"; then
+        echo "CRITICAL: Panic or fatal error detected:"
+        echo "$LOGS" | grep -iE "(panic|fatal)"
+        exit 1
+      fi
+
+      echo "SUCCESS: No critical runtime errors"
+
+  - name: Verify API request handling in logs
+    command: |
+      cd docker
+      LOGS=$(docker compose logs 2>&1)
+
+      echo "=== API Request Logs ==="
+
+      # Check that API requests are being logged (GIN framework)
+      if echo "$LOGS" | grep -q '\[GIN\].*200.*GET.*"/api/tags"'; then
+        echo "SUCCESS: API requests are being handled"
+        echo "$LOGS" | grep '\[GIN\].*"/api/tags"' | tail -3
+      else
+        echo "WARNING: No API request logs found (might be first request)"
+      fi
+
 criteria: |
   Ollama server should be healthy and API responsive.
@@ -35,5 +91,8 @@ criteria: |
   - Container health status becomes "healthy"
   - /api/tags endpoint returns JSON response (even if empty models)
   - ollama --version shows version information
+  - Logs show "Listening on" message
+  - No panic or fatal errors in logs
+  - API requests logged with 200 status codes
 
   Accept any valid JSON response from API. Version format may vary.
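
Usage note (not part of the patch): a minimal sketch of how the new workflow_dispatch inputs could be exercised from the GitHub CLI. It assumes a recent `gh` release authenticated against this repository and uses the workflow file names above as workflow IDs; the input names and values come from the workflow_dispatch blocks in this diff.

  # Trigger the full pipeline with the dual judge on the default branch
  gh workflow run full-pipeline.yml -f judge_mode=dual -f judge_model=gemma3:4b -f skip_llm_judge_stage=true

  # Re-run only the build verification suite with the simple (non-LLM) judge
  gh workflow run build.yml -f judge_mode=simple

  # Runtime suite with the single-LLM judge and the default model
  gh workflow run runtime.yml -f judge_mode=llm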
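
The judge_mode-to-flag mapping is inlined in five run steps above. For clarity, the same mapping restated as a standalone shell sketch; the `judge_flags` helper is hypothetical and not part of this patch, it only mirrors the logic the workflow steps already implement.

  # Map a judge mode to test-runner flags, mirroring the inline workflow logic:
  #   simple / empty -> --no-llm
  #   dual           -> --dual-judge --judge-model <model>
  #   anything else  -> --judge-model <model>   (treated as llm mode)
  judge_flags() {
    local mode="${1:-simple}"
    local model="${2:-gemma3:4b}"
    case "$mode" in
      simple|"") echo "--no-llm" ;;
      dual)      echo "--dual-judge --judge-model $model" ;;
      *)         echo "--judge-model $model" ;;
    esac
  }

  # Example: judge_flags dual   ->   --dual-judge --judge-model gemma3:4b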