Add comprehensive Ollama log checking and configurable LLM judge mode

Test case enhancements:
- TC-RUNTIME-001: Add startup log error checking (CUDA, CUBLAS, CPU fallback)
- TC-RUNTIME-002: Add GPU detection verification, CUDA init checks, error detection
- TC-RUNTIME-003: Add server listening verification, runtime error checks
- TC-INFERENCE-001: Add model loading logs, layer offload verification
- TC-INFERENCE-002: Add inference error checking (CUBLAS/CUDA errors)
- TC-INFERENCE-003: Add API request log verification, response time display

Workflow enhancements:
- Add judge_mode input (simple/llm/dual) to all workflows
- Add judge_model input to specify LLM model for judging
- Configurable via GitHub Actions UI without code changes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 21:26:59 +00:00 · 2025-12-16 23:27:57 +08:00
parent 143e6fa8e4
commit 1a185f7926
10 changed files with 564 additions and 18 deletions
--- a/.github/workflows/runtime.yml
+++ b/.github/workflows/runtime.yml
@@ -11,6 +11,20 @@ on:
        options:
          - 'true'
          - 'false'
+      judge_mode:
+        type: choice
+        description: "Test judge mode"
+        options:
+          - simple  # run step passes --no-llm
+          - llm     # run step passes --judge-model only
+          - dual    # run step passes --dual-judge plus --judge-model
+        default: simple
+        required: false
+      judge_model:
+        type: string
+        description: "LLM model for judging (if llm/dual mode)"
+        default: "gemma3:4b"  # kept quoted: the value contains a colon
+        required: false
  workflow_call:      # Called by other workflows
    inputs:
      keep_container:
@@ -18,6 +32,16 @@ on:
        required: false
        default: false
        type: boolean
+      judge_mode:
+        type: string  # free-form: the run step treats any value other than simple/dual as llm
+        description: "Test judge mode (simple, llm, dual)"
+        default: simple
+        required: false
+      judge_model:
+        type: string
+        description: "LLM model for judging"
+        default: "gemma3:4b"  # kept quoted: the value contains a colon
+        required: false
    outputs:
      result:
        description: "Runtime test result"
@@ -53,8 +77,23 @@ jobs:
        id: runtime-tests
+        env:
+          # Inputs reach the script as env vars so their text is never parsed as shell code.
+          JUDGE_MODE: ${{ inputs.judge_mode || 'simple' }}
+          JUDGE_MODEL: ${{ inputs.judge_model || 'gemma3:4b' }}
        run: |
          cd tests
+
+          # Build judge flags from the mode; any value other than simple/dual is treated as llm.
+          case "$JUDGE_MODE" in
+            simple) JUDGE_FLAGS="--no-llm" ;;
+            dual)   JUDGE_FLAGS="--dual-judge --judge-model $JUDGE_MODEL" ;;
+            *)      JUDGE_FLAGS="--judge-model $JUDGE_MODEL" ;;
+          esac
+
+          echo "Judge mode: $JUDGE_MODE"
+          echo "Judge flags: $JUDGE_FLAGS"
+          # JUDGE_FLAGS is left unquoted below on purpose so it splits into separate arguments.
          # Progress goes to stderr (visible), JSON results go to file
-          npm run --silent dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json || true
+          npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true

          echo "--- JSON Results ---"
          cat /tmp/runtime-results.json