Add comprehensive Ollama log checking and configurable LLM judge mode

Test case enhancements:
- TC-RUNTIME-001: Add startup log error checking (CUDA, CUBLAS, CPU fallback)
- TC-RUNTIME-002: Add GPU detection verification, CUDA init checks, error detection
- TC-RUNTIME-003: Add server listening verification, runtime error checks
- TC-INFERENCE-001: Add model loading logs, layer offload verification
- TC-INFERENCE-002: Add inference error checking (CUBLAS/CUDA errors)
- TC-INFERENCE-003: Add API request log verification, response time display

Workflow enhancements:
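- Add judge_mode input (simple/llm/dual, default: simple) to all workflows
- Add judge_model input to specify the LLM model for judging (default: gemma3:4b)
- Full Pipeline: rename the skip_llm_judge input to skip_llm_judge_stage and change its default from 'false' to 'true', so the separate LLM judge stage is now skipped unless explicitly enabled
- Configurable via the GitHub Actions UI without code changes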

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Shang Chieh Tseng
2025-12-16 23:27:57 +08:00
parent 143e6fa8e4
commit 1a185f7926
10 changed files with 564 additions and 18 deletions

View File

@@ -2,7 +2,33 @@ name: Build Verification
on:
workflow_dispatch: # Manual trigger
inputs:
judge_mode:
description: 'Test judge mode'
required: false
default: 'simple'
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
required: false
default: 'gemma3:4b'
type: string
workflow_call: # Called by other workflows
inputs:
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
required: false
default: 'simple'
type: string
judge_model:
description: 'LLM model for judging'
required: false
default: 'gemma3:4b'
type: string
outputs:
result:
description: "Build test result"
@@ -31,8 +57,23 @@ jobs:
id: build-tests
run: |
cd tests
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file
npm run --silent dev -- run --suite build --no-llm --output json > /tmp/build-results.json || true
npm run --silent dev -- run --suite build $JUDGE_FLAGS --output json > /tmp/build-results.json || true
echo "--- JSON Results ---"
cat /tmp/build-results.json

View File

@@ -3,10 +3,24 @@ name: Full Pipeline
on:
workflow_dispatch: # Manual trigger
inputs:
skip_llm_judge:
description: 'Skip LLM judge evaluation'
judge_mode:
description: 'Test judge mode'
required: false
default: 'false'
default: 'simple'
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
required: false
default: 'gemma3:4b'
type: string
skip_llm_judge_stage:
description: 'Skip separate LLM judge evaluation stage'
required: false
default: 'true'
type: choice
options:
- 'true'
@@ -19,6 +33,9 @@ jobs:
build:
name: Build Verification
uses: ./.github/workflows/build.yml
with:
judge_mode: ${{ inputs.judge_mode || 'simple' }}
judge_model: ${{ inputs.judge_model || 'gemma3:4b' }}
start-container:
name: Start Container
@@ -62,7 +79,22 @@ jobs:
id: runtime-tests
run: |
cd tests
npm run --silent dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json || true
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
echo "--- JSON Results ---"
cat /tmp/runtime-results.json
@@ -104,7 +136,22 @@ jobs:
id: inference-tests
run: |
cd tests
npm run --silent dev -- run --suite inference --no-llm --output json > /tmp/inference-results.json || true
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
echo "--- JSON Results ---"
cat /tmp/inference-results.json
@@ -129,7 +176,7 @@ jobs:
name: LLM Judge Evaluation
runs-on: self-hosted
needs: [build, runtime, inference]
if: ${{ inputs.skip_llm_judge != 'true' }}
if: ${{ inputs.skip_llm_judge_stage != 'true' }}
steps:
- name: Checkout
@@ -152,7 +199,9 @@ jobs:
run: |
cd tests
echo "Running LLM judge evaluation..."
npm run --silent dev -- run --output json > /tmp/llm-judged-results.json || true
echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}"
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true
echo "--- JSON Results ---"
cat /tmp/llm-judged-results.json

View File

@@ -11,6 +11,20 @@ on:
options:
- 'true'
- 'false'
judge_mode:
description: 'Test judge mode'
required: false
default: 'simple'
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
required: false
default: 'gemma3:4b'
type: string
workflow_call: # Called by other workflows
inputs:
use_existing_container:
@@ -18,6 +32,16 @@ on:
required: false
default: false
type: boolean
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
required: false
default: 'simple'
type: string
judge_model:
description: 'LLM model for judging'
required: false
default: 'gemma3:4b'
type: string
outputs:
result:
description: "Inference test result"
@@ -57,8 +81,23 @@ jobs:
id: inference-tests
run: |
cd tests
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file
npm run --silent dev -- run --suite inference --no-llm --output json > /tmp/inference-results.json || true
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
echo "--- JSON Results ---"
cat /tmp/inference-results.json

View File

@@ -11,6 +11,20 @@ on:
options:
- 'true'
- 'false'
judge_mode:
description: 'Test judge mode'
required: false
default: 'simple'
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
required: false
default: 'gemma3:4b'
type: string
workflow_call: # Called by other workflows
inputs:
keep_container:
@@ -18,6 +32,16 @@ on:
required: false
default: false
type: boolean
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
required: false
default: 'simple'
type: string
judge_model:
description: 'LLM model for judging'
required: false
default: 'gemma3:4b'
type: string
outputs:
result:
description: "Runtime test result"
@@ -53,8 +77,23 @@ jobs:
id: runtime-tests
run: |
cd tests
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file
npm run --silent dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json || true
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
echo "--- JSON Results ---"
cat /tmp/runtime-results.json