Change workflow defaults: judge_mode=dual, judge_model=gemma3:12b

This commit is contained in:
Shang Chieh Tseng
2025-12-17 16:43:38 +08:00
parent b0c2a07190
commit 7bb050f146
4 changed files with 101 additions and 101 deletions

View File

@@ -1,33 +1,33 @@
name: Build Verification name: Build Verification
on: on:
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
inputs: inputs:
judge_mode: judge_mode:
description: 'Test judge mode' description: "Test judge mode"
required: false required: false
default: 'simple' default: "dual"
type: choice type: choice
options: options:
- 'simple' - "simple"
- 'llm' - "llm"
- 'dual' - "dual"
judge_model: judge_model:
description: 'LLM model for judging (if llm/dual mode)' description: "LLM model for judging (if llm/dual mode)"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
workflow_call: # Called by other workflows workflow_call: # Called by other workflows
inputs: inputs:
judge_mode: judge_mode:
description: 'Test judge mode (simple, llm, dual)' description: "Test judge mode (simple, llm, dual)"
required: false required: false
default: 'simple' default: "dual"
type: string type: string
judge_model: judge_model:
description: 'LLM model for judging' description: "LLM model for judging"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
outputs: outputs:
result: result:
@@ -51,7 +51,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -63,16 +63,16 @@ jobs:
# Build judge flags based on input # Build judge flags based on input
JUDGE_FLAGS="" JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm" JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else else
# llm mode # dual mode (default)
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS" echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file # Progress goes to stderr (visible), JSON results go to file

View File

@@ -1,30 +1,30 @@
name: Full Pipeline name: Full Pipeline
on: on:
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
inputs: inputs:
judge_mode: judge_mode:
description: 'Test judge mode' description: "Test judge mode"
required: false required: false
default: 'simple' default: "dual"
type: choice type: choice
options: options:
- 'simple' - "simple"
- 'llm' - "llm"
- 'dual' - "dual"
judge_model: judge_model:
description: 'LLM model for judging (if llm/dual mode)' description: "LLM model for judging (if llm/dual mode)"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
skip_llm_judge_stage: skip_llm_judge_stage:
description: 'Skip separate LLM judge evaluation stage' description: "Skip separate LLM judge evaluation stage"
required: false required: false
default: 'true' default: "true"
type: choice type: choice
options: options:
- 'true' - "true"
- 'false' - "false"
env: env:
OLLAMA_HOST: http://localhost:11434 OLLAMA_HOST: http://localhost:11434
@@ -34,8 +34,8 @@ jobs:
name: Build Verification name: Build Verification
uses: ./.github/workflows/build.yml uses: ./.github/workflows/build.yml
with: with:
judge_mode: ${{ inputs.judge_mode || 'simple' }} judge_mode: ${{ inputs.judge_mode || 'dual' }}
judge_model: ${{ inputs.judge_model || 'gemma3:4b' }} judge_model: ${{ inputs.judge_model || 'gemma3:12b' }}
start-container: start-container:
name: Start Container name: Start Container
@@ -70,7 +70,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -82,16 +82,16 @@ jobs:
# Build judge flags based on input # Build judge flags based on input
JUDGE_FLAGS="" JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm" JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else else
# llm mode # dual mode (default)
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS" echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
@@ -127,7 +127,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -139,16 +139,16 @@ jobs:
# Build judge flags based on input # Build judge flags based on input
JUDGE_FLAGS="" JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm" JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else else
# llm mode # dual mode (default)
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS" echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
@@ -185,7 +185,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -199,9 +199,9 @@ jobs:
run: | run: |
cd tests cd tests
echo "Running LLM judge evaluation..." echo "Running LLM judge evaluation..."
echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}" echo "Using model: ${{ inputs.judge_model || 'gemma3:12b' }}"
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:12b' }} --output json > /tmp/llm-judged-results.json || true
echo "--- JSON Results ---" echo "--- JSON Results ---"
cat /tmp/llm-judged-results.json cat /tmp/llm-judged-results.json

View File

@@ -1,46 +1,46 @@
name: Inference Tests name: Inference Tests
on: on:
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
inputs: inputs:
use_existing_container: use_existing_container:
description: 'Use existing running container' description: "Use existing running container"
required: false required: false
default: 'false' default: "false"
type: choice type: choice
options: options:
- 'true' - "true"
- 'false' - "false"
judge_mode: judge_mode:
description: 'Test judge mode' description: "Test judge mode"
required: false required: false
default: 'simple' default: "dual"
type: choice type: choice
options: options:
- 'simple' - "simple"
- 'llm' - "llm"
- 'dual' - "dual"
judge_model: judge_model:
description: 'LLM model for judging (if llm/dual mode)' description: "LLM model for judging (if llm/dual mode)"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
workflow_call: # Called by other workflows workflow_call: # Called by other workflows
inputs: inputs:
use_existing_container: use_existing_container:
description: 'Container is already running' description: "Container is already running"
required: false required: false
default: false default: false
type: boolean type: boolean
judge_mode: judge_mode:
description: 'Test judge mode (simple, llm, dual)' description: "Test judge mode (simple, llm, dual)"
required: false required: false
default: 'simple' default: "dual"
type: string type: string
judge_model: judge_model:
description: 'LLM model for judging' description: "LLM model for judging"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
outputs: outputs:
result: result:
@@ -64,7 +64,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -84,16 +84,16 @@ jobs:
# Build judge flags based on input # Build judge flags based on input
JUDGE_FLAGS="" JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm" JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else else
# llm mode # dual mode (default)
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS" echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file # Progress goes to stderr (visible), JSON results go to file

View File

@@ -1,46 +1,46 @@
name: Runtime Tests name: Runtime Tests
on: on:
workflow_dispatch: # Manual trigger workflow_dispatch: # Manual trigger
inputs: inputs:
keep_container: keep_container:
description: 'Keep container running after tests' description: "Keep container running after tests"
required: false required: false
default: 'false' default: "false"
type: choice type: choice
options: options:
- 'true' - "true"
- 'false' - "false"
judge_mode: judge_mode:
description: 'Test judge mode' description: "Test judge mode"
required: false required: false
default: 'simple' default: "dual"
type: choice type: choice
options: options:
- 'simple' - "simple"
- 'llm' - "llm"
- 'dual' - "dual"
judge_model: judge_model:
description: 'LLM model for judging (if llm/dual mode)' description: "LLM model for judging (if llm/dual mode)"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
workflow_call: # Called by other workflows workflow_call: # Called by other workflows
inputs: inputs:
keep_container: keep_container:
description: 'Keep container running for subsequent jobs' description: "Keep container running for subsequent jobs"
required: false required: false
default: false default: false
type: boolean type: boolean
judge_mode: judge_mode:
description: 'Test judge mode (simple, llm, dual)' description: "Test judge mode (simple, llm, dual)"
required: false required: false
default: 'simple' default: "dual"
type: string type: string
judge_model: judge_model:
description: 'LLM model for judging' description: "LLM model for judging"
required: false required: false
default: 'gemma3:4b' default: "gemma3:12b"
type: string type: string
outputs: outputs:
result: result:
@@ -61,7 +61,7 @@ jobs:
- name: Setup Node.js - name: Setup Node.js
uses: actions/setup-node@v4 uses: actions/setup-node@v4
with: with:
node-version: '20' node-version: "20"
- name: Install test runner dependencies - name: Install test runner dependencies
run: cd tests && npm ci run: cd tests && npm ci
@@ -80,16 +80,16 @@ jobs:
# Build judge flags based on input # Build judge flags based on input
JUDGE_FLAGS="" JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm" JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else else
# llm mode # dual mode (default)
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS" echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file # Progress goes to stderr (visible), JSON results go to file