Change workflow defaults: judge_mode=dual, judge_model=gemma3:12b

This commit is contained in:
Shang Chieh Tseng
2025-12-17 16:43:38 +08:00
parent b0c2a07190
commit 7bb050f146
4 changed files with 101 additions and 101 deletions

View File

@@ -1,33 +1,33 @@
name: Build Verification
on:
workflow_dispatch: # Manual trigger
workflow_dispatch: # Manual trigger
inputs:
judge_mode:
description: 'Test judge mode'
description: "Test judge mode"
required: false
default: 'simple'
default: "dual"
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
- "simple"
- "llm"
- "dual"
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
description: "LLM model for judging (if llm/dual mode)"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
workflow_call: # Called by other workflows
workflow_call: # Called by other workflows
inputs:
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
description: "Test judge mode (simple, llm, dual)"
required: false
default: 'simple'
default: "dual"
type: string
judge_model:
description: 'LLM model for judging'
description: "LLM model for judging"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
outputs:
result:
@@ -51,7 +51,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -63,16 +63,16 @@ jobs:
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
# dual mode (default)
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file

View File

@@ -1,30 +1,30 @@
name: Full Pipeline
on:
workflow_dispatch: # Manual trigger
workflow_dispatch: # Manual trigger
inputs:
judge_mode:
description: 'Test judge mode'
description: "Test judge mode"
required: false
default: 'simple'
default: "dual"
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
- "simple"
- "llm"
- "dual"
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
description: "LLM model for judging (if llm/dual mode)"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
skip_llm_judge_stage:
description: 'Skip separate LLM judge evaluation stage'
description: "Skip separate LLM judge evaluation stage"
required: false
default: 'true'
default: "true"
type: choice
options:
- 'true'
- 'false'
- "true"
- "false"
env:
OLLAMA_HOST: http://localhost:11434
@@ -34,8 +34,8 @@ jobs:
name: Build Verification
uses: ./.github/workflows/build.yml
with:
judge_mode: ${{ inputs.judge_mode || 'simple' }}
judge_model: ${{ inputs.judge_model || 'gemma3:4b' }}
judge_mode: ${{ inputs.judge_mode || 'dual' }}
judge_model: ${{ inputs.judge_model || 'gemma3:12b' }}
start-container:
name: Start Container
@@ -70,7 +70,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -82,16 +82,16 @@ jobs:
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
# dual mode (default)
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
@@ -127,7 +127,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -139,16 +139,16 @@ jobs:
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
# dual mode (default)
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS"
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
@@ -185,7 +185,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -199,9 +199,9 @@ jobs:
run: |
cd tests
echo "Running LLM judge evaluation..."
echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}"
echo "Using model: ${{ inputs.judge_model || 'gemma3:12b' }}"
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:12b' }} --output json > /tmp/llm-judged-results.json || true
echo "--- JSON Results ---"
cat /tmp/llm-judged-results.json

View File

@@ -1,46 +1,46 @@
name: Inference Tests
on:
workflow_dispatch: # Manual trigger
workflow_dispatch: # Manual trigger
inputs:
use_existing_container:
description: 'Use existing running container'
description: "Use existing running container"
required: false
default: 'false'
default: "false"
type: choice
options:
- 'true'
- 'false'
- "true"
- "false"
judge_mode:
description: 'Test judge mode'
description: "Test judge mode"
required: false
default: 'simple'
default: "dual"
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
- "simple"
- "llm"
- "dual"
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
description: "LLM model for judging (if llm/dual mode)"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
workflow_call: # Called by other workflows
workflow_call: # Called by other workflows
inputs:
use_existing_container:
description: 'Container is already running'
description: "Container is already running"
required: false
default: false
type: boolean
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
description: "Test judge mode (simple, llm, dual)"
required: false
default: 'simple'
default: "dual"
type: string
judge_model:
description: 'LLM model for judging'
description: "LLM model for judging"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
outputs:
result:
@@ -64,7 +64,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -84,16 +84,16 @@ jobs:
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
# dual mode (default)
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file

View File

@@ -1,46 +1,46 @@
name: Runtime Tests
on:
workflow_dispatch: # Manual trigger
workflow_dispatch: # Manual trigger
inputs:
keep_container:
description: 'Keep container running after tests'
description: "Keep container running after tests"
required: false
default: 'false'
default: "false"
type: choice
options:
- 'true'
- 'false'
- "true"
- "false"
judge_mode:
description: 'Test judge mode'
description: "Test judge mode"
required: false
default: 'simple'
default: "dual"
type: choice
options:
- 'simple'
- 'llm'
- 'dual'
- "simple"
- "llm"
- "dual"
judge_model:
description: 'LLM model for judging (if llm/dual mode)'
description: "LLM model for judging (if llm/dual mode)"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
workflow_call: # Called by other workflows
workflow_call: # Called by other workflows
inputs:
keep_container:
description: 'Keep container running for subsequent jobs'
description: "Keep container running for subsequent jobs"
required: false
default: false
type: boolean
judge_mode:
description: 'Test judge mode (simple, llm, dual)'
description: "Test judge mode (simple, llm, dual)"
required: false
default: 'simple'
default: "dual"
type: string
judge_model:
description: 'LLM model for judging'
description: "LLM model for judging"
required: false
default: 'gemma3:4b'
default: "gemma3:12b"
type: string
outputs:
result:
@@ -61,7 +61,7 @@ jobs:
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
node-version: "20"
- name: Install test runner dependencies
run: cd tests && npm ci
@@ -80,16 +80,16 @@ jobs:
# Build judge flags based on input
JUDGE_FLAGS=""
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
JUDGE_FLAGS="--no-llm"
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
else
# llm mode
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
# dual mode (default)
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
fi
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
echo "Judge flags: $JUDGE_FLAGS"
# Progress goes to stderr (visible), JSON results go to file