mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-20 12:47:00 +00:00
Change workflow defaults: judge_mode=dual, judge_model=gemma3:12b
This commit is contained in:
40
.github/workflows/build.yml
vendored
40
.github/workflows/build.yml
vendored
@@ -1,33 +1,33 @@
|
|||||||
name: Build Verification
|
name: Build Verification
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
inputs:
|
inputs:
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode'
|
description: "Test judge mode"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'simple'
|
- "simple"
|
||||||
- 'llm'
|
- "llm"
|
||||||
- 'dual'
|
- "dual"
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging (if llm/dual mode)'
|
description: "LLM model for judging (if llm/dual mode)"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
workflow_call: # Called by other workflows
|
workflow_call: # Called by other workflows
|
||||||
inputs:
|
inputs:
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode (simple, llm, dual)'
|
description: "Test judge mode (simple, llm, dual)"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: string
|
type: string
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging'
|
description: "LLM model for judging"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
outputs:
|
outputs:
|
||||||
result:
|
result:
|
||||||
@@ -51,7 +51,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -63,16 +63,16 @@ jobs:
|
|||||||
|
|
||||||
# Build judge flags based on input
|
# Build judge flags based on input
|
||||||
JUDGE_FLAGS=""
|
JUDGE_FLAGS=""
|
||||||
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
|
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
|
||||||
JUDGE_FLAGS="--no-llm"
|
JUDGE_FLAGS="--no-llm"
|
||||||
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
|
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
|
||||||
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
else
|
else
|
||||||
# llm mode
|
# dual mode (default)
|
||||||
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
|
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
|
||||||
echo "Judge flags: $JUDGE_FLAGS"
|
echo "Judge flags: $JUDGE_FLAGS"
|
||||||
|
|
||||||
# Progress goes to stderr (visible), JSON results go to file
|
# Progress goes to stderr (visible), JSON results go to file
|
||||||
|
|||||||
62
.github/workflows/full-pipeline.yml
vendored
62
.github/workflows/full-pipeline.yml
vendored
@@ -1,30 +1,30 @@
|
|||||||
name: Full Pipeline
|
name: Full Pipeline
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
inputs:
|
inputs:
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode'
|
description: "Test judge mode"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'simple'
|
- "simple"
|
||||||
- 'llm'
|
- "llm"
|
||||||
- 'dual'
|
- "dual"
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging (if llm/dual mode)'
|
description: "LLM model for judging (if llm/dual mode)"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
skip_llm_judge_stage:
|
skip_llm_judge_stage:
|
||||||
description: 'Skip separate LLM judge evaluation stage'
|
description: "Skip separate LLM judge evaluation stage"
|
||||||
required: false
|
required: false
|
||||||
default: 'true'
|
default: "true"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'true'
|
- "true"
|
||||||
- 'false'
|
- "false"
|
||||||
|
|
||||||
env:
|
env:
|
||||||
OLLAMA_HOST: http://localhost:11434
|
OLLAMA_HOST: http://localhost:11434
|
||||||
@@ -34,8 +34,8 @@ jobs:
|
|||||||
name: Build Verification
|
name: Build Verification
|
||||||
uses: ./.github/workflows/build.yml
|
uses: ./.github/workflows/build.yml
|
||||||
with:
|
with:
|
||||||
judge_mode: ${{ inputs.judge_mode || 'simple' }}
|
judge_mode: ${{ inputs.judge_mode || 'dual' }}
|
||||||
judge_model: ${{ inputs.judge_model || 'gemma3:4b' }}
|
judge_model: ${{ inputs.judge_model || 'gemma3:12b' }}
|
||||||
|
|
||||||
start-container:
|
start-container:
|
||||||
name: Start Container
|
name: Start Container
|
||||||
@@ -70,7 +70,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -82,16 +82,16 @@ jobs:
|
|||||||
|
|
||||||
# Build judge flags based on input
|
# Build judge flags based on input
|
||||||
JUDGE_FLAGS=""
|
JUDGE_FLAGS=""
|
||||||
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
|
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
|
||||||
JUDGE_FLAGS="--no-llm"
|
JUDGE_FLAGS="--no-llm"
|
||||||
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
|
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
|
||||||
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
else
|
else
|
||||||
# llm mode
|
# dual mode (default)
|
||||||
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
|
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
|
||||||
echo "Judge flags: $JUDGE_FLAGS"
|
echo "Judge flags: $JUDGE_FLAGS"
|
||||||
|
|
||||||
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
|
npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
|
||||||
@@ -127,7 +127,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -139,16 +139,16 @@ jobs:
|
|||||||
|
|
||||||
# Build judge flags based on input
|
# Build judge flags based on input
|
||||||
JUDGE_FLAGS=""
|
JUDGE_FLAGS=""
|
||||||
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
|
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
|
||||||
JUDGE_FLAGS="--no-llm"
|
JUDGE_FLAGS="--no-llm"
|
||||||
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
|
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
|
||||||
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
else
|
else
|
||||||
# llm mode
|
# dual mode (default)
|
||||||
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
|
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
|
||||||
echo "Judge flags: $JUDGE_FLAGS"
|
echo "Judge flags: $JUDGE_FLAGS"
|
||||||
|
|
||||||
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
|
npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
|
||||||
@@ -185,7 +185,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -199,9 +199,9 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
cd tests
|
cd tests
|
||||||
echo "Running LLM judge evaluation..."
|
echo "Running LLM judge evaluation..."
|
||||||
echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}"
|
echo "Using model: ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
|
|
||||||
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true
|
npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:12b' }} --output json > /tmp/llm-judged-results.json || true
|
||||||
|
|
||||||
echo "--- JSON Results ---"
|
echo "--- JSON Results ---"
|
||||||
cat /tmp/llm-judged-results.json
|
cat /tmp/llm-judged-results.json
|
||||||
|
|||||||
50
.github/workflows/inference.yml
vendored
50
.github/workflows/inference.yml
vendored
@@ -1,46 +1,46 @@
|
|||||||
name: Inference Tests
|
name: Inference Tests
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
inputs:
|
inputs:
|
||||||
use_existing_container:
|
use_existing_container:
|
||||||
description: 'Use existing running container'
|
description: "Use existing running container"
|
||||||
required: false
|
required: false
|
||||||
default: 'false'
|
default: "false"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'true'
|
- "true"
|
||||||
- 'false'
|
- "false"
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode'
|
description: "Test judge mode"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'simple'
|
- "simple"
|
||||||
- 'llm'
|
- "llm"
|
||||||
- 'dual'
|
- "dual"
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging (if llm/dual mode)'
|
description: "LLM model for judging (if llm/dual mode)"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
workflow_call: # Called by other workflows
|
workflow_call: # Called by other workflows
|
||||||
inputs:
|
inputs:
|
||||||
use_existing_container:
|
use_existing_container:
|
||||||
description: 'Container is already running'
|
description: "Container is already running"
|
||||||
required: false
|
required: false
|
||||||
default: false
|
default: false
|
||||||
type: boolean
|
type: boolean
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode (simple, llm, dual)'
|
description: "Test judge mode (simple, llm, dual)"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: string
|
type: string
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging'
|
description: "LLM model for judging"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
outputs:
|
outputs:
|
||||||
result:
|
result:
|
||||||
@@ -64,7 +64,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -84,16 +84,16 @@ jobs:
|
|||||||
|
|
||||||
# Build judge flags based on input
|
# Build judge flags based on input
|
||||||
JUDGE_FLAGS=""
|
JUDGE_FLAGS=""
|
||||||
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
|
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
|
||||||
JUDGE_FLAGS="--no-llm"
|
JUDGE_FLAGS="--no-llm"
|
||||||
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
|
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
|
||||||
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
else
|
else
|
||||||
# llm mode
|
# dual mode (default)
|
||||||
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
|
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
|
||||||
echo "Judge flags: $JUDGE_FLAGS"
|
echo "Judge flags: $JUDGE_FLAGS"
|
||||||
|
|
||||||
# Progress goes to stderr (visible), JSON results go to file
|
# Progress goes to stderr (visible), JSON results go to file
|
||||||
|
|||||||
50
.github/workflows/runtime.yml
vendored
50
.github/workflows/runtime.yml
vendored
@@ -1,46 +1,46 @@
|
|||||||
name: Runtime Tests
|
name: Runtime Tests
|
||||||
|
|
||||||
on:
|
on:
|
||||||
workflow_dispatch: # Manual trigger
|
workflow_dispatch: # Manual trigger
|
||||||
inputs:
|
inputs:
|
||||||
keep_container:
|
keep_container:
|
||||||
description: 'Keep container running after tests'
|
description: "Keep container running after tests"
|
||||||
required: false
|
required: false
|
||||||
default: 'false'
|
default: "false"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'true'
|
- "true"
|
||||||
- 'false'
|
- "false"
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode'
|
description: "Test judge mode"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: choice
|
type: choice
|
||||||
options:
|
options:
|
||||||
- 'simple'
|
- "simple"
|
||||||
- 'llm'
|
- "llm"
|
||||||
- 'dual'
|
- "dual"
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging (if llm/dual mode)'
|
description: "LLM model for judging (if llm/dual mode)"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
workflow_call: # Called by other workflows
|
workflow_call: # Called by other workflows
|
||||||
inputs:
|
inputs:
|
||||||
keep_container:
|
keep_container:
|
||||||
description: 'Keep container running for subsequent jobs'
|
description: "Keep container running for subsequent jobs"
|
||||||
required: false
|
required: false
|
||||||
default: false
|
default: false
|
||||||
type: boolean
|
type: boolean
|
||||||
judge_mode:
|
judge_mode:
|
||||||
description: 'Test judge mode (simple, llm, dual)'
|
description: "Test judge mode (simple, llm, dual)"
|
||||||
required: false
|
required: false
|
||||||
default: 'simple'
|
default: "dual"
|
||||||
type: string
|
type: string
|
||||||
judge_model:
|
judge_model:
|
||||||
description: 'LLM model for judging'
|
description: "LLM model for judging"
|
||||||
required: false
|
required: false
|
||||||
default: 'gemma3:4b'
|
default: "gemma3:12b"
|
||||||
type: string
|
type: string
|
||||||
outputs:
|
outputs:
|
||||||
result:
|
result:
|
||||||
@@ -61,7 +61,7 @@ jobs:
|
|||||||
- name: Setup Node.js
|
- name: Setup Node.js
|
||||||
uses: actions/setup-node@v4
|
uses: actions/setup-node@v4
|
||||||
with:
|
with:
|
||||||
node-version: '20'
|
node-version: "20"
|
||||||
|
|
||||||
- name: Install test runner dependencies
|
- name: Install test runner dependencies
|
||||||
run: cd tests && npm ci
|
run: cd tests && npm ci
|
||||||
@@ -80,16 +80,16 @@ jobs:
|
|||||||
|
|
||||||
# Build judge flags based on input
|
# Build judge flags based on input
|
||||||
JUDGE_FLAGS=""
|
JUDGE_FLAGS=""
|
||||||
if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then
|
if [ "${{ inputs.judge_mode }}" = "simple" ]; then
|
||||||
JUDGE_FLAGS="--no-llm"
|
JUDGE_FLAGS="--no-llm"
|
||||||
elif [ "${{ inputs.judge_mode }}" = "dual" ]; then
|
elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
|
||||||
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
else
|
else
|
||||||
# llm mode
|
# dual mode (default)
|
||||||
JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}"
|
JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}"
|
echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
|
||||||
echo "Judge flags: $JUDGE_FLAGS"
|
echo "Judge flags: $JUDGE_FLAGS"
|
||||||
|
|
||||||
# Progress goes to stderr (visible), JSON results go to file
|
# Progress goes to stderr (visible), JSON results go to file
|
||||||
|
|||||||
Reference in New Issue
Block a user