diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3901711f..4bcf2739 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,33 +1,33 @@ name: Build Verification on: - workflow_dispatch: # Manual trigger + workflow_dispatch: # Manual trigger inputs: judge_mode: - description: 'Test judge mode' + description: "Test judge mode" required: false - default: 'simple' + default: "dual" type: choice options: - - 'simple' - - 'llm' - - 'dual' + - "simple" + - "llm" + - "dual" judge_model: - description: 'LLM model for judging (if llm/dual mode)' + description: "LLM model for judging (if llm/dual mode)" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string - workflow_call: # Called by other workflows + workflow_call: # Called by other workflows inputs: judge_mode: - description: 'Test judge mode (simple, llm, dual)' + description: "Test judge mode (simple, llm, dual)" required: false - default: 'simple' + default: "dual" type: string judge_model: - description: 'LLM model for judging' + description: "LLM model for judging" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string outputs: result: @@ -51,7 +51,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -63,16 +63,16 @@ jobs: # Build judge flags based on input JUDGE_FLAGS="" - if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then + if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" - elif [ "${{ inputs.judge_mode }}" = "dual" ]; then - JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + elif [ "${{ inputs.judge_mode }}" = "llm" ]; then + JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else - # llm mode - JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + # dual mode (default) + JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi - echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" + echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" # Progress goes to stderr (visible), JSON results go to file diff --git a/.github/workflows/full-pipeline.yml b/.github/workflows/full-pipeline.yml index 4de74db9..f21e2ae9 100644 --- a/.github/workflows/full-pipeline.yml +++ b/.github/workflows/full-pipeline.yml @@ -1,30 +1,30 @@ name: Full Pipeline on: - workflow_dispatch: # Manual trigger + workflow_dispatch: # Manual trigger inputs: judge_mode: - description: 'Test judge mode' + description: "Test judge mode" required: false - default: 'simple' + default: "dual" type: choice options: - - 'simple' - - 'llm' - - 'dual' + - "simple" + - "llm" + - "dual" judge_model: - description: 'LLM model for judging (if llm/dual mode)' + description: "LLM model for judging (if llm/dual mode)" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string skip_llm_judge_stage: - description: 'Skip separate LLM judge evaluation stage' + description: "Skip separate LLM judge evaluation stage" required: false - default: 'true' + default: "true" type: choice options: - - 'true' - - 'false' + - "true" + - "false" env: OLLAMA_HOST: http://localhost:11434 @@ -34,8 +34,8 @@ jobs: name: Build Verification uses: ./.github/workflows/build.yml with: - judge_mode: ${{ inputs.judge_mode || 'simple' }} - judge_model: ${{ inputs.judge_model || 'gemma3:4b' }} + judge_mode: ${{ inputs.judge_mode || 'dual' }} + judge_model: ${{ inputs.judge_model || 'gemma3:12b' }} start-container: name: Start Container @@ -70,7 +70,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -82,16 +82,16 @@ jobs: # Build judge flags based on input JUDGE_FLAGS="" - if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then + if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" - elif [ "${{ inputs.judge_mode }}" = "dual" ]; then - JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + elif [ "${{ inputs.judge_mode }}" = "llm" ]; then + JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else - # llm mode - JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + # dual mode (default) + JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi - echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" + echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true @@ -127,7 +127,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -139,16 +139,16 @@ jobs: # Build judge flags based on input JUDGE_FLAGS="" - if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then + if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" - elif [ "${{ inputs.judge_mode }}" = "dual" ]; then - JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + elif [ "${{ inputs.judge_mode }}" = "llm" ]; then + JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else - # llm mode - JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + # dual mode (default) + JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi - echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" + echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true @@ -185,7 +185,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -199,9 +199,9 @@ jobs: run: | cd tests echo "Running LLM judge evaluation..." - echo "Using model: ${{ inputs.judge_model || 'gemma3:4b' }}" + echo "Using model: ${{ inputs.judge_model || 'gemma3:12b' }}" - npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:4b' }} --output json > /tmp/llm-judged-results.json || true + npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:12b' }} --output json > /tmp/llm-judged-results.json || true echo "--- JSON Results ---" cat /tmp/llm-judged-results.json diff --git a/.github/workflows/inference.yml b/.github/workflows/inference.yml index 4b810806..175fe01f 100644 --- a/.github/workflows/inference.yml +++ b/.github/workflows/inference.yml @@ -1,46 +1,46 @@ name: Inference Tests on: - workflow_dispatch: # Manual trigger + workflow_dispatch: # Manual trigger inputs: use_existing_container: - description: 'Use existing running container' + description: "Use existing running container" required: false - default: 'false' + default: "false" type: choice options: - - 'true' - - 'false' + - "true" + - "false" judge_mode: - description: 'Test judge mode' + description: "Test judge mode" required: false - default: 'simple' + default: "dual" type: choice options: - - 'simple' - - 'llm' - - 'dual' + - "simple" + - "llm" + - "dual" judge_model: - description: 'LLM model for judging (if llm/dual mode)' + description: "LLM model for judging (if llm/dual mode)" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string - workflow_call: # Called by other workflows + workflow_call: # Called by other workflows inputs: use_existing_container: - description: 'Container is already running' + description: "Container is already running" required: false default: false type: boolean judge_mode: - description: 'Test judge mode (simple, llm, dual)' + description: "Test judge mode (simple, llm, dual)" required: false - default: 'simple' + default: "dual" type: string judge_model: - description: 'LLM model for judging' + description: "LLM model for judging" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string outputs: result: @@ -64,7 +64,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -84,16 +84,16 @@ jobs: # Build judge flags based on input JUDGE_FLAGS="" - if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then + if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" - elif [ "${{ inputs.judge_mode }}" = "dual" ]; then - JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + elif [ "${{ inputs.judge_mode }}" = "llm" ]; then + JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else - # llm mode - JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + # dual mode (default) + JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi - echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" + echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" # Progress goes to stderr (visible), JSON results go to file diff --git a/.github/workflows/runtime.yml b/.github/workflows/runtime.yml index e239415d..8b0a0bb5 100644 --- a/.github/workflows/runtime.yml +++ b/.github/workflows/runtime.yml @@ -1,46 +1,46 @@ name: Runtime Tests on: - workflow_dispatch: # Manual trigger + workflow_dispatch: # Manual trigger inputs: keep_container: - description: 'Keep container running after tests' + description: "Keep container running after tests" required: false - default: 'false' + default: "false" type: choice options: - - 'true' - - 'false' + - "true" + - "false" judge_mode: - description: 'Test judge mode' + description: "Test judge mode" required: false - default: 'simple' + default: "dual" type: choice options: - - 'simple' - - 'llm' - - 'dual' + - "simple" + - "llm" + - "dual" judge_model: - description: 'LLM model for judging (if llm/dual mode)' + description: "LLM model for judging (if llm/dual mode)" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string - workflow_call: # Called by other workflows + workflow_call: # Called by other workflows inputs: keep_container: - description: 'Keep container running for subsequent jobs' + description: "Keep container running for subsequent jobs" required: false default: false type: boolean judge_mode: - description: 'Test judge mode (simple, llm, dual)' + description: "Test judge mode (simple, llm, dual)" required: false - default: 'simple' + default: "dual" type: string judge_model: - description: 'LLM model for judging' + description: "LLM model for judging" required: false - default: 'gemma3:4b' + default: "gemma3:12b" type: string outputs: result: @@ -61,7 +61,7 @@ jobs: - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' + node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci @@ -80,16 +80,16 @@ jobs: # Build judge flags based on input JUDGE_FLAGS="" - if [ "${{ inputs.judge_mode }}" = "simple" ] || [ -z "${{ inputs.judge_mode }}" ]; then + if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" - elif [ "${{ inputs.judge_mode }}" = "dual" ]; then - JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + elif [ "${{ inputs.judge_mode }}" = "llm" ]; then + JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else - # llm mode - JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:4b' }}" + # dual mode (default) + JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi - echo "Judge mode: ${{ inputs.judge_mode || 'simple' }}" + echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" # Progress goes to stderr (visible), JSON results go to file