# Full pipeline: build -> start Ollama container -> runtime tests -> inference
# tests -> optional LLM-judge pass -> cleanup/summary. Manually triggered only.
name: Full Pipeline

on:
  workflow_dispatch:  # Manual trigger
    inputs:
      judge_mode:
        description: 'Test judge mode'
        required: false
        default: 'simple'
        type: choice
        options:
          - 'simple'
          - 'llm'
          - 'dual'
      judge_model:
        description: 'LLM model for judging (if llm/dual mode)'
        required: false
        default: 'gemma3:4b'
        type: string
      skip_llm_judge_stage:
        description: 'Skip separate LLM judge evaluation stage'
        required: false
        default: 'true'
        type: choice
        options:
          - 'true'
          - 'false'

env:
  # Ollama endpoint used by the test runner and the health check below.
  OLLAMA_HOST: http://localhost:11434

jobs:
  # Delegates build verification to the reusable build workflow.
  build:
    name: Build Verification
    uses: ./.github/workflows/build.yml
    with:
      judge_mode: ${{ inputs.judge_mode || 'simple' }}
      judge_model: ${{ inputs.judge_model || 'gemma3:4b' }}

  # Brings up the docker-compose stack that hosts Ollama for the test jobs.
  start-container:
    name: Start Container
    runs-on: self-hosted
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Start container
        run: |
          cd docker
          docker compose down 2>/dev/null || true
          docker compose up -d
          echo "Waiting for container to be ready..."
          sleep 10

      - name: Verify container health
        # Best-effort check: the tests themselves will fail loudly if Ollama
        # never becomes reachable, so we only warn here.
        run: |
          docker ps
          curl -s http://localhost:11434/api/tags || echo "Ollama not ready yet, continuing..."

  runtime:
    name: Runtime Tests
    runs-on: self-hosted
    needs: start-container
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install test runner dependencies
        run: cd tests && npm ci

      - name: Run runtime tests
        id: runtime-tests
        # Inputs are passed through env rather than interpolated inline with
        # ${{ }} inside the script, so a crafted dispatch input cannot inject
        # shell commands. The || fallbacks keep the old empty-input behavior.
        env:
          JUDGE_MODE: ${{ inputs.judge_mode || 'simple' }}
          JUDGE_MODEL: ${{ inputs.judge_model || 'gemma3:4b' }}
        run: |
          cd tests
          # Build judge flags based on input
          JUDGE_FLAGS=""
          if [ "$JUDGE_MODE" = "simple" ]; then
            JUDGE_FLAGS="--no-llm"
          elif [ "$JUDGE_MODE" = "dual" ]; then
            JUDGE_FLAGS="--dual-judge --judge-model $JUDGE_MODEL"
          else
            # llm mode
            JUDGE_FLAGS="--judge-model $JUDGE_MODEL"
          fi
          echo "Judge mode: $JUDGE_MODE"
          echo "Judge flags: $JUDGE_FLAGS"
          # || true: failures are detected from the JSON summary in the next
          # step so the raw results always get printed and uploaded.
          npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true
          echo "--- JSON Results ---"
          cat /tmp/runtime-results.json

      - name: Check test results
        run: |
          FAILED=$(jq '.summary.failed' /tmp/runtime-results.json)
          echo "Failed tests: $FAILED"
          if [ "$FAILED" -gt 0 ]; then
            echo "::error::$FAILED runtime test(s) failed"
            exit 1
          fi

      - name: Upload runtime results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: runtime-test-results
          path: /tmp/runtime-results.json

  inference:
    name: Inference Tests
    runs-on: self-hosted
    needs: runtime
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install test runner dependencies
        run: cd tests && npm ci

      - name: Run inference tests
        id: inference-tests
        # Same env-based input passing as the runtime job (injection-safe).
        env:
          JUDGE_MODE: ${{ inputs.judge_mode || 'simple' }}
          JUDGE_MODEL: ${{ inputs.judge_model || 'gemma3:4b' }}
        run: |
          cd tests
          # Build judge flags based on input
          JUDGE_FLAGS=""
          if [ "$JUDGE_MODE" = "simple" ]; then
            JUDGE_FLAGS="--no-llm"
          elif [ "$JUDGE_MODE" = "dual" ]; then
            JUDGE_FLAGS="--dual-judge --judge-model $JUDGE_MODEL"
          else
            # llm mode
            JUDGE_FLAGS="--judge-model $JUDGE_MODEL"
          fi
          echo "Judge mode: $JUDGE_MODE"
          echo "Judge flags: $JUDGE_FLAGS"
          npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
          echo "--- JSON Results ---"
          cat /tmp/inference-results.json

      - name: Check test results
        run: |
          FAILED=$(jq '.summary.failed' /tmp/inference-results.json)
          echo "Failed tests: $FAILED"
          if [ "$FAILED" -gt 0 ]; then
            echo "::error::$FAILED inference test(s) failed"
            exit 1
          fi

      - name: Upload inference results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: inference-test-results
          path: /tmp/inference-results.json

  # Optional second-pass evaluation of all results with an LLM judge.
  # Skipped by default (skip_llm_judge_stage defaults to 'true').
  llm-judge:
    name: LLM Judge Evaluation
    runs-on: self-hosted
    needs: [build, runtime, inference]
    if: ${{ inputs.skip_llm_judge_stage != 'true' }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install test runner dependencies
        run: cd tests && npm ci

      - name: Download all test results
        uses: actions/download-artifact@v4
        with:
          path: /tmp/results

      - name: Run LLM judge on all results
        # Env-based input passing; the model name is also quoted in the shell
        # so values containing spaces cannot split into extra arguments.
        env:
          JUDGE_MODEL: ${{ inputs.judge_model || 'gemma3:4b' }}
        run: |
          cd tests
          echo "Running LLM judge evaluation..."
          echo "Using model: $JUDGE_MODEL"
          npm run --silent dev -- run --judge-model "$JUDGE_MODEL" --output json > /tmp/llm-judged-results.json || true
          echo "--- JSON Results ---"
          cat /tmp/llm-judged-results.json

      - name: Check test results
        run: |
          FAILED=$(jq '.summary.failed' /tmp/llm-judged-results.json)
          echo "Failed tests: $FAILED"
          if [ "$FAILED" -gt 0 ]; then
            echo "::error::$FAILED test(s) failed LLM evaluation"
            exit 1
          fi

      - name: Upload final results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: llm-judged-results
          path: /tmp/llm-judged-results.json

  # Always runs (even on failure/skip upstream) to tear down the container
  # and publish a per-stage status table to the run summary.
  cleanup:
    name: Cleanup & Summary
    runs-on: self-hosted
    needs: [build, runtime, inference, llm-judge]
    if: always()
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Stop Container
        run: |
          cd docker
          docker compose down || true
          echo "Container stopped"

      - name: Summary
        run: |
          echo "## Full Pipeline Summary" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "| Stage | Status |" >> $GITHUB_STEP_SUMMARY
          echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY
          echo "| Build Verification | ${{ needs.build.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Runtime Tests | ${{ needs.runtime.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| Inference Tests | ${{ needs.inference.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "| LLM Judge | ${{ needs.llm-judge.result }} |" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "Commit: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY