---
name: Full Pipeline

on:
  workflow_dispatch: # Manual trigger
    inputs:
      judge_mode:
        description: "Test judge mode"
        required: false
        default: "dual"
        type: choice
        options:
          - "simple"
          - "llm"
          - "dual"
      judge_model:
        description: "LLM model for judging (if llm/dual mode)"
        required: false
        default: "gemma3:12b"
        type: string
      skip_llm_judge_stage:
        description: "Skip separate LLM judge evaluation stage"
        required: false
        default: "true"
        type: choice
        options:
          - "true"
          - "false"

env:
  OLLAMA_HOST: http://localhost:11434

jobs:
  # Delegates build verification to the reusable build workflow,
  # forwarding the judge configuration (with the same defaults used below).
  build:
    name: Build Verification
    uses: ./.github/workflows/build.yml
    with:
      judge_mode: ${{ inputs.judge_mode || 'dual' }}
      judge_model: ${{ inputs.judge_model || 'gemma3:12b' }}

  # Brings up the Ollama container on the self-hosted runner for the
  # test stages that follow.
  start-container:
    name: Start Container
    runs-on: self-hosted
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Start container
        run: |
          cd docker
          docker compose down 2>/dev/null || true
          docker compose up -d
          echo "Waiting for container to be ready..."
          sleep 10

      - name: Verify container health
        run: |
          docker ps
          # Poll the Ollama API instead of a single one-shot check.
          # Stays best-effort (never fails the job) to preserve the
          # original "not ready yet, continuing..." behavior.
          for attempt in $(seq 1 12); do
            if curl -sf "http://localhost:11434/api/tags" > /dev/null; then
              echo "Ollama is ready"
              exit 0
            fi
            echo "Attempt $attempt: Ollama not ready yet, retrying in 5s..."
            sleep 5
          done
          echo "Ollama not ready yet, continuing..."
runtime: name: Runtime Tests runs-on: self-hosted needs: start-container steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci - name: Run runtime tests id: runtime-tests run: | cd tests # Build judge flags based on input JUDGE_FLAGS="" if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" elif [ "${{ inputs.judge_mode }}" = "llm" ]; then JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else # dual mode (default) JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" npm run --silent dev -- run --suite runtime $JUDGE_FLAGS --output json > /tmp/runtime-results.json || true echo "--- JSON Results ---" cat /tmp/runtime-results.json - name: Check test results run: | FAILED=$(jq '.summary.failed' /tmp/runtime-results.json) echo "Failed tests: $FAILED" if [ "$FAILED" -gt 0 ]; then echo "::error::$FAILED runtime test(s) failed" exit 1 fi - name: Upload runtime results uses: actions/upload-artifact@v4 if: always() with: name: runtime-test-results path: /tmp/runtime-results.json inference: name: Inference Tests runs-on: self-hosted needs: runtime steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci - name: Run inference tests id: inference-tests run: | cd tests # Build judge flags based on input JUDGE_FLAGS="" if [ "${{ inputs.judge_mode }}" = "simple" ]; then JUDGE_FLAGS="--no-llm" elif [ "${{ inputs.judge_mode }}" = "llm" ]; then JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" else # dual mode (default) JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}" fi echo "Judge mode: ${{ inputs.judge_mode 
|| 'dual' }}" echo "Judge flags: $JUDGE_FLAGS" npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true echo "--- JSON Results ---" cat /tmp/inference-results.json - name: Check test results run: | FAILED=$(jq '.summary.failed' /tmp/inference-results.json) echo "Failed tests: $FAILED" if [ "$FAILED" -gt 0 ]; then echo "::error::$FAILED inference test(s) failed" exit 1 fi - name: Unload test model from VRAM if: always() run: | echo "Unloading gemma3:4b from VRAM..." curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true echo "Model unloaded" - name: Upload inference results uses: actions/upload-artifact@v4 if: always() with: name: inference-test-results path: /tmp/inference-results.json llm-judge: name: LLM Judge Evaluation runs-on: self-hosted needs: [build, runtime, inference] if: ${{ inputs.skip_llm_judge_stage != 'true' }} steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Install test runner dependencies run: cd tests && npm ci - name: Download all test results uses: actions/download-artifact@v4 with: path: /tmp/results - name: Run LLM judge on all results run: | cd tests echo "Running LLM judge evaluation..." 
echo "Using model: ${{ inputs.judge_model || 'gemma3:12b' }}" npm run --silent dev -- run --judge-model ${{ inputs.judge_model || 'gemma3:12b' }} --output json > /tmp/llm-judged-results.json || true echo "--- JSON Results ---" cat /tmp/llm-judged-results.json - name: Check test results run: | FAILED=$(jq '.summary.failed' /tmp/llm-judged-results.json) echo "Failed tests: $FAILED" if [ "$FAILED" -gt 0 ]; then echo "::error::$FAILED test(s) failed LLM evaluation" exit 1 fi - name: Upload final results uses: actions/upload-artifact@v4 if: always() with: name: llm-judged-results path: /tmp/llm-judged-results.json cleanup: name: Cleanup & Summary runs-on: self-hosted needs: [build, runtime, inference, llm-judge] if: always() steps: - name: Checkout uses: actions/checkout@v4 - name: Stop Container run: | cd docker docker compose down || true echo "Container stopped" - name: Summary run: | echo "## Full Pipeline Summary" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "| Stage | Status |" >> $GITHUB_STEP_SUMMARY echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY echo "| Build Verification | ${{ needs.build.result }} |" >> $GITHUB_STEP_SUMMARY echo "| Runtime Tests | ${{ needs.runtime.result }} |" >> $GITHUB_STEP_SUMMARY echo "| Inference Tests | ${{ needs.inference.result }} |" >> $GITHUB_STEP_SUMMARY echo "| LLM Judge | ${{ needs.llm-judge.result }} |" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "Commit: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY