Add GitHub Actions CI/CD pipeline and test framework

- Add .github/workflows/build-test.yml for automated testing
- Add tests/ directory with TypeScript test runner
- Add docs/CICD.md documentation
- Remove .gitlab-ci.yml (migrated to GitHub Actions)
- Update .gitignore for test artifacts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-20 12:47:00 +00:00 · 2025-12-15 14:06:44 +08:00
parent 2b5aeaf86b
commit d11140c016
23 changed files with 3014 additions and 50 deletions
--- a/tests/src/judge.ts
+++ b/tests/src/judge.ts
@@ -0,0 +1,146 @@
+import axios from 'axios'
+import { TestResult, Judgment } from './types.js'
+
+/** Judges test results with an LLM served by Ollama, plus a rule-based fallback. */
+export class LLMJudge {
+  private readonly ollamaUrl: string
+  private readonly model: string
+  private readonly batchSize: number
+
+  /**
+   * @param ollamaUrl Base URL of the Ollama server; trailing slashes are stripped.
+   * @param model Name of the model that performs the judging.
+   * @param batchSize Tests judged per LLM call; values below 1 are raised to 1.
+   */
+  constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b', batchSize: number = 5) {
+    this.ollamaUrl = ollamaUrl.replace(/\/+$/, '') // avoid "//api/generate" in request URLs
+    this.model = model
+    this.batchSize = Math.max(1, Math.floor(batchSize) || 1) // a step below 1 would never advance the batch loop
+  }
+
+  /** Builds the judging prompt: one section per test with its criteria and logs (cut at 3000 chars). */
+  private buildPrompt(results: TestResult[]): string {
+    const testsSection = results.map((r, i) => {
+      return `
+### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
+
+**Criteria:**
+${r.testCase.criteria}
+
+**Execution Logs:**
+\`\`\`
+${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
+\`\`\`
+`
+    }).join('\n---\n')
+
+    return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
+
+For each test, examine:
+1. The expected criteria
+2. The actual execution logs (stdout, stderr, exit codes)
+3. Whether the output meets the criteria (use fuzzy matching for AI outputs)
+
+${testsSection}
+
+Respond with a JSON array containing one object per test:
+[
+  {"testId": "TC-XXX-001", "pass": true, "reason": "Brief explanation"},
+  {"testId": "TC-XXX-002", "pass": false, "reason": "Brief explanation"}
+]
+
+Important:
+- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
+- For build/runtime tests, check exit codes and absence of error messages
+- Be lenient with formatting differences, focus on semantic correctness
+
+Respond ONLY with the JSON array, no other text.`
+  }
+
+  /**
+   * Judges all results in batches of `batchSize` tests per LLM call. A batch whose
+   * request or parsing fails is recorded as failed tests instead of aborting the run.
+   */
+  async judgeResults(results: TestResult[]): Promise<Judgment[]> {
+    const allJudgments: Judgment[] = []
+    const batchCount = Math.ceil(results.length / this.batchSize)
+
+    for (let i = 0; i < results.length; i += this.batchSize) {
+      const batch = results.slice(i, i + this.batchSize)
+      console.log(`  Judging batch ${Math.floor(i / this.batchSize) + 1}/${batchCount}...`)
+      try {
+        allJudgments.push(...await this.judgeBatch(batch))
+      } catch (error) {
+        console.error(`  Failed to judge batch:`, error)
+        const reason = 'LLM judgment failed: ' + String(error)
+        allJudgments.push(...batch.map(r => ({ testId: r.testCase.id, pass: false, reason }))) // fail the whole batch
+      }
+    }
+    return allJudgments
+  }
+
+  /** Sends one batch to Ollama's `/api/generate` endpoint and parses the reply. */
+  private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
+    const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
+      model: this.model,
+      prompt: this.buildPrompt(results),
+      stream: false,
+      options: {
+        temperature: 0.1, // Low temperature for consistent judging
+        num_predict: 1000
+      }
+    }, { timeout: 120000 }) // 2 minute timeout
+    return this.parseJudgments(response.data?.response, results)
+  }
+
+  /**
+   * Extracts the JSON array from the raw LLM text and returns exactly one judgment
+   * per entry of `results`, in the same order. Entries that are malformed, name an
+   * unknown test id, or repeat an id are ignored; tests left without a usable entry
+   * are marked as failed.
+   * @throws Error if the text contains no JSON array or the array cannot be parsed.
+   */
+  private parseJudgments(responseText: unknown, results: TestResult[]): Judgment[] {
+    const text = typeof responseText === 'string' ? responseText : ''
+    // Greedy match: first '[' to last ']', tolerating prose or code fences around it
+    const jsonMatch = text.match(/\[[\s\S]*\]/)
+    if (!jsonMatch) {
+      throw new Error('No JSON array found in LLM response')
+    }
+
+    let parsed: unknown
+    try {
+      parsed = JSON.parse(jsonMatch[0])
+    } catch {
+      throw new Error(`Failed to parse LLM response: ${text.substring(0, 200)}`)
+    }
+
+    const byId = new Map<string, Judgment>()
+    for (const entry of Array.isArray(parsed) ? parsed : []) {
+      // Trust only well-formed entries; the first entry per test id wins
+      if (typeof entry?.testId === 'string' && typeof entry.pass === 'boolean' && !byId.has(entry.testId)) {
+        const reason = typeof entry.reason === 'string' ? entry.reason : ''
+        byId.set(entry.testId, { testId: entry.testId, pass: entry.pass, reason })
+      }
+    }
+
+    const missing = 'No judgment provided by LLM'
+    return results.map(r => byId.get(r.testCase.id) ?? { testId: r.testCase.id, pass: false, reason: missing })
+  }
+
+  /**
+   * Fallback: simple rule-based judgment (no LLM). Passes when every step exited
+   * with code 0; otherwise the reason names the failing steps.
+   */
+  simpleJudge(result: TestResult): Judgment {
+    const failedSteps = result.steps.filter(s => s.exitCode !== 0)
+    const pass = failedSteps.length === 0
+    return {
+      testId: result.testCase.id,
+      pass,
+      reason: pass
+        ? 'All steps completed with exit code 0'
+        : `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
+    }
+  }
+}