Add GitHub Actions CI/CD pipeline and test framework

- Add .github/workflows/build-test.yml for automated testing
- Add tests/ directory with TypeScript test runner
- Add docs/CICD.md documentation
- Remove .gitlab-ci.yml (migrated to GitHub Actions)
- Update .gitignore for test artifacts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Shang Chieh Tseng
2025-12-15 14:06:44 +08:00
parent 2b5aeaf86b
commit d11140c016
23 changed files with 3014 additions and 50 deletions

146
tests/src/judge.ts Normal file
View File

@@ -0,0 +1,146 @@
import axios from 'axios'
import { TestResult, Judgment } from './types.js'
/**
 * Judges test results by asking a local Ollama model to compare each test's
 * execution logs against its acceptance criteria. Tests are judged in
 * batches to cut down on LLM round-trips; a rule-based fallback
 * (`simpleJudge`) is available when no LLM is reachable.
 */
export class LLMJudge {
  private ollamaUrl: string
  private model: string
  private batchSize: number

  /**
   * @param ollamaUrl Base URL of the Ollama server.
   * @param model Model name passed to the /api/generate endpoint.
   */
  constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
    this.ollamaUrl = ollamaUrl
    this.model = model
    this.batchSize = 5 // Judge 5 tests per LLM call
  }

  /**
   * Builds the judging prompt for one batch. Each test's logs are truncated
   * to 3000 characters to keep the prompt within the model's context window.
   */
  private buildPrompt(results: TestResult[]): string {
    const testsSection = results.map((r, i) => {
      return `
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
**Criteria:**
${r.testCase.criteria}
**Execution Logs:**
\`\`\`
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
\`\`\`
`
    }).join('\n---\n')
    return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
For each test, examine:
1. The expected criteria
2. The actual execution logs (stdout, stderr, exit codes)
3. Whether the output meets the criteria (use fuzzy matching for AI outputs)
${testsSection}
Respond with a JSON array containing one object per test:
[
{"testId": "TC-XXX-001", "pass": true, "reason": "Brief explanation"},
{"testId": "TC-XXX-002", "pass": false, "reason": "Brief explanation"}
]
Important:
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
- For build/runtime tests, check exit codes and absence of error messages
- Be lenient with formatting differences, focus on semantic correctness
Respond ONLY with the JSON array, no other text.`
  }

  /**
   * Judges all results in batches of `batchSize`. A batch whose LLM call
   * fails is marked entirely as failed (with the error in the reason) so
   * the overall run can continue past a flaky judge.
   */
  async judgeResults(results: TestResult[]): Promise<Judgment[]> {
    const allJudgments: Judgment[] = []
    // Process in batches
    for (let i = 0; i < results.length; i += this.batchSize) {
      const batch = results.slice(i, i + this.batchSize)
      console.log(` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
      try {
        const judgments = await this.judgeBatch(batch)
        allJudgments.push(...judgments)
      } catch (error) {
        console.error(` Failed to judge batch:`, error)
        // Mark all tests in batch as failed
        for (const r of batch) {
          allJudgments.push({
            testId: r.testCase.id,
            pass: false,
            reason: 'LLM judgment failed: ' + String(error)
          })
        }
      }
    }
    return allJudgments
  }

  /**
   * Sends one batch to Ollama and parses its JSON judgment array.
   * Throws when the HTTP call fails, the response has no string `response`
   * field, no JSON array is present, or the array cannot be parsed.
   * Hallucinated entries (testIds not in this batch) are discarded, and
   * every test missing a judgment gets a failed placeholder.
   */
  private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
    const prompt = this.buildPrompt(results)
    const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
      model: this.model,
      prompt,
      stream: false,
      options: {
        temperature: 0.1, // Low temperature for consistent judging
        num_predict: 1000
      }
    }, {
      timeout: 120000 // 2 minute timeout
    })
    const responseText = response.data?.response
    // Fail with a clear message instead of a TypeError on a malformed reply.
    if (typeof responseText !== 'string') {
      throw new Error('No JSON array found in LLM response')
    }
    // Extract JSON from response
    const jsonMatch = responseText.match(/\[[\s\S]*\]/)
    if (!jsonMatch) {
      throw new Error('No JSON array found in LLM response')
    }
    let parsed: unknown
    try {
      parsed = JSON.parse(jsonMatch[0])
    } catch {
      // Only an actual JSON.parse failure is reported as a parse failure
      // (the original also caught validation errors here).
      throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
    }
    const resultIds = results.map(r => r.testCase.id)
    const validIds = new Set(resultIds)
    // Keep only judgments for tests that are actually in this batch and
    // normalize fields, guarding against hallucinated or malformed entries.
    const judgments: Judgment[] = (Array.isArray(parsed) ? parsed : [])
      .filter((j): j is Judgment =>
        j != null && typeof j === 'object' && validIds.has((j as Judgment).testId))
      .map(j => ({
        testId: j.testId,
        pass: j.pass === true, // strict boolean, never truthy strings
        reason: typeof j.reason === 'string' ? j.reason : ''
      }))
    // Add failed placeholders for any test the LLM skipped
    const judgedIds = new Set(judgments.map(j => j.testId))
    for (const id of resultIds) {
      if (!judgedIds.has(id)) {
        judgments.push({
          testId: id,
          pass: false,
          reason: 'No judgment provided by LLM'
        })
      }
    }
    return judgments
  }

  /**
   * Fallback: simple rule-based judgment (no LLM). A test passes when every
   * step exited with code 0.
   * NOTE(review): a result with zero steps passes vacuously — confirm that
   * is intended for empty test cases.
   */
  simpleJudge(result: TestResult): Judgment {
    const allStepsPassed = result.steps.every(s => s.exitCode === 0)
    if (allStepsPassed) {
      return {
        testId: result.testCase.id,
        pass: true,
        reason: 'All steps completed with exit code 0'
      }
    } else {
      const failedSteps = result.steps.filter(s => s.exitCode !== 0)
      return {
        testId: result.testCase.id,
        pass: false,
        reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
      }
    }
  }
}