Enhance LLM judge prompt and add separate verdict display

- Add step results, timing context, and build notes to LLM prompt
- LLM now sees per-step exit codes, durations, and timeouts, plus the
  simple judge result
- Add guidance that long build times within timeout are acceptable

- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes optional simplePass/llmPass and
  simpleReason/llmReason fields (populated only in dual-judge mode)

This helps distinguish simple judge failures (at least one step exited
with a non-zero code) from LLM judge failures (semantic analysis of the
criteria and logs), making debugging easier.
This commit is contained in:
Shang Chieh Tseng
2025-12-17 15:04:05 +08:00
parent 1e99c1bb50
commit e06deff40f
4 changed files with 465 additions and 273 deletions

View File

@@ -1,216 +1,244 @@
#!/usr/bin/env node #!/usr/bin/env node
import { Command } from 'commander' import { Command } from "commander";
import { writeFileSync } from 'fs' import { writeFileSync } from "fs";
import path from 'path' import path from "path";
import { fileURLToPath } from 'url' import { fileURLToPath } from "url";
import { TestLoader } from './loader.js' import { TestLoader } from "./loader.js";
import { TestExecutor } from './executor.js' import { TestExecutor } from "./executor.js";
import { LLMJudge } from './judge.js' import { LLMJudge } from "./judge.js";
import { Reporter, TestLinkReporter } from './reporter.js' import { Reporter, TestLinkReporter } from "./reporter.js";
import { RunnerOptions } from './types.js' import { RunnerOptions, Judgment } from "./types.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url)) const __dirname = path.dirname(fileURLToPath(import.meta.url));
const defaultTestcasesDir = path.join(__dirname, '..', 'testcases') const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
// Progress output to stderr (visible in console even when stdout is redirected) // Progress output to stderr (visible in console even when stdout is redirected)
const log = (msg: string) => process.stderr.write(msg + '\n') const log = (msg: string) => process.stderr.write(msg + "\n");
const program = new Command() const program = new Command();
program program
.name('ollama37-test') .name("ollama37-test")
.description('Scalable test runner with LLM-as-judge for ollama37') .description("Scalable test runner with LLM-as-judge for ollama37")
.version('1.0.0') .version("1.0.0");
program program
.command('run') .command("run")
.description('Run test cases') .description("Run test cases")
.option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)') .option(
.option('-i, --id <id>', 'Run only specified test case by ID') "-s, --suite <suite>",
.option('-w, --workers <n>', 'Number of parallel workers', '1') "Run only tests in specified suite (build, runtime, inference)",
.option('-d, --dry-run', 'Show what would be executed without running') )
.option('-o, --output <format>', 'Output format: console, json, junit', 'console') .option("-i, --id <id>", "Run only specified test case by ID")
.option('--report-testlink', 'Report results to TestLink') .option("-w, --workers <n>", "Number of parallel workers", "1")
.option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434') .option("-d, --dry-run", "Show what would be executed without running")
.option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435') .option(
.option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b') "-o, --output <format>",
.option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090') "Output format: console, json, junit",
.option('--testlink-api-key <key>', 'TestLink API key') "console",
.option('--no-llm', 'Skip LLM judging, use simple exit code check only') )
.option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)') .option("--report-testlink", "Report results to TestLink")
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir) .option(
"--ollama-url <url>",
"Ollama server URL (test subject)",
"http://localhost:11434",
)
.option(
"--judge-url <url>",
"LLM Judge server URL (separate instance)",
"http://localhost:11435",
)
.option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
.option(
"--testlink-url <url>",
"TestLink server URL",
"http://localhost:8090",
)
.option("--testlink-api-key <key>", "TestLink API key")
.option("--no-llm", "Skip LLM judging, use simple exit code check only")
.option(
"--dual-judge",
"Use both simple and LLM judge (fail if either fails)",
)
.option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => { .action(async (options) => {
log('='.repeat(60)) log("=".repeat(60));
log('OLLAMA37 TEST RUNNER') log("OLLAMA37 TEST RUNNER");
log('='.repeat(60)) log("=".repeat(60));
const loader = new TestLoader(options.testcasesDir) const loader = new TestLoader(options.testcasesDir);
const executor = new TestExecutor(path.join(__dirname, '..', '..')) const executor = new TestExecutor(path.join(__dirname, "..", ".."));
const judge = new LLMJudge(options.judgeUrl, options.judgeModel) const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
// Load test cases // Load test cases
log('\nLoading test cases...') log("\nLoading test cases...");
let testCases = await loader.loadAll() let testCases = await loader.loadAll();
if (options.suite) { if (options.suite) {
testCases = testCases.filter(tc => tc.suite === options.suite) testCases = testCases.filter((tc) => tc.suite === options.suite);
log(` Filtered by suite: ${options.suite}`) log(` Filtered by suite: ${options.suite}`);
} }
if (options.id) { if (options.id) {
testCases = testCases.filter(tc => tc.id === options.id) testCases = testCases.filter((tc) => tc.id === options.id);
log(` Filtered by ID: ${options.id}`) log(` Filtered by ID: ${options.id}`);
} }
// Sort by dependencies // Sort by dependencies
testCases = loader.sortByDependencies(testCases) testCases = loader.sortByDependencies(testCases);
log(` Found ${testCases.length} test cases`) log(` Found ${testCases.length} test cases`);
if (testCases.length === 0) { if (testCases.length === 0) {
log('\nNo test cases found!') log("\nNo test cases found!");
process.exit(1) process.exit(1);
} }
// Dry run // Dry run
if (options.dryRun) { if (options.dryRun) {
log('\nDRY RUN - Would execute:') log("\nDRY RUN - Would execute:");
for (const tc of testCases) { for (const tc of testCases) {
log(` ${tc.id}: ${tc.name}`) log(` ${tc.id}: ${tc.name}`);
for (const step of tc.steps) { for (const step of tc.steps) {
log(` - ${step.name}: ${step.command}`) log(` - ${step.name}: ${step.command}`);
} }
} }
process.exit(0) process.exit(0);
} }
// Execute tests (progress goes to stderr via executor) // Execute tests (progress goes to stderr via executor)
const workers = parseInt(options.workers) const workers = parseInt(options.workers);
const results = await executor.executeAll(testCases, workers) const results = await executor.executeAll(testCases, workers);
// Judge results // Judge results
log('\nJudging results...') log("\nJudging results...");
let judgments let judgments: Judgment[];
let simpleJudgments: Judgment[] | undefined;
let llmJudgments: Judgment[] | undefined;
if (options.dualJudge) { if (options.dualJudge) {
// Dual judge mode: run both simple and LLM, fail if either fails // Dual judge mode: run both simple and LLM, fail if either fails
log(' Using dual judge mode (simple + LLM)') log(" Using dual judge mode (simple + LLM)");
// Simple judge first // Simple judge first
const simpleJudgments = results.map(r => judge.simpleJudge(r)) simpleJudgments = results.map((r) => judge.simpleJudge(r));
log(' Simple judge complete') log(" Simple judge complete");
// LLM judge second // LLM judge second
let llmJudgments
try { try {
llmJudgments = await judge.judgeResults(results) llmJudgments = await judge.judgeResults(results);
log(' LLM judge complete') log(" LLM judge complete");
} catch (error) { } catch (error) {
log(` LLM judge failed: ${error}`) log(` LLM judge failed: ${error}`);
log(' Falling back to simple judge only') log(" Falling back to simple judge only");
llmJudgments = simpleJudgments llmJudgments = simpleJudgments;
} }
// Combine: fail if either judge says fail // Combine: fail if either judge says fail
judgments = simpleJudgments.map((simple, i) => { judgments = simpleJudgments.map((simple, i) => {
const llm = llmJudgments.find(j => j.testId === simple.testId) || simple const llm =
const pass = simple.pass && llm.pass llmJudgments!.find((j) => j.testId === simple.testId) || simple;
const pass = simple.pass && llm.pass;
let reason = '' let reason = "";
if (!pass) { if (!pass) {
const reasons = [] const reasons = [];
if (!simple.pass) reasons.push(`Simple: ${simple.reason}`) if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
if (!llm.pass) reasons.push(`LLM: ${llm.reason}`) if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
reason = reasons.join(' | ') reason = reasons.join(" | ");
} else { } else {
reason = llm.reason || simple.reason reason = llm.reason || simple.reason;
} }
return { return {
testId: simple.testId, testId: simple.testId,
pass, pass,
reason reason,
} };
}) });
} else if (options.llm === false) { } else if (options.llm === false) {
log(' Using simple exit code check (--no-llm)') log(" Using simple exit code check (--no-llm)");
judgments = results.map(r => judge.simpleJudge(r)) judgments = results.map((r) => judge.simpleJudge(r));
} else { } else {
try { try {
judgments = await judge.judgeResults(results) judgments = await judge.judgeResults(results);
} catch (error) { } catch (error) {
log(` LLM judging failed, falling back to simple check: ${error}`) log(` LLM judging failed, falling back to simple check: ${error}`);
judgments = results.map(r => judge.simpleJudge(r)) judgments = results.map((r) => judge.simpleJudge(r));
} }
} }
// Create reports // Create reports (with separate verdicts in dual-judge mode)
const reports = Reporter.createReports(results, judgments) const reports = Reporter.createReports(
results,
judgments,
simpleJudgments,
llmJudgments,
);
// Output results // Output results
switch (options.output) { switch (options.output) {
case 'json': case "json":
const json = Reporter.toJSON(reports) const json = Reporter.toJSON(reports);
// JSON goes to stdout (can be redirected to file) // JSON goes to stdout (can be redirected to file)
process.stdout.write(json + '\n') process.stdout.write(json + "\n");
break break;
case 'junit': case "junit":
const junit = Reporter.toJUnit(reports) const junit = Reporter.toJUnit(reports);
writeFileSync('test-results.xml', junit) writeFileSync("test-results.xml", junit);
log('\nResults written to test-results.xml') log("\nResults written to test-results.xml");
break break;
case 'console': case "console":
default: default:
Reporter.toConsole(reports) Reporter.toConsole(reports);
break break;
} }
// Summary // Summary
const passed = reports.filter(r => r.pass).length const passed = reports.filter((r) => r.pass).length;
const failed = reports.filter(r => !r.pass).length const failed = reports.filter((r) => !r.pass).length;
log('\n' + '='.repeat(60)) log("\n" + "=".repeat(60));
log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`) log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
log('='.repeat(60)) log("=".repeat(60));
// Report to TestLink // Report to TestLink
if (options.reportTestlink && options.testlinkApiKey) { if (options.reportTestlink && options.testlinkApiKey) {
const testlinkReporter = new TestLinkReporter( const testlinkReporter = new TestLinkReporter(
options.testlinkUrl, options.testlinkUrl,
options.testlinkApiKey options.testlinkApiKey,
) );
// Would need plan ID and build ID // Would need plan ID and build ID
// await testlinkReporter.reportResults(reports, planId, buildId) // await testlinkReporter.reportResults(reports, planId, buildId)
log('\nTestLink reporting not yet implemented') log("\nTestLink reporting not yet implemented");
} }
// Exit with appropriate code // Exit with appropriate code
process.exit(failed > 0 ? 1 : 0) process.exit(failed > 0 ? 1 : 0);
}) });
program program
.command('list') .command("list")
.description('List all test cases') .description("List all test cases")
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir) .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => { .action(async (options) => {
const loader = new TestLoader(options.testcasesDir) const loader = new TestLoader(options.testcasesDir);
const testCases = await loader.loadAll() const testCases = await loader.loadAll();
const grouped = loader.groupBySuite(testCases) const grouped = loader.groupBySuite(testCases);
console.log('Available Test Cases:\n') console.log("Available Test Cases:\n");
for (const [suite, cases] of grouped) { for (const [suite, cases] of grouped) {
console.log(`${suite.toUpperCase()}:`) console.log(`${suite.toUpperCase()}:`);
for (const tc of cases) { for (const tc of cases) {
console.log(` ${tc.id}: ${tc.name}`) console.log(` ${tc.id}: ${tc.name}`);
} }
console.log() console.log();
} }
console.log(`Total: ${testCases.length} test cases`) console.log(`Total: ${testCases.length} test cases`);
}) });
program.parse() program.parse();

View File

@@ -1,31 +1,71 @@
import axios from 'axios' import axios from "axios";
import { TestResult, Judgment } from './types.js' import { TestResult, Judgment } from "./types.js";
export class LLMJudge { export class LLMJudge {
private ollamaUrl: string private ollamaUrl: string;
private model: string private model: string;
private batchSize: number private batchSize: number;
constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') { constructor(
this.ollamaUrl = ollamaUrl ollamaUrl: string = "http://localhost:11434",
this.model = model model: string = "gemma3:4b",
this.batchSize = 5 // Judge 5 tests per LLM call ) {
this.ollamaUrl = ollamaUrl;
this.model = model;
this.batchSize = 5; // Judge 5 tests per LLM call
}
private formatDuration(ms: number): string {
if (ms < 1000) return `${ms}ms`;
if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
return `${(ms / 60000).toFixed(1)}min`;
} }
private buildPrompt(results: TestResult[]): string { private buildPrompt(results: TestResult[]): string {
const testsSection = results.map((r, i) => { const testsSection = results
return ` .map((r, i) => {
// Build step results summary with exit codes and durations
const stepsSummary = r.steps
.map((step, j) => {
const status = step.exitCode === 0 ? "PASS" : "FAIL";
const stepTimeout =
r.testCase.steps[j]?.timeout || r.testCase.timeout;
return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
})
.join("\n");
// Simple judge result
const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
const simpleResult = allStepsPassed ? "PASS" : "FAIL";
// Check if duration is within timeout
const timeoutMs = r.testCase.timeout;
const withinTimeout = r.totalDuration < timeoutMs;
const timeoutNote = withinTimeout
? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
: `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
return `
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name} ### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
**Criteria:** **Criteria:**
${r.testCase.criteria} ${r.testCase.criteria}
**Step Results:**
${stepsSummary}
**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
**Timing:** ${timeoutNote}
${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
**Execution Logs:** **Execution Logs:**
\`\`\` \`\`\`
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''} ${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
\`\`\` \`\`\`
` `;
}).join('\n---\n') })
.join("\n---\n");
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided. return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@ Important:
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions) - For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
- For build/runtime tests, check exit codes and absence of error messages - For build/runtime tests, check exit codes and absence of error messages
- Be lenient with formatting differences, focus on semantic correctness - Be lenient with formatting differences, focus on semantic correctness
- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
- Long durations are acceptable as long as they are within the configured timeout
Respond ONLY with the JSON array, no other text.` Respond ONLY with the JSON array, no other text.`;
} }
async judgeResults(results: TestResult[]): Promise<Judgment[]> { async judgeResults(results: TestResult[]): Promise<Judgment[]> {
const allJudgments: Judgment[] = [] const allJudgments: Judgment[] = [];
// Process in batches // Process in batches
for (let i = 0; i < results.length; i += this.batchSize) { for (let i = 0; i < results.length; i += this.batchSize) {
const batch = results.slice(i, i + this.batchSize) const batch = results.slice(i, i + this.batchSize);
console.log(` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`) console.log(
` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
);
try { try {
const judgments = await this.judgeBatch(batch) const judgments = await this.judgeBatch(batch);
allJudgments.push(...judgments) allJudgments.push(...judgments);
} catch (error) { } catch (error) {
console.error(` Failed to judge batch:`, error) console.error(` Failed to judge batch:`, error);
// Mark all tests in batch as failed // Mark all tests in batch as failed
for (const r of batch) { for (const r of batch) {
allJudgments.push({ allJudgments.push({
testId: r.testCase.id, testId: r.testCase.id,
pass: false, pass: false,
reason: 'LLM judgment failed: ' + String(error) reason: "LLM judgment failed: " + String(error),
}) });
} }
} }
} }
return allJudgments return allJudgments;
} }
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> { private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
const prompt = this.buildPrompt(results) const prompt = this.buildPrompt(results);
const response = await axios.post(`${this.ollamaUrl}/api/generate`, { const response = await axios.post(
model: this.model, `${this.ollamaUrl}/api/generate`,
prompt, {
stream: false, model: this.model,
options: { prompt,
temperature: 0.1, // Low temperature for consistent judging stream: false,
num_predict: 1000 options: {
} temperature: 0.1, // Low temperature for consistent judging
}, { num_predict: 1000,
timeout: 120000 // 2 minute timeout },
}) },
{
timeout: 120000, // 2 minute timeout
},
);
const responseText = response.data.response const responseText = response.data.response;
// Extract JSON from response // Extract JSON from response
const jsonMatch = responseText.match(/\[[\s\S]*\]/) const jsonMatch = responseText.match(/\[[\s\S]*\]/);
if (!jsonMatch) { if (!jsonMatch) {
throw new Error('No JSON array found in LLM response') throw new Error("No JSON array found in LLM response");
} }
try { try {
const judgments = JSON.parse(jsonMatch[0]) as Judgment[] const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
// Validate and fill missing // Validate and fill missing
const resultIds = results.map(r => r.testCase.id) const resultIds = results.map((r) => r.testCase.id);
const judgedIds = new Set(judgments.map(j => j.testId)) const judgedIds = new Set(judgments.map((j) => j.testId));
// Add missing judgments // Add missing judgments
for (const id of resultIds) { for (const id of resultIds) {
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
judgments.push({ judgments.push({
testId: id, testId: id,
pass: false, pass: false,
reason: 'No judgment provided by LLM' reason: "No judgment provided by LLM",
}) });
} }
} }
return judgments return judgments;
} catch (parseError) { } catch (parseError) {
throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`) throw new Error(
`Failed to parse LLM response: ${responseText.substring(0, 200)}`,
);
} }
} }
// Fallback: Simple rule-based judgment (no LLM) // Fallback: Simple rule-based judgment (no LLM)
simpleJudge(result: TestResult): Judgment { simpleJudge(result: TestResult): Judgment {
const allStepsPassed = result.steps.every(s => s.exitCode === 0) const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
if (allStepsPassed) { if (allStepsPassed) {
return { return {
testId: result.testCase.id, testId: result.testCase.id,
pass: true, pass: true,
reason: 'All steps completed with exit code 0' reason: "All steps completed with exit code 0",
} };
} else { } else {
const failedSteps = result.steps.filter(s => s.exitCode !== 0) const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
return { return {
testId: result.testCase.id, testId: result.testCase.id,
pass: false, pass: false,
reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}` reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
} };
} }
} }
} }

View File

@@ -1,112 +1,204 @@
import axios from 'axios' import axios from "axios";
import { TestReport, Judgment, TestResult } from './types.js' import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
export class Reporter { export class Reporter {
// Console reporter // Console reporter
static toConsole(reports: TestReport[]): void { static toConsole(reports: TestReport[]): void {
console.log('\n' + '='.repeat(60)) console.log("\n" + "=".repeat(60));
console.log('TEST RESULTS') console.log("TEST RESULTS");
console.log('='.repeat(60)) console.log("=".repeat(60));
const passed = reports.filter(r => r.pass) const passed = reports.filter((r) => r.pass);
const failed = reports.filter(r => !r.pass) const failed = reports.filter((r) => !r.pass);
// Check if we have dual-judge data
const hasDualJudge = reports.some(
(r) => r.simplePass !== undefined && r.llmPass !== undefined,
);
for (const report of reports) { for (const report of reports) {
const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m' const status = report.pass
console.log(`[${status}] ${report.testId}: ${report.name}`) ? "\x1b[32mPASS\x1b[0m"
console.log(` Reason: ${report.reason}`) : "\x1b[31mFAIL\x1b[0m";
console.log(` Duration: ${report.duration}ms`) console.log(`[${status}] ${report.testId}: ${report.name}`);
// Show separate verdicts in dual-judge mode
if (
hasDualJudge &&
report.simplePass !== undefined &&
report.llmPass !== undefined
) {
const simpleStatus = report.simplePass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
const llmStatus = report.llmPass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
console.log(
` Simple: [${simpleStatus}] ${report.simpleReason || ""}`,
);
console.log(` LLM: [${llmStatus}] ${report.llmReason || ""}`);
} else {
console.log(` Reason: ${report.reason}`);
}
console.log(` Duration: ${report.duration}ms`);
} }
console.log('\n' + '-'.repeat(60)) console.log("\n" + "-".repeat(60));
console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
console.log('='.repeat(60)) // Show separate summaries in dual-judge mode
if (hasDualJudge) {
const simplePassed = reports.filter((r) => r.simplePass).length;
const simpleFailed = reports.filter((r) => !r.simplePass).length;
const llmPassed = reports.filter((r) => r.llmPass).length;
const llmFailed = reports.filter((r) => !r.llmPass).length;
console.log(`Simple: ${simplePassed} passed, ${simpleFailed} failed`);
console.log(`LLM: ${llmPassed} passed, ${llmFailed} failed`);
console.log(
`Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
);
} else {
console.log(
`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
);
}
console.log("=".repeat(60));
} }
// JSON reporter // JSON reporter
static toJSON(reports: TestReport[]): string { static toJSON(reports: TestReport[]): string {
return JSON.stringify({ // Check if we have dual-judge data
summary: { const hasDualJudge = reports.some(
total: reports.length, (r) => r.simplePass !== undefined && r.llmPass !== undefined,
passed: reports.filter(r => r.pass).length, );
failed: reports.filter(r => !r.pass).length,
timestamp: new Date().toISOString() const summary: TestSummary = {
total: reports.length,
passed: reports.filter((r) => r.pass).length,
failed: reports.filter((r) => !r.pass).length,
timestamp: new Date().toISOString(),
};
// Add separate breakdowns in dual-judge mode
if (hasDualJudge) {
summary.simple = {
passed: reports.filter((r) => r.simplePass).length,
failed: reports.filter((r) => !r.simplePass).length,
};
summary.llm = {
passed: reports.filter((r) => r.llmPass).length,
failed: reports.filter((r) => !r.llmPass).length,
};
}
return JSON.stringify(
{
summary,
results: reports,
}, },
results: reports null,
}, null, 2) 2,
);
} }
// JUnit XML reporter (for CI/CD integration) // JUnit XML reporter (for CI/CD integration)
static toJUnit(reports: TestReport[]): string { static toJUnit(reports: TestReport[]): string {
const escapeXml = (s: string) => s const escapeXml = (s: string) =>
.replace(/&/g, '&amp;') s
.replace(/</g, '&lt;') .replace(/&/g, "&amp;")
.replace(/>/g, '&gt;') .replace(/</g, "&lt;")
.replace(/"/g, '&quot;') .replace(/>/g, "&gt;")
.replace(/'/g, '&apos;') .replace(/"/g, "&quot;")
.replace(/'/g, "&apos;");
const testcases = reports.map(r => { const testcases = reports
if (r.pass) { .map((r) => {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>` if (r.pass) {
} else { return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`;
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"> } else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
<failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure> <failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure>
</testcase>` </testcase>`;
} }
}).join('\n') })
.join("\n");
const failures = reports.filter(r => !r.pass).length const failures = reports.filter((r) => !r.pass).length;
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000 const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;
return `<?xml version="1.0" encoding="UTF-8"?> return `<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}"> <testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">
${testcases} ${testcases}
</testsuite>` </testsuite>`;
} }
// Combine results and judgments into reports // Combine results and judgments into reports
static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] { static createReports(
const judgmentMap = new Map(judgments.map(j => [j.testId, j])) results: TestResult[],
judgments: Judgment[],
simpleJudgments?: Judgment[],
llmJudgments?: Judgment[],
): TestReport[] {
const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
const simpleMap = simpleJudgments
? new Map(simpleJudgments.map((j) => [j.testId, j]))
: undefined;
const llmMap = llmJudgments
? new Map(llmJudgments.map((j) => [j.testId, j]))
: undefined;
return results.map(result => { return results.map((result) => {
const judgment = judgmentMap.get(result.testCase.id) const judgment = judgmentMap.get(result.testCase.id);
const simple = simpleMap?.get(result.testCase.id);
const llm = llmMap?.get(result.testCase.id);
return { const report: TestReport = {
testId: result.testCase.id, testId: result.testCase.id,
name: result.testCase.name, name: result.testCase.name,
suite: result.testCase.suite, suite: result.testCase.suite,
pass: judgment?.pass ?? false, pass: judgment?.pass ?? false,
reason: judgment?.reason ?? 'No judgment', reason: judgment?.reason ?? "No judgment",
duration: result.totalDuration, duration: result.totalDuration,
logs: result.logs logs: result.logs,
};
// Add separate verdicts if available (dual-judge mode)
if (simple && llm) {
report.simplePass = simple.pass;
report.simpleReason = simple.reason;
report.llmPass = llm.pass;
report.llmReason = llm.reason;
} }
})
return report;
});
} }
} }
// TestLink reporter // TestLink reporter
export class TestLinkReporter { export class TestLinkReporter {
private url: string private url: string;
private apiKey: string private apiKey: string;
constructor(url: string, apiKey: string) { constructor(url: string, apiKey: string) {
this.url = url this.url = url;
this.apiKey = apiKey this.apiKey = apiKey;
} }
async reportResults( async reportResults(
reports: TestReport[], reports: TestReport[],
planId: string, planId: string,
buildId: string buildId: string,
): Promise<void> { ): Promise<void> {
console.log('\nReporting to TestLink...') console.log("\nReporting to TestLink...");
for (const report of reports) { for (const report of reports) {
try { try {
await this.reportTestExecution(report, planId, buildId) await this.reportTestExecution(report, planId, buildId);
console.log(` Reported: ${report.testId}`) console.log(` Reported: ${report.testId}`);
} catch (error) { } catch (error) {
console.error(` Failed to report ${report.testId}:`, error) console.error(` Failed to report ${report.testId}:`, error);
} }
} }
} }
@@ -114,12 +206,12 @@ export class TestLinkReporter {
private async reportTestExecution( private async reportTestExecution(
report: TestReport, report: TestReport,
planId: string, planId: string,
buildId: string buildId: string,
): Promise<void> { ): Promise<void> {
// Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID) // Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
// This would need to be mapped from TestLink // This would need to be mapped from TestLink
const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
// Note: This uses the TestLink XML-RPC API // Note: This uses the TestLink XML-RPC API
// In practice, you'd use the testlink-mcp or direct API calls // In practice, you'd use the testlink-mcp or direct API calls
@@ -129,10 +221,10 @@ export class TestLinkReporter {
testplanid: planId, testplanid: planId,
buildid: buildId, buildid: buildId,
status, status,
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}` notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
} };
// For now, just log - actual implementation would call TestLink API // For now, just log - actual implementation would call TestLink API
console.log(` Would report: ${report.testId} = ${status}`) console.log(` Would report: ${report.testId} = ${status}`);
} }
} }

View File

@@ -1,66 +1,88 @@
// Test case definition // Test case definition
export interface TestStep { export interface TestStep {
name: string name: string;
command: string command: string;
timeout?: number timeout?: number;
} }
export interface TestCase { export interface TestCase {
id: string id: string;
name: string name: string;
suite: string suite: string;
priority: number priority: number;
timeout: number timeout: number;
dependencies: string[] dependencies: string[];
steps: TestStep[] steps: TestStep[];
criteria: string criteria: string;
} }
// Execution results // Execution results
export interface StepResult { export interface StepResult {
name: string name: string;
command: string command: string;
stdout: string stdout: string;
stderr: string stderr: string;
exitCode: number exitCode: number;
duration: number duration: number;
} }
export interface TestResult { export interface TestResult {
testCase: TestCase testCase: TestCase;
steps: StepResult[] steps: StepResult[];
totalDuration: number totalDuration: number;
logs: string logs: string;
} }
// LLM judgment // LLM judgment
export interface Judgment { export interface Judgment {
testId: string testId: string;
pass: boolean pass: boolean;
reason: string reason: string;
} }
// Final report // Final report
export interface TestReport { export interface TestReport {
testId: string testId: string;
name: string name: string;
suite: string suite: string;
pass: boolean pass: boolean;
reason: string reason: string;
duration: number duration: number;
logs: string logs: string;
// Separate verdicts for dual-judge mode
simplePass?: boolean;
simpleReason?: string;
llmPass?: boolean;
llmReason?: string;
}
// Summary with separate judge breakdowns
export interface TestSummary {
total: number;
passed: number;
failed: number;
timestamp: string;
// Separate breakdowns (only present in dual-judge mode)
simple?: {
passed: number;
failed: number;
};
llm?: {
passed: number;
failed: number;
};
} }
// Runner options // Runner options
export interface RunnerOptions { export interface RunnerOptions {
suite?: string suite?: string;
id?: string id?: string;
workers: number workers: number;
dryRun: boolean dryRun: boolean;
output: 'console' | 'json' | 'junit' output: "console" | "json" | "junit";
reportTestlink: boolean reportTestlink: boolean;
ollamaUrl: string ollamaUrl: string;
ollamaModel: string ollamaModel: string;
testlinkUrl: string testlinkUrl: string;
testlinkApiKey: string testlinkApiKey: string;
} }