Enhance LLM judge prompt and add separate verdict display

- Add step results, timing context, and build notes to LLM prompt
- LLM now sees exit codes, durations, and simple judge result
- Add guidance that long build times within timeout are acceptable

- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields

This helps distinguish simple-judge failures (non-zero exit codes) from LLM-judge
failures (semantic analysis), making debugging easier; an illustrative sketch of the
new prompt context and the dual-judge report shape follows below.
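
For illustration, this is roughly the per-test context the LLM judge now receives,
following the prompt template in this commit; the step names, durations, and counts
are made up:

**Step Results:**
  1. "clone source" - PASS (exit: 0, duration: 4.1s, timeout: 5.0min)
  2. "cmake build" - PASS (exit: 0, duration: 18.3min, timeout: 60.0min)
**Simple Judge Result:** PASS (all steps exit code 0)
**Timing:** Total duration 18.4min is within timeout of 60.0min.

And a minimal TypeScript sketch of a per-test report and the JSON summary in
dual-judge mode, based on the TestReport and TestSummary interfaces added below;
the test ID, names, and all numbers are hypothetical:

import { TestReport, TestSummary } from "./types.js";

// Hypothetical per-test report in dual-judge mode. The combined `pass` is the
// AND of both verdicts, and `reason` collects the reasons of whichever judges failed.
const exampleReport: TestReport = {
  testId: "ollama37-8",
  name: "Example runtime test",   // hypothetical
  suite: "runtime",
  pass: false,                    // combined verdict: the simple judge failed
  reason: "Simple: Steps failed: start server",
  duration: 42000,                // ms
  logs: "(logs omitted)",
  simplePass: false,
  simpleReason: "Steps failed: start server",
  llmPass: true,
  llmReason: "Server started and responded; failure was in a cleanup step",
};

// Hypothetical summary block as emitted by Reporter.toJSON when both judges ran.
const exampleSummary: TestSummary = {
  total: 10,
  passed: 8,
  failed: 2,
  timestamp: new Date().toISOString(),
  simple: { passed: 8, failed: 2 },  // exit-code verdicts
  llm: { passed: 9, failed: 1 },     // semantic verdicts
};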
Shang Chieh Tseng
2025-12-17 15:04:05 +08:00
parent 1e99c1bb50
commit e06deff40f
4 changed files with 465 additions and 273 deletions

@@ -1,216 +1,244 @@
#!/usr/bin/env node
import { Command } from 'commander'
import { writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { TestLoader } from './loader.js'
import { TestExecutor } from './executor.js'
import { LLMJudge } from './judge.js'
import { Reporter, TestLinkReporter } from './reporter.js'
import { RunnerOptions } from './types.js'
import { Command } from "commander";
import { writeFileSync } from "fs";
import path from "path";
import { fileURLToPath } from "url";
import { TestLoader } from "./loader.js";
import { TestExecutor } from "./executor.js";
import { LLMJudge } from "./judge.js";
import { Reporter, TestLinkReporter } from "./reporter.js";
import { RunnerOptions, Judgment } from "./types.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const defaultTestcasesDir = path.join(__dirname, '..', 'testcases')
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
// Progress output to stderr (visible in console even when stdout is redirected)
const log = (msg: string) => process.stderr.write(msg + '\n')
const log = (msg: string) => process.stderr.write(msg + "\n");
const program = new Command()
const program = new Command();
program
.name('ollama37-test')
.description('Scalable test runner with LLM-as-judge for ollama37')
.version('1.0.0')
.name("ollama37-test")
.description("Scalable test runner with LLM-as-judge for ollama37")
.version("1.0.0");
program
.command('run')
.description('Run test cases')
.option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)')
.option('-i, --id <id>', 'Run only specified test case by ID')
.option('-w, --workers <n>', 'Number of parallel workers', '1')
.option('-d, --dry-run', 'Show what would be executed without running')
.option('-o, --output <format>', 'Output format: console, json, junit', 'console')
.option('--report-testlink', 'Report results to TestLink')
.option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434')
.option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435')
.option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b')
.option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
.option('--testlink-api-key <key>', 'TestLink API key')
.option('--no-llm', 'Skip LLM judging, use simple exit code check only')
.option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.command("run")
.description("Run test cases")
.option(
"-s, --suite <suite>",
"Run only tests in specified suite (build, runtime, inference)",
)
.option("-i, --id <id>", "Run only specified test case by ID")
.option("-w, --workers <n>", "Number of parallel workers", "1")
.option("-d, --dry-run", "Show what would be executed without running")
.option(
"-o, --output <format>",
"Output format: console, json, junit",
"console",
)
.option("--report-testlink", "Report results to TestLink")
.option(
"--ollama-url <url>",
"Ollama server URL (test subject)",
"http://localhost:11434",
)
.option(
"--judge-url <url>",
"LLM Judge server URL (separate instance)",
"http://localhost:11435",
)
.option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
.option(
"--testlink-url <url>",
"TestLink server URL",
"http://localhost:8090",
)
.option("--testlink-api-key <key>", "TestLink API key")
.option("--no-llm", "Skip LLM judging, use simple exit code check only")
.option(
"--dual-judge",
"Use both simple and LLM judge (fail if either fails)",
)
.option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => {
log('='.repeat(60))
log('OLLAMA37 TEST RUNNER')
log('='.repeat(60))
log("=".repeat(60));
log("OLLAMA37 TEST RUNNER");
log("=".repeat(60));
const loader = new TestLoader(options.testcasesDir)
const executor = new TestExecutor(path.join(__dirname, '..', '..'))
const judge = new LLMJudge(options.judgeUrl, options.judgeModel)
const loader = new TestLoader(options.testcasesDir);
const executor = new TestExecutor(path.join(__dirname, "..", ".."));
const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
// Load test cases
log('\nLoading test cases...')
let testCases = await loader.loadAll()
log("\nLoading test cases...");
let testCases = await loader.loadAll();
if (options.suite) {
testCases = testCases.filter(tc => tc.suite === options.suite)
log(` Filtered by suite: ${options.suite}`)
testCases = testCases.filter((tc) => tc.suite === options.suite);
log(` Filtered by suite: ${options.suite}`);
}
if (options.id) {
testCases = testCases.filter(tc => tc.id === options.id)
log(` Filtered by ID: ${options.id}`)
testCases = testCases.filter((tc) => tc.id === options.id);
log(` Filtered by ID: ${options.id}`);
}
// Sort by dependencies
testCases = loader.sortByDependencies(testCases)
testCases = loader.sortByDependencies(testCases);
log(` Found ${testCases.length} test cases`)
log(` Found ${testCases.length} test cases`);
if (testCases.length === 0) {
log('\nNo test cases found!')
process.exit(1)
log("\nNo test cases found!");
process.exit(1);
}
// Dry run
if (options.dryRun) {
log('\nDRY RUN - Would execute:')
log("\nDRY RUN - Would execute:");
for (const tc of testCases) {
log(` ${tc.id}: ${tc.name}`)
log(` ${tc.id}: ${tc.name}`);
for (const step of tc.steps) {
log(` - ${step.name}: ${step.command}`)
log(` - ${step.name}: ${step.command}`);
}
}
process.exit(0)
process.exit(0);
}
// Execute tests (progress goes to stderr via executor)
const workers = parseInt(options.workers)
const results = await executor.executeAll(testCases, workers)
const workers = parseInt(options.workers);
const results = await executor.executeAll(testCases, workers);
// Judge results
log('\nJudging results...')
let judgments
log("\nJudging results...");
let judgments: Judgment[];
let simpleJudgments: Judgment[] | undefined;
let llmJudgments: Judgment[] | undefined;
if (options.dualJudge) {
// Dual judge mode: run both simple and LLM, fail if either fails
log(' Using dual judge mode (simple + LLM)')
log(" Using dual judge mode (simple + LLM)");
// Simple judge first
const simpleJudgments = results.map(r => judge.simpleJudge(r))
log(' Simple judge complete')
simpleJudgments = results.map((r) => judge.simpleJudge(r));
log(" Simple judge complete");
// LLM judge second
let llmJudgments
try {
llmJudgments = await judge.judgeResults(results)
log(' LLM judge complete')
llmJudgments = await judge.judgeResults(results);
log(" LLM judge complete");
} catch (error) {
log(` LLM judge failed: ${error}`)
log(' Falling back to simple judge only')
llmJudgments = simpleJudgments
log(` LLM judge failed: ${error}`);
log(" Falling back to simple judge only");
llmJudgments = simpleJudgments;
}
// Combine: fail if either judge says fail
judgments = simpleJudgments.map((simple, i) => {
const llm = llmJudgments.find(j => j.testId === simple.testId) || simple
const pass = simple.pass && llm.pass
const llm =
llmJudgments!.find((j) => j.testId === simple.testId) || simple;
const pass = simple.pass && llm.pass;
let reason = ''
let reason = "";
if (!pass) {
const reasons = []
if (!simple.pass) reasons.push(`Simple: ${simple.reason}`)
if (!llm.pass) reasons.push(`LLM: ${llm.reason}`)
reason = reasons.join(' | ')
const reasons = [];
if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
reason = reasons.join(" | ");
} else {
reason = llm.reason || simple.reason
reason = llm.reason || simple.reason;
}
return {
testId: simple.testId,
pass,
reason
}
})
reason,
};
});
} else if (options.llm === false) {
log(' Using simple exit code check (--no-llm)')
judgments = results.map(r => judge.simpleJudge(r))
log(" Using simple exit code check (--no-llm)");
judgments = results.map((r) => judge.simpleJudge(r));
} else {
try {
judgments = await judge.judgeResults(results)
judgments = await judge.judgeResults(results);
} catch (error) {
log(` LLM judging failed, falling back to simple check: ${error}`)
judgments = results.map(r => judge.simpleJudge(r))
log(` LLM judging failed, falling back to simple check: ${error}`);
judgments = results.map((r) => judge.simpleJudge(r));
}
}
// Create reports
const reports = Reporter.createReports(results, judgments)
// Create reports (with separate verdicts in dual-judge mode)
const reports = Reporter.createReports(
results,
judgments,
simpleJudgments,
llmJudgments,
);
// Output results
switch (options.output) {
case 'json':
const json = Reporter.toJSON(reports)
case "json":
const json = Reporter.toJSON(reports);
// JSON goes to stdout (can be redirected to file)
process.stdout.write(json + '\n')
break
process.stdout.write(json + "\n");
break;
case 'junit':
const junit = Reporter.toJUnit(reports)
writeFileSync('test-results.xml', junit)
log('\nResults written to test-results.xml')
break
case "junit":
const junit = Reporter.toJUnit(reports);
writeFileSync("test-results.xml", junit);
log("\nResults written to test-results.xml");
break;
case 'console':
case "console":
default:
Reporter.toConsole(reports)
break
Reporter.toConsole(reports);
break;
}
// Summary
const passed = reports.filter(r => r.pass).length
const failed = reports.filter(r => !r.pass).length
log('\n' + '='.repeat(60))
log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`)
log('='.repeat(60))
const passed = reports.filter((r) => r.pass).length;
const failed = reports.filter((r) => !r.pass).length;
log("\n" + "=".repeat(60));
log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
log("=".repeat(60));
// Report to TestLink
if (options.reportTestlink && options.testlinkApiKey) {
const testlinkReporter = new TestLinkReporter(
options.testlinkUrl,
options.testlinkApiKey
)
options.testlinkApiKey,
);
// Would need plan ID and build ID
// await testlinkReporter.reportResults(reports, planId, buildId)
log('\nTestLink reporting not yet implemented')
log("\nTestLink reporting not yet implemented");
}
// Exit with appropriate code
process.exit(failed > 0 ? 1 : 0)
})
process.exit(failed > 0 ? 1 : 0);
});
program
.command('list')
.description('List all test cases')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.command("list")
.description("List all test cases")
.option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => {
const loader = new TestLoader(options.testcasesDir)
const testCases = await loader.loadAll()
const loader = new TestLoader(options.testcasesDir);
const testCases = await loader.loadAll();
const grouped = loader.groupBySuite(testCases)
const grouped = loader.groupBySuite(testCases);
console.log('Available Test Cases:\n')
console.log("Available Test Cases:\n");
for (const [suite, cases] of grouped) {
console.log(`${suite.toUpperCase()}:`)
console.log(`${suite.toUpperCase()}:`);
for (const tc of cases) {
console.log(` ${tc.id}: ${tc.name}`)
console.log(` ${tc.id}: ${tc.name}`);
}
console.log()
console.log();
}
console.log(`Total: ${testCases.length} test cases`)
})
console.log(`Total: ${testCases.length} test cases`);
});
program.parse()
program.parse();

@@ -1,31 +1,71 @@
import axios from 'axios'
import { TestResult, Judgment } from './types.js'
import axios from "axios";
import { TestResult, Judgment } from "./types.js";
export class LLMJudge {
private ollamaUrl: string
private model: string
private batchSize: number
private ollamaUrl: string;
private model: string;
private batchSize: number;
constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
this.ollamaUrl = ollamaUrl
this.model = model
this.batchSize = 5 // Judge 5 tests per LLM call
constructor(
ollamaUrl: string = "http://localhost:11434",
model: string = "gemma3:4b",
) {
this.ollamaUrl = ollamaUrl;
this.model = model;
this.batchSize = 5; // Judge 5 tests per LLM call
}
private formatDuration(ms: number): string {
if (ms < 1000) return `${ms}ms`;
if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
return `${(ms / 60000).toFixed(1)}min`;
}
private buildPrompt(results: TestResult[]): string {
const testsSection = results.map((r, i) => {
return `
const testsSection = results
.map((r, i) => {
// Build step results summary with exit codes and durations
const stepsSummary = r.steps
.map((step, j) => {
const status = step.exitCode === 0 ? "PASS" : "FAIL";
const stepTimeout =
r.testCase.steps[j]?.timeout || r.testCase.timeout;
return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
})
.join("\n");
// Simple judge result
const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
const simpleResult = allStepsPassed ? "PASS" : "FAIL";
// Check if duration is within timeout
const timeoutMs = r.testCase.timeout;
const withinTimeout = r.totalDuration < timeoutMs;
const timeoutNote = withinTimeout
? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
: `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
return `
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
**Criteria:**
${r.testCase.criteria}
**Step Results:**
${stepsSummary}
**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
**Timing:** ${timeoutNote}
${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
**Execution Logs:**
\`\`\`
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
\`\`\`
`
}).join('\n---\n')
`;
})
.join("\n---\n");
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@ Important:
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
- For build/runtime tests, check exit codes and absence of error messages
- Be lenient with formatting differences, focus on semantic correctness
- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
- Long durations are acceptable as long as they are within the configured timeout
Respond ONLY with the JSON array, no other text.`
Respond ONLY with the JSON array, no other text.`;
}
async judgeResults(results: TestResult[]): Promise<Judgment[]> {
const allJudgments: Judgment[] = []
const allJudgments: Judgment[] = [];
// Process in batches
for (let i = 0; i < results.length; i += this.batchSize) {
const batch = results.slice(i, i + this.batchSize)
console.log(` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
const batch = results.slice(i, i + this.batchSize);
console.log(
` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
);
try {
const judgments = await this.judgeBatch(batch)
allJudgments.push(...judgments)
const judgments = await this.judgeBatch(batch);
allJudgments.push(...judgments);
} catch (error) {
console.error(` Failed to judge batch:`, error)
console.error(` Failed to judge batch:`, error);
// Mark all tests in batch as failed
for (const r of batch) {
allJudgments.push({
testId: r.testCase.id,
pass: false,
reason: 'LLM judgment failed: ' + String(error)
})
reason: "LLM judgment failed: " + String(error),
});
}
}
}
return allJudgments
return allJudgments;
}
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
const prompt = this.buildPrompt(results)
const prompt = this.buildPrompt(results);
const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
model: this.model,
prompt,
stream: false,
options: {
temperature: 0.1, // Low temperature for consistent judging
num_predict: 1000
}
}, {
timeout: 120000 // 2 minute timeout
})
const response = await axios.post(
`${this.ollamaUrl}/api/generate`,
{
model: this.model,
prompt,
stream: false,
options: {
temperature: 0.1, // Low temperature for consistent judging
num_predict: 1000,
},
},
{
timeout: 120000, // 2 minute timeout
},
);
const responseText = response.data.response
const responseText = response.data.response;
// Extract JSON from response
const jsonMatch = responseText.match(/\[[\s\S]*\]/)
const jsonMatch = responseText.match(/\[[\s\S]*\]/);
if (!jsonMatch) {
throw new Error('No JSON array found in LLM response')
throw new Error("No JSON array found in LLM response");
}
try {
const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
// Validate and fill missing
const resultIds = results.map(r => r.testCase.id)
const judgedIds = new Set(judgments.map(j => j.testId))
const resultIds = results.map((r) => r.testCase.id);
const judgedIds = new Set(judgments.map((j) => j.testId));
// Add missing judgments
for (const id of resultIds) {
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
judgments.push({
testId: id,
pass: false,
reason: 'No judgment provided by LLM'
})
reason: "No judgment provided by LLM",
});
}
}
return judgments
return judgments;
} catch (parseError) {
throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
throw new Error(
`Failed to parse LLM response: ${responseText.substring(0, 200)}`,
);
}
}
// Fallback: Simple rule-based judgment (no LLM)
simpleJudge(result: TestResult): Judgment {
const allStepsPassed = result.steps.every(s => s.exitCode === 0)
const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
if (allStepsPassed) {
return {
testId: result.testCase.id,
pass: true,
reason: 'All steps completed with exit code 0'
}
reason: "All steps completed with exit code 0",
};
} else {
const failedSteps = result.steps.filter(s => s.exitCode !== 0)
const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
return {
testId: result.testCase.id,
pass: false,
reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
}
reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
};
}
}
}

@@ -1,112 +1,204 @@
import axios from 'axios'
import { TestReport, Judgment, TestResult } from './types.js'
import axios from "axios";
import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
export class Reporter {
// Console reporter
static toConsole(reports: TestReport[]): void {
console.log('\n' + '='.repeat(60))
console.log('TEST RESULTS')
console.log('='.repeat(60))
console.log("\n" + "=".repeat(60));
console.log("TEST RESULTS");
console.log("=".repeat(60));
const passed = reports.filter(r => r.pass)
const failed = reports.filter(r => !r.pass)
const passed = reports.filter((r) => r.pass);
const failed = reports.filter((r) => !r.pass);
// Check if we have dual-judge data
const hasDualJudge = reports.some(
(r) => r.simplePass !== undefined && r.llmPass !== undefined,
);
for (const report of reports) {
const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m'
console.log(`[${status}] ${report.testId}: ${report.name}`)
console.log(` Reason: ${report.reason}`)
console.log(` Duration: ${report.duration}ms`)
const status = report.pass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
console.log(`[${status}] ${report.testId}: ${report.name}`);
// Show separate verdicts in dual-judge mode
if (
hasDualJudge &&
report.simplePass !== undefined &&
report.llmPass !== undefined
) {
const simpleStatus = report.simplePass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
const llmStatus = report.llmPass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
console.log(
` Simple: [${simpleStatus}] ${report.simpleReason || ""}`,
);
console.log(` LLM: [${llmStatus}] ${report.llmReason || ""}`);
} else {
console.log(` Reason: ${report.reason}`);
}
console.log(` Duration: ${report.duration}ms`);
}
console.log('\n' + '-'.repeat(60))
console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
console.log('='.repeat(60))
console.log("\n" + "-".repeat(60));
// Show separate summaries in dual-judge mode
if (hasDualJudge) {
const simplePassed = reports.filter((r) => r.simplePass).length;
const simpleFailed = reports.filter((r) => !r.simplePass).length;
const llmPassed = reports.filter((r) => r.llmPass).length;
const llmFailed = reports.filter((r) => !r.llmPass).length;
console.log(`Simple: ${simplePassed} passed, ${simpleFailed} failed`);
console.log(`LLM: ${llmPassed} passed, ${llmFailed} failed`);
console.log(
`Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
);
} else {
console.log(
`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
);
}
console.log("=".repeat(60));
}
// JSON reporter
static toJSON(reports: TestReport[]): string {
return JSON.stringify({
summary: {
total: reports.length,
passed: reports.filter(r => r.pass).length,
failed: reports.filter(r => !r.pass).length,
timestamp: new Date().toISOString()
// Check if we have dual-judge data
const hasDualJudge = reports.some(
(r) => r.simplePass !== undefined && r.llmPass !== undefined,
);
const summary: TestSummary = {
total: reports.length,
passed: reports.filter((r) => r.pass).length,
failed: reports.filter((r) => !r.pass).length,
timestamp: new Date().toISOString(),
};
// Add separate breakdowns in dual-judge mode
if (hasDualJudge) {
summary.simple = {
passed: reports.filter((r) => r.simplePass).length,
failed: reports.filter((r) => !r.simplePass).length,
};
summary.llm = {
passed: reports.filter((r) => r.llmPass).length,
failed: reports.filter((r) => !r.llmPass).length,
};
}
return JSON.stringify(
{
summary,
results: reports,
},
results: reports
}, null, 2)
null,
2,
);
}
// JUnit XML reporter (for CI/CD integration)
static toJUnit(reports: TestReport[]): string {
const escapeXml = (s: string) => s
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;')
const escapeXml = (s: string) =>
s
.replace(/&/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&apos;");
const testcases = reports.map(r => {
if (r.pass) {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`
} else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
const testcases = reports
.map((r) => {
if (r.pass) {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`;
} else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
<failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure>
</testcase>`
}
}).join('\n')
</testcase>`;
}
})
.join("\n");
const failures = reports.filter(r => !r.pass).length
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000
const failures = reports.filter((r) => !r.pass).length;
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;
return `<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">
${testcases}
</testsuite>`
</testsuite>`;
}
// Combine results and judgments into reports
static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] {
const judgmentMap = new Map(judgments.map(j => [j.testId, j]))
static createReports(
results: TestResult[],
judgments: Judgment[],
simpleJudgments?: Judgment[],
llmJudgments?: Judgment[],
): TestReport[] {
const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
const simpleMap = simpleJudgments
? new Map(simpleJudgments.map((j) => [j.testId, j]))
: undefined;
const llmMap = llmJudgments
? new Map(llmJudgments.map((j) => [j.testId, j]))
: undefined;
return results.map(result => {
const judgment = judgmentMap.get(result.testCase.id)
return results.map((result) => {
const judgment = judgmentMap.get(result.testCase.id);
const simple = simpleMap?.get(result.testCase.id);
const llm = llmMap?.get(result.testCase.id);
return {
const report: TestReport = {
testId: result.testCase.id,
name: result.testCase.name,
suite: result.testCase.suite,
pass: judgment?.pass ?? false,
reason: judgment?.reason ?? 'No judgment',
reason: judgment?.reason ?? "No judgment",
duration: result.totalDuration,
logs: result.logs
logs: result.logs,
};
// Add separate verdicts if available (dual-judge mode)
if (simple && llm) {
report.simplePass = simple.pass;
report.simpleReason = simple.reason;
report.llmPass = llm.pass;
report.llmReason = llm.reason;
}
})
return report;
});
}
}
// TestLink reporter
export class TestLinkReporter {
private url: string
private apiKey: string
private url: string;
private apiKey: string;
constructor(url: string, apiKey: string) {
this.url = url
this.apiKey = apiKey
this.url = url;
this.apiKey = apiKey;
}
async reportResults(
reports: TestReport[],
planId: string,
buildId: string
buildId: string,
): Promise<void> {
console.log('\nReporting to TestLink...')
console.log("\nReporting to TestLink...");
for (const report of reports) {
try {
await this.reportTestExecution(report, planId, buildId)
console.log(` Reported: ${report.testId}`)
await this.reportTestExecution(report, planId, buildId);
console.log(` Reported: ${report.testId}`);
} catch (error) {
console.error(` Failed to report ${report.testId}:`, error)
console.error(` Failed to report ${report.testId}:`, error);
}
}
}
@@ -114,12 +206,12 @@ export class TestLinkReporter {
private async reportTestExecution(
report: TestReport,
planId: string,
buildId: string
buildId: string,
): Promise<void> {
// Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
// This would need to be mapped from TestLink
const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
// Note: This uses the TestLink XML-RPC API
// In practice, you'd use the testlink-mcp or direct API calls
@@ -129,10 +221,10 @@ export class TestLinkReporter {
testplanid: planId,
buildid: buildId,
status,
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
}
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
};
// For now, just log - actual implementation would call TestLink API
console.log(` Would report: ${report.testId} = ${status}`)
console.log(` Would report: ${report.testId} = ${status}`);
}
}

@@ -1,66 +1,88 @@
// Test case definition
export interface TestStep {
name: string
command: string
timeout?: number
name: string;
command: string;
timeout?: number;
}
export interface TestCase {
id: string
name: string
suite: string
priority: number
timeout: number
dependencies: string[]
steps: TestStep[]
criteria: string
id: string;
name: string;
suite: string;
priority: number;
timeout: number;
dependencies: string[];
steps: TestStep[];
criteria: string;
}
// Execution results
export interface StepResult {
name: string
command: string
stdout: string
stderr: string
exitCode: number
duration: number
name: string;
command: string;
stdout: string;
stderr: string;
exitCode: number;
duration: number;
}
export interface TestResult {
testCase: TestCase
steps: StepResult[]
totalDuration: number
logs: string
testCase: TestCase;
steps: StepResult[];
totalDuration: number;
logs: string;
}
// LLM judgment
export interface Judgment {
testId: string
pass: boolean
reason: string
testId: string;
pass: boolean;
reason: string;
}
// Final report
export interface TestReport {
testId: string
name: string
suite: string
pass: boolean
reason: string
duration: number
logs: string
testId: string;
name: string;
suite: string;
pass: boolean;
reason: string;
duration: number;
logs: string;
// Separate verdicts for dual-judge mode
simplePass?: boolean;
simpleReason?: string;
llmPass?: boolean;
llmReason?: string;
}
// Summary with separate judge breakdowns
export interface TestSummary {
total: number;
passed: number;
failed: number;
timestamp: string;
// Separate breakdowns (only present in dual-judge mode)
simple?: {
passed: number;
failed: number;
};
llm?: {
passed: number;
failed: number;
};
}
// Runner options
export interface RunnerOptions {
suite?: string
id?: string
workers: number
dryRun: boolean
output: 'console' | 'json' | 'junit'
reportTestlink: boolean
ollamaUrl: string
ollamaModel: string
testlinkUrl: string
testlinkApiKey: string
suite?: string;
id?: string;
workers: number;
dryRun: boolean;
output: "console" | "json" | "junit";
reportTestlink: boolean;
ollamaUrl: string;
ollamaModel: string;
testlinkUrl: string;
testlinkApiKey: string;
}