mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-20 12:47:00 +00:00
Enhance LLM judge prompt and add separate verdict display
- Add step results, timing context, and build notes to the LLM prompt
- The LLM now sees exit codes, durations, and the simple judge result
- Add guidance that long build times within the timeout are acceptable
- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both the Simple and LLM pass/fail status
- JSON summary includes a separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields

This helps distinguish between simple judge failures (exit code != 0) and LLM judge failures (semantic analysis), making debugging easier.
This commit is contained in:
266
tests/src/cli.ts
266
tests/src/cli.ts
@@ -1,216 +1,244 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { Command } from 'commander'
|
||||
import { writeFileSync } from 'fs'
|
||||
import path from 'path'
|
||||
import { fileURLToPath } from 'url'
|
||||
import { TestLoader } from './loader.js'
|
||||
import { TestExecutor } from './executor.js'
|
||||
import { LLMJudge } from './judge.js'
|
||||
import { Reporter, TestLinkReporter } from './reporter.js'
|
||||
import { RunnerOptions } from './types.js'
|
||||
import { Command } from "commander";
|
||||
import { writeFileSync } from "fs";
|
||||
import path from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { TestLoader } from "./loader.js";
|
||||
import { TestExecutor } from "./executor.js";
|
||||
import { LLMJudge } from "./judge.js";
|
||||
import { Reporter, TestLinkReporter } from "./reporter.js";
|
||||
import { RunnerOptions, Judgment } from "./types.js";
|
||||
|
||||
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Default test case directory: the `testcases` folder one level above this file.
const defaultTestcasesDir = path.join(__dirname, "..", "testcases");

// Progress output to stderr (visible in console even when stdout is redirected)
function log(msg: string): boolean {
  return process.stderr.write(`${msg}\n`);
}

// Root CLI program; the subcommands are registered on it further down.
const program = new Command();
program.name("ollama37-test");
program.description("Scalable test runner with LLM-as-judge for ollama37");
program.version("1.0.0");
|
||||
|
||||
// `run` command: load, filter and order the test cases, execute them, judge the
// results (simple exit-code check, LLM judge, or both) and emit a report.
// Progress is logged to stderr; only the JSON report is written to stdout.
// The process exits with 1 when no test case matches or any test fails,
// otherwise with 0.
program
  .command("run")
  .description("Run test cases")
  .option(
    "-s, --suite <suite>",
    "Run only tests in specified suite (build, runtime, inference)",
  )
  .option("-i, --id <id>", "Run only specified test case by ID")
  .option("-w, --workers <n>", "Number of parallel workers", "1")
  .option("-d, --dry-run", "Show what would be executed without running")
  .option(
    "-o, --output <format>",
    "Output format: console, json, junit",
    "console",
  )
  .option("--report-testlink", "Report results to TestLink")
  .option(
    "--ollama-url <url>",
    "Ollama server URL (test subject)",
    "http://localhost:11434",
  )
  .option(
    "--judge-url <url>",
    "LLM Judge server URL (separate instance)",
    "http://localhost:11435",
  )
  .option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
  .option(
    "--testlink-url <url>",
    "TestLink server URL",
    "http://localhost:8090",
  )
  .option("--testlink-api-key <key>", "TestLink API key")
  .option("--no-llm", "Skip LLM judging, use simple exit code check only")
  .option(
    "--dual-judge",
    "Use both simple and LLM judge (fail if either fails)",
  )
  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
  .action(async (options) => {
    const banner = "=".repeat(60);
    log(banner);
    log("OLLAMA37 TEST RUNNER");
    log(banner);

    const loader = new TestLoader(options.testcasesDir);
    const executor = new TestExecutor(path.join(__dirname, "..", ".."));
    const judge = new LLMJudge(options.judgeUrl, options.judgeModel);

    // Load test cases
    log("\nLoading test cases...");
    let testCases = await loader.loadAll();

    if (options.suite) {
      testCases = testCases.filter((tc) => tc.suite === options.suite);
      log(` Filtered by suite: ${options.suite}`);
    }

    if (options.id) {
      testCases = testCases.filter((tc) => tc.id === options.id);
      log(` Filtered by ID: ${options.id}`);
    }

    // Sort by dependencies
    testCases = loader.sortByDependencies(testCases);

    log(` Found ${testCases.length} test cases`);

    if (testCases.length === 0) {
      log("\nNo test cases found!");
      process.exit(1);
    }

    // Dry run: list what would be executed, then stop without running anything
    if (options.dryRun) {
      log("\nDRY RUN - Would execute:");
      for (const tc of testCases) {
        log(` ${tc.id}: ${tc.name}`);
        for (const step of tc.steps) {
          log(` - ${step.name}: ${step.command}`);
        }
      }
      process.exit(0);
    }

    // Commander hands the option over as a string. Parse it with an explicit
    // radix and never pass NaN or a non-positive worker count to the executor.
    const parsedWorkers = Number.parseInt(options.workers, 10);
    let workers = parsedWorkers;
    if (!Number.isInteger(parsedWorkers) || parsedWorkers < 1) {
      log(` Invalid --workers value "${options.workers}"; falling back to 1`);
      workers = 1;
    }

    // Execute tests (progress goes to stderr via executor)
    const results = await executor.executeAll(testCases, workers);

    // Judge results
    log("\nJudging results...");
    let judgments: Judgment[];
    let simpleJudgments: Judgment[] | undefined;
    let llmJudgments: Judgment[] | undefined;

    if (options.dualJudge) {
      // Dual judge mode: run both simple and LLM, fail if either fails
      log(" Using dual judge mode (simple + LLM)");

      // Simple judge first
      const simpleVerdicts: Judgment[] = results.map((r) =>
        judge.simpleJudge(r),
      );
      log(" Simple judge complete");

      // LLM judge second; if it throws, the simple verdicts stand in for it
      let llmVerdicts: Judgment[];
      try {
        llmVerdicts = await judge.judgeResults(results);
        log(" LLM judge complete");
      } catch (error) {
        log(` LLM judge failed: ${error}`);
        log(" Falling back to simple judge only");
        llmVerdicts = simpleVerdicts;
      }

      // Index LLM verdicts by test ID. The first verdict for an ID wins,
      // which matches a linear first-match search over the array.
      const llmById = new Map<Judgment["testId"], Judgment>();
      for (const verdict of llmVerdicts) {
        if (!llmById.has(verdict.testId)) {
          llmById.set(verdict.testId, verdict);
        }
      }

      // Combine: fail if either judge says fail
      judgments = simpleVerdicts.map((simple) => {
        const llm = llmById.get(simple.testId) ?? simple;
        const pass = simple.pass && llm.pass;

        let reason: string;
        if (pass) {
          reason = llm.reason || simple.reason;
        } else {
          const reasons: string[] = [];
          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
          reason = reasons.join(" | ");
        }

        return {
          testId: simple.testId,
          pass,
          reason,
        };
      });

      simpleJudgments = simpleVerdicts;
      llmJudgments = llmVerdicts;
    } else if (options.llm === false) {
      log(" Using simple exit code check (--no-llm)");
      judgments = results.map((r) => judge.simpleJudge(r));
    } else {
      try {
        judgments = await judge.judgeResults(results);
      } catch (error) {
        log(` LLM judging failed, falling back to simple check: ${error}`);
        judgments = results.map((r) => judge.simpleJudge(r));
      }
    }

    // Create reports (with separate verdicts in dual-judge mode)
    const reports = Reporter.createReports(
      results,
      judgments,
      simpleJudgments,
      llmJudgments,
    );

    // Output results
    switch (options.output) {
      case "json": {
        // JSON goes to stdout (can be redirected to file)
        const json = Reporter.toJSON(reports);
        process.stdout.write(json + "\n");
        break;
      }

      case "junit": {
        const junit = Reporter.toJUnit(reports);
        writeFileSync("test-results.xml", junit);
        log("\nResults written to test-results.xml");
        break;
      }

      case "console":
      default:
        Reporter.toConsole(reports);
        break;
    }

    // Summary
    const passed = reports.filter((r) => r.pass).length;
    const failed = reports.filter((r) => !r.pass).length;
    log("\n" + banner);
    log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
    log(banner);

    // Report to TestLink
    if (options.reportTestlink) {
      if (options.testlinkApiKey) {
        const testlinkReporter = new TestLinkReporter(
          options.testlinkUrl,
          options.testlinkApiKey,
        );
        // Would need plan ID and build ID
        // await testlinkReporter.reportResults(reports, planId, buildId)
        log("\nTestLink reporting not yet implemented");
      } else {
        // Say why nothing is reported instead of ignoring the flag silently.
        log(
          "\n--report-testlink requires --testlink-api-key; skipping TestLink reporting",
        );
      }
    }

    // Exit with appropriate code
    process.exit(failed > 0 ? 1 : 0);
  });
|
||||
|
||||
// `list` command: print every available test case to stdout, grouped by suite.
program
  .command("list")
  .description("List all test cases")
  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
  .action(async (options) => {
    const testLoader = new TestLoader(options.testcasesDir);
    const allCases = await testLoader.loadAll();
    const bySuite = testLoader.groupBySuite(allCases);

    console.log("Available Test Cases:\n");

    for (const [suiteName, suiteCases] of bySuite) {
      // Suite heading, then one line per test case, then a blank separator line.
      console.log(`${suiteName.toUpperCase()}:`);
      for (const testCase of suiteCases) {
        console.log(` ${testCase.id}: ${testCase.name}`);
      }
      console.log();
    }

    console.log(`Total: ${allCases.length} test cases`);
  });

// Parse the process arguments and dispatch to the matching subcommand.
program.parse();
|
||||
|
||||
@@ -1,31 +1,71 @@
|
||||
import axios from 'axios'
|
||||
import { TestResult, Judgment } from './types.js'
|
||||
import axios from "axios";
|
||||
import { TestResult, Judgment } from "./types.js";
|
||||
|
||||
export class LLMJudge {
|
||||
// Number of test results sent to the LLM in one request.
private batchSize: number = 5; // Judge 5 tests per LLM call

/**
 * @param ollamaUrl Base URL of the Ollama server that answers the judge
 *   requests (defaults to "http://localhost:11434").
 * @param model Name of the model used for judging (defaults to "gemma3:4b").
 */
constructor(
  private ollamaUrl: string = "http://localhost:11434",
  private model: string = "gemma3:4b",
) {}
|
||||
|
||||
/**
 * Render a millisecond duration as a short human-readable string.
 *
 * - below one second: whole milliseconds, e.g. "250ms"
 * - below one minute: seconds with one decimal, e.g. "12.3s"
 * - otherwise: minutes with one decimal, e.g. "4.5min"
 *
 * The unit is chosen after rounding, so a value just under a boundary moves
 * up to the next unit instead of rendering as "1000ms" or "60.0s".
 *
 * @param ms Duration in milliseconds.
 * @returns The formatted duration, or "unknown" when `ms` is not a finite
 *   number (for example when a timeout was never configured).
 */
private formatDuration(ms: number): string {
  if (!Number.isFinite(ms)) return "unknown";

  const wholeMs = Math.round(ms);
  if (wholeMs < 1000) return `${wholeMs}ms`;

  // Round to tenths of a second first, then decide whether it still fits "s".
  const seconds = Math.round(ms / 100) / 10;
  if (seconds < 60) return `${seconds.toFixed(1)}s`;

  return `${(ms / 60000).toFixed(1)}min`;
}
|
||||
|
||||
private buildPrompt(results: TestResult[]): string {
|
||||
const testsSection = results.map((r, i) => {
|
||||
return `
|
||||
const testsSection = results
|
||||
.map((r, i) => {
|
||||
// Build step results summary with exit codes and durations
|
||||
const stepsSummary = r.steps
|
||||
.map((step, j) => {
|
||||
const status = step.exitCode === 0 ? "PASS" : "FAIL";
|
||||
const stepTimeout =
|
||||
r.testCase.steps[j]?.timeout || r.testCase.timeout;
|
||||
return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
|
||||
})
|
||||
.join("\n");
|
||||
|
||||
// Simple judge result
|
||||
const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
|
||||
const simpleResult = allStepsPassed ? "PASS" : "FAIL";
|
||||
|
||||
// Check if duration is within timeout
|
||||
const timeoutMs = r.testCase.timeout;
|
||||
const withinTimeout = r.totalDuration < timeoutMs;
|
||||
const timeoutNote = withinTimeout
|
||||
? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
|
||||
: `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
|
||||
|
||||
return `
|
||||
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
|
||||
|
||||
**Criteria:**
|
||||
${r.testCase.criteria}
|
||||
|
||||
**Step Results:**
|
||||
${stepsSummary}
|
||||
|
||||
**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
|
||||
|
||||
**Timing:** ${timeoutNote}
|
||||
${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
|
||||
|
||||
**Execution Logs:**
|
||||
\`\`\`
|
||||
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
|
||||
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
|
||||
\`\`\`
|
||||
`
|
||||
}).join('\n---\n')
|
||||
`;
|
||||
})
|
||||
.join("\n---\n");
|
||||
|
||||
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
|
||||
|
||||
@@ -46,66 +86,74 @@ Important:
|
||||
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
|
||||
- For build/runtime tests, check exit codes and absence of error messages
|
||||
- Be lenient with formatting differences, focus on semantic correctness
|
||||
- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
|
||||
- Long durations are acceptable as long as they are within the configured timeout
|
||||
|
||||
Respond ONLY with the JSON array, no other text.`
|
||||
Respond ONLY with the JSON array, no other text.`;
|
||||
}
|
||||
|
||||
/**
 * Judge all test results with the LLM, `batchSize` results per request.
 *
 * A batch whose request or response parsing fails does not abort the run:
 * every test in that batch gets a failing judgment carrying the error text.
 *
 * Progress is written to stderr (not stdout) so that a report written to
 * stdout, such as the JSON output, is not mixed with progress lines.
 *
 * @param results Executed test results to judge.
 * @returns The judgments of all batches, appended in batch order.
 */
async judgeResults(results: TestResult[]): Promise<Judgment[]> {
  const allJudgments: Judgment[] = [];
  const totalBatches = Math.ceil(results.length / this.batchSize);

  // Process in batches
  for (let start = 0; start < results.length; start += this.batchSize) {
    const batch = results.slice(start, start + this.batchSize);
    const batchNumber = Math.floor(start / this.batchSize) + 1;
    console.error(` Judging batch ${batchNumber}/${totalBatches}...`);

    try {
      const judgments = await this.judgeBatch(batch);
      allJudgments.push(...judgments);
    } catch (error) {
      console.error(` Failed to judge batch:`, error);
      // Mark all tests in batch as failed
      for (const r of batch) {
        allJudgments.push({
          testId: r.testCase.id,
          pass: false,
          reason: "LLM judgment failed: " + String(error),
        });
      }
    }
  }

  return allJudgments;
}
|
||||
|
||||
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
|
||||
const prompt = this.buildPrompt(results)
|
||||
const prompt = this.buildPrompt(results);
|
||||
|
||||
const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
|
||||
model: this.model,
|
||||
prompt,
|
||||
stream: false,
|
||||
options: {
|
||||
temperature: 0.1, // Low temperature for consistent judging
|
||||
num_predict: 1000
|
||||
}
|
||||
}, {
|
||||
timeout: 120000 // 2 minute timeout
|
||||
})
|
||||
const response = await axios.post(
|
||||
`${this.ollamaUrl}/api/generate`,
|
||||
{
|
||||
model: this.model,
|
||||
prompt,
|
||||
stream: false,
|
||||
options: {
|
||||
temperature: 0.1, // Low temperature for consistent judging
|
||||
num_predict: 1000,
|
||||
},
|
||||
},
|
||||
{
|
||||
timeout: 120000, // 2 minute timeout
|
||||
},
|
||||
);
|
||||
|
||||
const responseText = response.data.response
|
||||
const responseText = response.data.response;
|
||||
|
||||
// Extract JSON from response
|
||||
const jsonMatch = responseText.match(/\[[\s\S]*\]/)
|
||||
const jsonMatch = responseText.match(/\[[\s\S]*\]/);
|
||||
if (!jsonMatch) {
|
||||
throw new Error('No JSON array found in LLM response')
|
||||
throw new Error("No JSON array found in LLM response");
|
||||
}
|
||||
|
||||
try {
|
||||
const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
|
||||
const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
|
||||
|
||||
// Validate and fill missing
|
||||
const resultIds = results.map(r => r.testCase.id)
|
||||
const judgedIds = new Set(judgments.map(j => j.testId))
|
||||
const resultIds = results.map((r) => r.testCase.id);
|
||||
const judgedIds = new Set(judgments.map((j) => j.testId));
|
||||
|
||||
// Add missing judgments
|
||||
for (const id of resultIds) {
|
||||
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
|
||||
judgments.push({
|
||||
testId: id,
|
||||
pass: false,
|
||||
reason: 'No judgment provided by LLM'
|
||||
})
|
||||
reason: "No judgment provided by LLM",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return judgments
|
||||
return judgments;
|
||||
} catch (parseError) {
|
||||
throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
|
||||
throw new Error(
|
||||
`Failed to parse LLM response: ${responseText.substring(0, 200)}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: Simple rule-based judgment (no LLM)
|
||||
/**
 * Rule-based judgment that needs no LLM: a test passes when none of its
 * steps has a non-zero exit code.
 *
 * @param result Executed test result to judge.
 * @returns A passing judgment, or a failing one naming the failed steps.
 */
simpleJudge(result: TestResult): Judgment {
  const testId = result.testCase.id;
  const failedNames = result.steps
    .filter((step) => step.exitCode !== 0)
    .map((step) => step.name);

  if (failedNames.length > 0) {
    return {
      testId,
      pass: false,
      reason: `Steps failed: ${failedNames.join(", ")}`,
    };
  }

  return {
    testId,
    pass: true,
    reason: "All steps completed with exit code 0",
  };
}
|
||||
}
|
||||
|
||||
@@ -1,112 +1,204 @@
|
||||
import axios from 'axios'
|
||||
import { TestReport, Judgment, TestResult } from './types.js'
|
||||
import axios from "axios";
|
||||
import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
|
||||
|
||||
/**
 * Combines test results with judge verdicts into reports and renders those
 * reports as console text, JSON, or JUnit XML.
 */
export class Reporter {
  /** ANSI-colored PASS/FAIL label used by the console reporter. */
  private static statusLabel(pass: boolean): string {
    return pass ? "\x1b[32mPASS\x1b[0m" : "\x1b[31mFAIL\x1b[0m";
  }

  /** True when at least one report carries both a simple and an LLM verdict. */
  private static hasDualJudge(reports: TestReport[]): boolean {
    return reports.some(
      (r) => r.simplePass !== undefined && r.llmPass !== undefined,
    );
  }

  /**
   * Make arbitrary text safe for XML attribute values and element content.
   * ANSI color sequences and control characters that XML 1.0 forbids are
   * removed first; then the five predefined entities are escaped, with `&`
   * handled first so the other entities are not escaped twice.
   */
  private static escapeXml(text: string): string {
    return text
      .replace(/\x1b\[[0-9;]*[A-Za-z]/g, "")
      .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, "")
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;");
  }

  /**
   * Console reporter: prints one entry per report followed by a summary.
   * When dual-judge data is present, the simple and LLM verdicts are shown
   * separately, both per test and in the summary.
   */
  static toConsole(reports: TestReport[]): void {
    console.log("\n" + "=".repeat(60));
    console.log("TEST RESULTS");
    console.log("=".repeat(60));

    const passed = reports.filter((r) => r.pass);
    const failed = reports.filter((r) => !r.pass);
    const hasDualJudge = Reporter.hasDualJudge(reports);

    for (const report of reports) {
      console.log(
        `[${Reporter.statusLabel(report.pass)}] ${report.testId}: ${report.name}`,
      );

      // Show separate verdicts in dual-judge mode
      if (
        hasDualJudge &&
        report.simplePass !== undefined &&
        report.llmPass !== undefined
      ) {
        console.log(
          ` Simple: [${Reporter.statusLabel(report.simplePass)}] ${report.simpleReason || ""}`,
        );
        console.log(
          ` LLM: [${Reporter.statusLabel(report.llmPass)}] ${report.llmReason || ""}`,
        );
      } else {
        console.log(` Reason: ${report.reason}`);
      }
      console.log(` Duration: ${report.duration}ms`);
    }

    console.log("\n" + "-".repeat(60));

    // Show separate summaries in dual-judge mode
    if (hasDualJudge) {
      const simplePassed = reports.filter((r) => r.simplePass).length;
      const simpleFailed = reports.filter((r) => !r.simplePass).length;
      const llmPassed = reports.filter((r) => r.llmPass).length;
      const llmFailed = reports.filter((r) => !r.llmPass).length;

      console.log(`Simple: ${simplePassed} passed, ${simpleFailed} failed`);
      console.log(`LLM: ${llmPassed} passed, ${llmFailed} failed`);
      console.log(
        `Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
      );
    } else {
      console.log(
        `Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
      );
    }
    console.log("=".repeat(60));
  }

  /**
   * JSON reporter.
   *
   * @returns A pretty-printed JSON document with a `summary` (totals and a
   *   timestamp, plus `simple`/`llm` breakdowns in dual-judge mode) and the
   *   full `results` array.
   */
  static toJSON(reports: TestReport[]): string {
    const summary: TestSummary = {
      total: reports.length,
      passed: reports.filter((r) => r.pass).length,
      failed: reports.filter((r) => !r.pass).length,
      timestamp: new Date().toISOString(),
    };

    // Add separate breakdowns in dual-judge mode
    if (Reporter.hasDualJudge(reports)) {
      summary.simple = {
        passed: reports.filter((r) => r.simplePass).length,
        failed: reports.filter((r) => !r.simplePass).length,
      };
      summary.llm = {
        passed: reports.filter((r) => r.llmPass).length,
        failed: reports.filter((r) => !r.llmPass).length,
      };
    }

    return JSON.stringify(
      {
        summary,
        results: reports,
      },
      null,
      2,
    );
  }

  /**
   * JUnit XML reporter (for CI/CD integration).
   *
   * Every interpolated text value is XML-escaped. A failed test gets a
   * `<failure>` element whose message is the judgment reason and whose body
   * is the first 1000 characters of the logs. Times are in seconds.
   */
  static toJUnit(reports: TestReport[]): string {
    const esc = Reporter.escapeXml;

    const testcases = reports
      .map((r) => {
        const attrs = `name="${esc(r.testId)}: ${esc(r.name)}" classname="${esc(r.suite)}" time="${r.duration / 1000}"`;
        if (r.pass) {
          return ` <testcase ${attrs}/>`;
        }
        return [
          ` <testcase ${attrs}>`,
          ` <failure message="${esc(r.reason)}">${esc(r.logs.substring(0, 1000))}</failure>`,
          ` </testcase>`,
        ].join("\n");
      })
      .join("\n");

    const failures = reports.filter((r) => !r.pass).length;
    const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;

    return [
      `<?xml version="1.0" encoding="UTF-8"?>`,
      `<testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">`,
      testcases,
      `</testsuite>`,
    ].join("\n");
  }

  /**
   * Combine results and judgments into reports.
   *
   * @param results Executed test results; one report is produced per result.
   * @param judgments Final verdicts, matched to results by test ID. A result
   *   without a verdict is reported as failed with reason "No judgment".
   * @param simpleJudgments Optional simple-judge verdicts (dual-judge mode).
   * @param llmJudgments Optional LLM-judge verdicts (dual-judge mode).
   * @returns Reports in the order of `results`. The `simple*`/`llm*` fields
   *   are set only when both separate verdicts exist for the test.
   */
  static createReports(
    results: TestResult[],
    judgments: Judgment[],
    simpleJudgments?: Judgment[],
    llmJudgments?: Judgment[],
  ): TestReport[] {
    const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
    const simpleMap = simpleJudgments
      ? new Map(simpleJudgments.map((j) => [j.testId, j]))
      : undefined;
    const llmMap = llmJudgments
      ? new Map(llmJudgments.map((j) => [j.testId, j]))
      : undefined;

    return results.map((result) => {
      const judgment = judgmentMap.get(result.testCase.id);
      const simple = simpleMap?.get(result.testCase.id);
      const llm = llmMap?.get(result.testCase.id);

      const report: TestReport = {
        testId: result.testCase.id,
        name: result.testCase.name,
        suite: result.testCase.suite,
        pass: judgment?.pass ?? false,
        reason: judgment?.reason ?? "No judgment",
        duration: result.totalDuration,
        logs: result.logs,
      };

      // Add separate verdicts if available (dual-judge mode)
      if (simple && llm) {
        report.simplePass = simple.pass;
        report.simpleReason = simple.reason;
        report.llmPass = llm.pass;
        report.llmReason = llm.reason;
      }

      return report;
    });
  }
}
|
||||
|
||||
// TestLink reporter
|
||||
export class TestLinkReporter {
|
||||
/**
 * @param url TestLink server URL.
 * @param apiKey TestLink API key.
 */
constructor(
  private url: string,
  private apiKey: string,
) {}
|
||||
|
||||
/**
 * Report every test outcome to TestLink, one at a time and in order.
 *
 * A failure while reporting one test is logged and does not stop the
 * remaining reports from being attempted.
 *
 * @param reports - Test reports to submit.
 * @param planId - Test plan identifier forwarded with each report.
 * @param buildId - Build identifier forwarded with each report.
 */
async reportResults(
  reports: TestReport[],
  planId: string,
  buildId: string,
): Promise<void> {
  console.log("\nReporting to TestLink...");

  for (const report of reports) {
    try {
      await this.reportTestExecution(report, planId, buildId);
      console.log(` Reported: ${report.testId}`);
    } catch (error) {
      // Best effort: log and continue so one bad report does not block the rest.
      console.error(` Failed to report ${report.testId}:`, error);
    }
  }
}
|
||||
@@ -114,12 +206,12 @@ export class TestLinkReporter {
|
||||
private async reportTestExecution(
|
||||
report: TestReport,
|
||||
planId: string,
|
||||
buildId: string
|
||||
buildId: string,
|
||||
): Promise<void> {
|
||||
// Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
|
||||
// This would need to be mapped from TestLink
|
||||
|
||||
const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
|
||||
const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
|
||||
|
||||
// Note: This uses the TestLink XML-RPC API
|
||||
// In practice, you'd use the testlink-mcp or direct API calls
|
||||
@@ -129,10 +221,10 @@ export class TestLinkReporter {
|
||||
testplanid: planId,
|
||||
buildid: buildId,
|
||||
status,
|
||||
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
|
||||
}
|
||||
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
|
||||
};
|
||||
|
||||
// For now, just log - actual implementation would call TestLink API
|
||||
console.log(` Would report: ${report.testId} = ${status}`)
|
||||
console.log(` Would report: ${report.testId} = ${status}`);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,66 +1,88 @@
|
||||
// Test case definition
|
||||
/** A single named command that makes up part of a test case. */
export interface TestStep {
  /** Human-readable step name. */
  name: string;
  /** Command line to run for this step. */
  command: string;
  /** Optional per-step timeout (unit not shown here; confirm against the executor). */
  timeout?: number;
}
|
||||
|
||||
/** Definition of one test case: identity, scheduling metadata, steps and pass criteria. */
export interface TestCase {
  /** Unique test id; judgments and reports are matched on this value. */
  id: string;
  /** Display name of the test. */
  name: string;
  /** Suite the test belongs to. */
  suite: string;
  priority: number;
  /** Timeout for the whole test case (unit not shown here; confirm against the executor). */
  timeout: number;
  /** Presumably ids of tests this one depends on; verify against the loader. */
  dependencies: string[];
  /** Steps that make up the test. */
  steps: TestStep[];
  /** Pass criteria text for the judge. */
  criteria: string;
}
|
||||
|
||||
// Execution results
|
||||
/** Captured outcome of running one test step. */
export interface StepResult {
  /** Name of the step that was run. */
  name: string;
  /** Command that was executed. */
  command: string;
  /** Captured standard output. */
  stdout: string;
  /** Captured standard error. */
  stderr: string;
  /** Process exit code of the command. */
  exitCode: number;
  /** Time the step took (presumably milliseconds; confirm against the executor). */
  duration: number;
}
|
||||
|
||||
/** Aggregated execution result for one test case. */
export interface TestResult {
  /** The test case that was executed. */
  testCase: TestCase;
  /** Per-step outcomes. */
  steps: StepResult[];
  /** Total duration of the test; copied into `TestReport.duration`. */
  totalDuration: number;
  /** Log text for the test; copied into `TestReport.logs`. */
  logs: string;
}
|
||||
|
||||
// LLM judgment
|
||||
/** A judge's verdict for one test. */
export interface Judgment {
  /** Id of the test this verdict applies to. */
  testId: string;
  /** Whether the judge considers the test passed. */
  pass: boolean;
  /** Explanation for the verdict. */
  reason: string;
}
|
||||
|
||||
// Final report
|
||||
/** Final per-test report combining execution data with the verdict. */
export interface TestReport {
  testId: string;
  name: string;
  suite: string;
  /** Final verdict for the test. */
  pass: boolean;
  /** Explanation for the final verdict. */
  reason: string;
  /** Test duration; written out with an "ms" suffix in the TestLink notes. */
  duration: number;
  logs: string;
  // Separate verdicts for dual-judge mode
  simplePass?: boolean;
  simpleReason?: string;
  llmPass?: boolean;
  llmReason?: string;
}
|
||||
|
||||
// Summary with separate judge breakdowns
|
||||
/** Aggregate pass/fail counts for a run, with optional per-judge breakdowns. */
export interface TestSummary {
  total: number;
  passed: number;
  failed: number;
  timestamp: string;
  // Separate breakdowns (only present in dual-judge mode)
  simple?: {
    passed: number;
    failed: number;
  };
  llm?: {
    passed: number;
    failed: number;
  };
}
|
||||
|
||||
// Runner options
|
||||
/** Options that configure a test runner invocation. */
export interface RunnerOptions {
  /** Optional suite selector. */
  suite?: string;
  /** Optional test id selector. */
  id?: string;
  workers: number;
  dryRun: boolean;
  /** Output format. */
  output: "console" | "json" | "junit";
  /** Whether results should be reported to TestLink. */
  reportTestlink: boolean;
  ollamaUrl: string;
  ollamaModel: string;
  testlinkUrl: string;
  testlinkApiKey: string;
}
|
||||
|
||||
Reference in New Issue
Block a user