diff --git a/tests/src/cli.ts b/tests/src/cli.ts
index d4086ffc..cfca48d2 100644
--- a/tests/src/cli.ts
+++ b/tests/src/cli.ts
@@ -1,216 +1,244 @@
 #!/usr/bin/env node
-import { Command } from 'commander'
-import { writeFileSync } from 'fs'
-import path from 'path'
-import { fileURLToPath } from 'url'
-import { TestLoader } from './loader.js'
-import { TestExecutor } from './executor.js'
-import { LLMJudge } from './judge.js'
-import { Reporter, TestLinkReporter } from './reporter.js'
-import { RunnerOptions } from './types.js'
+import { Command } from "commander";
+import { writeFileSync } from "fs";
+import path from "path";
+import { fileURLToPath } from "url";
+import { TestLoader } from "./loader.js";
+import { TestExecutor } from "./executor.js";
+import { LLMJudge } from "./judge.js";
+import { Reporter, TestLinkReporter } from "./reporter.js";
+import { RunnerOptions, Judgment } from "./types.js";
 
-const __dirname = path.dirname(fileURLToPath(import.meta.url))
-const defaultTestcasesDir = path.join(__dirname, '..', 'testcases')
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
 
 // Progress output to stderr (visible in console even when stdout is redirected)
-const log = (msg: string) => process.stderr.write(msg + '\n')
+const log = (msg: string) => process.stderr.write(msg + "\n");
 
-const program = new Command()
+const program = new Command();
 
 program
-  .name('ollama37-test')
-  .description('Scalable test runner with LLM-as-judge for ollama37')
-  .version('1.0.0')
+  .name("ollama37-test")
+  .description("Scalable test runner with LLM-as-judge for ollama37")
+  .version("1.0.0");
 
 program
-  .command('run')
-  .description('Run test cases')
-  .option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)')
-  .option('-i, --id <id>', 'Run only specified test case by ID')
-  .option('-w, --workers <count>', 'Number of parallel workers', '1')
-  .option('-d, --dry-run', 'Show what would be executed without running')
-  .option('-o, --output <format>', 'Output format: console, json, junit', 'console')
-  .option('--report-testlink', 'Report results to TestLink')
-  .option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434')
-  .option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435')
-  .option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b')
-  .option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
-  .option('--testlink-api-key <key>', 'TestLink API key')
-  .option('--no-llm', 'Skip LLM judging, use simple exit code check only')
-  .option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)')
-  .option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
+  .command("run")
+  .description("Run test cases")
+  .option(
+    "-s, --suite <suite>",
+    "Run only tests in specified suite (build, runtime, inference)",
+  )
+  .option("-i, --id <id>", "Run only specified test case by ID")
+  .option("-w, --workers <count>", "Number of parallel workers", "1")
+  .option("-d, --dry-run", "Show what would be executed without running")
+  .option(
+    "-o, --output <format>",
+    "Output format: console, json, junit",
+    "console",
+  )
+  .option("--report-testlink", "Report results to TestLink")
+  .option(
+    "--ollama-url <url>",
+    "Ollama server URL (test subject)",
+    "http://localhost:11434",
+  )
+  .option(
+    "--judge-url <url>",
+    "LLM Judge server URL (separate instance)",
+    "http://localhost:11435",
+  )
+  .option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
+  .option(
+    "--testlink-url <url>",
+    "TestLink server URL",
+    "http://localhost:8090",
+  )
+  .option("--testlink-api-key <key>", "TestLink API key")
+  .option("--no-llm", "Skip LLM judging, use simple exit code check only")
+  .option(
+    "--dual-judge",
+    "Use both simple and LLM judge (fail if either fails)",
+  )
+  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
   .action(async (options) => {
-    log('='.repeat(60))
-    log('OLLAMA37 TEST RUNNER')
-    log('='.repeat(60))
+    log("=".repeat(60));
+    log("OLLAMA37 TEST RUNNER");
+    log("=".repeat(60));
 
-    const loader = new TestLoader(options.testcasesDir)
-    const executor = new TestExecutor(path.join(__dirname, '..', '..'))
-    const judge = new LLMJudge(options.judgeUrl, options.judgeModel)
+    const loader = new TestLoader(options.testcasesDir);
+    const executor = new TestExecutor(path.join(__dirname, "..", ".."));
+    const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
 
     // Load test cases
-    log('\nLoading test cases...')
-    let testCases = await loader.loadAll()
+    log("\nLoading test cases...");
+    let testCases = await loader.loadAll();
 
     if (options.suite) {
-      testCases = testCases.filter(tc => tc.suite === options.suite)
-      log(`  Filtered by suite: ${options.suite}`)
+      testCases = testCases.filter((tc) => tc.suite === options.suite);
+      log(`  Filtered by suite: ${options.suite}`);
     }
 
     if (options.id) {
-      testCases = testCases.filter(tc => tc.id === options.id)
-      log(`  Filtered by ID: ${options.id}`)
+      testCases = testCases.filter((tc) => tc.id === options.id);
+      log(`  Filtered by ID: ${options.id}`);
     }
 
     // Sort by dependencies
-    testCases = loader.sortByDependencies(testCases)
+    testCases = loader.sortByDependencies(testCases);
 
-    log(`  Found ${testCases.length} test cases`)
+    log(`  Found ${testCases.length} test cases`);
 
     if (testCases.length === 0) {
-      log('\nNo test cases found!')
-      process.exit(1)
+      log("\nNo test cases found!");
+      process.exit(1);
     }
 
     // Dry run
     if (options.dryRun) {
-      log('\nDRY RUN - Would execute:')
+      log("\nDRY RUN - Would execute:");
       for (const tc of testCases) {
-        log(`  ${tc.id}: ${tc.name}`)
+        log(`  ${tc.id}: ${tc.name}`);
         for (const step of tc.steps) {
-          log(`    - ${step.name}: ${step.command}`)
+          log(`    - ${step.name}: ${step.command}`);
        }
       }
-      process.exit(0)
+      process.exit(0);
     }
 
     // Execute tests (progress goes to stderr via executor)
-    const workers = parseInt(options.workers)
-    const results = await executor.executeAll(testCases, workers)
+    const workers = parseInt(options.workers);
+    const results = await executor.executeAll(testCases, workers);
 
     // Judge results
-    log('\nJudging results...')
-    let judgments
+    log("\nJudging results...");
+    let judgments: Judgment[];
+    let simpleJudgments: Judgment[] | undefined;
+    let llmJudgments: Judgment[] | undefined;
 
     if (options.dualJudge) {
       // Dual judge mode: run both simple and LLM, fail if either fails
-      log('  Using dual judge mode (simple + LLM)')
+      log("  Using dual judge mode (simple + LLM)");
 
       // Simple judge first
-      const simpleJudgments = results.map(r => judge.simpleJudge(r))
-      log('  Simple judge complete')
+      simpleJudgments = results.map((r) => judge.simpleJudge(r));
+      log("  Simple judge complete");
 
       // LLM judge second
-      let llmJudgments
       try {
-        llmJudgments = await judge.judgeResults(results)
-        log('  LLM judge complete')
+        llmJudgments = await judge.judgeResults(results);
+        log("  LLM judge complete");
       } catch (error) {
-        log(`  LLM judge failed: ${error}`)
-        log('  Falling back to simple judge only')
-        llmJudgments = simpleJudgments
+        log(`  LLM judge failed: ${error}`);
+        log("  Falling back to simple judge only");
+        llmJudgments = simpleJudgments;
       }
 
       // Combine: fail if either judge says fail
       judgments = simpleJudgments.map((simple, i) => {
-        const llm = llmJudgments.find(j => j.testId === simple.testId) || simple
-        const pass = simple.pass && llm.pass
+        const llm =
+          llmJudgments!.find((j) => j.testId === simple.testId) || simple;
+        const pass = simple.pass && llm.pass;
 
-        let reason = ''
+        let reason = "";
         if (!pass) {
-          const reasons = []
-          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`)
-          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`)
-          reason = reasons.join(' | ')
+          const reasons = [];
+          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
+          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
+          reason = reasons.join(" | ");
         } else {
-          reason = llm.reason || simple.reason
+          reason = llm.reason || simple.reason;
         }
 
         return {
           testId: simple.testId,
           pass,
-          reason
-        }
-      })
-
+          reason,
+        };
+      });
     } else if (options.llm === false) {
-      log('  Using simple exit code check (--no-llm)')
-      judgments = results.map(r => judge.simpleJudge(r))
+      log("  Using simple exit code check (--no-llm)");
+      judgments = results.map((r) => judge.simpleJudge(r));
     } else {
       try {
-        judgments = await judge.judgeResults(results)
+        judgments = await judge.judgeResults(results);
       } catch (error) {
-        log(`  LLM judging failed, falling back to simple check: ${error}`)
-        judgments = results.map(r => judge.simpleJudge(r))
+        log(`  LLM judging failed, falling back to simple check: ${error}`);
+        judgments = results.map((r) => judge.simpleJudge(r));
       }
     }
 
-    // Create reports
-    const reports = Reporter.createReports(results, judgments)
+    // Create reports (with separate verdicts in dual-judge mode)
+    const reports = Reporter.createReports(
+      results,
+      judgments,
+      simpleJudgments,
+      llmJudgments,
+    );
 
     // Output results
     switch (options.output) {
-      case 'json':
-        const json = Reporter.toJSON(reports)
+      case "json":
+        const json = Reporter.toJSON(reports);
         // JSON goes to stdout (can be redirected to file)
-        process.stdout.write(json + '\n')
-        break
+        process.stdout.write(json + "\n");
+        break;
 
-      case 'junit':
-        const junit = Reporter.toJUnit(reports)
-        writeFileSync('test-results.xml', junit)
-        log('\nResults written to test-results.xml')
-        break
+      case "junit":
+        const junit = Reporter.toJUnit(reports);
+        writeFileSync("test-results.xml", junit);
+        log("\nResults written to test-results.xml");
+        break;
 
-      case 'console':
+      case "console":
       default:
-        Reporter.toConsole(reports)
-        break
+        Reporter.toConsole(reports);
+        break;
     }
 
     // Summary
-    const passed = reports.filter(r => r.pass).length
-    const failed = reports.filter(r => !r.pass).length
-    log('\n' + '='.repeat(60))
-    log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`)
-    log('='.repeat(60))
+    const passed = reports.filter((r) => r.pass).length;
+    const failed = reports.filter((r) => !r.pass).length;
+    log("\n" + "=".repeat(60));
+    log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
+    log("=".repeat(60));
 
     // Report to TestLink
    if (options.reportTestlink && options.testlinkApiKey) {
       const testlinkReporter = new TestLinkReporter(
         options.testlinkUrl,
-        options.testlinkApiKey
-      )
+        options.testlinkApiKey,
+      );
       // Would need plan ID and build ID
       // await testlinkReporter.reportResults(reports, planId, buildId)
-      log('\nTestLink reporting not yet implemented')
+      log("\nTestLink reporting not yet implemented");
     }
 
     // Exit with appropriate code
-    process.exit(failed > 0 ? 1 : 0)
-  })
+    process.exit(failed > 0 ? 1 : 0);
+  });
 
 program
-  .command('list')
-  .description('List all test cases')
-  .option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
+  .command("list")
+  .description("List all test cases")
+  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
   .action(async (options) => {
-    const loader = new TestLoader(options.testcasesDir)
-    const testCases = await loader.loadAll()
+    const loader = new TestLoader(options.testcasesDir);
+    const testCases = await loader.loadAll();
 
-    const grouped = loader.groupBySuite(testCases)
+    const grouped = loader.groupBySuite(testCases);
 
-    console.log('Available Test Cases:\n')
+    console.log("Available Test Cases:\n");
     for (const [suite, cases] of grouped) {
-      console.log(`${suite.toUpperCase()}:`)
+      console.log(`${suite.toUpperCase()}:`);
       for (const tc of cases) {
-        console.log(`  ${tc.id}: ${tc.name}`)
+        console.log(`  ${tc.id}: ${tc.name}`);
       }
-      console.log()
+      console.log();
     }
 
-    console.log(`Total: ${testCases.length} test cases`)
-  })
+    console.log(`Total: ${testCases.length} test cases`);
+  });
 
-program.parse()
+program.parse();
diff --git a/tests/src/judge.ts b/tests/src/judge.ts
index 78ac4489..8d26ded8 100644
--- a/tests/src/judge.ts
+++ b/tests/src/judge.ts
@@ -1,31 +1,71 @@
-import axios from 'axios'
-import { TestResult, Judgment } from './types.js'
+import axios from "axios";
+import { TestResult, Judgment } from "./types.js";
 
 export class LLMJudge {
-  private ollamaUrl: string
-  private model: string
-  private batchSize: number
+  private ollamaUrl: string;
+  private model: string;
+  private batchSize: number;
 
-  constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
-    this.ollamaUrl = ollamaUrl
-    this.model = model
-    this.batchSize = 5 // Judge 5 tests per LLM call
+  constructor(
+    ollamaUrl: string = "http://localhost:11434",
+    model: string = "gemma3:4b",
+  ) {
+    this.ollamaUrl = ollamaUrl;
+    this.model = model;
+    this.batchSize = 5; // Judge 5 tests per LLM call
+  }
+
+  private formatDuration(ms: number): string {
+    if (ms < 1000) return `${ms}ms`;
+    if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
+    return `${(ms / 60000).toFixed(1)}min`;
   }
 
   private buildPrompt(results: TestResult[]): string {
-    const testsSection = results.map((r, i) => {
-      return `
+    const testsSection = results
+      .map((r, i) => {
+        // Build step results summary with exit codes and durations
+        const stepsSummary = r.steps
+          .map((step, j) => {
+            const status = step.exitCode === 0 ? "PASS" : "FAIL";
+            const stepTimeout =
+              r.testCase.steps[j]?.timeout || r.testCase.timeout;
+            return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
+          })
+          .join("\n");
+
+        // Simple judge result
+        const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
+        const simpleResult = allStepsPassed ? "PASS" : "FAIL";
+
+        // Check if duration is within timeout
+        const timeoutMs = r.testCase.timeout;
+        const withinTimeout = r.totalDuration < timeoutMs;
+        const timeoutNote = withinTimeout
+          ? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
+          : `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
+
+        return `
 ### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
 **Criteria:** ${r.testCase.criteria}
 
+**Step Results:**
+${stepsSummary}
+
+**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
+
+**Timing:** ${timeoutNote}
+${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
+
 **Execution Logs:**
 \`\`\`
-${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
+${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
 \`\`\`
-`
-    }).join('\n---\n')
+`;
+      })
+      .join("\n---\n");
 
     return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@
 Important:
 - For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
 - For build/runtime tests, check exit codes and absence of error messages
 - Be lenient with formatting differences, focus on semantic correctness
+- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
+- Long durations are acceptable as long as they are within the configured timeout
 
-Respond ONLY with the JSON array, no other text.`
+Respond ONLY with the JSON array, no other text.`;
   }
 
   async judgeResults(results: TestResult[]): Promise<Judgment[]> {
-    const allJudgments: Judgment[] = []
+    const allJudgments: Judgment[] = [];
 
     // Process in batches
     for (let i = 0; i < results.length; i += this.batchSize) {
-      const batch = results.slice(i, i + this.batchSize)
-      console.log(`  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
+      const batch = results.slice(i, i + this.batchSize);
+      console.log(
+        `  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
+      );
 
       try {
-        const judgments = await this.judgeBatch(batch)
-        allJudgments.push(...judgments)
+        const judgments = await this.judgeBatch(batch);
+        allJudgments.push(...judgments);
       } catch (error) {
-        console.error(`  Failed to judge batch:`, error)
+        console.error(`  Failed to judge batch:`, error);
         // Mark all tests in batch as failed
         for (const r of batch) {
           allJudgments.push({
             testId: r.testCase.id,
             pass: false,
-            reason: 'LLM judgment failed: ' + String(error)
-          })
+            reason: "LLM judgment failed: " + String(error),
+          });
         }
       }
     }
 
-    return allJudgments
+    return allJudgments;
   }
 
   private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
-    const prompt = this.buildPrompt(results)
+    const prompt = this.buildPrompt(results);
 
-    const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
-      model: this.model,
-      prompt,
-      stream: false,
-      options: {
-        temperature: 0.1, // Low temperature for consistent judging
-        num_predict: 1000
-      }
-    }, {
-      timeout: 120000 // 2 minute timeout
-    })
+    const response = await axios.post(
+      `${this.ollamaUrl}/api/generate`,
+      {
+        model: this.model,
+        prompt,
+        stream: false,
+        options: {
+          temperature: 0.1, // Low temperature for consistent judging
+          num_predict: 1000,
+        },
+      },
+      {
+        timeout: 120000, // 2 minute timeout
+      },
+    );
 
-    const responseText = response.data.response
+    const responseText = response.data.response;
 
     // Extract JSON from response
-    const jsonMatch = responseText.match(/\[[\s\S]*\]/)
+    const jsonMatch = responseText.match(/\[[\s\S]*\]/);
     if (!jsonMatch) {
-      throw new Error('No JSON array found in LLM response')
+      throw new Error("No JSON array found in LLM response");
     }
 
     try {
-      const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
+      const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
 
       // Validate and fill missing
-      const resultIds = results.map(r => r.testCase.id)
-      const judgedIds = new Set(judgments.map(j => j.testId))
+      const resultIds = results.map((r) => r.testCase.id);
+      const judgedIds = new Set(judgments.map((j) => j.testId));
 
       // Add missing judgments
       for (const id of resultIds) {
@@ -113,34 +161,36 @@
           judgments.push({
             testId: id,
             pass: false,
-            reason: 'No judgment provided by LLM'
-          })
+            reason: "No judgment provided by LLM",
+          });
         }
       }
 
-      return judgments
+      return judgments;
     } catch (parseError) {
-      throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
+      throw new Error(
+        `Failed to parse LLM response: ${responseText.substring(0, 200)}`,
+      );
     }
   }
 
   // Fallback: Simple rule-based judgment (no LLM)
   simpleJudge(result: TestResult): Judgment {
-    const allStepsPassed = result.steps.every(s => s.exitCode === 0)
+    const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
 
     if (allStepsPassed) {
       return {
         testId: result.testCase.id,
         pass: true,
-        reason: 'All steps completed with exit code 0'
-      }
+        reason: "All steps completed with exit code 0",
+      };
     } else {
-      const failedSteps = result.steps.filter(s => s.exitCode !== 0)
+      const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
       return {
         testId: result.testCase.id,
         pass: false,
-        reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
-      }
+        reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
+      };
     }
   }
 }
diff --git a/tests/src/reporter.ts b/tests/src/reporter.ts
index cc0f6012..505bf64d 100644
--- a/tests/src/reporter.ts
+++ b/tests/src/reporter.ts
@@ -1,112 +1,204 @@
-import axios from 'axios'
-import { TestReport, Judgment, TestResult } from './types.js'
+import axios from "axios";
+import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
 
 export class Reporter {
   // Console reporter
   static toConsole(reports: TestReport[]): void {
-    console.log('\n' + '='.repeat(60))
-    console.log('TEST RESULTS')
-    console.log('='.repeat(60))
+    console.log("\n" + "=".repeat(60));
+    console.log("TEST RESULTS");
+    console.log("=".repeat(60));
 
-    const passed = reports.filter(r => r.pass)
-    const failed = reports.filter(r => !r.pass)
+    const passed = reports.filter((r) => r.pass);
+    const failed = reports.filter((r) => !r.pass);
+
+    // Check if we have dual-judge data
+    const hasDualJudge = reports.some(
+      (r) => r.simplePass !== undefined && r.llmPass !== undefined,
+    );
 
     for (const report of reports) {
-      const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m'
-      console.log(`[${status}] ${report.testId}: ${report.name}`)
-      console.log(`  Reason: ${report.reason}`)
-      console.log(`  Duration: ${report.duration}ms`)
+      const status = report.pass
+        ? "\x1b[32mPASS\x1b[0m"
+        : "\x1b[31mFAIL\x1b[0m";
+      console.log(`[${status}] ${report.testId}: ${report.name}`);
+
+      // Show separate verdicts in dual-judge mode
+      if (
+        hasDualJudge &&
+        report.simplePass !== undefined &&
+        report.llmPass !== undefined
+      ) {
+        const simpleStatus = report.simplePass
+          ? "\x1b[32mPASS\x1b[0m"
+          : "\x1b[31mFAIL\x1b[0m";
+        const llmStatus = report.llmPass
+          ? "\x1b[32mPASS\x1b[0m"
+          : "\x1b[31mFAIL\x1b[0m";
+        console.log(
+          `  Simple: [${simpleStatus}] ${report.simpleReason || ""}`,
+        );
+        console.log(`  LLM: [${llmStatus}] ${report.llmReason || ""}`);
+      } else {
+        console.log(`  Reason: ${report.reason}`);
+      }
+      console.log(`  Duration: ${report.duration}ms`);
     }
 
-    console.log('\n' + '-'.repeat(60))
-    console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
-    console.log('='.repeat(60))
+    console.log("\n" + "-".repeat(60));
+
+    // Show separate summaries in dual-judge mode
+    if (hasDualJudge) {
+      const simplePassed = reports.filter((r) => r.simplePass).length;
+      const simpleFailed = reports.filter((r) => !r.simplePass).length;
+      const llmPassed = reports.filter((r) => r.llmPass).length;
+      const llmFailed = reports.filter((r) => !r.llmPass).length;
+
+      console.log(`Simple: ${simplePassed} passed, ${simpleFailed} failed`);
+      console.log(`LLM: ${llmPassed} passed, ${llmFailed} failed`);
+      console.log(
+        `Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
+      );
+    } else {
+      console.log(
+        `Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
+      );
+    }
+    console.log("=".repeat(60));
   }
 
   // JSON reporter
   static toJSON(reports: TestReport[]): string {
-    return JSON.stringify({
-      summary: {
-        total: reports.length,
-        passed: reports.filter(r => r.pass).length,
-        failed: reports.filter(r => !r.pass).length,
-        timestamp: new Date().toISOString()
+    // Check if we have dual-judge data
+    const hasDualJudge = reports.some(
+      (r) => r.simplePass !== undefined && r.llmPass !== undefined,
+    );
+
+    const summary: TestSummary = {
+      total: reports.length,
+      passed: reports.filter((r) => r.pass).length,
+      failed: reports.filter((r) => !r.pass).length,
+      timestamp: new Date().toISOString(),
+    };
+
+    // Add separate breakdowns in dual-judge mode
+    if (hasDualJudge) {
+      summary.simple = {
+        passed: reports.filter((r) => r.simplePass).length,
+        failed: reports.filter((r) => !r.simplePass).length,
+      };
+      summary.llm = {
+        passed: reports.filter((r) => r.llmPass).length,
+        failed: reports.filter((r) => !r.llmPass).length,
+      };
+    }
+
+    return JSON.stringify(
+      {
+        summary,
+        results: reports,
       },
-      results: reports
-    }, null, 2)
+      null,
+      2,
+    );
   }
 
   // JUnit XML reporter (for CI/CD integration)
   static toJUnit(reports: TestReport[]): string {
-    const escapeXml = (s: string) => s
-      .replace(/&/g, '&amp;')
-      .replace(/</g, '&lt;')
-      .replace(/>/g, '&gt;')
-      .replace(/"/g, '&quot;')
-      .replace(/'/g, '&apos;')
+    const escapeXml = (s: string) =>
+      s
+        .replace(/&/g, "&amp;")
+        .replace(/</g, "&lt;")
+        .replace(/>/g, "&gt;")
+        .replace(/"/g, "&quot;")
+        .replace(/'/g, "&apos;");
 
-    const testcases = reports.map(r => {
-      if (r.pass) {
-        return `    <testcase name="${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}" />`
-      } else {
-        return `    <testcase name="${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
+    const testcases = reports
+      .map((r) => {
+        if (r.pass) {
+          return `    <testcase name="${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}" />`;
+        } else {
+          return `    <testcase name="${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
       <failure message="${escapeXml(r.reason)}">
${escapeXml(r.logs.substring(0, 1000))}
       </failure>
-    </testcase>`
-      }
-    }).join('\n')
+    </testcase>`;
+        }
+      })
+      .join("\n");
 
-    const failures = reports.filter(r => !r.pass).length
-    const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000
+    const failures = reports.filter((r) => !r.pass).length;
+    const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;
 
     return `<?xml version="1.0" encoding="UTF-8"?>
 <testsuite tests="${reports.length}" failures="${failures}" time="${time}">
 ${testcases}
 </testsuite>
-`
+`;
   }
 
   // Combine results and judgments into reports
-  static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] {
-    const judgmentMap = new Map(judgments.map(j => [j.testId, j]))
+  static createReports(
+    results: TestResult[],
+    judgments: Judgment[],
+    simpleJudgments?: Judgment[],
+    llmJudgments?: Judgment[],
+  ): TestReport[] {
+    const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
+    const simpleMap = simpleJudgments
+      ? new Map(simpleJudgments.map((j) => [j.testId, j]))
+      : undefined;
+    const llmMap = llmJudgments
+      ? new Map(llmJudgments.map((j) => [j.testId, j]))
+      : undefined;
 
-    return results.map(result => {
-      const judgment = judgmentMap.get(result.testCase.id)
+    return results.map((result) => {
+      const judgment = judgmentMap.get(result.testCase.id);
+      const simple = simpleMap?.get(result.testCase.id);
+      const llm = llmMap?.get(result.testCase.id);
 
-      return {
+      const report: TestReport = {
         testId: result.testCase.id,
         name: result.testCase.name,
         suite: result.testCase.suite,
         pass: judgment?.pass ?? false,
-        reason: judgment?.reason ?? 'No judgment',
+        reason: judgment?.reason ?? "No judgment",
         duration: result.totalDuration,
-        logs: result.logs
+        logs: result.logs,
+      };
+
+      // Add separate verdicts if available (dual-judge mode)
+      if (simple && llm) {
+        report.simplePass = simple.pass;
+        report.simpleReason = simple.reason;
+        report.llmPass = llm.pass;
+        report.llmReason = llm.reason;
       }
-    })
+
+      return report;
+    });
   }
 }
 
 // TestLink reporter
 export class TestLinkReporter {
-  private url: string
-  private apiKey: string
+  private url: string;
+  private apiKey: string;
 
   constructor(url: string, apiKey: string) {
-    this.url = url
-    this.apiKey = apiKey
+    this.url = url;
+    this.apiKey = apiKey;
   }
 
   async reportResults(
     reports: TestReport[],
     planId: string,
-    buildId: string
+    buildId: string,
   ): Promise<void> {
-    console.log('\nReporting to TestLink...')
+    console.log("\nReporting to TestLink...");
 
     for (const report of reports) {
       try {
-        await this.reportTestExecution(report, planId, buildId)
-        console.log(`  Reported: ${report.testId}`)
+        await this.reportTestExecution(report, planId, buildId);
+        console.log(`  Reported: ${report.testId}`);
       } catch (error) {
-        console.error(`  Failed to report ${report.testId}:`, error)
+        console.error(`  Failed to report ${report.testId}:`, error);
       }
     }
   }
 
@@ -114,12 +206,12 @@ export class TestLinkReporter {
   private async reportTestExecution(
     report: TestReport,
     planId: string,
-    buildId: string
+    buildId: string,
   ): Promise<void> {
     // Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
     // This would need to be mapped from TestLink
 
-    const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
+    const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
 
     // Note: This uses the TestLink XML-RPC API
     // In practice, you'd use the testlink-mcp or direct API calls
@@ -129,10 +221,10 @@
       testplanid: planId,
       buildid: buildId,
       status,
-      notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
-    }
+      notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
+    };
 
     // For now, just log - actual implementation would call TestLink API
-    console.log(`  Would report: ${report.testId} = ${status}`)
+    console.log(`  Would report: ${report.testId} = ${status}`);
   }
 }
diff --git a/tests/src/types.ts b/tests/src/types.ts
index f07badcf..fa549ca4 100644
--- a/tests/src/types.ts
+++ b/tests/src/types.ts
@@ -1,66 +1,88 @@
 // Test case definition
 export interface TestStep {
-  name: string
-  command: string
-  timeout?: number
+  name: string;
+  command: string;
+  timeout?: number;
 }
 
 export interface TestCase {
-  id: string
-  name: string
-  suite: string
-  priority: number
-  timeout: number
-  dependencies: string[]
-  steps: TestStep[]
-  criteria: string
+  id: string;
+  name: string;
+  suite: string;
+  priority: number;
+  timeout: number;
+  dependencies: string[];
+  steps: TestStep[];
+  criteria: string;
 }
 
 // Execution results
 export interface StepResult {
-  name: string
-  command: string
-  stdout: string
-  stderr: string
-  exitCode: number
-  duration: number
+  name: string;
+  command: string;
+  stdout: string;
+  stderr: string;
+  exitCode: number;
+  duration: number;
 }
 
 export interface TestResult {
-  testCase: TestCase
-  steps: StepResult[]
-  totalDuration: number
-  logs: string
+  testCase: TestCase;
+  steps: StepResult[];
+  totalDuration: number;
+  logs: string;
 }
 
 // LLM judgment
 export interface Judgment {
-  testId: string
-  pass: boolean
-  reason: string
+  testId: string;
+  pass: boolean;
+  reason: string;
 }
 
 // Final report
 export interface TestReport {
-  testId: string
-  name: string
-  suite: string
-  pass: boolean
-  reason: string
-  duration: number
-  logs: string
+  testId: string;
+  name: string;
+  suite: string;
+  pass: boolean;
+  reason: string;
+  duration: number;
+  logs: string;
+  // Separate verdicts for dual-judge mode
+  simplePass?: boolean;
+  simpleReason?: string;
+  llmPass?: boolean;
+  llmReason?: string;
+}
+
+// Summary with separate judge breakdowns
+export interface TestSummary {
+  total: number;
+  passed: number;
+  failed: number;
+  timestamp: string;
+  // Separate breakdowns (only present in dual-judge mode)
+  simple?: {
+    passed: number;
+    failed: number;
+  };
+  llm?: {
+    passed: number;
+    failed: number;
+  };
 }
 
 // Runner options
 export interface RunnerOptions {
-  suite?: string
-  id?: string
-  workers: number
-  dryRun: boolean
-  output: 'console' | 'json' | 'junit'
-  reportTestlink: boolean
-  ollamaUrl: string
-  ollamaModel: string
-  testlinkUrl: string
-  testlinkApiKey: string
+  suite?: string;
+  id?: string;
+  workers: number;
+  dryRun: boolean;
+  output: "console" | "json" | "junit";
+  reportTestlink: boolean;
+  ollamaUrl: string;
+  ollamaModel: string;
+  testlinkUrl: string;
+  testlinkApiKey: string;
 }