Enhance LLM judge prompt and add separate verdict display

- Add step results, timing context, and build notes to LLM prompt
- LLM now sees exit codes, durations, and simple judge result
- Add guidance that long build times within timeout are acceptable
- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields

This helps distinguish between simple judge failures (exit code != 0) and LLM judge failures (semantic analysis), making debugging easier.
2025-12-20 12:47:00 +00:00 · 2025-12-17 15:04:05 +08:00
parent 1e99c1bb50
commit e06deff40f
4 changed files with 465 additions and 273 deletions
--- a/tests/src/cli.ts
+++ b/tests/src/cli.ts
@@ -1,216 +1,244 @@
 #!/usr/bin/env node
-import { Command } from 'commander'
+import { Command } from "commander";
-import { writeFileSync } from 'fs'
+import { writeFileSync } from "fs";
-import path from 'path'
+import path from "path";
-import { fileURLToPath } from 'url'
+import { fileURLToPath } from "url";
-import { TestLoader } from './loader.js'
+import { TestLoader } from "./loader.js";
-import { TestExecutor } from './executor.js'
+import { TestExecutor } from "./executor.js";
-import { LLMJudge } from './judge.js'
+import { LLMJudge } from "./judge.js";
-import { Reporter, TestLinkReporter } from './reporter.js'
+import { Reporter, TestLinkReporter } from "./reporter.js";
-import { RunnerOptions } from './types.js'
+import { RunnerOptions, Judgment } from "./types.js";
-const __dirname = path.dirname(fileURLToPath(import.meta.url))
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
-const defaultTestcasesDir = path.join(__dirname, '..', 'testcases')
+const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
 // Progress output to stderr (visible in console even when stdout is redirected)
-const log = (msg: string) => process.stderr.write(msg + '\n')
+const log = (msg: string) => process.stderr.write(msg + "\n");
-const program = new Command()
+const program = new Command();
 program
-  .name('ollama37-test')
+  .name("ollama37-test")
-  .description('Scalable test runner with LLM-as-judge for ollama37')
+  .description("Scalable test runner with LLM-as-judge for ollama37")
-  .version('1.0.0')
+  .version("1.0.0");
 program
-  .command('run')
+  .command("run")
-  .description('Run test cases')
+  .description("Run test cases")
-  .option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)')
+  .option(
-  .option('-i, --id <id>', 'Run only specified test case by ID')
+    "-s, --suite <suite>",
-  .option('-w, --workers <n>', 'Number of parallel workers', '1')
+    "Run only tests in specified suite (build, runtime, inference)",
-  .option('-d, --dry-run', 'Show what would be executed without running')
+  )
-  .option('-o, --output <format>', 'Output format: console, json, junit', 'console')
+  .option("-i, --id <id>", "Run only specified test case by ID")
-  .option('--report-testlink', 'Report results to TestLink')
+  .option("-w, --workers <n>", "Number of parallel workers", "1")
-  .option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434')
+  .option("-d, --dry-run", "Show what would be executed without running")
-  .option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435')
+  .option(
-  .option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b')
+    "-o, --output <format>",
-  .option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
+    "Output format: console, json, junit",
-  .option('--testlink-api-key <key>', 'TestLink API key')
+    "console",
-  .option('--no-llm', 'Skip LLM judging, use simple exit code check only')
+  )
-  .option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)')
+  .option("--report-testlink", "Report results to TestLink")
-  .option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
+  .option(
    "--ollama-url <url>",
    "Ollama server URL (test subject)",
    "http://localhost:11434",
  )
  .option(
    "--judge-url <url>",
    "LLM Judge server URL (separate instance)",
    "http://localhost:11435",
  )
  .option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
  .option(
    "--testlink-url <url>",
    "TestLink server URL",
    "http://localhost:8090",
  )
  .option("--testlink-api-key <key>", "TestLink API key")
  .option("--no-llm", "Skip LLM judging, use simple exit code check only")
  .option(
    "--dual-judge",
    "Use both simple and LLM judge (fail if either fails)",
  )
  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
  .action(async (options) => {
-    log('='.repeat(60))
+    log("=".repeat(60));
-    log('OLLAMA37 TEST RUNNER')
+    log("OLLAMA37 TEST RUNNER");
-    log('='.repeat(60))
+    log("=".repeat(60));
-    const loader = new TestLoader(options.testcasesDir)
+    const loader = new TestLoader(options.testcasesDir);
-    const executor = new TestExecutor(path.join(__dirname, '..', '..'))
+    const executor = new TestExecutor(path.join(__dirname, "..", ".."));
-    const judge = new LLMJudge(options.judgeUrl, options.judgeModel)
+    const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
    // Load test cases
-    log('\nLoading test cases...')
+    log("\nLoading test cases...");
-    let testCases = await loader.loadAll()
+    let testCases = await loader.loadAll();
    if (options.suite) {
-      testCases = testCases.filter(tc => tc.suite === options.suite)
+      testCases = testCases.filter((tc) => tc.suite === options.suite);
-      log(`  Filtered by suite: ${options.suite}`)
+      log(`  Filtered by suite: ${options.suite}`);
    }
    if (options.id) {
-      testCases = testCases.filter(tc => tc.id === options.id)
+      testCases = testCases.filter((tc) => tc.id === options.id);
-      log(`  Filtered by ID: ${options.id}`)
+      log(`  Filtered by ID: ${options.id}`);
    }
    // Sort by dependencies
-    testCases = loader.sortByDependencies(testCases)
+    testCases = loader.sortByDependencies(testCases);
-    log(`  Found ${testCases.length} test cases`)
+    log(`  Found ${testCases.length} test cases`);
    if (testCases.length === 0) {
-      log('\nNo test cases found!')
+      log("\nNo test cases found!");
-      process.exit(1)
+      process.exit(1);
    }
    // Dry run
    if (options.dryRun) {
-      log('\nDRY RUN - Would execute:')
+      log("\nDRY RUN - Would execute:");
      for (const tc of testCases) {
-        log(`  ${tc.id}: ${tc.name}`)
+        log(`  ${tc.id}: ${tc.name}`);
        for (const step of tc.steps) {
-          log(`    - ${step.name}: ${step.command}`)
+          log(`    - ${step.name}: ${step.command}`);
        }
      }
-      process.exit(0)
+      process.exit(0);
    }
    // Execute tests (progress goes to stderr via executor)
-    const workers = parseInt(options.workers)
+    const workers = parseInt(options.workers);
-    const results = await executor.executeAll(testCases, workers)
+    const results = await executor.executeAll(testCases, workers);
    // Judge results
-    log('\nJudging results...')
+    log("\nJudging results...");
-    let judgments
+    let judgments: Judgment[];
    let simpleJudgments: Judgment[] | undefined;
    let llmJudgments: Judgment[] | undefined;
    if (options.dualJudge) {
      // Dual judge mode: run both simple and LLM, fail if either fails
-      log('  Using dual judge mode (simple + LLM)')
+      log("  Using dual judge mode (simple + LLM)");
      // Simple judge first
-      const simpleJudgments = results.map(r => judge.simpleJudge(r))
+      simpleJudgments = results.map((r) => judge.simpleJudge(r));
-      log('  Simple judge complete')
+      log("  Simple judge complete");
      // LLM judge second
      let llmJudgments
      try {
-        llmJudgments = await judge.judgeResults(results)
+        llmJudgments = await judge.judgeResults(results);
-        log('  LLM judge complete')
+        log("  LLM judge complete");
      } catch (error) {
-        log(`  LLM judge failed: ${error}`)
+        log(`  LLM judge failed: ${error}`);
-        log('  Falling back to simple judge only')
+        log("  Falling back to simple judge only");
-        llmJudgments = simpleJudgments
+        llmJudgments = simpleJudgments;
      }
      // Combine: fail if either judge says fail
      judgments = simpleJudgments.map((simple, i) => {
-        const llm = llmJudgments.find(j => j.testId === simple.testId) || simple
+        const llm =
-        const pass = simple.pass && llm.pass
+          llmJudgments!.find((j) => j.testId === simple.testId) || simple;
        const pass = simple.pass && llm.pass;
-        let reason = ''
+        let reason = "";
        if (!pass) {
-          const reasons = []
+          const reasons = [];
-          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`)
+          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
-          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`)
+          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
-          reason = reasons.join(' | ')
+          reason = reasons.join(" | ");
        } else {
-          reason = llm.reason || simple.reason
+          reason = llm.reason || simple.reason;
        }
        return {
          testId: simple.testId,
          pass,
-          reason
+          reason,
-        }
+        };
-      })
+      });
    } else if (options.llm === false) {
-      log('  Using simple exit code check (--no-llm)')
+      log("  Using simple exit code check (--no-llm)");
-      judgments = results.map(r => judge.simpleJudge(r))
+      judgments = results.map((r) => judge.simpleJudge(r));
    } else {
      try {
-        judgments = await judge.judgeResults(results)
+        judgments = await judge.judgeResults(results);
      } catch (error) {
-        log(`  LLM judging failed, falling back to simple check: ${error}`)
+        log(`  LLM judging failed, falling back to simple check: ${error}`);
-        judgments = results.map(r => judge.simpleJudge(r))
+        judgments = results.map((r) => judge.simpleJudge(r));
      }
    }
-    // Create reports
+    // Create reports (with separate verdicts in dual-judge mode)
-    const reports = Reporter.createReports(results, judgments)
+    const reports = Reporter.createReports(
      results,
      judgments,
      simpleJudgments,
      llmJudgments,
    );
    // Output results
    switch (options.output) {
-      case 'json':
+      case "json":
-        const json = Reporter.toJSON(reports)
+        const json = Reporter.toJSON(reports);
        // JSON goes to stdout (can be redirected to file)
-        process.stdout.write(json + '\n')
+        process.stdout.write(json + "\n");
-        break
+        break;
-      case 'junit':
+      case "junit":
-        const junit = Reporter.toJUnit(reports)
+        const junit = Reporter.toJUnit(reports);
-        writeFileSync('test-results.xml', junit)
+        writeFileSync("test-results.xml", junit);
-        log('\nResults written to test-results.xml')
+        log("\nResults written to test-results.xml");
-        break
+        break;
-      case 'console':
+      case "console":
      default:
-        Reporter.toConsole(reports)
+        Reporter.toConsole(reports);
-        break
+        break;
    }
    // Summary
-    const passed = reports.filter(r => r.pass).length
+    const passed = reports.filter((r) => r.pass).length;
-    const failed = reports.filter(r => !r.pass).length
+    const failed = reports.filter((r) => !r.pass).length;
-    log('\n' + '='.repeat(60))
+    log("\n" + "=".repeat(60));
-    log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`)
+    log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
-    log('='.repeat(60))
+    log("=".repeat(60));
    // Report to TestLink
    if (options.reportTestlink && options.testlinkApiKey) {
      const testlinkReporter = new TestLinkReporter(
        options.testlinkUrl,
-        options.testlinkApiKey
+        options.testlinkApiKey,
-      )
+      );
      // Would need plan ID and build ID
      // await testlinkReporter.reportResults(reports, planId, buildId)
-      log('\nTestLink reporting not yet implemented')
+      log("\nTestLink reporting not yet implemented");
    }
    // Exit with appropriate code
-    process.exit(failed > 0 ? 1 : 0)
+    process.exit(failed > 0 ? 1 : 0);
-  })
+  });
 program
-  .command('list')
+  .command("list")
-  .description('List all test cases')
+  .description("List all test cases")
-  .option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
+  .option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
  .action(async (options) => {
-    const loader = new TestLoader(options.testcasesDir)
+    const loader = new TestLoader(options.testcasesDir);
-    const testCases = await loader.loadAll()
+    const testCases = await loader.loadAll();
-    const grouped = loader.groupBySuite(testCases)
+    const grouped = loader.groupBySuite(testCases);
-    console.log('Available Test Cases:\n')
+    console.log("Available Test Cases:\n");
    for (const [suite, cases] of grouped) {
-      console.log(`${suite.toUpperCase()}:`)
+      console.log(`${suite.toUpperCase()}:`);
      for (const tc of cases) {
-        console.log(`  ${tc.id}: ${tc.name}`)
+        console.log(`  ${tc.id}: ${tc.name}`);
      }
-      console.log()
+      console.log();
    }
-    console.log(`Total: ${testCases.length} test cases`)
+    console.log(`Total: ${testCases.length} test cases`);
-  })
+  });
-program.parse()
+program.parse();
--- a/tests/src/judge.ts
+++ b/tests/src/judge.ts
@@ -1,31 +1,71 @@
-import axios from 'axios'
+import axios from "axios";
-import { TestResult, Judgment } from './types.js'
+import { TestResult, Judgment } from "./types.js";
 export class LLMJudge {
-  private ollamaUrl: string
+  private ollamaUrl: string;
-  private model: string
+  private model: string;
-  private batchSize: number
+  private batchSize: number;
-  constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
+  constructor(
-    this.ollamaUrl = ollamaUrl
+    ollamaUrl: string = "http://localhost:11434",
-    this.model = model
+    model: string = "gemma3:4b",
-    this.batchSize = 5 // Judge 5 tests per LLM call
+  ) {
    this.ollamaUrl = ollamaUrl;
    this.model = model;
    this.batchSize = 5; // Judge 5 tests per LLM call
  }
  private formatDuration(ms: number): string {
    if (ms < 1000) return `${ms}ms`;
    if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
    return `${(ms / 60000).toFixed(1)}min`;
  }
  private buildPrompt(results: TestResult[]): string {
-    const testsSection = results.map((r, i) => {
+    const testsSection = results
      .map((r, i) => {
        // Build step results summary with exit codes and durations
        const stepsSummary = r.steps
          .map((step, j) => {
            const status = step.exitCode === 0 ? "PASS" : "FAIL";
            const stepTimeout =
              r.testCase.steps[j]?.timeout || r.testCase.timeout;
            return `  ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
          })
          .join("\n");
        // Simple judge result
        const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
        const simpleResult = allStepsPassed ? "PASS" : "FAIL";
        // Check if duration is within timeout
        const timeoutMs = r.testCase.timeout;
        const withinTimeout = r.totalDuration < timeoutMs;
        const timeoutNote = withinTimeout
          ? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
          : `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
        return `
 ### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
 **Criteria:**
 ${r.testCase.criteria}
 **Step Results:**
 ${stepsSummary}
 **Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
 **Timing:** ${timeoutNote}
 ${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
 **Execution Logs:**
 \`\`\`
-${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
+${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
 \`\`\`
-`
+`;
-    }).join('\n---\n')
+      })
      .join("\n---\n");
    return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@ Important:
 - For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
 - For build/runtime tests, check exit codes and absence of error messages
 - Be lenient with formatting differences, focus on semantic correctness
 - If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
 - Long durations are acceptable as long as they are within the configured timeout
-Respond ONLY with the JSON array, no other text.`
+Respond ONLY with the JSON array, no other text.`;
  }
  async judgeResults(results: TestResult[]): Promise<Judgment[]> {
-    const allJudgments: Judgment[] = []
+    const allJudgments: Judgment[] = [];
    // Process in batches
    for (let i = 0; i < results.length; i += this.batchSize) {
-      const batch = results.slice(i, i + this.batchSize)
+      const batch = results.slice(i, i + this.batchSize);
-      console.log(`  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
+      console.log(
        `  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
      );
      try {
-        const judgments = await this.judgeBatch(batch)
+        const judgments = await this.judgeBatch(batch);
-        allJudgments.push(...judgments)
+        allJudgments.push(...judgments);
      } catch (error) {
-        console.error(`  Failed to judge batch:`, error)
+        console.error(`  Failed to judge batch:`, error);
        // Mark all tests in batch as failed
        for (const r of batch) {
          allJudgments.push({
            testId: r.testCase.id,
            pass: false,
-            reason: 'LLM judgment failed: ' + String(error)
+            reason: "LLM judgment failed: " + String(error),
-          })
+          });
        }
      }
    }
-    return allJudgments
+    return allJudgments;
  }
  private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
-    const prompt = this.buildPrompt(results)
+    const prompt = this.buildPrompt(results);
-    const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
+    const response = await axios.post(
      `${this.ollamaUrl}/api/generate`,
      {
        model: this.model,
        prompt,
        stream: false,
        options: {
          temperature: 0.1, // Low temperature for consistent judging
-        num_predict: 1000
+          num_predict: 1000,
-      }
+        },
-    }, {
+      },
-      timeout: 120000 // 2 minute timeout
+      {
-    })
+        timeout: 120000, // 2 minute timeout
      },
    );
-    const responseText = response.data.response
+    const responseText = response.data.response;
    // Extract JSON from response
-    const jsonMatch = responseText.match(/\[[\s\S]*\]/)
+    const jsonMatch = responseText.match(/\[[\s\S]*\]/);
    if (!jsonMatch) {
-      throw new Error('No JSON array found in LLM response')
+      throw new Error("No JSON array found in LLM response");
    }
    try {
-      const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
+      const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
      // Validate and fill missing
-      const resultIds = results.map(r => r.testCase.id)
+      const resultIds = results.map((r) => r.testCase.id);
-      const judgedIds = new Set(judgments.map(j => j.testId))
+      const judgedIds = new Set(judgments.map((j) => j.testId));
      // Add missing judgments
      for (const id of resultIds) {
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
          judgments.push({
            testId: id,
            pass: false,
-            reason: 'No judgment provided by LLM'
+            reason: "No judgment provided by LLM",
-          })
+          });
        }
      }
-      return judgments
+      return judgments;
    } catch (parseError) {
-      throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
+      throw new Error(
        `Failed to parse LLM response: ${responseText.substring(0, 200)}`,
      );
    }
  }
  // Fallback: Simple rule-based judgment (no LLM)
  simpleJudge(result: TestResult): Judgment {
-    const allStepsPassed = result.steps.every(s => s.exitCode === 0)
+    const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
    if (allStepsPassed) {
      return {
        testId: result.testCase.id,
        pass: true,
-        reason: 'All steps completed with exit code 0'
+        reason: "All steps completed with exit code 0",
-      }
+      };
    } else {
-      const failedSteps = result.steps.filter(s => s.exitCode !== 0)
+      const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
      return {
        testId: result.testCase.id,
        pass: false,
-        reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
+        reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
-      }
+      };
    }
  }
 }
--- a/tests/src/reporter.ts
+++ b/tests/src/reporter.ts
@@ -1,112 +1,204 @@
-import axios from 'axios'
+import axios from "axios";
-import { TestReport, Judgment, TestResult } from './types.js'
+import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
 export class Reporter {
  // Console reporter
  static toConsole(reports: TestReport[]): void {
-    console.log('\n' + '='.repeat(60))
+    console.log("\n" + "=".repeat(60));
-    console.log('TEST RESULTS')
+    console.log("TEST RESULTS");
-    console.log('='.repeat(60))
+    console.log("=".repeat(60));
-    const passed = reports.filter(r => r.pass)
+    const passed = reports.filter((r) => r.pass);
-    const failed = reports.filter(r => !r.pass)
+    const failed = reports.filter((r) => !r.pass);
    // Check if we have dual-judge data
    const hasDualJudge = reports.some(
      (r) => r.simplePass !== undefined && r.llmPass !== undefined,
    );
    for (const report of reports) {
-      const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m'
+      const status = report.pass
-      console.log(`[${status}] ${report.testId}: ${report.name}`)
+        ? "\x1b[32mPASS\x1b[0m"
-      console.log(`       Reason: ${report.reason}`)
+        : "\x1b[31mFAIL\x1b[0m";
-      console.log(`       Duration: ${report.duration}ms`)
+      console.log(`[${status}] ${report.testId}: ${report.name}`);
      // Show separate verdicts in dual-judge mode
      if (
        hasDualJudge &&
        report.simplePass !== undefined &&
        report.llmPass !== undefined
      ) {
        const simpleStatus = report.simplePass
          ? "\x1b[32mPASS\x1b[0m"
          : "\x1b[31mFAIL\x1b[0m";
        const llmStatus = report.llmPass
          ? "\x1b[32mPASS\x1b[0m"
          : "\x1b[31mFAIL\x1b[0m";
        console.log(
          `       Simple: [${simpleStatus}] ${report.simpleReason || ""}`,
        );
        console.log(`       LLM:    [${llmStatus}] ${report.llmReason || ""}`);
      } else {
        console.log(`       Reason: ${report.reason}`);
      }
      console.log(`       Duration: ${report.duration}ms`);
    }
-    console.log('\n' + '-'.repeat(60))
+    console.log("\n" + "-".repeat(60));
-    console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
+
-    console.log('='.repeat(60))
+    // Show separate summaries in dual-judge mode
    if (hasDualJudge) {
      const simplePassed = reports.filter((r) => r.simplePass).length;
      const simpleFailed = reports.filter((r) => !r.simplePass).length;
      const llmPassed = reports.filter((r) => r.llmPass).length;
      const llmFailed = reports.filter((r) => !r.llmPass).length;
      console.log(`Simple:   ${simplePassed} passed, ${simpleFailed} failed`);
      console.log(`LLM:      ${llmPassed} passed, ${llmFailed} failed`);
      console.log(
        `Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
      );
    } else {
      console.log(
        `Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
      );
    }
    console.log("=".repeat(60));
  }
  // JSON reporter
  static toJSON(reports: TestReport[]): string {
-    return JSON.stringify({
+    // Check if we have dual-judge data
-      summary: {
+    const hasDualJudge = reports.some(
      (r) => r.simplePass !== undefined && r.llmPass !== undefined,
    );
    const summary: TestSummary = {
      total: reports.length,
-        passed: reports.filter(r => r.pass).length,
+      passed: reports.filter((r) => r.pass).length,
-        failed: reports.filter(r => !r.pass).length,
+      failed: reports.filter((r) => !r.pass).length,
-        timestamp: new Date().toISOString()
+      timestamp: new Date().toISOString(),
    };
    // Add separate breakdowns in dual-judge mode
    if (hasDualJudge) {
      summary.simple = {
        passed: reports.filter((r) => r.simplePass).length,
        failed: reports.filter((r) => !r.simplePass).length,
      };
      summary.llm = {
        passed: reports.filter((r) => r.llmPass).length,
        failed: reports.filter((r) => !r.llmPass).length,
      };
    }
    return JSON.stringify(
      {
        summary,
        results: reports,
      },
-      results: reports
+      null,
-    }, null, 2)
+      2,
    );
  }
  // JUnit XML reporter (for CI/CD integration)
  static toJUnit(reports: TestReport[]): string {
-    const escapeXml = (s: string) => s
+    const escapeXml = (s: string) =>
-      .replace(/&/g, '&amp;')
+      s
-      .replace(/</g, '&lt;')
+        .replace(/&/g, "&amp;")
-      .replace(/>/g, '&gt;')
+        .replace(/</g, "&lt;")
-      .replace(/"/g, '&quot;')
+        .replace(/>/g, "&gt;")
-      .replace(/'/g, '&apos;')
+        .replace(/"/g, "&quot;")
        .replace(/'/g, "&apos;");
-    const testcases = reports.map(r => {
+    const testcases = reports
      .map((r) => {
        if (r.pass) {
-        return `    <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`
+          return `    <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`;
        } else {
          return `    <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
      <failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure>
-    </testcase>`
+    </testcase>`;
        }
-    }).join('\n')
+      })
      .join("\n");
-    const failures = reports.filter(r => !r.pass).length
+    const failures = reports.filter((r) => !r.pass).length;
-    const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000
+    const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;
    return `<?xml version="1.0" encoding="UTF-8"?>
 <testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">
 ${testcases}
-</testsuite>`
+</testsuite>`;
  }
  // Combine results and judgments into reports
-  static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] {
+  static createReports(
-    const judgmentMap = new Map(judgments.map(j => [j.testId, j]))
+    results: TestResult[],
    judgments: Judgment[],
    simpleJudgments?: Judgment[],
    llmJudgments?: Judgment[],
  ): TestReport[] {
    const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
    const simpleMap = simpleJudgments
      ? new Map(simpleJudgments.map((j) => [j.testId, j]))
      : undefined;
    const llmMap = llmJudgments
      ? new Map(llmJudgments.map((j) => [j.testId, j]))
      : undefined;
-    return results.map(result => {
+    return results.map((result) => {
-      const judgment = judgmentMap.get(result.testCase.id)
+      const judgment = judgmentMap.get(result.testCase.id);
      const simple = simpleMap?.get(result.testCase.id);
      const llm = llmMap?.get(result.testCase.id);
-      return {
+      const report: TestReport = {
        testId: result.testCase.id,
        name: result.testCase.name,
        suite: result.testCase.suite,
        pass: judgment?.pass ?? false,
-        reason: judgment?.reason ?? 'No judgment',
+        reason: judgment?.reason ?? "No judgment",
        duration: result.totalDuration,
-        logs: result.logs
+        logs: result.logs,
      };
      // Add separate verdicts if available (dual-judge mode)
      if (simple && llm) {
        report.simplePass = simple.pass;
        report.simpleReason = simple.reason;
        report.llmPass = llm.pass;
        report.llmReason = llm.reason;
      }
-    })
+
      return report;
    });
  }
 }
 // TestLink reporter
 export class TestLinkReporter {
-  private url: string
+  private url: string;
-  private apiKey: string
+  private apiKey: string;
  constructor(url: string, apiKey: string) {
-    this.url = url
+    this.url = url;
-    this.apiKey = apiKey
+    this.apiKey = apiKey;
  }
  async reportResults(
    reports: TestReport[],
    planId: string,
-    buildId: string
+    buildId: string,
  ): Promise<void> {
-    console.log('\nReporting to TestLink...')
+    console.log("\nReporting to TestLink...");
    for (const report of reports) {
      try {
-        await this.reportTestExecution(report, planId, buildId)
+        await this.reportTestExecution(report, planId, buildId);
-        console.log(`  Reported: ${report.testId}`)
+        console.log(`  Reported: ${report.testId}`);
      } catch (error) {
-        console.error(`  Failed to report ${report.testId}:`, error)
+        console.error(`  Failed to report ${report.testId}:`, error);
      }
    }
  }
@@ -114,12 +206,12 @@ export class TestLinkReporter {
  private async reportTestExecution(
    report: TestReport,
    planId: string,
-    buildId: string
+    buildId: string,
  ): Promise<void> {
    // Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
    // This would need to be mapped from TestLink
-    const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
+    const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
    // Note: This uses the TestLink XML-RPC API
    // In practice, you'd use the testlink-mcp or direct API calls
@@ -129,10 +221,10 @@ export class TestLinkReporter {
      testplanid: planId,
      buildid: buildId,
      status,
-      notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
+      notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
-    }
+    };
    // For now, just log - actual implementation would call TestLink API
-    console.log(`    Would report: ${report.testId} = ${status}`)
+    console.log(`    Would report: ${report.testId} = ${status}`);
  }
 }
--- a/tests/src/types.ts
+++ b/tests/src/types.ts
@@ -1,66 +1,88 @@
 // Test case definition
 export interface TestStep {
-  name: string
+  name: string;
-  command: string
+  command: string;
-  timeout?: number
+  timeout?: number;
 }
 export interface TestCase {
-  id: string
+  id: string;
-  name: string
+  name: string;
-  suite: string
+  suite: string;
-  priority: number
+  priority: number;
-  timeout: number
+  timeout: number;
-  dependencies: string[]
+  dependencies: string[];
-  steps: TestStep[]
+  steps: TestStep[];
-  criteria: string
+  criteria: string;
 }
 // Execution results
 export interface StepResult {
-  name: string
+  name: string;
-  command: string
+  command: string;
-  stdout: string
+  stdout: string;
-  stderr: string
+  stderr: string;
-  exitCode: number
+  exitCode: number;
-  duration: number
+  duration: number;
 }
 export interface TestResult {
-  testCase: TestCase
+  testCase: TestCase;
-  steps: StepResult[]
+  steps: StepResult[];
-  totalDuration: number
+  totalDuration: number;
-  logs: string
+  logs: string;
 }
 // LLM judgment
 export interface Judgment {
-  testId: string
+  testId: string;
-  pass: boolean
+  pass: boolean;
-  reason: string
+  reason: string;
 }
 // Final report
 export interface TestReport {
-  testId: string
+  testId: string;
-  name: string
+  name: string;
-  suite: string
+  suite: string;
-  pass: boolean
+  pass: boolean;
-  reason: string
+  reason: string;
-  duration: number
+  duration: number;
-  logs: string
+  logs: string;
  // Separate verdicts for dual-judge mode
  simplePass?: boolean;
  simpleReason?: string;
  llmPass?: boolean;
  llmReason?: string;
 }
 // Summary with separate judge breakdowns
 export interface TestSummary {
  total: number;
  passed: number;
  failed: number;
  timestamp: string;
  // Separate breakdowns (only present in dual-judge mode)
  simple?: {
    passed: number;
    failed: number;
  };
  llm?: {
    passed: number;
    failed: number;
  };
 }
 // Runner options
 export interface RunnerOptions {
-  suite?: string
+  suite?: string;
-  id?: string
+  id?: string;
-  workers: number
+  workers: number;
-  dryRun: boolean
+  dryRun: boolean;
-  output: 'console' | 'json' | 'junit'
+  output: "console" | "json" | "junit";
-  reportTestlink: boolean
+  reportTestlink: boolean;
-  ollamaUrl: string
+  ollamaUrl: string;
-  ollamaModel: string
+  ollamaModel: string;
-  testlinkUrl: string
+  testlinkUrl: string;
-  testlinkApiKey: string
+  testlinkApiKey: string;
 }