Enhance LLM judge prompt and add separate verdict display

- Add step results, timing context, and build notes to LLM prompt
- LLM now sees exit codes, durations, and simple judge result
- Add guidance that long build times within timeout are acceptable

- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields

This helps distinguish simple-judge failures (non-zero exit codes) from LLM-judge
failures (semantic analysis), making debugging easier; an illustrative sketch of the
new prompt context and the dual-judge report shape follows below.
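
For illustration, this is roughly the per-test context the LLM judge now receives,
following the prompt template in this commit; the step names, durations, and counts
are made up:

**Step Results:**
  1. "clone source" - PASS (exit: 0, duration: 4.1s, timeout: 5.0min)
  2. "cmake build" - PASS (exit: 0, duration: 18.3min, timeout: 60.0min)
**Simple Judge Result:** PASS (all steps exit code 0)
**Timing:** Total duration 18.4min is within timeout of 60.0min.

And a minimal TypeScript sketch of a per-test report and the JSON summary in
dual-judge mode, based on the TestReport and TestSummary interfaces added below;
the test ID, names, and all numbers are hypothetical:

import { TestReport, TestSummary } from "./types.js";

// Hypothetical per-test report in dual-judge mode. The combined `pass` is the
// AND of both verdicts, and `reason` collects the reasons of whichever judges failed.
const exampleReport: TestReport = {
  testId: "ollama37-8",
  name: "Example runtime test",   // hypothetical
  suite: "runtime",
  pass: false,                    // combined verdict: the simple judge failed
  reason: "Simple: Steps failed: start server",
  duration: 42000,                // ms
  logs: "(logs omitted)",
  simplePass: false,
  simpleReason: "Steps failed: start server",
  llmPass: true,
  llmReason: "Server started and responded; failure was in a cleanup step",
};

// Hypothetical summary block as emitted by Reporter.toJSON when both judges ran.
const exampleSummary: TestSummary = {
  total: 10,
  passed: 8,
  failed: 2,
  timestamp: new Date().toISOString(),
  simple: { passed: 8, failed: 2 },  // exit-code verdicts
  llm: { passed: 9, failed: 1 },     // semantic verdicts
};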
Shang Chieh Tseng
2025-12-17 15:04:05 +08:00
parent 1e99c1bb50
commit e06deff40f
4 changed files with 465 additions and 273 deletions

@@ -1,216 +1,244 @@
#!/usr/bin/env node
import { Command } from 'commander'
import { writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { TestLoader } from './loader.js'
import { TestExecutor } from './executor.js'
import { LLMJudge } from './judge.js'
import { Reporter, TestLinkReporter } from './reporter.js'
import { RunnerOptions } from './types.js'
import { Command } from "commander";
import { writeFileSync } from "fs";
import path from "path";
import { fileURLToPath } from "url";
import { TestLoader } from "./loader.js";
import { TestExecutor } from "./executor.js";
import { LLMJudge } from "./judge.js";
import { Reporter, TestLinkReporter } from "./reporter.js";
import { RunnerOptions, Judgment } from "./types.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const defaultTestcasesDir = path.join(__dirname, '..', 'testcases')
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
// Progress output to stderr (visible in console even when stdout is redirected)
const log = (msg: string) => process.stderr.write(msg + '\n')
const log = (msg: string) => process.stderr.write(msg + "\n");
const program = new Command()
const program = new Command();
program
.name('ollama37-test')
.description('Scalable test runner with LLM-as-judge for ollama37')
.version('1.0.0')
.name("ollama37-test")
.description("Scalable test runner with LLM-as-judge for ollama37")
.version("1.0.0");
program
.command('run')
.description('Run test cases')
.option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)')
.option('-i, --id <id>', 'Run only specified test case by ID')
.option('-w, --workers <n>', 'Number of parallel workers', '1')
.option('-d, --dry-run', 'Show what would be executed without running')
.option('-o, --output <format>', 'Output format: console, json, junit', 'console')
.option('--report-testlink', 'Report results to TestLink')
.option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434')
.option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435')
.option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b')
.option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
.option('--testlink-api-key <key>', 'TestLink API key')
.option('--no-llm', 'Skip LLM judging, use simple exit code check only')
.option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.command("run")
.description("Run test cases")
.option(
"-s, --suite <suite>",
"Run only tests in specified suite (build, runtime, inference)",
)
.option("-i, --id <id>", "Run only specified test case by ID")
.option("-w, --workers <n>", "Number of parallel workers", "1")
.option("-d, --dry-run", "Show what would be executed without running")
.option(
"-o, --output <format>",
"Output format: console, json, junit",
"console",
)
.option("--report-testlink", "Report results to TestLink")
.option(
"--ollama-url <url>",
"Ollama server URL (test subject)",
"http://localhost:11434",
)
.option(
"--judge-url <url>",
"LLM Judge server URL (separate instance)",
"http://localhost:11435",
)
.option("--judge-model <model>", "Model for LLM judging", "gemma3:4b")
.option(
"--testlink-url <url>",
"TestLink server URL",
"http://localhost:8090",
)
.option("--testlink-api-key <key>", "TestLink API key")
.option("--no-llm", "Skip LLM judging, use simple exit code check only")
.option(
"--dual-judge",
"Use both simple and LLM judge (fail if either fails)",
)
.option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => {
log('='.repeat(60))
log('OLLAMA37 TEST RUNNER')
log('='.repeat(60))
log("=".repeat(60));
log("OLLAMA37 TEST RUNNER");
log("=".repeat(60));
const loader = new TestLoader(options.testcasesDir)
const executor = new TestExecutor(path.join(__dirname, '..', '..'))
const judge = new LLMJudge(options.judgeUrl, options.judgeModel)
const loader = new TestLoader(options.testcasesDir);
const executor = new TestExecutor(path.join(__dirname, "..", ".."));
const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
// Load test cases
log('\nLoading test cases...')
let testCases = await loader.loadAll()
log("\nLoading test cases...");
let testCases = await loader.loadAll();
if (options.suite) {
testCases = testCases.filter(tc => tc.suite === options.suite)
log(` Filtered by suite: ${options.suite}`)
testCases = testCases.filter((tc) => tc.suite === options.suite);
log(` Filtered by suite: ${options.suite}`);
}
if (options.id) {
testCases = testCases.filter(tc => tc.id === options.id)
log(` Filtered by ID: ${options.id}`)
testCases = testCases.filter((tc) => tc.id === options.id);
log(` Filtered by ID: ${options.id}`);
}
// Sort by dependencies
testCases = loader.sortByDependencies(testCases)
testCases = loader.sortByDependencies(testCases);
log(` Found ${testCases.length} test cases`)
log(` Found ${testCases.length} test cases`);
if (testCases.length === 0) {
log('\nNo test cases found!')
process.exit(1)
log("\nNo test cases found!");
process.exit(1);
}
// Dry run
if (options.dryRun) {
log('\nDRY RUN - Would execute:')
log("\nDRY RUN - Would execute:");
for (const tc of testCases) {
log(` ${tc.id}: ${tc.name}`)
log(` ${tc.id}: ${tc.name}`);
for (const step of tc.steps) {
log(` - ${step.name}: ${step.command}`)
log(` - ${step.name}: ${step.command}`);
}
}
process.exit(0)
process.exit(0);
}
// Execute tests (progress goes to stderr via executor)
const workers = parseInt(options.workers)
const results = await executor.executeAll(testCases, workers)
const workers = parseInt(options.workers);
const results = await executor.executeAll(testCases, workers);
// Judge results
log('\nJudging results...')
let judgments
log("\nJudging results...");
let judgments: Judgment[];
let simpleJudgments: Judgment[] | undefined;
let llmJudgments: Judgment[] | undefined;
if (options.dualJudge) {
// Dual judge mode: run both simple and LLM, fail if either fails
log(' Using dual judge mode (simple + LLM)')
log(" Using dual judge mode (simple + LLM)");
// Simple judge first
const simpleJudgments = results.map(r => judge.simpleJudge(r))
log(' Simple judge complete')
simpleJudgments = results.map((r) => judge.simpleJudge(r));
log(" Simple judge complete");
// LLM judge second
let llmJudgments
try {
llmJudgments = await judge.judgeResults(results)
log(' LLM judge complete')
llmJudgments = await judge.judgeResults(results);
log(" LLM judge complete");
} catch (error) {
log(` LLM judge failed: ${error}`)
log(' Falling back to simple judge only')
llmJudgments = simpleJudgments
log(` LLM judge failed: ${error}`);
log(" Falling back to simple judge only");
llmJudgments = simpleJudgments;
}
// Combine: fail if either judge says fail
judgments = simpleJudgments.map((simple, i) => {
const llm = llmJudgments.find(j => j.testId === simple.testId) || simple
const pass = simple.pass && llm.pass
const llm =
llmJudgments!.find((j) => j.testId === simple.testId) || simple;
const pass = simple.pass && llm.pass;
let reason = ''
let reason = "";
if (!pass) {
const reasons = []
if (!simple.pass) reasons.push(`Simple: ${simple.reason}`)
if (!llm.pass) reasons.push(`LLM: ${llm.reason}`)
reason = reasons.join(' | ')
const reasons = [];
if (!simple.pass) reasons.push(`Simple: ${simple.reason}`);
if (!llm.pass) reasons.push(`LLM: ${llm.reason}`);
reason = reasons.join(" | ");
} else {
reason = llm.reason || simple.reason
reason = llm.reason || simple.reason;
}
return {
testId: simple.testId,
pass,
reason
}
})
reason,
};
});
} else if (options.llm === false) {
log(' Using simple exit code check (--no-llm)')
judgments = results.map(r => judge.simpleJudge(r))
log(" Using simple exit code check (--no-llm)");
judgments = results.map((r) => judge.simpleJudge(r));
} else {
try {
judgments = await judge.judgeResults(results)
judgments = await judge.judgeResults(results);
} catch (error) {
log(` LLM judging failed, falling back to simple check: ${error}`)
judgments = results.map(r => judge.simpleJudge(r))
log(` LLM judging failed, falling back to simple check: ${error}`);
judgments = results.map((r) => judge.simpleJudge(r));
}
}
// Create reports
const reports = Reporter.createReports(results, judgments)
// Create reports (with separate verdicts in dual-judge mode)
const reports = Reporter.createReports(
results,
judgments,
simpleJudgments,
llmJudgments,
);
// Output results
switch (options.output) {
case 'json':
const json = Reporter.toJSON(reports)
case "json":
const json = Reporter.toJSON(reports);
// JSON goes to stdout (can be redirected to file)
process.stdout.write(json + '\n')
break
process.stdout.write(json + "\n");
break;
case 'junit':
const junit = Reporter.toJUnit(reports)
writeFileSync('test-results.xml', junit)
log('\nResults written to test-results.xml')
break
case "junit":
const junit = Reporter.toJUnit(reports);
writeFileSync("test-results.xml", junit);
log("\nResults written to test-results.xml");
break;
case 'console':
case "console":
default:
Reporter.toConsole(reports)
break
Reporter.toConsole(reports);
break;
}
// Summary
const passed = reports.filter(r => r.pass).length
const failed = reports.filter(r => !r.pass).length
log('\n' + '='.repeat(60))
log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`)
log('='.repeat(60))
const passed = reports.filter((r) => r.pass).length;
const failed = reports.filter((r) => !r.pass).length;
log("\n" + "=".repeat(60));
log(`SUMMARY: ${passed} passed, ${failed} failed, ${reports.length} total`);
log("=".repeat(60));
// Report to TestLink
if (options.reportTestlink && options.testlinkApiKey) {
const testlinkReporter = new TestLinkReporter(
options.testlinkUrl,
options.testlinkApiKey
)
options.testlinkApiKey,
);
// Would need plan ID and build ID
// await testlinkReporter.reportResults(reports, planId, buildId)
log('\nTestLink reporting not yet implemented')
log("\nTestLink reporting not yet implemented");
}
// Exit with appropriate code
process.exit(failed > 0 ? 1 : 0)
})
process.exit(failed > 0 ? 1 : 0);
});
program
.command('list')
.description('List all test cases')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.command("list")
.description("List all test cases")
.option("--testcases-dir <dir>", "Test cases directory", defaultTestcasesDir)
.action(async (options) => {
const loader = new TestLoader(options.testcasesDir)
const testCases = await loader.loadAll()
const loader = new TestLoader(options.testcasesDir);
const testCases = await loader.loadAll();
const grouped = loader.groupBySuite(testCases)
const grouped = loader.groupBySuite(testCases);
console.log('Available Test Cases:\n')
console.log("Available Test Cases:\n");
for (const [suite, cases] of grouped) {
console.log(`${suite.toUpperCase()}:`)
console.log(`${suite.toUpperCase()}:`);
for (const tc of cases) {
console.log(` ${tc.id}: ${tc.name}`)
console.log(` ${tc.id}: ${tc.name}`);
}
console.log()
console.log();
}
console.log(`Total: ${testCases.length} test cases`)
})
console.log(`Total: ${testCases.length} test cases`);
});
program.parse()
program.parse();

@@ -1,31 +1,71 @@
import axios from 'axios'
import { TestResult, Judgment } from './types.js'
import axios from "axios";
import { TestResult, Judgment } from "./types.js";
export class LLMJudge {
private ollamaUrl: string
private model: string
private batchSize: number
private ollamaUrl: string;
private model: string;
private batchSize: number;
constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
this.ollamaUrl = ollamaUrl
this.model = model
this.batchSize = 5 // Judge 5 tests per LLM call
constructor(
ollamaUrl: string = "http://localhost:11434",
model: string = "gemma3:4b",
) {
this.ollamaUrl = ollamaUrl;
this.model = model;
this.batchSize = 5; // Judge 5 tests per LLM call
}
private formatDuration(ms: number): string {
if (ms < 1000) return `${ms}ms`;
if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
return `${(ms / 60000).toFixed(1)}min`;
}
private buildPrompt(results: TestResult[]): string {
const testsSection = results.map((r, i) => {
return `
const testsSection = results
.map((r, i) => {
// Build step results summary with exit codes and durations
const stepsSummary = r.steps
.map((step, j) => {
const status = step.exitCode === 0 ? "PASS" : "FAIL";
const stepTimeout =
r.testCase.steps[j]?.timeout || r.testCase.timeout;
return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
})
.join("\n");
// Simple judge result
const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
const simpleResult = allStepsPassed ? "PASS" : "FAIL";
// Check if duration is within timeout
const timeoutMs = r.testCase.timeout;
const withinTimeout = r.totalDuration < timeoutMs;
const timeoutNote = withinTimeout
? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
: `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
return `
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
**Criteria:**
${r.testCase.criteria}
**Step Results:**
${stepsSummary}
**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
**Timing:** ${timeoutNote}
${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
**Execution Logs:**
\`\`\`
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
\`\`\`
`
}).join('\n---\n')
`;
})
.join("\n---\n");
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@ Important:
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
- For build/runtime tests, check exit codes and absence of error messages
- Be lenient with formatting differences, focus on semantic correctness
- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
- Long durations are acceptable as long as they are within the configured timeout
Respond ONLY with the JSON array, no other text.`
Respond ONLY with the JSON array, no other text.`;
}
async judgeResults(results: TestResult[]): Promise<Judgment[]> {
const allJudgments: Judgment[] = []
const allJudgments: Judgment[] = [];
// Process in batches
for (let i = 0; i < results.length; i += this.batchSize) {
const batch = results.slice(i, i + this.batchSize)
console.log(` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
const batch = results.slice(i, i + this.batchSize);
console.log(
` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
);
try {
const judgments = await this.judgeBatch(batch)
allJudgments.push(...judgments)
const judgments = await this.judgeBatch(batch);
allJudgments.push(...judgments);
} catch (error) {
console.error(` Failed to judge batch:`, error)
console.error(` Failed to judge batch:`, error);
// Mark all tests in batch as failed
for (const r of batch) {
allJudgments.push({
testId: r.testCase.id,
pass: false,
reason: 'LLM judgment failed: ' + String(error)
})
reason: "LLM judgment failed: " + String(error),
});
}
}
}
return allJudgments
return allJudgments;
}
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
const prompt = this.buildPrompt(results)
const prompt = this.buildPrompt(results);
const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
model: this.model,
prompt,
stream: false,
options: {
temperature: 0.1, // Low temperature for consistent judging
num_predict: 1000
}
}, {
timeout: 120000 // 2 minute timeout
})
const response = await axios.post(
`${this.ollamaUrl}/api/generate`,
{
model: this.model,
prompt,
stream: false,
options: {
temperature: 0.1, // Low temperature for consistent judging
num_predict: 1000,
},
},
{
timeout: 120000, // 2 minute timeout
},
);
const responseText = response.data.response
const responseText = response.data.response;
// Extract JSON from response
const jsonMatch = responseText.match(/\[[\s\S]*\]/)
const jsonMatch = responseText.match(/\[[\s\S]*\]/);
if (!jsonMatch) {
throw new Error('No JSON array found in LLM response')
throw new Error("No JSON array found in LLM response");
}
try {
const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
// Validate and fill missing
const resultIds = results.map(r => r.testCase.id)
const judgedIds = new Set(judgments.map(j => j.testId))
const resultIds = results.map((r) => r.testCase.id);
const judgedIds = new Set(judgments.map((j) => j.testId));
// Add missing judgments
for (const id of resultIds) {
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
judgments.push({
testId: id,
pass: false,
reason: 'No judgment provided by LLM'
})
reason: "No judgment provided by LLM",
});
}
}
return judgments
return judgments;
} catch (parseError) {
throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
throw new Error(
`Failed to parse LLM response: ${responseText.substring(0, 200)}`,
);
}
}
// Fallback: Simple rule-based judgment (no LLM)
simpleJudge(result: TestResult): Judgment {
const allStepsPassed = result.steps.every(s => s.exitCode === 0)
const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
if (allStepsPassed) {
return {
testId: result.testCase.id,
pass: true,
reason: 'All steps completed with exit code 0'
}
reason: "All steps completed with exit code 0",
};
} else {
const failedSteps = result.steps.filter(s => s.exitCode !== 0)
const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
return {
testId: result.testCase.id,
pass: false,
reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
}
reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
};
}
}
}

@@ -1,112 +1,204 @@
import axios from 'axios'
import { TestReport, Judgment, TestResult } from './types.js'
import axios from "axios";
import { TestReport, Judgment, TestResult, TestSummary } from "./types.js";
export class Reporter {
// Console reporter
static toConsole(reports: TestReport[]): void {
console.log('\n' + '='.repeat(60))
console.log('TEST RESULTS')
console.log('='.repeat(60))
console.log("\n" + "=".repeat(60));
console.log("TEST RESULTS");
console.log("=".repeat(60));
const passed = reports.filter(r => r.pass)
const failed = reports.filter(r => !r.pass)
const passed = reports.filter((r) => r.pass);
const failed = reports.filter((r) => !r.pass);
// Check if we have dual-judge data
const hasDualJudge = reports.some(
(r) => r.simplePass !== undefined && r.llmPass !== undefined,
);
for (const report of reports) {
const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m'
console.log(`[${status}] ${report.testId}: ${report.name}`)
console.log(` Reason: ${report.reason}`)
console.log(` Duration: ${report.duration}ms`)
const status = report.pass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
console.log(`[${status}] ${report.testId}: ${report.name}`);
// Show separate verdicts in dual-judge mode
if (
hasDualJudge &&
report.simplePass !== undefined &&
report.llmPass !== undefined
) {
const simpleStatus = report.simplePass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
const llmStatus = report.llmPass
? "\x1b[32mPASS\x1b[0m"
: "\x1b[31mFAIL\x1b[0m";
console.log(
` Simple: [${simpleStatus}] ${report.simpleReason || ""}`,
);
console.log(` LLM: [${llmStatus}] ${report.llmReason || ""}`);
} else {
console.log(` Reason: ${report.reason}`);
}
console.log(` Duration: ${report.duration}ms`);
}
console.log('\n' + '-'.repeat(60))
console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
console.log('='.repeat(60))
console.log("\n" + "-".repeat(60));
// Show separate summaries in dual-judge mode
if (hasDualJudge) {
const simplePassed = reports.filter((r) => r.simplePass).length;
const simpleFailed = reports.filter((r) => !r.simplePass).length;
const llmPassed = reports.filter((r) => r.llmPass).length;
const llmFailed = reports.filter((r) => !r.llmPass).length;
console.log(`Simple: ${simplePassed} passed, ${simpleFailed} failed`);
console.log(`LLM: ${llmPassed} passed, ${llmFailed} failed`);
console.log(
`Combined: ${passed.length} passed, ${failed.length} failed, ${reports.length} total`,
);
} else {
console.log(
`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`,
);
}
console.log("=".repeat(60));
}
// JSON reporter
static toJSON(reports: TestReport[]): string {
return JSON.stringify({
summary: {
total: reports.length,
passed: reports.filter(r => r.pass).length,
failed: reports.filter(r => !r.pass).length,
timestamp: new Date().toISOString()
// Check if we have dual-judge data
const hasDualJudge = reports.some(
(r) => r.simplePass !== undefined && r.llmPass !== undefined,
);
const summary: TestSummary = {
total: reports.length,
passed: reports.filter((r) => r.pass).length,
failed: reports.filter((r) => !r.pass).length,
timestamp: new Date().toISOString(),
};
// Add separate breakdowns in dual-judge mode
if (hasDualJudge) {
summary.simple = {
passed: reports.filter((r) => r.simplePass).length,
failed: reports.filter((r) => !r.simplePass).length,
};
summary.llm = {
passed: reports.filter((r) => r.llmPass).length,
failed: reports.filter((r) => !r.llmPass).length,
};
}
return JSON.stringify(
{
summary,
results: reports,
},
results: reports
}, null, 2)
null,
2,
);
}
// JUnit XML reporter (for CI/CD integration)
static toJUnit(reports: TestReport[]): string {
const escapeXml = (s: string) => s
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;')
const escapeXml = (s: string) =>
s
.replace(/&/g, "&amp;")
.replace(/</g, "&lt;")
.replace(/>/g, "&gt;")
.replace(/"/g, "&quot;")
.replace(/'/g, "&apos;");
const testcases = reports.map(r => {
if (r.pass) {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`
} else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
const testcases = reports
.map((r) => {
if (r.pass) {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`;
} else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
<failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure>
</testcase>`
}
}).join('\n')
</testcase>`;
}
})
.join("\n");
const failures = reports.filter(r => !r.pass).length
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000
const failures = reports.filter((r) => !r.pass).length;
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000;
return `<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">
${testcases}
</testsuite>`
</testsuite>`;
}
// Combine results and judgments into reports
static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] {
const judgmentMap = new Map(judgments.map(j => [j.testId, j]))
static createReports(
results: TestResult[],
judgments: Judgment[],
simpleJudgments?: Judgment[],
llmJudgments?: Judgment[],
): TestReport[] {
const judgmentMap = new Map(judgments.map((j) => [j.testId, j]));
const simpleMap = simpleJudgments
? new Map(simpleJudgments.map((j) => [j.testId, j]))
: undefined;
const llmMap = llmJudgments
? new Map(llmJudgments.map((j) => [j.testId, j]))
: undefined;
return results.map(result => {
const judgment = judgmentMap.get(result.testCase.id)
return results.map((result) => {
const judgment = judgmentMap.get(result.testCase.id);
const simple = simpleMap?.get(result.testCase.id);
const llm = llmMap?.get(result.testCase.id);
return {
const report: TestReport = {
testId: result.testCase.id,
name: result.testCase.name,
suite: result.testCase.suite,
pass: judgment?.pass ?? false,
reason: judgment?.reason ?? 'No judgment',
reason: judgment?.reason ?? "No judgment",
duration: result.totalDuration,
logs: result.logs
logs: result.logs,
};
// Add separate verdicts if available (dual-judge mode)
if (simple && llm) {
report.simplePass = simple.pass;
report.simpleReason = simple.reason;
report.llmPass = llm.pass;
report.llmReason = llm.reason;
}
})
return report;
});
}
}
// TestLink reporter
export class TestLinkReporter {
private url: string
private apiKey: string
private url: string;
private apiKey: string;
constructor(url: string, apiKey: string) {
this.url = url
this.apiKey = apiKey
this.url = url;
this.apiKey = apiKey;
}
async reportResults(
reports: TestReport[],
planId: string,
buildId: string
buildId: string,
): Promise<void> {
console.log('\nReporting to TestLink...')
console.log("\nReporting to TestLink...");
for (const report of reports) {
try {
await this.reportTestExecution(report, planId, buildId)
console.log(` Reported: ${report.testId}`)
await this.reportTestExecution(report, planId, buildId);
console.log(` Reported: ${report.testId}`);
} catch (error) {
console.error(` Failed to report ${report.testId}:`, error)
console.error(` Failed to report ${report.testId}:`, error);
}
}
}
@@ -114,12 +206,12 @@ export class TestLinkReporter {
private async reportTestExecution(
report: TestReport,
planId: string,
buildId: string
buildId: string,
): Promise<void> {
// Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
// This would need to be mapped from TestLink
const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
const status = report.pass ? "p" : "f"; // p=passed, f=failed, b=blocked
// Note: This uses the TestLink XML-RPC API
// In practice, you'd use the testlink-mcp or direct API calls
@@ -129,10 +221,10 @@ export class TestLinkReporter {
testplanid: planId,
buildid: buildId,
status,
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
}
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`,
};
// For now, just log - actual implementation would call TestLink API
console.log(` Would report: ${report.testId} = ${status}`)
console.log(` Would report: ${report.testId} = ${status}`);
}
}

@@ -1,66 +1,88 @@
// Test case definition
export interface TestStep {
name: string
command: string
timeout?: number
name: string;
command: string;
timeout?: number;
}
export interface TestCase {
id: string
name: string
suite: string
priority: number
timeout: number
dependencies: string[]
steps: TestStep[]
criteria: string
id: string;
name: string;
suite: string;
priority: number;
timeout: number;
dependencies: string[];
steps: TestStep[];
criteria: string;
}
// Execution results
export interface StepResult {
name: string
command: string
stdout: string
stderr: string
exitCode: number
duration: number
name: string;
command: string;
stdout: string;
stderr: string;
exitCode: number;
duration: number;
}
export interface TestResult {
testCase: TestCase
steps: StepResult[]
totalDuration: number
logs: string
testCase: TestCase;
steps: StepResult[];
totalDuration: number;
logs: string;
}
// LLM judgment
export interface Judgment {
testId: string
pass: boolean
reason: string
testId: string;
pass: boolean;
reason: string;
}
// Final report
export interface TestReport {
testId: string
name: string
suite: string
pass: boolean
reason: string
duration: number
logs: string
testId: string;
name: string;
suite: string;
pass: boolean;
reason: string;
duration: number;
logs: string;
// Separate verdicts for dual-judge mode
simplePass?: boolean;
simpleReason?: string;
llmPass?: boolean;
llmReason?: string;
}
// Summary with separate judge breakdowns
export interface TestSummary {
total: number;
passed: number;
failed: number;
timestamp: string;
// Separate breakdowns (only present in dual-judge mode)
simple?: {
passed: number;
failed: number;
};
llm?: {
passed: number;
failed: number;
};
}
// Runner options
export interface RunnerOptions {
suite?: string
id?: string
workers: number
dryRun: boolean
output: 'console' | 'json' | 'junit'
reportTestlink: boolean
ollamaUrl: string
ollamaModel: string
testlinkUrl: string
testlinkApiKey: string
suite?: string;
id?: string;
workers: number;
dryRun: boolean;
output: "console" | "json" | "junit";
reportTestlink: boolean;
ollamaUrl: string;
ollamaModel: string;
testlinkUrl: string;
testlinkApiKey: string;
}