Add dual-judge mode to test runner

New options:
- --dual-judge: Run both simple and LLM judge, fail if either fails
- --judge-url: Separate LLM Judge server URL (default: localhost:11435)
- --judge-model: Model for LLM judging (default: gemma3:4b)

Dual judge logic:
- Simple judge checks exit codes
- LLM judge analyzes logs semantically
- Final result: FAIL if either judge says FAIL
- Combines reasons from both judges on failure

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-17 19:27:00 +00:00 · 2025-12-15 22:58:28 +08:00
parent 6b84acd7d7
commit c2f4f378cc
1 changed file with 49 additions and 5 deletions
--- a/tests/src/cli.ts
+++ b/tests/src/cli.ts
@@ -32,11 +32,13 @@ program
  .option('-d, --dry-run', 'Show what would be executed without running')
  .option('-o, --output <format>', 'Output format: console, json, junit', 'console')
  .option('--report-testlink', 'Report results to TestLink')
-  .option('--ollama-url <url>', 'Ollama server URL', 'http://localhost:11434')
+  .option('--ollama-url <url>', 'Ollama server URL (test subject)', 'http://localhost:11434')
-  .option('--ollama-model <model>', 'Ollama model for judging', 'gemma3:4b')
+  .option('--judge-url <url>', 'LLM Judge server URL (separate instance)', 'http://localhost:11435')
  .option('--judge-model <model>', 'Model for LLM judging', 'gemma3:4b')
  .option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
  .option('--testlink-api-key <key>', 'TestLink API key')
-  .option('--no-llm', 'Skip LLM judging, use simple exit code check')
+  .option('--no-llm', 'Skip LLM judging, use simple exit code check only')
  .option('--dual-judge', 'Use both simple and LLM judge (fail if either fails)')
  .option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
  .action(async (options) => {
    log('='.repeat(60))
@@ -45,7 +47,7 @@ program
    const loader = new TestLoader(options.testcasesDir)
    const executor = new TestExecutor(path.join(__dirname, '..', '..'))
-    const judge = new LLMJudge(options.ollamaUrl, options.ollamaModel)
+    const judge = new LLMJudge(options.judgeUrl, options.judgeModel)
    // Load test cases
    log('\nLoading test cases...')
@@ -90,7 +92,49 @@ program
    // Judge results
    log('\nJudging results...')
    let judgments
-    if (options.llm === false) {
+
    if (options.dualJudge) {
      // Dual judge mode: run both simple and LLM, fail if either fails
      log('  Using dual judge mode (simple + LLM)')
      // Simple judge first
      const simpleJudgments = results.map(r => judge.simpleJudge(r))
      log('  Simple judge complete')
      // LLM judge second
      let llmJudgments
      try {
        llmJudgments = await judge.judgeResults(results)
        log('  LLM judge complete')
      } catch (error) {
        log(`  LLM judge failed: ${error}`)
        log('  Falling back to simple judge only')
        llmJudgments = simpleJudgments
      }
      // Combine: fail if either judge says fail
      judgments = simpleJudgments.map((simple, i) => {
        const llm = llmJudgments.find(j => j.testId === simple.testId) || simple
        const pass = simple.pass && llm.pass
        let reason = ''
        if (!pass) {
          const reasons = []
          if (!simple.pass) reasons.push(`Simple: ${simple.reason}`)
          if (!llm.pass) reasons.push(`LLM: ${llm.reason}`)
          reason = reasons.join(' | ')
        } else {
          reason = llm.reason || simple.reason
        }
        return {
          testId: simple.testId,
          pass,
          reason
        }
      })
    } else if (options.llm === false) {
      log('  Using simple exit code check (--no-llm)')
      judgments = results.map(r => judge.simpleJudge(r))
    } else {