Enhance LLM judge prompt and add separate verdict display

- Add step results, timing context, and build notes to LLM prompt
- LLM now sees exit codes, step durations, and the simple judge's verdict
- Add guidance that long build times within the timeout are acceptable

- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields (see the sketch below)

This helps distinguish between simple judge failures (exit code != 0)
and LLM judge failures (semantic analysis), making debugging easier.
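For illustration, the summary breakdown and per-test fields might be shaped like this (a hypothetical sketch; only simplePass/llmPass and the simple/llm split are named by this commit, all other names are assumed):

```typescript
// Hypothetical shapes for the dual-judge output; only simplePass/llmPass
// and the simple/llm breakdown are taken from the commit message.
interface TestReport {
  testId: string;
  simplePass: boolean; // simple judge: every step exited with code 0
  llmPass: boolean; // LLM judge: semantic analysis of logs vs. criteria
}

interface JudgeSummary {
  simple: { passed: number; failed: number };
  llm: { passed: number; failed: number };
  reports: TestReport[];
}
```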
commit e06deff40f
parent 1e99c1bb50
Author: Shang Chieh Tseng
Date:   2025-12-17 15:04:05 +08:00

4 changed files with 465 additions and 273 deletions


@@ -1,31 +1,71 @@
-import axios from 'axios'
-import { TestResult, Judgment } from './types.js'
+import axios from "axios";
+import { TestResult, Judgment } from "./types.js";
 
 export class LLMJudge {
-  private ollamaUrl: string
-  private model: string
-  private batchSize: number
+  private ollamaUrl: string;
+  private model: string;
+  private batchSize: number;
 
-  constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
-    this.ollamaUrl = ollamaUrl
-    this.model = model
-    this.batchSize = 5 // Judge 5 tests per LLM call
+  constructor(
+    ollamaUrl: string = "http://localhost:11434",
+    model: string = "gemma3:4b",
+  ) {
+    this.ollamaUrl = ollamaUrl;
+    this.model = model;
+    this.batchSize = 5; // Judge 5 tests per LLM call
   }
 
+  private formatDuration(ms: number): string {
+    if (ms < 1000) return `${ms}ms`;
+    if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
+    return `${(ms / 60000).toFixed(1)}min`;
+  }
+
   private buildPrompt(results: TestResult[]): string {
-    const testsSection = results.map((r, i) => {
-      return `
+    const testsSection = results
+      .map((r, i) => {
+        // Build step results summary with exit codes and durations
+        const stepsSummary = r.steps
+          .map((step, j) => {
+            const status = step.exitCode === 0 ? "PASS" : "FAIL";
+            const stepTimeout =
+              r.testCase.steps[j]?.timeout || r.testCase.timeout;
+            return `  ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
+          })
+          .join("\n");
+
+        // Simple judge result
+        const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
+        const simpleResult = allStepsPassed ? "PASS" : "FAIL";
+
+        // Check if duration is within timeout
+        const timeoutMs = r.testCase.timeout;
+        const withinTimeout = r.totalDuration < timeoutMs;
+        const timeoutNote = withinTimeout
+          ? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
+          : `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
+
+        return `
 ### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
 
 **Criteria:**
 ${r.testCase.criteria}
 
+**Step Results:**
+${stepsSummary}
+
+**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
+
+**Timing:** ${timeoutNote}
+${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
+
 **Execution Logs:**
 \`\`\`
-${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
+${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
 \`\`\`
-`
-    }).join('\n---\n')
+`;
+      })
+      .join("\n---\n");
 
     return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
@@ -46,66 +86,74 @@ Important:
 - For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
 - For build/runtime tests, check exit codes and absence of error messages
 - Be lenient with formatting differences, focus on semantic correctness
+- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
+- Long durations are acceptable as long as they are within the configured timeout
 
-Respond ONLY with the JSON array, no other text.`
+Respond ONLY with the JSON array, no other text.`;
   }
 
   async judgeResults(results: TestResult[]): Promise<Judgment[]> {
-    const allJudgments: Judgment[] = []
+    const allJudgments: Judgment[] = [];
 
     // Process in batches
     for (let i = 0; i < results.length; i += this.batchSize) {
-      const batch = results.slice(i, i + this.batchSize)
-      console.log(`  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
+      const batch = results.slice(i, i + this.batchSize);
+      console.log(
+        `  Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
+      );
 
       try {
-        const judgments = await this.judgeBatch(batch)
-        allJudgments.push(...judgments)
+        const judgments = await this.judgeBatch(batch);
+        allJudgments.push(...judgments);
       } catch (error) {
-        console.error(`  Failed to judge batch:`, error)
+        console.error(`  Failed to judge batch:`, error);
         // Mark all tests in batch as failed
         for (const r of batch) {
          allJudgments.push({
            testId: r.testCase.id,
            pass: false,
-            reason: 'LLM judgment failed: ' + String(error)
-          })
+            reason: "LLM judgment failed: " + String(error),
+          });
        }
      }
    }
 
-    return allJudgments
+    return allJudgments;
  }
 
  private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
-    const prompt = this.buildPrompt(results)
+    const prompt = this.buildPrompt(results);
 
-    const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
-      model: this.model,
-      prompt,
-      stream: false,
-      options: {
-        temperature: 0.1, // Low temperature for consistent judging
-        num_predict: 1000
-      }
-    }, {
-      timeout: 120000 // 2 minute timeout
-    })
+    const response = await axios.post(
+      `${this.ollamaUrl}/api/generate`,
+      {
+        model: this.model,
+        prompt,
+        stream: false,
+        options: {
+          temperature: 0.1, // Low temperature for consistent judging
+          num_predict: 1000,
+        },
+      },
+      {
+        timeout: 120000, // 2 minute timeout
+      },
+    );
 
-    const responseText = response.data.response
+    const responseText = response.data.response;
 
    // Extract JSON from response
-    const jsonMatch = responseText.match(/\[[\s\S]*\]/)
+    const jsonMatch = responseText.match(/\[[\s\S]*\]/);
    if (!jsonMatch) {
-      throw new Error('No JSON array found in LLM response')
+      throw new Error("No JSON array found in LLM response");
    }
 
    try {
-      const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
+      const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
 
      // Validate and fill missing
-      const resultIds = results.map(r => r.testCase.id)
-      const judgedIds = new Set(judgments.map(j => j.testId))
+      const resultIds = results.map((r) => r.testCase.id);
+      const judgedIds = new Set(judgments.map((j) => j.testId));
 
      // Add missing judgments
      for (const id of resultIds) {
@@ -113,34 +161,36 @@ Respond ONLY with the JSON array, no other text.`
          judgments.push({
            testId: id,
            pass: false,
-            reason: 'No judgment provided by LLM'
-          })
+            reason: "No judgment provided by LLM",
+          });
        }
      }
 
-      return judgments
+      return judgments;
    } catch (parseError) {
-      throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
+      throw new Error(
+        `Failed to parse LLM response: ${responseText.substring(0, 200)}`,
+      );
    }
  }
 
  // Fallback: Simple rule-based judgment (no LLM)
  simpleJudge(result: TestResult): Judgment {
-    const allStepsPassed = result.steps.every(s => s.exitCode === 0)
+    const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
 
    if (allStepsPassed) {
      return {
        testId: result.testCase.id,
        pass: true,
-        reason: 'All steps completed with exit code 0'
-      }
+        reason: "All steps completed with exit code 0",
+      };
    } else {
-      const failedSteps = result.steps.filter(s => s.exitCode !== 0)
+      const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
      return {
        testId: result.testCase.id,
        pass: false,
-        reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
-      }
+        reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
+      };
    }
  }
}
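
For completeness, a minimal dual-judge usage sketch for the class above (the import path and wiring are assumptions; TestResult comes from ./types.js as in the diff):

```typescript
import { LLMJudge } from "./llm-judge.js"; // path is an assumption
import { TestResult } from "./types.js";

// Run both judges and record separate verdicts per test.
async function dualJudge(results: TestResult[]) {
  const judge = new LLMJudge(); // defaults: http://localhost:11434, gemma3:4b
  // Simple judge is purely exit-code based; no LLM call involved.
  const simpleVerdicts = results.map((r) => judge.simpleJudge(r));
  // LLM judge batches tests through Ollama; a failed batch yields FAIL
  // judgments for every test in that batch.
  const llmVerdicts = await judge.judgeResults(results);
  return results.map((r, i) => ({
    testId: r.testCase.id,
    simplePass: simpleVerdicts[i].pass,
    llmPass:
      llmVerdicts.find((j) => j.testId === r.testCase.id)?.pass ?? false,
  }));
}
```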