mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-20 04:37:00 +00:00
- Add step results, timing context, and build notes to LLM prompt
- LLM now sees exit codes, durations, and simple judge result
- Add guidance that long build times within timeout are acceptable
- Add separate simple/LLM verdict tracking in dual-judge mode
- Console output shows both Simple and LLM pass/fail status
- JSON summary includes separate simple/llm breakdown
- Each test report includes simplePass/llmPass fields

This helps distinguish between simple judge failures (exit code != 0) and LLM judge failures (semantic analysis), making debugging easier.
197 lines
6.2 KiB
TypeScript
197 lines
6.2 KiB
TypeScript
import axios from "axios";
|
|
import { TestResult, Judgment } from "./types.js";
|
|
|
|
export class LLMJudge {
|
|
private ollamaUrl: string;
|
|
private model: string;
|
|
private batchSize: number;
|
|
|
|
constructor(
|
|
ollamaUrl: string = "http://localhost:11434",
|
|
model: string = "gemma3:4b",
|
|
) {
|
|
this.ollamaUrl = ollamaUrl;
|
|
this.model = model;
|
|
this.batchSize = 5; // Judge 5 tests per LLM call
|
|
}
|
|
|
|
private formatDuration(ms: number): string {
|
|
if (ms < 1000) return `${ms}ms`;
|
|
if (ms < 60000) return `${(ms / 1000).toFixed(1)}s`;
|
|
return `${(ms / 60000).toFixed(1)}min`;
|
|
}
|
|
|
|
private buildPrompt(results: TestResult[]): string {
|
|
const testsSection = results
|
|
.map((r, i) => {
|
|
// Build step results summary with exit codes and durations
|
|
const stepsSummary = r.steps
|
|
.map((step, j) => {
|
|
const status = step.exitCode === 0 ? "PASS" : "FAIL";
|
|
const stepTimeout =
|
|
r.testCase.steps[j]?.timeout || r.testCase.timeout;
|
|
return ` ${j + 1}. "${step.name}" - ${status} (exit: ${step.exitCode}, duration: ${this.formatDuration(step.duration)}, timeout: ${this.formatDuration(stepTimeout)})`;
|
|
})
|
|
.join("\n");
|
|
|
|
// Simple judge result
|
|
const allStepsPassed = r.steps.every((s) => s.exitCode === 0);
|
|
const simpleResult = allStepsPassed ? "PASS" : "FAIL";
|
|
|
|
// Check if duration is within timeout
|
|
const timeoutMs = r.testCase.timeout;
|
|
const withinTimeout = r.totalDuration < timeoutMs;
|
|
const timeoutNote = withinTimeout
|
|
? `Total duration ${this.formatDuration(r.totalDuration)} is within timeout of ${this.formatDuration(timeoutMs)}.`
|
|
: `Total duration ${this.formatDuration(r.totalDuration)} exceeded timeout of ${this.formatDuration(timeoutMs)}.`;
|
|
|
|
return `
|
|
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
|
|
|
|
**Criteria:**
|
|
${r.testCase.criteria}
|
|
|
|
**Step Results:**
|
|
${stepsSummary}
|
|
|
|
**Simple Judge Result:** ${simpleResult} (${allStepsPassed ? "all steps exit code 0" : "some steps failed"})
|
|
|
|
**Timing:** ${timeoutNote}
|
|
${r.testCase.suite === "build" ? "Note: Long build times are expected for CUDA compilation on older GPUs." : ""}
|
|
|
|
**Execution Logs:**
|
|
\`\`\`
|
|
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? "\n... (truncated)" : ""}
|
|
\`\`\`
|
|
`;
|
|
})
|
|
.join("\n---\n");
|
|
|
|
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
|
|
|
|
For each test, examine:
|
|
1. The expected criteria
|
|
2. The actual execution logs (stdout, stderr, exit codes)
|
|
3. Whether the output meets the criteria (use fuzzy matching for AI outputs)
|
|
|
|
${testsSection}
|
|
|
|
Respond with a JSON array containing one object per test:
|
|
[
|
|
{"testId": "TC-XXX-001", "pass": true, "reason": "Brief explanation"},
|
|
{"testId": "TC-XXX-002", "pass": false, "reason": "Brief explanation"}
|
|
]
|
|
|
|
Important:
|
|
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
|
|
- For build/runtime tests, check exit codes and absence of error messages
|
|
- Be lenient with formatting differences, focus on semantic correctness
|
|
- If the Simple Judge Result is PASS and duration is within timeout, the test should generally pass unless there are clear errors in the logs
|
|
- Long durations are acceptable as long as they are within the configured timeout
|
|
|
|
Respond ONLY with the JSON array, no other text.`;
|
|
}
|
|
|
|
async judgeResults(results: TestResult[]): Promise<Judgment[]> {
|
|
const allJudgments: Judgment[] = [];
|
|
|
|
// Process in batches
|
|
for (let i = 0; i < results.length; i += this.batchSize) {
|
|
const batch = results.slice(i, i + this.batchSize);
|
|
console.log(
|
|
` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`,
|
|
);
|
|
|
|
try {
|
|
const judgments = await this.judgeBatch(batch);
|
|
allJudgments.push(...judgments);
|
|
} catch (error) {
|
|
console.error(` Failed to judge batch:`, error);
|
|
// Mark all tests in batch as failed
|
|
for (const r of batch) {
|
|
allJudgments.push({
|
|
testId: r.testCase.id,
|
|
pass: false,
|
|
reason: "LLM judgment failed: " + String(error),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
return allJudgments;
|
|
}
|
|
|
|
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
|
|
const prompt = this.buildPrompt(results);
|
|
|
|
const response = await axios.post(
|
|
`${this.ollamaUrl}/api/generate`,
|
|
{
|
|
model: this.model,
|
|
prompt,
|
|
stream: false,
|
|
options: {
|
|
temperature: 0.1, // Low temperature for consistent judging
|
|
num_predict: 1000,
|
|
},
|
|
},
|
|
{
|
|
timeout: 120000, // 2 minute timeout
|
|
},
|
|
);
|
|
|
|
const responseText = response.data.response;
|
|
|
|
// Extract JSON from response
|
|
const jsonMatch = responseText.match(/\[[\s\S]*\]/);
|
|
if (!jsonMatch) {
|
|
throw new Error("No JSON array found in LLM response");
|
|
}
|
|
|
|
try {
|
|
const judgments = JSON.parse(jsonMatch[0]) as Judgment[];
|
|
|
|
// Validate and fill missing
|
|
const resultIds = results.map((r) => r.testCase.id);
|
|
const judgedIds = new Set(judgments.map((j) => j.testId));
|
|
|
|
// Add missing judgments
|
|
for (const id of resultIds) {
|
|
if (!judgedIds.has(id)) {
|
|
judgments.push({
|
|
testId: id,
|
|
pass: false,
|
|
reason: "No judgment provided by LLM",
|
|
});
|
|
}
|
|
}
|
|
|
|
return judgments;
|
|
} catch (parseError) {
|
|
throw new Error(
|
|
`Failed to parse LLM response: ${responseText.substring(0, 200)}`,
|
|
);
|
|
}
|
|
}
|
|
|
|
// Fallback: Simple rule-based judgment (no LLM)
|
|
simpleJudge(result: TestResult): Judgment {
|
|
const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
|
|
|
|
if (allStepsPassed) {
|
|
return {
|
|
testId: result.testCase.id,
|
|
pass: true,
|
|
reason: "All steps completed with exit code 0",
|
|
};
|
|
} else {
|
|
const failedSteps = result.steps.filter((s) => s.exitCode !== 0);
|
|
return {
|
|
testId: result.testCase.id,
|
|
pass: false,
|
|
reason: `Steps failed: ${failedSteps.map((s) => s.name).join(", ")}`,
|
|
};
|
|
}
|
|
}
|
|
}
|