Add LogCollector for precise test log boundaries

Problem: Tests used `docker compose logs --since=5m` which caused:
- Log overlap between tests
- Logs from previous tests included
- Missing logs if test exceeded 5 minutes

Solution:
- New LogCollector class runs `docker compose logs --follow`
- Marks test start/end boundaries
- Writes test-specific logs to /tmp/test-{testId}-logs.txt
- Test steps access via TEST_ID environment variable

Changes:
- tests/src/log-collector.ts: New LogCollector class
- tests/src/executor.ts: Integrate LogCollector, set TEST_ID env
- tests/src/cli.ts: Start/stop LogCollector for runtime/inference
- All test cases: Use log collector with fallback to docker compose

Also updated docs/CICD.md with:
- Test runner CLI documentation
- Judge modes (simple, llm, dual)
- Log collector integration
- Updated test case list (12b, 27b models)
- Model unload strategy
- Troubleshooting guide

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Shang Chieh Tseng
2025-12-17 17:46:49 +08:00
parent 82ab6cc96e
commit 2c5094db92
12 changed files with 702 additions and 272 deletions

View File

@@ -9,6 +9,7 @@ import { TestExecutor } from "./executor.js";
import { LLMJudge } from "./judge.js";
import { Reporter, TestLinkReporter } from "./reporter.js";
import { RunnerOptions, Judgment } from "./types.js";
import { LogCollector } from "./log-collector.js";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
@@ -68,9 +69,12 @@ program
log("=".repeat(60));
const loader = new TestLoader(options.testcasesDir);
const executor = new TestExecutor(path.join(__dirname, "..", ".."));
const projectRoot = path.join(__dirname, "..", "..");
const judge = new LLMJudge(options.judgeUrl, options.judgeModel);
// LogCollector will be created if running runtime or inference tests
let logCollector: LogCollector | undefined;
// Load test cases
log("\nLoading test cases...");
let testCases = await loader.loadAll();
@@ -107,10 +111,38 @@ program
process.exit(0);
}
// Check if we need log collector (runtime or inference tests)
const needsLogCollector = testCases.some(
(tc) => tc.suite === "runtime" || tc.suite === "inference"
);
if (needsLogCollector) {
log("\nStarting log collector...");
logCollector = new LogCollector(projectRoot);
try {
await logCollector.start();
log(" Log collector started");
} catch (error) {
log(` Warning: Log collector failed to start: ${error}`);
log(" Tests will run without precise log collection");
logCollector = undefined;
}
}
// Create executor with optional log collector
const executor = new TestExecutor(projectRoot, logCollector);
// Execute tests (progress goes to stderr via executor)
const workers = parseInt(options.workers);
const results = await executor.executeAll(testCases, workers);
// Stop log collector if running
if (logCollector) {
log("\nStopping log collector...");
await logCollector.stop();
log(" Log collector stopped");
}
// Judge results
log("\nJudging results...");
let judgments: Judgment[];

View File

@@ -1,6 +1,7 @@
import { exec } from 'child_process'
import { promisify } from 'util'
import { TestCase, TestResult, StepResult } from './types.js'
import { LogCollector } from './log-collector.js'
const execAsync = promisify(exec)
@@ -17,9 +18,12 @@ export class TestExecutor {
private workingDir: string
private totalTests: number = 0
private currentTest: number = 0
private logCollector: LogCollector | null = null
private currentTestId: string | null = null
constructor(workingDir: string = process.cwd()) {
constructor(workingDir: string = process.cwd(), logCollector?: LogCollector) {
this.workingDir = workingDir
this.logCollector = logCollector || null
}
// Progress output goes to stderr (visible in console)
@@ -34,12 +38,24 @@ export class TestExecutor {
let exitCode = 0
let timedOut = false
// Build environment with TEST_ID for log access
const env = { ...process.env }
if (this.currentTestId) {
env.TEST_ID = this.currentTestId
}
// Update log file before each step so test can access current logs
if (this.logCollector && this.currentTestId) {
this.logCollector.writeCurrentLogs(this.currentTestId)
}
try {
const result = await execAsync(command, {
cwd: this.workingDir,
timeout,
maxBuffer: 10 * 1024 * 1024, // 10MB buffer
shell: '/bin/bash'
shell: '/bin/bash',
env
})
stdout = result.stdout
stderr = result.stderr
@@ -77,8 +93,14 @@ export class TestExecutor {
const timestamp = new Date().toISOString().substring(11, 19)
this.currentTest++
this.currentTestId = testCase.id
this.progress(`[${timestamp}] [${this.currentTest}/${this.totalTests}] ${testCase.id}: ${testCase.name}`)
// Mark test start for log collection
if (this.logCollector) {
this.logCollector.markTestStart(testCase.id)
}
for (let i = 0; i < testCase.steps.length; i++) {
const step = testCase.steps[i]
const stepTimestamp = new Date().toISOString().substring(11, 19)
@@ -106,6 +128,12 @@ export class TestExecutor {
const totalDuration = Date.now() - startTime
// Mark test end for log collection
if (this.logCollector) {
this.logCollector.markTestEnd(testCase.id)
}
this.currentTestId = null
// Combine all logs
const logs = stepResults.map(r => {
return `=== Step: ${r.name} ===

200
tests/src/log-collector.ts Normal file
View File

@@ -0,0 +1,200 @@
import { spawn, ChildProcess } from "child_process";
import { writeFileSync } from "fs";
import path from "path";
interface LogEntry {
timestamp: Date;
line: string;
}
interface TestMarker {
start: number;
end?: number;
}
/**
* LogCollector runs `docker compose logs --follow` as a background process
* and captures logs with precise test boundaries.
*
* This solves the log overlap problem where `docker compose logs --since=5m`
* could include logs from previous tests or miss logs if a test exceeds 5 minutes.
*/
export class LogCollector {
private process: ChildProcess | null = null;
private logs: LogEntry[] = [];
private testMarkers: Map<string, TestMarker> = new Map();
private workingDir: string;
private isRunning: boolean = false;
constructor(workingDir: string) {
// workingDir should be the project root, docker-compose.yml is in docker/
this.workingDir = path.join(workingDir, "docker");
}
/**
* Start the log collector background process
*/
async start(): Promise<void> {
if (this.isRunning) {
return;
}
return new Promise((resolve, reject) => {
// Spawn docker compose logs --follow
this.process = spawn("docker", ["compose", "logs", "--follow", "--timestamps"], {
cwd: this.workingDir,
stdio: ["ignore", "pipe", "pipe"],
});
this.isRunning = true;
// Handle stdout (main log output)
this.process.stdout?.on("data", (data: Buffer) => {
const lines = data.toString().split("\n").filter((l) => l.trim());
for (const line of lines) {
this.logs.push({
timestamp: new Date(),
line: line,
});
}
});
// Handle stderr (docker compose messages)
this.process.stderr?.on("data", (data: Buffer) => {
const lines = data.toString().split("\n").filter((l) => l.trim());
for (const line of lines) {
this.logs.push({
timestamp: new Date(),
line: `[stderr] ${line}`,
});
}
});
this.process.on("error", (err) => {
this.isRunning = false;
reject(err);
});
this.process.on("close", (code) => {
this.isRunning = false;
});
// Give it a moment to start
setTimeout(() => {
if (this.isRunning) {
resolve();
} else {
reject(new Error("Log collector failed to start"));
}
}, 500);
});
}
/**
* Stop the log collector background process
*/
async stop(): Promise<void> {
if (!this.process || !this.isRunning) {
return;
}
return new Promise((resolve) => {
this.process!.on("close", () => {
this.isRunning = false;
resolve();
});
this.process!.kill("SIGTERM");
// Force kill after 5 seconds
setTimeout(() => {
if (this.isRunning && this.process) {
this.process.kill("SIGKILL");
}
resolve();
}, 5000);
});
}
/**
* Mark the start of a test - records current log index
*/
markTestStart(testId: string): void {
this.testMarkers.set(testId, {
start: this.logs.length,
});
}
/**
* Mark the end of a test - records current log index and writes logs to file
*/
markTestEnd(testId: string): void {
const marker = this.testMarkers.get(testId);
if (marker) {
marker.end = this.logs.length;
// Write logs to temp file for test steps to access
const logs = this.getLogsForTest(testId);
const filePath = `/tmp/test-${testId}-logs.txt`;
writeFileSync(filePath, logs);
}
}
/**
* Get logs for a specific test (between start and end markers)
*/
getLogsForTest(testId: string): string {
const marker = this.testMarkers.get(testId);
if (!marker) {
return "";
}
const endIndex = marker.end ?? this.logs.length;
const testLogs = this.logs.slice(marker.start, endIndex);
return testLogs.map((entry) => entry.line).join("\n");
}
/**
* Get all logs collected so far
*/
getAllLogs(): string {
return this.logs.map((entry) => entry.line).join("\n");
}
/**
* Get logs since a specific index
*/
getLogsSince(startIndex: number): string {
return this.logs.slice(startIndex).map((entry) => entry.line).join("\n");
}
/**
* Get current log count (useful for tracking)
*/
getLogCount(): number {
return this.logs.length;
}
/**
* Check if the collector is running
*/
isActive(): boolean {
return this.isRunning;
}
/**
* Write current test logs to file (call during test execution)
* This allows test steps to access logs accumulated so far
*/
writeCurrentLogs(testId: string): void {
const marker = this.testMarkers.get(testId);
if (!marker) {
return;
}
const testLogs = this.logs.slice(marker.start);
const filePath = `/tmp/test-${testId}-logs.txt`;
writeFileSync(filePath, testLogs.map((entry) => entry.line).join("\n"));
}
}

View File

@@ -27,8 +27,12 @@ steps:
- name: Verify model loading in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Model Loading Check ==="
@@ -58,8 +62,12 @@ steps:
- name: Verify llama runner started
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Llama Runner Check ==="
@@ -74,8 +82,12 @@ steps:
- name: Check for model loading errors
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Model Loading Error Check ==="
@@ -97,8 +109,12 @@ steps:
- name: Display model memory allocation from logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Model Memory Allocation ==="
echo "$LOGS" | grep -E '(model weights|kv cache|compute graph|total memory).*device=' | tail -8

View File

@@ -17,8 +17,12 @@ steps:
- name: Check for inference errors in logs
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== Inference Error Check ==="
@@ -47,8 +51,12 @@ steps:
- name: Verify inference request in logs
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== Inference Request Verification ==="
@@ -69,8 +77,12 @@ steps:
- name: Display recent CUDA activity from logs
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== Recent CUDA Activity ==="
echo "$LOGS" | grep -iE "(CUDA|cuda|device=CUDA)" | tail -5 || echo "No recent CUDA activity logged"

View File

@@ -22,8 +22,12 @@ steps:
- name: Verify API requests logged successfully
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== API Request Log Verification ==="
@@ -40,8 +44,12 @@ steps:
- name: Check for API errors in logs
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== API Error Check ==="
@@ -64,8 +72,12 @@ steps:
- name: Display API response times from logs
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== API Response Times ==="

View File

@@ -27,8 +27,12 @@ steps:
- name: Verify model loaded to GPU
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== Model Loading Check for gemma3:12b ==="
@@ -63,8 +67,12 @@ steps:
- name: Check for inference errors
command: |
cd docker
LOGS=$(docker compose logs --since=5m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
fi
echo "=== Inference Error Check ==="

View File

@@ -39,8 +39,12 @@ steps:
- name: Verify model loaded across GPUs
command: |
cd docker
LOGS=$(docker compose logs --since=10m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=10m 2>&1)
fi
echo "=== Model Loading Check for gemma3:27b ==="
@@ -96,8 +100,12 @@ steps:
- name: Check for inference errors
command: |
cd docker
LOGS=$(docker compose logs --since=10m 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs --since=10m 2>&1)
fi
echo "=== Inference Error Check ==="

View File

@@ -22,12 +22,21 @@ steps:
- name: Capture startup logs
command: |
cd docker && docker compose logs 2>&1 | head -100
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
head -100 /tmp/test-${TEST_ID}-logs.txt
else
cd docker && docker compose logs 2>&1 | head -100
fi
- name: Check for startup errors in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
# Check for critical errors
if echo "$LOGS" | grep -qE "(level=ERROR|CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"; then

View File

@@ -30,8 +30,12 @@ steps:
- name: Verify GPU detection in Ollama logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== GPU Detection Check ==="
@@ -60,8 +64,12 @@ steps:
- name: Check for GPU-related errors in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== GPU Error Check ==="
@@ -82,8 +90,12 @@ steps:
- name: Display GPU memory status from logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== GPU Memory Status ==="
echo "$LOGS" | grep -E "gpu memory.*library=CUDA" | tail -4

View File

@@ -30,8 +30,12 @@ steps:
- name: Verify server listening in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Server Status Check ==="
@@ -46,8 +50,12 @@ steps:
- name: Check for runtime errors in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== Runtime Error Check ==="
@@ -71,8 +79,12 @@ steps:
- name: Verify API request handling in logs
command: |
cd docker
LOGS=$(docker compose logs 2>&1)
# Use log collector file if available, fallback to docker compose logs
if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
else
LOGS=$(cd docker && docker compose logs 2>&1)
fi
echo "=== API Request Logs ==="