Add LogCollector for precise test log boundaries

Problem: Tests used `docker compose logs --since=5m` which caused: - Log overlap between tests - Logs from previous tests included - Missing logs if test exceeded 5 minutes Solution: - New LogCollector class runs `docker compose logs --follow` - Marks test start/end boundaries - Writes test-specific logs to /tmp/test-{testId}-logs.txt - Test steps access via TEST_ID environment variable Changes: - tests/src/log-collector.ts: New LogCollector class - tests/src/executor.ts: Integrate LogCollector, set TEST_ID env - tests/src/cli.ts: Start/stop LogCollector for runtime/inference - All test cases: Use log collector with fallback to docker compose Also updated docs/CICD.md with: - Test runner CLI documentation - Judge modes (simple, llm, dual) - Log collector integration - Updated test case list (12b, 27b models) - Model unload strategy - Troubleshooting guide 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 05:07:04 +00:00 · 2025-12-17 17:46:49 +08:00
parent 82ab6cc96e
commit 2c5094db92
12 changed files with 702 additions and 272 deletions
--- a/tests/src/cli.ts
+++ b/tests/src/cli.ts
@@ -9,6 +9,7 @@ import { TestExecutor } from "./executor.js";
 import { LLMJudge } from "./judge.js";
 import { Reporter, TestLinkReporter } from "./reporter.js";
 import { RunnerOptions, Judgment } from "./types.js";
+import { LogCollector } from "./log-collector.js";

 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const defaultTestcasesDir = path.join(__dirname, "..", "testcases");
@@ -68,9 +69,12 @@ program
    log("=".repeat(60));

    const loader = new TestLoader(options.testcasesDir);
-    const executor = new TestExecutor(path.join(__dirname, "..", ".."));
+    const projectRoot = path.join(__dirname, "..", "..");
    const judge = new LLMJudge(options.judgeUrl, options.judgeModel);

+    // LogCollector will be created if running runtime or inference tests
+    let logCollector: LogCollector | undefined;
+
    // Load test cases
    log("\nLoading test cases...");
    let testCases = await loader.loadAll();
@@ -107,10 +111,38 @@ program
      process.exit(0);
    }

+    // Check if we need log collector (runtime or inference tests)
+    const needsLogCollector = testCases.some(
+      (tc) => tc.suite === "runtime" || tc.suite === "inference"
+    );
+
+    if (needsLogCollector) {
+      log("\nStarting log collector...");
+      logCollector = new LogCollector(projectRoot);
+      try {
+        await logCollector.start();
+        log("  Log collector started");
+      } catch (error) {
+        log(`  Warning: Log collector failed to start: ${error}`);
+        log("  Tests will run without precise log collection");
+        logCollector = undefined;
+      }
+    }
+
+    // Create executor with optional log collector
+    const executor = new TestExecutor(projectRoot, logCollector);
+
    // Execute tests (progress goes to stderr via executor)
    const workers = parseInt(options.workers);
    const results = await executor.executeAll(testCases, workers);

+    // Stop log collector if running
+    if (logCollector) {
+      log("\nStopping log collector...");
+      await logCollector.stop();
+      log("  Log collector stopped");
+    }
+
    // Judge results
    log("\nJudging results...");
    let judgments: Judgment[];
--- a/tests/src/executor.ts
+++ b/tests/src/executor.ts
@@ -1,6 +1,7 @@
 import { exec } from 'child_process'
 import { promisify } from 'util'
 import { TestCase, TestResult, StepResult } from './types.js'
+import { LogCollector } from './log-collector.js'

 const execAsync = promisify(exec)

@@ -17,9 +18,12 @@ export class TestExecutor {
  private workingDir: string
  private totalTests: number = 0
  private currentTest: number = 0
+  private logCollector: LogCollector | null = null
+  private currentTestId: string | null = null

-  constructor(workingDir: string = process.cwd()) {
+  constructor(workingDir: string = process.cwd(), logCollector?: LogCollector) {
    this.workingDir = workingDir
+    this.logCollector = logCollector || null
  }

  // Progress output goes to stderr (visible in console)
@@ -34,12 +38,24 @@ export class TestExecutor {
    let exitCode = 0
    let timedOut = false

+    // Build environment with TEST_ID for log access
+    const env = { ...process.env }
+    if (this.currentTestId) {
+      env.TEST_ID = this.currentTestId
+    }
+
+    // Update log file before each step so test can access current logs
+    if (this.logCollector && this.currentTestId) {
+      this.logCollector.writeCurrentLogs(this.currentTestId)
+    }
+
    try {
      const result = await execAsync(command, {
        cwd: this.workingDir,
        timeout,
        maxBuffer: 10 * 1024 * 1024, // 10MB buffer
-        shell: '/bin/bash'
+        shell: '/bin/bash',
+        env
      })
      stdout = result.stdout
      stderr = result.stderr
@@ -77,8 +93,14 @@ export class TestExecutor {
    const timestamp = new Date().toISOString().substring(11, 19)

    this.currentTest++
+    this.currentTestId = testCase.id
    this.progress(`[${timestamp}] [${this.currentTest}/${this.totalTests}] ${testCase.id}: ${testCase.name}`)

+    // Mark test start for log collection
+    if (this.logCollector) {
+      this.logCollector.markTestStart(testCase.id)
+    }
+
    for (let i = 0; i < testCase.steps.length; i++) {
      const step = testCase.steps[i]
      const stepTimestamp = new Date().toISOString().substring(11, 19)
@@ -106,6 +128,12 @@ export class TestExecutor {

    const totalDuration = Date.now() - startTime

+    // Mark test end for log collection
+    if (this.logCollector) {
+      this.logCollector.markTestEnd(testCase.id)
+    }
+    this.currentTestId = null
+
    // Combine all logs
    const logs = stepResults.map(r => {
      return `=== Step: ${r.name} ===
--- a/tests/src/log-collector.ts
+++ b/tests/src/log-collector.ts
@@ -0,0 +1,200 @@
+import { spawn, ChildProcess } from "child_process";
+import { writeFileSync } from "fs";
+import path from "path";
+
+interface LogEntry {
+  timestamp: Date;
+  line: string;
+}
+
+interface TestMarker {
+  start: number;
+  end?: number;
+}
+
+/**
+ * LogCollector runs `docker compose logs --follow` as a background process
+ * and captures logs with precise test boundaries.
+ *
+ * This solves the log overlap problem where `docker compose logs --since=5m`
+ * could include logs from previous tests or miss logs if a test exceeds 5 minutes.
+ */
+export class LogCollector {
+  private process: ChildProcess | null = null;
+  private logs: LogEntry[] = [];
+  private testMarkers: Map<string, TestMarker> = new Map();
+  private workingDir: string;
+  private isRunning: boolean = false;
+
+  constructor(workingDir: string) {
+    // workingDir should be the project root, docker-compose.yml is in docker/
+    this.workingDir = path.join(workingDir, "docker");
+  }
+
+  /**
+   * Start the log collector background process
+   */
+  async start(): Promise<void> {
+    if (this.isRunning) {
+      return;
+    }
+
+    return new Promise((resolve, reject) => {
+      // Spawn docker compose logs --follow
+      this.process = spawn("docker", ["compose", "logs", "--follow", "--timestamps"], {
+        cwd: this.workingDir,
+        stdio: ["ignore", "pipe", "pipe"],
+      });
+
+      this.isRunning = true;
+
+      // Handle stdout (main log output)
+      this.process.stdout?.on("data", (data: Buffer) => {
+        const lines = data.toString().split("\n").filter((l) => l.trim());
+        for (const line of lines) {
+          this.logs.push({
+            timestamp: new Date(),
+            line: line,
+          });
+        }
+      });
+
+      // Handle stderr (docker compose messages)
+      this.process.stderr?.on("data", (data: Buffer) => {
+        const lines = data.toString().split("\n").filter((l) => l.trim());
+        for (const line of lines) {
+          this.logs.push({
+            timestamp: new Date(),
+            line: `[stderr] ${line}`,
+          });
+        }
+      });
+
+      this.process.on("error", (err) => {
+        this.isRunning = false;
+        reject(err);
+      });
+
+      this.process.on("close", (code) => {
+        this.isRunning = false;
+      });
+
+      // Give it a moment to start
+      setTimeout(() => {
+        if (this.isRunning) {
+          resolve();
+        } else {
+          reject(new Error("Log collector failed to start"));
+        }
+      }, 500);
+    });
+  }
+
+  /**
+   * Stop the log collector background process
+   */
+  async stop(): Promise<void> {
+    if (!this.process || !this.isRunning) {
+      return;
+    }
+
+    return new Promise((resolve) => {
+      this.process!.on("close", () => {
+        this.isRunning = false;
+        resolve();
+      });
+
+      this.process!.kill("SIGTERM");
+
+      // Force kill after 5 seconds
+      setTimeout(() => {
+        if (this.isRunning && this.process) {
+          this.process.kill("SIGKILL");
+        }
+        resolve();
+      }, 5000);
+    });
+  }
+
+  /**
+   * Mark the start of a test - records current log index
+   */
+  markTestStart(testId: string): void {
+    this.testMarkers.set(testId, {
+      start: this.logs.length,
+    });
+  }
+
+  /**
+   * Mark the end of a test - records current log index and writes logs to file
+   */
+  markTestEnd(testId: string): void {
+    const marker = this.testMarkers.get(testId);
+    if (marker) {
+      marker.end = this.logs.length;
+
+      // Write logs to temp file for test steps to access
+      const logs = this.getLogsForTest(testId);
+      const filePath = `/tmp/test-${testId}-logs.txt`;
+      writeFileSync(filePath, logs);
+    }
+  }
+
+  /**
+   * Get logs for a specific test (between start and end markers)
+   */
+  getLogsForTest(testId: string): string {
+    const marker = this.testMarkers.get(testId);
+    if (!marker) {
+      return "";
+    }
+
+    const endIndex = marker.end ?? this.logs.length;
+    const testLogs = this.logs.slice(marker.start, endIndex);
+
+    return testLogs.map((entry) => entry.line).join("\n");
+  }
+
+  /**
+   * Get all logs collected so far
+   */
+  getAllLogs(): string {
+    return this.logs.map((entry) => entry.line).join("\n");
+  }
+
+  /**
+   * Get logs since a specific index
+   */
+  getLogsSince(startIndex: number): string {
+    return this.logs.slice(startIndex).map((entry) => entry.line).join("\n");
+  }
+
+  /**
+   * Get current log count (useful for tracking)
+   */
+  getLogCount(): number {
+    return this.logs.length;
+  }
+
+  /**
+   * Check if the collector is running
+   */
+  isActive(): boolean {
+    return this.isRunning;
+  }
+
+  /**
+   * Write current test logs to file (call during test execution)
+   * This allows test steps to access logs accumulated so far
+   */
+  writeCurrentLogs(testId: string): void {
+    const marker = this.testMarkers.get(testId);
+    if (!marker) {
+      return;
+    }
+
+    const testLogs = this.logs.slice(marker.start);
+    const filePath = `/tmp/test-${testId}-logs.txt`;
+    writeFileSync(filePath, testLogs.map((entry) => entry.line).join("\n"));
+  }
+}
--- a/tests/testcases/inference/TC-INFERENCE-001.yml
+++ b/tests/testcases/inference/TC-INFERENCE-001.yml
@@ -27,8 +27,12 @@ steps:

  - name: Verify model loading in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Model Loading Check ==="

@@ -58,8 +62,12 @@ steps:

  - name: Verify llama runner started
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Llama Runner Check ==="

@@ -74,8 +82,12 @@ steps:

  - name: Check for model loading errors
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Model Loading Error Check ==="

@@ -97,8 +109,12 @@ steps:

  - name: Display model memory allocation from logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Model Memory Allocation ==="
      echo "$LOGS" | grep -E '(model weights|kv cache|compute graph|total memory).*device=' | tail -8
--- a/tests/testcases/inference/TC-INFERENCE-002.yml
+++ b/tests/testcases/inference/TC-INFERENCE-002.yml
@@ -17,8 +17,12 @@ steps:

  - name: Check for inference errors in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== Inference Error Check ==="

@@ -47,8 +51,12 @@ steps:

  - name: Verify inference request in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== Inference Request Verification ==="

@@ -69,8 +77,12 @@ steps:

  - name: Display recent CUDA activity from logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== Recent CUDA Activity ==="
      echo "$LOGS" | grep -iE "(CUDA|cuda|device=CUDA)" | tail -5 || echo "No recent CUDA activity logged"
--- a/tests/testcases/inference/TC-INFERENCE-003.yml
+++ b/tests/testcases/inference/TC-INFERENCE-003.yml
@@ -22,8 +22,12 @@ steps:

  - name: Verify API requests logged successfully
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== API Request Log Verification ==="

@@ -40,8 +44,12 @@ steps:

  - name: Check for API errors in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== API Error Check ==="

@@ -64,8 +72,12 @@ steps:

  - name: Display API response times from logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== API Response Times ==="

--- a/tests/testcases/inference/TC-INFERENCE-004.yml
+++ b/tests/testcases/inference/TC-INFERENCE-004.yml
@@ -27,8 +27,12 @@ steps:

  - name: Verify model loaded to GPU
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== Model Loading Check for gemma3:12b ==="

@@ -63,8 +67,12 @@ steps:

  - name: Check for inference errors
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=5m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=5m 2>&1)
+      fi

      echo "=== Inference Error Check ==="

--- a/tests/testcases/inference/TC-INFERENCE-005.yml
+++ b/tests/testcases/inference/TC-INFERENCE-005.yml
@@ -39,8 +39,12 @@ steps:

  - name: Verify model loaded across GPUs
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=10m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=10m 2>&1)
+      fi

      echo "=== Model Loading Check for gemma3:27b ==="

@@ -96,8 +100,12 @@ steps:

  - name: Check for inference errors
    command: |
-      cd docker
-      LOGS=$(docker compose logs --since=10m 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs --since=10m 2>&1)
+      fi

      echo "=== Inference Error Check ==="

--- a/tests/testcases/runtime/TC-RUNTIME-001.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-001.yml
@@ -22,12 +22,21 @@ steps:

  - name: Capture startup logs
    command: |
-      cd docker && docker compose logs 2>&1 | head -100
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        head -100 /tmp/test-${TEST_ID}-logs.txt
+      else
+        cd docker && docker compose logs 2>&1 | head -100
+      fi

  - name: Check for startup errors in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      # Check for critical errors
      if echo "$LOGS" | grep -qE "(level=ERROR|CUBLAS_STATUS_|CUDA error|cudaMalloc failed)"; then
--- a/tests/testcases/runtime/TC-RUNTIME-002.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-002.yml
@@ -30,8 +30,12 @@ steps:

  - name: Verify GPU detection in Ollama logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== GPU Detection Check ==="

@@ -60,8 +64,12 @@ steps:

  - name: Check for GPU-related errors in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== GPU Error Check ==="

@@ -82,8 +90,12 @@ steps:

  - name: Display GPU memory status from logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== GPU Memory Status ==="
      echo "$LOGS" | grep -E "gpu memory.*library=CUDA" | tail -4
--- a/tests/testcases/runtime/TC-RUNTIME-003.yml
+++ b/tests/testcases/runtime/TC-RUNTIME-003.yml
@@ -30,8 +30,12 @@ steps:

  - name: Verify server listening in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Server Status Check ==="

@@ -46,8 +50,12 @@ steps:

  - name: Check for runtime errors in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== Runtime Error Check ==="

@@ -71,8 +79,12 @@ steps:

  - name: Verify API request handling in logs
    command: |
-      cd docker
-      LOGS=$(docker compose logs 2>&1)
+      # Use log collector file if available, fallback to docker compose logs
+      if [ -f "/tmp/test-${TEST_ID}-logs.txt" ]; then
+        LOGS=$(cat /tmp/test-${TEST_ID}-logs.txt)
+      else
+        LOGS=$(cd docker && docker compose logs 2>&1)
+      fi

      echo "=== API Request Logs ==="