Mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-10 07:46:59 +00:00)
Implement Go-based test runner framework for Tesla K80 testing
Add a comprehensive test orchestration framework.

Test Runner (cmd/test-runner/):
- config.go: YAML configuration loading and validation
- server.go: Ollama server lifecycle management (start/stop/health checks)
- monitor.go: Real-time log monitoring with pattern matching
- test.go: Model testing via the Ollama API (pull, chat, validation)
- validate.go: Test result validation (GPU usage, response quality, log analysis)
- report.go: Structured reporting (JSON and Markdown formats)
- main.go: CLI interface with run/validate/list commands

Test Configurations (test/config/):
- models.yaml: Full test suite with quick/full/stress profiles
- quick.yaml: Fast smoke test with gemma2:2b

Updated Workflow:
- tesla-k80-tests.yml: Use test-runner instead of shell scripts
- Run quick tests first, then full tests if passing
- Generate structured JSON reports for pass/fail checking
- Upload test results as artifacts

Features:
- Multi-model testing with configurable profiles
- API-based testing (not CLI commands)
- Real-time log monitoring for GPU events and errors
- Automatic validation of GPU loading and response quality
- Structured JSON and Markdown reports
- Graceful server lifecycle management
- Interrupt handling (Ctrl+C cleanup)

Addresses limitations of shell-based testing by providing:
- Better error handling and reporting
- Programmatic test orchestration
- A reusable test framework
- Clear pass/fail criteria
- Detailed test metrics and timing
This commit is contained in:
test/config/models.yaml — new file, 92 lines (@@ -0,0 +1,92 @@)
|
||||
# Test configuration for Tesla K80 model testing
|
||||
# This file defines test profiles with different model sizes and test scenarios
|
||||
|
||||
profiles:
|
||||
# Quick test profile - small models only, fast execution
|
||||
quick:
|
||||
timeout: 5m
|
||||
models:
|
||||
- name: gemma2:2b
|
||||
prompts:
|
||||
- "Hello, respond with a brief greeting."
|
||||
min_response_tokens: 5
|
||||
max_response_tokens: 100
|
||||
timeout: 30s
|
||||
|
||||
# Full test profile - comprehensive testing across model sizes
|
||||
full:
|
||||
timeout: 30m
|
||||
models:
|
||||
- name: gemma2:2b
|
||||
prompts:
|
||||
- "Hello, respond with a brief greeting."
|
||||
- "What is 2+2? Answer briefly."
|
||||
min_response_tokens: 5
|
||||
max_response_tokens: 100
|
||||
timeout: 30s
|
||||
|
||||
- name: gemma3:4b
|
||||
prompts:
|
||||
- "Explain photosynthesis in one sentence."
|
||||
min_response_tokens: 10
|
||||
max_response_tokens: 200
|
||||
timeout: 60s
|
||||
|
||||
- name: gemma3:12b
|
||||
prompts:
|
||||
- "Write a short haiku about GPUs."
|
||||
min_response_tokens: 15
|
||||
max_response_tokens: 150
|
||||
timeout: 120s
|
||||
|
||||
# Stress test profile - larger models and longer prompts
|
||||
stress:
|
||||
timeout: 60m
|
||||
models:
|
||||
- name: gemma3:12b
|
||||
prompts:
|
||||
- "Write a detailed explanation of how neural networks work, focusing on backpropagation."
|
||||
- "Describe the architecture of a transformer model in detail."
|
||||
min_response_tokens: 50
|
||||
max_response_tokens: 1000
|
||||
timeout: 300s
|
||||
|
||||
# Validation rules applied to all tests
|
||||
validation:
|
||||
# Require GPU acceleration (fail if CPU fallback detected)
|
||||
gpu_required: true
|
||||
|
||||
# Require single GPU usage for Tesla K80 (detect unnecessary multi-GPU splits)
|
||||
single_gpu_preferred: true
|
||||
|
||||
# Log patterns to check for success/failure
|
||||
check_patterns:
|
||||
success:
|
||||
- "loaded model"
|
||||
- "offload.*GPU"
|
||||
- "CUDA backend"
|
||||
failure:
|
||||
- "CUDA.*error"
|
||||
- "out of memory"
|
||||
- "OOM"
|
||||
- "CPU backend"
|
||||
- "failed to load"
|
||||
warning:
|
||||
- "fallback"
|
||||
- "using CPU"
|
||||
|
||||
# Server configuration
|
||||
server:
|
||||
host: "localhost"
|
||||
port: 11434
|
||||
startup_timeout: 30s
|
||||
health_check_interval: 1s
|
||||
health_check_endpoint: "/api/tags"
|
||||
|
||||
# Reporting configuration
|
||||
reporting:
|
||||
formats:
|
||||
- json
|
||||
- markdown
|
||||
include_logs: true
|
||||
log_excerpt_lines: 50 # Lines of log to include per failure
|
||||
test/config/quick.yaml — new file, 38 lines (@@ -0,0 +1,38 @@)
|
||||
# Quick test profile - fast smoke test with small model
|
||||
# Run time: ~1-2 minutes
|
||||
|
||||
profiles:
|
||||
quick:
|
||||
timeout: 5m
|
||||
models:
|
||||
- name: gemma2:2b
|
||||
prompts:
|
||||
- "Hello, respond with a brief greeting."
|
||||
min_response_tokens: 5
|
||||
max_response_tokens: 100
|
||||
timeout: 30s
|
||||
|
||||
validation:
|
||||
gpu_required: true
|
||||
single_gpu_preferred: true
|
||||
check_patterns:
|
||||
success:
|
||||
- "loaded model"
|
||||
- "offload.*GPU"
|
||||
failure:
|
||||
- "CUDA.*error"
|
||||
- "out of memory"
|
||||
- "CPU backend"
|
||||
|
||||
server:
|
||||
host: "localhost"
|
||||
port: 11434
|
||||
startup_timeout: 30s
|
||||
health_check_interval: 1s
|
||||
health_check_endpoint: "/api/tags"
|
||||
|
||||
reporting:
|
||||
formats:
|
||||
- json
|
||||
include_logs: true
|
||||
log_excerpt_lines: 50
|
||||
Reference in New Issue
Block a user