Mirror of https://github.com/dogkeeper886/ollama37.git
Add comprehensive test orchestration framework

Test Runner (cmd/test-runner/):
- config.go: YAML configuration loading and validation
- server.go: Ollama server lifecycle management (start/stop/health checks)
- monitor.go: real-time log monitoring with pattern matching
- test.go: model testing via the Ollama API (pull, chat, validation)
- validate.go: test result validation (GPU usage, response quality, log analysis)
- report.go: structured reporting (JSON and Markdown formats)
- main.go: CLI interface with run/validate/list commands

Test Configurations (test/config/):
- models.yaml: full test suite with quick/full/stress profiles
- quick.yaml: fast smoke test with gemma2:2b

Updated Workflow:
- tesla-k80-tests.yml: use test-runner instead of shell scripts
- Run quick tests first, then full tests if they pass
- Generate structured JSON reports for pass/fail checking
- Upload test results as artifacts

Features:
- Multi-model testing with configurable profiles
- API-based testing (not CLI commands)
- Real-time log monitoring for GPU events and errors
- Automatic validation of GPU loading and response quality
- Structured JSON and Markdown reports
- Graceful server lifecycle management
- Interrupt handling (Ctrl+C cleanup)

Addresses the limitations of shell-based testing by providing:
- Better error handling and reporting
- Programmatic test orchestration
- A reusable test framework
- Clear pass/fail criteria
- Detailed test metrics and timing
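The profile file is shown below (evidently test/config/models.yaml, given its quick/full/stress profiles). As a rough illustration of how the config.go loader described above might map it into Go types, here is a minimal sketch assuming gopkg.in/yaml.v3; the struct names, field set, and LoadConfig helper are assumptions for illustration, not the actual source.

```go
// Hypothetical sketch of loading the test configuration below.
// Assumes gopkg.in/yaml.v3; type and function names are illustrative only.
package main

import (
	"fmt"
	"os"
	"time"

	"gopkg.in/yaml.v3"
)

type ModelTest struct {
	Name              string   `yaml:"name"`
	Prompts           []string `yaml:"prompts"`
	MinResponseTokens int      `yaml:"min_response_tokens"`
	MaxResponseTokens int      `yaml:"max_response_tokens"`
	Timeout           string   `yaml:"timeout"` // e.g. "30s"; parsed with time.ParseDuration
}

type Profile struct {
	Timeout string      `yaml:"timeout"`
	Models  []ModelTest `yaml:"models"`
}

type Config struct {
	Profiles map[string]Profile `yaml:"profiles"`
	// validation, server, and reporting sections omitted for brevity
}

// LoadConfig reads the YAML file and checks that durations and token
// bounds are sane before any server is started.
func LoadConfig(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read config: %w", err)
	}
	var cfg Config
	if err := yaml.Unmarshal(data, &cfg); err != nil {
		return nil, fmt.Errorf("parse config: %w", err)
	}
	for name, p := range cfg.Profiles {
		if _, err := time.ParseDuration(p.Timeout); err != nil {
			return nil, fmt.Errorf("profile %q: bad timeout %q: %w", name, p.Timeout, err)
		}
		for _, m := range p.Models {
			if m.MinResponseTokens > m.MaxResponseTokens {
				return nil, fmt.Errorf("profile %q, model %q: min tokens exceed max", name, m.Name)
			}
		}
	}
	return &cfg, nil
}
```

Keeping the timeouts as strings and validating them with time.ParseDuration up front means a typo like "30x" fails at load time rather than mid-run.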
93 lines · 2.3 KiB · YAML
# Test configuration for Tesla K80 model testing
# This file defines test profiles with different model sizes and test scenarios

profiles:
  # Quick test profile - small models only, fast execution
  quick:
    timeout: 5m
    models:
      - name: gemma2:2b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 30s

  # Full test profile - comprehensive testing across model sizes
  full:
    timeout: 30m
    models:
      - name: gemma2:2b
        prompts:
          - "Hello, respond with a brief greeting."
          - "What is 2+2? Answer briefly."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 30s

      - name: gemma3:4b
        prompts:
          - "Explain photosynthesis in one sentence."
        min_response_tokens: 10
        max_response_tokens: 200
        timeout: 60s

      - name: gemma3:12b
        prompts:
          - "Write a short haiku about GPUs."
        min_response_tokens: 15
        max_response_tokens: 150
        timeout: 120s

  # Stress test profile - larger models and longer prompts
  stress:
    timeout: 60m
    models:
      - name: gemma3:12b
        prompts:
          - "Write a detailed explanation of how neural networks work, focusing on backpropagation."
          - "Describe the architecture of a transformer model in detail."
        min_response_tokens: 50
        max_response_tokens: 1000
        timeout: 300s

# Validation rules applied to all tests
validation:
  # Require GPU acceleration (fail if CPU fallback detected)
  gpu_required: true

  # Require single GPU usage for Tesla K80 (detect unnecessary multi-GPU splits)
  single_gpu_preferred: true

  # Log patterns to check for success/failure
  check_patterns:
    success:
      - "loaded model"
      - "offload.*GPU"
      - "CUDA backend"
    failure:
      - "CUDA.*error"
      - "out of memory"
      - "OOM"
      - "CPU backend"
      - "failed to load"
    warning:
      - "fallback"
      - "using CPU"

# Server configuration
server:
  host: "localhost"
  port: 11434
  startup_timeout: 30s
  health_check_interval: 1s
  health_check_endpoint: "/api/tags"

# Reporting configuration
reporting:
  formats:
    - json
    - markdown
  include_logs: true
  log_excerpt_lines: 50  # Lines of log to include per failure
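The check_patterns entries above are regular expressions (note `offload.*GPU` and `CUDA.*error`), so the monitor.go component described in the commit presumably compiles them once and classifies each log line as it streams in. A minimal sketch using Go's standard regexp package; the Verdict type and function names are illustrative, not the actual monitor.go API:

```go
// Hypothetical log-line classifier for the check_patterns section above.
// Uses only Go's standard regexp package; the real monitor.go may differ.
package main

import "regexp"

type Verdict int

const (
	VerdictNone Verdict = iota
	VerdictWarning
	VerdictSuccess
	VerdictFailure
)

// compilePatterns turns the YAML pattern strings into regexps,
// failing fast on a malformed pattern instead of at match time.
func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
	res := make([]*regexp.Regexp, 0, len(patterns))
	for _, p := range patterns {
		re, err := regexp.Compile(p)
		if err != nil {
			return nil, err
		}
		res = append(res, re)
	}
	return res, nil
}

// Classify checks a single log line against the three pattern groups.
func Classify(failure, success, warning []*regexp.Regexp, line string) Verdict {
	for _, re := range failure {
		if re.MatchString(line) {
			return VerdictFailure
		}
	}
	for _, re := range success {
		if re.MatchString(line) {
			return VerdictSuccess
		}
	}
	for _, re := range warning {
		if re.MatchString(line) {
			return VerdictWarning
		}
	}
	return VerdictNone
}
```

Checking failure patterns first means a single "CUDA error" line fails the test even if a success pattern such as "loaded model" also appears in the log.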
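Similarly, the server section implies a startup poll for the lifecycle management in server.go: request the health_check_endpoint (/api/tags, Ollama's endpoint for listing local models) every health_check_interval until it returns 200 OK or startup_timeout elapses. A hedged sketch; waitForServer is a hypothetical name, not necessarily the actual implementation:

```go
// Hypothetical startup health check matching the server section above.
package main

import (
	"fmt"
	"net/http"
	"time"
)

// waitForServer polls the health endpoint until it answers 200 OK or the
// startup timeout elapses, e.g. waitForServer("http://localhost:11434",
// time.Second, 30*time.Second) for the values configured above.
func waitForServer(baseURL string, interval, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		resp, err := http.Get(baseURL + "/api/tags")
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil // server is up and serving the API
			}
		}
		time.Sleep(interval)
	}
	return fmt.Errorf("server not healthy within %s", timeout)
}
```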