mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Changes:

1. Update quick test to use gemma3:4b (was gemma2:2b)
   - Increased timeout to 60s for larger model

2. Implement Claude headless validation (validate.go)
   - Hybrid approach: simple checks first, then Claude validation ALWAYS runs
   - Claude validates response quality, coherence, relevance
   - Detects gibberish, errors, and malformed responses
   - Falls back to simple validation if Claude CLI unavailable
   - Verbose logging shows Claude validation results

3. Validation flow:
   - Step 1: Fast checks (empty response, token count)
   - Step 2: Claude AI analysis (runs regardless of simple check)
   - Claude result overrides simple checks
   - If Claude unavailable, uses simple validation only

4. Workflow improvements:
   - Remove useless GPU memory check step (server already stopped)
   - Cleaner workflow output

Benefits:
- Intelligent response quality validation
- Catches subtle issues (gibberish, off-topic responses)
- Better than hardcoded pattern matching
- Graceful degradation when Claude unavailable
39 lines
738 B
YAML
39 lines
738 B
YAML
# Quick test profile - fast smoke test with small model
# Run time: ~1-2 minutes
#
# NOTE(review): the nesting below was reconstructed from a flattened view of
# this file (indentation lost, stray `|` lines between entries). Confirm it
# against the test runner's config schema before relying on it.

profiles:
  quick:
    # Budget for the profile as a whole; each model carries its own
    # `timeout` below. Presumably a duration string parsed by the runner.
    timeout: 5m
    models:
      - name: gemma3:4b
        prompts:
          - "Hello, respond with a brief greeting."
        # Bounds on the response length, in tokens.
        min_response_tokens: 5
        max_response_tokens: 100
        # Per-model timeout, distinct from the profile-level `timeout`.
        timeout: 60s

validation:
  gpu_required: true
  single_gpu_preferred: true
  # Patterns grouped by outcome. Entries such as "offload.*GPU" look like
  # regular expressions - presumably matched against server logs; verify
  # the matching semantics in the runner.
  check_patterns:
    success:
      - "loaded model"
      - "offload.*GPU"
    failure:
      - "CUDA.*error"
      - "out of memory"
      - "CPU backend"

server:
  host: "localhost"
  port: 11434
  startup_timeout: 30s
  health_check_interval: 1s
  health_check_endpoint: "/api/tags"

reporting:
  formats:
    - json
  include_logs: true
  log_excerpt_lines: 50