ollama37/test/config/models.yaml
Shang Chieh Tseng 40b956b23c Fix false positive CPU backend error in test configuration
The test configuration was treating 'CPU backend' as a failure pattern,
but this is incorrect. Loading the CPU backend library is normal - ollama
loads both CUDA and CPU backends for fallback operations.

The log line 'load_backend: loaded CPU backend from libggml-cpu-*.so'
is a success message, not an error.

Changed failure patterns from:
- 'CPU backend' (too broad, matches normal loading)
- 'failed to load.*CUDA' (too specific)

To more accurate patterns:
- 'failed to load.*backend' (matches actual load failures)
- 'backend.*failed' (matches failure messages)

This prevents false positives while still catching real backend failures.
2025-10-30 16:00:20 +08:00
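
To see concretely why the old pattern produced a false positive, here is a short Python sketch comparing the old and new failure patterns against sample log lines. It assumes the harness applies the patterns with a plain regex search (re.search); the successful line is the one quoted in the commit message, while the failing line is hypothetical wording used only for illustration.

import re

# The normal, successful log line that the old pattern flagged as a failure
ok_line = "load_backend: loaded CPU backend from libggml-cpu-*.so"
# A genuine load failure (hypothetical wording, purely for illustration)
bad_line = "failed to load CUDA backend from libggml-cuda.so"

old_failure_patterns = ["CPU backend", r"failed to load.*CUDA"]
new_failure_patterns = [r"failed to load.*backend", r"backend.*failed"]

def matching(patterns, line):
    return [p for p in patterns if re.search(p, line)]

print(matching(old_failure_patterns, ok_line))   # ['CPU backend'] -> false positive
print(matching(new_failure_patterns, ok_line))   # [] -> normal CPU backend loading no longer flagged
print(matching(new_failure_patterns, bad_line))  # ['failed to load.*backend'] -> real failure still caught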

# Test configuration for Tesla K80 model testing
# This file defines test profiles with different model sizes and test scenarios
profiles:
  # Quick test profile - fast smoke test with medium model
  quick:
    timeout: 5m
    models:
      - name: gemma3:4b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 60s

  # Full test profile - test largest model that fits on single K80
  full:
    timeout: 30m
    models:
      - name: gemma3:12b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 120s

  # Multi-GPU test profile - test models requiring 2x Tesla K80s
  multi-gpu:
    timeout: 45m
    models:
      - name: gemma3:27b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 300s
      - name: gpt-oss:20b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 240s
    validation:
      # Override single_gpu_preferred for multi-GPU tests
      gpu_required: true
      single_gpu_preferred: false
      check_patterns:
        success:
          - "loaded model"
          - "offload.*GPU"
          - "CUDA backend"
          - "split.*layer.*GPU" # Expect multi-GPU split
        failure:
          - "CUDA.*error"
          - "out of memory"
          - "OOM"
          - "failed to load.*backend"
          - "backend.*failed"

# Validation rules applied to all tests (unless overridden in profile)
validation:
  # Require GPU acceleration (fail if CPU fallback detected)
  gpu_required: true
  # Require single GPU usage for Tesla K80 (detect unnecessary multi-GPU splits)
  single_gpu_preferred: true
  # Log patterns to check for success/failure
  check_patterns:
    success:
      - "loaded model"
      - "offload.*GPU"
      - "CUDA backend"
    failure:
      - "CUDA.*error"
      - "out of memory"
      - "OOM"
      - "failed to load.*backend"
      - "backend.*failed"
    warning:
      - "fallback"
      - "using CPU"

# Server configuration
server:
  host: "localhost"
  port: 11434
  startup_timeout: 30s
  health_check_interval: 1s
  health_check_endpoint: "/api/tags"

# Reporting configuration
reporting:
  formats:
    - json
    - markdown
  include_logs: true
  log_excerpt_lines: 50 # Lines of log to include per failure
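
The profiles, server, and validation sections map onto a simple test loop: wait for the health-check endpoint, send each prompt, check the response token count against the configured bounds, and grep the captured server log for the check patterns. The Python sketch below illustrates that flow. The configuration keys come from this file, but the harness itself (the function names, how logs are captured, and the use of Ollama's /api/generate endpoint with its eval_count field) is an assumed, minimal stand-in rather than the project's actual test runner.

import json
import re
import time
import urllib.request

import yaml  # PyYAML; assumed available in the test environment


def load_config(path="test/config/models.yaml"):
    # Path is relative to the repo root; adjust to wherever this file lives.
    with open(path) as f:
        return yaml.safe_load(f)


def wait_for_server(server, timeout_s=30, interval_s=1):
    """Poll the health-check endpoint until the server answers or the timeout expires."""
    base = f"http://{server['host']}:{server['port']}"
    deadline = time.time() + timeout_s  # mirrors startup_timeout: 30s
    while time.time() < deadline:
        try:
            urllib.request.urlopen(base + server["health_check_endpoint"], timeout=2)
            return True
        except OSError:
            time.sleep(interval_s)  # mirrors health_check_interval: 1s
    return False


def run_prompt(server, model, prompt):
    """Send one prompt through Ollama's /api/generate and return the response token count."""
    base = f"http://{server['host']}:{server['port']}"
    body = json.dumps({"model": model, "prompt": prompt, "stream": False}).encode()
    req = urllib.request.Request(base + "/api/generate", data=body,
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req) as resp:
        data = json.loads(resp.read())
    return data.get("eval_count", 0)  # eval_count = tokens in the generated response


def scan_log(check_patterns, log_text):
    """Return the success/failure/warning patterns that actually appear in the log."""
    def hits(group):
        return [p for p in check_patterns.get(group, []) if re.search(p, log_text)]
    return hits("success"), hits("failure"), hits("warning")


if __name__ == "__main__":
    cfg = load_config()
    profile = cfg["profiles"]["quick"]
    assert wait_for_server(cfg["server"]), "server never became healthy"
    for model in profile["models"]:
        for prompt in model["prompts"]:
            tokens = run_prompt(cfg["server"], model["name"], prompt)
            ok = model["min_response_tokens"] <= tokens <= model["max_response_tokens"]
            print(f"{model['name']}: {tokens} tokens -> {'PASS' if ok else 'FAIL'}")
    # A real run would also scan the captured server output:
    # scan_log(cfg["validation"]["check_patterns"], captured_log)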
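
The reporting section likewise suggests a writer that emits one report per configured format and, when include_logs is set, attaches only the last log_excerpt_lines lines of the server log to each failure. A minimal sketch, with an invented results structure (model/tokens/passed) used purely for illustration:

import json
from pathlib import Path


def write_reports(cfg, results, log_text, out_dir="test-results"):
    """Emit one report per configured format; attach a trimmed log excerpt to failures."""
    rep = cfg["reporting"]
    out = Path(out_dir)
    out.mkdir(exist_ok=True)
    if rep.get("include_logs"):
        excerpt = "\n".join(log_text.splitlines()[-rep["log_excerpt_lines"]:])
        for r in results:
            if not r["passed"]:
                r["log_excerpt"] = excerpt
    if "json" in rep["formats"]:
        (out / "report.json").write_text(json.dumps(results, indent=2))
    if "markdown" in rep["formats"]:
        rows = ["| model | tokens | result |", "| --- | --- | --- |"]
        rows += [f"| {r['model']} | {r['tokens']} | {'PASS' if r['passed'] else 'FAIL'} |"
                 for r in results]
        (out / "report.md").write_text("\n".join(rows) + "\n")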