Mirror of https://github.com/dogkeeper886/ollama37.git, synced 2025-12-10 15:57:04 +00:00
- Rename tesla-k80-tests.yml to tesla-k80-single-gpu-tests.yml for clarity
- Add new tesla-k80-multi-gpu-tests.yml workflow for large models
- Add multi-gpu profile to test/config/models.yaml with gemma3:27b and gpt-oss:20b
- Multi-GPU workflow includes GPU count verification and weekly schedule
- Profile-specific validation allows multi-GPU splits for large models
- Separate workflows optimize CI efficiency: quick tests vs. thorough tests
99 lines · 2.5 KiB · YAML
# Test configuration for Tesla K80 model testing
# This file defines test profiles with different model sizes and test scenarios

profiles:
  # Quick test profile - fast smoke test with medium model
  quick:
    timeout: 5m
    models:
      - name: gemma3:4b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 60s

  # Full test profile - test largest model that fits on single K80
  full:
    timeout: 30m
    models:
      - name: gemma3:12b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 120s

  # Multi-GPU test profile - test models requiring 2x Tesla K80s
  multi-gpu:
    timeout: 45m
    models:
      - name: gemma3:27b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 300s
      - name: gpt-oss:20b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 240s
    validation:
      # Override single_gpu_preferred for multi-GPU tests
      gpu_required: true
      single_gpu_preferred: false
      check_patterns:
        success:
          - "loaded model"
          - "offload.*GPU"
          - "CUDA backend"
          - "split.*layer.*GPU"  # Expect multi-GPU split
        failure:
          - "CUDA.*error"
          - "out of memory"
          - "OOM"
          - "CPU backend"
          - "failed to load"

# Validation rules applied to all tests (unless overridden in profile)
validation:
  # Require GPU acceleration (fail if CPU fallback detected)
  gpu_required: true

  # Require single GPU usage for Tesla K80 (detect unnecessary multi-GPU splits)
  single_gpu_preferred: true

  # Log patterns to check for success/failure
  check_patterns:
    success:
      - "loaded model"
      - "offload.*GPU"
      - "CUDA backend"
    failure:
      - "CUDA.*error"
      - "out of memory"
      - "OOM"
      - "CPU backend"
      - "failed to load"
    warning:
      - "fallback"
      - "using CPU"

# Server configuration
server:
  host: "localhost"
  port: 11434
  startup_timeout: 30s
  health_check_interval: 1s
  health_check_endpoint: "/api/tags"

# Reporting configuration
reporting:
  formats:
    - json
    - markdown
  include_logs: true
  log_excerpt_lines: 50  # Lines of log to include per failure