# ollama37/test/config/models.yaml

# Configuration for model tests on Tesla K80 GPUs.
# Defines test profiles (by model size and scenario), shared validation rules,
# server settings, and reporting options.

profiles:
  # quick: fast smoke test against one medium-sized model.
  quick:
    # Profile-level timeout; each model entry also carries its own `timeout`.
    # NOTE(review): presumably this bounds the whole profile run - confirm
    # against the test runner.
    timeout: 5m
    models:
      - name: 'gemma3:4b'
        prompts:
          - 'Hello, respond with a brief greeting.'
        # Accepted response length window, in tokens.
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 60s

  # full: exercises the largest model that fits on a single K80.
  full:
    timeout: 30m
    models:
      - name: 'gemma3:12b'
        prompts:
          - 'Hello, respond with a brief greeting.'
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 120s

  # multi-gpu: models that need two Tesla K80s.
  multi-gpu:
    timeout: 45m
    models:
      - name: 'gemma3:27b'
        prompts:
          - 'Hello, respond with a brief greeting.'
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 300s
      - name: 'gpt-oss:20b'
        prompts:
          - 'Hello, respond with a brief greeting.'
        min_response_tokens: 5
        max_response_tokens: 100
        timeout: 240s
    # Profile-specific validation: turns off single_gpu_preferred because
    # these tests are expected to span GPUs.
    # NOTE(review): unlike the top-level `validation` block, no `warning`
    # patterns are listed here - confirm whether the runner merges this block
    # with the top-level one or replaces it.
    validation:
      gpu_required: true
      single_gpu_preferred: false
      # Single-quoted so any future backslash in a pattern stays literal.
      check_patterns:
        success:
          - 'loaded model'
          - 'offload.*GPU'
          - 'CUDA backend'
          # Expect a multi-GPU layer split
          - 'split.*layer.*GPU'
        failure:
          - 'CUDA.*error'
          - 'out of memory'
          - 'OOM'
          - 'failed to load.*backend'
          - 'backend.*failed'

# Default validation rules for every test; a profile may override them with
# its own `validation` block.
validation:
  # GPU acceleration is mandatory: a detected CPU fallback fails the test.
  gpu_required: true

  # On Tesla K80, prefer a single GPU so that unnecessary multi-GPU splits
  # are detected.
  single_gpu_preferred: true

  # Log patterns, grouped by the outcome they indicate.
  # NOTE(review): entries such as 'offload.*GPU' look like regular
  # expressions - confirm how the runner matches them. Single quotes keep any
  # future backslash literal.
  check_patterns:
    success:
      - 'loaded model'
      - 'offload.*GPU'
      - 'CUDA backend'
    failure:
      - 'CUDA.*error'
      - 'out of memory'
      - 'OOM'
      - 'failed to load.*backend'
      - 'backend.*failed'
    warning:
      - 'fallback'
      - 'using CPU'
# Settings for the server the tests talk to
server:
  host: 'localhost'
  port: 11434
  # NOTE(review): presumably the runner polls `health_check_endpoint` every
  # `health_check_interval` until `startup_timeout` elapses - confirm against
  # the runner.
  startup_timeout: 30s
  health_check_interval: 1s
  health_check_endpoint: '/api/tags'

# Test report settings
reporting:
  # Report formats to produce
  formats:
    - json
    - markdown
  # Whether log output is included in the report
  include_logs: true
  # Lines of log to include per failure
  log_excerpt_lines: 50