Add multi-GPU test workflow and rename single-GPU workflow

- Rename tesla-k80-tests.yml to tesla-k80-single-gpu-tests.yml for clarity
- Add new tesla-k80-multi-gpu-tests.yml workflow for large models
- Add multi-gpu profile to test/config/models.yaml with gemma3:27b and gpt-oss:20b
- Multi-GPU workflow includes GPU count verification and weekly schedule
- Profile-specific validation allows multi-GPU splits for large models
- Separate workflows improve CI efficiency by keeping quick single-GPU tests apart from the slower, more thorough multi-GPU tests
This commit is contained in:
Shang Chieh Tseng
2025-10-30 12:04:50 +08:00
parent 1aa80e9411
commit 6c3876a30d
3 changed files with 110 additions and 10 deletions

View File

@@ -24,19 +24,40 @@ profiles:
max_response_tokens: 100
timeout: 120s
# Stress test profile - larger models and longer prompts
stress:
timeout: 60m
# Multi-GPU test profile - test models requiring 2x Tesla K80s
multi-gpu:
timeout: 45m
models:
- name: gemma3:12b
- name: gemma3:27b
prompts:
- "Write a detailed explanation of how neural networks work, focusing on backpropagation."
- "Describe the architecture of a transformer model in detail."
min_response_tokens: 50
max_response_tokens: 1000
- "Hello, respond with a brief greeting."
min_response_tokens: 5
max_response_tokens: 100
timeout: 300s
- name: gpt-oss:20b
prompts:
- "Hello, respond with a brief greeting."
min_response_tokens: 5
max_response_tokens: 100
timeout: 240s
validation:
# Override single_gpu_preferred for multi-GPU tests
gpu_required: true
single_gpu_preferred: false
check_patterns:
success:
- "loaded model"
- "offload.*GPU"
- "CUDA backend"
- "split.*layer.*GPU" # Expect multi-GPU split
failure:
- "CUDA.*error"
- "out of memory"
- "OOM"
- "CPU backend"
- "failed to load"
# Validation rules applied to all tests
# Validation rules applied to all tests (unless overridden in profile)
validation:
# Require GPU acceleration (fail if CPU fallback detected)
gpu_required: true