Add multi-model inference tests for gemma3 12b and 27b

- TC-INFERENCE-004: gemma3:12b single GPU test
- TC-INFERENCE-005: gemma3:27b dual-GPU test (K80 layer split)
- Each test unloads previous model before loading next
- Workflows unload all 3 model sizes after inference suite
- 27b test verifies both GPUs have memory allocated
This commit is contained in:
Shang Chieh Tseng
2025-12-17 17:01:25 +08:00
parent 22e77e0dde
commit 806232d95f
4 changed files with 290 additions and 6 deletions

View File

@@ -165,12 +165,14 @@ jobs:
           exit 1
         fi
-      - name: Unload test model from VRAM
+      - name: Unload test models from VRAM
         if: always()
         run: |
-          echo "Unloading gemma3:4b from VRAM..."
+          echo "Unloading all test models from VRAM..."
           curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
-          echo "Model unloaded"
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:12b","keep_alive":0}' || true
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:27b","keep_alive":0}' || true
+          echo "All models unloaded"
       - name: Upload inference results
         uses: actions/upload-artifact@v4

View File

@@ -111,12 +111,14 @@ jobs:
exit 1
           fi
-      - name: Unload test model from VRAM
+      - name: Unload test models from VRAM
         if: always()
         run: |
-          echo "Unloading gemma3:4b from VRAM..."
+          echo "Unloading all test models from VRAM..."
           curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
-          echo "Model unloaded"
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:12b","keep_alive":0}' || true
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:27b","keep_alive":0}' || true
+          echo "All models unloaded"
       - name: Upload inference results
         uses: actions/upload-artifact@v4