Unload models from VRAM after use to free GPU memory

- Add unloadModel() method to LLMJudge class
- CLI calls unloadModel() after judging completes
- Workflows unload gemma3:4b after inference tests
- Uses the Ollama /api/generate endpoint with keep_alive: 0 to trigger the unload (see the sketch below)
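
For context, a minimal TypeScript sketch of what unloadModel() and the CLI hook could look like. Only the keep_alive: 0 request shape is confirmed by the diff below; the class internals, field names, defaults, and the try/finally wiring are illustrative assumptions, and Node 18+'s global fetch is assumed:

    // Hypothetical sketch: the commit confirms only the API call shape.
    class LLMJudge {
      constructor(
        private baseUrl = "http://localhost:11434",
        private model = "gemma3:4b",
      ) {}

      // Ask Ollama to evict the model from VRAM. A /api/generate request
      // with no prompt and keep_alive: 0 unloads the model immediately.
      async unloadModel(): Promise<void> {
        try {
          await fetch(`${this.baseUrl}/api/generate`, {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ model: this.model, keep_alive: 0 }),
          });
        } catch {
          // Best effort: a failed unload should not fail the caller,
          // mirroring the `|| true` in the workflow steps below.
        }
      }
    }

    // CLI usage: unload even if judging throws.
    async function judgeAndCleanUp(judge: LLMJudge): Promise<void> {
      try {
        // ... run the judging pass here ...
      } finally {
        await judge.unloadModel(); // free VRAM on success or failure
      }
    }

A /api/generate request with no prompt and keep_alive: 0 makes Ollama evict the model immediately instead of keeping it warm; the workflow steps below issue the same request with curl.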
commit 22e77e0dde
parent 7bb050f146
Author: Shang Chieh Tseng
Date:   2025-12-17 16:51:12 +08:00

4 changed files with 43 additions and 0 deletions


@@ -165,6 +165,13 @@ jobs:
             exit 1
           fi
+      - name: Unload test model from VRAM
+        if: always()
+        run: |
+          echo "Unloading gemma3:4b from VRAM..."
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
+          echo "Model unloaded"
+
       - name: Upload inference results
         uses: actions/upload-artifact@v4
         if: always()


@@ -111,6 +111,13 @@ jobs:
             exit 1
           fi
+      - name: Unload test model from VRAM
+        if: always()
+        run: |
+          echo "Unloading gemma3:4b from VRAM..."
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
+          echo "Model unloaded"
+
       - name: Upload inference results
         uses: actions/upload-artifact@v4
         if: always()
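
Both unload steps run under if: always(), so the model is evicted from VRAM even when the preceding inference tests fail, and the trailing || true keeps an already-stopped Ollama daemon from turning cleanup into a job failure.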