Unload models from VRAM after use to free GPU memory

- Add unloadModel() method to LLMJudge class
- CLI calls unloadModel() after judging completes
- Workflows unload gemma3:4b after inference tests
- Uses Ollama API with keep_alive: 0 to trigger unload (sketched below)
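The mechanism is a generate request with no prompt whose `keep_alive` field is zero, which tells the Ollama server to evict the model immediately. A minimal standalone sketch of the same call, using the endpoint and model name from this commit; the `/api/ps` verification step is from the public Ollama API and is not part of this diff:

```typescript
import axios from "axios";

// Ask Ollama to evict gemma3:4b from VRAM: a generate request with no prompt
// loads nothing and keep_alive: 0 sets the residency timer to zero.
async function unloadTestModel(): Promise<void> {
  await axios.post(
    "http://localhost:11434/api/generate",
    { model: "gemma3:4b", keep_alive: 0 },
    { timeout: 30000 },
  );

  // Optional check: GET /api/ps lists the models currently loaded; after the
  // unload, gemma3:4b should no longer appear in the list.
  const { data } = await axios.get("http://localhost:11434/api/ps");
  const stillLoaded = (data.models ?? []).some(
    (m: { name: string }) => m.name === "gemma3:4b",
  );
  console.log(stillLoaded ? "still resident" : "unloaded");
}

unloadTestModel().catch(console.error);
```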
.github/workflows/full-pipeline.yml (vendored): +7
```diff
@@ -165,6 +165,13 @@ jobs:
             exit 1
           fi
 
+      - name: Unload test model from VRAM
+        if: always()
+        run: |
+          echo "Unloading gemma3:4b from VRAM..."
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
+          echo "Model unloaded"
+
       - name: Upload inference results
         uses: actions/upload-artifact@v4
         if: always()
```
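Two details of this step matter for CI hygiene: `if: always()` runs the unload even when the preceding inference tests fail, and the trailing `|| true` keeps a failed curl (for example, when the Ollama server has already stopped) from failing an otherwise green job.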
.github/workflows/inference.yml (vendored): +7
```diff
@@ -111,6 +111,13 @@ jobs:
             exit 1
           fi
 
+      - name: Unload test model from VRAM
+        if: always()
+        run: |
+          echo "Unloading gemma3:4b from VRAM..."
+          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
+          echo "Model unloaded"
+
       - name: Upload inference results
         uses: actions/upload-artifact@v4
         if: always()
```
```diff
@@ -169,6 +169,11 @@ program
       }
     }
 
+    // Unload judge model from VRAM if LLM was used
+    if (options.llm !== false) {
+      await judge.unloadModel();
+    }
+
     // Create reports (with separate verdicts in dual-judge mode)
     const reports = Reporter.createReports(
       results,
```
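The `options.llm !== false` guard reads naturally under commander's negatable-option convention: a `--no-<name>` flag defaults the option to `true` and sets it to `false` only when the flag is passed. A minimal sketch, assuming the CLI defines a `--no-llm` flag (the flag name is inferred, not shown in this diff):

```typescript
import { Command } from "commander";

const program = new Command();

// commander treats --no-llm as a negatable boolean: options.llm defaults to
// true and becomes false only when the user passes --no-llm.
program.option("--no-llm", "skip LLM judging and use the rule-based fallback");
program.parse(process.argv);

const options = program.opts();

// Matches the guard in the diff: unloadModel() is only worth calling when a
// judge model was actually loaded.
if (options.llm !== false) {
  console.log("LLM judging enabled; judge model will be unloaded afterwards");
} else {
  console.log("--no-llm passed; nothing to unload");
}
```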
```diff
@@ -175,6 +175,30 @@ Respond ONLY with the JSON array, no other text.`;
     }
   }
 
+  // Unload the judge model from VRAM to free memory for other tests
+  async unloadModel(): Promise<void> {
+    try {
+      process.stderr.write(
+        ` Unloading judge model ${this.model} from VRAM...\n`,
+      );
+      await axios.post(
+        `${this.ollamaUrl}/api/generate`,
+        {
+          model: this.model,
+          keep_alive: 0,
+        },
+        {
+          timeout: 30000,
+        },
+      );
+      process.stderr.write(` Judge model unloaded.\n`);
+    } catch (error) {
+      process.stderr.write(
+        ` Warning: Failed to unload judge model: ${error}\n`,
+      );
+    }
+  }
+
   // Fallback: Simple rule-based judgment (no LLM)
   simpleJudge(result: TestResult): Judgment {
     const allStepsPassed = result.steps.every((s) => s.exitCode === 0);
```
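Per the Ollama API documentation, `keep_alive: 0` is the mechanism for immediate eviction; the same field also accepts a duration string such as `"5m"` (the server default) to keep a model resident after a request, or a negative value to pin it indefinitely. A hypothetical helper generalizing unloadModel() over those values; the name and signature here are illustrative, not from this repo:

```typescript
import axios from "axios";

// Set a model's residency in Ollama. keep_alive: 0 unloads immediately, a
// duration string like "5m" keeps the model loaded for that long after the
// request, and a negative value keeps it loaded indefinitely.
async function setKeepAlive(
  ollamaUrl: string,
  model: string,
  keepAlive: number | string,
): Promise<void> {
  await axios.post(
    `${ollamaUrl}/api/generate`,
    { model, keep_alive: keepAlive },
    { timeout: 30000 },
  );
}

// Equivalent to unloadModel() in the diff above:
// await setKeepAlive("http://localhost:11434", "gemma3:4b", 0);
```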