ollama37/.github/workflows/inference.yml
Commit 22e77e0dde by Shang Chieh Tseng, 2025-12-17 16:51:12 +08:00
Unload models from VRAM after use to free GPU memory
- Add unloadModel() method to LLMJudge class
- CLI calls unloadModel() after judging completes
- Workflows unload gemma3:4b after inference tests
- Uses Ollama API with keep_alive:0 to trigger unload
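
The commit message only names the LLMJudge class and its new unloadModel() method, so the TypeScript sketch below is an assumption about how that hook could be wired (constructor, field names, and the CLI plumbing are illustrative, not taken from the repo). The mechanism itself is the documented Ollama behavior the workflow also relies on: a /api/generate request with keep_alive set to 0 tells the server to evict the model from VRAM immediately.

// Hypothetical sketch of the unload hook described in the commit message.
// Only LLMJudge and unloadModel() are named by the commit; the constructor,
// field names, and CLI wiring below are assumed for illustration.
class LLMJudge {
  constructor(
    private readonly model: string, // e.g. "gemma3:12b"
    private readonly host = process.env.OLLAMA_HOST ?? "http://localhost:11434",
  ) {}

  // Ask Ollama to evict the judge model from VRAM right away:
  // an empty /api/generate request with keep_alive: 0 triggers the unload.
  async unloadModel(): Promise<void> {
    await fetch(`${this.host}/api/generate`, {
      method: "POST",
      body: JSON.stringify({ model: this.model, keep_alive: 0 }),
    });
  }
}

// CLI side: release VRAM once judging is done, even if judging failed.
async function runJudging(judge: LLMJudge, judgeAll: () => Promise<void>) {
  try {
    await judgeAll();
  } finally {
    // best effort, like the workflow's `|| true`
    await judge.unloadModel().catch(() => {});
  }
}

This mirrors the curl call the workflow below makes for gemma3:4b; without keep_alive: 0, Ollama keeps a model resident for a few minutes after the last request, which is what pins GPU memory between test runs.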


name: Inference Tests

on:
  workflow_dispatch: # Manual trigger
    inputs:
      use_existing_container:
        description: "Use existing running container"
        required: false
        default: "false"
        type: choice
        options:
          - "true"
          - "false"
      judge_mode:
        description: "Test judge mode"
        required: false
        default: "dual"
        type: choice
        options:
          - "simple"
          - "llm"
          - "dual"
      judge_model:
        description: "LLM model for judging (if llm/dual mode)"
        required: false
        default: "gemma3:12b"
        type: string
  workflow_call: # Called by other workflows
    inputs:
      use_existing_container:
        description: "Container is already running"
        required: false
        default: false
        type: boolean
      judge_mode:
        description: "Test judge mode (simple, llm, dual)"
        required: false
        default: "dual"
        type: string
      judge_model:
        description: "LLM model for judging"
        required: false
        default: "gemma3:12b"
        type: string
    outputs:
      result:
        description: "Inference test result"
        value: ${{ jobs.inference.outputs.result }}

env:
  OLLAMA_HOST: http://localhost:11434

jobs:
  inference:
    name: Inference Tests
    runs-on: self-hosted
    outputs:
      result: ${{ steps.inference-tests.outcome }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Install test runner dependencies
        run: cd tests && npm ci

      - name: Start container (if needed)
        if: ${{ inputs.use_existing_container != 'true' && inputs.use_existing_container != true }}
        run: |
          cd docker
          docker compose down 2>/dev/null || true
          docker compose up -d
          sleep 10

      - name: Run inference tests
        id: inference-tests
        run: |
          cd tests
          # Build judge flags based on input
          JUDGE_FLAGS=""
          if [ "${{ inputs.judge_mode }}" = "simple" ]; then
            JUDGE_FLAGS="--no-llm"
          elif [ "${{ inputs.judge_mode }}" = "llm" ]; then
            JUDGE_FLAGS="--judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
          else
            # dual mode (default)
            JUDGE_FLAGS="--dual-judge --judge-model ${{ inputs.judge_model || 'gemma3:12b' }}"
          fi
          echo "Judge mode: ${{ inputs.judge_mode || 'dual' }}"
          echo "Judge flags: $JUDGE_FLAGS"
          # Progress goes to stderr (visible), JSON results go to file
          npm run --silent dev -- run --suite inference $JUDGE_FLAGS --output json > /tmp/inference-results.json || true
          echo "--- JSON Results ---"
          cat /tmp/inference-results.json

      - name: Check test results
        run: |
          FAILED=$(jq '.summary.failed' /tmp/inference-results.json)
          echo "Failed tests: $FAILED"
          if [ "$FAILED" -gt 0 ]; then
            echo "::error::$FAILED inference test(s) failed"
            exit 1
          fi

      - name: Unload test model from VRAM
        if: always()
        run: |
          echo "Unloading gemma3:4b from VRAM..."
          curl -s http://localhost:11434/api/generate -d '{"model":"gemma3:4b","keep_alive":0}' || true
          echo "Model unloaded"

      - name: Upload inference results
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: inference-test-results
          path: /tmp/inference-results.json

      - name: Stop container (if we started it)
        if: ${{ always() && inputs.use_existing_container != 'true' && inputs.use_existing_container != true }}
        run: |
          cd docker
          docker compose down || true
          echo "Container stopped"