ollama37/.github/workflows/tesla-k80-multi-gpu-tests.yml
Commit d9d3f7b0b4 by Shang Chieh Tseng: Fix GitHub Actions workflows to upload build libraries and remove LD_LIBRARY_PATH
Changes:
- Update tesla-k80-ci.yml to upload build/lib/ollama/ containing CUDA backend
- Remove all LD_LIBRARY_PATH environment variables (no longer needed with RPATH)
- Test workflows now receive libggml-cuda.so, enabling GPU offload

This fixes the issue where test workflows couldn't offload to GPU because the
CUDA backend library wasn't included in the artifact.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-30 15:08:34 +08:00
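
A rough sketch of the fix described above: with an RPATH (e.g. $ORIGIN-relative) baked into the binary, the dynamic loader finds libggml-cuda.so relative to the executable without any LD_LIBRARY_PATH, so the build workflow only needs to ship the library directory in the same artifact. The step below is an assumption about what tesla-k80-ci.yml now does (the step name is invented; the artifact name and paths come from this workflow and the commit message):

      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ollama-binary
          path: |
            ollama
            build/lib/ollama/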


name: Tesla K80 Multi-GPU Tests

on:
  workflow_dispatch: # Manual trigger only
  schedule:
    # Run weekly on Sundays at 2 AM UTC (less frequent than single-GPU tests)
    - cron: "0 2 * * 0"

jobs:
  multi-gpu-test:
    runs-on: self-hosted
    timeout-minutes: 90 # Longer timeout for large models
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download ollama binary from latest build
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: tesla-k80-ci.yml
          name: ollama-binary
          github_token: ${{ secrets.GITHUB_TOKEN }}
          check_artifacts: true
          search_artifacts: true

      - name: Make ollama binary executable
        run: |
          chmod +x ollama
          ls -lh ollama
          ./ollama --version
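
      # Note: a Tesla K80 is a dual-GPU board (two GK210 chips), so a single
      # physical K80 card already appears as two GPUs to nvidia-smi and can
      # satisfy the 2-GPU requirement checked below.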
      - name: Verify multi-GPU setup
        run: |
          # nvidia-smi --list-gpus prints one line per GPU, so wc -l yields the count
          nvidia-smi --list-gpus
          GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
          if [ "$GPU_COUNT" -lt 2 ]; then
            echo "Error: Multi-GPU tests require at least 2 GPUs. Found: $GPU_COUNT"
            exit 1
          fi
          echo "Found $GPU_COUNT GPUs - proceeding with multi-GPU tests"

      - name: Build test-runner
        run: |
          cd cmd/test-runner
          # "|| true" keeps this step idempotent when go.mod already exists from a previous run
          go mod init github.com/ollama/ollama/cmd/test-runner || true
          go mod tidy
          go build -o ../../test-runner .
          cd ../..
          ls -lh test-runner

      - name: Validate multi-GPU test configuration
        run: |
          ./test-runner validate --config test/config/models.yaml --profile multi-gpu

      - name: Run multi-GPU tests
        run: |
          ./test-runner run --profile multi-gpu --config test/config/models.yaml --output test-report-multi-gpu --verbose
        timeout-minutes: 60

      - name: Check multi-GPU test results
        run: |
          # jq -e exits non-zero when the expression evaluates to false or null
          if ! jq -e '.summary.failed == 0' test-report-multi-gpu.json; then
            echo "Multi-GPU tests failed!"
            jq '.results[] | select(.status == "FAILED")' test-report-multi-gpu.json
            exit 1
          fi
          echo "All multi-GPU tests passed!"
      - name: Display GPU memory usage
        if: always()
        run: |
          echo "=== Final GPU Memory State ==="
          nvidia-smi

      - name: Upload multi-GPU test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: multi-gpu-test-results
          path: |
            test-report-multi-gpu.json
            test-report-multi-gpu.md
            ollama.log
          retention-days: 30 # Keep longer for analysis
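
Because the only on-demand trigger is workflow_dispatch, a run has to be started manually (or wait for the weekly cron). A minimal manual trigger from the GitHub CLI, assuming gh is authenticated against this repository, would look like:

    gh workflow run tesla-k80-multi-gpu-tests.yml
    gh run watch   # optionally select and follow the triggered run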