Add GitHub Actions CI/CD pipeline and test framework

- Add .github/workflows/build-test.yml for automated testing
- Add tests/ directory with TypeScript test runner
- Add docs/CICD.md documentation
- Remove .gitlab-ci.yml (migrated to GitHub Actions)
- Update .gitignore for test artifacts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Shang Chieh Tseng
2025-12-15 14:06:44 +08:00
parent 2b5aeaf86b
commit d11140c016
23 changed files with 3014 additions and 50 deletions

187
.github/workflows/build-test.yml vendored Normal file

@@ -0,0 +1,187 @@
name: Build and Test
on:
push:
branches: [main]
pull_request:
branches: [main]
workflow_dispatch:
env:
TESTLINK_URL: http://localhost:8090
TESTLINK_PROJECT_ID: "1"
OLLAMA_HOST: http://localhost:11434
jobs:
build:
name: Build Docker Images
runs-on: [self-hosted, k80, cuda11]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install test runner dependencies
run: cd tests && npm ci
- name: Run build tests
id: build-tests
run: |
cd tests
npm run dev -- run --suite build --no-llm --output json > /tmp/build-results.json 2>&1 || true
cat /tmp/build-results.json
# Check if any tests failed
if grep -q '"pass": false' /tmp/build-results.json; then
echo "Some build tests failed"
exit 1
fi
- name: Upload build results
uses: actions/upload-artifact@v4
if: always()
with:
name: build-test-results
path: /tmp/build-results.json
runtime:
name: Runtime Tests
runs-on: [self-hosted, k80, cuda11]
needs: build
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install test runner dependencies
run: cd tests && npm ci
- name: Start container
run: |
cd docker
docker compose down 2>/dev/null || true
docker compose up -d
sleep 10
- name: Run runtime tests
id: runtime-tests
run: |
cd tests
npm run dev -- run --suite runtime --no-llm --output json > /tmp/runtime-results.json 2>&1 || true
cat /tmp/runtime-results.json
- name: Upload runtime results
uses: actions/upload-artifact@v4
if: always()
with:
name: runtime-test-results
path: /tmp/runtime-results.json
inference:
name: Inference Tests
runs-on: [self-hosted, k80, cuda11]
needs: runtime
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install test runner dependencies
run: cd tests && npm ci
- name: Run inference tests
id: inference-tests
run: |
cd tests
npm run dev -- run --suite inference --no-llm --output json > /tmp/inference-results.json 2>&1 || true
cat /tmp/inference-results.json
- name: Upload inference results
uses: actions/upload-artifact@v4
if: always()
with:
name: inference-test-results
path: /tmp/inference-results.json
llm-judge:
name: LLM Judge Evaluation
runs-on: [self-hosted, k80, cuda11]
needs: [build, runtime, inference]
if: always()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
- name: Install test runner dependencies
run: cd tests && npm ci
- name: Download all test results
uses: actions/download-artifact@v4
with:
path: /tmp/results
- name: Run LLM judge on all results
run: |
cd tests
echo "Running LLM judge evaluation..."
# Re-run all tests with LLM judge using local Ollama
npm run dev -- run --output json > /tmp/llm-judged-results.json 2>&1 || true
cat /tmp/llm-judged-results.json
- name: Upload final results
uses: actions/upload-artifact@v4
if: always()
with:
name: llm-judged-results
path: /tmp/llm-judged-results.json
cleanup:
name: Cleanup
runs-on: [self-hosted, k80, cuda11]
needs: [build, runtime, inference, llm-judge]
if: always()
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Stop Container
run: |
cd docker
docker compose down || true
echo "Container stopped"
- name: Summary
run: |
echo "## Build and Test Summary" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "| Stage | Status |" >> $GITHUB_STEP_SUMMARY
echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY
echo "| Build | ${{ needs.build.result }} |" >> $GITHUB_STEP_SUMMARY
echo "| Runtime | ${{ needs.runtime.result }} |" >> $GITHUB_STEP_SUMMARY
echo "| Inference | ${{ needs.inference.result }} |" >> $GITHUB_STEP_SUMMARY
echo "| LLM Judge | ${{ needs.llm-judge.result }} |" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "Commit: ${{ github.sha }}" >> $GITHUB_STEP_SUMMARY

4
.gitignore vendored

@@ -4,7 +4,8 @@
.venv
.swp
dist
build
/build
!tests/testcases/build
.cache
.gocache
*.exe
@@ -16,3 +17,4 @@ llama/build
llama/vendor
/ollama
docker/output/
tests/node_modules/

.gitlab-ci.yml

@@ -1,49 +0,0 @@
# This file is a template, and might need editing before it works on your project.
# This is a sample GitLab CI/CD configuration file that should run without any modifications.
# It demonstrates a basic 3 stage CI/CD pipeline. Instead of real tests or scripts,
# it uses echo commands to simulate the pipeline execution.
#
# A pipeline is composed of independent jobs that run scripts, grouped into stages.
# Stages run in sequential order, but jobs within stages run in parallel.
#
# For more information, see: https://docs.gitlab.com/ee/ci/yaml/#stages
#
# You can copy and paste this template into a new `.gitlab-ci.yml` file.
# You should not add this template to an existing `.gitlab-ci.yml` file by using the `include:` keyword.
#
# To contribute improvements to CI/CD templates, please follow the Development guide at:
# https://docs.gitlab.com/development/cicd/templates/
# This specific template is located at:
# https://gitlab.com/gitlab-org/gitlab/-/blob/master/lib/gitlab/ci/templates/Getting-Started.gitlab-ci.yml
stages: # List of stages for jobs, and their order of execution
- build
- test
- deploy
build-job: # This job runs in the build stage, which runs first.
stage: build
script:
- echo "Compiling the code..."
- echo "Compile complete."
unit-test-job: # This job runs in the test stage.
stage: test # It only starts when the job in the build stage completes successfully.
script:
- echo "Running unit tests... This will take about 60 seconds."
- sleep 60
- echo "Code coverage is 90%"
lint-test-job: # This job also runs in the test stage.
stage: test # It can run at the same time as unit-test-job (in parallel).
script:
- echo "Linting code... This will take about 10 seconds."
- sleep 10
- echo "No lint issues found."
deploy-job: # This job runs in the deploy stage.
stage: deploy # It only runs when *both* jobs in the test stage complete successfully.
environment: production
script:
- echo "Deploying application..."
- echo "Application successfully deployed."

318
docs/CICD.md Normal file

@@ -0,0 +1,318 @@
# CI/CD Plan for Ollama37
This document describes the CI/CD pipeline for building and testing Ollama37 with Tesla K80 (CUDA compute capability 3.7) support.
## Infrastructure Overview
```
┌─────────────────────────────────────────────────────────────────────────┐
│ GITHUB │
│ dogkeeper886/ollama37 │
│ │
│ Push to main ──────────────────────────────────────────────────────┐ │
└─────────────────────────────────────────────────────────────────────│───┘
┌─────────────────────────────────────────────────────────────────────────┐
│ CI/CD NODE │
│ │
│ Hardware: │
│ - Tesla K80 GPU (compute capability 3.7) │
│ - NVIDIA Driver 470.x │
│ │
│ Software: │
│ - Rocky Linux 9.7 │
│ - Docker 29.1.3 + Docker Compose 5.0.0 │
│ - NVIDIA Container Toolkit │
│ - GitHub Actions Runner (self-hosted, labels: k80, cuda11) │
│ │
│ Services: │
│ - TestLink (http://localhost:8090) - Test management │
│ - TestLink MCP - Claude Code integration │
│ │
└─────────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────────┐
│ SERVE NODE │
│ │
│ Services: │
│ - Ollama (production) │
│ - Dify (LLM application platform) │
│ │
└─────────────────────────────────────────────────────────────────────────┘
```
## Build Strategy: Docker-Based
We use the two-stage Docker build system located in `/docker/`:
### Stage 1: Builder Image (Cached)
**Image:** `ollama37-builder:latest` (~15GB)
**Contents:**
- Rocky Linux 8
- CUDA 11.4 toolkit
- GCC 10 (built from source)
- CMake 4.0 (built from source)
- Go 1.25.3
**Build time:** ~90 minutes (first time only, then cached)
**Build command:**
```bash
cd docker && make build-builder
```
### Stage 2: Runtime Image (Per Build)
**Image:** `ollama37:latest` (~18GB)
**Process:**
1. Clone source from GitHub
2. Configure with CMake ("CUDA 11" preset)
3. Build C/C++/CUDA libraries
4. Build Go binary
5. Package runtime environment
**Build time:** ~10 minutes
**Build command:**
```bash
cd docker && make build-runtime
```
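In rough terms, the runtime stage runs the following inside the builder image (a sketch only; the clone URL and exact flags are defined by the files in `docker/`, and the preset name follows the upstream ollama CMake presets):
```bash
# Sketch of the runtime-image build steps; docker/ holds the authoritative version.
git clone https://github.com/dogkeeper886/ollama37.git && cd ollama37
cmake --preset "CUDA 11"                      # configure C/C++/CUDA libraries
cmake --build --preset "CUDA 11" --parallel   # compile the CUDA backend for compute 3.7
go build -o ollama .                          # build the Go binary
```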
## Pipeline Stages
### Stage 1: Docker Build
**Trigger:** Push to `main` branch
**Steps:**
1. Checkout repository
2. Ensure builder image exists (build if not)
3. Build runtime image: `make build-runtime`
4. Verify image created successfully
**Test Cases:**
- TC-BUILD-001: Builder Image Verification
- TC-BUILD-002: Runtime Image Build
- TC-BUILD-003: Image Size Validation
### Stage 2: Container Startup
**Steps:**
1. Start container with GPU: `docker compose up -d`
2. Wait for health check to pass
3. Verify Ollama server is responding
**Test Cases:**
- TC-RUNTIME-001: Container Startup
- TC-RUNTIME-002: GPU Detection
- TC-RUNTIME-003: Health Check
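A minimal sketch of this stage, condensed from TC-RUNTIME-001 and TC-RUNTIME-003 (the test cases remain authoritative, including their retry limits):
```bash
cd docker && docker compose up -d
# Poll the container health check until Ollama answers (TC-RUNTIME-003 caps this at 30 attempts)
until [ "$(docker inspect ollama37 --format '{{.State.Health.Status}}')" = "healthy" ]; do
  sleep 5
done
curl -sf http://localhost:11434/api/tags    # API responds once healthy
```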
### Stage 3: Inference Tests
**Steps:**
1. Pull test model (gemma3:4b)
2. Run inference tests
3. Verify CUBLAS legacy fallback
**Test Cases:**
- TC-INFERENCE-001: Model Pull
- TC-INFERENCE-002: Basic Inference
- TC-INFERENCE-003: API Endpoint Test
- TC-INFERENCE-004: CUBLAS Fallback Verification
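Condensed from the inference test cases, the stage amounts to:
```bash
# Pull the test model only if missing (TC-INFERENCE-001)
docker exec ollama37 ollama list | grep -q "gemma3:4b" || docker exec ollama37 ollama pull gemma3:4b
# Simple prompt; the judge accepts "4", "four", etc. (TC-INFERENCE-002)
docker exec ollama37 ollama run gemma3:4b "What is 2+2? Answer with just the number."
# Only genuine CUBLAS errors fail the stage; the K80 legacy fallback is expected (TC-INFERENCE-004)
cd docker && docker compose logs 2>&1 | grep -i "CUBLAS_STATUS" | grep -v "SUCCESS" || echo "No CUBLAS errors"
```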
### Stage 4: Cleanup & Report
**Steps:**
1. Stop container: `docker compose down`
2. Report results to TestLink
3. Clean up resources
## Test Case Design
### Build Tests (Suite: Build Tests)
| ID | Name | Type | Description |
|----|------|------|-------------|
| TC-BUILD-001 | Builder Image Verification | Automated | Verify builder image exists with correct tools |
| TC-BUILD-002 | Runtime Image Build | Automated | Build runtime image from GitHub source |
| TC-BUILD-003 | Image Size Validation | Automated | Verify image sizes are within expected range |
### Runtime Tests (Suite: Runtime Tests)
| ID | Name | Type | Description |
|----|------|------|-------------|
| TC-RUNTIME-001 | Container Startup | Automated | Start container with GPU passthrough |
| TC-RUNTIME-002 | GPU Detection | Automated | Verify Tesla K80 detected inside container |
| TC-RUNTIME-003 | Health Check | Automated | Verify Ollama health check passes |
### Inference Tests (Suite: Inference Tests)
| ID | Name | Type | Description |
|----|------|------|-------------|
| TC-INFERENCE-001 | Model Pull | Automated | Pull gemma3:4b model |
| TC-INFERENCE-002 | Basic Inference | Automated | Run simple prompt and verify response |
| TC-INFERENCE-003 | API Endpoint Test | Automated | Test /api/generate endpoint |
| TC-INFERENCE-004 | CUBLAS Fallback Verification | Automated | Verify legacy CUBLAS functions used |
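Each case is a YAML file under `tests/testcases/` and can be run locally with the TypeScript runner, for example:
```bash
cd tests && npm ci
npm run dev -- list                          # show all discovered test cases
npm run dev -- run --suite build --no-llm    # one suite, exit-code judging only
npm run dev -- run --id TC-INFERENCE-002     # single case, judged by the local LLM
```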
## GitHub Actions Workflow
**File:** `.github/workflows/build-test.yml`
**Triggers:**
- Push to `main` branch
- Pull request to `main` branch
- Manual trigger (workflow_dispatch)
**Runner:** Self-hosted with labels `[self-hosted, k80, cuda11]`
**Jobs:**
1. `build` - Build Docker images and run build tests
2. `runtime` - Start the container and run runtime tests
3. `inference` - Run inference tests against the running container
4. `llm-judge` - Re-run tests with LLM-as-judge evaluation (local Ollama)
5. `cleanup` - Stop the container and write the job summary
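Besides pushes and pull requests, a run can be started by hand; with the GitHub CLI (assuming `gh` is installed and authenticated against the repository):
```bash
gh workflow run build-test.yml --ref main
gh run watch    # follow the run from the terminal
```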
## TestLink Integration
**URL:** http://localhost:8090
**Project:** ollama37
**Test Suites:**
- Build Tests
- Runtime Tests
- Inference Tests
**Test Plan:** Created per release/sprint
**Builds:** Created per CI run (commit SHA)
**Execution Recording:**
- Each test case result recorded via TestLink API
- Pass/Fail status with notes
- Linked to specific build/commit
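The runner's TestLink reporter currently only logs what it would send. A result can also be recorded directly through TestLink's XML-RPC API; a sketch, assuming the default endpoint path and placeholder plan/build/test-case IDs:
```bash
# tl.reportTCResult with placeholder IDs; real IDs come from the TestLink plan and build.
curl -s "http://localhost:8090/lib/api/xmlrpc/v1/xmlrpc.php" \
  -H 'Content-Type: text/xml' --data-binary @- <<EOF
<?xml version="1.0"?>
<methodCall>
  <methodName>tl.reportTCResult</methodName>
  <params><param><value><struct>
    <member><name>devKey</name><value><string>${TESTLINK_API_KEY}</string></value></member>
    <member><name>testcaseexternalid</name><value><string>ollama37-1</string></value></member>
    <member><name>testplanid</name><value><int>1</int></value></member>
    <member><name>buildid</name><value><int>1</int></value></member>
    <member><name>status</name><value><string>p</string></value></member>
    <member><name>notes</name><value><string>Recorded by CI</string></value></member>
  </struct></value></param></params>
</methodCall>
EOF
```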
## Makefile Targets for CI
| Target | Description | When to Use |
|--------|-------------|-------------|
| `make build-builder` | Build base image | First time setup |
| `make build-runtime` | Build from GitHub | Normal CI builds |
| `make build-runtime-no-cache` | Fresh GitHub clone | When cache is stale |
| `make build-runtime-local` | Build from local source tree | Local testing |
## Environment Variables
### Build Environment
| Variable | Value | Description |
|----------|-------|-------------|
| `BUILDER_IMAGE` | ollama37-builder | Builder image name |
| `RUNTIME_IMAGE` | ollama37 | Runtime image name |
### Runtime Environment
| Variable | Value | Description |
|----------|-------|-------------|
| `OLLAMA_HOST` | 0.0.0.0:11434 | Server listen address |
| `NVIDIA_VISIBLE_DEVICES` | all | GPU visibility |
| `OLLAMA_DEBUG` | 1 (optional) | Enable debug logging |
| `GGML_CUDA_DEBUG` | 1 (optional) | Enable CUDA debug |
### TestLink Environment
| Variable | Value | Description |
|----------|-------|-------------|
| `TESTLINK_URL` | http://localhost:8090 | TestLink server URL |
| `TESTLINK_API_KEY` | (configured) | API key for automation |
## Prerequisites
### One-Time Setup on CI/CD Node
1. **Install GitHub Actions Runner:**
```bash
mkdir -p ~/actions-runner && cd ~/actions-runner
curl -o actions-runner-linux-x64-2.321.0.tar.gz -L \
https://github.com/actions/runner/releases/download/v2.321.0/actions-runner-linux-x64-2.321.0.tar.gz
tar xzf ./actions-runner-linux-x64-2.321.0.tar.gz
./config.sh --url https://github.com/dogkeeper886/ollama37 --token YOUR_TOKEN --labels k80,cuda11
sudo ./svc.sh install && sudo ./svc.sh start
```
2. **Build Builder Image (one-time, ~90 min):**
```bash
cd /home/jack/src/ollama37/docker
make build-builder
```
3. **Verify GPU Access in Docker:**
```bash
docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
```
4. **Start TestLink:**
```bash
cd /home/jack/src/testlink-code
docker compose up -d
```
## Monitoring & Logs
### View CI/CD Logs
```bash
# GitHub Actions Runner logs
journalctl -u actions.runner.* -f
# Docker build logs
docker compose logs -f
# TestLink logs
cd /home/jack/src/testlink-code && docker compose logs -f
```
### Test Results
- **TestLink Dashboard:** http://localhost:8090
- **GitHub Actions:** https://github.com/dogkeeper886/ollama37/actions
## Troubleshooting
### Builder Image Missing
```bash
cd docker && make build-builder
```
### GPU Not Detected in Container
```bash
# Check UVM device files on host
ls -l /dev/nvidia-uvm*
# Create if missing
nvidia-modprobe -u -c=0
# Restart container
docker compose restart
```
### Build Cache Stale
```bash
cd docker && make build-runtime-no-cache
```
### TestLink Connection Failed
```bash
# Check TestLink is running
curl http://localhost:8090
# Restart if needed
cd /home/jack/src/testlink-code && docker compose restart
```

1426
tests/package-lock.json generated Normal file

File diff suppressed because it is too large

33
tests/package.json Normal file

@@ -0,0 +1,33 @@
{
"name": "ollama37-test-runner",
"version": "1.0.0",
"description": "Scalable test runner with LLM-as-judge for ollama37",
"type": "module",
"main": "dist/index.js",
"bin": {
"ollama37-test": "dist/cli.js"
},
"scripts": {
"build": "tsc",
"start": "node dist/cli.js",
"dev": "tsx src/cli.ts",
"test": "tsx src/cli.ts run",
"test:build": "tsx src/cli.ts run --suite build",
"test:runtime": "tsx src/cli.ts run --suite runtime",
"test:inference": "tsx src/cli.ts run --suite inference"
},
"dependencies": {
"axios": "^1.7.2",
"chalk": "^5.3.0",
"commander": "^12.1.0",
"glob": "^10.3.10",
"js-yaml": "^4.1.0",
"p-limit": "^5.0.0"
},
"devDependencies": {
"@types/js-yaml": "^4.0.9",
"@types/node": "^20.14.0",
"tsx": "^4.16.0",
"typescript": "^5.5.0"
}
}

165
tests/src/cli.ts Normal file

@@ -0,0 +1,165 @@
#!/usr/bin/env node
import { Command } from 'commander'
import { writeFileSync } from 'fs'
import path from 'path'
import { fileURLToPath } from 'url'
import { TestLoader } from './loader.js'
import { TestExecutor } from './executor.js'
import { LLMJudge } from './judge.js'
import { Reporter, TestLinkReporter } from './reporter.js'
import { RunnerOptions } from './types.js'
const __dirname = path.dirname(fileURLToPath(import.meta.url))
const defaultTestcasesDir = path.join(__dirname, '..', 'testcases')
const program = new Command()
program
.name('ollama37-test')
.description('Scalable test runner with LLM-as-judge for ollama37')
.version('1.0.0')
program
.command('run')
.description('Run test cases')
.option('-s, --suite <suite>', 'Run only tests in specified suite (build, runtime, inference)')
.option('-i, --id <id>', 'Run only specified test case by ID')
.option('-w, --workers <n>', 'Number of parallel workers', '1')
.option('-d, --dry-run', 'Show what would be executed without running')
.option('-o, --output <format>', 'Output format: console, json, junit', 'console')
.option('--report-testlink', 'Report results to TestLink')
.option('--ollama-url <url>', 'Ollama server URL', 'http://localhost:11434')
.option('--ollama-model <model>', 'Ollama model for judging', 'gemma3:4b')
.option('--testlink-url <url>', 'TestLink server URL', 'http://localhost:8090')
.option('--testlink-api-key <key>', 'TestLink API key')
.option('--no-llm', 'Skip LLM judging, use simple exit code check')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.action(async (options) => {
console.log('='.repeat(60))
console.log('OLLAMA37 TEST RUNNER')
console.log('='.repeat(60))
const loader = new TestLoader(options.testcasesDir)
const executor = new TestExecutor(path.join(__dirname, '..', '..'))
const judge = new LLMJudge(options.ollamaUrl, options.ollamaModel)
// Load test cases
console.log('\nLoading test cases...')
let testCases = await loader.loadAll()
if (options.suite) {
testCases = testCases.filter(tc => tc.suite === options.suite)
console.log(` Filtered by suite: ${options.suite}`)
}
if (options.id) {
testCases = testCases.filter(tc => tc.id === options.id)
console.log(` Filtered by ID: ${options.id}`)
}
// Sort by dependencies
testCases = loader.sortByDependencies(testCases)
console.log(` Found ${testCases.length} test cases`)
if (testCases.length === 0) {
console.log('\nNo test cases found!')
process.exit(1)
}
// Dry run
if (options.dryRun) {
console.log('\nDRY RUN - Would execute:')
for (const tc of testCases) {
console.log(` ${tc.id}: ${tc.name}`)
for (const step of tc.steps) {
console.log(` - ${step.name}: ${step.command}`)
}
}
process.exit(0)
}
// Execute tests
console.log('\nExecuting tests...')
const workers = parseInt(options.workers)
const results = await executor.executeAll(testCases, workers)
// Judge results
console.log('\nJudging results...')
let judgments
if (options.llm === false) {
console.log(' Using simple exit code check (--no-llm)')
judgments = results.map(r => judge.simpleJudge(r))
} else {
try {
judgments = await judge.judgeResults(results)
} catch (error) {
console.error(' LLM judging failed, falling back to simple check:', error)
judgments = results.map(r => judge.simpleJudge(r))
}
}
// Create reports
const reports = Reporter.createReports(results, judgments)
// Output results
switch (options.output) {
case 'json':
const json = Reporter.toJSON(reports)
console.log(json)
writeFileSync('test-results.json', json)
console.log('\nResults written to test-results.json')
break
case 'junit':
const junit = Reporter.toJUnit(reports)
writeFileSync('test-results.xml', junit)
console.log('\nResults written to test-results.xml')
break
case 'console':
default:
Reporter.toConsole(reports)
break
}
// Report to TestLink
if (options.reportTestlink && options.testlinkApiKey) {
const testlinkReporter = new TestLinkReporter(
options.testlinkUrl,
options.testlinkApiKey
)
// Would need plan ID and build ID
// await testlinkReporter.reportResults(reports, planId, buildId)
console.log('\nTestLink reporting not yet implemented')
}
// Exit with appropriate code
const failed = reports.filter(r => !r.pass).length
process.exit(failed > 0 ? 1 : 0)
})
program
.command('list')
.description('List all test cases')
.option('--testcases-dir <dir>', 'Test cases directory', defaultTestcasesDir)
.action(async (options) => {
const loader = new TestLoader(options.testcasesDir)
const testCases = await loader.loadAll()
const grouped = loader.groupBySuite(testCases)
console.log('Available Test Cases:\n')
for (const [suite, cases] of grouped) {
console.log(`${suite.toUpperCase()}:`)
for (const tc of cases) {
console.log(` ${tc.id}: ${tc.name}`)
}
console.log()
}
console.log(`Total: ${testCases.length} test cases`)
})
program.parse()

119
tests/src/executor.ts Normal file

@@ -0,0 +1,119 @@
import { exec } from 'child_process'
import { promisify } from 'util'
import { TestCase, TestResult, StepResult } from './types.js'
const execAsync = promisify(exec)
export class TestExecutor {
private workingDir: string
constructor(workingDir: string = process.cwd()) {
this.workingDir = workingDir
}
async executeStep(command: string, timeout: number): Promise<StepResult> {
const startTime = Date.now()
let stdout = ''
let stderr = ''
let exitCode = 0
try {
const result = await execAsync(command, {
cwd: this.workingDir,
timeout,
maxBuffer: 10 * 1024 * 1024, // 10MB buffer
shell: '/bin/bash'
})
stdout = result.stdout
stderr = result.stderr
} catch (error: any) {
stdout = error.stdout || ''
stderr = error.stderr || error.message || 'Unknown error'
exitCode = error.code || 1
}
const duration = Date.now() - startTime
return {
name: '',
command,
stdout,
stderr,
exitCode,
duration
}
}
async executeTestCase(testCase: TestCase): Promise<TestResult> {
const startTime = Date.now()
const stepResults: StepResult[] = []
console.log(` Executing: ${testCase.id} - ${testCase.name}`)
for (const step of testCase.steps) {
console.log(` Step: ${step.name}`)
const timeout = step.timeout || testCase.timeout
const result = await this.executeStep(step.command, timeout)
result.name = step.name
stepResults.push(result)
// Log step result
if (result.exitCode === 0) {
console.log(` Exit: ${result.exitCode} (${result.duration}ms)`)
} else {
console.log(` Exit: ${result.exitCode} (FAILED, ${result.duration}ms)`)
}
}
const totalDuration = Date.now() - startTime
// Combine all logs
const logs = stepResults.map(r => {
return `=== Step: ${r.name} ===
Command: ${r.command}
Exit Code: ${r.exitCode}
Duration: ${r.duration}ms
STDOUT:
${r.stdout || '(empty)'}
STDERR:
${r.stderr || '(empty)'}
`
}).join('\n' + '='.repeat(50) + '\n')
return {
testCase,
steps: stepResults,
totalDuration,
logs
}
}
async executeAll(testCases: TestCase[], concurrency: number = 1): Promise<TestResult[]> {
const results: TestResult[] = []
if (concurrency === 1) {
// Sequential execution
for (const tc of testCases) {
const result = await this.executeTestCase(tc)
results.push(result)
}
} else {
// Parallel execution with p-limit
const pLimit = (await import('p-limit')).default
const limit = pLimit(concurrency)
const promises = testCases.map(tc =>
limit(() => this.executeTestCase(tc))
)
const parallelResults = await Promise.all(promises)
results.push(...parallelResults)
}
return results
}
}

146
tests/src/judge.ts Normal file

@@ -0,0 +1,146 @@
import axios from 'axios'
import { TestResult, Judgment } from './types.js'
export class LLMJudge {
private ollamaUrl: string
private model: string
private batchSize: number
constructor(ollamaUrl: string = 'http://localhost:11434', model: string = 'gemma3:4b') {
this.ollamaUrl = ollamaUrl
this.model = model
this.batchSize = 5 // Judge 5 tests per LLM call
}
private buildPrompt(results: TestResult[]): string {
const testsSection = results.map((r, i) => {
return `
### Test ${i + 1}: ${r.testCase.id} - ${r.testCase.name}
**Criteria:**
${r.testCase.criteria}
**Execution Logs:**
\`\`\`
${r.logs.substring(0, 3000)}${r.logs.length > 3000 ? '\n... (truncated)' : ''}
\`\`\`
`
}).join('\n---\n')
return `You are a test evaluation judge. Analyze the following test results and determine if each test passed or failed based on the criteria provided.
For each test, examine:
1. The expected criteria
2. The actual execution logs (stdout, stderr, exit codes)
3. Whether the output meets the criteria (use fuzzy matching for AI outputs)
${testsSection}
Respond with a JSON array containing one object per test:
[
{"testId": "TC-XXX-001", "pass": true, "reason": "Brief explanation"},
{"testId": "TC-XXX-002", "pass": false, "reason": "Brief explanation"}
]
Important:
- For AI-generated text, accept reasonable variations (e.g., "4", "four", "The answer is 4" are all valid for math questions)
- For build/runtime tests, check exit codes and absence of error messages
- Be lenient with formatting differences, focus on semantic correctness
Respond ONLY with the JSON array, no other text.`
}
async judgeResults(results: TestResult[]): Promise<Judgment[]> {
const allJudgments: Judgment[] = []
// Process in batches
for (let i = 0; i < results.length; i += this.batchSize) {
const batch = results.slice(i, i + this.batchSize)
console.log(` Judging batch ${Math.floor(i / this.batchSize) + 1}/${Math.ceil(results.length / this.batchSize)}...`)
try {
const judgments = await this.judgeBatch(batch)
allJudgments.push(...judgments)
} catch (error) {
console.error(` Failed to judge batch:`, error)
// Mark all tests in batch as failed
for (const r of batch) {
allJudgments.push({
testId: r.testCase.id,
pass: false,
reason: 'LLM judgment failed: ' + String(error)
})
}
}
}
return allJudgments
}
private async judgeBatch(results: TestResult[]): Promise<Judgment[]> {
const prompt = this.buildPrompt(results)
const response = await axios.post(`${this.ollamaUrl}/api/generate`, {
model: this.model,
prompt,
stream: false,
options: {
temperature: 0.1, // Low temperature for consistent judging
num_predict: 1000
}
}, {
timeout: 120000 // 2 minute timeout
})
const responseText = response.data.response
// Extract JSON from response
const jsonMatch = responseText.match(/\[[\s\S]*\]/)
if (!jsonMatch) {
throw new Error('No JSON array found in LLM response')
}
try {
const judgments = JSON.parse(jsonMatch[0]) as Judgment[]
// Validate and fill missing
const resultIds = results.map(r => r.testCase.id)
const judgedIds = new Set(judgments.map(j => j.testId))
// Add missing judgments
for (const id of resultIds) {
if (!judgedIds.has(id)) {
judgments.push({
testId: id,
pass: false,
reason: 'No judgment provided by LLM'
})
}
}
return judgments
} catch (parseError) {
throw new Error(`Failed to parse LLM response: ${responseText.substring(0, 200)}`)
}
}
// Fallback: Simple rule-based judgment (no LLM)
simpleJudge(result: TestResult): Judgment {
const allStepsPassed = result.steps.every(s => s.exitCode === 0)
if (allStepsPassed) {
return {
testId: result.testCase.id,
pass: true,
reason: 'All steps completed with exit code 0'
}
} else {
const failedSteps = result.steps.filter(s => s.exitCode !== 0)
return {
testId: result.testCase.id,
pass: false,
reason: `Steps failed: ${failedSteps.map(s => s.name).join(', ')}`
}
}
}
}

91
tests/src/loader.ts Normal file

@@ -0,0 +1,91 @@
import { readFileSync } from 'fs'
import { glob } from 'glob'
import yaml from 'js-yaml'
import path from 'path'
import { TestCase } from './types.js'
export class TestLoader {
private testcasesDir: string
constructor(testcasesDir: string = './testcases') {
this.testcasesDir = testcasesDir
}
async loadAll(): Promise<TestCase[]> {
const pattern = path.join(this.testcasesDir, '**/*.yml')
const files = await glob(pattern)
const testCases: TestCase[] = []
for (const file of files) {
try {
const content = readFileSync(file, 'utf-8')
const testCase = yaml.load(content) as TestCase
// Set defaults
testCase.timeout = testCase.timeout || 60000
testCase.dependencies = testCase.dependencies || []
testCase.priority = testCase.priority || 1
testCases.push(testCase)
} catch (error) {
console.error(`Failed to load ${file}:`, error)
}
}
return testCases
}
async loadBySuite(suite: string): Promise<TestCase[]> {
const all = await this.loadAll()
return all.filter(tc => tc.suite === suite)
}
async loadById(id: string): Promise<TestCase | undefined> {
const all = await this.loadAll()
return all.find(tc => tc.id === id)
}
// Sort test cases by dependencies (topological sort)
sortByDependencies(testCases: TestCase[]): TestCase[] {
const sorted: TestCase[] = []
const visited = new Set<string>()
const idMap = new Map(testCases.map(tc => [tc.id, tc]))
const visit = (tc: TestCase) => {
if (visited.has(tc.id)) return
visited.add(tc.id)
// Visit dependencies first
for (const depId of tc.dependencies) {
const dep = idMap.get(depId)
if (dep) visit(dep)
}
sorted.push(tc)
}
// Sort by priority first, then by dependencies
const byPriority = [...testCases].sort((a, b) => a.priority - b.priority)
for (const tc of byPriority) {
visit(tc)
}
return sorted
}
// Group test cases by suite for parallel execution
groupBySuite(testCases: TestCase[]): Map<string, TestCase[]> {
const groups = new Map<string, TestCase[]>()
for (const tc of testCases) {
const suite = tc.suite
if (!groups.has(suite)) {
groups.set(suite, [])
}
groups.get(suite)!.push(tc)
}
return groups
}
}

138
tests/src/reporter.ts Normal file

@@ -0,0 +1,138 @@
import axios from 'axios'
import { TestReport, Judgment, TestResult } from './types.js'
export class Reporter {
// Console reporter
static toConsole(reports: TestReport[]): void {
console.log('\n' + '='.repeat(60))
console.log('TEST RESULTS')
console.log('='.repeat(60))
const passed = reports.filter(r => r.pass)
const failed = reports.filter(r => !r.pass)
for (const report of reports) {
const status = report.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m'
console.log(`[${status}] ${report.testId}: ${report.name}`)
console.log(` Reason: ${report.reason}`)
console.log(` Duration: ${report.duration}ms`)
}
console.log('\n' + '-'.repeat(60))
console.log(`Total: ${reports.length} | Passed: ${passed.length} | Failed: ${failed.length}`)
console.log('='.repeat(60))
}
// JSON reporter
static toJSON(reports: TestReport[]): string {
return JSON.stringify({
summary: {
total: reports.length,
passed: reports.filter(r => r.pass).length,
failed: reports.filter(r => !r.pass).length,
timestamp: new Date().toISOString()
},
results: reports
}, null, 2)
}
// JUnit XML reporter (for CI/CD integration)
static toJUnit(reports: TestReport[]): string {
const escapeXml = (s: string) => s
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;')
.replace(/>/g, '&gt;')
.replace(/"/g, '&quot;')
.replace(/'/g, '&apos;')
const testcases = reports.map(r => {
if (r.pass) {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}"/>`
} else {
return ` <testcase name="${escapeXml(r.testId)}: ${escapeXml(r.name)}" classname="${r.suite}" time="${r.duration / 1000}">
<failure message="${escapeXml(r.reason)}">${escapeXml(r.logs.substring(0, 1000))}</failure>
</testcase>`
}
}).join('\n')
const failures = reports.filter(r => !r.pass).length
const time = reports.reduce((sum, r) => sum + r.duration, 0) / 1000
return `<?xml version="1.0" encoding="UTF-8"?>
<testsuite name="ollama37-tests" tests="${reports.length}" failures="${failures}" time="${time}">
${testcases}
</testsuite>`
}
// Combine results and judgments into reports
static createReports(results: TestResult[], judgments: Judgment[]): TestReport[] {
const judgmentMap = new Map(judgments.map(j => [j.testId, j]))
return results.map(result => {
const judgment = judgmentMap.get(result.testCase.id)
return {
testId: result.testCase.id,
name: result.testCase.name,
suite: result.testCase.suite,
pass: judgment?.pass ?? false,
reason: judgment?.reason ?? 'No judgment',
duration: result.totalDuration,
logs: result.logs
}
})
}
}
// TestLink reporter
export class TestLinkReporter {
private url: string
private apiKey: string
constructor(url: string, apiKey: string) {
this.url = url
this.apiKey = apiKey
}
async reportResults(
reports: TestReport[],
planId: string,
buildId: string
): Promise<void> {
console.log('\nReporting to TestLink...')
for (const report of reports) {
try {
await this.reportTestExecution(report, planId, buildId)
console.log(` Reported: ${report.testId}`)
} catch (error) {
console.error(` Failed to report ${report.testId}:`, error)
}
}
}
private async reportTestExecution(
report: TestReport,
planId: string,
buildId: string
): Promise<void> {
// Extract numeric test case ID from external ID (e.g., "ollama37-8" -> need internal ID)
// This would need to be mapped from TestLink
const status = report.pass ? 'p' : 'f' // p=passed, f=failed, b=blocked
// Note: This uses the TestLink XML-RPC API
// In practice, you'd use the testlink-mcp or direct API calls
const payload = {
devKey: this.apiKey,
testcaseexternalid: report.testId,
testplanid: planId,
buildid: buildId,
status,
notes: `${report.reason}\n\nDuration: ${report.duration}ms\n\nLogs:\n${report.logs.substring(0, 4000)}`
}
// For now, just log - actual implementation would call TestLink API
console.log(` Would report: ${report.testId} = ${status}`)
}
}

66
tests/src/types.ts Normal file

@@ -0,0 +1,66 @@
// Test case definition
export interface TestStep {
name: string
command: string
timeout?: number
}
export interface TestCase {
id: string
name: string
suite: string
priority: number
timeout: number
dependencies: string[]
steps: TestStep[]
criteria: string
}
// Execution results
export interface StepResult {
name: string
command: string
stdout: string
stderr: string
exitCode: number
duration: number
}
export interface TestResult {
testCase: TestCase
steps: StepResult[]
totalDuration: number
logs: string
}
// LLM judgment
export interface Judgment {
testId: string
pass: boolean
reason: string
}
// Final report
export interface TestReport {
testId: string
name: string
suite: string
pass: boolean
reason: string
duration: number
logs: string
}
// Runner options
export interface RunnerOptions {
suite?: string
id?: string
workers: number
dryRun: boolean
output: 'console' | 'json' | 'junit'
reportTestlink: boolean
ollamaUrl: string
ollamaModel: string
testlinkUrl: string
testlinkApiKey: string
}


@@ -0,0 +1,31 @@
id: TC-BUILD-001
name: Builder Image Verification
suite: build
priority: 1
timeout: 120000
dependencies: []
steps:
- name: Check image exists
command: docker images ollama37-builder:latest --format '{{.Repository}}:{{.Tag}}'
- name: Verify CUDA toolkit
command: docker run --rm ollama37-builder:latest nvcc --version
- name: Verify GCC version
command: docker run --rm ollama37-builder:latest gcc --version | head -1
- name: Verify Go version
command: docker run --rm ollama37-builder:latest go version
criteria: |
All commands should succeed (exit code 0).
Expected outputs:
- Image exists: should show "ollama37-builder:latest"
- CUDA: should show version 11.4 (accept 11.4.x)
- GCC: should show version 10 (accept GCC 10.x)
- Go: should show version 1.25 or higher
Accept minor version variations. Focus on major versions being correct.


@@ -0,0 +1,27 @@
id: TC-BUILD-002
name: Runtime Image Build
suite: build
priority: 2
timeout: 900000
dependencies:
- TC-BUILD-001
steps:
- name: Build runtime image
command: cd docker && make build-runtime-no-cache 2>&1 | tail -50
timeout: 900000
- name: Verify runtime image exists
command: docker images ollama37:latest --format '{{.Repository}}:{{.Tag}} {{.Size}}'
criteria: |
The runtime Docker image should build successfully from GitHub source.
Expected:
- Build completes without fatal errors
- Final output should mention "successfully" or similar completion message
- Runtime image "ollama37:latest" should exist after build
- Image size should be substantial (>10GB is expected due to CUDA)
Accept build warnings. Only fail on actual build errors.


@@ -0,0 +1,25 @@
id: TC-BUILD-003
name: Image Size Validation
suite: build
priority: 3
timeout: 30000
dependencies:
- TC-BUILD-002
steps:
- name: Check builder image size
command: docker images ollama37-builder:latest --format '{{.Size}}'
- name: Check runtime image size
command: docker images ollama37:latest --format '{{.Size}}'
criteria: |
Docker images should be within expected size ranges.
Expected:
- Builder image: 10GB to 20GB (contains CUDA, GCC, CMake, Go)
- Runtime image: 15GB to 25GB (contains builder + compiled ollama)
These are large images due to CUDA toolkit and build tools.
Accept sizes within reasonable range of expectations.


@@ -0,0 +1,30 @@
id: TC-INFERENCE-001
name: Model Pull
suite: inference
priority: 1
timeout: 600000
dependencies:
- TC-RUNTIME-003
steps:
- name: Check if model exists
command: docker exec ollama37 ollama list | grep -q "gemma3:4b" && echo "Model exists" || echo "Model not found"
- name: Pull model if needed
command: docker exec ollama37 ollama list | grep -q "gemma3:4b" || docker exec ollama37 ollama pull gemma3:4b
timeout: 600000
- name: Verify model available
command: docker exec ollama37 ollama list
criteria: |
The gemma3:4b model should be available for inference.
Expected:
- Model is either already present or successfully downloaded
- "ollama list" shows gemma3:4b in the output
- No download errors
Accept if model already exists (skip download).
Model size is ~3GB, download may take time.


@@ -0,0 +1,28 @@
id: TC-INFERENCE-002
name: Basic Inference
suite: inference
priority: 2
timeout: 180000
dependencies:
- TC-INFERENCE-001
steps:
- name: Run simple math question
command: docker exec ollama37 ollama run gemma3:4b "What is 2+2? Answer with just the number." 2>&1
timeout: 120000
- name: Check GPU memory usage
command: docker exec ollama37 nvidia-smi --query-compute-apps=pid,used_memory --format=csv 2>/dev/null || echo "No GPU processes"
criteria: |
Basic inference should work on Tesla K80.
Expected:
- Model responds to the math question
- Response should indicate "4" (accept variations: "4", "four", "The answer is 4", etc.)
- GPU memory should be allocated during inference
- No CUDA errors in output
This is AI-generated output - accept reasonable variations.
Focus on the model producing a coherent response.


@@ -0,0 +1,34 @@
id: TC-INFERENCE-003
name: API Endpoint Test
suite: inference
priority: 3
timeout: 120000
dependencies:
- TC-INFERENCE-001
steps:
- name: Test generate endpoint (non-streaming)
command: |
curl -s http://localhost:11434/api/generate \
-d '{"model":"gemma3:4b","prompt":"Say hello in one word","stream":false}' \
| head -c 500
- name: Test generate endpoint (streaming)
command: |
curl -s http://localhost:11434/api/generate \
-d '{"model":"gemma3:4b","prompt":"Count from 1 to 3","stream":true}' \
| head -5
criteria: |
Ollama REST API should handle inference requests.
Expected for non-streaming:
- Returns JSON with "response" field
- Response contains some greeting (hello, hi, etc.)
Expected for streaming:
- Returns multiple JSON lines
- Each line contains partial response
Accept any valid JSON response. Content may vary.


@@ -0,0 +1,32 @@
id: TC-INFERENCE-004
name: CUBLAS Fallback Verification
suite: inference
priority: 4
timeout: 120000
dependencies:
- TC-INFERENCE-002
steps:
- name: Check for CUBLAS errors in logs
command: cd docker && docker compose logs 2>&1 | grep -i "CUBLAS_STATUS" | grep -v "SUCCESS" | head -10 || echo "No CUBLAS errors"
- name: Check compute capability detection
command: cd docker && docker compose logs 2>&1 | grep -iE "compute|capability|cc.*3" | head -10 || echo "No compute capability logs"
- name: Verify no GPU errors
command: cd docker && docker compose logs 2>&1 | grep -iE "error|fail" | grep -i gpu | head -10 || echo "No GPU errors"
criteria: |
CUBLAS should work correctly on Tesla K80 using legacy fallback.
Expected:
- No CUBLAS_STATUS_ARCH_MISMATCH errors
- No CUBLAS_STATUS_NOT_SUPPORTED errors
- Compute capability 3.7 may be mentioned in debug logs
- No fatal GPU-related errors
The K80 uses legacy CUBLAS functions (cublasSgemmBatched)
instead of modern Ex variants. This should work transparently.
Accept warnings. Only fail on actual CUBLAS errors.


@@ -0,0 +1,31 @@
id: TC-RUNTIME-001
name: Container Startup
suite: runtime
priority: 1
timeout: 120000
dependencies:
- TC-BUILD-002
steps:
- name: Stop existing container
command: cd docker && docker compose down 2>/dev/null || true
- name: Start container with GPU
command: cd docker && docker compose up -d
- name: Wait for startup
command: sleep 15
- name: Check container status
command: cd docker && docker compose ps
criteria: |
The ollama37 container should start successfully with GPU access.
Expected:
- Container starts without errors
- docker compose ps shows container in "Up" state
- No "Exited" or "Restarting" status
Accept startup warnings. Container should be running.


@@ -0,0 +1,29 @@
id: TC-RUNTIME-002
name: GPU Detection
suite: runtime
priority: 2
timeout: 60000
dependencies:
- TC-RUNTIME-001
steps:
- name: Check nvidia-smi inside container
command: docker exec ollama37 nvidia-smi
- name: Check CUDA libraries
command: docker exec ollama37 ldconfig -p | grep -i cuda | head -5
- name: Check Ollama GPU detection
command: cd docker && docker compose logs 2>&1 | grep -i gpu | head -10
criteria: |
Tesla K80 GPU should be detected inside the container.
Expected:
- nvidia-smi shows Tesla K80 GPU(s)
- Driver version 470.x (or compatible)
- CUDA libraries are available (libcuda, libcublas, etc.)
- Ollama logs mention GPU detection
The K80 has 12GB VRAM per GPU. Accept variations in reported memory.


@@ -0,0 +1,39 @@
id: TC-RUNTIME-003
name: Health Check
suite: runtime
priority: 3
timeout: 180000
dependencies:
- TC-RUNTIME-001
steps:
- name: Wait for health check
command: |
for i in {1..30}; do
STATUS=$(docker inspect ollama37 --format='{{.State.Health.Status}}' 2>/dev/null || echo "starting")
echo "Health status: $STATUS (attempt $i/30)"
if [ "$STATUS" = "healthy" ]; then
echo "Container is healthy"
exit 0
fi
sleep 5
done
echo "Health check timeout"
exit 1
- name: Test API endpoint
command: curl -s http://localhost:11434/api/tags
- name: Check Ollama version
command: docker exec ollama37 ollama --version
criteria: |
Ollama server should be healthy and API responsive.
Expected:
- Container health status becomes "healthy"
- /api/tags endpoint returns JSON response (even if empty models)
- ollama --version shows version information
Accept any valid JSON response from API. Version format may vary.

16
tests/tsconfig.json Normal file

@@ -0,0 +1,16 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "node",
"esModuleInterop": true,
"strict": true,
"outDir": "dist",
"rootDir": "src",
"declaration": true,
"skipLibCheck": true,
"resolveJsonModule": true
},
"include": ["src/**/*"],
"exclude": ["node_modules", "dist"]
}