Implement Go-based test runner framework for Tesla K80 testing

Add comprehensive test orchestration framework:

Test Runner (cmd/test-runner/):
- config.go: YAML configuration loading and validation
- server.go: Ollama server lifecycle management (start/stop/health checks)
- monitor.go: Real-time log monitoring with pattern matching
- test.go: Model testing via Ollama API (pull, chat, validation)
- validate.go: Test result validation (GPU usage, response quality, log analysis)
- report.go: Structured reporting (JSON and Markdown formats)
- main.go: CLI interface with run/validate/list commands

Test Configurations (test/config/):
- models.yaml: Full test suite with quick/full/stress profiles
- quick.yaml: Fast smoke test with gemma2:2b (schema sketch below)
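
A minimal profile consistent with the config.go schema shown below might look like the following sketch. The field names come from the struct yaml tags; the prompt text is illustrative rather than the actual contents of quick.yaml, and timeouts are written as integer nanoseconds since stock gopkg.in/yaml.v3 does not parse duration strings like "5m" into time.Duration fields:

    profiles:
      quick:
        timeout: 600000000000        # 10 minutes, as nanoseconds
        models:
          - name: gemma2:2b
            timeout: 300000000000    # 5 minutes, as nanoseconds
            min_response_tokens: 1
            prompts:
              - "Why is the sky blue? Answer briefly."
    validation:
      gpu_required: true
      check_patterns:
        failure:
          - "CUDA error"
    reporting:
      formats: ["json", "markdown"]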

Updated Workflow:
- tesla-k80-tests.yml: Use test-runner instead of shell scripts (step sketch below)
- Run quick tests first, then full tests if they pass
- Generate structured JSON reports for pass/fail checking
- Upload test results as artifacts
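
A sketch of what the corresponding workflow steps could look like, assuming the standard actions/upload-artifact action; step names and report paths are illustrative, and the flags match main.go below:

    - name: Quick smoke test
      run: ./test-runner run -config test/config/quick.yaml -profile quick -output quick-report
    - name: Full test suite
      if: success()
      run: ./test-runner run -config test/config/models.yaml -profile full -output full-report
    - name: Upload test reports
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-reports
        path: "*-report.*"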

Features:
- Multi-model testing with configurable profiles
- API-based testing (not CLI commands)
- Real-time log monitoring for GPU events and errors
- Automatic validation of GPU loading and response quality
- Structured JSON and Markdown reports
- Graceful server lifecycle management
- Interrupt handling (Ctrl+C cleanup)

Addresses limitations of shell-based testing by providing:
- Better error handling and reporting
- Programmatic test orchestration
- Reusable test framework
- Clear pass/fail criteria
- Detailed test metrics and timing
Author:  Shang Chieh Tseng
Date:    2025-10-30 11:04:48 +08:00
Parent:  aaaf334e7f
Commit:  d59284d30a

10 changed files with 1631 additions and 113 deletions

cmd/test-runner/config.go (new file, 154 lines)

@@ -0,0 +1,154 @@
package main

import (
	"fmt"
	"os"
	"time"

	"gopkg.in/yaml.v3"
)

// Config represents the complete test configuration
type Config struct {
	Profiles   map[string]Profile `yaml:"profiles"`
	Validation Validation         `yaml:"validation"`
	Server     ServerConfig       `yaml:"server"`
	Reporting  ReportingConfig    `yaml:"reporting"`
}

// Profile represents a test profile with multiple models
type Profile struct {
	Timeout time.Duration `yaml:"timeout"`
	Models  []ModelTest   `yaml:"models"`
}

// ModelTest represents a single model test configuration
type ModelTest struct {
	Name              string        `yaml:"name"`
	Prompts           []string      `yaml:"prompts"`
	MinResponseTokens int           `yaml:"min_response_tokens"`
	MaxResponseTokens int           `yaml:"max_response_tokens"`
	Timeout           time.Duration `yaml:"timeout"`
}

// Validation represents validation rules
type Validation struct {
	GPURequired        bool          `yaml:"gpu_required"`
	SingleGPUPreferred bool          `yaml:"single_gpu_preferred"`
	CheckPatterns      CheckPatterns `yaml:"check_patterns"`
}

// CheckPatterns defines log patterns to match
type CheckPatterns struct {
	Success []string `yaml:"success"`
	Failure []string `yaml:"failure"`
	Warning []string `yaml:"warning"`
}

// ServerConfig represents server configuration
type ServerConfig struct {
	Host                string        `yaml:"host"`
	Port                int           `yaml:"port"`
	StartupTimeout      time.Duration `yaml:"startup_timeout"`
	HealthCheckInterval time.Duration `yaml:"health_check_interval"`
	HealthCheckEndpoint string        `yaml:"health_check_endpoint"`
}

// ReportingConfig represents reporting configuration
type ReportingConfig struct {
	Formats         []string `yaml:"formats"`
	IncludeLogs     bool     `yaml:"include_logs"`
	LogExcerptLines int      `yaml:"log_excerpt_lines"`
}

// LoadConfig loads and validates a test configuration from a YAML file
func LoadConfig(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %w", err)
	}

	var config Config
	if err := yaml.Unmarshal(data, &config); err != nil {
		return nil, fmt.Errorf("failed to parse config YAML: %w", err)
	}

	// Set defaults
	if config.Server.Host == "" {
		config.Server.Host = "localhost"
	}
	if config.Server.Port == 0 {
		config.Server.Port = 11434
	}
	if config.Server.StartupTimeout == 0 {
		config.Server.StartupTimeout = 30 * time.Second
	}
	if config.Server.HealthCheckInterval == 0 {
		config.Server.HealthCheckInterval = 1 * time.Second
	}
	if config.Server.HealthCheckEndpoint == "" {
		config.Server.HealthCheckEndpoint = "/api/tags"
	}
	if config.Reporting.LogExcerptLines == 0 {
		config.Reporting.LogExcerptLines = 50
	}
	if len(config.Reporting.Formats) == 0 {
		config.Reporting.Formats = []string{"json"}
	}

	// Validate config
	if err := validateConfig(&config); err != nil {
		return nil, fmt.Errorf("invalid config: %w", err)
	}
	return &config, nil
}

// validateConfig validates the loaded configuration
func validateConfig(config *Config) error {
	if len(config.Profiles) == 0 {
		return fmt.Errorf("no profiles defined in config")
	}
	for profileName, profile := range config.Profiles {
		if len(profile.Models) == 0 {
			return fmt.Errorf("profile %q has no models defined", profileName)
		}
		for i, model := range profile.Models {
			if model.Name == "" {
				return fmt.Errorf("profile %q model %d has no name", profileName, i)
			}
			if len(model.Prompts) == 0 {
				return fmt.Errorf("profile %q model %q has no prompts", profileName, model.Name)
			}
			if model.Timeout == 0 {
				return fmt.Errorf("profile %q model %q has no timeout", profileName, model.Name)
			}
		}
		if profile.Timeout == 0 {
			return fmt.Errorf("profile %q has no timeout", profileName)
		}
	}
	return nil
}

// GetProfile returns a specific profile by name
func (c *Config) GetProfile(name string) (*Profile, error) {
	profile, ok := c.Profiles[name]
	if !ok {
		return nil, fmt.Errorf("profile %q not found", name)
	}
	return &profile, nil
}

// ListProfiles returns a list of all profile names
func (c *Config) ListProfiles() []string {
	profiles := make([]string, 0, len(c.Profiles))
	for name := range c.Profiles {
		profiles = append(profiles, name)
	}
	return profiles
}

cmd/test-runner/main.go (new file, 243 lines)

@@ -0,0 +1,243 @@
package main

import (
	"context"
	"flag"
	"fmt"
	"os"
	"os/signal"
	"strings"
	"syscall"
	"time"
)

const (
	defaultConfigPath = "test/config/models.yaml"
	defaultOllamaBin  = "./ollama"
	defaultLogPath    = "ollama.log"
	defaultOutputPath = "test-report"
)

func main() {
	// Define subcommands
	runCmd := flag.NewFlagSet("run", flag.ExitOnError)
	validateCmd := flag.NewFlagSet("validate", flag.ExitOnError)
	listCmd := flag.NewFlagSet("list", flag.ExitOnError)

	// Run command flags
	runConfig := runCmd.String("config", defaultConfigPath, "Path to test configuration file")
	runProfile := runCmd.String("profile", "quick", "Test profile to run")
	runOllamaBin := runCmd.String("ollama-bin", defaultOllamaBin, "Path to ollama binary")
	runOutput := runCmd.String("output", defaultOutputPath, "Output path for test report")
	runVerbose := runCmd.Bool("verbose", false, "Enable verbose logging")
	runKeepModels := runCmd.Bool("keep-models", false, "Don't delete models after test")

	// Validate command flags
	validateConfig := validateCmd.String("config", defaultConfigPath, "Path to test configuration file")

	// List command flags
	listConfig := listCmd.String("config", defaultConfigPath, "Path to test configuration file")

	// Parse command
	if len(os.Args) < 2 {
		printUsage()
		os.Exit(1)
	}

	switch os.Args[1] {
	case "run":
		runCmd.Parse(os.Args[2:])
		os.Exit(runTests(*runConfig, *runProfile, *runOllamaBin, *runOutput, *runVerbose, *runKeepModels))
	case "validate":
		validateCmd.Parse(os.Args[2:])
		os.Exit(validateConfigFile(*validateConfig))
	case "list":
		listCmd.Parse(os.Args[2:])
		os.Exit(listProfiles(*listConfig))
	case "-h", "--help", "help":
		printUsage()
		os.Exit(0)
	default:
		fmt.Printf("Unknown command: %s\n\n", os.Args[1])
		printUsage()
		os.Exit(1)
	}
}

func printUsage() {
	fmt.Println("Tesla K80 Test Runner")
	fmt.Println("\nUsage:")
	fmt.Println(" test-runner <command> [flags]")
	fmt.Println("\nCommands:")
	fmt.Println(" run Run tests")
	fmt.Println(" validate Validate configuration file")
	fmt.Println(" list List available test profiles")
	fmt.Println(" help Show this help message")
	fmt.Println("\nRun 'test-runner <command> -h' for command-specific help")
}

func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, keepModels bool) int {
	// Load config
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("Error loading config: %v\n", err)
		return 1
	}

	// Get profile
	profile, err := config.GetProfile(profileName)
	if err != nil {
		fmt.Printf("Error: %v\n", err)
		fmt.Printf("Available profiles: %v\n", config.ListProfiles())
		return 1
	}

	fmt.Printf("Running test profile: %s\n", profileName)
	fmt.Printf("Models to test: %d\n", len(profile.Models))
	fmt.Printf("Ollama binary: %s\n", ollamaBin)
	fmt.Println()

	// Setup context with cancellation
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Handle Ctrl+C
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		<-sigChan
		fmt.Println("\n\nInterrupt received, shutting down...")
		cancel()
	}()

	// Start server
	logPath := defaultLogPath
	server := NewServer(config.Server, ollamaBin)
	fmt.Println("Starting ollama server...")
	if err := server.Start(ctx, logPath); err != nil {
		fmt.Printf("Error starting server: %v\n", err)
		return 1
	}
	defer func() {
		fmt.Println("\nStopping server...")
		server.Stop()
	}()

	// Start log monitor
	monitor, err := NewLogMonitor(logPath, config.Validation.CheckPatterns)
	if err != nil {
		fmt.Printf("Error creating log monitor: %v\n", err)
		return 1
	}
	monitorCtx, monitorCancel := context.WithCancel(ctx)
	defer monitorCancel()
	go func() {
		if err := monitor.Start(monitorCtx); err != nil && err != context.Canceled {
			if verbose {
				fmt.Printf("Log monitor error: %v\n", err)
			}
		}
	}()

	// Wait a moment for the log monitor to initialize
	time.Sleep(500 * time.Millisecond)

	// Run tests
	startTime := time.Now()
	tester := NewModelTester(server.BaseURL())
	validator := NewValidator(config.Validation, monitor)
	results := make([]TestResult, 0, len(profile.Models))

	for i, modelTest := range profile.Models {
		fmt.Printf("\n[%d/%d] Testing model: %s\n", i+1, len(profile.Models), modelTest.Name)
		fmt.Println(strings.Repeat("-", 60))

		// Reset monitor events for this model
		monitor.Reset()

		// Run test
		result := tester.TestModel(ctx, modelTest)

		// Validate result
		validator.ValidateResult(&result)
		results = append(results, result)

		fmt.Printf("Result: %s\n", result.Status)
		if result.ErrorMessage != "" {
			fmt.Printf("Error: %s\n", result.ErrorMessage)
		}
	}
	endTime := time.Now()

	// Generate report
	reporter := NewReporter(config.Reporting, monitor)
	report, err := reporter.GenerateReport(results, startTime, endTime)
	if err != nil {
		fmt.Printf("Error generating report: %v\n", err)
		return 1
	}

	// Save report
	if err := reporter.SaveReport(report, outputPath); err != nil {
		fmt.Printf("Error saving report: %v\n", err)
		return 1
	}

	// Print summary
	reporter.PrintSummary(report)

	// Return exit code based on test results
	if report.Summary.Failed > 0 {
		return 1
	}
	return 0
}

// validateConfigFile loads a configuration file and reports whether it is
// valid. The distinct name avoids redeclaring validateConfig, which
// config.go already defines for validating a loaded *Config.
func validateConfigFile(configPath string) int {
	fmt.Printf("Validating configuration: %s\n", configPath)
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("❌ Configuration is invalid: %v\n", err)
		return 1
	}
	fmt.Printf("✅ Configuration is valid\n")
	fmt.Printf("Profiles found: %d\n", len(config.Profiles))
	for profileName, profile := range config.Profiles {
		fmt.Printf(" - %s: %d models, timeout %s\n", profileName, len(profile.Models), profile.Timeout)
	}
	return 0
}

func listProfiles(configPath string) int {
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("Error loading config: %v\n", err)
		return 1
	}
	fmt.Println("Available test profiles:")
	fmt.Println()
	for _, profileName := range config.ListProfiles() {
		profile, _ := config.GetProfile(profileName)
		fmt.Printf("Profile: %s\n", profileName)
		fmt.Printf(" Timeout: %s\n", profile.Timeout)
		fmt.Printf(" Models: %d\n", len(profile.Models))
		for _, model := range profile.Models {
			fmt.Printf(" - %s (%d prompts)\n", model.Name, len(model.Prompts))
		}
		fmt.Println()
	}
	return 0
}

cmd/test-runner/monitor.go (new file, 240 lines)

@@ -0,0 +1,240 @@
package main

import (
	"bufio"
	"context"
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)

// LogEvent represents a significant event found in logs
type LogEvent struct {
	Timestamp time.Time
	Line      string
	Type      EventType
	Message   string
}

// EventType represents the type of log event
type EventType int

const (
	EventInfo EventType = iota
	EventSuccess
	EventWarning
	EventError
)

func (e EventType) String() string {
	switch e {
	case EventInfo:
		return "INFO"
	case EventSuccess:
		return "SUCCESS"
	case EventWarning:
		return "WARNING"
	case EventError:
		return "ERROR"
	default:
		return "UNKNOWN"
	}
}

// LogMonitor monitors log files for important events
type LogMonitor struct {
	logPath  string
	patterns CheckPatterns
	events   []LogEvent
	mu       sync.RWMutex

	successRegexps []*regexp.Regexp
	failureRegexps []*regexp.Regexp
	warningRegexps []*regexp.Regexp
}

// NewLogMonitor creates a new log monitor
func NewLogMonitor(logPath string, patterns CheckPatterns) (*LogMonitor, error) {
	monitor := &LogMonitor{
		logPath:  logPath,
		patterns: patterns,
		events:   make([]LogEvent, 0),
	}

	// Compile regex patterns
	var err error
	monitor.successRegexps, err = compilePatterns(patterns.Success)
	if err != nil {
		return nil, fmt.Errorf("failed to compile success patterns: %w", err)
	}
	monitor.failureRegexps, err = compilePatterns(patterns.Failure)
	if err != nil {
		return nil, fmt.Errorf("failed to compile failure patterns: %w", err)
	}
	monitor.warningRegexps, err = compilePatterns(patterns.Warning)
	if err != nil {
		return nil, fmt.Errorf("failed to compile warning patterns: %w", err)
	}
	return monitor, nil
}

// compilePatterns compiles a list of pattern strings into regexps
func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
	regexps := make([]*regexp.Regexp, len(patterns))
	for i, pattern := range patterns {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return nil, fmt.Errorf("invalid pattern %q: %w", pattern, err)
		}
		regexps[i] = re
	}
	return regexps, nil
}

// Start tails the log file until the context is cancelled. A bufio.Reader
// is used rather than bufio.Scanner: once a Scanner hits EOF it stops
// permanently, so it cannot follow a file that is still being appended to.
func (m *LogMonitor) Start(ctx context.Context) error {
	file, err := os.Open(m.logPath)
	if err != nil {
		return fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	// Use a large buffer to accommodate long log lines
	reader := bufio.NewReaderSize(file, 1024*1024)
	var pending string
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			chunk, err := reader.ReadString('\n')
			pending += chunk
			if err != nil {
				// EOF: no complete line yet; wait a bit and retry so
				// new output from the server is picked up.
				time.Sleep(100 * time.Millisecond)
				continue
			}
			m.processLine(strings.TrimSuffix(pending, "\n"))
			pending = ""
		}
	}
}

// processLine processes a single log line
func (m *LogMonitor) processLine(line string) {
	event := LogEvent{
		Timestamp: time.Now(),
		Line:      line,
		Type:      EventInfo,
	}

	// Check for failure patterns (highest priority)
	for _, re := range m.failureRegexps {
		if re.MatchString(line) {
			event.Type = EventError
			event.Message = fmt.Sprintf("Failure pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}

	// Check for warning patterns
	for _, re := range m.warningRegexps {
		if re.MatchString(line) {
			event.Type = EventWarning
			event.Message = fmt.Sprintf("Warning pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}

	// Check for success patterns
	for _, re := range m.successRegexps {
		if re.MatchString(line) {
			event.Type = EventSuccess
			event.Message = fmt.Sprintf("Success pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}
}

// addEvent adds an event to the event list
func (m *LogMonitor) addEvent(event LogEvent) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.events = append(m.events, event)
}

// GetEvents returns all events of a specific type
func (m *LogMonitor) GetEvents(eventType EventType) []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()
	filtered := make([]LogEvent, 0)
	for _, event := range m.events {
		if event.Type == eventType {
			filtered = append(filtered, event)
		}
	}
	return filtered
}

// GetAllEvents returns all events
func (m *LogMonitor) GetAllEvents() []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return append([]LogEvent{}, m.events...)
}

// HasErrors returns true if any error events were detected
func (m *LogMonitor) HasErrors() bool {
	return len(m.GetEvents(EventError)) > 0
}

// HasWarnings returns true if any warning events were detected
func (m *LogMonitor) HasWarnings() bool {
	return len(m.GetEvents(EventWarning)) > 0
}

// GetLogExcerpt returns the last N lines from the log file
func (m *LogMonitor) GetLogExcerpt(lines int) ([]string, error) {
	file, err := os.Open(m.logPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	// Read all lines
	allLines := make([]string, 0)
	scanner := bufio.NewScanner(file)
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)
	for scanner.Scan() {
		allLines = append(allLines, scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading log file: %w", err)
	}

	// Return last N lines
	if len(allLines) <= lines {
		return allLines, nil
	}
	return allLines[len(allLines)-lines:], nil
}

// Reset clears all collected events
func (m *LogMonitor) Reset() {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.events = make([]LogEvent, 0)
}
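
A minimal sketch of how the classification order plays out (failure patterns take precedence over warnings, warnings over successes), calling processLine directly from within the package. The patterns and log lines here are illustrative, not verbatim Ollama output:

func exampleClassification() {
	patterns := CheckPatterns{
		Success: []string{`offloaded \d+/\d+ layers to GPU`},
		Failure: []string{`CUDA error`},
		Warning: []string{`falling back to CPU`},
	}
	// NewLogMonitor only compiles the patterns; the log file itself is
	// not opened until Start, so the path is unused here.
	m, err := NewLogMonitor("unused.log", patterns)
	if err != nil {
		panic(err)
	}
	m.processLine("offloaded 27/27 layers to GPU") // recorded as SUCCESS
	m.processLine("CUDA error 2: out of memory")   // recorded as ERROR
	for _, ev := range m.GetAllEvents() {
		fmt.Printf("%s: %s\n", ev.Type, ev.Line)
	}
}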

cmd/test-runner/report.go (new file, 254 lines)

@@ -0,0 +1,254 @@
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
	"time"
)

// TestReport represents the complete test report
type TestReport struct {
	Summary       Summary             `json:"summary"`
	Results       []TestResult        `json:"results"`
	LogExcerpts   map[string][]string `json:"log_excerpts,omitempty"`
	StartTime     time.Time           `json:"start_time"`
	EndTime       time.Time           `json:"end_time"`
	TotalDuration time.Duration       `json:"total_duration"`
}

// Summary represents test summary statistics
type Summary struct {
	TotalTests   int `json:"total_tests"`
	Passed       int `json:"passed"`
	Failed       int `json:"failed"`
	Skipped      int `json:"skipped"`
	TotalPrompts int `json:"total_prompts"`
}

// Reporter generates test reports
type Reporter struct {
	config     ReportingConfig
	logMonitor *LogMonitor
}

// NewReporter creates a new reporter
func NewReporter(config ReportingConfig, logMonitor *LogMonitor) *Reporter {
	return &Reporter{
		config:     config,
		logMonitor: logMonitor,
	}
}

// GenerateReport generates a complete test report
func (r *Reporter) GenerateReport(results []TestResult, startTime, endTime time.Time) (*TestReport, error) {
	report := &TestReport{
		Results:       results,
		StartTime:     startTime,
		EndTime:       endTime,
		TotalDuration: endTime.Sub(startTime),
	}

	// Calculate summary
	report.Summary = r.calculateSummary(results)

	// Add log excerpts for failed tests if configured
	if r.config.IncludeLogs && r.logMonitor != nil {
		report.LogExcerpts = make(map[string][]string)
		for _, result := range results {
			if result.Status == StatusFailed {
				excerpt, err := r.logMonitor.GetLogExcerpt(r.config.LogExcerptLines)
				if err == nil {
					report.LogExcerpts[result.ModelName] = excerpt
				}
			}
		}
	}
	return report, nil
}

// calculateSummary calculates summary statistics
func (r *Reporter) calculateSummary(results []TestResult) Summary {
	summary := Summary{
		TotalTests: len(results),
	}
	for _, result := range results {
		switch result.Status {
		case StatusPassed:
			summary.Passed++
		case StatusFailed:
			summary.Failed++
		case StatusSkipped:
			summary.Skipped++
		}
		summary.TotalPrompts += len(result.PromptTests)
	}
	return summary
}

// SaveReport saves the report in configured formats
func (r *Reporter) SaveReport(report *TestReport, outputPath string) error {
	for _, format := range r.config.Formats {
		switch format {
		case "json":
			if err := r.saveJSON(report, outputPath+".json"); err != nil {
				return fmt.Errorf("failed to save JSON report: %w", err)
			}
		case "markdown":
			if err := r.saveMarkdown(report, outputPath+".md"); err != nil {
				return fmt.Errorf("failed to save Markdown report: %w", err)
			}
		default:
			fmt.Printf("Warning: unknown report format %q\n", format)
		}
	}
	return nil
}

// saveJSON saves the report as JSON
func (r *Reporter) saveJSON(report *TestReport, path string) error {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	defer file.Close()

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(report); err != nil {
		return err
	}
	fmt.Printf("JSON report saved to: %s\n", path)
	return nil
}

// saveMarkdown saves the report as Markdown
func (r *Reporter) saveMarkdown(report *TestReport, path string) error {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	defer file.Close()

	var sb strings.Builder

	// Title and summary
	sb.WriteString("# Tesla K80 Test Report\n\n")
	sb.WriteString(fmt.Sprintf("**Generated:** %s\n\n", time.Now().Format(time.RFC3339)))
	sb.WriteString(fmt.Sprintf("**Duration:** %s\n\n", report.TotalDuration.Round(time.Second)))

	// Summary table
	sb.WriteString("## Summary\n\n")
	sb.WriteString("| Metric | Count |\n")
	sb.WriteString("|--------|-------|\n")
	sb.WriteString(fmt.Sprintf("| Total Tests | %d |\n", report.Summary.TotalTests))
	sb.WriteString(fmt.Sprintf("| Passed | %d |\n", report.Summary.Passed))
	sb.WriteString(fmt.Sprintf("| Failed | %d |\n", report.Summary.Failed))
	sb.WriteString(fmt.Sprintf("| Skipped | %d |\n", report.Summary.Skipped))
	sb.WriteString(fmt.Sprintf("| Total Prompts | %d |\n\n", report.Summary.TotalPrompts))

	// Results
	sb.WriteString("## Test Results\n\n")
	for _, result := range report.Results {
		r.writeModelResult(&sb, result)
	}

	// Log excerpts
	if len(report.LogExcerpts) > 0 {
		sb.WriteString("## Log Excerpts\n\n")
		for modelName, excerpt := range report.LogExcerpts {
			sb.WriteString(fmt.Sprintf("### %s\n\n", modelName))
			sb.WriteString("```\n")
			for _, line := range excerpt {
				sb.WriteString(line + "\n")
			}
			sb.WriteString("```\n\n")
		}
	}

	if _, err := file.WriteString(sb.String()); err != nil {
		return err
	}
	fmt.Printf("Markdown report saved to: %s\n", path)
	return nil
}

// writeModelResult writes a model result to the markdown builder
func (r *Reporter) writeModelResult(sb *strings.Builder, result TestResult) {
	statusEmoji := "✅"
	if result.Status == StatusFailed {
		statusEmoji = "❌"
	} else if result.Status == StatusSkipped {
		statusEmoji = "⏭️"
	}

	sb.WriteString(fmt.Sprintf("### %s %s\n\n", statusEmoji, result.ModelName))
	sb.WriteString(fmt.Sprintf("**Status:** %s\n\n", result.Status))
	sb.WriteString(fmt.Sprintf("**Duration:** %s\n\n", result.Duration.Round(time.Millisecond)))
	if result.ErrorMessage != "" {
		sb.WriteString(fmt.Sprintf("**Error:** %s\n\n", result.ErrorMessage))
	}
	if len(result.Warnings) > 0 {
		sb.WriteString("**Warnings:**\n")
		for _, warning := range result.Warnings {
			sb.WriteString(fmt.Sprintf("- %s\n", warning))
		}
		sb.WriteString("\n")
	}

	// Prompt tests
	if len(result.PromptTests) > 0 {
		sb.WriteString("**Prompt Tests:**\n\n")
		for i, prompt := range result.PromptTests {
			promptStatus := "✅"
			if prompt.Status == StatusFailed {
				promptStatus = "❌"
			}
			sb.WriteString(fmt.Sprintf("%d. %s **Prompt:** %s\n", i+1, promptStatus, prompt.Prompt))
			sb.WriteString(fmt.Sprintf(" - **Duration:** %s\n", prompt.Duration.Round(time.Millisecond)))
			sb.WriteString(fmt.Sprintf(" - **Response Tokens:** %d\n", prompt.ResponseTokens))
			if prompt.ErrorMessage != "" {
				sb.WriteString(fmt.Sprintf(" - **Error:** %s\n", prompt.ErrorMessage))
			}
			if prompt.Response != "" && len(prompt.Response) < 200 {
				sb.WriteString(fmt.Sprintf(" - **Response:** %s\n", prompt.Response))
			}
			sb.WriteString("\n")
		}
	}
	sb.WriteString("---\n\n")
}

// PrintSummary prints a summary to stdout
func (r *Reporter) PrintSummary(report *TestReport) {
	fmt.Println("\n" + strings.Repeat("=", 60))
	fmt.Println("TEST SUMMARY")
	fmt.Println(strings.Repeat("=", 60))
	fmt.Printf("Total Tests: %d\n", report.Summary.TotalTests)
	fmt.Printf("Passed: %d\n", report.Summary.Passed)
	fmt.Printf("Failed: %d\n", report.Summary.Failed)
	fmt.Printf("Skipped: %d\n", report.Summary.Skipped)
	fmt.Printf("Total Prompts: %d\n", report.Summary.TotalPrompts)
	fmt.Printf("Duration: %s\n", report.TotalDuration.Round(time.Second))
	fmt.Println(strings.Repeat("=", 60))

	if report.Summary.Failed > 0 {
		fmt.Println("\nFAILED TESTS:")
		for _, result := range report.Results {
			if result.Status == StatusFailed {
				fmt.Printf(" ❌ %s: %s\n", result.ModelName, result.ErrorMessage)
			}
		}
	}
	fmt.Println()
}
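
Given the json tags above, a passing single-model report is shaped roughly like this (values hypothetical; duration fields serialize as integer nanoseconds because time.Duration is an int64 under encoding/json). A CI step can gate on summary.failed being zero:

{
  "summary": {
    "total_tests": 1,
    "passed": 1,
    "failed": 0,
    "skipped": 0,
    "total_prompts": 1
  },
  "results": [
    {
      "model_name": "gemma2:2b",
      "status": "PASSED",
      "start_time": "2025-10-30T11:04:48+08:00",
      "end_time": "2025-10-30T11:06:12+08:00",
      "duration": 84000000000,
      "prompt_tests": [
        {
          "prompt": "Why is the sky blue? Answer briefly.",
          "response": "...",
          "response_tokens": 42,
          "duration": 9000000000,
          "status": "PASSED"
        }
      ]
    }
  ],
  "start_time": "2025-10-30T11:04:48+08:00",
  "end_time": "2025-10-30T11:06:12+08:00",
  "total_duration": 84000000000
}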

cmd/test-runner/server.go (new file, 168 lines)

@@ -0,0 +1,168 @@
package main

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"time"
)

// Server manages the ollama server lifecycle
type Server struct {
	config    ServerConfig
	ollamaBin string
	logFile   *os.File
	cmd       *exec.Cmd
	baseURL   string
}

// NewServer creates a new server manager
func NewServer(config ServerConfig, ollamaBin string) *Server {
	baseURL := fmt.Sprintf("http://%s:%d", config.Host, config.Port)
	return &Server{
		config:    config,
		ollamaBin: ollamaBin,
		baseURL:   baseURL,
	}
}

// Start starts the ollama server
func (s *Server) Start(ctx context.Context, logPath string) error {
	// Create log file
	logFile, err := os.Create(logPath)
	if err != nil {
		return fmt.Errorf("failed to create log file: %w", err)
	}
	s.logFile = logFile

	// Resolve ollama binary path
	binPath, err := filepath.Abs(s.ollamaBin)
	if err != nil {
		return fmt.Errorf("failed to resolve ollama binary path: %w", err)
	}

	// Check if binary exists
	if _, err := os.Stat(binPath); err != nil {
		return fmt.Errorf("ollama binary not found at %s: %w", binPath, err)
	}

	// Create command
	s.cmd = exec.CommandContext(ctx, binPath, "serve")
	s.cmd.Stdout = logFile
	s.cmd.Stderr = logFile

	// Set working directory to binary location
	s.cmd.Dir = filepath.Dir(binPath)

	// Start server
	if err := s.cmd.Start(); err != nil {
		logFile.Close()
		return fmt.Errorf("failed to start ollama server: %w", err)
	}
	fmt.Printf("Started ollama server (PID: %d)\n", s.cmd.Process.Pid)
	fmt.Printf("Server logs: %s\n", logPath)

	// Wait for server to be ready
	if err := s.WaitForReady(ctx); err != nil {
		s.Stop()
		return fmt.Errorf("server failed to become ready: %w", err)
	}
	fmt.Printf("Server is ready at %s\n", s.baseURL)
	return nil
}

// WaitForReady waits for the server to be ready
func (s *Server) WaitForReady(ctx context.Context) error {
	healthURL := s.baseURL + s.config.HealthCheckEndpoint
	timeout := time.After(s.config.StartupTimeout)
	ticker := time.NewTicker(s.config.HealthCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-timeout:
			return fmt.Errorf("timeout waiting for server to be ready")
		case <-ticker.C:
			req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
			if err != nil {
				continue
			}
			resp, err := http.DefaultClient.Do(req)
			if err != nil {
				continue
			}
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil
			}
		}
	}
}

// Stop stops the ollama server
func (s *Server) Stop() error {
	var errs []error

	// Stop the process
	if s.cmd != nil && s.cmd.Process != nil {
		fmt.Printf("Stopping ollama server (PID: %d)\n", s.cmd.Process.Pid)

		// Try graceful shutdown first
		if err := s.cmd.Process.Signal(os.Interrupt); err != nil {
			errs = append(errs, fmt.Errorf("failed to send interrupt signal: %w", err))
		}

		// Wait for process to exit (with timeout)
		done := make(chan error, 1)
		go func() {
			done <- s.cmd.Wait()
		}()
		select {
		case <-time.After(10 * time.Second):
			// Force kill if graceful shutdown times out
			if err := s.cmd.Process.Kill(); err != nil {
				errs = append(errs, fmt.Errorf("failed to kill process: %w", err))
			}
			<-done // Wait for process to actually die
		case err := <-done:
			if err != nil && err.Error() != "signal: interrupt" {
				errs = append(errs, fmt.Errorf("process exited with error: %w", err))
			}
		}
	}

	// Close log file
	if s.logFile != nil {
		if err := s.logFile.Close(); err != nil {
			errs = append(errs, fmt.Errorf("failed to close log file: %w", err))
		}
	}

	if len(errs) > 0 {
		return fmt.Errorf("errors during shutdown: %v", errs)
	}
	fmt.Println("Server stopped successfully")
	return nil
}

// BaseURL returns the server base URL
func (s *Server) BaseURL() string {
	return s.baseURL
}

// IsRunning returns true if the server is running
func (s *Server) IsRunning() bool {
	return s.cmd != nil && s.cmd.Process != nil
}
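
A minimal lifecycle sketch mirroring what runTests in main.go does with this type (binary path, log path, and config values assumed):

func demoServerLifecycle() error {
	cfg := ServerConfig{
		Host:                "localhost",
		Port:                11434,
		StartupTimeout:      30 * time.Second,
		HealthCheckInterval: time.Second,
		HealthCheckEndpoint: "/api/tags",
	}
	srv := NewServer(cfg, "./ollama")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Start blocks until the health check passes or the startup timeout hits.
	if err := srv.Start(ctx, "ollama.log"); err != nil {
		return err
	}
	defer srv.Stop()

	fmt.Println("server ready at", srv.BaseURL())
	return nil
}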

cmd/test-runner/test.go (new file, 223 lines)

@@ -0,0 +1,223 @@
package main

import (
	"bufio"
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// TestResult represents the result of a model test
type TestResult struct {
	ModelName    string        `json:"model_name"`
	Status       TestStatus    `json:"status"`
	StartTime    time.Time     `json:"start_time"`
	EndTime      time.Time     `json:"end_time"`
	Duration     time.Duration `json:"duration"`
	PromptTests  []PromptTest  `json:"prompt_tests"`
	ErrorMessage string        `json:"error_message,omitempty"`
	Warnings     []string      `json:"warnings,omitempty"`
}

// TestStatus represents the status of a test
type TestStatus string

const (
	StatusPassed  TestStatus = "PASSED"
	StatusFailed  TestStatus = "FAILED"
	StatusSkipped TestStatus = "SKIPPED"
)

// PromptTest represents the result of a single prompt test
type PromptTest struct {
	Prompt         string        `json:"prompt"`
	Response       string        `json:"response"`
	ResponseTokens int           `json:"response_tokens"`
	Duration       time.Duration `json:"duration"`
	Status         TestStatus    `json:"status"`
	ErrorMessage   string        `json:"error_message,omitempty"`
}

// ModelTester runs tests for models
type ModelTester struct {
	serverURL  string
	httpClient *http.Client
}

// NewModelTester creates a new model tester
func NewModelTester(serverURL string) *ModelTester {
	return &ModelTester{
		serverURL: serverURL,
		httpClient: &http.Client{
			Timeout: 5 * time.Minute, // Long timeout for model operations
		},
	}
}

// TestModel runs all tests for a single model
func (t *ModelTester) TestModel(ctx context.Context, modelTest ModelTest) TestResult {
	result := TestResult{
		ModelName:   modelTest.Name,
		StartTime:   time.Now(),
		Status:      StatusPassed,
		PromptTests: make([]PromptTest, 0),
	}

	// Pull model first
	fmt.Printf("Pulling model %s...\n", modelTest.Name)
	if err := t.pullModel(ctx, modelTest.Name); err != nil {
		result.Status = StatusFailed
		result.ErrorMessage = fmt.Sprintf("Failed to pull model: %v", err)
		result.EndTime = time.Now()
		result.Duration = result.EndTime.Sub(result.StartTime)
		return result
	}
	fmt.Printf("Model %s pulled successfully\n", modelTest.Name)

	// Run each prompt test
	for i, prompt := range modelTest.Prompts {
		fmt.Printf("Testing prompt %d/%d for %s\n", i+1, len(modelTest.Prompts), modelTest.Name)
		promptTest := t.testPrompt(ctx, modelTest.Name, prompt, modelTest.Timeout)
		result.PromptTests = append(result.PromptTests, promptTest)

		// Update overall status based on prompt test result
		if promptTest.Status == StatusFailed {
			result.Status = StatusFailed
		}
	}

	result.EndTime = time.Now()
	result.Duration = result.EndTime.Sub(result.StartTime)
	fmt.Printf("Model %s test completed: %s\n", modelTest.Name, result.Status)
	return result
}

// pullModel pulls a model using the Ollama API
func (t *ModelTester) pullModel(ctx context.Context, modelName string) error {
	url := t.serverURL + "/api/pull"
	reqBody := map[string]interface{}{
		"name": modelName,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := t.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pull failed with status %d: %s", resp.StatusCode, string(body))
	}

	// Read response stream (pull progress)
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var progress map[string]interface{}
		if err := json.Unmarshal(scanner.Bytes(), &progress); err != nil {
			continue
		}
		// Could print progress here if verbose mode is enabled
	}
	return nil
}

// testPrompt tests a single prompt
func (t *ModelTester) testPrompt(ctx context.Context, modelName, prompt string, timeout time.Duration) PromptTest {
	result := PromptTest{
		Prompt: prompt,
		Status: StatusPassed,
	}
	startTime := time.Now()

	// Create context with timeout
	testCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Call chat API
	response, err := t.chat(testCtx, modelName, prompt)
	if err != nil {
		result.Status = StatusFailed
		result.ErrorMessage = err.Error()
		result.Duration = time.Since(startTime)
		return result
	}

	result.Response = response
	result.ResponseTokens = estimateTokens(response)
	result.Duration = time.Since(startTime)
	return result
}

// chat sends a single-turn prompt to the Ollama /api/generate endpoint
// (non-streaming) and returns the full response text
func (t *ModelTester) chat(ctx context.Context, modelName, prompt string) (string, error) {
	url := t.serverURL + "/api/generate"
	reqBody := map[string]interface{}{
		"model":  modelName,
		"prompt": prompt,
		"stream": false,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := t.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("chat failed with status %d: %s", resp.StatusCode, string(body))
	}

	var response struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return "", fmt.Errorf("failed to decode response: %w", err)
	}
	return response.Response, nil
}

// estimateTokens estimates the number of tokens in a text.
// This is a rough approximation: one token per whitespace-separated word,
// which undercounts for subword tokenizers but is stable enough for the
// response-length checks used here.
func estimateTokens(text string) int {
	return len(strings.Fields(text))
}
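
A hypothetical driver for this API surface, assuming an Ollama server is already listening on the default port:

func runSingleModel() {
	tester := NewModelTester("http://localhost:11434")
	result := tester.TestModel(context.Background(), ModelTest{
		Name:    "gemma2:2b",
		Prompts: []string{"Why is the sky blue? Answer briefly."},
		Timeout: 2 * time.Minute,
	})
	fmt.Printf("%s: %s (%d prompt tests, %s)\n",
		result.ModelName, result.Status, len(result.PromptTests),
		result.Duration.Round(time.Millisecond))
}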

cmd/test-runner/validate.go (new file, 164 lines)

@@ -0,0 +1,164 @@
package main

import (
	"fmt"
	"strings"
)

// Validator validates test results against configuration
type Validator struct {
	config     Validation
	logMonitor *LogMonitor
}

// NewValidator creates a new validator
func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
	return &Validator{
		config:     config,
		logMonitor: logMonitor,
	}
}

// ValidateResult validates a test result
func (v *Validator) ValidateResult(result *TestResult) {
	// Validate prompts
	for i := range result.PromptTests {
		v.validatePrompt(&result.PromptTests[i])
	}

	// Check logs for errors and warnings
	if v.logMonitor != nil {
		v.validateLogs(result)
	}
}

// validatePrompt validates a single prompt test
func (v *Validator) validatePrompt(prompt *PromptTest) {
	// Already failed, skip
	if prompt.Status == StatusFailed {
		return
	}

	// Check if response is empty
	if strings.TrimSpace(prompt.Response) == "" {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = "Response is empty"
		return
	}

	// Check token count
	if prompt.ResponseTokens < 1 {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = "Response has no tokens"
		return
	}
}

// validateLogs validates log events
func (v *Validator) validateLogs(result *TestResult) {
	// Check for error events
	errorEvents := v.logMonitor.GetEvents(EventError)
	if len(errorEvents) > 0 {
		result.Status = StatusFailed
		errorMessages := make([]string, len(errorEvents))
		for i, event := range errorEvents {
			errorMessages[i] = event.Line
		}
		if result.ErrorMessage == "" {
			result.ErrorMessage = fmt.Sprintf("Errors found in logs: %s", strings.Join(errorMessages, "; "))
		} else {
			result.ErrorMessage += fmt.Sprintf("; Log errors: %s", strings.Join(errorMessages, "; "))
		}
	}

	// Check for warning events
	warningEvents := v.logMonitor.GetEvents(EventWarning)
	if len(warningEvents) > 0 {
		warnings := make([]string, len(warningEvents))
		for i, event := range warningEvents {
			warnings[i] = event.Line
		}
		result.Warnings = append(result.Warnings, warnings...)
	}

	// Check if GPU was used (if required)
	if v.config.GPURequired {
		if !v.hasGPULoading() {
			result.Status = StatusFailed
			if result.ErrorMessage == "" {
				result.ErrorMessage = "GPU acceleration not detected in logs (GPU required)"
			} else {
				result.ErrorMessage += "; GPU acceleration not detected"
			}
		}
	}

	// Check for CPU fallback (if single GPU preferred)
	if v.config.SingleGPUPreferred {
		if v.hasCPUFallback() {
			warning := "CPU fallback or multi-GPU split detected (single GPU preferred)"
			result.Warnings = append(result.Warnings, warning)
		}
	}
}

// hasGPULoading checks if logs indicate GPU loading
func (v *Validator) hasGPULoading() bool {
	successEvents := v.logMonitor.GetEvents(EventSuccess)

	// Look for patterns indicating GPU usage
	gpuPatterns := []string{
		"offload",
		"GPU",
		"CUDA",
	}
	for _, event := range successEvents {
		line := strings.ToLower(event.Line)
		for _, pattern := range gpuPatterns {
			if strings.Contains(line, strings.ToLower(pattern)) {
				return true
			}
		}
	}
	return false
}

// hasCPUFallback checks if logs indicate CPU fallback
func (v *Validator) hasCPUFallback() bool {
	allEvents := v.logMonitor.GetAllEvents()

	// Look for patterns indicating CPU usage or multi-GPU split
	cpuPatterns := []string{
		"CPU backend",
		"using CPU",
		"fallback",
	}
	for _, event := range allEvents {
		line := strings.ToLower(event.Line)
		for _, pattern := range cpuPatterns {
			if strings.Contains(line, strings.ToLower(pattern)) {
				return true
			}
		}
	}
	return false
}

// ValidateResponse validates a response against expected criteria
func ValidateResponse(response string, minTokens, maxTokens int) error {
	tokens := estimateTokens(response)
	if minTokens > 0 && tokens < minTokens {
		return fmt.Errorf("response too short: %d tokens (min: %d)", tokens, minTokens)
	}
	if maxTokens > 0 && tokens > maxTokens {
		return fmt.Errorf("response too long: %d tokens (max: %d)", tokens, maxTokens)
	}
	return nil
}
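
Nothing in this diff calls ValidateResponse, and the min_response_tokens / max_response_tokens fields from config.go are likewise unused so far; wiring them together could look like this sketch (not part of the commit):

// Sketch: enforce the per-model token bounds during validation.
func (v *Validator) validatePromptBounds(prompt *PromptTest, model ModelTest) {
	if prompt.Status == StatusFailed {
		return
	}
	if err := ValidateResponse(prompt.Response, model.MinResponseTokens, model.MaxResponseTokens); err != nil {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = err.Error()
	}
}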
}