Implement Go-based test runner framework for Tesla K80 testing

Add comprehensive test orchestration framework:

Test Runner (cmd/test-runner/):
- config.go: YAML configuration loading and validation
- server.go: Ollama server lifecycle management (start/stop/health checks)
- monitor.go: Real-time log monitoring with pattern matching
- test.go: Model testing via Ollama API (pull, chat, validation)
- validate.go: Test result validation (GPU usage, response quality, log analysis)
- report.go: Structured reporting (JSON and Markdown formats)
- main.go: CLI interface with run/validate/list commands

Test Configurations (test/config/):
- models.yaml: Full test suite with quick/full/stress profiles
- quick.yaml: Fast smoke test with gemma2:2b (schema sketch below)
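
A minimal profile consistent with the config.go schema shown below might look like the following sketch. The field names come from the struct yaml tags; the prompt text is illustrative rather than the actual contents of quick.yaml, and timeouts are written as integer nanoseconds since stock gopkg.in/yaml.v3 does not parse duration strings like "5m" into time.Duration fields:

    profiles:
      quick:
        timeout: 600000000000        # 10 minutes, as nanoseconds
        models:
          - name: gemma2:2b
            timeout: 300000000000    # 5 minutes, as nanoseconds
            min_response_tokens: 1
            prompts:
              - "Why is the sky blue? Answer briefly."
    validation:
      gpu_required: true
      check_patterns:
        failure:
          - "CUDA error"
    reporting:
      formats: ["json", "markdown"]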

Updated Workflow:
- tesla-k80-tests.yml: Use test-runner instead of shell scripts (step sketch below)
- Run quick tests first, then full tests if they pass
- Generate structured JSON reports for pass/fail checking
- Upload test results as artifacts
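
A sketch of what the corresponding workflow steps could look like, assuming the standard actions/upload-artifact action; step names and report paths are illustrative, and the flags match main.go below:

    - name: Quick smoke test
      run: ./test-runner run -config test/config/quick.yaml -profile quick -output quick-report
    - name: Full test suite
      if: success()
      run: ./test-runner run -config test/config/models.yaml -profile full -output full-report
    - name: Upload test reports
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: test-reports
        path: "*-report.*"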

Features:
- Multi-model testing with configurable profiles
- API-based testing (not CLI commands)
- Real-time log monitoring for GPU events and errors
- Automatic validation of GPU loading and response quality
- Structured JSON and Markdown reports
- Graceful server lifecycle management
- Interrupt handling (Ctrl+C cleanup)

Addresses limitations of shell-based testing by providing:
- Better error handling and reporting
- Programmatic test orchestration
- Reusable test framework
- Clear pass/fail criteria
- Detailed test metrics and timing
Author:  Shang Chieh Tseng
Date:    2025-10-30 11:04:48 +08:00
Parent:  aaaf334e7f
Commit:  d59284d30a

10 changed files with 1631 additions and 113 deletions

cmd/test-runner/config.go (new file, 154 lines)

@@ -0,0 +1,154 @@
package main

import (
	"fmt"
	"os"
	"time"

	"gopkg.in/yaml.v3"
)

// Config represents the complete test configuration
type Config struct {
	Profiles   map[string]Profile `yaml:"profiles"`
	Validation Validation         `yaml:"validation"`
	Server     ServerConfig       `yaml:"server"`
	Reporting  ReportingConfig    `yaml:"reporting"`
}

// Profile represents a test profile with multiple models
type Profile struct {
	Timeout time.Duration `yaml:"timeout"`
	Models  []ModelTest   `yaml:"models"`
}

// ModelTest represents a single model test configuration
type ModelTest struct {
	Name              string        `yaml:"name"`
	Prompts           []string      `yaml:"prompts"`
	MinResponseTokens int           `yaml:"min_response_tokens"`
	MaxResponseTokens int           `yaml:"max_response_tokens"`
	Timeout           time.Duration `yaml:"timeout"`
}

// Validation represents validation rules
type Validation struct {
	GPURequired        bool          `yaml:"gpu_required"`
	SingleGPUPreferred bool          `yaml:"single_gpu_preferred"`
	CheckPatterns      CheckPatterns `yaml:"check_patterns"`
}

// CheckPatterns defines log patterns to match
type CheckPatterns struct {
	Success []string `yaml:"success"`
	Failure []string `yaml:"failure"`
	Warning []string `yaml:"warning"`
}

// ServerConfig represents server configuration
type ServerConfig struct {
	Host                string        `yaml:"host"`
	Port                int           `yaml:"port"`
	StartupTimeout      time.Duration `yaml:"startup_timeout"`
	HealthCheckInterval time.Duration `yaml:"health_check_interval"`
	HealthCheckEndpoint string        `yaml:"health_check_endpoint"`
}

// ReportingConfig represents reporting configuration
type ReportingConfig struct {
	Formats         []string `yaml:"formats"`
	IncludeLogs     bool     `yaml:"include_logs"`
	LogExcerptLines int      `yaml:"log_excerpt_lines"`
}

// LoadConfig loads and validates a test configuration from a YAML file
func LoadConfig(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %w", err)
	}

	var config Config
	if err := yaml.Unmarshal(data, &config); err != nil {
		return nil, fmt.Errorf("failed to parse config YAML: %w", err)
	}

	// Set defaults
	if config.Server.Host == "" {
		config.Server.Host = "localhost"
	}
	if config.Server.Port == 0 {
		config.Server.Port = 11434
	}
	if config.Server.StartupTimeout == 0 {
		config.Server.StartupTimeout = 30 * time.Second
	}
	if config.Server.HealthCheckInterval == 0 {
		config.Server.HealthCheckInterval = 1 * time.Second
	}
	if config.Server.HealthCheckEndpoint == "" {
		config.Server.HealthCheckEndpoint = "/api/tags"
	}
	if config.Reporting.LogExcerptLines == 0 {
		config.Reporting.LogExcerptLines = 50
	}
	if len(config.Reporting.Formats) == 0 {
		config.Reporting.Formats = []string{"json"}
	}

	// Validate config
	if err := validateConfig(&config); err != nil {
		return nil, fmt.Errorf("invalid config: %w", err)
	}
	return &config, nil
}

// validateConfig validates the loaded configuration
func validateConfig(config *Config) error {
	if len(config.Profiles) == 0 {
		return fmt.Errorf("no profiles defined in config")
	}
	for profileName, profile := range config.Profiles {
		if len(profile.Models) == 0 {
			return fmt.Errorf("profile %q has no models defined", profileName)
		}
		for i, model := range profile.Models {
			if model.Name == "" {
				return fmt.Errorf("profile %q model %d has no name", profileName, i)
			}
			if len(model.Prompts) == 0 {
				return fmt.Errorf("profile %q model %q has no prompts", profileName, model.Name)
			}
			if model.Timeout == 0 {
				return fmt.Errorf("profile %q model %q has no timeout", profileName, model.Name)
			}
		}
		if profile.Timeout == 0 {
			return fmt.Errorf("profile %q has no timeout", profileName)
		}
	}
	return nil
}

// GetProfile returns a specific profile by name
func (c *Config) GetProfile(name string) (*Profile, error) {
	profile, ok := c.Profiles[name]
	if !ok {
		return nil, fmt.Errorf("profile %q not found", name)
	}
	return &profile, nil
}

// ListProfiles returns a list of all profile names
func (c *Config) ListProfiles() []string {
	profiles := make([]string, 0, len(c.Profiles))
	for name := range c.Profiles {
		profiles = append(profiles, name)
	}
	return profiles
}

cmd/test-runner/main.go (new file, 243 lines)

@@ -0,0 +1,243 @@
package main

import (
	"context"
	"flag"
	"fmt"
	"os"
	"os/signal"
	"strings"
	"syscall"
	"time"
)

const (
	defaultConfigPath = "test/config/models.yaml"
	defaultOllamaBin  = "./ollama"
	defaultLogPath    = "ollama.log"
	defaultOutputPath = "test-report"
)

func main() {
	// Define subcommands
	runCmd := flag.NewFlagSet("run", flag.ExitOnError)
	validateCmd := flag.NewFlagSet("validate", flag.ExitOnError)
	listCmd := flag.NewFlagSet("list", flag.ExitOnError)

	// Run command flags
	runConfig := runCmd.String("config", defaultConfigPath, "Path to test configuration file")
	runProfile := runCmd.String("profile", "quick", "Test profile to run")
	runOllamaBin := runCmd.String("ollama-bin", defaultOllamaBin, "Path to ollama binary")
	runOutput := runCmd.String("output", defaultOutputPath, "Output path for test report")
	runVerbose := runCmd.Bool("verbose", false, "Enable verbose logging")
	runKeepModels := runCmd.Bool("keep-models", false, "Don't delete models after test")

	// Validate command flags
	validateConfig := validateCmd.String("config", defaultConfigPath, "Path to test configuration file")

	// List command flags
	listConfig := listCmd.String("config", defaultConfigPath, "Path to test configuration file")

	// Parse command
	if len(os.Args) < 2 {
		printUsage()
		os.Exit(1)
	}

	switch os.Args[1] {
	case "run":
		runCmd.Parse(os.Args[2:])
		os.Exit(runTests(*runConfig, *runProfile, *runOllamaBin, *runOutput, *runVerbose, *runKeepModels))
	case "validate":
		validateCmd.Parse(os.Args[2:])
		os.Exit(validateConfigFile(*validateConfig))
	case "list":
		listCmd.Parse(os.Args[2:])
		os.Exit(listProfiles(*listConfig))
	case "-h", "--help", "help":
		printUsage()
		os.Exit(0)
	default:
		fmt.Printf("Unknown command: %s\n\n", os.Args[1])
		printUsage()
		os.Exit(1)
	}
}

func printUsage() {
	fmt.Println("Tesla K80 Test Runner")
	fmt.Println("\nUsage:")
	fmt.Println(" test-runner <command> [flags]")
	fmt.Println("\nCommands:")
	fmt.Println(" run Run tests")
	fmt.Println(" validate Validate configuration file")
	fmt.Println(" list List available test profiles")
	fmt.Println(" help Show this help message")
	fmt.Println("\nRun 'test-runner <command> -h' for command-specific help")
}

func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, keepModels bool) int {
	// Load config
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("Error loading config: %v\n", err)
		return 1
	}

	// Get profile
	profile, err := config.GetProfile(profileName)
	if err != nil {
		fmt.Printf("Error: %v\n", err)
		fmt.Printf("Available profiles: %v\n", config.ListProfiles())
		return 1
	}

	fmt.Printf("Running test profile: %s\n", profileName)
	fmt.Printf("Models to test: %d\n", len(profile.Models))
	fmt.Printf("Ollama binary: %s\n", ollamaBin)
	fmt.Println()

	// Setup context with cancellation
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Handle Ctrl+C
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, os.Interrupt, syscall.SIGTERM)
	go func() {
		<-sigChan
		fmt.Println("\n\nInterrupt received, shutting down...")
		cancel()
	}()

	// Start server
	logPath := defaultLogPath
	server := NewServer(config.Server, ollamaBin)
	fmt.Println("Starting ollama server...")
	if err := server.Start(ctx, logPath); err != nil {
		fmt.Printf("Error starting server: %v\n", err)
		return 1
	}
	defer func() {
		fmt.Println("\nStopping server...")
		server.Stop()
	}()

	// Start log monitor
	monitor, err := NewLogMonitor(logPath, config.Validation.CheckPatterns)
	if err != nil {
		fmt.Printf("Error creating log monitor: %v\n", err)
		return 1
	}
	monitorCtx, monitorCancel := context.WithCancel(ctx)
	defer monitorCancel()
	go func() {
		if err := monitor.Start(monitorCtx); err != nil && err != context.Canceled {
			if verbose {
				fmt.Printf("Log monitor error: %v\n", err)
			}
		}
	}()

	// Wait a moment for the log monitor to initialize
	time.Sleep(500 * time.Millisecond)

	// Run tests
	startTime := time.Now()
	tester := NewModelTester(server.BaseURL())
	validator := NewValidator(config.Validation, monitor)
	results := make([]TestResult, 0, len(profile.Models))

	for i, modelTest := range profile.Models {
		fmt.Printf("\n[%d/%d] Testing model: %s\n", i+1, len(profile.Models), modelTest.Name)
		fmt.Println(strings.Repeat("-", 60))

		// Reset monitor events for this model
		monitor.Reset()

		// Run test
		result := tester.TestModel(ctx, modelTest)

		// Validate result
		validator.ValidateResult(&result)
		results = append(results, result)

		fmt.Printf("Result: %s\n", result.Status)
		if result.ErrorMessage != "" {
			fmt.Printf("Error: %s\n", result.ErrorMessage)
		}
	}
	endTime := time.Now()

	// Generate report
	reporter := NewReporter(config.Reporting, monitor)
	report, err := reporter.GenerateReport(results, startTime, endTime)
	if err != nil {
		fmt.Printf("Error generating report: %v\n", err)
		return 1
	}

	// Save report
	if err := reporter.SaveReport(report, outputPath); err != nil {
		fmt.Printf("Error saving report: %v\n", err)
		return 1
	}

	// Print summary
	reporter.PrintSummary(report)

	// Return exit code based on test results
	if report.Summary.Failed > 0 {
		return 1
	}
	return 0
}

// validateConfigFile loads a configuration file and reports whether it is
// valid. The distinct name avoids redeclaring validateConfig, which
// config.go already defines for validating a loaded *Config.
func validateConfigFile(configPath string) int {
	fmt.Printf("Validating configuration: %s\n", configPath)
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("❌ Configuration is invalid: %v\n", err)
		return 1
	}
	fmt.Printf("✅ Configuration is valid\n")
	fmt.Printf("Profiles found: %d\n", len(config.Profiles))
	for profileName, profile := range config.Profiles {
		fmt.Printf(" - %s: %d models, timeout %s\n", profileName, len(profile.Models), profile.Timeout)
	}
	return 0
}

func listProfiles(configPath string) int {
	config, err := LoadConfig(configPath)
	if err != nil {
		fmt.Printf("Error loading config: %v\n", err)
		return 1
	}
	fmt.Println("Available test profiles:")
	fmt.Println()
	for _, profileName := range config.ListProfiles() {
		profile, _ := config.GetProfile(profileName)
		fmt.Printf("Profile: %s\n", profileName)
		fmt.Printf(" Timeout: %s\n", profile.Timeout)
		fmt.Printf(" Models: %d\n", len(profile.Models))
		for _, model := range profile.Models {
			fmt.Printf(" - %s (%d prompts)\n", model.Name, len(model.Prompts))
		}
		fmt.Println()
	}
	return 0
}

cmd/test-runner/monitor.go (new file, 240 lines)

@@ -0,0 +1,240 @@
package main

import (
	"bufio"
	"context"
	"fmt"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)

// LogEvent represents a significant event found in logs
type LogEvent struct {
	Timestamp time.Time
	Line      string
	Type      EventType
	Message   string
}

// EventType represents the type of log event
type EventType int

const (
	EventInfo EventType = iota
	EventSuccess
	EventWarning
	EventError
)

func (e EventType) String() string {
	switch e {
	case EventInfo:
		return "INFO"
	case EventSuccess:
		return "SUCCESS"
	case EventWarning:
		return "WARNING"
	case EventError:
		return "ERROR"
	default:
		return "UNKNOWN"
	}
}

// LogMonitor monitors log files for important events
type LogMonitor struct {
	logPath  string
	patterns CheckPatterns
	events   []LogEvent
	mu       sync.RWMutex

	successRegexps []*regexp.Regexp
	failureRegexps []*regexp.Regexp
	warningRegexps []*regexp.Regexp
}

// NewLogMonitor creates a new log monitor
func NewLogMonitor(logPath string, patterns CheckPatterns) (*LogMonitor, error) {
	monitor := &LogMonitor{
		logPath:  logPath,
		patterns: patterns,
		events:   make([]LogEvent, 0),
	}

	// Compile regex patterns
	var err error
	monitor.successRegexps, err = compilePatterns(patterns.Success)
	if err != nil {
		return nil, fmt.Errorf("failed to compile success patterns: %w", err)
	}
	monitor.failureRegexps, err = compilePatterns(patterns.Failure)
	if err != nil {
		return nil, fmt.Errorf("failed to compile failure patterns: %w", err)
	}
	monitor.warningRegexps, err = compilePatterns(patterns.Warning)
	if err != nil {
		return nil, fmt.Errorf("failed to compile warning patterns: %w", err)
	}
	return monitor, nil
}

// compilePatterns compiles a list of pattern strings into regexps
func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
	regexps := make([]*regexp.Regexp, len(patterns))
	for i, pattern := range patterns {
		re, err := regexp.Compile(pattern)
		if err != nil {
			return nil, fmt.Errorf("invalid pattern %q: %w", pattern, err)
		}
		regexps[i] = re
	}
	return regexps, nil
}

// Start tails the log file until the context is cancelled. A bufio.Reader
// is used rather than bufio.Scanner: once a Scanner hits EOF it stops
// permanently, so it cannot follow a file that is still being appended to.
func (m *LogMonitor) Start(ctx context.Context) error {
	file, err := os.Open(m.logPath)
	if err != nil {
		return fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	// Use a large buffer to accommodate long log lines
	reader := bufio.NewReaderSize(file, 1024*1024)
	var pending string
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
			chunk, err := reader.ReadString('\n')
			pending += chunk
			if err != nil {
				// EOF: no complete line yet; wait a bit and retry so
				// new output from the server is picked up.
				time.Sleep(100 * time.Millisecond)
				continue
			}
			m.processLine(strings.TrimSuffix(pending, "\n"))
			pending = ""
		}
	}
}

// processLine processes a single log line
func (m *LogMonitor) processLine(line string) {
	event := LogEvent{
		Timestamp: time.Now(),
		Line:      line,
		Type:      EventInfo,
	}

	// Check for failure patterns (highest priority)
	for _, re := range m.failureRegexps {
		if re.MatchString(line) {
			event.Type = EventError
			event.Message = fmt.Sprintf("Failure pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}

	// Check for warning patterns
	for _, re := range m.warningRegexps {
		if re.MatchString(line) {
			event.Type = EventWarning
			event.Message = fmt.Sprintf("Warning pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}

	// Check for success patterns
	for _, re := range m.successRegexps {
		if re.MatchString(line) {
			event.Type = EventSuccess
			event.Message = fmt.Sprintf("Success pattern matched: %s", re.String())
			m.addEvent(event)
			return
		}
	}
}

// addEvent adds an event to the event list
func (m *LogMonitor) addEvent(event LogEvent) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.events = append(m.events, event)
}

// GetEvents returns all events of a specific type
func (m *LogMonitor) GetEvents(eventType EventType) []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()
	filtered := make([]LogEvent, 0)
	for _, event := range m.events {
		if event.Type == eventType {
			filtered = append(filtered, event)
		}
	}
	return filtered
}

// GetAllEvents returns all events
func (m *LogMonitor) GetAllEvents() []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()
	return append([]LogEvent{}, m.events...)
}

// HasErrors returns true if any error events were detected
func (m *LogMonitor) HasErrors() bool {
	return len(m.GetEvents(EventError)) > 0
}

// HasWarnings returns true if any warning events were detected
func (m *LogMonitor) HasWarnings() bool {
	return len(m.GetEvents(EventWarning)) > 0
}

// GetLogExcerpt returns the last N lines from the log file
func (m *LogMonitor) GetLogExcerpt(lines int) ([]string, error) {
	file, err := os.Open(m.logPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	// Read all lines
	allLines := make([]string, 0)
	scanner := bufio.NewScanner(file)
	buf := make([]byte, 0, 64*1024)
	scanner.Buffer(buf, 1024*1024)
	for scanner.Scan() {
		allLines = append(allLines, scanner.Text())
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading log file: %w", err)
	}

	// Return last N lines
	if len(allLines) <= lines {
		return allLines, nil
	}
	return allLines[len(allLines)-lines:], nil
}

// Reset clears all collected events
func (m *LogMonitor) Reset() {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.events = make([]LogEvent, 0)
}
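
A minimal sketch of how the classification order plays out (failure patterns take precedence over warnings, warnings over successes), calling processLine directly from within the package. The patterns and log lines here are illustrative, not verbatim Ollama output:

func exampleClassification() {
	patterns := CheckPatterns{
		Success: []string{`offloaded \d+/\d+ layers to GPU`},
		Failure: []string{`CUDA error`},
		Warning: []string{`falling back to CPU`},
	}
	// NewLogMonitor only compiles the patterns; the log file itself is
	// not opened until Start, so the path is unused here.
	m, err := NewLogMonitor("unused.log", patterns)
	if err != nil {
		panic(err)
	}
	m.processLine("offloaded 27/27 layers to GPU") // recorded as SUCCESS
	m.processLine("CUDA error 2: out of memory")   // recorded as ERROR
	for _, ev := range m.GetAllEvents() {
		fmt.Printf("%s: %s\n", ev.Type, ev.Line)
	}
}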

cmd/test-runner/report.go (new file, 254 lines)

@@ -0,0 +1,254 @@
package main

import (
	"encoding/json"
	"fmt"
	"os"
	"strings"
	"time"
)

// TestReport represents the complete test report
type TestReport struct {
	Summary       Summary             `json:"summary"`
	Results       []TestResult        `json:"results"`
	LogExcerpts   map[string][]string `json:"log_excerpts,omitempty"`
	StartTime     time.Time           `json:"start_time"`
	EndTime       time.Time           `json:"end_time"`
	TotalDuration time.Duration       `json:"total_duration"`
}

// Summary represents test summary statistics
type Summary struct {
	TotalTests   int `json:"total_tests"`
	Passed       int `json:"passed"`
	Failed       int `json:"failed"`
	Skipped      int `json:"skipped"`
	TotalPrompts int `json:"total_prompts"`
}

// Reporter generates test reports
type Reporter struct {
	config     ReportingConfig
	logMonitor *LogMonitor
}

// NewReporter creates a new reporter
func NewReporter(config ReportingConfig, logMonitor *LogMonitor) *Reporter {
	return &Reporter{
		config:     config,
		logMonitor: logMonitor,
	}
}

// GenerateReport generates a complete test report
func (r *Reporter) GenerateReport(results []TestResult, startTime, endTime time.Time) (*TestReport, error) {
	report := &TestReport{
		Results:       results,
		StartTime:     startTime,
		EndTime:       endTime,
		TotalDuration: endTime.Sub(startTime),
	}

	// Calculate summary
	report.Summary = r.calculateSummary(results)

	// Add log excerpts for failed tests if configured
	if r.config.IncludeLogs && r.logMonitor != nil {
		report.LogExcerpts = make(map[string][]string)
		for _, result := range results {
			if result.Status == StatusFailed {
				excerpt, err := r.logMonitor.GetLogExcerpt(r.config.LogExcerptLines)
				if err == nil {
					report.LogExcerpts[result.ModelName] = excerpt
				}
			}
		}
	}
	return report, nil
}

// calculateSummary calculates summary statistics
func (r *Reporter) calculateSummary(results []TestResult) Summary {
	summary := Summary{
		TotalTests: len(results),
	}
	for _, result := range results {
		switch result.Status {
		case StatusPassed:
			summary.Passed++
		case StatusFailed:
			summary.Failed++
		case StatusSkipped:
			summary.Skipped++
		}
		summary.TotalPrompts += len(result.PromptTests)
	}
	return summary
}

// SaveReport saves the report in configured formats
func (r *Reporter) SaveReport(report *TestReport, outputPath string) error {
	for _, format := range r.config.Formats {
		switch format {
		case "json":
			if err := r.saveJSON(report, outputPath+".json"); err != nil {
				return fmt.Errorf("failed to save JSON report: %w", err)
			}
		case "markdown":
			if err := r.saveMarkdown(report, outputPath+".md"); err != nil {
				return fmt.Errorf("failed to save Markdown report: %w", err)
			}
		default:
			fmt.Printf("Warning: unknown report format %q\n", format)
		}
	}
	return nil
}

// saveJSON saves the report as JSON
func (r *Reporter) saveJSON(report *TestReport, path string) error {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	defer file.Close()

	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ")
	if err := encoder.Encode(report); err != nil {
		return err
	}
	fmt.Printf("JSON report saved to: %s\n", path)
	return nil
}

// saveMarkdown saves the report as Markdown
func (r *Reporter) saveMarkdown(report *TestReport, path string) error {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	defer file.Close()

	var sb strings.Builder

	// Title and summary
	sb.WriteString("# Tesla K80 Test Report\n\n")
	sb.WriteString(fmt.Sprintf("**Generated:** %s\n\n", time.Now().Format(time.RFC3339)))
	sb.WriteString(fmt.Sprintf("**Duration:** %s\n\n", report.TotalDuration.Round(time.Second)))

	// Summary table
	sb.WriteString("## Summary\n\n")
	sb.WriteString("| Metric | Count |\n")
	sb.WriteString("|--------|-------|\n")
	sb.WriteString(fmt.Sprintf("| Total Tests | %d |\n", report.Summary.TotalTests))
	sb.WriteString(fmt.Sprintf("| Passed | %d |\n", report.Summary.Passed))
	sb.WriteString(fmt.Sprintf("| Failed | %d |\n", report.Summary.Failed))
	sb.WriteString(fmt.Sprintf("| Skipped | %d |\n", report.Summary.Skipped))
	sb.WriteString(fmt.Sprintf("| Total Prompts | %d |\n\n", report.Summary.TotalPrompts))

	// Results
	sb.WriteString("## Test Results\n\n")
	for _, result := range report.Results {
		r.writeModelResult(&sb, result)
	}

	// Log excerpts
	if len(report.LogExcerpts) > 0 {
		sb.WriteString("## Log Excerpts\n\n")
		for modelName, excerpt := range report.LogExcerpts {
			sb.WriteString(fmt.Sprintf("### %s\n\n", modelName))
			sb.WriteString("```\n")
			for _, line := range excerpt {
				sb.WriteString(line + "\n")
			}
			sb.WriteString("```\n\n")
		}
	}

	if _, err := file.WriteString(sb.String()); err != nil {
		return err
	}
	fmt.Printf("Markdown report saved to: %s\n", path)
	return nil
}

// writeModelResult writes a model result to the markdown builder
func (r *Reporter) writeModelResult(sb *strings.Builder, result TestResult) {
	statusEmoji := "✅"
	if result.Status == StatusFailed {
		statusEmoji = "❌"
	} else if result.Status == StatusSkipped {
		statusEmoji = "⏭️"
	}

	sb.WriteString(fmt.Sprintf("### %s %s\n\n", statusEmoji, result.ModelName))
	sb.WriteString(fmt.Sprintf("**Status:** %s\n\n", result.Status))
	sb.WriteString(fmt.Sprintf("**Duration:** %s\n\n", result.Duration.Round(time.Millisecond)))
	if result.ErrorMessage != "" {
		sb.WriteString(fmt.Sprintf("**Error:** %s\n\n", result.ErrorMessage))
	}
	if len(result.Warnings) > 0 {
		sb.WriteString("**Warnings:**\n")
		for _, warning := range result.Warnings {
			sb.WriteString(fmt.Sprintf("- %s\n", warning))
		}
		sb.WriteString("\n")
	}

	// Prompt tests
	if len(result.PromptTests) > 0 {
		sb.WriteString("**Prompt Tests:**\n\n")
		for i, prompt := range result.PromptTests {
			promptStatus := "✅"
			if prompt.Status == StatusFailed {
				promptStatus = "❌"
			}
			sb.WriteString(fmt.Sprintf("%d. %s **Prompt:** %s\n", i+1, promptStatus, prompt.Prompt))
			sb.WriteString(fmt.Sprintf(" - **Duration:** %s\n", prompt.Duration.Round(time.Millisecond)))
			sb.WriteString(fmt.Sprintf(" - **Response Tokens:** %d\n", prompt.ResponseTokens))
			if prompt.ErrorMessage != "" {
				sb.WriteString(fmt.Sprintf(" - **Error:** %s\n", prompt.ErrorMessage))
			}
			if prompt.Response != "" && len(prompt.Response) < 200 {
				sb.WriteString(fmt.Sprintf(" - **Response:** %s\n", prompt.Response))
			}
			sb.WriteString("\n")
		}
	}
	sb.WriteString("---\n\n")
}

// PrintSummary prints a summary to stdout
func (r *Reporter) PrintSummary(report *TestReport) {
	fmt.Println("\n" + strings.Repeat("=", 60))
	fmt.Println("TEST SUMMARY")
	fmt.Println(strings.Repeat("=", 60))
	fmt.Printf("Total Tests: %d\n", report.Summary.TotalTests)
	fmt.Printf("Passed: %d\n", report.Summary.Passed)
	fmt.Printf("Failed: %d\n", report.Summary.Failed)
	fmt.Printf("Skipped: %d\n", report.Summary.Skipped)
	fmt.Printf("Total Prompts: %d\n", report.Summary.TotalPrompts)
	fmt.Printf("Duration: %s\n", report.TotalDuration.Round(time.Second))
	fmt.Println(strings.Repeat("=", 60))

	if report.Summary.Failed > 0 {
		fmt.Println("\nFAILED TESTS:")
		for _, result := range report.Results {
			if result.Status == StatusFailed {
				fmt.Printf(" ❌ %s: %s\n", result.ModelName, result.ErrorMessage)
			}
		}
	}
	fmt.Println()
}
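
Given the json tags above, a passing single-model report is shaped roughly like this (values hypothetical; duration fields serialize as integer nanoseconds because time.Duration is an int64 under encoding/json). A CI step can gate on summary.failed being zero:

{
  "summary": {
    "total_tests": 1,
    "passed": 1,
    "failed": 0,
    "skipped": 0,
    "total_prompts": 1
  },
  "results": [
    {
      "model_name": "gemma2:2b",
      "status": "PASSED",
      "start_time": "2025-10-30T11:04:48+08:00",
      "end_time": "2025-10-30T11:06:12+08:00",
      "duration": 84000000000,
      "prompt_tests": [
        {
          "prompt": "Why is the sky blue? Answer briefly.",
          "response": "...",
          "response_tokens": 42,
          "duration": 9000000000,
          "status": "PASSED"
        }
      ]
    }
  ],
  "start_time": "2025-10-30T11:04:48+08:00",
  "end_time": "2025-10-30T11:06:12+08:00",
  "total_duration": 84000000000
}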

cmd/test-runner/server.go (new file, 168 lines)

@@ -0,0 +1,168 @@
package main

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"time"
)

// Server manages the ollama server lifecycle
type Server struct {
	config    ServerConfig
	ollamaBin string
	logFile   *os.File
	cmd       *exec.Cmd
	baseURL   string
}

// NewServer creates a new server manager
func NewServer(config ServerConfig, ollamaBin string) *Server {
	baseURL := fmt.Sprintf("http://%s:%d", config.Host, config.Port)
	return &Server{
		config:    config,
		ollamaBin: ollamaBin,
		baseURL:   baseURL,
	}
}

// Start starts the ollama server
func (s *Server) Start(ctx context.Context, logPath string) error {
	// Create log file
	logFile, err := os.Create(logPath)
	if err != nil {
		return fmt.Errorf("failed to create log file: %w", err)
	}
	s.logFile = logFile

	// Resolve ollama binary path
	binPath, err := filepath.Abs(s.ollamaBin)
	if err != nil {
		return fmt.Errorf("failed to resolve ollama binary path: %w", err)
	}

	// Check if binary exists
	if _, err := os.Stat(binPath); err != nil {
		return fmt.Errorf("ollama binary not found at %s: %w", binPath, err)
	}

	// Create command
	s.cmd = exec.CommandContext(ctx, binPath, "serve")
	s.cmd.Stdout = logFile
	s.cmd.Stderr = logFile

	// Set working directory to binary location
	s.cmd.Dir = filepath.Dir(binPath)

	// Start server
	if err := s.cmd.Start(); err != nil {
		logFile.Close()
		return fmt.Errorf("failed to start ollama server: %w", err)
	}
	fmt.Printf("Started ollama server (PID: %d)\n", s.cmd.Process.Pid)
	fmt.Printf("Server logs: %s\n", logPath)

	// Wait for server to be ready
	if err := s.WaitForReady(ctx); err != nil {
		s.Stop()
		return fmt.Errorf("server failed to become ready: %w", err)
	}
	fmt.Printf("Server is ready at %s\n", s.baseURL)
	return nil
}

// WaitForReady waits for the server to be ready
func (s *Server) WaitForReady(ctx context.Context) error {
	healthURL := s.baseURL + s.config.HealthCheckEndpoint
	timeout := time.After(s.config.StartupTimeout)
	ticker := time.NewTicker(s.config.HealthCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-timeout:
			return fmt.Errorf("timeout waiting for server to be ready")
		case <-ticker.C:
			req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
			if err != nil {
				continue
			}
			resp, err := http.DefaultClient.Do(req)
			if err != nil {
				continue
			}
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil
			}
		}
	}
}

// Stop stops the ollama server
func (s *Server) Stop() error {
	var errs []error

	// Stop the process
	if s.cmd != nil && s.cmd.Process != nil {
		fmt.Printf("Stopping ollama server (PID: %d)\n", s.cmd.Process.Pid)

		// Try graceful shutdown first
		if err := s.cmd.Process.Signal(os.Interrupt); err != nil {
			errs = append(errs, fmt.Errorf("failed to send interrupt signal: %w", err))
		}

		// Wait for process to exit (with timeout)
		done := make(chan error, 1)
		go func() {
			done <- s.cmd.Wait()
		}()
		select {
		case <-time.After(10 * time.Second):
			// Force kill if graceful shutdown times out
			if err := s.cmd.Process.Kill(); err != nil {
				errs = append(errs, fmt.Errorf("failed to kill process: %w", err))
			}
			<-done // Wait for process to actually die
		case err := <-done:
			if err != nil && err.Error() != "signal: interrupt" {
				errs = append(errs, fmt.Errorf("process exited with error: %w", err))
			}
		}
	}

	// Close log file
	if s.logFile != nil {
		if err := s.logFile.Close(); err != nil {
			errs = append(errs, fmt.Errorf("failed to close log file: %w", err))
		}
	}

	if len(errs) > 0 {
		return fmt.Errorf("errors during shutdown: %v", errs)
	}
	fmt.Println("Server stopped successfully")
	return nil
}

// BaseURL returns the server base URL
func (s *Server) BaseURL() string {
	return s.baseURL
}

// IsRunning returns true if the server is running
func (s *Server) IsRunning() bool {
	return s.cmd != nil && s.cmd.Process != nil
}
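
A minimal lifecycle sketch mirroring what runTests in main.go does with this type (binary path, log path, and config values assumed):

func demoServerLifecycle() error {
	cfg := ServerConfig{
		Host:                "localhost",
		Port:                11434,
		StartupTimeout:      30 * time.Second,
		HealthCheckInterval: time.Second,
		HealthCheckEndpoint: "/api/tags",
	}
	srv := NewServer(cfg, "./ollama")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Start blocks until the health check passes or the startup timeout hits.
	if err := srv.Start(ctx, "ollama.log"); err != nil {
		return err
	}
	defer srv.Stop()

	fmt.Println("server ready at", srv.BaseURL())
	return nil
}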

cmd/test-runner/test.go (new file, 223 lines)

@@ -0,0 +1,223 @@
package main

import (
	"bufio"
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
)

// TestResult represents the result of a model test
type TestResult struct {
	ModelName    string        `json:"model_name"`
	Status       TestStatus    `json:"status"`
	StartTime    time.Time     `json:"start_time"`
	EndTime      time.Time     `json:"end_time"`
	Duration     time.Duration `json:"duration"`
	PromptTests  []PromptTest  `json:"prompt_tests"`
	ErrorMessage string        `json:"error_message,omitempty"`
	Warnings     []string      `json:"warnings,omitempty"`
}

// TestStatus represents the status of a test
type TestStatus string

const (
	StatusPassed  TestStatus = "PASSED"
	StatusFailed  TestStatus = "FAILED"
	StatusSkipped TestStatus = "SKIPPED"
)

// PromptTest represents the result of a single prompt test
type PromptTest struct {
	Prompt         string        `json:"prompt"`
	Response       string        `json:"response"`
	ResponseTokens int           `json:"response_tokens"`
	Duration       time.Duration `json:"duration"`
	Status         TestStatus    `json:"status"`
	ErrorMessage   string        `json:"error_message,omitempty"`
}

// ModelTester runs tests for models
type ModelTester struct {
	serverURL  string
	httpClient *http.Client
}

// NewModelTester creates a new model tester
func NewModelTester(serverURL string) *ModelTester {
	return &ModelTester{
		serverURL: serverURL,
		httpClient: &http.Client{
			Timeout: 5 * time.Minute, // Long timeout for model operations
		},
	}
}

// TestModel runs all tests for a single model
func (t *ModelTester) TestModel(ctx context.Context, modelTest ModelTest) TestResult {
	result := TestResult{
		ModelName:   modelTest.Name,
		StartTime:   time.Now(),
		Status:      StatusPassed,
		PromptTests: make([]PromptTest, 0),
	}

	// Pull model first
	fmt.Printf("Pulling model %s...\n", modelTest.Name)
	if err := t.pullModel(ctx, modelTest.Name); err != nil {
		result.Status = StatusFailed
		result.ErrorMessage = fmt.Sprintf("Failed to pull model: %v", err)
		result.EndTime = time.Now()
		result.Duration = result.EndTime.Sub(result.StartTime)
		return result
	}
	fmt.Printf("Model %s pulled successfully\n", modelTest.Name)

	// Run each prompt test
	for i, prompt := range modelTest.Prompts {
		fmt.Printf("Testing prompt %d/%d for %s\n", i+1, len(modelTest.Prompts), modelTest.Name)
		promptTest := t.testPrompt(ctx, modelTest.Name, prompt, modelTest.Timeout)
		result.PromptTests = append(result.PromptTests, promptTest)

		// Update overall status based on prompt test result
		if promptTest.Status == StatusFailed {
			result.Status = StatusFailed
		}
	}

	result.EndTime = time.Now()
	result.Duration = result.EndTime.Sub(result.StartTime)
	fmt.Printf("Model %s test completed: %s\n", modelTest.Name, result.Status)
	return result
}

// pullModel pulls a model using the Ollama API
func (t *ModelTester) pullModel(ctx context.Context, modelName string) error {
	url := t.serverURL + "/api/pull"
	reqBody := map[string]interface{}{
		"name": modelName,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := t.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("pull failed with status %d: %s", resp.StatusCode, string(body))
	}

	// Read response stream (pull progress)
	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		var progress map[string]interface{}
		if err := json.Unmarshal(scanner.Bytes(), &progress); err != nil {
			continue
		}
		// Could print progress here if verbose mode is enabled
	}
	return nil
}

// testPrompt tests a single prompt
func (t *ModelTester) testPrompt(ctx context.Context, modelName, prompt string, timeout time.Duration) PromptTest {
	result := PromptTest{
		Prompt: prompt,
		Status: StatusPassed,
	}
	startTime := time.Now()

	// Create context with timeout
	testCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	// Call chat API
	response, err := t.chat(testCtx, modelName, prompt)
	if err != nil {
		result.Status = StatusFailed
		result.ErrorMessage = err.Error()
		result.Duration = time.Since(startTime)
		return result
	}

	result.Response = response
	result.ResponseTokens = estimateTokens(response)
	result.Duration = time.Since(startTime)
	return result
}

// chat sends a single-turn prompt to the Ollama /api/generate endpoint
// (non-streaming) and returns the full response text
func (t *ModelTester) chat(ctx context.Context, modelName, prompt string) (string, error) {
	url := t.serverURL + "/api/generate"
	reqBody := map[string]interface{}{
		"model":  modelName,
		"prompt": prompt,
		"stream": false,
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		return "", fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := t.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("chat failed with status %d: %s", resp.StatusCode, string(body))
	}

	var response struct {
		Response string `json:"response"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return "", fmt.Errorf("failed to decode response: %w", err)
	}
	return response.Response, nil
}

// estimateTokens estimates the number of tokens in a text.
// This is a rough approximation: one token per whitespace-separated word,
// which undercounts for subword tokenizers but is stable enough for the
// response-length checks used here.
func estimateTokens(text string) int {
	return len(strings.Fields(text))
}
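
A hypothetical driver for this API surface, assuming an Ollama server is already listening on the default port:

func runSingleModel() {
	tester := NewModelTester("http://localhost:11434")
	result := tester.TestModel(context.Background(), ModelTest{
		Name:    "gemma2:2b",
		Prompts: []string{"Why is the sky blue? Answer briefly."},
		Timeout: 2 * time.Minute,
	})
	fmt.Printf("%s: %s (%d prompt tests, %s)\n",
		result.ModelName, result.Status, len(result.PromptTests),
		result.Duration.Round(time.Millisecond))
}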

cmd/test-runner/validate.go (new file, 164 lines)

@@ -0,0 +1,164 @@
package main

import (
	"fmt"
	"strings"
)

// Validator validates test results against configuration
type Validator struct {
	config     Validation
	logMonitor *LogMonitor
}

// NewValidator creates a new validator
func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
	return &Validator{
		config:     config,
		logMonitor: logMonitor,
	}
}

// ValidateResult validates a test result
func (v *Validator) ValidateResult(result *TestResult) {
	// Validate prompts
	for i := range result.PromptTests {
		v.validatePrompt(&result.PromptTests[i])
	}

	// Check logs for errors and warnings
	if v.logMonitor != nil {
		v.validateLogs(result)
	}
}

// validatePrompt validates a single prompt test
func (v *Validator) validatePrompt(prompt *PromptTest) {
	// Already failed, skip
	if prompt.Status == StatusFailed {
		return
	}

	// Check if response is empty
	if strings.TrimSpace(prompt.Response) == "" {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = "Response is empty"
		return
	}

	// Check token count
	if prompt.ResponseTokens < 1 {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = "Response has no tokens"
		return
	}
}

// validateLogs validates log events
func (v *Validator) validateLogs(result *TestResult) {
	// Check for error events
	errorEvents := v.logMonitor.GetEvents(EventError)
	if len(errorEvents) > 0 {
		result.Status = StatusFailed
		errorMessages := make([]string, len(errorEvents))
		for i, event := range errorEvents {
			errorMessages[i] = event.Line
		}
		if result.ErrorMessage == "" {
			result.ErrorMessage = fmt.Sprintf("Errors found in logs: %s", strings.Join(errorMessages, "; "))
		} else {
			result.ErrorMessage += fmt.Sprintf("; Log errors: %s", strings.Join(errorMessages, "; "))
		}
	}

	// Check for warning events
	warningEvents := v.logMonitor.GetEvents(EventWarning)
	if len(warningEvents) > 0 {
		warnings := make([]string, len(warningEvents))
		for i, event := range warningEvents {
			warnings[i] = event.Line
		}
		result.Warnings = append(result.Warnings, warnings...)
	}

	// Check if GPU was used (if required)
	if v.config.GPURequired {
		if !v.hasGPULoading() {
			result.Status = StatusFailed
			if result.ErrorMessage == "" {
				result.ErrorMessage = "GPU acceleration not detected in logs (GPU required)"
			} else {
				result.ErrorMessage += "; GPU acceleration not detected"
			}
		}
	}

	// Check for CPU fallback (if single GPU preferred)
	if v.config.SingleGPUPreferred {
		if v.hasCPUFallback() {
			warning := "CPU fallback or multi-GPU split detected (single GPU preferred)"
			result.Warnings = append(result.Warnings, warning)
		}
	}
}

// hasGPULoading checks if logs indicate GPU loading
func (v *Validator) hasGPULoading() bool {
	successEvents := v.logMonitor.GetEvents(EventSuccess)

	// Look for patterns indicating GPU usage
	gpuPatterns := []string{
		"offload",
		"GPU",
		"CUDA",
	}
	for _, event := range successEvents {
		line := strings.ToLower(event.Line)
		for _, pattern := range gpuPatterns {
			if strings.Contains(line, strings.ToLower(pattern)) {
				return true
			}
		}
	}
	return false
}

// hasCPUFallback checks if logs indicate CPU fallback
func (v *Validator) hasCPUFallback() bool {
	allEvents := v.logMonitor.GetAllEvents()

	// Look for patterns indicating CPU usage or multi-GPU split
	cpuPatterns := []string{
		"CPU backend",
		"using CPU",
		"fallback",
	}
	for _, event := range allEvents {
		line := strings.ToLower(event.Line)
		for _, pattern := range cpuPatterns {
			if strings.Contains(line, strings.ToLower(pattern)) {
				return true
			}
		}
	}
	return false
}

// ValidateResponse validates a response against expected criteria
func ValidateResponse(response string, minTokens, maxTokens int) error {
	tokens := estimateTokens(response)
	if minTokens > 0 && tokens < minTokens {
		return fmt.Errorf("response too short: %d tokens (min: %d)", tokens, minTokens)
	}
	if maxTokens > 0 && tokens > maxTokens {
		return fmt.Errorf("response too long: %d tokens (max: %d)", tokens, maxTokens)
	}
	return nil
}
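
Nothing in this diff calls ValidateResponse, and the min_response_tokens / max_response_tokens fields from config.go are likewise unused so far; wiring them together could look like this sketch (not part of the commit):

// Sketch: enforce the per-model token bounds during validation.
func (v *Validator) validatePromptBounds(prompt *PromptTest, model ModelTest) {
	if prompt.Status == StatusFailed {
		return
	}
	if err := ValidateResponse(prompt.Response, model.MinResponseTokens, model.MaxResponseTokens); err != nil {
		prompt.Status = StatusFailed
		prompt.ErrorMessage = err.Error()
	}
}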
}