Implement Go-based test runner framework for Tesla K80 testing

Add comprehensive test orchestration framework: Test Runner (cmd/test-runner/): - config.go: YAML configuration loading and validation - server.go: Ollama server lifecycle management (start/stop/health checks) - monitor.go: Real-time log monitoring with pattern matching - test.go: Model testing via Ollama API (pull, chat, validation) - validate.go: Test result validation (GPU usage, response quality, log analysis) - report.go: Structured reporting (JSON and Markdown formats) - main.go: CLI interface with run/validate/list commands Test Configurations (test/config/): - models.yaml: Full test suite with quick/full/stress profiles - quick.yaml: Fast smoke test with gemma2:2b Updated Workflow: - tesla-k80-tests.yml: Use test-runner instead of shell scripts - Run quick tests first, then full tests if passing - Generate structured JSON reports for pass/fail checking - Upload test results as artifacts Features: - Multi-model testing with configurable profiles - API-based testing (not CLI commands) - Real-time log monitoring for GPU events and errors - Automatic validation of GPU loading and response quality - Structured JSON and Markdown reports - Graceful server lifecycle management - Interrupt handling (Ctrl+C cleanup) Addresses limitations of shell-based testing by providing: - Better error handling and reporting - Programmatic test orchestration - Reusable test framework - Clear pass/fail criteria - Detailed test metrics and timing
2025-12-14 17:57:06 +00:00 · 2025-10-30 11:04:48 +08:00
parent aaaf334e7f
commit d59284d30a
10 changed files with 1631 additions and 113 deletions
--- a/cmd/test-runner/monitor.go
+++ b/cmd/test-runner/monitor.go
@@ -0,0 +1,240 @@
+package main
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"os"
+	"regexp"
+	"sync"
+	"time"
+)
+
+// LogEvent represents a significant event found in logs
+type LogEvent struct {
+	Timestamp time.Time
+	Line      string
+	Type      EventType
+	Message   string
+}
+
+// EventType represents the type of log event
+type EventType int
+
+const (
+	EventInfo EventType = iota
+	EventSuccess
+	EventWarning
+	EventError
+)
+
+func (e EventType) String() string {
+	switch e {
+	case EventInfo:
+		return "INFO"
+	case EventSuccess:
+		return "SUCCESS"
+	case EventWarning:
+		return "WARNING"
+	case EventError:
+		return "ERROR"
+	default:
+		return "UNKNOWN"
+	}
+}
+
+// LogMonitor monitors log files for important events
+type LogMonitor struct {
+	logPath        string
+	patterns       CheckPatterns
+	events         []LogEvent
+	mu             sync.RWMutex
+	successRegexps []*regexp.Regexp
+	failureRegexps []*regexp.Regexp
+	warningRegexps []*regexp.Regexp
+}
+
+// NewLogMonitor creates a new log monitor
+func NewLogMonitor(logPath string, patterns CheckPatterns) (*LogMonitor, error) {
+	monitor := &LogMonitor{
+		logPath:  logPath,
+		patterns: patterns,
+		events:   make([]LogEvent, 0),
+	}
+
+	// Compile regex patterns
+	var err error
+	monitor.successRegexps, err = compilePatterns(patterns.Success)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile success patterns: %w", err)
+	}
+
+	monitor.failureRegexps, err = compilePatterns(patterns.Failure)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile failure patterns: %w", err)
+	}
+
+	monitor.warningRegexps, err = compilePatterns(patterns.Warning)
+	if err != nil {
+		return nil, fmt.Errorf("failed to compile warning patterns: %w", err)
+	}
+
+	return monitor, nil
+}
+
+// compilePatterns compiles a list of pattern strings into regexps
+func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
+	regexps := make([]*regexp.Regexp, len(patterns))
+	for i, pattern := range patterns {
+		re, err := regexp.Compile(pattern)
+		if err != nil {
+			return nil, fmt.Errorf("invalid pattern %q: %w", pattern, err)
+		}
+		regexps[i] = re
+	}
+	return regexps, nil
+}
+
+// Start starts monitoring the log file
+func (m *LogMonitor) Start(ctx context.Context) error {
+	file, err := os.Open(m.logPath)
+	if err != nil {
+		return fmt.Errorf("failed to open log file: %w", err)
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+
+	// Use a larger buffer for long log lines
+	buf := make([]byte, 0, 64*1024)
+	scanner.Buffer(buf, 1024*1024)
+
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+			if !scanner.Scan() {
+				// No more lines, wait a bit and retry
+				time.Sleep(100 * time.Millisecond)
+				continue
+			}
+
+			line := scanner.Text()
+			m.processLine(line)
+		}
+	}
+}
+
+// processLine processes a single log line
+func (m *LogMonitor) processLine(line string) {
+	event := LogEvent{
+		Timestamp: time.Now(),
+		Line:      line,
+		Type:      EventInfo,
+	}
+
+	// Check for failure patterns (highest priority)
+	for _, re := range m.failureRegexps {
+		if re.MatchString(line) {
+			event.Type = EventError
+			event.Message = fmt.Sprintf("Failure pattern matched: %s", re.String())
+			m.addEvent(event)
+			return
+		}
+	}
+
+	// Check for warning patterns
+	for _, re := range m.warningRegexps {
+		if re.MatchString(line) {
+			event.Type = EventWarning
+			event.Message = fmt.Sprintf("Warning pattern matched: %s", re.String())
+			m.addEvent(event)
+			return
+		}
+	}
+
+	// Check for success patterns
+	for _, re := range m.successRegexps {
+		if re.MatchString(line) {
+			event.Type = EventSuccess
+			event.Message = fmt.Sprintf("Success pattern matched: %s", re.String())
+			m.addEvent(event)
+			return
+		}
+	}
+}
+
+// addEvent adds an event to the event list
+func (m *LogMonitor) addEvent(event LogEvent) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.events = append(m.events, event)
+}
+
+// GetEvents returns all events of a specific type
+func (m *LogMonitor) GetEvents(eventType EventType) []LogEvent {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+
+	filtered := make([]LogEvent, 0)
+	for _, event := range m.events {
+		if event.Type == eventType {
+			filtered = append(filtered, event)
+		}
+	}
+	return filtered
+}
+
+// GetAllEvents returns all events
+func (m *LogMonitor) GetAllEvents() []LogEvent {
+	m.mu.RLock()
+	defer m.mu.RUnlock()
+	return append([]LogEvent{}, m.events...)
+}
+
+// HasErrors returns true if any error events were detected
+func (m *LogMonitor) HasErrors() bool {
+	return len(m.GetEvents(EventError)) > 0
+}
+
+// HasWarnings returns true if any warning events were detected
+func (m *LogMonitor) HasWarnings() bool {
+	return len(m.GetEvents(EventWarning)) > 0
+}
+
+// GetLogExcerpt returns the last N lines from the log file
+func (m *LogMonitor) GetLogExcerpt(lines int) ([]string, error) {
+	file, err := os.Open(m.logPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open log file: %w", err)
+	}
+	defer file.Close()
+
+	// Read all lines
+	allLines := make([]string, 0)
+	scanner := bufio.NewScanner(file)
+	buf := make([]byte, 0, 64*1024)
+	scanner.Buffer(buf, 1024*1024)
+
+	for scanner.Scan() {
+		allLines = append(allLines, scanner.Text())
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("error reading log file: %w", err)
+	}
+
+	// Return last N lines
+	if len(allLines) <= lines {
+		return allLines, nil
+	}
+	return allLines[len(allLines)-lines:], nil
+}
+
+// Reset clears all collected events
+func (m *LogMonitor) Reset() {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.events = make([]LogEvent, 0)
+}