Files
ollama37/cmd/test-runner/monitor.go
Shang Chieh Tseng d59284d30a Implement Go-based test runner framework for Tesla K80 testing
Add comprehensive test orchestration framework:

Test Runner (cmd/test-runner/):
- config.go: YAML configuration loading and validation
- server.go: Ollama server lifecycle management (start/stop/health checks)
- monitor.go: Real-time log monitoring with pattern matching
- test.go: Model testing via Ollama API (pull, chat, validation)
- validate.go: Test result validation (GPU usage, response quality, log analysis)
- report.go: Structured reporting (JSON and Markdown formats)
- main.go: CLI interface with run/validate/list commands

Test Configurations (test/config/):
- models.yaml: Full test suite with quick/full/stress profiles
- quick.yaml: Fast smoke test with gemma2:2b

Updated Workflow:
- tesla-k80-tests.yml: Use test-runner instead of shell scripts
- Run quick tests first, then full tests if passing
- Generate structured JSON reports for pass/fail checking
- Upload test results as artifacts

Features:
- Multi-model testing with configurable profiles
- API-based testing (not CLI commands)
- Real-time log monitoring for GPU events and errors
- Automatic validation of GPU loading and response quality
- Structured JSON and Markdown reports
- Graceful server lifecycle management
- Interrupt handling (Ctrl+C cleanup)

Addresses limitations of shell-based testing by providing:
- Better error handling and reporting
- Programmatic test orchestration
- Reusable test framework
- Clear pass/fail criteria
- Detailed test metrics and timing
2025-10-30 11:04:48 +08:00

241 lines
5.3 KiB
Go

package main
import (
	"bufio"
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"regexp"
	"strings"
	"sync"
	"time"
)
// LogEvent represents a significant event found in logs.
//
// Events are created by processLine when a log line matches one of the
// monitor's configured patterns.
type LogEvent struct {
// Timestamp is the wall-clock time at which the monitor processed the
// line (time.Now()), not a time parsed out of the log text.
Timestamp time.Time
// Line is the log line that triggered the event.
Line string
// Type is the classification assigned from the pattern group that matched.
Type EventType
// Message names the pattern group and the regular expression that matched.
Message string
}
// EventType classifies a log event.
type EventType int

// The known event classifications, numbered from zero in declaration order.
const (
	EventInfo EventType = iota
	EventSuccess
	EventWarning
	EventError
)

// String returns the upper-case label for e, or "UNKNOWN" when e is not one
// of the declared EventType constants.
func (e EventType) String() string {
	labels := [...]string{
		EventInfo:    "INFO",
		EventSuccess: "SUCCESS",
		EventWarning: "WARNING",
		EventError:   "ERROR",
	}
	if e < 0 || int(e) >= len(labels) {
		return "UNKNOWN"
	}
	return labels[e]
}
// LogMonitor monitors log files for important events.
//
// Lines read from logPath are matched against three groups of compiled
// regular expressions; each match is recorded as a LogEvent in events.
type LogMonitor struct {
// logPath is the file opened by Start and GetLogExcerpt.
logPath string
// patterns holds the pattern configuration passed to NewLogMonitor.
patterns CheckPatterns
// events accumulates matched events; access is guarded by mu.
events []LogEvent
// mu protects events (write lock for append/reset, read lock for queries).
mu sync.RWMutex
// successRegexps, failureRegexps and warningRegexps are compiled from
// patterns.Success, patterns.Failure and patterns.Warning respectively.
successRegexps []*regexp.Regexp
failureRegexps []*regexp.Regexp
warningRegexps []*regexp.Regexp
}
// NewLogMonitor builds a LogMonitor for the file at logPath.
//
// The success, failure and warning pattern lists are compiled in that order;
// the first list containing an invalid regular expression aborts construction
// and the wrapped compile error is returned with a nil monitor.
func NewLogMonitor(logPath string, patterns CheckPatterns) (*LogMonitor, error) {
	successes, err := compilePatterns(patterns.Success)
	if err != nil {
		return nil, fmt.Errorf("failed to compile success patterns: %w", err)
	}

	failures, err := compilePatterns(patterns.Failure)
	if err != nil {
		return nil, fmt.Errorf("failed to compile failure patterns: %w", err)
	}

	warnings, err := compilePatterns(patterns.Warning)
	if err != nil {
		return nil, fmt.Errorf("failed to compile warning patterns: %w", err)
	}

	return &LogMonitor{
		logPath:        logPath,
		patterns:       patterns,
		events:         make([]LogEvent, 0),
		successRegexps: successes,
		failureRegexps: failures,
		warningRegexps: warnings,
	}, nil
}
// compilePatterns turns each pattern string into a compiled regexp, keeping
// the input order. It stops at the first pattern that does not compile and
// returns a nil slice together with an error naming that pattern.
func compilePatterns(patterns []string) ([]*regexp.Regexp, error) {
	compiled := make([]*regexp.Regexp, 0, len(patterns))
	for _, expr := range patterns {
		re, err := regexp.Compile(expr)
		if err != nil {
			return nil, fmt.Errorf("invalid pattern %q: %w", expr, err)
		}
		compiled = append(compiled, re)
	}
	return compiled, nil
}
// Start tails the log file at m.logPath and passes every line to processLine
// until ctx is cancelled.
//
// The file is read from the beginning. When the end of the file is reached,
// Start polls every 100ms for appended data. A bufio.Reader is used rather
// than a bufio.Scanner because a Scanner never reads again once it has seen
// EOF, which would make lines written after the first EOF invisible.
//
// Text at the end of the file that is not yet terminated by a newline is held
// back until the rest of the line arrives, so a line caught mid-write is not
// matched as two fragments. Any such remainder is processed once when ctx is
// cancelled.
//
// Start returns ctx.Err() on cancellation, or a wrapped error if the file
// cannot be opened or a read fails for a reason other than EOF. It blocks, so
// callers normally run it in its own goroutine. Truncation or rotation of the
// log file is not detected.
func (m *LogMonitor) Start(ctx context.Context) error {
	file, err := os.Open(m.logPath)
	if err != nil {
		return fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	reader := bufio.NewReaderSize(file, 64*1024)

	poll := time.NewTicker(100 * time.Millisecond)
	defer poll.Stop()

	// pending accumulates the current line across reads; it may hold an
	// unterminated fragment while waiting for the writer to finish the line.
	var pending strings.Builder
	flush := func() {
		if pending.Len() == 0 {
			return
		}
		// Strip the terminator the same way bufio.ScanLines does: one
		// trailing "\n", then one optional "\r".
		line := strings.TrimSuffix(pending.String(), "\n")
		line = strings.TrimSuffix(line, "\r")
		pending.Reset()
		m.processLine(line)
	}

	for {
		if err := ctx.Err(); err != nil {
			flush()
			return err
		}

		chunk, err := reader.ReadString('\n')
		pending.WriteString(chunk)
		if err == nil {
			// A full line (including its newline) is available.
			flush()
			continue
		}
		if !errors.Is(err, io.EOF) {
			return fmt.Errorf("error reading log file: %w", err)
		}

		// Caught up with the writer: wait for more data or cancellation.
		select {
		case <-ctx.Done():
			flush()
			return ctx.Err()
		case <-poll.C:
		}
	}
}
// processLine classifies one log line and records an event for it when it
// matches a configured pattern.
//
// Pattern groups are tried in precedence order: failure, then warning, then
// success. Only the first matching regexp produces an event; lines that match
// nothing are not recorded.
func (m *LogMonitor) processLine(line string) {
	seen := time.Now()

	// Ordered from highest to lowest priority.
	rules := [...]struct {
		regexps []*regexp.Regexp
		kind    EventType
		format  string
	}{
		{m.failureRegexps, EventError, "Failure pattern matched: %s"},
		{m.warningRegexps, EventWarning, "Warning pattern matched: %s"},
		{m.successRegexps, EventSuccess, "Success pattern matched: %s"},
	}

	for _, rule := range rules {
		for _, re := range rule.regexps {
			if !re.MatchString(line) {
				continue
			}
			m.addEvent(LogEvent{
				Timestamp: seen,
				Line:      line,
				Type:      rule.kind,
				Message:   fmt.Sprintf(rule.format, re.String()),
			})
			return
		}
	}
}
// addEvent appends event to the monitor's event list while holding the
// write lock.
func (m *LogMonitor) addEvent(event LogEvent) {
	m.mu.Lock()
	m.events = append(m.events, event)
	m.mu.Unlock()
}
// GetEvents returns a new slice holding every recorded event whose Type
// equals eventType, in the order they were recorded. The result is empty
// (never nil) when nothing matches.
func (m *LogMonitor) GetEvents(eventType EventType) []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()

	matches := []LogEvent{}
	for i := range m.events {
		if m.events[i].Type != eventType {
			continue
		}
		matches = append(matches, m.events[i])
	}
	return matches
}
// GetAllEvents returns a copy of every recorded event, so callers can keep or
// modify the result without affecting the monitor's own list.
func (m *LogMonitor) GetAllEvents() []LogEvent {
	m.mu.RLock()
	defer m.mu.RUnlock()

	snapshot := make([]LogEvent, len(m.events))
	copy(snapshot, m.events)
	return snapshot
}
// HasErrors reports whether at least one EventError event has been recorded.
func (m *LogMonitor) HasErrors() bool {
	m.mu.RLock()
	defer m.mu.RUnlock()

	for i := range m.events {
		if m.events[i].Type == EventError {
			return true
		}
	}
	return false
}
// HasWarnings reports whether at least one EventWarning event has been
// recorded.
func (m *LogMonitor) HasWarnings() bool {
	m.mu.RLock()
	defer m.mu.RUnlock()

	for i := range m.events {
		if m.events[i].Type == EventWarning {
			return true
		}
	}
	return false
}
// GetLogExcerpt returns the last N lines from the log file, oldest first.
//
// If the file has fewer than `lines` lines, all of them are returned. A zero
// or negative `lines` yields an empty (non-nil) slice instead of panicking on
// an out-of-range slice expression. The file is still opened and scanned in
// that case, so open and read errors are reported consistently.
//
// Only the most recent `lines` entries are kept in memory while scanning, so
// the cost is bounded by the excerpt size rather than the size of the log.
// Individual lines longer than 1 MiB cause a read error.
func (m *LogMonitor) GetLogExcerpt(lines int) ([]string, error) {
	file, err := os.Open(m.logPath)
	if err != nil {
		return nil, fmt.Errorf("failed to open log file: %w", err)
	}
	defer file.Close()

	if lines < 0 {
		lines = 0
	}

	scanner := bufio.NewScanner(file)
	// Use a larger buffer for long log lines.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	// window grows by append until it holds `lines` entries, then acts as a
	// circular buffer; oldest is the index of the oldest entry once full.
	// Growing lazily avoids a huge allocation when the caller passes a very
	// large `lines` to mean "everything".
	var window []string
	oldest := 0
	for scanner.Scan() {
		if lines == 0 {
			continue
		}
		if len(window) < lines {
			window = append(window, scanner.Text())
			continue
		}
		window[oldest] = scanner.Text()
		oldest = (oldest + 1) % lines
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error reading log file: %w", err)
	}

	// Unroll the circular buffer into chronological order.
	excerpt := make([]string, 0, len(window))
	excerpt = append(excerpt, window[oldest:]...)
	excerpt = append(excerpt, window[:oldest]...)
	return excerpt, nil
}
// Reset discards every event collected so far, leaving the monitor with an
// empty, non-nil event list.
func (m *LogMonitor) Reset() {
	m.mu.Lock()
	m.events = []LogEvent{}
	m.mu.Unlock()
}