diff --git a/.github/workflows/tesla-k80-tests.yml b/.github/workflows/tesla-k80-tests.yml index 9a365e39..b9f5d873 100644 --- a/.github/workflows/tesla-k80-tests.yml +++ b/.github/workflows/tesla-k80-tests.yml @@ -85,9 +85,3 @@ jobs: test-report-full.md ollama.log retention-days: 14 - - - name: Check GPU memory usage - if: always() - run: | - echo "=== GPU Memory Status ===" - nvidia-smi --query-gpu=memory.used,memory.total --format=csv diff --git a/cmd/test-runner/main.go b/cmd/test-runner/main.go index 158d6618..ad673569 100644 --- a/cmd/test-runner/main.go +++ b/cmd/test-runner/main.go @@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke // Run tests startTime := time.Now() tester := NewModelTester(server.BaseURL()) - validator := NewValidator(config.Validation, monitor) + validator := NewValidator(config.Validation, monitor, verbose) results := make([]TestResult, 0, len(profile.Models)) diff --git a/cmd/test-runner/validate.go b/cmd/test-runner/validate.go index 5af6f830..ee9b6d0e 100644 --- a/cmd/test-runner/validate.go +++ b/cmd/test-runner/validate.go @@ -2,20 +2,46 @@ package main import ( "fmt" + "os" + "os/exec" + "path/filepath" "strings" ) // Validator validates test results against configuration type Validator struct { - config Validation - logMonitor *LogMonitor + config Validation + logMonitor *LogMonitor + claudeEnabled bool + claudeTempDir string + verbose bool } // NewValidator creates a new validator -func NewValidator(config Validation, logMonitor *LogMonitor) *Validator { +func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator { + // Check if Claude CLI is available + claudeEnabled := false + if _, err := exec.LookPath("claude"); err == nil { + claudeEnabled = true + if verbose { + fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled") + } + } else { + if verbose { + fmt.Println("⚠ Claude CLI not found - using basic validation only") + } + } + 
+ // Create temp directory for Claude analysis files + tempDir := filepath.Join(os.TempDir(), "test-runner-claude") + os.MkdirAll(tempDir, 0755) + return &Validator{ - config: config, - logMonitor: logMonitor, + config: config, + logMonitor: logMonitor, + claudeEnabled: claudeEnabled, + claudeTempDir: tempDir, + verbose: verbose, } } @@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) { // validatePrompt validates a single prompt test func (v *Validator) validatePrompt(prompt *PromptTest) { - // Already failed, skip + // Step 1: Simple/fast checks first + simpleCheckPassed := true + simpleCheckReason := "" + if prompt.Status == StatusFailed { - return + simpleCheckPassed = false + simpleCheckReason = prompt.ErrorMessage + } else if strings.TrimSpace(prompt.Response) == "" { + simpleCheckPassed = false + simpleCheckReason = "Response is empty" + } else if prompt.ResponseTokens < 1 { + simpleCheckPassed = false + simpleCheckReason = "Response has no tokens" } - // Check if response is empty - if strings.TrimSpace(prompt.Response) == "" { - prompt.Status = StatusFailed - prompt.ErrorMessage = "Response is empty" - return - } + // Step 2: Claude validation ALWAYS runs (regardless of simple check result) + if v.claudeEnabled { + claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason) - // Check token count - if prompt.ResponseTokens < 1 { - prompt.Status = StatusFailed - prompt.ErrorMessage = "Response has no tokens" - return + // Claude validation overrides everything + if claudeResult.Status == StatusFailed { + prompt.Status = StatusFailed + prompt.ErrorMessage = claudeResult.Reason + } else if claudeResult.Status == StatusPassed { + prompt.Status = StatusPassed + // Clear simple check error if Claude says it's OK + if prompt.ErrorMessage == simpleCheckReason { + prompt.ErrorMessage = "" + } + } + } else { + // If Claude not available, use simple check results + if !simpleCheckPassed { + prompt.Status = StatusFailed + 
prompt.ErrorMessage = simpleCheckReason + } } } @@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool { return false } +// ClaudeValidationResult represents Claude's validation result +type ClaudeValidationResult struct { + Status TestStatus + Reason string +} + +// validateWithClaude uses Claude headless mode to validate a prompt response +func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult { + if v.verbose { + fmt.Println(" 🤖 Running Claude AI validation...") + } + + // Create analysis prompt + var analysisPrompt strings.Builder + + analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n") + analysisPrompt.WriteString(fmt.Sprintf("Prompt: %s\n\n", prompt.Prompt)) + analysisPrompt.WriteString(fmt.Sprintf("Response: %s\n\n", prompt.Response)) + + if !simpleCheckPassed { + analysisPrompt.WriteString(fmt.Sprintf("Note: Basic validation failed: %s\n\n", simpleCheckReason)) + } + + analysisPrompt.WriteString(`Verify that the response: +1. Is relevant and responsive to the prompt +2. Is coherent and makes sense (not gibberish or garbled text) +3. Is in proper language (not error messages, binary data, or Unicode errors) +4. Appears to be from a working LLM model (not system errors or failures) +5. Has reasonable quality for a 4B parameter model + +Respond with ONLY one of these formats: +- "PASS" if the response is valid and acceptable +- "FAIL: " if the response has issues + +Be concise. 
One line only.`) + + // Write to temp file + promptFile := filepath.Join(v.claudeTempDir, fmt.Sprintf("prompt_%d.txt", os.Getpid())) + if err := os.WriteFile(promptFile, []byte(analysisPrompt.String()), 0644); err != nil { + fmt.Printf("Warning: Failed to write Claude prompt file: %v\n", err) + return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (file write error)"} + } + defer os.Remove(promptFile) + + // Run Claude headless (-p takes the prompt text itself, not a file path) + cmd := exec.Command("claude", "-p", analysisPrompt.String()) + output, err := cmd.CombinedOutput() + if err != nil { + fmt.Printf("Warning: Claude validation failed to run: %v\n", err) + return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (execution error)"} + } + + // Parse result + result := strings.TrimSpace(string(output)) + + if strings.HasPrefix(result, "PASS") { + if v.verbose { + fmt.Println(" ✓ Claude: Response is valid") + } + return ClaudeValidationResult{ + Status: StatusPassed, + Reason: "Claude validation: Response is valid and acceptable", + } + } else if strings.HasPrefix(result, "FAIL:") { + failReason := strings.TrimSpace(strings.TrimPrefix(result, "FAIL:")) + if v.verbose { + fmt.Printf(" ✗ Claude: %s\n", failReason) + } + return ClaudeValidationResult{ + Status: StatusFailed, + Reason: failReason, + } + } else { + // Unexpected format, treat as warning but pass + fmt.Printf("Warning: Unexpected Claude response format: %s\n", result) + return ClaudeValidationResult{ + Status: StatusPassed, + Reason: "Claude validation unclear, defaulting to pass", + } + } +} + // ValidateResponse validates a response against expected criteria func ValidateResponse(response string, minTokens, maxTokens int) error { tokens := estimateTokens(response) diff --git a/test/config/quick.yaml b/test/config/quick.yaml index 1c9c5a1c..317f3ce4 100644 --- a/test/config/quick.yaml +++ b/test/config/quick.yaml @@ -5,12 +5,12 @@ profiles: quick: timeout: 5m models: - - name: gemma2:2b + - name:
gemma3:4b prompts: - "Hello, respond with a brief greeting." min_response_tokens: 5 max_response_tokens: 100 - timeout: 30s + timeout: 60s validation: gpu_required: true