Add Claude AI-powered response validation and update test model

Changes:

1. Update quick test to use gemma3:4b (was gemma2:2b)
   - Increased timeout to 60s for the larger model

2. Implement Claude headless validation (validate.go)
   - Hybrid approach: simple checks first, then Claude validation ALWAYS runs
   - Claude validates response quality, coherence, and relevance
   - Detects gibberish, errors, and malformed responses
   - Falls back to simple validation if the Claude CLI is unavailable
   - Verbose logging shows Claude validation results

3. Validation flow:
   - Step 1: Fast checks (empty response, token count)
   - Step 2: Claude AI analysis (runs regardless of the simple check result)
   - Claude's result overrides the simple checks
   - If Claude is unavailable, only simple validation is used

4. Workflow improvements:
   - Remove useless GPU memory check step (server already stopped)
   - Cleaner workflow output

Benefits:
- Intelligent response quality validation
- Catches subtle issues (gibberish, off-topic responses)
- Better than hardcoded pattern matching
- Graceful degradation when Claude is unavailable
2025-12-10 15:57:04 +00:00 · 2025-10-30 11:42:10 +08:00
parent d59284d30a
commit 4de7dd453b
4 changed files with 148 additions and 27 deletions
--- a/.github/workflows/tesla-k80-tests.yml
+++ b/.github/workflows/tesla-k80-tests.yml
@@ -85,9 +85,3 @@ jobs:
            test-report-full.md
            ollama.log
          retention-days: 14
      - name: Check GPU memory usage
        if: always()
        run: |
          echo "=== GPU Memory Status ==="
          nvidia-smi --query-gpu=memory.used,memory.total --format=csv
--- a/cmd/test-runner/main.go
+++ b/cmd/test-runner/main.go
@@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke
 	// Run tests
 	startTime := time.Now()
 	tester := NewModelTester(server.BaseURL())
-	validator := NewValidator(config.Validation, monitor)
+	validator := NewValidator(config.Validation, monitor, verbose)
 	results := make([]TestResult, 0, len(profile.Models))
--- a/cmd/test-runner/validate.go
+++ b/cmd/test-runner/validate.go
@@ -2,20 +2,46 @@ package main
 import (
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
 )
 // Validator validates test results against configuration
 type Validator struct {
-	config     Validation
+	config           Validation
-	logMonitor *LogMonitor
+	logMonitor       *LogMonitor
 	claudeEnabled    bool
 	claudeTempDir    string
 	verbose          bool
 }
 // NewValidator creates a new validator
-func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
+func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator {
 	// Check if Claude CLI is available
 	claudeEnabled := false
 	if _, err := exec.LookPath("claude"); err == nil {
 		claudeEnabled = true
 		if verbose {
 			fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled")
 		}
 	} else {
 		if verbose {
 			fmt.Println("⚠ Claude CLI not found - using basic validation only")
 		}
 	}
 	// Create temp directory for Claude analysis files
 	tempDir := filepath.Join(os.TempDir(), "test-runner-claude")
 	os.MkdirAll(tempDir, 0755)
 	return &Validator{
-		config:     config,
+		config:        config,
-		logMonitor: logMonitor,
+		logMonitor:    logMonitor,
 		claudeEnabled: claudeEnabled,
 		claudeTempDir: tempDir,
 		verbose:       verbose,
 	}
 }
@@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) {
 // validatePrompt validates a single prompt test
 func (v *Validator) validatePrompt(prompt *PromptTest) {
-	// Already failed, skip
+	// Step 1: Simple/fast checks first
 	simpleCheckPassed := true
 	simpleCheckReason := ""
 	if prompt.Status == StatusFailed {
-		return
+		simpleCheckPassed = false
 		simpleCheckReason = prompt.ErrorMessage
 	} else if strings.TrimSpace(prompt.Response) == "" {
 		simpleCheckPassed = false
 		simpleCheckReason = "Response is empty"
 	} else if prompt.ResponseTokens < 1 {
 		simpleCheckPassed = false
 		simpleCheckReason = "Response has no tokens"
 	}
-	// Check if response is empty
+	// Step 2: Claude validation ALWAYS runs (regardless of simple check result)
-	if strings.TrimSpace(prompt.Response) == "" {
+	if v.claudeEnabled {
-		prompt.Status = StatusFailed
+		claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason)
 		prompt.ErrorMessage = "Response is empty"
 		return
 	}
-	// Check token count
+		// Claude validation overrides everything
-	if prompt.ResponseTokens < 1 {
+		if claudeResult.Status == StatusFailed {
-		prompt.Status = StatusFailed
+			prompt.Status = StatusFailed
-		prompt.ErrorMessage = "Response has no tokens"
+			prompt.ErrorMessage = claudeResult.Reason
-		return
+		} else if claudeResult.Status == StatusPassed {
 			prompt.Status = StatusPassed
 			// Clear simple check error if Claude says it's OK
 			if prompt.ErrorMessage == simpleCheckReason {
 				prompt.ErrorMessage = ""
 			}
 		}
 	} else {
 		// If Claude not available, use simple check results
 		if !simpleCheckPassed {
 			prompt.Status = StatusFailed
 			prompt.ErrorMessage = simpleCheckReason
 		}
 	}
 }
@@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool {
 	return false
 }
 // ClaudeValidationResult is the verdict produced by validateWithClaude for a
 // single prompt/response pair.
 type ClaudeValidationResult struct {
 	// Status is the verdict; validateWithClaude sets it to StatusPassed or
 	// StatusFailed.
 	Status TestStatus
 	// Reason is a human-readable explanation: the failure reason reported by
 	// Claude, or a short note describing how the verdict was reached.
 	Reason string
 }
 // validateWithClaude asks the Claude CLI in headless mode (`claude -p`) to
 // judge whether a prompt's response looks like valid LLM output.
 //
 // Parameters:
 //   - prompt: the test whose Prompt and Response are sent for analysis.
 //   - simpleCheckPassed, simpleCheckReason: outcome of the caller's basic
 //     checks. A failure is mentioned to Claude, and it is also the verdict
 //     used whenever Claude cannot provide one.
 //
 // Returns StatusPassed when Claude's output starts with "PASS" and
 // StatusFailed (with Claude's reason) when it starts with "FAIL". If the CLI
 // cannot be run or answers in an unexpected format, the result mirrors the
 // simple check instead of unconditionally passing, so a response that already
 // failed basic validation is never turned into a pass by a Claude outage.
 //
 // NOTE(review): the command runs without a timeout; a hung CLI blocks the
 // test run. Consider exec.CommandContext with a deadline.
 func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult {
 	if v.verbose {
 		fmt.Println("  🤖 Running Claude AI validation...")
 	}
 	// fallback is the verdict used when Claude gives no usable answer: keep
 	// the simple-check failure if there was one, otherwise pass with a note.
 	fallback := func(note string) ClaudeValidationResult {
 		if !simpleCheckPassed {
 			return ClaudeValidationResult{Status: StatusFailed, Reason: simpleCheckReason}
 		}
 		return ClaudeValidationResult{Status: StatusPassed, Reason: note}
 	}
 	// Create analysis prompt
 	var analysisPrompt strings.Builder
 	analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n")
 	fmt.Fprintf(&analysisPrompt, "Prompt: %s\n\n", prompt.Prompt)
 	fmt.Fprintf(&analysisPrompt, "Response: %s\n\n", prompt.Response)
 	if !simpleCheckPassed {
 		fmt.Fprintf(&analysisPrompt, "Note: Basic validation failed: %s\n\n", simpleCheckReason)
 	}
 	analysisPrompt.WriteString(`Verify that the response:
 1. Is relevant and responsive to the prompt
 2. Is coherent and makes sense (not gibberish or garbled text)
 3. Is in proper language (not error messages, binary data, or Unicode errors)
 4. Appears to be from a working LLM model (not system errors or failures)
 5. Has reasonable quality for a 4B parameter model
 Respond with ONLY one of these formats:
 - "PASS" if the response is valid and acceptable
 - "FAIL: <brief reason>" if the response has issues
 Be concise. One line only.`)
 	// Run Claude headless. The analysis text is piped on stdin: the argument
 	// to -p is the prompt itself, so passing a file path there would make
 	// Claude analyze the path string rather than the file's contents. Stdin
 	// also avoids argument-length limits and needs no temp file.
 	cmd := exec.Command("claude", "-p", "Follow the validation instructions provided on standard input.")
 	cmd.Stdin = strings.NewReader(analysisPrompt.String())
 	// Keep stderr separate so CLI warnings cannot corrupt the verdict line.
 	var stderr strings.Builder
 	cmd.Stderr = &stderr
 	output, err := cmd.Output()
 	if err != nil {
 		fmt.Printf("Warning: Claude validation failed to run: %v\n", err)
 		if msg := strings.TrimSpace(stderr.String()); msg != "" {
 			fmt.Printf("Warning: Claude stderr: %s\n", msg)
 		}
 		return fallback("Claude validation skipped (execution error)")
 	}
 	// Parse result
 	verdict := strings.TrimSpace(string(output))
 	switch {
 	case strings.HasPrefix(verdict, "PASS"):
 		if v.verbose {
 			fmt.Println("  ✓ Claude: Response is valid")
 		}
 		return ClaudeValidationResult{
 			Status: StatusPassed,
 			Reason: "Claude validation: Response is valid and acceptable",
 		}
 	case strings.HasPrefix(verdict, "FAIL"):
 		// Accept both "FAIL: reason" and a bare "FAIL"; either is a failure.
 		failReason := strings.TrimSpace(strings.TrimPrefix(strings.TrimPrefix(verdict, "FAIL"), ":"))
 		if failReason == "" {
 			failReason = "Claude validation failed without giving a reason"
 		}
 		if v.verbose {
 			fmt.Printf("  ✗ Claude: %s\n", failReason)
 		}
 		return ClaudeValidationResult{
 			Status: StatusFailed,
 			Reason: failReason,
 		}
 	default:
 		// Unexpected format: warn and defer to the simple-check outcome.
 		fmt.Printf("Warning: Unexpected Claude response format: %s\n", verdict)
 		return fallback("Claude validation unclear, defaulting to pass")
 	}
 }
 // ValidateResponse validates a response against expected criteria
 func ValidateResponse(response string, minTokens, maxTokens int) error {
 	tokens := estimateTokens(response)
--- a/test/config/quick.yaml
+++ b/test/config/quick.yaml
@@ -5,12 +5,12 @@ profiles:
  quick:
    timeout: 5m
    models:
-      - name: gemma2:2b
+      - name: gemma3:4b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
-        timeout: 30s
+        timeout: 60s
 validation:
  gpu_required: true