Add Claude AI-powered response validation and update test model

Changes: 1. Update quick test to use gemma3:4b (was gemma2:2b) - Increased timeout to 60s for larger model 2. Implement Claude headless validation (validate.go) - Hybrid approach: simple checks first, then Claude validation ALWAYS runs - Claude validates response quality, coherence, relevance - Detects gibberish, errors, and malformed responses - Falls back to simple validation if Claude CLI unavailable - Verbose logging shows Claude validation results 3. Validation flow: - Step 1: Fast checks (empty response, token count) - Step 2: Claude AI analysis (runs regardless of simple check) - Claude result overrides simple checks - If Claude unavailable, uses simple validation only 4. Workflow improvements: - Remove useless GPU memory check step (server already stopped) - Cleaner workflow output Benefits: - Intelligent response quality validation - Catches subtle issues (gibberish, off-topic responses) - Better than hardcoded pattern matching - Graceful degradation when Claude unavailable
2025-12-10 15:57:04 +00:00 · 2025-10-30 11:42:10 +08:00
parent d59284d30a
commit 4de7dd453b
4 changed files with 148 additions and 27 deletions
--- a/.github/workflows/tesla-k80-tests.yml
+++ b/.github/workflows/tesla-k80-tests.yml
@@ -85,9 +85,3 @@ jobs:
            test-report-full.md
            ollama.log
          retention-days: 14
-
-      - name: Check GPU memory usage
-        if: always()
-        run: |
-          echo "=== GPU Memory Status ==="
-          nvidia-smi --query-gpu=memory.used,memory.total --format=csv
--- a/cmd/test-runner/main.go
+++ b/cmd/test-runner/main.go
@@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke
 	// Run tests
 	startTime := time.Now()
 	tester := NewModelTester(server.BaseURL())
-	validator := NewValidator(config.Validation, monitor)
+	validator := NewValidator(config.Validation, monitor, verbose)

 	results := make([]TestResult, 0, len(profile.Models))

--- a/cmd/test-runner/validate.go
+++ b/cmd/test-runner/validate.go
@@ -2,6 +2,9 @@ package main

 import (
 	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
 	"strings"
 )

@@ -9,13 +12,36 @@ import (
 type Validator struct {
 	config           Validation
 	logMonitor       *LogMonitor
+	claudeEnabled    bool
+	claudeTempDir    string
+	verbose          bool
 }

 // NewValidator creates a new validator
-func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
+func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator {
+	// Check if Claude CLI is available
+	claudeEnabled := false
+	if _, err := exec.LookPath("claude"); err == nil {
+		claudeEnabled = true
+		if verbose {
+			fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled")
+		}
+	} else {
+		if verbose {
+			fmt.Println("⚠ Claude CLI not found - using basic validation only")
+		}
+	}
+
+	// Create temp directory for Claude analysis files
+	tempDir := filepath.Join(os.TempDir(), "test-runner-claude")
+	os.MkdirAll(tempDir, 0755)
+
 	return &Validator{
 		config:        config,
 		logMonitor:    logMonitor,
+		claudeEnabled: claudeEnabled,
+		claudeTempDir: tempDir,
+		verbose:       verbose,
 	}
 }

@@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) {

 // validatePrompt validates a single prompt test
 func (v *Validator) validatePrompt(prompt *PromptTest) {
-	// Already failed, skip
+	// Step 1: Simple/fast checks first
+	simpleCheckPassed := true
+	simpleCheckReason := ""
+
 	if prompt.Status == StatusFailed {
-		return
+		simpleCheckPassed = false
+		simpleCheckReason = prompt.ErrorMessage
+	} else if strings.TrimSpace(prompt.Response) == "" {
+		simpleCheckPassed = false
+		simpleCheckReason = "Response is empty"
+	} else if prompt.ResponseTokens < 1 {
+		simpleCheckPassed = false
+		simpleCheckReason = "Response has no tokens"
 	}

-	// Check if response is empty
-	if strings.TrimSpace(prompt.Response) == "" {
-		prompt.Status = StatusFailed
-		prompt.ErrorMessage = "Response is empty"
-		return
-	}
+	// Step 2: Claude validation ALWAYS runs (regardless of simple check result)
+	if v.claudeEnabled {
+		claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason)

-	// Check token count
-	if prompt.ResponseTokens < 1 {
+		// Claude validation overrides everything
+		if claudeResult.Status == StatusFailed {
 			prompt.Status = StatusFailed
-		prompt.ErrorMessage = "Response has no tokens"
-		return
+			prompt.ErrorMessage = claudeResult.Reason
+		} else if claudeResult.Status == StatusPassed {
+			prompt.Status = StatusPassed
+			// Clear simple check error if Claude says it's OK
+			if prompt.ErrorMessage == simpleCheckReason {
+				prompt.ErrorMessage = ""
+			}
+		}
+	} else {
+		// If Claude not available, use simple check results
+		if !simpleCheckPassed {
+			prompt.Status = StatusFailed
+			prompt.ErrorMessage = simpleCheckReason
+		}
 	}
 }

@@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool {
 	return false
 }

+// ClaudeValidationResult represents Claude's validation result
+type ClaudeValidationResult struct {
+	Status TestStatus
+	Reason string
+}
+
+// validateWithClaude uses Claude headless mode to validate a prompt response
+func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult {
+	if v.verbose {
+		fmt.Println("  🤖 Running Claude AI validation...")
+	}
+
+	// Create analysis prompt
+	var analysisPrompt strings.Builder
+
+	analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n")
+	analysisPrompt.WriteString(fmt.Sprintf("Prompt: %s\n\n", prompt.Prompt))
+	analysisPrompt.WriteString(fmt.Sprintf("Response: %s\n\n", prompt.Response))
+
+	if !simpleCheckPassed {
+		analysisPrompt.WriteString(fmt.Sprintf("Note: Basic validation failed: %s\n\n", simpleCheckReason))
+	}
+
+	analysisPrompt.WriteString(`Verify that the response:
+1. Is relevant and responsive to the prompt
+2. Is coherent and makes sense (not gibberish or garbled text)
+3. Is in proper language (not error messages, binary data, or Unicode errors)
+4. Appears to be from a working LLM model (not system errors or failures)
+5. Has reasonable quality for a 4B parameter model
+
+Respond with ONLY one of these formats:
+- "PASS" if the response is valid and acceptable
+- "FAIL: <brief reason>" if the response has issues
+
+Be concise. One line only.`)
+
+	// Write to temp file
+	promptFile := filepath.Join(v.claudeTempDir, fmt.Sprintf("prompt_%d.txt", os.Getpid()))
+	if err := os.WriteFile(promptFile, []byte(analysisPrompt.String()), 0644); err != nil {
+		fmt.Printf("Warning: Failed to write Claude prompt file: %v\n", err)
+		return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (file write error)"}
+	}
+	defer os.Remove(promptFile)
+
+	// Run Claude headless
+	cmd := exec.Command("claude", "-p", promptFile)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		fmt.Printf("Warning: Claude validation failed to run: %v\n", err)
+		return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (execution error)"}
+	}
+
+	// Parse result
+	result := strings.TrimSpace(string(output))
+
+	if strings.HasPrefix(result, "PASS") {
+		if v.verbose {
+			fmt.Println("  ✓ Claude: Response is valid")
+		}
+		return ClaudeValidationResult{
+			Status: StatusPassed,
+			Reason: "Claude validation: Response is valid and acceptable",
+		}
+	} else if strings.HasPrefix(result, "FAIL:") {
+		failReason := strings.TrimSpace(strings.TrimPrefix(result, "FAIL:"))
+		if v.verbose {
+			fmt.Printf("  ✗ Claude: %s\n", failReason)
+		}
+		return ClaudeValidationResult{
+			Status: StatusFailed,
+			Reason: failReason,
+		}
+	} else {
+		// Unexpected format, treat as warning but pass
+		fmt.Printf("Warning: Unexpected Claude response format: %s\n", result)
+		return ClaudeValidationResult{
+			Status: StatusPassed,
+			Reason: "Claude validation unclear, defaulting to pass",
+		}
+	}
+}
+
 // ValidateResponse validates a response against expected criteria
 func ValidateResponse(response string, minTokens, maxTokens int) error {
 	tokens := estimateTokens(response)
--- a/test/config/quick.yaml
+++ b/test/config/quick.yaml
@@ -5,12 +5,12 @@ profiles:
  quick:
    timeout: 5m
    models:
-      - name: gemma2:2b
+      - name: gemma3:4b
        prompts:
          - "Hello, respond with a brief greeting."
        min_response_tokens: 5
        max_response_tokens: 100
-        timeout: 30s
+        timeout: 60s

 validation:
  gpu_required: true