Add Claude AI-powered response validation and update test model

Changes:
1. Update quick test to use gemma3:4b (was gemma2:2b)
   - Increased timeout to 60s for larger model

2. Implement Claude headless validation (validate.go)
   - Hybrid approach: simple checks run first, then Claude validation runs whenever the Claude CLI is available, regardless of the simple-check result
   - Claude validates response quality, coherence, relevance
   - Detects gibberish, errors, and malformed responses
   - Falls back to simple validation if Claude CLI unavailable
   - Verbose logging shows Claude validation results

3. Validation flow:
   - Step 1: Fast checks (empty response, token count)
   - Step 2: Claude AI analysis (runs regardless of simple check)
   - Claude result overrides simple checks
   - If Claude unavailable, uses simple validation only

4. Workflow improvements:
   - Remove useless GPU memory check step (server already stopped)
   - Cleaner workflow output

Benefits:
- Intelligent response quality validation
- Catches subtle issues (gibberish, off-topic responses)
- Better than hardcoded pattern matching
- Graceful degradation when Claude unavailable
This commit is contained in:
Shang Chieh Tseng
2025-10-30 11:42:10 +08:00
parent d59284d30a
commit 4de7dd453b
4 changed files with 148 additions and 27 deletions

View File

@@ -85,9 +85,3 @@ jobs:
test-report-full.md
ollama.log
retention-days: 14
- name: Check GPU memory usage
if: always()
run: |
echo "=== GPU Memory Status ==="
nvidia-smi --query-gpu=memory.used,memory.total --format=csv

View File

@@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke
// Run tests
startTime := time.Now()
tester := NewModelTester(server.BaseURL())
validator := NewValidator(config.Validation, monitor)
validator := NewValidator(config.Validation, monitor, verbose)
results := make([]TestResult, 0, len(profile.Models))

View File

@@ -2,20 +2,46 @@ package main
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
)
// Validator validates test results against configuration
type Validator struct {
config Validation
logMonitor *LogMonitor
config Validation
logMonitor *LogMonitor
claudeEnabled bool
claudeTempDir string
verbose bool
}
// NewValidator creates a new validator
func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator {
// Check if Claude CLI is available
claudeEnabled := false
if _, err := exec.LookPath("claude"); err == nil {
claudeEnabled = true
if verbose {
fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled")
}
} else {
if verbose {
fmt.Println("⚠ Claude CLI not found - using basic validation only")
}
}
// Create temp directory for Claude analysis files
tempDir := filepath.Join(os.TempDir(), "test-runner-claude")
os.MkdirAll(tempDir, 0755)
return &Validator{
config: config,
logMonitor: logMonitor,
config: config,
logMonitor: logMonitor,
claudeEnabled: claudeEnabled,
claudeTempDir: tempDir,
verbose: verbose,
}
}
@@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) {
// validatePrompt validates a single prompt test
func (v *Validator) validatePrompt(prompt *PromptTest) {
// Already failed, skip
// Step 1: Simple/fast checks first
simpleCheckPassed := true
simpleCheckReason := ""
if prompt.Status == StatusFailed {
return
simpleCheckPassed = false
simpleCheckReason = prompt.ErrorMessage
} else if strings.TrimSpace(prompt.Response) == "" {
simpleCheckPassed = false
simpleCheckReason = "Response is empty"
} else if prompt.ResponseTokens < 1 {
simpleCheckPassed = false
simpleCheckReason = "Response has no tokens"
}
// Check if response is empty
if strings.TrimSpace(prompt.Response) == "" {
prompt.Status = StatusFailed
prompt.ErrorMessage = "Response is empty"
return
}
// Step 2: Claude validation ALWAYS runs (regardless of simple check result)
if v.claudeEnabled {
claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason)
// Check token count
if prompt.ResponseTokens < 1 {
prompt.Status = StatusFailed
prompt.ErrorMessage = "Response has no tokens"
return
// Claude validation overrides everything
if claudeResult.Status == StatusFailed {
prompt.Status = StatusFailed
prompt.ErrorMessage = claudeResult.Reason
} else if claudeResult.Status == StatusPassed {
prompt.Status = StatusPassed
// Clear simple check error if Claude says it's OK
if prompt.ErrorMessage == simpleCheckReason {
prompt.ErrorMessage = ""
}
}
} else {
// If Claude not available, use simple check results
if !simpleCheckPassed {
prompt.Status = StatusFailed
prompt.ErrorMessage = simpleCheckReason
}
}
}
@@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool {
return false
}
// ClaudeValidationResult represents Claude's validation result for a single
// prompt response, as produced by validateWithClaude.
type ClaudeValidationResult struct {
	// Status is the verdict. validateWithClaude sets it to StatusPassed or
	// StatusFailed, and validatePrompt copies it onto the prompt under test.
	Status TestStatus
	// Reason is a short human-readable explanation of the verdict. For a
	// failed verdict validatePrompt stores it as the prompt's error message.
	Reason string
}
// validateWithClaude uses Claude headless mode to validate a prompt response.
//
// It builds an analysis prompt from the test prompt, the model's response and,
// when the basic checks failed, the reason they failed. The prompt is sent to
// `claude -p` on stdin and the reply is interpreted as follows:
//
//   - a reply starting with "PASS" yields StatusPassed;
//   - a reply starting with "FAIL:" yields StatusFailed, with the text after
//     the prefix as Reason.
//
// When Claude gives no usable verdict (the CLI cannot be run, exits with an
// error, or replies in neither format) the result falls back to the outcome of
// the basic checks: StatusPassed if simpleCheckPassed is true, otherwise
// StatusFailed with simpleCheckReason. The caller lets a Claude pass override
// the basic checks, so an inconclusive run must not report a pass for a
// response that already failed them.
func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult {
	if v.verbose {
		fmt.Println(" 🤖 Running Claude AI validation...")
	}

	// inconclusive reports the basic-check outcome when Claude produced no
	// verdict; note is the Reason used when the basic checks passed.
	inconclusive := func(note string) ClaudeValidationResult {
		if !simpleCheckPassed {
			return ClaudeValidationResult{Status: StatusFailed, Reason: simpleCheckReason}
		}
		return ClaudeValidationResult{Status: StatusPassed, Reason: note}
	}

	// Create analysis prompt
	var analysisPrompt strings.Builder
	analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n")
	fmt.Fprintf(&analysisPrompt, "Prompt: %s\n\n", prompt.Prompt)
	fmt.Fprintf(&analysisPrompt, "Response: %s\n\n", prompt.Response)
	if !simpleCheckPassed {
		fmt.Fprintf(&analysisPrompt, "Note: Basic validation failed: %s\n\n", simpleCheckReason)
	}
	analysisPrompt.WriteString(`Verify that the response:
1. Is relevant and responsive to the prompt
2. Is coherent and makes sense (not gibberish or garbled text)
3. Is in proper language (not error messages, binary data, or Unicode errors)
4. Appears to be from a working LLM model (not system errors or failures)
5. Has reasonable quality for a 4B parameter model
Respond with ONLY one of these formats:
- "PASS" if the response is valid and acceptable
- "FAIL: <brief reason>" if the response has issues
Be concise. One line only.`)

	// Run Claude headless. The argument after -p is taken as the prompt text
	// itself, not as a file to read, so the analysis prompt is supplied on
	// stdin. That also keeps arbitrary model output (NUL bytes, leading
	// dashes, very long text) out of argv.
	cmd := exec.Command("claude", "-p")
	cmd.Stdin = strings.NewReader(analysisPrompt.String())
	// Only stdout carries the verdict; diagnostics on stderr must not be
	// mixed into the text that is prefix-matched below.
	output, err := cmd.Output()
	if err != nil {
		fmt.Printf("Warning: Claude validation failed to run: %v\n", err)
		return inconclusive("Claude validation skipped (execution error)")
	}

	// Parse result
	result := strings.TrimSpace(string(output))
	switch {
	case strings.HasPrefix(result, "PASS"):
		if v.verbose {
			fmt.Println(" ✓ Claude: Response is valid")
		}
		return ClaudeValidationResult{
			Status: StatusPassed,
			Reason: "Claude validation: Response is valid and acceptable",
		}
	case strings.HasPrefix(result, "FAIL:"):
		failReason := strings.TrimSpace(strings.TrimPrefix(result, "FAIL:"))
		if v.verbose {
			fmt.Printf(" ✗ Claude: %s\n", failReason)
		}
		return ClaudeValidationResult{
			Status: StatusFailed,
			Reason: failReason,
		}
	default:
		// Unexpected format: warn and defer to the basic checks.
		fmt.Printf("Warning: Unexpected Claude response format: %s\n", result)
		return inconclusive("Claude validation unclear, defaulting to pass")
	}
}
// ValidateResponse validates a response against expected criteria
func ValidateResponse(response string, minTokens, maxTokens int) error {
tokens := estimateTokens(response)

View File

@@ -5,12 +5,12 @@ profiles:
quick:
timeout: 5m
models:
- name: gemma2:2b
- name: gemma3:4b
prompts:
- "Hello, respond with a brief greeting."
min_response_tokens: 5
max_response_tokens: 100
timeout: 30s
timeout: 60s
validation:
gpu_required: true