mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Add Claude AI-powered response validation and update test model
Changes:

1. Update quick test to use gemma3:4b (was gemma2:2b)
   - Increased timeout to 60s for larger model
2. Implement Claude headless validation (validate.go)
   - Hybrid approach: simple checks first, then Claude validation ALWAYS runs
   - Claude validates response quality, coherence, relevance
   - Detects gibberish, errors, and malformed responses
   - Falls back to simple validation if Claude CLI unavailable
   - Verbose logging shows Claude validation results
3. Validation flow:
   - Step 1: Fast checks (empty response, token count)
   - Step 2: Claude AI analysis (runs regardless of simple check)
   - Claude result overrides simple checks
   - If Claude unavailable, uses simple validation only
4. Workflow improvements:
   - Remove useless GPU memory check step (server already stopped)
   - Cleaner workflow output

Benefits:

- Intelligent response quality validation
- Catches subtle issues (gibberish, off-topic responses)
- Better than hardcoded pattern matching
- Graceful degradation when Claude unavailable
This commit is contained in:
6
.github/workflows/tesla-k80-tests.yml
vendored
6
.github/workflows/tesla-k80-tests.yml
vendored
@@ -85,9 +85,3 @@ jobs:
|
||||
test-report-full.md
|
||||
ollama.log
|
||||
retention-days: 14
|
||||
|
||||
- name: Check GPU memory usage
|
||||
if: always()
|
||||
run: |
|
||||
echo "=== GPU Memory Status ==="
|
||||
nvidia-smi --query-gpu=memory.used,memory.total --format=csv
|
||||
|
||||
@@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke
|
||||
// Run tests
|
||||
startTime := time.Now()
|
||||
tester := NewModelTester(server.BaseURL())
|
||||
validator := NewValidator(config.Validation, monitor)
|
||||
validator := NewValidator(config.Validation, monitor, verbose)
|
||||
|
||||
results := make([]TestResult, 0, len(profile.Models))
|
||||
|
||||
|
||||
@@ -2,6 +2,9 @@ package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
@@ -9,13 +12,36 @@ import (
|
||||
type Validator struct {
|
||||
config Validation
|
||||
logMonitor *LogMonitor
|
||||
claudeEnabled bool
|
||||
claudeTempDir string
|
||||
verbose bool
|
||||
}
|
||||
|
||||
// NewValidator creates a new validator
|
||||
func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
|
||||
func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator {
|
||||
// Check if Claude CLI is available
|
||||
claudeEnabled := false
|
||||
if _, err := exec.LookPath("claude"); err == nil {
|
||||
claudeEnabled = true
|
||||
if verbose {
|
||||
fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled")
|
||||
}
|
||||
} else {
|
||||
if verbose {
|
||||
fmt.Println("⚠ Claude CLI not found - using basic validation only")
|
||||
}
|
||||
}
|
||||
|
||||
// Create temp directory for Claude analysis files
|
||||
tempDir := filepath.Join(os.TempDir(), "test-runner-claude")
|
||||
os.MkdirAll(tempDir, 0755)
|
||||
|
||||
return &Validator{
|
||||
config: config,
|
||||
logMonitor: logMonitor,
|
||||
claudeEnabled: claudeEnabled,
|
||||
claudeTempDir: tempDir,
|
||||
verbose: verbose,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) {
|
||||
|
||||
// validatePrompt validates a single prompt test
|
||||
func (v *Validator) validatePrompt(prompt *PromptTest) {
|
||||
// Already failed, skip
|
||||
// Step 1: Simple/fast checks first
|
||||
simpleCheckPassed := true
|
||||
simpleCheckReason := ""
|
||||
|
||||
if prompt.Status == StatusFailed {
|
||||
return
|
||||
simpleCheckPassed = false
|
||||
simpleCheckReason = prompt.ErrorMessage
|
||||
} else if strings.TrimSpace(prompt.Response) == "" {
|
||||
simpleCheckPassed = false
|
||||
simpleCheckReason = "Response is empty"
|
||||
} else if prompt.ResponseTokens < 1 {
|
||||
simpleCheckPassed = false
|
||||
simpleCheckReason = "Response has no tokens"
|
||||
}
|
||||
|
||||
// Check if response is empty
|
||||
if strings.TrimSpace(prompt.Response) == "" {
|
||||
prompt.Status = StatusFailed
|
||||
prompt.ErrorMessage = "Response is empty"
|
||||
return
|
||||
}
|
||||
// Step 2: Claude validation ALWAYS runs (regardless of simple check result)
|
||||
if v.claudeEnabled {
|
||||
claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason)
|
||||
|
||||
// Check token count
|
||||
if prompt.ResponseTokens < 1 {
|
||||
// Claude validation overrides everything
|
||||
if claudeResult.Status == StatusFailed {
|
||||
prompt.Status = StatusFailed
|
||||
prompt.ErrorMessage = "Response has no tokens"
|
||||
return
|
||||
prompt.ErrorMessage = claudeResult.Reason
|
||||
} else if claudeResult.Status == StatusPassed {
|
||||
prompt.Status = StatusPassed
|
||||
// Clear simple check error if Claude says it's OK
|
||||
if prompt.ErrorMessage == simpleCheckReason {
|
||||
prompt.ErrorMessage = ""
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If Claude not available, use simple check results
|
||||
if !simpleCheckPassed {
|
||||
prompt.Status = StatusFailed
|
||||
prompt.ErrorMessage = simpleCheckReason
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// ClaudeValidationResult represents Claude's validation result
|
||||
type ClaudeValidationResult struct {
|
||||
Status TestStatus
|
||||
Reason string
|
||||
}
|
||||
|
||||
// validateWithClaude uses Claude headless mode to validate a prompt response
|
||||
func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult {
|
||||
if v.verbose {
|
||||
fmt.Println(" 🤖 Running Claude AI validation...")
|
||||
}
|
||||
|
||||
// Create analysis prompt
|
||||
var analysisPrompt strings.Builder
|
||||
|
||||
analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n")
|
||||
analysisPrompt.WriteString(fmt.Sprintf("Prompt: %s\n\n", prompt.Prompt))
|
||||
analysisPrompt.WriteString(fmt.Sprintf("Response: %s\n\n", prompt.Response))
|
||||
|
||||
if !simpleCheckPassed {
|
||||
analysisPrompt.WriteString(fmt.Sprintf("Note: Basic validation failed: %s\n\n", simpleCheckReason))
|
||||
}
|
||||
|
||||
analysisPrompt.WriteString(`Verify that the response:
|
||||
1. Is relevant and responsive to the prompt
|
||||
2. Is coherent and makes sense (not gibberish or garbled text)
|
||||
3. Is in proper language (not error messages, binary data, or Unicode errors)
|
||||
4. Appears to be from a working LLM model (not system errors or failures)
|
||||
5. Has reasonable quality for a 4B parameter model
|
||||
|
||||
Respond with ONLY one of these formats:
|
||||
- "PASS" if the response is valid and acceptable
|
||||
- "FAIL: <brief reason>" if the response has issues
|
||||
|
||||
Be concise. One line only.`)
|
||||
|
||||
// Write to temp file
|
||||
promptFile := filepath.Join(v.claudeTempDir, fmt.Sprintf("prompt_%d.txt", os.Getpid()))
|
||||
if err := os.WriteFile(promptFile, []byte(analysisPrompt.String()), 0644); err != nil {
|
||||
fmt.Printf("Warning: Failed to write Claude prompt file: %v\n", err)
|
||||
return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (file write error)"}
|
||||
}
|
||||
defer os.Remove(promptFile)
|
||||
|
||||
// Run Claude headless
|
||||
cmd := exec.Command("claude", "-p", promptFile)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
fmt.Printf("Warning: Claude validation failed to run: %v\n", err)
|
||||
return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (execution error)"}
|
||||
}
|
||||
|
||||
// Parse result
|
||||
result := strings.TrimSpace(string(output))
|
||||
|
||||
if strings.HasPrefix(result, "PASS") {
|
||||
if v.verbose {
|
||||
fmt.Println(" ✓ Claude: Response is valid")
|
||||
}
|
||||
return ClaudeValidationResult{
|
||||
Status: StatusPassed,
|
||||
Reason: "Claude validation: Response is valid and acceptable",
|
||||
}
|
||||
} else if strings.HasPrefix(result, "FAIL:") {
|
||||
failReason := strings.TrimSpace(strings.TrimPrefix(result, "FAIL:"))
|
||||
if v.verbose {
|
||||
fmt.Printf(" ✗ Claude: %s\n", failReason)
|
||||
}
|
||||
return ClaudeValidationResult{
|
||||
Status: StatusFailed,
|
||||
Reason: failReason,
|
||||
}
|
||||
} else {
|
||||
// Unexpected format, treat as warning but pass
|
||||
fmt.Printf("Warning: Unexpected Claude response format: %s\n", result)
|
||||
return ClaudeValidationResult{
|
||||
Status: StatusPassed,
|
||||
Reason: "Claude validation unclear, defaulting to pass",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ValidateResponse validates a response against expected criteria
|
||||
func ValidateResponse(response string, minTokens, maxTokens int) error {
|
||||
tokens := estimateTokens(response)
|
||||
|
||||
@@ -5,12 +5,12 @@ profiles:
|
||||
quick:
|
||||
timeout: 5m
|
||||
models:
|
||||
- name: gemma2:2b
|
||||
- name: gemma3:4b
|
||||
prompts:
|
||||
- "Hello, respond with a brief greeting."
|
||||
min_response_tokens: 5
|
||||
max_response_tokens: 100
|
||||
timeout: 30s
|
||||
timeout: 60s
|
||||
|
||||
validation:
|
||||
gpu_required: true
|
||||
|
||||
Reference in New Issue
Block a user