mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Add Claude AI-powered response validation and update test model
Changes:
1. Update quick test to use gemma3:4b (was gemma2:2b)
   - Increased timeout to 60s for larger model
2. Implement Claude headless validation (validate.go)
   - Hybrid approach: simple checks first, then Claude validation ALWAYS runs
   - Claude validates response quality, coherence, relevance
   - Detects gibberish, errors, and malformed responses
   - Falls back to simple validation if Claude CLI unavailable
   - Verbose logging shows Claude validation results
3. Validation flow:
   - Step 1: Fast checks (empty response, token count)
   - Step 2: Claude AI analysis (runs regardless of simple check)
   - Claude result overrides simple checks
   - If Claude unavailable, uses simple validation only
4. Workflow improvements:
   - Remove useless GPU memory check step (server already stopped)
   - Cleaner workflow output

Benefits:
- Intelligent response quality validation
- Catches subtle issues (gibberish, off-topic responses)
- Better than hardcoded pattern matching
- Graceful degradation when Claude unavailable
This commit is contained in:
6
.github/workflows/tesla-k80-tests.yml
vendored
6
.github/workflows/tesla-k80-tests.yml
vendored
@@ -85,9 +85,3 @@ jobs:
|
|||||||
test-report-full.md
|
test-report-full.md
|
||||||
ollama.log
|
ollama.log
|
||||||
retention-days: 14
|
retention-days: 14
|
||||||
|
|
||||||
- name: Check GPU memory usage
|
|
||||||
if: always()
|
|
||||||
run: |
|
|
||||||
echo "=== GPU Memory Status ==="
|
|
||||||
nvidia-smi --query-gpu=memory.used,memory.total --format=csv
|
|
||||||
|
|||||||
@@ -148,7 +148,7 @@ func runTests(configPath, profileName, ollamaBin, outputPath string, verbose, ke
|
|||||||
// Run tests
|
// Run tests
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
tester := NewModelTester(server.BaseURL())
|
tester := NewModelTester(server.BaseURL())
|
||||||
validator := NewValidator(config.Validation, monitor)
|
validator := NewValidator(config.Validation, monitor, verbose)
|
||||||
|
|
||||||
results := make([]TestResult, 0, len(profile.Models))
|
results := make([]TestResult, 0, len(profile.Models))
|
||||||
|
|
||||||
|
|||||||
@@ -2,20 +2,46 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Validator validates test results against configuration
|
// Validator validates test results against configuration
|
||||||
type Validator struct {
|
type Validator struct {
|
||||||
config Validation
|
config Validation
|
||||||
logMonitor *LogMonitor
|
logMonitor *LogMonitor
|
||||||
|
claudeEnabled bool
|
||||||
|
claudeTempDir string
|
||||||
|
verbose bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewValidator creates a new validator
|
// NewValidator creates a new validator
|
||||||
func NewValidator(config Validation, logMonitor *LogMonitor) *Validator {
|
func NewValidator(config Validation, logMonitor *LogMonitor, verbose bool) *Validator {
|
||||||
|
// Check if Claude CLI is available
|
||||||
|
claudeEnabled := false
|
||||||
|
if _, err := exec.LookPath("claude"); err == nil {
|
||||||
|
claudeEnabled = true
|
||||||
|
if verbose {
|
||||||
|
fmt.Println("✓ Claude CLI detected - AI-powered response validation enabled")
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if verbose {
|
||||||
|
fmt.Println("⚠ Claude CLI not found - using basic validation only")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create temp directory for Claude analysis files
|
||||||
|
tempDir := filepath.Join(os.TempDir(), "test-runner-claude")
|
||||||
|
os.MkdirAll(tempDir, 0755)
|
||||||
|
|
||||||
return &Validator{
|
return &Validator{
|
||||||
config: config,
|
config: config,
|
||||||
logMonitor: logMonitor,
|
logMonitor: logMonitor,
|
||||||
|
claudeEnabled: claudeEnabled,
|
||||||
|
claudeTempDir: tempDir,
|
||||||
|
verbose: verbose,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -34,23 +60,42 @@ func (v *Validator) ValidateResult(result *TestResult) {
|
|||||||
|
|
||||||
// validatePrompt validates a single prompt test
|
// validatePrompt validates a single prompt test
|
||||||
func (v *Validator) validatePrompt(prompt *PromptTest) {
|
func (v *Validator) validatePrompt(prompt *PromptTest) {
|
||||||
// Already failed, skip
|
// Step 1: Simple/fast checks first
|
||||||
|
simpleCheckPassed := true
|
||||||
|
simpleCheckReason := ""
|
||||||
|
|
||||||
if prompt.Status == StatusFailed {
|
if prompt.Status == StatusFailed {
|
||||||
return
|
simpleCheckPassed = false
|
||||||
|
simpleCheckReason = prompt.ErrorMessage
|
||||||
|
} else if strings.TrimSpace(prompt.Response) == "" {
|
||||||
|
simpleCheckPassed = false
|
||||||
|
simpleCheckReason = "Response is empty"
|
||||||
|
} else if prompt.ResponseTokens < 1 {
|
||||||
|
simpleCheckPassed = false
|
||||||
|
simpleCheckReason = "Response has no tokens"
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if response is empty
|
// Step 2: Claude validation ALWAYS runs (regardless of simple check result)
|
||||||
if strings.TrimSpace(prompt.Response) == "" {
|
if v.claudeEnabled {
|
||||||
prompt.Status = StatusFailed
|
claudeResult := v.validateWithClaude(prompt, simpleCheckPassed, simpleCheckReason)
|
||||||
prompt.ErrorMessage = "Response is empty"
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check token count
|
// Claude validation overrides everything
|
||||||
if prompt.ResponseTokens < 1 {
|
if claudeResult.Status == StatusFailed {
|
||||||
prompt.Status = StatusFailed
|
prompt.Status = StatusFailed
|
||||||
prompt.ErrorMessage = "Response has no tokens"
|
prompt.ErrorMessage = claudeResult.Reason
|
||||||
return
|
} else if claudeResult.Status == StatusPassed {
|
||||||
|
prompt.Status = StatusPassed
|
||||||
|
// Clear simple check error if Claude says it's OK
|
||||||
|
if prompt.ErrorMessage == simpleCheckReason {
|
||||||
|
prompt.ErrorMessage = ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// If Claude not available, use simple check results
|
||||||
|
if !simpleCheckPassed {
|
||||||
|
prompt.Status = StatusFailed
|
||||||
|
prompt.ErrorMessage = simpleCheckReason
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -148,6 +193,88 @@ func (v *Validator) hasCPUFallback() bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ClaudeValidationResult represents Claude's validation result
|
||||||
|
type ClaudeValidationResult struct {
|
||||||
|
Status TestStatus
|
||||||
|
Reason string
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateWithClaude uses Claude headless mode to validate a prompt response
|
||||||
|
func (v *Validator) validateWithClaude(prompt *PromptTest, simpleCheckPassed bool, simpleCheckReason string) ClaudeValidationResult {
|
||||||
|
if v.verbose {
|
||||||
|
fmt.Println(" 🤖 Running Claude AI validation...")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create analysis prompt
|
||||||
|
var analysisPrompt strings.Builder
|
||||||
|
|
||||||
|
analysisPrompt.WriteString("Analyze this LLM response from a Tesla K80 GPU test.\n\n")
|
||||||
|
analysisPrompt.WriteString(fmt.Sprintf("Prompt: %s\n\n", prompt.Prompt))
|
||||||
|
analysisPrompt.WriteString(fmt.Sprintf("Response: %s\n\n", prompt.Response))
|
||||||
|
|
||||||
|
if !simpleCheckPassed {
|
||||||
|
analysisPrompt.WriteString(fmt.Sprintf("Note: Basic validation failed: %s\n\n", simpleCheckReason))
|
||||||
|
}
|
||||||
|
|
||||||
|
analysisPrompt.WriteString(`Verify that the response:
|
||||||
|
1. Is relevant and responsive to the prompt
|
||||||
|
2. Is coherent and makes sense (not gibberish or garbled text)
|
||||||
|
3. Is in proper language (not error messages, binary data, or Unicode errors)
|
||||||
|
4. Appears to be from a working LLM model (not system errors or failures)
|
||||||
|
5. Has reasonable quality for a 4B parameter model
|
||||||
|
|
||||||
|
Respond with ONLY one of these formats:
|
||||||
|
- "PASS" if the response is valid and acceptable
|
||||||
|
- "FAIL: <brief reason>" if the response has issues
|
||||||
|
|
||||||
|
Be concise. One line only.`)
|
||||||
|
|
||||||
|
// Write to temp file
|
||||||
|
promptFile := filepath.Join(v.claudeTempDir, fmt.Sprintf("prompt_%d.txt", os.Getpid()))
|
||||||
|
if err := os.WriteFile(promptFile, []byte(analysisPrompt.String()), 0644); err != nil {
|
||||||
|
fmt.Printf("Warning: Failed to write Claude prompt file: %v\n", err)
|
||||||
|
return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (file write error)"}
|
||||||
|
}
|
||||||
|
defer os.Remove(promptFile)
|
||||||
|
|
||||||
|
// Run Claude headless
|
||||||
|
cmd := exec.Command("claude", "-p", promptFile)
|
||||||
|
output, err := cmd.CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Printf("Warning: Claude validation failed to run: %v\n", err)
|
||||||
|
return ClaudeValidationResult{Status: StatusPassed, Reason: "Claude validation skipped (execution error)"}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse result
|
||||||
|
result := strings.TrimSpace(string(output))
|
||||||
|
|
||||||
|
if strings.HasPrefix(result, "PASS") {
|
||||||
|
if v.verbose {
|
||||||
|
fmt.Println(" ✓ Claude: Response is valid")
|
||||||
|
}
|
||||||
|
return ClaudeValidationResult{
|
||||||
|
Status: StatusPassed,
|
||||||
|
Reason: "Claude validation: Response is valid and acceptable",
|
||||||
|
}
|
||||||
|
} else if strings.HasPrefix(result, "FAIL:") {
|
||||||
|
failReason := strings.TrimSpace(strings.TrimPrefix(result, "FAIL:"))
|
||||||
|
if v.verbose {
|
||||||
|
fmt.Printf(" ✗ Claude: %s\n", failReason)
|
||||||
|
}
|
||||||
|
return ClaudeValidationResult{
|
||||||
|
Status: StatusFailed,
|
||||||
|
Reason: failReason,
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Unexpected format, treat as warning but pass
|
||||||
|
fmt.Printf("Warning: Unexpected Claude response format: %s\n", result)
|
||||||
|
return ClaudeValidationResult{
|
||||||
|
Status: StatusPassed,
|
||||||
|
Reason: "Claude validation unclear, defaulting to pass",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ValidateResponse validates a response against expected criteria
|
// ValidateResponse validates a response against expected criteria
|
||||||
func ValidateResponse(response string, minTokens, maxTokens int) error {
|
func ValidateResponse(response string, minTokens, maxTokens int) error {
|
||||||
tokens := estimateTokens(response)
|
tokens := estimateTokens(response)
|
||||||
|
|||||||
@@ -5,12 +5,12 @@ profiles:
|
|||||||
quick:
|
quick:
|
||||||
timeout: 5m
|
timeout: 5m
|
||||||
models:
|
models:
|
||||||
- name: gemma2:2b
|
- name: gemma3:4b
|
||||||
prompts:
|
prompts:
|
||||||
- "Hello, respond with a brief greeting."
|
- "Hello, respond with a brief greeting."
|
||||||
min_response_tokens: 5
|
min_response_tokens: 5
|
||||||
max_response_tokens: 100
|
max_response_tokens: 100
|
||||||
timeout: 30s
|
timeout: 60s
|
||||||
|
|
||||||
validation:
|
validation:
|
||||||
gpu_required: true
|
gpu_required: true
|
||||||
|
|||||||
Reference in New Issue
Block a user