Files
ollama37/cmd/test-runner/server.go
Shang Chieh Tseng d8ea75a3e2 Fix test-runner to inherit LD_LIBRARY_PATH for CUDA backend loading
The test-runner was starting the ollama server subprocess without inheriting
environment variables, causing the GGML CUDA backend to fail loading even
though LD_LIBRARY_PATH was set in the GitHub Actions workflow.

Changes:
- Added s.cmd.Env = os.Environ() to inherit all environment variables
- This ensures LD_LIBRARY_PATH is passed to the ollama server subprocess
- Fixes GPU offloading failure where layers were not being loaded to GPU

Root cause analysis from logs:
- GPUs were detected: Tesla K80 with 11.1 GiB available
- Server scheduled 35 layers for GPU offload
- But actual offload was 0/35 layers (all stayed on CPU)
- Runner subprocess couldn't find CUDA libraries without LD_LIBRARY_PATH

This fix ensures the runner subprocess can dynamically load libggml-cuda.so
by inheriting the CUDA library paths from the parent process.
2025-10-30 14:08:24 +08:00

172 lines
4.0 KiB
Go

package main
import (
"context"
"fmt"
"net/http"
"os"
"os/exec"
"path/filepath"
"time"
)
// Server manages the ollama server lifecycle
type Server struct {
config ServerConfig
ollamaBin string
logFile *os.File
cmd *exec.Cmd
baseURL string
}
// NewServer creates a new server manager
func NewServer(config ServerConfig, ollamaBin string) *Server {
baseURL := fmt.Sprintf("http://%s:%d", config.Host, config.Port)
return &Server{
config: config,
ollamaBin: ollamaBin,
baseURL: baseURL,
}
}
// Start starts the ollama server
func (s *Server) Start(ctx context.Context, logPath string) error {
// Create log file
logFile, err := os.Create(logPath)
if err != nil {
return fmt.Errorf("failed to create log file: %w", err)
}
s.logFile = logFile
// Resolve ollama binary path
binPath, err := filepath.Abs(s.ollamaBin)
if err != nil {
return fmt.Errorf("failed to resolve ollama binary path: %w", err)
}
// Check if binary exists
if _, err := os.Stat(binPath); err != nil {
return fmt.Errorf("ollama binary not found at %s: %w", binPath, err)
}
// Create command
s.cmd = exec.CommandContext(ctx, binPath, "serve")
s.cmd.Stdout = logFile
s.cmd.Stderr = logFile
// Set working directory to binary location
s.cmd.Dir = filepath.Dir(binPath)
// Inherit environment variables (including LD_LIBRARY_PATH for CUDA libraries)
s.cmd.Env = os.Environ()
// Start server
if err := s.cmd.Start(); err != nil {
logFile.Close()
return fmt.Errorf("failed to start ollama server: %w", err)
}
fmt.Printf("Started ollama server (PID: %d)\n", s.cmd.Process.Pid)
fmt.Printf("Server logs: %s\n", logPath)
// Wait for server to be ready
if err := s.WaitForReady(ctx); err != nil {
s.Stop()
return fmt.Errorf("server failed to become ready: %w", err)
}
fmt.Printf("Server is ready at %s\n", s.baseURL)
return nil
}
// WaitForReady waits for the server to be ready
func (s *Server) WaitForReady(ctx context.Context) error {
healthURL := s.baseURL + s.config.HealthCheckEndpoint
timeout := time.After(s.config.StartupTimeout)
ticker := time.NewTicker(s.config.HealthCheckInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-timeout:
return fmt.Errorf("timeout waiting for server to be ready")
case <-ticker.C:
req, err := http.NewRequestWithContext(ctx, "GET", healthURL, nil)
if err != nil {
continue
}
resp, err := http.DefaultClient.Do(req)
if err != nil {
continue
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
return nil
}
}
}
}
// Stop stops the ollama server
func (s *Server) Stop() error {
var errs []error
// Stop the process
if s.cmd != nil && s.cmd.Process != nil {
fmt.Printf("Stopping ollama server (PID: %d)\n", s.cmd.Process.Pid)
// Try graceful shutdown first
if err := s.cmd.Process.Signal(os.Interrupt); err != nil {
errs = append(errs, fmt.Errorf("failed to send interrupt signal: %w", err))
}
// Wait for process to exit (with timeout)
done := make(chan error, 1)
go func() {
done <- s.cmd.Wait()
}()
select {
case <-time.After(10 * time.Second):
// Force kill if graceful shutdown times out
if err := s.cmd.Process.Kill(); err != nil {
errs = append(errs, fmt.Errorf("failed to kill process: %w", err))
}
<-done // Wait for process to actually die
case err := <-done:
if err != nil && err.Error() != "signal: interrupt" {
errs = append(errs, fmt.Errorf("process exited with error: %w", err))
}
}
}
// Close log file
if s.logFile != nil {
if err := s.logFile.Close(); err != nil {
errs = append(errs, fmt.Errorf("failed to close log file: %w", err))
}
}
if len(errs) > 0 {
return fmt.Errorf("errors during shutdown: %v", errs)
}
fmt.Println("Server stopped successfully")
return nil
}
// BaseURL returns the server base URL
func (s *Server) BaseURL() string {
return s.baseURL
}
// IsRunning returns true if the server is running
func (s *Server) IsRunning() bool {
return s.cmd != nil && s.cmd.Process != nil
}