Fix llava models not working after first request (#4164)

* fix llava models not working after first request

* individual requests only for llava models
This commit is contained in:
Jeffrey Morgan
2024-05-05 20:50:31 -07:00
committed by GitHub
parent dfa2f32ca0
commit 1b0e6c9c0e
2 changed files with 32 additions and 1 deletion

View File

@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(projectors) > 0 {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
for i := 0; i < len(servers); i++ {