Fix llava models not working after first request (#4164)

* fix llava models not working after first request

* individual requests only for llava models
This commit is contained in:
Jeffrey Morgan
2024-05-05 20:50:31 -07:00
committed by GitHub
parent dfa2f32ca0
commit 1b0e6c9c0e
2 changed files with 32 additions and 1 deletion

View File

@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--numa")
}
// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
numParallel := envconfig.NumParallel
// TODO (jmorganca): multimodal models don't support parallel yet
// see https://github.com/ollama/ollama/issues/4165
if len(projectors) > 0 {
numParallel = 1
slog.Warn("multimodal models don't support parallel requests yet")
}
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
for i := 0; i < len(servers); i++ {