Prevent partial loading on mixed GPU brands

In mult-brand GPU setups, if we couldn't fully load the model we
would fall through the scheduler and mistakenly try to load across
a mix of brands.  This makes sure we find the set of GPU(s) that
best fit for the partial load.
This commit is contained in:
Daniel Hiltgen
2024-07-22 11:57:26 -07:00
parent 0be8baad2b
commit 345420998e
2 changed files with 66 additions and 4 deletions

View File

@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh)
}
func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.getGpuFn = func() gpu.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []gpu.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
}
type mockLlm struct {
pingResp error
waitResp error