mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 08:17:03 +00:00
Prevent partial loading on mixed GPU brands
In mult-brand GPU setups, if we couldn't fully load the model we would fall through the scheduler and mistakenly try to load across a mix of brands. This makes sure we find the set of GPU(s) that best fit for the partial load.
This commit is contained in:
@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
|
||||
require.Empty(t, scenario1a.req.successCh)
|
||||
}
|
||||
|
||||
func TestHomogeneousGPUs(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
s := InitScheduler(ctx)
|
||||
|
||||
s.getGpuFn = func() gpu.GpuInfoList {
|
||||
// Set memory values to require the model to be spread
|
||||
gpus := []gpu.GpuInfo{
|
||||
{Library: "cuda"},
|
||||
{Library: "rocm"},
|
||||
}
|
||||
gpus[0].TotalMemory = 1 * format.GibiByte
|
||||
gpus[0].FreeMemory = 256 * format.MebiByte
|
||||
gpus[1].TotalMemory = 1 * format.GibiByte
|
||||
gpus[1].FreeMemory = 256 * format.MebiByte
|
||||
return gpus
|
||||
}
|
||||
s.getCpuFn = getCpuFn
|
||||
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
|
||||
s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
|
||||
require.Len(t, gpus, 1)
|
||||
return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
|
||||
}
|
||||
slog.Info("a")
|
||||
s.pendingReqCh <- a.req
|
||||
require.Len(t, s.pendingReqCh, 1)
|
||||
s.Run(ctx)
|
||||
select {
|
||||
case resp := <-a.req.successCh:
|
||||
require.Equal(t, resp.llama, a.srv)
|
||||
require.Empty(t, s.pendingReqCh)
|
||||
require.Empty(t, a.req.errCh)
|
||||
case err := <-a.req.errCh:
|
||||
t.Fatal(err.Error())
|
||||
case <-ctx.Done():
|
||||
t.Fatal("timeout")
|
||||
}
|
||||
}
|
||||
|
||||
type mockLlm struct {
|
||||
pingResp error
|
||||
waitResp error
|
||||
|
||||
Reference in New Issue
Block a user