mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 00:07:07 +00:00
review comments and coverage
This commit is contained in:
@@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
|
||||
// We want to avoid loading on any GPUs that have other
|
||||
// models still loading on them to avoid potential races
|
||||
// with VRAM consumption ramping up during load
|
||||
availGpus := s.filterGPUsWithLoadingModels(gpus)
|
||||
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
|
||||
|
||||
// Update free memory from currently loaded models
|
||||
s.updateFreeSpace(availGpus)
|
||||
@@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
|
||||
r.refMu.Lock()
|
||||
if r.llama != nil {
|
||||
for _, gpu := range allGpus {
|
||||
// if slices.Contains(gpuIDs, gpu.ID) {
|
||||
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
|
||||
// }
|
||||
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
|
||||
}
|
||||
} else {
|
||||
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
|
||||
@@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
|
||||
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
|
||||
// This routine returns the set of GPUs that do not have an active loading model.
|
||||
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
|
||||
func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
|
||||
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
|
||||
ret := append(gpu.GpuInfoList{}, allGpus...)
|
||||
s.loadedMu.Lock()
|
||||
defer s.loadedMu.Unlock()
|
||||
@@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML,
|
||||
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
|
||||
|
||||
return s.findRunnerToUnload()
|
||||
|
||||
}
|
||||
|
||||
@@ -156,7 +156,7 @@ func TestRequests(t *testing.T) {
|
||||
|
||||
// Same model, same request
|
||||
scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
|
||||
scenario1a.req.sessionDuration = 0
|
||||
scenario1a.req.sessionDuration = 5 * time.Millisecond
|
||||
scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
|
||||
scenario1b.req.model = scenario1a.req.model
|
||||
scenario1b.ggml = scenario1a.ggml
|
||||
@@ -167,6 +167,7 @@ func TestRequests(t *testing.T) {
|
||||
tmpModel := *scenario1a.req.model
|
||||
scenario2a.req.model = &tmpModel
|
||||
scenario2a.ggml = scenario1a.ggml
|
||||
scenario2a.req.sessionDuration = 5 * time.Millisecond
|
||||
|
||||
// Multiple loaded models
|
||||
scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
|
||||
@@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
|
||||
// Same model, same request
|
||||
scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
|
||||
scenario1a.req.sessionDuration = 0
|
||||
scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
|
||||
@@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) {
|
||||
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
|
||||
}
|
||||
|
||||
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
gpus := gpu.GpuInfoList{
|
||||
{
|
||||
Library: "cuda",
|
||||
ID: "0",
|
||||
},
|
||||
{
|
||||
Library: "cuda",
|
||||
ID: "1",
|
||||
},
|
||||
}
|
||||
r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
|
||||
|
||||
s := InitScheduler(ctx)
|
||||
s.loadedMu.Lock()
|
||||
s.loaded["a"] = r1
|
||||
s.loadedMu.Unlock()
|
||||
|
||||
tmp := s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 1)
|
||||
require.Equal(t, "1", tmp[0].ID)
|
||||
|
||||
r1.gpus = gpu.GpuInfoList{gpus[1]}
|
||||
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 1)
|
||||
require.Equal(t, "0", tmp[0].ID)
|
||||
|
||||
r1.gpus = gpu.GpuInfoList{}
|
||||
tmp = s.filterGPUsWithoutLoadingModels(gpus)
|
||||
require.Len(t, tmp, 2)
|
||||
}
|
||||
|
||||
func TestFindRunnerToUnload(t *testing.T) {
|
||||
ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
|
||||
defer done()
|
||||
@@ -607,4 +641,4 @@ func (s *mockLlm) Close() error {
|
||||
}
|
||||
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
|
||||
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
|
||||
func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|
||||
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
|
||||
|
||||
Reference in New Issue
Block a user