This commit is contained in:
Michael Yang
2024-07-03 17:22:13 -07:00
parent 66fe77f084
commit 55cd3ddcca
8 changed files with 82 additions and 83 deletions

View File

@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
return err
}
if !envconfig.NoPrune && old != nil {
if !envconfig.NoPrune() && old != nil {
if err := old.RemoveLayers(); err != nil {
return err
}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
// build deleteMap to prune unused layers
deleteMap := make(map[string]struct{})
if !envconfig.NoPrune {
if !envconfig.NoPrune() {
manifest, _, err = GetManifest(mp)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err

View File

@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
return err
}
if !envconfig.NoPrune {
if !envconfig.NoPrune() {
// clean up unused layers and manifests
if err := PruneLayers(); err != nil {
return err

View File

@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
// First attempt to fit the model into a single GPU
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if !envconfig.SchedSpread {
if !envconfig.SchedSpread() {
for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))