mirror of https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 00:07:07 +00:00
Problem: gemma3:12b (10.2 GiB actual) was splitting across 2 GPUs despite fitting on a single Tesla K80 (11.2 GiB available).

Root Cause: Graph memory estimates for CC 3.7 were 15-20% too high (estimated 1.3 GiB, actual 1.1 GiB), causing the single-GPU fit check to fail by a ~200 MiB margin.

Solution: Apply an empirical 85% correction factor to graph estimates for Tesla K80 (CC 3.7), based on measured actual usage.

Results:
- Memory estimate: 11.9 GiB → 11.0 GiB (-900 MiB)
- GPU split: 1,48 layers → single GPU (no split)
- GPU 0: 10,015 MiB (was 617 MiB)
- GPU 1: 7 MiB (was 9,866 MiB)
- Inference: 94% GPU utilization, no cross-GPU overhead

Testing: ✅ gemma3:12b loads on a single GPU with correct inference

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
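Checking the arithmetic stated above (just the commit's own numbers multiplied out): 0.85 × 1.3 GiB ≈ 1.1 GiB, matching the measured graph usage, and 11.9 GiB − 0.9 GiB = 11.0 GiB, which fits under the 11.2 GiB available on a single K80 GPU.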
514 lines
15 KiB
Go
package llm

import (
    "fmt"
    "log/slog"
    "os"
    "strconv"
    "strings"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/discover"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/fs/ggml"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
    // Split up the GPUs by type and try them
    var estimatedVRAM uint64
    for _, gpus := range allGpus.ByLibrary() {
        var layerCount int
        estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
        layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
        if opts.NumGPU < 0 {
            if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
                return true, estimatedVRAM
            }
        } else {
            if layerCount > 0 && layerCount >= opts.NumGPU {
                return true, estimatedVRAM
            }
        }
    }
    return false, estimatedVRAM
}
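
// Illustrative caller sketch (an assumption for clarity, not code from this
// repo): with a discovered GPU list and a decoded model, a scheduler might ask
//
//	if fits, vram := PredictServerFit(gpus, f, nil, nil, opts, 1); fits {
//		slog.Info("model fits without unloading", "vram", format.HumanBytes2(vram))
//	}
//
// Note that NumGPU < 0 means "auto": the model only "fits" if every block plus
// the output layer (BlockCount+1) can be offloaded.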

type MemoryEstimate struct {
    // How many layers we predict we can load
    Layers int

    // The size of the graph which occupies the main GPU
    Graph uint64

    // How much VRAM will be allocated given the number of layers we predict
    VRAMSize uint64

    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
    TotalSize uint64

    // For multi-GPU scenarios, this provides the tensor split parameter
    TensorSplit string

    // For multi-GPU scenarios, this is the size in bytes per GPU
    GPUSizes []uint64

    // internal fields for logging purposes
    inferenceLibrary    string
    layersRequested     int
    layersModel         int
    availableList       []string
    kv                  uint64
    allocationsList     []string
    memoryWeights       uint64
    memoryLayerOutput   uint64
    graphFullOffload    uint64
    graphPartialOffload uint64

    projectorWeights, projectorGraph uint64
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
    // Graph size for a partial offload, applies to all GPUs
    var graphPartialOffload uint64

    // Graph size when all layers are offloaded, applies to all GPUs
    var graphFullOffload uint64

    // Final graph offload once we know full or partial
    var graphOffload uint64

    // Projectors loaded into GPU0 only
    var llamaEngineProjectorWeights uint64

    // Projectors loaded with output layer
    var ollamaEngineProjectorWeights uint64
    var ollamaEngineProjectorGraph uint64

    // Conditional output size on GPU 0
    var memoryLayerOutput uint64

    // The sizes of a layer
    var layerSize uint64

    // The sum of all the layer sizes (just for logging)
    var memoryWeights uint64

    // True if all the layers are loaded
    var fullyLoaded bool

    // Overflow that didn't fit into the GPU
    var overflow uint64

    overhead := envconfig.GpuOverhead()
    availableList := make([]string, len(gpus))
    for i, gpu := range gpus {
        availableList[i] = format.HumanBytes2(gpu.FreeMemory)
    }
    slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

    for _, projector := range projectors {
        llamaEngineProjectorWeights += projectorMemoryRequirements(projector)

        // multimodal models require at least 2048 context
        opts.NumCtx = max(opts.NumCtx, 2048)
    }
    if llamaEngineProjectorWeights == 0 {
        ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
        opts.NumCtx = max(opts.NumCtx, 2048)
    }

    layers := f.Tensors().GroupLayers()
    // add one layer worth of memory as a buffer
    if blk0, ok := layers["blk.0"]; ok {
        layerSize = blk0.Size()
    } else {
        slog.Warn("model missing blk.0 layer size")
    }

    var kvct string
    if envconfig.FlashAttention() &&
        discover.GetGPUInfo().FlashAttentionSupported() &&
        f.SupportsFlashAttention() {
        requested := strings.ToLower(envconfig.KvCacheType())
        if requested != "" && f.SupportsKVCacheType(requested) {
            kvct = requested
        }
    }

    kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)

    if len(kv) > 0 {
        layerSize += kv[0]
    }

    var kvTotal uint64
    for _, kvLayer := range kv {
        kvTotal += kvLayer
    }

    if graphPartialOffload == 0 {
        headsKV := f.KV().HeadCountKVMin()
        if headsKV == 0 {
            headsKV = 1
        }
        gqa := f.KV().HeadCountMax() / headsKV
        graphPartialOffload = gqa * kvTotal / 6
    }
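
    // Worked example of the fallback above (illustrative numbers, not from the
    // commit): a model with 32 attention heads and 8 KV heads has gqa = 4, so
    // a 3 GiB KV cache yields a partial-graph estimate of 4 * 3 GiB / 6 = 2 GiB.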
    if graphFullOffload == 0 {
        graphFullOffload = graphPartialOffload
    }

    // on metal there's no partial offload overhead
    if gpus[0].Library == "metal" {
        graphPartialOffload = graphFullOffload
    } else if len(gpus) > 1 {
        // multigpu should always use the partial graph size
        graphFullOffload = graphPartialOffload
    }

    // ollama37: Apply empirical correction factor for Tesla K80 (CC 3.7)
    // Measured: graph estimates are consistently 15-20% higher than actual usage
    // Example: gemma3:12b estimated 1.3 GiB, actual 1.1 GiB (85% of estimate)
    if gpus[0].Library == "cuda" && gpus[0].Compute == "3.7" {
        graphPartialOffload = (graphPartialOffload * 85) / 100
        graphFullOffload = (graphFullOffload * 85) / 100
        slog.Debug("applied CC 3.7 graph correction",
            "partial", format.HumanBytes2(graphPartialOffload),
            "full", format.HumanBytes2(graphFullOffload))
    }

    // Output layer handled at the end if we have space
    if layer, ok := layers["output_norm"]; ok {
        memoryLayerOutput += layer.Size()
    }
    if layer, ok := layers["output"]; ok {
        memoryLayerOutput += layer.Size()
    } else if layer, ok := layers["token_embd"]; ok {
        memoryLayerOutput += layer.Size()
    }

    gpuZeroOverhead := llamaEngineProjectorWeights

    // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
    var layerCount int
    layerCounts := make([]int, len(gpus))
    gpuAllocations := make([]uint64, len(gpus))
    type gs struct {
        i int
        g *discover.GpuInfo
    }
    gpusWithSpace := []gs{}
    for i := range gpus {
        var gzo uint64
        if len(gpusWithSpace) == 0 {
            gzo = gpuZeroOverhead
        }
        // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
        if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
            slog.Debug("gpu has too little memory to allocate any layers",
                "id", gpus[i].ID,
                "library", gpus[i].Library,
                "variant", gpus[i].Variant,
                "compute", gpus[i].Compute,
                "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
                "name", gpus[i].Name,
                "total", format.HumanBytes2(gpus[i].TotalMemory),
                "available", format.HumanBytes2(gpus[i].FreeMemory),
                "minimum_memory", gpus[i].MinimumMemory,
                "layer_size", format.HumanBytes2(layerSize),
                "gpu_zero_overhead", format.HumanBytes2(gzo),
                "partial_offload", format.HumanBytes2(graphPartialOffload),
                "full_offload", format.HumanBytes2(graphFullOffload),
            )
            continue
        }
        gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
        gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
    }

    var gpuZeroID int
    if len(gpusWithSpace) > 0 {
        gpuZeroID = gpusWithSpace[0].i
        gpuAllocations[gpuZeroID] += gpuZeroOverhead
    } else {
        overflow += gpuZeroOverhead
    }

    // ollama37: Create per-GPU graph allocations for Tesla K80 multi-GPU optimization
    // Secondary GPUs use the measured 190 MiB; the primary GPU uses the full graph
    gpuGraphAllocations := make(map[int]uint64)
    for i := range gpus {
        if len(gpus) > 1 && i < len(gpus)-1 {
            // Secondary GPU: use empirically measured value (181 MiB, rounded up to 190 MiB)
            gpuGraphAllocations[i] = 190 * 1024 * 1024
        } else {
            // Primary GPU (the last index) or single GPU: use the full graph
            gpuGraphAllocations[i] = max(graphPartialOffload, graphFullOffload)
        }
        slog.Debug("graph allocation per GPU",
            "gpu", i,
            "graph_alloc", format.HumanBytes2(gpuGraphAllocations[i]),
            "is_multi_gpu", len(gpus) > 1,
            "is_secondary", len(gpus) > 1 && i < len(gpus)-1)
    }
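
    // Concretely, in the two-K80 case from the commit message: index 0 is
    // treated as secondary (190 MiB reserved for its graph) and index 1, the
    // last and therefore "primary" index, reserves the full corrected graph.
    // This pairs with the reverse-order placement below, which fills the last
    // GPU first so a model that fits on one GPU never splits.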

    // For all the layers, find where they can fit on the GPU(s)
    slog.Debug("starting layer placement",
        "total_layers", f.KV().BlockCount(),
        "num_gpus", len(gpus),
        "gpus_with_space", len(gpusWithSpace),
        "overhead", format.HumanBytes2(overhead))

    for i := int(f.KV().BlockCount()) - 1; i >= 0; i-- {
        // Some models have inconsistent layer sizes
        if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
            layerSize = blk.Size()
            layerSize += kv[i]
            memoryWeights += blk.Size()
        }

        if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
            // Stop allocating on GPU(s) once we hit the user's target NumGPU
            overflow += layerSize
            continue
        }

        // distribute the layers across the GPU(s) that have space
        // ollama37: Prefer loading on the last GPU first (single-GPU preference for Tesla K80)
        placed := false
        for j := len(gpusWithSpace); j > 0; j-- {
            // Try GPUs in reverse order (highest index first) instead of round-robin
            g := gpusWithSpace[j-1]
            used := gpuAllocations[g.i] + gpuGraphAllocations[g.i] // ollama37: use per-GPU graph allocation
            required := overhead + used + layerSize

            if i == int(f.KV().BlockCount())-1 || i == int(f.KV().BlockCount())-2 || i == 0 {
                // Debug log for the first two and the last layers
                slog.Debug("layer placement attempt",
                    "layer", i,
                    "gpu", g.i,
                    "gpu_free", format.HumanBytes2(g.g.FreeMemory),
                    "overhead", format.HumanBytes2(overhead),
                    "used", format.HumanBytes2(used),
                    "layer_size", format.HumanBytes2(layerSize),
                    "required", format.HumanBytes2(required),
                    "fits", g.g.FreeMemory > required)
            }

            if g.g.FreeMemory > overhead+used+layerSize {
                gpuAllocations[g.i] += layerSize
                layerCounts[g.i]++
                layerCount++
                placed = true
                break
            } else {
                gpusWithSpace = append(gpusWithSpace[:j-1], gpusWithSpace[j:]...)
            }
        }

        if !placed {
            overflow += layerSize
        }
    }
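
    // Under the commit's scenario, the reverse-order walk above keeps placing
    // blocks on the last K80 until it runs out of room, so a model that fits
    // entirely on one GPU ends up with all layers on the last index rather
    // than the old 1,48 split across both GPUs.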
    if layerCount >= int(f.KV().BlockCount()) {
        fullyLoaded = true
    }

    // Determine if we need to consider output then find where it fits
    memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
    slog.Debug("output layer placement",
        "memory_last_layer", format.HumanBytes2(memoryLastLayer),
        "layer_count_before", layerCount,
        "block_count", f.KV().BlockCount(),
        "gpus_with_space", len(gpusWithSpace))

    if memoryLastLayer > 0 {
        outputPlaced := false
        if opts.NumGPU < 0 || layerCount < opts.NumGPU {
            // ollama37: Prefer last GPU first (single-GPU preference for Tesla K80)
            for j := len(gpusWithSpace); j > 0; j-- {
                g := gpusWithSpace[j-1] // Try GPUs in reverse order

                // ollama37: Use actual graph allocation (not conservative estimate)
                // This allows tighter packing on single GPU
                used := gpuAllocations[g.i] + gpuGraphAllocations[g.i]

                if g.g.FreeMemory > overhead+used+memoryLastLayer {
                    gpuAllocations[g.i] += memoryLastLayer
                    layerCounts[g.i]++
                    layerCount++
                    outputPlaced = true
                    slog.Debug("output layer placed",
                        "gpu", g.i,
                        "layer_count_after", layerCount,
                        "fully_loaded", layerCount >= int(f.KV().BlockCount())+1)
                    break
                }
            }
        }

        if layerCount < int(f.KV().BlockCount())+1 {
            fullyLoaded = false
            overflow += memoryLastLayer
            slog.Debug("output layer overflow",
                "layer_count", layerCount,
                "required", int(f.KV().BlockCount())+1,
                "output_placed", outputPlaced)
        }
    }

    // Add the applicable (full or partial) graph allocations
    // ollama37: Use per-GPU graph allocations calculated earlier
    // Secondary GPUs use measured 190 MiB, primary GPU uses full graph
    for i := range gpus {
        if layerCounts[i] <= 0 {
            continue
        }
        gpuAllocations[i] += gpuGraphAllocations[i]
    }
    if fullyLoaded {
        graphOffload = graphFullOffload
    } else {
        graphOffload = graphPartialOffload
    }

    // Summaries for the log
    var memoryRequiredPartial, memoryRequiredTotal uint64
    for i := range gpuAllocations {
        memoryRequiredPartial += gpuAllocations[i]
    }
    memoryRequiredTotal = memoryRequiredPartial + overflow

    tensorSplit := ""
    if len(gpus) > 1 {
        splits := make([]string, len(gpus))
        for i, count := range layerCounts {
            splits[i] = strconv.Itoa(count)
        }
        tensorSplit = strings.Join(splits, ",")
    }
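
    // For example, layerCounts of [1 48] would yield the TensorSplit string
    // "1,48", the same split quoted in the commit message; the counts act as
    // per-GPU proportions for the runner's tensor-split option (an assumption
    // about downstream use, based on how this estimate is passed along).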
    allocationsList := []string{}
    for _, a := range gpuAllocations {
        allocationsList = append(allocationsList, format.HumanBytes2(a))
    }

    estimate := MemoryEstimate{
        TotalSize: memoryRequiredTotal,
        Layers:    0,
        Graph:     0,
        VRAMSize:  0,
        GPUSizes:  []uint64{},

        inferenceLibrary:    gpus[0].Library,
        layersRequested:     opts.NumGPU,
        layersModel:         int(f.KV().BlockCount()) + 1,
        availableList:       availableList,
        kv:                  kvTotal,
        allocationsList:     allocationsList,
        memoryWeights:       memoryWeights,
        memoryLayerOutput:   memoryLayerOutput,
        graphFullOffload:    graphFullOffload,
        graphPartialOffload: graphPartialOffload,
        projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
        projectorGraph:      ollamaEngineProjectorGraph,
    }

    if gpus[0].Library == "cpu" {
        return estimate
    }
    if layerCount == 0 {
        slog.Debug("insufficient VRAM to load any model layers")
        return estimate
    }
    estimate.Layers = layerCount
    estimate.Graph = graphOffload
    estimate.VRAMSize = memoryRequiredPartial
    estimate.TotalSize = memoryRequiredTotal
    estimate.TensorSplit = tensorSplit
    estimate.GPUSizes = gpuAllocations
    return estimate
}
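
// Illustrative caller sketch (an assumption, mirroring PredictServerFit above,
// not code from this repo):
//
//	estimate := EstimateGPULayers(gpus, f, nil, opts, 1)
//	slog.Info("memory estimate",
//		"layers", estimate.Layers,
//		"vram", format.HumanBytes2(estimate.VRAMSize),
//		"split", estimate.TensorSplit)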

func (m MemoryEstimate) LogValue() slog.Value {
    attrs := []slog.Attr{
        slog.String("library", m.inferenceLibrary),
        slog.Group(
            "layers",
            // requested number of layers to offload
            "requested", m.layersRequested,
            // The number of layers the model has (including output)
            "model", m.layersModel,
            // estimated number of layers that can be offloaded
            "offload", m.Layers,
            // multi-gpu split for tensors
            "split", m.TensorSplit,
        ),
        slog.Group(
            "memory",
            // memory available by GPU for offloading
            "available", m.availableList,
            "gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
            slog.Group(
                "required",
                // memory required for full offloading
                "full", format.HumanBytes2(m.TotalSize),
                // memory required to offload the estimated number of layers
                "partial", format.HumanBytes2(m.VRAMSize),
                // memory of KV cache
                "kv", format.HumanBytes2(m.kv),
                // Allocations across the GPUs
                "allocations", m.allocationsList,
            ),
            slog.Group(
                "weights",
                // memory of the weights
                "total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
                // memory of repeating layers
                "repeating", format.HumanBytes2(m.memoryWeights),
                // memory of non-repeating layers
                "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
            ),
            slog.Group(
                "graph",
                // memory of graph when fully offloaded
                "full", format.HumanBytes2(m.graphFullOffload),
                // memory of graph when not fully offloaded
                "partial", format.HumanBytes2(m.graphPartialOffload),
            ),
        ),
    }

    if m.projectorWeights > 0 {
        attrs = append(attrs, slog.Group(
            "projector",
            "weights", format.HumanBytes2(m.projectorWeights),
            "graph", format.HumanBytes2(m.projectorGraph),
        ))
    }

    return slog.GroupValue(attrs...)
}

func projectorMemoryRequirements(filename string) (weights uint64) {
    file, err := os.Open(filename)
    if err != nil {
        // Treat an unreadable projector as zero-sized; callers fall back gracefully
        return 0
    }
    defer file.Close()

    ggml, err := ggml.Decode(file, 1024)
    if err != nil {
        return 0
    }

    for _, layer := range ggml.Tensors().GroupLayers() {
        weights += layer.Size()
    }

    return weights
}