update memory calculations

count each layer independently when deciding gpu offloading
Michael Yang
2024-03-18 10:45:22 +01:00
parent d338d70492
commit 91b3e4d282
7 changed files with 125 additions and 89 deletions


@@ -5,10 +5,11 @@ import (
"fmt"
"log/slog"
"os"
"runtime"
"slices"
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
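The import list here supports the new code paths below: strings is used to group projector tensor names by prefix, and format renders the human-readable byte counts in the new offload log.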
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
"mamba",
}
func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
}
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
}
defer f.Close()
ggml, size, err := DecodeGGML(f)
ggml, _, err := DecodeGGML(f)
if err != nil {
return nil, err
}
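The size return value of DecodeGGML is now discarded: the old estimate below compared size+kv+graph against total VRAM in one step, while the new path sizes each block individually via ggml.LayerSize.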
@@ -49,92 +50,101 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
opts.NumCtx = 4
}
vram, _ := gpu.CheckVRAM()
availableMemory, _ := gpu.CheckVRAM()
info := gpu.GetGPUInfo()
// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
usedMemory := info.MinimumMemory
for _, projector := range projectors {
usedMemory += projectorMemoryRequirements(projector)
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
// this amount is the overhead + tensors in memory
// TODO: get this from the llama.cpp's graph calculations instead of
// estimating it's 1/6 * kv_cache_size * num_gqa
graph := int64(ggml.KV().GQA()) * kv / 6
usedMemory += graph
if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
opts.NumGPU = 0
if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
info.Library = "cpu"
}
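For a sense of scale, a minimal sketch that plugs assumed 7B-class hyperparameters into the same two formulas (the values are illustrative, not taken from the commit):

```go
package main

import "fmt"

func main() {
	// Assumed 7B-class hyperparameters (illustrative only):
	var (
		numCtx    int64 = 2048 // n_ctx
		blocks    int64 = 32   // n_layer
		embedding int64 = 4096 // n_embd
		heads     int64 = 32   // n_head
		headsKV   int64 = 32   // n_head_kv
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blocks * embedding / heads * headsKV
	fmt.Printf("kv: %.2f GiB\n", float64(kv)/(1<<30)) // 1.00 GiB

	// graph overhead estimate: GQA * kv / 6, with GQA = n_head / n_head_kv
	graph := (heads / headsKV) * kv / 6
	fmt.Printf("graph: %.0f MiB\n", float64(graph)/(1<<20)) // ~171 MiB
}
```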
info := gpu.GetGPUInfo()
switch runtime.GOOS {
case "darwin":
if opts.NumGPU == 0 {
break
}
requiredMemory := usedMemory
if size+kv+graph > vram {
slog.Info("not enough vram available, setting num_gpu=0")
opts.NumGPU = 0
break
}
var layers int
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
requiredMemory += layerMemory
// TODO: implement layer splitting on macOS
opts.NumGPU = 999
default:
if info.Library == "cpu" {
slog.Info("GPU not available, falling back to CPU")
opts.NumGPU = 0
break
if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
usedMemory += layerMemory
layers++
}
// don't use GPU at all if no layers are loaded
if opts.NumGPU == 0 {
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
break
}
// user-defined GPU count
if opts.NumGPU != -1 {
break
}
// the "main" GPU needs the most memory and determines the limit
// of how many layers can be loaded. It needs to fit:
// 1. the full compute graph allocation for all devices (graph)
// 2. the proportional kv cache for all devices (kv * % layers)
// 3. the proportional model (size * % layers / # devices)
// This estimates the number of layers
maxlayers := int64(ggml.KV().BlockCount()) + 1
devices := int64(info.DeviceCount)
avg := vram / devices
layers := maxlayers * (avg - graph) / (kv + size/devices)
if layers > maxlayers {
layers = maxlayers
}
// 1 + 2 must fit on the main gpu
min := graph + kv*layers/maxlayers
if layers <= 0 || min > avg {
slog.Info("not enough vram available, falling back to CPU only")
info.Library = "cpu"
info.Variant = gpu.GetCPUVariant()
opts.NumGPU = 0
break
}
opts.NumGPU = int(layers)
}
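For intuition about the heuristic this commit removes: with assumed single-GPU numbers (8 GiB of free VRAM, a 3.8 GiB model, the 1 GiB kv cache and ~171 MiB graph from the sketch above, and 32 blocks, so maxlayers = 33), layers = 33 * (8 GiB - 171 MiB) / (1 GiB + 3.8 GiB) ≈ 53, which is then clamped to 33. The new per-layer loop replaces this single ratio, which implicitly assumed uniformly sized layers, with an explicit running total per block.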
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
memOutputLayer := ggml.LayerSize("output.")
requiredMemory += memOutputLayer
// only offload output layer if all repeating layers are offloaded
if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
usedMemory += memOutputLayer
layers++
}
slog.Info(
"offload to gpu",
"layers", layers,
"required", format.HumanBytes2(requiredMemory),
"used", format.HumanBytes2(usedMemory),
"available", format.HumanBytes2(availableMemory),
"kv", format.HumanBytes2(kv),
"graph", format.HumanBytes2(graph),
)
if opts.NumGPU < 0 && info.Library != "cpu" {
opts.NumGPU = layers
}
return newLlmServer(info, model, adapters, projectors, opts)
}
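Pulling the greedy decision out of New into a standalone function makes the policy easier to see in isolation. This is a sketch with hypothetical inputs, not the commit's API; the real loop derives each layer's size from the GGUF via ggml.LayerSize(fmt.Sprintf("blk.%d.", i)):

```go
package main

import "fmt"

// countOffloadLayers mirrors the per-layer accounting in New: each repeating
// layer is offloaded only if it still fits in available memory, up to an
// optional user-requested cap (numGPU < 0 means no cap). A layer that does
// not fit is skipped, but later layers are still considered independently.
func countOffloadLayers(available, used, kvPerLayer int64, layerSizes []int64, numGPU int) int {
	var layers int
	for _, size := range layerSizes {
		layerMemory := size + kvPerLayer
		if available > used+layerMemory && (numGPU < 0 || layers < numGPU) {
			used += layerMemory
			layers++
		}
	}
	return layers
}

func main() {
	// Hypothetical inputs: 8 GiB available, a 500 MiB baseline (minimum
	// memory + graph), 32 layers of 120 MiB each, 32 MiB of kv cache per
	// layer, and no user cap.
	layerSizes := make([]int64, 32)
	for i := range layerSizes {
		layerSizes[i] = 120 << 20
	}
	fmt.Println(countOffloadLayers(8<<30, 500<<20, 32<<20, layerSizes, -1)) // 32
}
```

Note the loop never breaks early; that is the "count each layer independently" behavior from the commit message, and the output layer is only considered afterwards, once every repeating layer fits.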
func projectorMemoryRequirements(filename string) int64 {
file, err := os.Open(filename)
if err != nil {
return 0
}
defer file.Close()
ggml, _, err := DecodeGGML(file)
if err != nil {
return 0
}
prefixes := make(map[string]struct{})
for _, layer := range ggml.Tensors() {
parts := strings.Split(layer.Name, ".")
prefixes[strings.Join(parts[:2], ".")] = struct{}{}
}
var ask int64
for prefix := range prefixes {
ask += ggml.LayerSize(prefix)
}
return ask
}
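projectorMemoryRequirements sizes a projector by collecting the first two dot-separated components of every tensor name and summing ggml.LayerSize once per unique prefix. A minimal sketch of the grouping step, using hypothetical CLIP-style tensor names:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical projector tensor names (illustrative only); each name is
	// assumed to have at least two dot-separated components.
	names := []string{"v.blk.0.attn_q.weight", "v.blk.0.attn_k.weight", "mm.0.weight"}

	prefixes := make(map[string]struct{})
	for _, name := range names {
		parts := strings.Split(name, ".")
		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
	}
	fmt.Println(prefixes) // map[mm.0:{} v.blk:{}]
}
```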
// Give any native cgo implementations an opportunity to initialize
func Init() error {
return nativeInit()
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting