Mirror of https://github.com/dogkeeper886/ollama37.git
update memory calculations
count each layer independently when deciding gpu offloading
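In rough terms, the change accounts for GPU memory layer by layer: starting from the fixed overhead (minimum GPU memory, projectors, graph estimate), it walks the repeating layers and offloads the next layer only while that layer's weights plus its share of the KV cache still fit in the remaining VRAM; the output layer is offloaded only after every repeating layer fits, and the whole estimate falls back to CPU when nothing fits. The sketch below illustrates that per-layer accounting with hypothetical numbers; it is not code from the commit, and the names countOffloadLayers, layerSizes, kvPerLayer, and numGPU are illustrative stand-ins.

// Sketch only: simplified per-layer offload decision with made-up inputs.
package main

import "fmt"

// countOffloadLayers offloads repeating layers one at a time while each
// layer's weights plus its share of the KV cache still fit in the remaining
// GPU memory, honoring an explicit layer cap when numGPU >= 0.
func countOffloadLayers(availableMemory, usedMemory, kvPerLayer int64, layerSizes []int64, numGPU int) int {
    layers := 0
    for _, layerSize := range layerSizes {
        layerMemory := layerSize + kvPerLayer
        if availableMemory > usedMemory+layerMemory && (numGPU < 0 || layers < numGPU) {
            usedMemory += layerMemory
            layers++
        }
    }
    return layers
}

func main() {
    // Hypothetical model: 32 repeating layers of 200 MiB each plus 16 MiB of
    // KV cache per layer, 4 GiB of VRAM with 1 GiB already accounted for,
    // and no user-specified layer limit (numGPU = -1).
    layerSizes := make([]int64, 32)
    for i := range layerSizes {
        layerSizes[i] = 200 << 20
    }
    fmt.Println(countOffloadLayers(4<<30, 1<<30, 16<<20, layerSizes, -1)) // 14 of 32 layers fit
}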
@@ -39,7 +39,7 @@ import (
 
 type dynExtServer struct {
     s C.struct_dynamic_llama_server
-    options api.Options
+    options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
     return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     if !mutex.TryLock() {
         slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
         mutex.Lock()
llm/ggml.go
@@ -5,6 +5,7 @@ import (
     "errors"
     "fmt"
     "io"
+    "strings"
 )
 
 type GGML struct {
@@ -12,6 +13,16 @@ type GGML struct {
     model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+    for _, t := range ggml.Tensors() {
+        if strings.HasPrefix(t.Name, prefix) {
+            n += int64(t.size())
+        }
+    }
+
+    return
+}
+
 const (
     fileTypeF32 uint32 = iota
     fileTypeF16
llm/llm.go
@@ -5,10 +5,11 @@ import (
     "fmt"
     "log/slog"
     "os"
     "runtime"
     "slices"
+    "strings"
 
     "github.com/ollama/ollama/api"
     "github.com/ollama/ollama/format"
     "github.com/ollama/ollama/gpu"
 )
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
     "mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     if _, err := os.Stat(model); err != nil {
         return nil, err
     }
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
     }
     defer f.Close()
 
-    ggml, size, err := DecodeGGML(f)
+    ggml, _, err := DecodeGGML(f)
     if err != nil {
         return nil, err
     }
@@ -49,92 +50,101 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
         opts.NumCtx = 4
     }
 
-    vram, _ := gpu.CheckVRAM()
+    availableMemory, _ := gpu.CheckVRAM()
+    info := gpu.GetGPUInfo()
 
-    // fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+    usedMemory := info.MinimumMemory
+    for _, projector := range projectors {
+        usedMemory += projectorMemoryRequirements(projector)
+
+        // multimodal models require at least 2048 context
+        opts.NumCtx = max(opts.NumCtx, 2048)
+    }
+
+    // fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+    kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
     // this amount is the overhead + tensors in memory
     // TODO: get this from the llama.cpp's graph calculations instead of
     // estimating it's 1/6 * kv_cache_size * num_gqa
     graph := int64(ggml.KV().GQA()) * kv / 6
+    usedMemory += graph
 
-    if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-        opts.NumGPU = 0
+    if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+        info.Library = "cpu"
     }
 
-    info := gpu.GetGPUInfo()
-    switch runtime.GOOS {
-    case "darwin":
-        if opts.NumGPU == 0 {
-            break
-        }
+    requiredMemory := usedMemory
 
-        if size+kv+graph > vram {
-            slog.Info("not enough vram available, setting num_gpu=0")
-            opts.NumGPU = 0
-            break
-        }
+    var layers int
+    for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+        layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+        requiredMemory += layerMemory
 
-        // TODO: implement layer splitting on macOS
-        opts.NumGPU = 999
-    default:
-        if info.Library == "cpu" {
-            slog.Info("GPU not available, falling back to CPU")
-            opts.NumGPU = 0
-            break
-        }
+        if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+            usedMemory += layerMemory
+            layers++
+        }
+    }
 
-        // don't use GPU at all if no layers are loaded
-        if opts.NumGPU == 0 {
-            info.Library = "cpu"
-            info.Variant = gpu.GetCPUVariant()
-            break
-        }
-
-        // user-defined GPU count
-        if opts.NumGPU != -1 {
-            break
-        }
-
-        // the "main" GPU needs the most memory and determines the limit
-        // of how many layers can be loaded. It needs to fit:
-        // 1. the full compute graph allocation for all devices (graph)
-        // 2. the proportional kv cache for all devices (kv * % layers)
-        // 3. the proportional model (size * % layers / # devices)
-        // This estimates the number of layers
-        maxlayers := int64(ggml.KV().BlockCount()) + 1
-        devices := int64(info.DeviceCount)
-        avg := vram / devices
-        layers := maxlayers * (avg - graph) / (kv + size/devices)
-        if layers > maxlayers {
-            layers = maxlayers
-        }
-
-        // 1 + 2 must fit on the main gpu
-        min := graph + kv*layers/maxlayers
-        if layers <= 0 || min > avg {
-            slog.Info("not enough vram available, falling back to CPU only")
-            info.Library = "cpu"
-            info.Variant = gpu.GetCPUVariant()
-            opts.NumGPU = 0
-            break
-        }
-
-        opts.NumGPU = int(layers)
-    }
-
-    opts.RopeFrequencyBase = 0.0
-    opts.RopeFrequencyScale = 0.0
+    memOutputLayer := ggml.LayerSize("output.")
+    requiredMemory += memOutputLayer
+
+    // only offload output layer if all repeating layers are offloaded
+    if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+        usedMemory += memOutputLayer
+        layers++
+    }
+
+    slog.Info(
+        "offload to gpu",
+        "layers", layers,
+        "required", format.HumanBytes2(requiredMemory),
+        "used", format.HumanBytes2(usedMemory),
+        "available", format.HumanBytes2(availableMemory),
+        "kv", format.HumanBytes2(kv),
+        "graph", format.HumanBytes2(graph),
+    )
+
+    if opts.NumGPU < 0 && info.Library != "cpu" {
+        opts.NumGPU = layers
+    }
 
     return newLlmServer(info, model, adapters, projectors, opts)
 }
 
+func projectorMemoryRequirements(filename string) int64 {
+    file, err := os.Open(filename)
+    if err != nil {
+        return 0
+    }
+    defer file.Close()
+
+    ggml, _, err := DecodeGGML(file)
+    if err != nil {
+        return 0
+    }
+
+    prefixes := make(map[string]struct{})
+    for _, layer := range ggml.Tensors() {
+        parts := strings.Split(layer.Name, ".")
+        prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+    }
+
+    var ask int64
+    for prefix := range prefixes {
+        ask += ggml.LayerSize(prefix)
+    }
+
+    return ask
+}
+
 // Give any native cgo implementations an opportunity to initialize
 func Init() error {
     return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
     dynLibs := getDynLibs(gpuInfo)
 
     // Check to see if the user has requested a specific library instead of auto-detecting