ggml: Seperate tensor load from backend creation

Currently, when the backend is created, the tensors are loaded at the
same time, which is a slow operation. This separates them to be two
steps:
 - Create backend, including enumerating tensors and memory allocation
 - Loading tensor data

This allows more flexibility in managing model loading.
This commit is contained in:
Jesse Gross
2025-04-17 13:42:40 -07:00
committed by Jesse Gross
parent d755577473
commit 94ab428e3f
13 changed files with 131 additions and 115 deletions

View File

@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
if err == nil {
defer r.Close()
f, _, err := ggml.Decode(r, 1024)
f, err := ggml.Decode(r, 1024)
if err == nil {
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
capabilities = append(capabilities, model.CapabilityEmbedding)