ggml: Seperate tensor load from backend creation

Currently, when the backend is created, the tensors are loaded at the
same time, which is a slow operation. This separates them to be two
steps:
 - Create backend, including enumerating tensors and memory allocation
 - Loading tensor data

This allows more flexibility in managing model loading.
This commit is contained in:
Jesse Gross
2025-04-17 13:42:40 -07:00
committed by Jesse Gross
parent d755577473
commit 94ab428e3f
13 changed files with 131 additions and 115 deletions

View File

@@ -15,6 +15,7 @@ import (
type GGML struct {
container
model
Length int64
}
type model interface {
@@ -386,12 +387,12 @@ func DetectContentType(b []byte) string {
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If the maxArraySize is negative, all arrays are collected.
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
var magic uint32
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
return nil, 0, err
return nil, err
}
var c container
@@ -401,24 +402,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
case FILE_MAGIC_GGUF_BE:
c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
default:
return nil, 0, errors.New("invalid file magic")
return nil, errors.New("invalid file magic")
}
model, err := c.Decode(rs)
if err != nil {
return nil, 0, err
return nil, err
}
offset, err := rs.Seek(0, io.SeekCurrent)
if err != nil {
return nil, 0, err
return nil, err
}
// final model type
return &GGML{
container: c,
model: model,
}, offset, nil
Length: offset,
}, nil
}
func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {