kvcache: create cache ctx per layer

Each cache layer now creates and maintains its own context, instead of
sharing one large context across all layers.
This commit is contained in:
Michael Yang
2025-02-25 12:57:49 -08:00
parent bfce55db3d
commit 764e199d67
4 changed files with 68 additions and 46 deletions

View File

@@ -339,14 +339,15 @@ func (b *Backend) Get(name string) ml.Tensor {
}
func (b *Backend) NewContext() ml.Context {
maxTensors := max(8192, len(b.meta.Tensors().Items())*5)
maxGraphNodes := max(8192, len(b.meta.Tensors().Items())*5)
return &Context{
b: b,
maxTensors: maxTensors,
ctx: C.ggml_init(C.struct_ggml_init_params{
mem_size: C.size_t(maxTensors)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxTensors), false),
mem_size: C.size_t(maxGraphNodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxGraphNodes), false),
no_alloc: true,
}),
backend: C.ggml_backend_sched_get_backend(b.sched, 0),
maxGraphNodes: maxGraphNodes,
}
}
@@ -363,13 +364,14 @@ type Context struct {
ctx *C.struct_ggml_context
graph *C.struct_ggml_cgraph
backend *C.struct_ggml_backend
maxTensors int
maxGraphNodes int
}
func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
if c.graph == nil {
c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxTensors), false)
c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
}
for _, tensor := range tensors {
@@ -399,8 +401,8 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
}
}
func (c *Context) MaxTensors() int {
return c.maxTensors
func (c *Context) MaxGraphNodes() int {
return c.maxGraphNodes
}
func shapeToGGML(shape []int) *C.int64_t {
@@ -435,7 +437,7 @@ func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
panic("unsupported dtype")
}
b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_set_input(t)
return &Tensor{b: ctx.b, t: t}
@@ -469,7 +471,7 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u
}
t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
C.ggml_set_input(t)
@@ -484,8 +486,8 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
return fromSlice(c, s, shape, C.GGML_TYPE_I32)
}
func (c *Context) Close() {
if c != nil {
func (c Context) Close() {
if c.ctx != nil {
C.ggml_free(c.ctx)
}
}