kvcache: create cache ctx per layer
each cache layer creates and maintains its own context instead of using a large context for all layers
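The idea, as a minimal sketch (the Cache structure below is illustrative, not the actual kvcache code; Context and Backend stand in for ollama's ml.Context and ml.Backend): each layer gets its own small context, created on demand and freed independently, instead of every layer sharing one oversized context.

package kvcache

// Illustrative stand-ins for ollama's ml.Context and ml.Backend.
type Context interface{ Close() }
type Backend interface{ NewContext() Context }

// Cache keeps one context per layer instead of one large shared context.
type Cache struct {
	backend Backend
	ctxs    map[int]Context
}

// ctxForLayer lazily creates the layer's private context on first use.
func (c *Cache) ctxForLayer(layer int) Context {
	if c.ctxs == nil {
		c.ctxs = make(map[int]Context)
	}
	if ctx, ok := c.ctxs[layer]; ok {
		return ctx
	}
	ctx := c.backend.NewContext()
	c.ctxs[layer] = ctx
	return ctx
}

// Close frees every per-layer context the cache created.
func (c *Cache) Close() {
	for _, ctx := range c.ctxs {
		ctx.Close()
	}
}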
@@ -339,14 +339,15 @@ func (b *Backend) Get(name string) ml.Tensor {
 }
 
 func (b *Backend) NewContext() ml.Context {
-	maxTensors := max(8192, len(b.meta.Tensors().Items())*5)
+	maxGraphNodes := max(8192, len(b.meta.Tensors().Items())*5)
 	return &Context{
 		b: b,
-		maxTensors: maxTensors,
 		ctx: C.ggml_init(C.struct_ggml_init_params{
-			mem_size: C.size_t(maxTensors)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxTensors), false),
+			mem_size: C.size_t(maxGraphNodes)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(maxGraphNodes), false),
 			no_alloc: true,
 		}),
+		backend: C.ggml_backend_sched_get_backend(b.sched, 0),
+		maxGraphNodes: maxGraphNodes,
 	}
 }
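Because no_alloc is true, this arena only ever holds tensor and graph metadata; tensor data is allocated later by the backend. A rough model of the sizing arithmetic, with made-up constants standing in for the real values returned by C.ggml_tensor_overhead() and C.ggml_graph_overhead_custom():

package main

import "fmt"

// Hypothetical overheads; the real numbers come from
// C.ggml_tensor_overhead() and C.ggml_graph_overhead_custom().
const (
	tensorOverhead = 400     // metadata bytes per graph node (assumed)
	graphOverhead  = 1 << 16 // fixed graph bookkeeping cost (assumed)
)

// memSize mirrors the mem_size expression in NewContext: room for the
// metadata of up to maxGraphNodes nodes, plus the graph structure itself.
func memSize(maxGraphNodes int) int {
	return maxGraphNodes*tensorOverhead + graphOverhead
}

func main() {
	numModelTensors := 291 // e.g. a small llama-style model
	maxGraphNodes := max(8192, numModelTensors*5)
	fmt.Printf("%d nodes -> %d byte metadata arena\n", maxGraphNodes, memSize(maxGraphNodes))
}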
@@ -363,13 +364,14 @@ type Context struct {
 
 	ctx   *C.struct_ggml_context
 	graph *C.struct_ggml_cgraph
+	backend *C.struct_ggml_backend
 
-	maxTensors int
+	maxGraphNodes int
 }
 
 func (c *Context) Forward(tensors ...ml.Tensor) ml.Context {
 	if c.graph == nil {
-		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxTensors), false)
+		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.maxGraphNodes), false)
 	}
 
 	for _, tensor := range tensors {
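Note that Forward still allocates the compute graph lazily, which matters more now that there are many small contexts: a per-layer cache context that never forwards anything never pays for a graph. A sketch of that pattern (stand-in types, not the real ones):

package main

import "fmt"

type graph struct{ capacity int } // stands in for *C.struct_ggml_cgraph

type context struct {
	graph         *graph
	maxGraphNodes int
}

// forward builds the graph only on first use, mirroring the nil check
// in Context.Forward above.
func (c *context) forward(op string) {
	if c.graph == nil {
		c.graph = &graph{capacity: c.maxGraphNodes}
		fmt.Println("allocated graph for", c.graph.capacity, "nodes")
	}
	fmt.Println("added", op, "to graph")
}

func main() {
	c := &context{maxGraphNodes: 8192}
	c.forward("mulmat")  // first call allocates the graph
	c.forward("softmax") // later calls reuse it
}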
@@ -399,8 +401,8 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
 	}
 }
 
-func (c *Context) MaxTensors() int {
-	return c.maxTensors
+func (c *Context) MaxGraphNodes() int {
+	return c.maxGraphNodes
 }
 
 func shapeToGGML(shape []int) *C.int64_t {
@@ -435,7 +437,7 @@ func newTensor(ctx Context, dtype ml.DType, shape []int) ml.Tensor {
 		panic("unsupported dtype")
 	}
 
-	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
+	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	C.ggml_set_input(t)
 	return &Tensor{b: ctx.b, t: t}
@@ -469,7 +471,7 @@ func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype u
 	}
 
 	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), shapeToGGML(shape))
-	b := C.ggml_backend_alloc_buffer(C.ggml_backend_sched_get_backend(ctx.b.sched, 0), C.ggml_nbytes(t))
+	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
 	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
 	C.ggml_set_input(t)
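Both allocation paths (newTensor and fromSlice) used to ask the scheduler for backend 0 on every single tensor; the handle is now resolved once in NewContext and reused through ctx.backend. The hoisting pattern in isolation (illustrative names, not ggml's API):

package main

import "fmt"

// sched stands in for the ggml backend scheduler; treat lookups as
// having nonzero cost.
type sched struct{ backends []string }

func (s *sched) backendAt(i int) string { return s.backends[i] }

// context resolves the backend once, the way NewContext now calls
// C.ggml_backend_sched_get_backend(b.sched, 0) a single time.
type context struct{ backend string }

func newContext(s *sched) context {
	return context{backend: s.backendAt(0)}
}

// allocBuffer reuses the cached handle instead of re-querying the
// scheduler per allocation, as newTensor and fromSlice do now.
func (c context) allocBuffer(nbytes int) string {
	return fmt.Sprintf("%d bytes on %s", nbytes, c.backend)
}

func main() {
	s := &sched{backends: []string{"CUDA0", "CPU"}}
	ctx := newContext(s)
	fmt.Println(ctx.allocBuffer(4096))
	fmt.Println(ctx.allocBuffer(8192))
}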
@@ -484,8 +486,8 @@ func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
 	return fromSlice(c, s, shape, C.GGML_TYPE_I32)
 }
 
-func (c *Context) Close() {
-	if c != nil {
+func (c Context) Close() {
+	if c.ctx != nil {
 		C.ggml_free(c.ctx)
 	}
 }
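The Close change is a Go receiver subtlety: with a value receiver, `c != nil` is no longer meaningful, and with many short-lived per-layer contexts the useful guard is whether the underlying C context was ever initialized. A standalone illustration (resource and handle are stand-in names):

package main

import "fmt"

// resource mimics Context; handle plays the role of *C.struct_ggml_context.
type resource struct{ handle *int }

// Close is safe on the zero value because it guards on the inner
// handle rather than on the receiver, like the new `if c.ctx != nil`.
func (r resource) Close() {
	if r.handle != nil {
		fmt.Println("freeing handle", *r.handle)
	}
}

func main() {
	var zero resource
	zero.Close() // no-op: handle was never initialized

	h := 7
	resource{handle: &h}.Close() // value receiver works on temporaries too
}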