llama: update vendored code to commit 40c6d79f (#7875)

This commit is contained in:
Jeffrey Morgan
2024-12-10 19:21:34 -08:00
committed by GitHub
parent a37f4a86a7
commit 527cc97899
289 changed files with 58552 additions and 41806 deletions

52
llama/ggml-alloc.c vendored
View File

@@ -1,5 +1,5 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
@@ -40,7 +40,7 @@
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
#define AT_PRINTF(...)
@@ -115,7 +115,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@@ -198,7 +198,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
@@ -235,16 +235,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
}
}
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
fprintf(stderr, "\n");
GGML_LOG_DEBUG("\n");
}
#endif
@@ -374,7 +374,6 @@ struct tensor_alloc {
};
struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};
@@ -493,18 +492,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return ggml_gallocr_hash_get(galloc, t)->allocated;
}
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
hn->buffer_id = buffer_id;
hn->offset = offset;
hn->allocated = true;
}
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -766,7 +759,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -794,13 +786,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -844,21 +836,25 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
size_t node_size = 0;
if (!node->data && !node->view_src) {
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
return talloc->size_max >= node_size;
}
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
#endif
return true;
}
if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
#endif
return true;
}
@@ -869,7 +865,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
#endif
return true;
}
@@ -881,7 +877,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
}
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
return true;
}
@@ -895,14 +891,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
return false;
}
} else {
#ifndef NDEBUG
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
return false;
}
@@ -966,7 +962,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
@@ -1016,7 +1012,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}
if (this_size > max_size) {
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
@@ -1048,7 +1044,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
if (n_buffers == 0) {
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}