llama: update vendor code to commit ba1cb19c (#8101)
llama/ggml.c (vendored): 177 changed lines
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
  *
  * MIT License
  *
@@ -34,7 +34,10 @@
 
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
-#include "ggml-aarch64.h"
 
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -814,32 +817,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name = "q4_0_4x4",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 4,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float_ref = NULL,
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
     },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name = "q4_0_4x8",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float_ref = NULL,
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
     },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name = "q4_0_8x8",
-        .blck_size = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size = sizeof(block_q4_0),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float_ref = NULL,
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
     },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
@@ -857,14 +851,23 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
-    [GGML_TYPE_IQ4_NL_4_4] = {
-        .type_name = "iq4_nl_4x4",
-        .blck_size = QK4_NL,
-        .blck_size_interleave = 4,
-        .type_size = sizeof(block_iq4_nl),
-        .is_quantized = true,
-        .to_float = NULL,
-        .from_float_ref = NULL,
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
     },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
 };
 
@@ -976,6 +979,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
+    "PAD_REFLECT_1D",
     "UNPAD",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
@@ -1010,7 +1014,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1072,6 +1076,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
+    "pad_reflect_1d(x)",
     "unpad(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
@@ -1106,7 +1111,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1296,9 +1301,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
         case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
         case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -3543,15 +3545,18 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
+    int sections[4] = {0, 0, 0, 0};
+
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params + 5, &freq_base, sizeof(float));
     memcpy(params + 6, &freq_scale, sizeof(float));
     memcpy(params + 7, &ext_factor, sizeof(float));
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
+    memcpy(params + 11, &sections, sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
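Note on the hunk above: the int32 op-params block written by ggml_rope_impl grows from 11 to 15 slots so the mrope section split can ride along with the existing scalar parameters. A sketch of the layout, derived only from the memcpy calls in this hunk (slot names follow the in-code comments, not a public API):

    // int32_t params[15] as packed by ggml_rope_impl after this change:
    //   [0]      n_past      (always 0 here)
    //   [1]      n_dims
    //   [2]      mode
    //   [3]      n_ctx       (always 0 here)
    //   [4]      n_ctx_orig
    //   [5..10]  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow (floats, bit-copied)
    //   [11..14] sections[0..3] (all zero for plain ggml_rope; filled in by ggml_rope_multi below)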
@@ -3573,6 +3578,53 @@ struct ggml_tensor * ggml_rope(
     );
 }
 
+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        int n_dims,
+        int sections[4],
+        int mode,
+        int n_ctx_orig,
+        float freq_base,
+        float freq_scale,
+        float ext_factor,
+        float attn_factor,
+        float beta_fast,
+        float beta_slow) {
+    // Multimodal Rotary Position Embedding
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params + 5, &freq_base, sizeof(float));
+    memcpy(params + 6, &freq_scale, sizeof(float));
+    memcpy(params + 7, &ext_factor, sizeof(float));
+    memcpy(params + 8, &attn_factor, sizeof(float));
+    memcpy(params + 9, &beta_fast, sizeof(float));
+    memcpy(params + 10, &beta_slow, sizeof(float));
+    memcpy(&params[11], sections, sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
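For orientation, a hypothetical caller of the new ggml_rope_multi could look like the sketch below. The tensor names, the section split, and the GGML_ROPE_TYPE_MROPE mode constant are assumptions (the constant is expected to come from the ggml.h shipped with this same vendor update); only the signature and the four-position-ids-per-token requirement are taken from the hunk above.

    // Hypothetical sketch: apply multimodal RoPE to `cur` (ne[2] == n_tokens) using
    // a GGML_TYPE_I32 position tensor `pos` carrying 4 position ids per token.
    static struct ggml_tensor * apply_mrope(
            struct ggml_context * ctx,
            struct ggml_tensor * cur,   // e.g. [head_dim, n_head, n_tokens, 1]
            struct ggml_tensor * pos) { // I32, pos->ne[0] == 4 * cur->ne[2]
        int sections[4] = {16, 24, 24, 0}; // example split in the style of Qwen2-VL
        return ggml_rope_multi(ctx, cur, pos, /*freq factors*/ NULL,
                               /*n_dims*/ 128, sections,
                               GGML_ROPE_TYPE_MROPE, // assumed constant; must keep bit 0 clear per the assert
                               /*n_ctx_orig*/ 0,
                               /*freq_base*/ 1000000.0f, /*freq_scale*/ 1.0f,
                               /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
                               /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);
    }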
@@ -4125,6 +4177,37 @@ struct ggml_tensor * ggml_pad(
     return result;
 }
 
+// ggml_pad_reflect_1d
+
+struct ggml_tensor * ggml_pad_reflect_1d(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int p0,
+        int p1) {
+    GGML_ASSERT(p0 >= 0);
+    GGML_ASSERT(p1 >= 0);
+
+    GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
+    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+
+    GGML_ASSERT(ggml_is_contiguous(a));
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+
+    int32_t params[] = { p0, p1 };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op = GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
 // ggml_unpad
 
 struct ggml_tensor * ggml_unpad(
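A small, hypothetical usage sketch of the new operator (not part of the diff; ctx is assumed to be an existing ggml_context):

    // Reflect-pad an F32 signal by two samples on each side, e.g. ahead of a 1D convolution.
    struct ggml_tensor * inp    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * padded = ggml_pad_reflect_1d(ctx, inp, /*p0*/ 2, /*p1*/ 2);
    // padded->ne[0] == 16 + 2 + 2; per the asserts above, p0 and p1 must each be
    // smaller than inp->ne[0], and the input must be contiguous F32.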
@@ -6318,9 +6401,6 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
@@ -6852,7 +6932,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             (int64_t) info->ne[2] *
             (int64_t) info->ne[3];
 
-        if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
+        if (ggml_blck_size(info->type) == 0 ) {
+            // this tensor type support have been removed:
+            fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                    __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        if (ne % ggml_blck_size(info->type) != 0) {
             fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                     __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
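One practical consequence of the split check above: a block size of 0 is now the loader's signal that a tensor type has been removed from ggml. A minimal sketch of that test, assuming only the public ggml_blck_size() accessor (the helper name is hypothetical, not from this diff):

    #include <stdbool.h>
    #include "ggml.h"

    // True when this build of ggml can still load tensors of the given type;
    // removed types (e.g. the old Q4_0_4_4 family) report a block size of 0.
    static bool gguf_tensor_type_is_supported(enum ggml_type type) {
        return ggml_blck_size(type) != 0;
    }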