llama: update vendor code to commit ba1cb19c (#8101)
@@ -26,7 +26,7 @@ index fdb4b986..9b80fe07 100644
size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644
index c180adc8..000f1777 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {

@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a..fa09f3b3 100644
index abc1252e..626c3e3f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
@@ -6400,16 +6400,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,9 +29,9 @@ index 6a6f4c2a..fa09f3b3 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
@@ -6514,7 +6505,8 @@ static void llm_load_vocab(
tokenizer_pre == "minerva-7b") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
} else {
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);

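The substance of the pretokenizer patch is the final hunk: where upstream llama.cpp throws on an unknown tokenizer.ggml.pre value, the vendored copy logs a warning and falls back to the default pre-tokenizer so the model still loads. A minimal standalone sketch of the pattern; the enum and the fprintf logging here stand in for the llama.cpp internals:

#include <cstdio>
#include <string>

enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_DEFAULT,
    LLAMA_VOCAB_PRE_TYPE_LLAMA3,
};

// Resolve the pre-tokenizer; unknown values warn instead of throwing,
// mirroring the -throw / +LLAMA_LOG_WARN change in the hunk above.
static llama_vocab_pre_type resolve_pre_type(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "llama3") {
        return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    }
    std::fprintf(stderr, "%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT;
}
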
@@ -8,10 +8,10 @@ Subject: [PATCH] embeddings
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index fa09f3b3..d1791af0 100644
index 626c3e3f..9e292c4f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -17419,7 +17419,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -20,7 +20,7 @@ index fa09f3b3..d1791af0 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
@@ -17714,7 +17714,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
} else if (cparams.embeddings) {
@@ -28,7 +28,7 @@ index fa09f3b3..d1791af0 100644
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
@@ -17722,11 +17721,15 @@ static int llama_decode_internal(
break;
}
}

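The decode-side hunks above locate the pooled-embeddings output by scanning the built graph for a node named "result_embd_pooled" rather than assuming its position. A sketch of that lookup, using the same public graph accessors that appear in the hunk (gf is the freshly built ggml_cgraph; treating a missing node as "no pooled embeddings" is left to the caller):

#include <cstring>
#include "ggml.h"

static struct ggml_tensor * find_embd_pooled(struct ggml_cgraph * gf) {
    // walk the graph backwards: output nodes sit near the end
    for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
        struct ggml_tensor * node = ggml_graph_node(gf, i);
        if (std::strcmp(node->name, "result_embd_pooled") == 0) {
            return node;
        }
    }
    return nullptr; // graph was built without pooled embeddings
}
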
@@ -8,7 +8,7 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index d7c94352..427d5e02 100644
index ba28c07c..46998e4c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -56,6 +56,19 @@
@@ -31,7 +31,7 @@ index d7c94352..427d5e02 100644
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1322,8 +1335,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index d7c94352..427d5e02 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1363,7 +1397,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}

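The clip-unicode patch (39 insertions into clip_model_load) deals with opening model files whose paths contain non-ASCII characters on Windows: the UTF-8 fname has to be widened to UTF-16 before the file can be opened. A hedged sketch of that conversion step, assuming the patch relies on the Win32 conversion API; the helper names here are illustrative, not the patch's exact symbols:

#ifdef _WIN32
#include <windows.h>
#include <cstdio>
#include <string>

// Widen a UTF-8 path so it can be opened with _wfopen on Windows.
static std::wstring utf8_to_utf16(const std::string & s) {
    int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), nullptr, 0);
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
    return w;
}

static FILE * fopen_utf8(const char * fname, const wchar_t * mode) {
    return _wfopen(utf8_to_utf16(fname).c_str(), mode);
}
#endif
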
@@ -15,10 +15,10 @@ in general, the values are (bskcn_tv, 1 - bskcn_tv)
1 file changed, 253 insertions(+), 14 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d1791af0..b01770d0 100644
index 9e292c4f..26be6254 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -195,6 +195,7 @@ enum llm_arch {
@@ -196,6 +196,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index d1791af0..b01770d0 100644
LLM_ARCH_UNKNOWN,
};

@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -251,6 +252,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -34,15 +34,15 @@ index d1791af0..b01770d0 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -306,6 +308,7 @@ enum llm_kv {
@@ -308,6 +310,7 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -411,20 +414,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },

@@ -77,8 +77,8 @@ index d1791af0..b01770d0 100644
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -603,6 +607,7 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -607,6 +611,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -86,7 +86,7 @@ index d1791af0..b01770d0 100644
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1564,6 +1569,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -111,7 +111,7 @@ index d1791af0..b01770d0 100644
{
LLM_ARCH_UNKNOWN,
{
@@ -2401,6 +2424,7 @@ enum e_model {
@@ -2425,6 +2448,7 @@ enum e_model {
MODEL_15B,
MODEL_16B,
MODEL_20B,
@@ -119,7 +119,7 @@ index d1791af0..b01770d0 100644
MODEL_30B,
MODEL_32B,
MODEL_34B,
@@ -2451,6 +2475,8 @@ struct llama_hparams {
@@ -2475,6 +2499,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -128,7 +128,7 @@ index d1791af0..b01770d0 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -2521,6 +2547,7 @@ struct llama_hparams {
@@ -2546,6 +2572,7 @@ struct llama_hparams {
if (this->n_head_arr != other.n_head_arr) return true;
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
if (this->n_ff_arr != other.n_ff_arr) return true;
@@ -136,7 +136,7 @@ index d1791af0..b01770d0 100644

if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2630,6 +2657,14 @@ struct llama_hparams {
@@ -2658,6 +2685,14 @@ struct llama_hparams {
return ssm_d_state * ssm_d_inner;
}
}
@@ -151,7 +151,7 @@ index d1791af0..b01770d0 100644
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2816,6 +2851,8 @@ struct llama_layer {
@@ -2844,6 +2879,8 @@ struct llama_layer {
struct ggml_tensor * ffn_gate_scale;
struct ggml_tensor * ffn_up_scale;
struct ggml_tensor * ffn_down_scale;
@@ -160,7 +160,7 @@ index d1791af0..b01770d0 100644
};

// very similar to llama_batch,
@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
@@ -6247,6 +6284,21 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -182,7 +182,7 @@ index d1791af0..b01770d0 100644
default: (void)0;
}

@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7239,6 +7291,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -190,7 +190,7 @@ index d1791af0..b01770d0 100644
};

// checks if the weight tensor can be used with the specified buffer type and device
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
@@ -9253,6 +9306,35 @@ static bool llm_load_tensors(

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -226,7 +226,7 @@ index d1791af0..b01770d0 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -16652,6 +16734,158 @@ struct llm_build_context {
@@ -16671,6 +16753,158 @@ struct llm_build_context {

return gf;
}
@@ -385,7 +385,7 @@ index d1791af0..b01770d0 100644
};

static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16942,6 +17176,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_chameleon();
} break;
@@ -396,7 +396,7 @@ index d1791af0..b01770d0 100644
default:
GGML_ABORT("fatal error");
}
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20137,6 +20375,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:

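The description at the top of this patch ("the values are (bskcn_tv, 1 - bskcn_tv)") states how the Solar Pro block skip connection blends a saved layer input with the current activations. A simplified sketch of that blend in ggml terms; in the patch the weight lives in the layer.bskcn_tv tensor declared above, but it is reduced to a scalar here for clarity:

#include "ggml.h"

// out = bskcn_tv * skip + (1 - bskcn_tv) * cur
static struct ggml_tensor * blend_skip_connection(
        struct ggml_context * ctx0,
        struct ggml_tensor  * cur,   // current hidden state
        struct ggml_tensor  * skip,  // saved input from the skipped-over block
        float                 bskcn_tv) {
    return ggml_add(ctx0,
        ggml_scale(ctx0, skip, bskcn_tv),
        ggml_scale(ctx0, cur, 1.0f - bskcn_tv));
}
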
@@ -8,7 +8,7 @@ Subject: [PATCH] conditional-fattn
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 52aec229..cbf4fddf 100644
index 000f1777..8fd7c1a3 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg

@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-)

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b..d56644a8 100644
index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -412,7 +412,7 @@ struct llava_embd_batch {
@@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch {
@@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/include/llama.h b/include/llama.h
index e85f459f..aba85f86 100644
index c67988a3..0f266283 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -245,6 +245,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {

llama_token * token;
float * embd;
@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -419,6 +420,10 @@ extern "C" {
@@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);

@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama.cpp b/src/llama.cpp
index b01770d0..46881642 100644
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -201,6 +202,7 @@ enum llm_arch {
@@ -202,6 +203,7 @@ enum llm_arch {

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -309,6 +311,7 @@ enum llm_kv {
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,14 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
};

static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{
LLM_ARCH_BAICHUAN,
{
@@ -2432,6 +2478,7 @@ enum e_model {
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2476,6 +2523,7 @@ struct llama_hparams {
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
@@ -178,7 +178,7 @@ index b01770d0..46881642 100644

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2544,10 +2592,11 @@ struct llama_hparams {
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;

@@ -194,7 +194,7 @@ index b01770d0..46881642 100644

if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2665,6 +2714,10 @@ struct llama_hparams {
@@ -2693,6 +2742,10 @@ struct llama_hparams {

GGML_ABORT("fatal error");
}
@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
};

static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2694,6 +2747,9 @@ struct llama_cparams {
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
@@ -215,7 +215,7 @@ index b01770d0..46881642 100644

enum llama_pooling_type pooling_type;

@@ -2853,6 +2909,16 @@ struct llama_layer {
@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;

struct ggml_tensor * bskcn_tv;
@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
};

// very similar to llama_batch,
@@ -3439,6 +3505,8 @@ struct llama_context {
@@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
};

struct llama_lora_weight {
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);

for (int i = 0; i < (int) n_layer; i++) {
@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();

@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
}

// zero-out the per-layer hparams
@@ -301,7 +301,7 @@ index b01770d0..46881642 100644

// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
}
}
} break;
@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
};

// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
}
}
} break;
@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
}

if (params.vocab_only) {
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}

@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10513,6 +10696,7 @@ struct llm_build_context {
@@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
}

void free() {
@@ -10992,6 +11176,240 @@ struct llm_build_context {
@@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf;
}

@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}

if (ubatch.embd) {
@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
}

if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1;
}

@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
@@ -18172,7 +18603,7 @@ static int llama_encode_internal(

const int64_t n_embd = hparams.n_embd;

@@ -730,7 +730,7 @@ index b01770d0..46881642 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
}

size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);

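The %s-templated KV names this patch adds (for example "%s.attention.cross_attention_layers") resolve per architecture when hyperparameters are read from the GGUF metadata. A small standalone illustration of that substitution, assuming the new architecture registers under the name "mllama"; the name string itself is not shown in the visible hunks:

#include <cstdio>

int main() {
    // the "%s" slot in LLM_KV_NAMES is filled with the architecture name
    const char * tmpl = "%s.attention.cross_attention_layers";
    char key[128];
    std::snprintf(key, sizeof(key), tmpl, "mllama");
    std::printf("%s\n", key); // mllama.attention.cross_attention_layers
    return 0;
}
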
@@ -5,30 +5,30 @@ Subject: [PATCH] add unpad operator

---
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cpu/ggml-cpu.c | 58 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 +++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)
8 files changed, 220 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c4..acbcccc6 100644
index b0c1ac9c..091e6e6b 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
+ GGML_OP_UNPAD,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);
@@ -1718,6 +1719,15 @@ extern "C" {
int p0,
int p1);

+ // unpad each dimension: [x, ..., x, y, ..., y] -> [x, ..., x]
+ GGML_API struct ggml_tensor * ggml_unpad(
@@ -43,10 +43,10 @@ index 65cb92c4..acbcccc6 100644
// timesteps: [N,]
// return: [N, dim]

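The new ggml_unpad declaration is cut off after the opening parenthesis, so its full parameter list is not visible here; by symmetry with ggml_pad directly above it, a plausible shape is four per-dimension trim counts. A hedged usage sketch under that assumption (ctx is an initialized ggml_context):

// Assumed signature, mirroring ggml_pad:
//   ggml_unpad(ctx, a, p0, p1, p2, p3) -> tensor with p_i elements
//   removed from the end of dimension i.
struct ggml_tensor * t  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 64);
struct ggml_tensor * tp = ggml_pad  (ctx, t,  32, 0, 0, 0); // [160, 64]
struct ggml_tensor * tu = ggml_unpad(ctx, tp, 32, 0, 0, 0); // back to [128, 64]
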
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
index 67e67a08..bebff207 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
@@ -10588,6 +10588,59 @@ static void ggml_compute_forward_pad_reflect_1d(
}
}

@@ -102,12 +102,13 @@ index 23ae2e10..111ff3b0 100644
+ }
+ }
+}

+
// ggml_compute_forward_arange

@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
static void ggml_compute_forward_arange_f32(
@@ -12690,6 +12743,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
ggml_compute_forward_pad_reflect_1d(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
@@ -116,16 +117,16 @@ index 23ae2e10..111ff3b0 100644
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
@@ -13033,6 +13090,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
index 8fd7c1a3..7c351b89 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -210,34 +211,34 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
index 28f590f9..787fc713 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
@@ -311,6 +311,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
@@ -910,6 +911,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
@@ -1145,6 +1147,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(
@@ -3348,6 +3351,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -275,10 +276,10 @@ index 093ae900..cb9a1307 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
index 8ba43904..204c93e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
@@ -2944,6 +2944,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}

@@ -331,44 +332,44 @@ index 5caa0846..47038c31 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efa..ea2b259b 100644
index 51cc8566..0e74e554 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
@@ -954,6 +954,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"UPSCALE",
"PAD",
"PAD_REFLECT_1D",
+ "UNPAD",
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -987,7 +988,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
@@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"upscale(x)",
"pad(x)",
"pad_reflect_1d(x)",
+ "unpad(x)",
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1083,7 +1085,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
-static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
+static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
@@ -4180,6 +4182,25 @@ struct ggml_tensor * ggml_pad_reflect_1d(
return result;
}

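Semantically, the forward pass this patch adds to ggml-cpu.c is the inverse of pad-with-zeros: copy only the leading region of every dimension of the padded source into the smaller destination and drop the rest. A simplified two-dimensional, contiguous-f32 sketch; the real kernel walks all four dimensions with the tensors' nb[] strides and splits rows across threads:

#include <cstdint>

// dst (dne0 x dne1) receives the top-left corner of src (sne0 x sne1),
// with sne0 >= dne0 and sne1 >= dne1; the padding region is discarded.
static void unpad_f32_2d(const float * src, int64_t sne0,
                         float * dst, int64_t dne0, int64_t dne1) {
    for (int64_t i1 = 0; i1 < dne1; ++i1) {
        for (int64_t i0 = 0; i0 < dne0; ++i0) {
            dst[i1 * dne0 + i0] = src[i1 * sne0 + i0];
        }
    }
}
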
@@ -11,7 +11,7 @@ the characters
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d1dc9627..05ef0e71 100644
index 8c9aaf5a..3e372dc3 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {

@@ -4,27 +4,13 @@ Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths

---
ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +-
ggml/src/ggml-quants.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 3 +--
ggml/src/ggml-quants.c | 2 +-
3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 11152385..bbf8934e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -4,7 +4,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"

#include <math.h>
#include <string.h>
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 111ff3b0..df0bd3c6 100644
index bebff207..d6dd5600 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
@@ -37,20 +23,21 @@ index 111ff3b0..df0bd3c6 100644

#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a..91476ad0 100644
index c390957a..1af5f7eb 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,7 +3,7 @@
#include "ggml-cpu.h"
@@ -4,8 +4,7 @@
#include "ggml-cpu-aarch64.h"
#include "ggml-cpu-traits.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
-
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6..49ab3daf 100644
index 7918388a..e2ed84e4 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@

@@ -0,0 +1,22 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 14 Dec 2024 12:54:00 -0800
Subject: [PATCH] fix missing arg in static assert on windows

---
ggml/src/ggml-cuda/concat.cu | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index 2f42b8a9..5eb9f08d 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -124,7 +124,7 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE)
uint64_t nb1,
uint64_t nb2,
uint64_t nb3){
- static_assert(dim >= 0 && dim <= 3);
+ static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");

const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;

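For context on this last fix: the single-argument form static_assert(cond) only became standard in C++17, so when MSVC is the CUDA host compiler in an older language mode the one-argument assert fails to build on Windows, while the two-argument form compiles everywhere. A minimal standalone reproduction of the portable form the patch switches to:

template <int dim>
void check_dim() {
    // C++17 only: static_assert(dim >= 0 && dim <= 3);
    // Portable two-argument form, as used in the patch:
    static_assert(dim >= 0 && dim <= 3, "dim must be between 0 and 3");
}

int main() {
    check_dim<2>();
    return 0;
}
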