llama: update vendor code to commit ba1cb19c (#8101)

Jeffrey Morgan authored on 2024-12-14 14:55:51 -08:00; committed by GitHub
parent 60f75560a2
commit 7a81daf026
273 changed files with 3194 additions and 1900 deletions


@@ -18,10 +18,10 @@ remaining is to implement the cross attention mask
3 files changed, 467 insertions(+), 20 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 4ca53a0b..d56644a8 100644
index 16f30c56..0f0f3f62 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -412,7 +412,7 @@ struct llava_embd_batch {
@@ -429,7 +429,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -30,7 +30,7 @@ index 4ca53a0b..d56644a8 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -424,6 +424,7 @@ struct llava_embd_batch {
@@ -441,6 +441,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -38,7 +38,7 @@ index 4ca53a0b..d56644a8 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
@@ -464,7 +465,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
@@ -48,10 +48,10 @@ index 4ca53a0b..d56644a8 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
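The llava.cpp hunks above only shift line numbers, but their context shows the pattern `llava_eval_image_embed` relies on: the image embeddings sit in one flat `float` buffer of `n_image_pos * n_embd` values, and the loop walks it in chunks of at most `n_batch` positions, offsetting by `i * n_embd` floats per step and bailing out on a failed eval. Below is a minimal standalone sketch of that chunking pattern only; `eval_chunk` and all sizes are made-up stand-ins, not the actual llava code.

```cpp
#include <cstdio>
#include <vector>

// Stand-in for building a batch around `embd` and calling llama_decode.
static bool eval_chunk(const float * embd, int n_eval, int n_past) {
    std::printf("eval %d positions at past=%d (first value %.2f)\n", n_eval, n_past, embd[0]);
    return true;
}

int main() {
    const int n_pos   = 576;  // number of image patch embeddings (hypothetical)
    const int n_embd  = 4096; // model embedding width (hypothetical)
    const int n_batch = 512;  // decode batch size (hypothetical)

    std::vector<float> image_embed(static_cast<size_t>(n_pos) * n_embd, 0.5f);

    int n_past = 0;
    for (int i = 0; i < n_pos; i += n_batch) {
        int n_eval = n_pos - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;                       // mirrors `n_eval = n_batch;` in the hunk above
        }
        const float * embd = image_embed.data() + static_cast<size_t>(i) * n_embd; // i*n_embd offset
        if (!eval_chunk(embd, n_eval, n_past)) {
            return 1;                               // mirrors the `failed to eval` error path
        }
        n_past += n_eval;
    }
    return 0;
}
```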
diff --git a/include/llama.h b/include/llama.h
index e85f459f..aba85f86 100644
index c67988a3..0f266283 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -245,6 +245,7 @@ extern "C" {
@@ -249,6 +249,7 @@ extern "C" {
llama_token * token;
float * embd;
@@ -59,7 +59,7 @@ index e85f459f..aba85f86 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -419,6 +420,10 @@ extern "C" {
@@ -423,6 +424,10 @@ extern "C" {
struct llama_model * model,
struct llama_context_params params);
@@ -71,7 +71,7 @@ index e85f459f..aba85f86 100644
LLAMA_API void llama_free(struct llama_context * ctx);
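Per the hunk counts (`-245,6 +245,7` and `-419,6 +420,10`), the patch adds one member to `llama_batch` between `embd` and `pos`, plus a short block of new declarations right after `llama_new_context_with_model`; the added lines themselves are not visible in this rebase diff. Inserting a member into an aggregate that is initialized positionally with `/*name =*/` comments means every initializer must gain a matching entry in the right slot, which is exactly what the `llava_embd_batch`, `llama_batch_get_one`, and `llama_batch_init` hunks touch. A small sketch of that constraint, with a hypothetical `n_embd` member standing in for the real addition:

```cpp
#include <cstdint>

// Trimmed-down stand-in for llama_batch; `n_embd` is a hypothetical
// placeholder for the member the patch inserts between embd and pos.
struct batch_sketch {
    int32_t   n_tokens;
    int32_t  *token;
    float    *embd;
    int32_t   n_embd;   // newly inserted member
    int32_t  *pos;
};

// Every positional initializer grows by one entry, mirroring the updates
// to llava_embd_batch / llama_batch_get_one / llama_batch_init.
static batch_sketch make_token_batch(int32_t *tokens, int32_t n_tokens) {
    return {
        /*n_tokens =*/ n_tokens,
        /*token    =*/ tokens,
        /*embd     =*/ nullptr,
        /*n_embd   =*/ 0,        // new slot: unused for token batches
        /*pos      =*/ nullptr,
    };
}

int main() {
    int32_t toks[4] = {1, 2, 3, 4};
    batch_sketch b = make_token_batch(toks, 4);
    return b.n_tokens == 4 ? 0 : 1;
}
```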
diff --git a/src/llama.cpp b/src/llama.cpp
index b01770d0..46881642 100644
index 26be6254..4778a9ed 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
@@ -82,7 +82,7 @@ index b01770d0..46881642 100644
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GROK,
@@ -201,6 +202,7 @@ enum llm_arch {
@@ -202,6 +203,7 @@ enum llm_arch {
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_LLAMA, "llama" },
@@ -90,23 +90,23 @@ index b01770d0..46881642 100644
{ LLM_ARCH_FALCON, "falcon" },
{ LLM_ARCH_GROK, "grok" },
{ LLM_ARCH_GPT2, "gpt2" },
@@ -309,6 +311,7 @@ enum llm_kv {
@@ -311,6 +313,7 @@ enum llm_kv {
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -429,6 +432,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -608,6 +612,14 @@ enum llm_tensor {
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -612,6 +616,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -121,7 +121,7 @@ index b01770d0..46881642 100644
};
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -641,6 +653,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
},
},
@@ -162,7 +162,7 @@ index b01770d0..46881642 100644
{
LLM_ARCH_BAICHUAN,
{
@@ -2432,6 +2478,7 @@ enum e_model {
@@ -2456,6 +2502,7 @@ enum e_model {
MODEL_40B,
MODEL_65B,
MODEL_70B,
@@ -170,7 +170,7 @@ index b01770d0..46881642 100644
MODEL_236B,
MODEL_314B,
MODEL_SMALL,
@@ -2476,6 +2523,7 @@ struct llama_hparams {
@@ -2500,6 +2547,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
@@ -178,7 +178,7 @@ index b01770d0..46881642 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -2544,10 +2592,11 @@ struct llama_hparams {
@@ -2569,10 +2617,11 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;
@@ -194,7 +194,7 @@ index b01770d0..46881642 100644
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
@@ -2665,6 +2714,10 @@ struct llama_hparams {
@@ -2693,6 +2742,10 @@ struct llama_hparams {
GGML_ABORT("fatal error");
}
@@ -205,7 +205,7 @@ index b01770d0..46881642 100644
};
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
@@ -2694,6 +2747,9 @@ struct llama_cparams {
@@ -2722,6 +2775,9 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
bool no_perf;
@@ -215,7 +215,7 @@ index b01770d0..46881642 100644
enum llama_pooling_type pooling_type;
@@ -2853,6 +2909,16 @@ struct llama_layer {
@@ -2881,6 +2937,16 @@ struct llama_layer {
struct ggml_tensor * ffn_down_scale;
struct ggml_tensor * bskcn_tv;
@@ -232,7 +232,7 @@ index b01770d0..46881642 100644
};
// very similar to llama_batch,
@@ -3439,6 +3505,8 @@ struct llama_context {
@@ -3472,6 +3538,8 @@ struct llama_context {
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
@@ -241,7 +241,7 @@ index b01770d0..46881642 100644
};
struct llama_lora_weight {
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
@@ -3610,6 +3678,39 @@ static bool llama_kv_cache_init(
cache.v_l.reserve(n_layer);
for (int i = 0; i < (int) n_layer; i++) {
@@ -281,7 +281,7 @@ index b01770d0..46881642 100644
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
@@ -5547,12 +5648,14 @@ static void llm_load_hparams(
}
// zero-out the per-layer hparams
@@ -301,7 +301,7 @@ index b01770d0..46881642 100644
// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
@@ -5601,7 +5704,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
@@ -310,7 +310,7 @@ index b01770d0..46881642 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
@@ -5641,6 +5744,16 @@ static void llm_load_hparams(
}
}
} break;
@@ -327,7 +327,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
@@ -7291,7 +7404,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -344,7 +344,7 @@ index b01770d0..46881642 100644
};
// checks if the weight tensor can be used with the specified buffer type and device
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
@@ -7801,6 +7922,53 @@ static bool llm_load_tensors(
}
}
} break;
@@ -398,7 +398,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_MINICPM3:
{
const int64_t n_embd_head_qk_rope = hparams.n_rot;
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
@@ -9511,7 +9679,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
@@ -407,7 +407,7 @@ index b01770d0..46881642 100644
}
if (params.vocab_only) {
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
@@ -9594,6 +9762,21 @@ static struct ggml_tensor * llm_build_inp_embd(
return inpL;
}
@@ -429,7 +429,7 @@ index b01770d0..46881642 100644
static void llm_build_kv_store(
struct ggml_context * ctx,
const llama_hparams & hparams,
@@ -10513,6 +10696,7 @@ struct llm_build_context {
@@ -10561,6 +10744,7 @@ struct llm_build_context {
lctx.inp_pos_bucket = nullptr;
lctx.inp_embd_enc = nullptr;
lctx.inp_KQ_mask_cross = nullptr;
@@ -437,7 +437,7 @@ index b01770d0..46881642 100644
}
void free() {
@@ -10992,6 +11176,240 @@ struct llm_build_context {
@@ -11040,6 +11224,240 @@ struct llm_build_context {
return gf;
}
@@ -678,7 +678,7 @@ index b01770d0..46881642 100644
struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
@@ -16993,6 +17411,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm.build_llama();
} break;
@@ -689,7 +689,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
{
result = llm.build_baichuan();
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
@@ -17258,10 +17680,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
}
if (ubatch.embd) {
@@ -712,7 +712,7 @@ index b01770d0..46881642 100644
}
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
@@ -17862,7 +18293,7 @@ static int llama_decode_internal(
n_outputs = 1;
}
@@ -721,7 +721,7 @@ index b01770d0..46881642 100644
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);
@@ -18151,7 +18582,7 @@ static int llama_encode_internal(
@@ -18172,7 +18603,7 @@ static int llama_encode_internal(
const int64_t n_embd = hparams.n_embd;
@@ -730,7 +730,7 @@ index b01770d0..46881642 100644
const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19203,7 +19634,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
@@ -741,7 +741,7 @@ index b01770d0..46881642 100644
}
size_t total_size_org = 0;
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20360,6 +20793,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
@@ -749,7 +749,7 @@ index b01770d0..46881642 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21790,6 +22224,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}
@@ -760,7 +760,7 @@ index b01770d0..46881642 100644
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
@@ -21797,6 +22235,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
@@ -768,7 +768,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22248,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
@@ -776,7 +776,7 @@ index b01770d0..46881642 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21817,6 +22257,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
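Most of the llama.cpp hunks are line-number shifts from the rebase, but their context outlines the cross-attention support the patch carries: a new architecture enum and name entry, a `%s.attention.cross_attention_layers` GGUF key (`LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS`), extra per-layer hparams and `llama_layer` tensors, a dedicated graph builder, and special-casing in `llama_kv_cache_init`, `llm_load_tensors`, and `llama_set_inputs`. The added bodies are hidden here, so the following is only a sketch of the per-layer bookkeeping such a key implies; the member and helper names are hypothetical, not the patch's actual ones.

```cpp
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

// Sketch: model metadata lists which layer indices are cross-attention blocks,
// and loading / kv-cache sizing / graph building branch on that per layer.
constexpr std::size_t MAX_LAYERS = 512;

struct hparams_sketch {
    uint32_t n_layer = 0;
    uint32_t n_cross_attn_layers = 0;
    std::array<uint32_t, MAX_LAYERS> cross_attn_layers{};  // hypothetical member

    // true if layer `il` is one of the listed cross-attention layers
    bool is_cross_attention_layer(uint32_t il) const {
        const auto end = cross_attn_layers.begin() + n_cross_attn_layers;
        return std::find(cross_attn_layers.begin(), end, il) != end;
    }
};

int main() {
    hparams_sketch hp;
    hp.n_layer = 40;
    // e.g. metadata might mark every 5th layer as a cross-attention block (made up)
    for (uint32_t il = 3; il < hp.n_layer; il += 5) {
        hp.cross_attn_layers[hp.n_cross_attn_layers++] = il;
    }

    for (uint32_t il = 0; il < hp.n_layer; ++il) {
        if (hp.is_cross_attention_layer(il)) {
            // kv-cache sizing, tensor loading, and graph building would branch here
            std::printf("layer %2u: cross-attention block\n", il);
        }
    }
    return 0;
}
```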