mirror of https://github.com/dogkeeper886/ollama37.git (synced 2025-12-13 01:07:12 +00:00)
llama: update to commit de4c07f93 (#10655)
@@ -5,88 +5,27 @@ Subject: [PATCH] add mllama support

adds support for the llama 3.2 vision architecture
---
examples/llava/llava.cpp | 5 +-
examples/llava/mtmd.cpp | 6 +-
ggml/src/ggml-backend-reg.cpp | 6 +-
include/llama.h | 6 +
src/llama-arch.cpp | 44 +++++
src/llama-arch.h | 10 ++
src/llama-batch.cpp | 3 +
src/llama-context.cpp | 25 ++-
src/llama-context.cpp | 23 ++-
src/llama-context.h | 1 +
src/llama-cparams.h | 1 +
src/llama-graph.cpp | 25 +++
src/llama-graph.h | 12 ++
src/llama-hparams.cpp | 4 +
src/llama-hparams.h | 7 +
src/llama-kv-cache.cpp | 12 +-
src/llama-kv-cache.cpp | 14 +-
src/llama-model-loader.cpp | 2 +
src/llama-model.cpp | 309 +++++++++++++++++++++++++++++++++-
src/llama-model.cpp | 311 +++++++++++++++++++++++++++++++++-
src/llama-model.h | 12 ++
src/llama-quant.cpp | 4 +-
19 files changed, 473 insertions(+), 21 deletions(-)
tools/mtmd/llava.cpp | 5 +-
tools/mtmd/mtmd-helper.cpp | 7 +-
19 files changed, 475 insertions(+), 22 deletions(-)
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c00d16ae..bab027b5 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -457,7 +457,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -469,6 +469,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -492,7 +493,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 7081fd73..c14ac501 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -476,7 +476,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -487,6 +487,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -610,7 +611,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(embd, n_embd, n_tokens, n_past, 0);

const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 405d8e31..82ae1b5b 100644
--- a/ggml/src/ggml-backend-reg.cpp
@@ -105,10 +44,10 @@ index 405d8e31..82ae1b5b 100644
register_backend(ggml_backend_rpc_reg());
#endif
diff --git a/include/llama.h b/include/llama.h
index 06c56395..f1628e88 100644
index abedebdb..41beef21 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -256,6 +256,7 @@ extern "C" {
@@ -258,6 +258,7 @@ extern "C" {

llama_token * token;
float * embd;
@@ -116,15 +55,15 @@ index 06c56395..f1628e88 100644
llama_pos * pos;
int32_t * n_seq_id;
llama_seq_id ** seq_id;
@@ -358,6 +359,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@@ -365,6 +366,7 @@ extern "C" {
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
+ bool cross_attn; // whether to use cross attention
};

// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@@ -459,6 +461,10 @@ extern "C" {
// model quantization parameters
@@ -464,6 +466,10 @@ extern "C" {
struct llama_context_params params),
"use llama_init_from_model instead");

@@ -247,10 +186,10 @@ index 525c1b7d..bc8a4f0b 100644
LLM_TENSOR_CONVNEXT_DW,
LLM_TENSOR_CONVNEXT_NORM,
diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp
index 01d5ca57..8682b0e6 100644
index a88b2fe3..241b316e 100644
--- a/src/llama-batch.cpp
+++ b/src/llama-batch.cpp
@@ -316,6 +316,7 @@ struct llama_batch llama_batch_get_one(
@@ -320,6 +320,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
@@ -258,7 +197,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -328,6 +329,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -332,6 +333,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
@@ -266,7 +205,7 @@ index 01d5ca57..8682b0e6 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -336,6 +338,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -340,6 +342,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
@@ -275,10 +214,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9c1fe93f..cd06ad91 100644
index dca22d8b..c22687e4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -851,7 +851,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -514,7 +514,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}

@@ -287,7 +226,7 @@ index 9c1fe93f..cd06ad91 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -972,6 +972,10 @@ void llama_context::set_warmup(bool value) {
@@ -632,6 +632,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}

@@ -298,16 +237,16 @@ index 9c1fe93f..cd06ad91 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1047,7 +1051,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -709,7 +713,7 @@ int llama_context::encode(llama_batch & inp_batch) {

const int64_t n_embd = hparams.n_embd;

- sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
- llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
+ llama_sbatch sbatch = llama_sbatch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);

const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -1187,10 +1191,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -863,10 +867,9 @@ int llama_context::decode(llama_batch & inp_batch) {

const llama_batch & batch = batch_allocr.batch;

@@ -319,16 +258,16 @@ index 9c1fe93f..cd06ad91 100644

const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1238,7 +1241,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1087,7 +1090,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// make the outputs have the same order they had in the user-provided batch
// note: this is mostly relevant for recurrent models atm
if (!sorted_output) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;

const bool logits_all = n_outputs_all == n_tokens_all;

- sbatch.from_batch(batch, n_embd,
+ sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);

@@ -1472,12 +1475,11 @@ int llama_context::decode(llama_batch & inp_batch) {
GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -1142,12 +1145,11 @@ int llama_context::decode(llama_batch & inp_batch) {

int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
@@ -342,16 +281,7 @@ index 9c1fe93f..cd06ad91 100644
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -1545,7 +1547,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
- const uint32_t n_vocab = model.vocab.n_tokens();
+ const uint32_t n_vocab = model.hparams.n_vocab;
const uint32_t n_embd = model.hparams.n_embd;

GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2052,7 +2054,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -1682,7 +1684,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

@@ -360,15 +290,15 @@ index 9c1fe93f..cd06ad91 100644

io.write(&logits_size, sizeof(logits_size));

@@ -2235,6 +2237,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
@@ -2091,6 +2093,7 @@ llama_context_params llama_context_default_params() {
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
/*.op_offload =*/ true,
+ /*.cross_attn =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2362,6 +2365,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {

return result;
@@ -2216,6 +2219,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}

@@ -380,10 +310,10 @@ index 9c1fe93f..cd06ad91 100644
ctx->synchronize();
}
diff --git a/src/llama-context.h b/src/llama-context.h
index 5457f077..a50c4afa 100644
index c0ceacb1..c4ab242a 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -65,6 +65,7 @@ struct llama_context {
@@ -71,6 +71,7 @@ struct llama_context {
void set_embeddings (bool value);
void set_causal_attn(bool value);
void set_warmup(bool value);
@@ -392,22 +322,22 @@ index 5457f077..a50c4afa 100644
void set_adapter_lora(
llama_adapter_lora * adapter,
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f0..85ad91b9 100644
index 246fa577..7a6156ce 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -29,6 +29,7 @@ struct llama_cparams {
bool offload_kqv;
bool flash_attn;
@@ -31,6 +31,7 @@ struct llama_cparams {
bool no_perf;
+ bool cross_attn;
bool warmup;
bool op_offload;
+ bool cross_attn;

enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fabb9ca2..b67216a4 100644
index b0e3f635..f14869cf 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -560,6 +560,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -532,6 +532,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}

@@ -420,7 +350,7 @@ index fabb9ca2..b67216a4 100644
//
// llm_graph_context
//
@@ -1532,6 +1538,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1514,6 +1520,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}

@@ -447,10 +377,10 @@ index fabb9ca2..b67216a4 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d0c8d321..0fe18150 100644
index 832a8c09..5a322785 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -87,6 +87,7 @@ public:

ggml_tensor * tokens = nullptr; // I32 [n_batch]
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
@@ -458,7 +388,7 @@ index d0c8d321..0fe18150 100644
};

class llm_graph_input_pos : public llm_graph_input_i {
@@ -283,6 +284,16 @@ public:
@@ -284,6 +285,16 @@ public:
const llama_cross * cross = nullptr;
};

@@ -475,7 +405,7 @@ index d0c8d321..0fe18150 100644
//
// llm_graph_result
//
@@ -491,6 +502,7 @@ struct llm_graph_context {
@@ -495,6 +506,7 @@ struct llm_graph_context {
ggml_tensor * build_inp_cls() const;
ggml_tensor * build_inp_s_copy() const;
ggml_tensor * build_inp_s_mask() const;
@@ -535,11 +465,11 @@ index 48dce407..b6fc7e6d 100644
};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 7c9d46d8..69f8d35a 100644
index 3dcad65b..a7b0a7eb 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
return false;
@@ -100,8 +100,16 @@ llama_kv_cache_unified::llama_kv_cache_unified(
throw std::runtime_error("failed to create ggml context for kv cache");
}

- ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
@@ -557,8 +487,17 @@ index 7c9d46d8..69f8d35a 100644
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
k_l.push_back(k);
@@ -446,7 +454,7 @@ void llama_kv_cache_unified::set_full() {
llama_sbatch llama_kv_cache_unified::sbatch_init(
const llama_batch & batch,
bool logits_all) {
- return llama_sbatch(batch, hparams.n_embd, true, logits_all);
+ return llama_sbatch(batch, batch.n_embd, true, logits_all);
}

llama_ubatch llama_kv_cache_unified::ubatch_next(
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index a012aeae..2e11507d 100644
index 7f6617fa..2acfd4a8 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -315,6 +315,8 @@ namespace GGUFMeta {
@@ -571,10 +510,10 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 572378c9..9d099f11 100644
index 831b68c0..e8298f56 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -423,6 +423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -433,6 +433,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

// get general kv
ml.get_key(LLM_KV_GENERAL_NAME, name, false);
@@ -582,7 +521,7 @@ index 572378c9..9d099f11 100644

// everything past this point is not vocab-related
if (hparams.vocab_only) {
@@ -434,6 +435,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -444,6 +445,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -590,7 +529,7 @@ index 572378c9..9d099f11 100644

if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
@@ -457,9 +459,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -467,9 +469,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
@@ -602,7 +541,7 @@ index 572378c9..9d099f11 100644

// n_head_kv is optional, default to n_head
hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -512,7 +516,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -522,7 +526,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

@@ -611,7 +550,7 @@ index 572378c9..9d099f11 100644
if (hparams.n_rot != hparams.n_embd_head_k) {
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
}
@@ -575,6 +579,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -585,6 +589,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.use_kq_norm = false;
}
} break;
@@ -628,7 +567,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1562,7 +1576,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1581,7 +1595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -637,7 +576,7 @@ index 572378c9..9d099f11 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1815,6 +1829,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1840,6 +1854,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
@@ -690,7 +629,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4707,6 +4767,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4756,6 +4816,246 @@ struct llm_build_llama : public llm_graph_context {
}
};

@@ -832,7 +771,7 @@ index 572378c9..9d099f11 100644
+ // self attention layer
+
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -937,7 +876,16 @@ index 572378c9..9d099f11 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -13063,6 +13363,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -12496,7 +12796,7 @@ struct llm_build_solar : public llm_graph_context {
// self-attention
{
// rope freq factors for llama3; may return nullptr for llama2 and other models
- ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+ ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

// compute Q and K and RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -13128,6 +13428,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
@@ -948,7 +896,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13424,6 +13728,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13489,6 +13793,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
@@ -957,7 +905,7 @@ index 572378c9..9d099f11 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index 856e6042..6be91282 100644
index 43746c7d..9281e629 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -968,7 +916,7 @@ index 856e6042..6be91282 100644

struct llama_cparams;
struct llama_ubatch;
@@ -73,6 +74,7 @@ enum llm_type {
@@ -74,6 +75,7 @@ enum llm_type {
LLM_TYPE_40B,
LLM_TYPE_65B,
LLM_TYPE_70B,
@@ -976,7 +924,7 @@ index 856e6042..6be91282 100644
LLM_TYPE_236B,
LLM_TYPE_290B,
LLM_TYPE_314B,
@@ -314,6 +316,16 @@ struct llama_layer {
@@ -318,6 +320,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;

@@ -994,7 +942,7 @@ index 856e6042..6be91282 100644

struct llama_layer_convnext convnext;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7dc54227..223e1f3f 100644
index 820d5128..56531980 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -639,7 +639,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
@@ -1008,3 +956,72 @@ index 7dc54227..223e1f3f 100644
}

size_t total_size_org = 0;
diff --git a/tools/mtmd/llava.cpp b/tools/mtmd/llava.cpp
index ebef8b3c..b0eb79bb 100644
--- a/tools/mtmd/llava.cpp
+++ b/tools/mtmd/llava.cpp
@@ -462,7 +462,7 @@ struct llava_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -474,6 +474,7 @@ struct llava_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -497,7 +498,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
if (llama_decode(ctx_llama, llava_batch.batch)) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 7a328867..61ebdd43 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -58,7 +58,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
- decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+ decode_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -69,6 +69,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
+ /*n_embd =*/ n_embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -131,6 +132,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ batch.embd + offset * n_mmproj_embd,
+ /*n_embd =*/ batch.n_embd,
/*pos =*/ pos_ptr,
/*n_seq_id =*/ batch.n_seq_id + offset,
/*seq_id =*/ batch.seq_id + offset,
@@ -166,7 +168,8 @@ int32_t mtmd_helper_decode_image_chunk(
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(image_tokens);
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
- decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
+ decode_embd_batch batch_embd(encoded_embd, n_embd, n_tokens, n_past, seq_id);

const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
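Note (not part of the commit above): the change that recurs throughout these hunks is that this fork's llama_batch carries an explicit n_embd field, so embedding batches state their own width instead of relying on hparams.n_embd. Below is a minimal sketch of how a caller might fill such a batch, mirroring the llava_embd_batch helper shown in the llava.cpp hunks; the helper name embd_batch and the exact member order are assumptions taken from this diff, not upstream llama.cpp API.

// Sketch under the assumptions above: the fork's llama_batch has an n_embd
// member placed between embd and pos, as the hunks in this commit show.
#include <vector>
#include "llama.h"

struct embd_batch {
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id>   seq_id_0;
    std::vector<llama_seq_id *> seq_ids;
    std::vector<int8_t>         logits;
    llama_batch batch;

    embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        pos     .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1);
        logits  .resize(n_tokens);
        seq_id_0.resize(1);
        seq_id_0[0]       = seq_id;
        seq_ids[n_tokens] = nullptr;
        batch = {
            /*n_tokens =*/ n_tokens,
            /*tokens   =*/ nullptr,
            /*embd     =*/ embd,     // n_tokens * n_embd floats from the vision encoder
            /*n_embd   =*/ n_embd,   // width carried explicitly by the patched llama_batch
            /*pos      =*/ pos.data(),
            /*n_seq_id =*/ n_seq_id.data(),
            /*seq_id   =*/ seq_ids.data(),
            /*logits   =*/ logits.data(),
        };
        for (int32_t i = 0; i < n_tokens; i++) {
            batch.pos     [i] = pos_0 + i;       // consecutive positions for the embedding tokens
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data(); // all tokens belong to the same sequence
            batch.logits  [i] = false;           // no logits needed for image tokens
        }
    }
};

With a helper like this, evaluating an image-embedding slice follows the same pattern the hunks switch to: build the batch with the model's embedding width (for example from llama_model_n_embd) and pass its .batch member to llama_decode.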