llama: update vendored code to commit 40c6d79f (#7875)
@@ -4,47 +4,33 @@ Date: Thu, 6 Jun 2024 23:55:47 -0700
Subject: [PATCH] cuda

---
ggml/include/ggml-cuda.h | 2 ++
ggml/src/ggml-backend.c | 5 +++++
ggml/src/ggml-cuda.cu | 6 ++++--
3 files changed, 11 insertions(+), 2 deletions(-)
ggml/src/ggml-backend.cpp | 5 +++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++
2 files changed, 9 insertions(+)

diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h
index 71bb6dcf..08be0895 100644
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@@ -34,6 +34,8 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_typ
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

+GGML_API GGML_CALL int ggml_backend_cuda_reg_devices();
+
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index ba280e06..d5c3fe49 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -83,7 +83,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index fdb4b986..9b80fe07 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -106,7 +106,12 @@ void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer->iface.free_buffer != NULL) {
buffer->iface.free_buffer(buffer);
}
+
+// TODO: this needs to be freed in cuda and hipblas backends because
+// TODO: this needs to be freed in cuda and hip backends because
+// the cuda backend implementation compiled with msvc
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
free(buffer);
+#if !defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP)
delete buffer;
+#endif
}

size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 6efdab14..809d6ab1 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -469,6 +469,10 @@ GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer)
GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d6e4bfdd..52aec229 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -424,6 +424,10 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
+
@@ -53,13 +39,4 @@ index 6efdab14..809d6ab1 100644
+ free(buffer);
}

GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -3204,8 +3208,6 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params,
GGML_UNUSED(params);
}

-extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
-
GGML_CALL int ggml_backend_cuda_reg_devices() {
int device_count = ggml_backend_cuda_get_device_count();
//int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {

@@ -8,10 +8,10 @@ Subject: [PATCH] pretokenizer
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 4c0a1bb6..800dfb95 100644
index 6a6f4c2a..fa09f3b3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -6287,16 +6287,7 @@ static void llm_load_vocab(
@@ -6362,16 +6362,7 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
vocab.tokenizer_add_space_prefix = false;
vocab.tokenizer_clean_spaces = true;
@@ -29,7 +29,7 @@ index 4c0a1bb6..800dfb95 100644
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
} else if (
tokenizer_pre == "llama3" ||
@@ -6398,7 +6389,8 @@ static void llm_load_vocab(
@@ -6473,7 +6464,8 @@ static void llm_load_vocab(
vocab.tokenizer_add_bos = true;
vocab.tokenizer_clean_spaces = false;
} else {

@@ -4,14 +4,14 @@ Date: Mon, 16 Sep 2024 15:53:14 -0700
Subject: [PATCH] embeddings

---
src/llama.cpp | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
src/llama.cpp | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 800dfb95..a639522d 100644
index fa09f3b3..d1791af0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16920,7 +16920,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
@@ -17398,7 +17398,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -20,20 +20,15 @@ index 800dfb95..a639522d 100644
const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
@@ -17192,20 +17192,23 @@ static int llama_decode_internal(
// no output
@@ -17693,7 +17693,6 @@ static int llama_decode_internal(
res = nullptr;
embd = nullptr;
- } else if (cparams.embeddings) {
} else if (cparams.embeddings) {
- res = nullptr; // do not extract logits for embedding case
- embd = nullptr;
+ }
+
+ if (cparams.embeddings) {
embd = nullptr;
for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+ embd = ggml_graph_node(gf, i);
if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
- embd = ggml_graph_node(gf, i);
@@ -17701,11 +17700,15 @@ static int llama_decode_internal(
break;
}
}
@@ -46,6 +41,7 @@ index 800dfb95..a639522d 100644
+ if (!cparams.causal_attn) {
+ res = nullptr; // do not extract logits when not needed
+ }
+
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

ggml_backend_sched_alloc_graph(lctx.sched, gf);
ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -1,54 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Mon, 16 Sep 2024 15:53:12 -0700
Subject: [PATCH] metal

---
ggml/src/ggml-metal.m | 30 +++++++++++++-----------------
1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 9da08fe2..3a433703 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1720,27 +1720,23 @@ static void ggml_metal_encode_node(
// to the matrix-vector kernel
int ne11_mm_min = 1;

-#if 0
// the numbers below are measured on M2 Ultra for 7B and 13B models
// these numbers do not translate to other devices or model sizes
// TODO: need to find a better approach
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
- switch (src0t) {
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q4_0:
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
- case GGML_TYPE_Q5_0: // not tested yet
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
- default: ne11_mm_min = 1; break;
- }
+ switch (src0t) {
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+ case GGML_TYPE_Q5_0: // not tested yet
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
+ default: ne11_mm_min = 1; break;
}
-#endif

// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
@@ -8,12 +8,12 @@ Subject: [PATCH] clip-unicode
1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 14e02c8d..6e849d8e 100644
index d7c94352..427d5e02 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -44,6 +44,19 @@
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
@@ -56,6 +56,19 @@
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -31,7 +31,7 @@ index 14e02c8d..6e849d8e 100644
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
@@ -1225,8 +1238,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1242,8 +1255,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
gguf_free(ctx);
return nullptr;
}
@@ -62,7 +62,7 @@ index 14e02c8d..6e849d8e 100644
if (!fin) {
LOG_ERR("cannot open model file for loading tensors\n");
clip_free(new_clip);
@@ -1266,7 +1300,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
@@ -1283,7 +1317,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
@@ -1,24 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Wed, 12 Jun 2024 12:18:40 -0700
Subject: [PATCH] ggml-metal

---
ggml/src/ggml-metal.m | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 3a433703..829c5e39 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -392,8 +392,8 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
#if GGML_METAL_EMBED_LIBRARY
GGML_METAL_LOG_INFO("%s: using embedded metal library\n", __func__);

- extern const char ggml_metallib_start[];
- extern const char ggml_metallib_end[];
+ extern const char *ggml_metallib_start;
+ extern const char *ggml_metallib_end;

NSString * src = [[NSString alloc] initWithBytes:ggml_metallib_start length:(ggml_metallib_end-ggml_metallib_start) encoding:NSUTF8StringEncoding];
#else
@@ -11,14 +11,14 @@ tensor to store the scalar. the scalar is implemented a 1-dimensional
tensor with 2 elements dervied from the model's bskcn_tv configuration.
in general, the values are (bskcn_tv, 1 - bskcn_tv)
---
src/llama.cpp | 269 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 255 insertions(+), 14 deletions(-)
src/llama.cpp | 267 +++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 253 insertions(+), 14 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a639522d..83b80b59 100644
index d1791af0..b01770d0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
@@ -195,6 +195,7 @@ enum llm_arch {
LLM_ARCH_GRANITE,
LLM_ARCH_GRANITE_MOE,
LLM_ARCH_CHAMELEON,
@@ -26,7 +26,7 @@ index a639522d..83b80b59 100644
LLM_ARCH_UNKNOWN,
};

@@ -270,6 +271,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -249,6 +250,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_GRANITE, "granite" },
{ LLM_ARCH_GRANITE_MOE, "granitemoe" },
{ LLM_ARCH_CHAMELEON, "chameleon" },
@@ -34,7 +34,7 @@ index a639522d..83b80b59 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -327,6 +329,7 @@ enum llm_kv {
|
||||
@@ -306,6 +308,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
@@ -42,7 +42,7 @@ index a639522d..83b80b59 100644
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
@@ -421,20 +424,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -408,20 +411,21 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_RESIDUAL_SCALE, "%s.residual_scale" },
|
||||
{ LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" },
|
||||
|
||||
@@ -76,17 +76,17 @@ index a639522d..83b80b59 100644
|
||||
+ { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -608,6 +612,7 @@ enum llm_tensor {
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -603,6 +607,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_ENC_OUTPUT_NORM,
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
+ LLM_TENSOR_BSKCN_TV,
|
||||
};
|
||||
|
||||
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
@@ -1527,6 +1532,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
|
||||
@@ -1541,6 +1546,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
},
|
||||
},
|
||||
@@ -111,15 +111,15 @@ index a639522d..83b80b59 100644
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -2360,6 +2383,7 @@ enum e_model {
|
||||
@@ -2401,6 +2424,7 @@ enum e_model {
|
||||
MODEL_15B,
|
||||
MODEL_16B,
|
||||
MODEL_20B,
|
||||
+ MODEL_22B,
|
||||
MODEL_30B,
|
||||
MODEL_32B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
@@ -2409,6 +2433,8 @@ struct llama_hparams {
|
||||
@@ -2451,6 +2475,8 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
@@ -128,7 +128,7 @@ index a639522d..83b80b59 100644
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
uint32_t n_lora_kv = 0;
|
||||
@@ -2479,6 +2505,7 @@ struct llama_hparams {
|
||||
@@ -2521,6 +2547,7 @@ struct llama_hparams {
|
||||
if (this->n_head_arr != other.n_head_arr) return true;
|
||||
if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
@@ -136,7 +136,7 @@ index a639522d..83b80b59 100644
|
||||
|
||||
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
|
||||
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
||||
@@ -2588,6 +2615,14 @@ struct llama_hparams {
|
||||
@@ -2630,6 +2657,14 @@ struct llama_hparams {
|
||||
return ssm_d_state * ssm_d_inner;
|
||||
}
|
||||
}
|
||||
@@ -151,7 +151,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
@@ -2769,6 +2804,8 @@ struct llama_layer {
|
||||
@@ -2816,6 +2851,8 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_gate_scale;
|
||||
struct ggml_tensor * ffn_up_scale;
|
||||
struct ggml_tensor * ffn_down_scale;
|
||||
@@ -160,7 +160,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
// very similar to llama_batch,
|
||||
@@ -6134,6 +6171,21 @@ static void llm_load_hparams(
|
||||
@@ -6209,6 +6246,21 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -182,46 +182,51 @@ index a639522d..83b80b59 100644
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -8831,6 +8883,38 @@ static bool llm_load_tensors(
|
||||
@@ -7198,6 +7250,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
|
||||
};
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
@@ -9205,6 +9258,35 @@ static bool llm_load_tensors(
|
||||
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ }
|
||||
+ } break;
|
||||
+ case LLM_ARCH_SOLAR:
|
||||
+ {
|
||||
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
+
|
||||
+ // output
|
||||
+ {
|
||||
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < n_layer; ++i) {
|
||||
+ ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
+ ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
+
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+
|
||||
+ layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
@@ -16179,6 +16263,158 @@ struct llm_build_context {
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
@@ -16652,6 +16734,158 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
@@ -239,7 +244,7 @@ index a639522d..83b80b59 100644
|
||||
+ struct ggml_tensor * cur;
|
||||
+ struct ggml_tensor * inpL;
|
||||
+
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
+
|
||||
+ // inp_pos - contains the positions
|
||||
+ struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
@@ -380,7 +385,7 @@ index a639522d..83b80b59 100644
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -16443,6 +16679,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -16921,6 +17155,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_chameleon();
|
||||
} break;
|
||||
@@ -391,7 +396,7 @@ index a639522d..83b80b59 100644
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -19589,6 +19829,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
@@ -20132,6 +20370,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_GRANITE:
|
||||
case LLM_ARCH_GRANITE_MOE:
|
||||
case LLM_ARCH_CHAMELEON:
|
||||
@@ -4,14 +4,14 @@ Date: Wed, 9 Oct 2024 17:26:23 -0700
|
||||
Subject: [PATCH] conditional-fattn
|
||||
|
||||
---
|
||||
ggml/src/ggml-cuda.cu | 2 ++
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
|
||||
1 file changed, 2 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
|
||||
index 809d6ab1..fe77b81c 100644
|
||||
--- a/ggml/src/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda.cu
|
||||
@@ -2347,9 +2347,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index 52aec229..cbf4fddf 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2162,9 +2162,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_cuda_op_argsort(ctx, dst);
|
||||
break;
|
||||
llama/patches/0007-blas.patch (new file, 26 lines)
@@ -0,0 +1,26 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Mon, 30 Sep 2024 16:31:04 -0700
|
||||
Subject: [PATCH] blas
|
||||
|
||||
---
|
||||
ggml/src/ggml-blas/ggml-blas.cpp | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
index ec158dfa..b3ac1fa4 100644
|
||||
--- a/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
|
||||
@@ -1,3 +1,5 @@
|
||||
+#ifdef GGML_USE_BLAS
|
||||
+
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-blas.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
@@ -515,3 +517,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||
}
|
||||
|
||||
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)
|
||||
+
|
||||
+#endif // GGML_USE_BLAS
|
||||
\ No newline at end of file
|
||||
@@ -12,29 +12,46 @@ kv cache once per run
|
||||
|
||||
remaining is to implement the cross attention mask
|
||||
---
|
||||
examples/llava/llava.cpp | 2 +-
|
||||
examples/llava/llava.cpp | 5 +-
|
||||
include/llama.h | 5 +
|
||||
src/llama.cpp | 447 +++++++++++++++++++++++++++++++++++++--
|
||||
3 files changed, 436 insertions(+), 18 deletions(-)
|
||||
src/llama.cpp | 477 +++++++++++++++++++++++++++++++++++++--
|
||||
3 files changed, 467 insertions(+), 20 deletions(-)
|
||||
|
||||
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
|
||||
index 8558c6bd..37b2f2e2 100644
|
||||
index 4ca53a0b..d56644a8 100644
|
||||
--- a/examples/llava/llava.cpp
|
||||
+++ b/examples/llava/llava.cpp
|
||||
@@ -409,7 +409,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
if (n_eval > n_batch) {
|
||||
@@ -412,7 +412,7 @@ struct llava_embd_batch {
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
- llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
+ llava_embd_batch(float * embd, int32_t n_embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
@@ -424,6 +424,7 @@ struct llava_embd_batch {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
+ /*n_embd =*/ n_embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
@@ -447,7 +448,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
- llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
+ llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), n_embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
- llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
+ llava_embd_batch llava_batch = llava_embd_batch(embd, n_embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 7cae1bbe..aca09310 100644
|
||||
index e85f459f..aba85f86 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -240,6 +240,7 @@ extern "C" {
|
||||
@@ -245,6 +245,7 @@ extern "C" {
|
||||
|
||||
llama_token * token;
|
||||
float * embd;
|
||||
@@ -42,7 +59,7 @@ index 7cae1bbe..aca09310 100644
|
||||
llama_pos * pos;
|
||||
int32_t * n_seq_id;
|
||||
llama_seq_id ** seq_id;
|
||||
@@ -423,6 +424,10 @@ extern "C" {
|
||||
@@ -419,6 +420,10 @@ extern "C" {
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params);
|
||||
|
||||
@@ -54,10 +71,10 @@ index 7cae1bbe..aca09310 100644
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 83b80b59..35748488 100644
|
||||
index b01770d0..46881642 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -169,6 +169,7 @@ static std::string format(const char * fmt, ...) {
|
||||
@@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) {
|
||||
|
||||
enum llm_arch {
|
||||
LLM_ARCH_LLAMA,
|
||||
@@ -65,7 +82,7 @@ index 83b80b59..35748488 100644
|
||||
LLM_ARCH_FALCON,
|
||||
LLM_ARCH_BAICHUAN,
|
||||
LLM_ARCH_GROK,
|
||||
@@ -223,6 +224,7 @@ enum llm_arch {
|
||||
@@ -201,6 +202,7 @@ enum llm_arch {
|
||||
|
||||
static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
@@ -73,7 +90,7 @@ index 83b80b59..35748488 100644
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GROK, "grok" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
@@ -330,6 +332,7 @@ enum llm_kv {
|
||||
@@ -309,6 +311,7 @@ enum llm_kv {
|
||||
LLM_KV_ATTENTION_SLIDING_WINDOW,
|
||||
LLM_KV_ATTENTION_SCALE,
|
||||
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
|
||||
@@ -81,15 +98,15 @@ index 83b80b59..35748488 100644
|
||||
|
||||
LLM_KV_ROPE_DIMENSION_COUNT,
|
||||
LLM_KV_ROPE_FREQ_BASE,
|
||||
@@ -439,6 +442,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
@@ -426,6 +429,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
|
||||
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
|
||||
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
|
||||
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
|
||||
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -613,6 +617,14 @@ enum llm_tensor {
|
||||
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
|
||||
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
|
||||
@@ -608,6 +612,14 @@ enum llm_tensor {
|
||||
LLM_TENSOR_CLS,
|
||||
LLM_TENSOR_CLS_OUT,
|
||||
LLM_TENSOR_BSKCN_TV,
|
||||
@@ -103,8 +120,8 @@ index 83b80b59..35748488 100644
|
||||
+ LLM_TENSOR_CROSS_ATTN_MLP_GATE,
|
||||
};
|
||||
|
||||
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
@@ -642,6 +654,40 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
|
||||
@@ -637,6 +649,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
@@ -145,7 +162,7 @@ index 83b80b59..35748488 100644
|
||||
{
|
||||
LLM_ARCH_BAICHUAN,
|
||||
{
|
||||
@@ -2390,6 +2436,7 @@ enum e_model {
|
||||
@@ -2432,6 +2478,7 @@ enum e_model {
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
@@ -153,7 +170,7 @@ index 83b80b59..35748488 100644
|
||||
MODEL_236B,
|
||||
MODEL_314B,
|
||||
MODEL_SMALL,
|
||||
@@ -2434,6 +2481,7 @@ struct llama_hparams {
|
||||
@@ -2476,6 +2523,7 @@ struct llama_hparams {
|
||||
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
|
||||
|
||||
std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
|
||||
@@ -161,7 +178,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
uint32_t n_layer_dense_lead = 0;
|
||||
uint32_t n_lora_q = 0;
|
||||
@@ -2502,10 +2550,11 @@ struct llama_hparams {
|
||||
@@ -2544,10 +2592,11 @@ struct llama_hparams {
|
||||
if (this->n_expert != other.n_expert) return true;
|
||||
if (this->n_expert_used != other.n_expert_used) return true;
|
||||
|
||||
@@ -169,15 +186,15 @@ index 83b80b59..35748488 100644
|
||||
- if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
- if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
- if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->n_head_arr != other.n_head_arr) return true;
|
||||
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
+ if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->n_head_arr != other.n_head_arr) return true;
|
||||
+ if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
|
||||
+ if (this->n_ff_arr != other.n_ff_arr) return true;
|
||||
+ if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
|
||||
+ if (this->cross_attn_layers != other.cross_attn_layers) return true;
|
||||
|
||||
if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
|
||||
if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
|
||||
@@ -2623,6 +2672,10 @@ struct llama_hparams {
|
||||
@@ -2665,6 +2714,10 @@ struct llama_hparams {
|
||||
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
@@ -188,7 +205,7 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
|
||||
@@ -2652,6 +2705,9 @@ struct llama_cparams {
|
||||
@@ -2694,6 +2747,9 @@ struct llama_cparams {
|
||||
bool offload_kqv;
|
||||
bool flash_attn;
|
||||
bool no_perf;
|
||||
@@ -198,7 +215,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
enum llama_pooling_type pooling_type;
|
||||
|
||||
@@ -2806,6 +2862,16 @@ struct llama_layer {
|
||||
@@ -2853,6 +2909,16 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_down_scale;
|
||||
|
||||
struct ggml_tensor * bskcn_tv;
|
||||
@@ -215,7 +232,7 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
// very similar to llama_batch,
|
||||
@@ -3452,6 +3518,8 @@ struct llama_context {
|
||||
@@ -3439,6 +3505,8 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
|
||||
struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
|
||||
struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
|
||||
@@ -224,13 +241,34 @@ index 83b80b59..35748488 100644
|
||||
};
|
||||
|
||||
struct llama_lora_weight {
|
||||
@@ -3686,6 +3754,18 @@ static bool llama_kv_cache_init(
|
||||
@@ -3577,6 +3645,39 @@ static bool llama_kv_cache_init(
|
||||
cache.v_l.reserve(n_layer);
|
||||
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
+ // for cross attention layers
|
||||
+ if (model.arch == LLM_ARCH_MLLAMA && hparams.cross_attention_layers(i)) {
|
||||
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
+ const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
+ const llama_model::buft_list_t * buft_list;
|
||||
+ if (offload) {
|
||||
+ buft_list = model.dev_layer.at(i).buft_list;
|
||||
+ } else {
|
||||
+ buft_list = &model.cpu_buft_list;
|
||||
+ }
|
||||
+ ggml_backend_buffer_type_t buft = select_buft(*buft_list,
|
||||
+ [&](ggml_context * ctx) {
|
||||
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
+ if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
||||
+ return k;
|
||||
+ }
|
||||
+ ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
||||
+ return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
|
||||
+ });
|
||||
+ ggml_context * ctx = ctx_for_buft(buft);
|
||||
+
|
||||
+ if (!ctx) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to create ggml context for kv cache\n", __func__);
|
||||
+ return false;
|
||||
+ }
|
||||
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_k, 6404, hparams.n_head_kv(i));
|
||||
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hparams.n_embd_head_v, 6404, hparams.n_head_kv(i));
|
||||
+ ggml_format_name(k, "cache_k_l%d", i);
|
||||
@@ -243,17 +281,17 @@ index 83b80b59..35748488 100644
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
|
||||
@@ -5460,12 +5540,14 @@ static void llm_load_hparams(
|
||||
@@ -5520,12 +5621,14 @@ static void llm_load_hparams(
|
||||
}
|
||||
|
||||
// zero-out the per-layer hparams
|
||||
- std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
- std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
- std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
|
||||
+ std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
|
||||
+ std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
|
||||
+ std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
|
||||
+ std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);
|
||||
|
||||
- ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
|
||||
- ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
|
||||
@@ -263,7 +301,7 @@ index 83b80b59..35748488 100644
|
||||
|
||||
// n_head_kv is optional, default to n_head
|
||||
hparams.n_head_kv_arr = hparams.n_head_arr;
|
||||
@@ -5514,7 +5596,7 @@ static void llm_load_hparams(
|
||||
@@ -5574,7 +5677,7 @@ static void llm_load_hparams(
|
||||
|
||||
ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
|
||||
|
||||
@@ -272,7 +310,7 @@ index 83b80b59..35748488 100644
|
||||
if (hparams.n_rot != hparams.n_embd_head_k) {
|
||||
throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
|
||||
}
|
||||
@@ -5554,6 +5636,16 @@ static void llm_load_hparams(
|
||||
@@ -5614,6 +5717,16 @@ static void llm_load_hparams(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
@@ -289,63 +327,78 @@ index 83b80b59..35748488 100644
|
||||
case LLM_ARCH_MINICPM:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@@ -7249,6 +7341,55 @@ static bool llm_load_tensors(
|
||||
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
@@ -7250,7 +7363,15 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
|
||||
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
|
||||
// this tensor is loaded for T5, but never used
|
||||
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
|
||||
- {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}
|
||||
+ {LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_K_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_K_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_O_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_Q_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_Q_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_V_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
+ {LLM_TENSOR_CROSS_ATTN_MLP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
|
||||
};
|
||||
|
||||
// checks if the weight tensor can be used with the specified buffer type and device
|
||||
@@ -7754,6 +7875,53 @@ static bool llm_load_tensors(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
+ case LLM_ARCH_MLLAMA:
|
||||
+ {
|
||||
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8});
|
||||
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
|
||||
+
|
||||
+ // output
|
||||
+ {
|
||||
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
+
|
||||
+ // if output is NULL, init from the input tok embed
|
||||
+ if (model.output == NULL) {
|
||||
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
+ model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ for (int i = 0; i < n_layer; ++i) {
|
||||
+ ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
+ ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
+
|
||||
+ auto & layer = model.layers[i];
|
||||
+
|
||||
+ if (hparams.cross_attention_layers(i)) {
|
||||
+ layer.cross_attn_k_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128});
|
||||
+ layer.cross_attn_k_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024});
|
||||
+ layer.cross_attn_o_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd});
|
||||
+ layer.cross_attn_q_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128});
|
||||
+ layer.cross_attn_q_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd});
|
||||
+ layer.cross_attn_v_proj = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024});
|
||||
+ layer.cross_attn_attn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1});
|
||||
+ layer.cross_attn_mlp_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1});
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
|
||||
+ layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
+ layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
+ layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
|
||||
+ layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
|
||||
+ layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
|
||||
+ layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
|
||||
+ layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ } else {
|
||||
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
||||
+ layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
||||
+ layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
||||
+ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
||||
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
||||
+ layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
||||
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
+ }
|
||||
+ }
|
||||
+ } break;
|
||||
case LLM_ARCH_GROK:
|
||||
case LLM_ARCH_MINICPM3:
|
||||
{
|
||||
if (n_expert == 0) {
|
||||
@@ -9093,7 +9234,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
||||
@@ -9463,7 +9631,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||
@@ -354,7 +407,7 @@ index 83b80b59..35748488 100644
|
||||
}
|
||||
|
||||
if (params.vocab_only) {
|
||||
@@ -9193,6 +9334,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
@@ -9546,6 +9714,21 @@ static struct ggml_tensor * llm_build_inp_embd(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
@@ -376,7 +429,7 @@ index 83b80b59..35748488 100644
|
||||
static void llm_build_kv_store(
|
||||
struct ggml_context * ctx,
|
||||
const llama_hparams & hparams,
|
||||
@@ -10167,6 +10323,7 @@ struct llm_build_context {
|
||||
@@ -10513,6 +10696,7 @@ struct llm_build_context {
|
||||
lctx.inp_pos_bucket = nullptr;
|
||||
lctx.inp_embd_enc = nullptr;
|
||||
lctx.inp_KQ_mask_cross = nullptr;
|
||||
@@ -384,18 +437,10 @@ index 83b80b59..35748488 100644
|
||||
}
|
||||
|
||||
void free() {
|
||||
@@ -10754,6 +10911,239 @@ struct llm_build_context {
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
@@ -10992,6 +11176,240 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
+ cb(cur, "result_output", -1);
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+
|
||||
+ return gf;
|
||||
+ }
|
||||
+
|
||||
+ struct ggml_cgraph * build_mllama() {
|
||||
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
+
|
||||
@@ -410,7 +455,7 @@ index 83b80b59..35748488 100644
|
||||
+ struct ggml_tensor * inpL;
|
||||
+ struct ggml_tensor * inpCAS;
|
||||
+
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
+ inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
|
||||
+
|
||||
+ // inp_pos - contains the positions
|
||||
@@ -429,7 +474,7 @@ index 83b80b59..35748488 100644
|
||||
+ cb(cur, "attn_norm", il);
|
||||
+
|
||||
+ if (hparams.cross_attention_layers(il)) {
|
||||
+ if (!batch.embd && !cparams.cross_attn) {
|
||||
+ if (!ubatch.embd && !cparams.cross_attn) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
@@ -447,7 +492,7 @@ index 83b80b59..35748488 100644
|
||||
+ cb(Qcur, "Qcur", il);
|
||||
+
|
||||
+ struct ggml_tensor * Kcur, * Vcur;
|
||||
+ if (batch.embd) {
|
||||
+ if (ubatch.embd) {
|
||||
+ Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
|
||||
+ cb(Kcur, "Kcur", il);
|
||||
+
|
||||
@@ -621,10 +666,19 @@ index 83b80b59..35748488 100644
|
||||
+ LLM_NORM_RMS, cb, -1);
|
||||
+ cb(cur, "result_norm", -1);
|
||||
+
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
@@ -16501,6 +16891,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
+ // lm_head
|
||||
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
+ cb(cur, "result_output", -1);
|
||||
+
|
||||
+ ggml_build_forward_expand(gf, cur);
|
||||
+
|
||||
+ return gf;
|
||||
+ }
|
||||
+
|
||||
struct ggml_cgraph * build_baichuan() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
@@ -16973,6 +17391,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_llama();
|
||||
} break;
|
||||
@@ -635,14 +689,14 @@ index 83b80b59..35748488 100644
|
||||
case LLM_ARCH_BAICHUAN:
|
||||
{
|
||||
result = llm.build_baichuan();
|
||||
@@ -16761,10 +17155,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
|
||||
@@ -17237,10 +17659,19 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)
|
||||
}
|
||||
|
||||
if (batch.embd) {
|
||||
if (ubatch.embd) {
|
||||
- const int64_t n_embd = hparams.n_embd;
|
||||
- const int64_t n_tokens = batch.n_tokens;
|
||||
- const int64_t n_tokens = ubatch.n_tokens;
|
||||
+ if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
|
||||
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, batch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||
+ ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
|
||||
+ // zero out inp_embd since it's not used
|
||||
+ float * inp_embd_data = (float *)lctx.inp_embd->data;
|
||||
+ for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
|
||||
@@ -650,24 +704,24 @@ index 83b80b59..35748488 100644
|
||||
+ }
|
||||
+ } else {
|
||||
+ const int64_t n_embd = hparams.n_embd;
|
||||
+ const int64_t n_tokens = batch.n_tokens;
|
||||
+ const int64_t n_tokens = ubatch.n_tokens;
|
||||
|
||||
- ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||
+ ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
|
||||
- ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
+ }
}

if (batch.pos && lctx.inp_pos) {
@@ -17345,7 +17748,7 @@ static int llama_decode_internal(
if (ubatch.pos && lctx.inp_pos) {
@@ -17841,7 +18272,7 @@ static int llama_decode_internal(
n_outputs = 1;
}

- lctx.sbatch.from_batch(batch_all, n_embd,
+ lctx.sbatch.from_batch(batch_all, batch_all.n_embd,
- lctx.sbatch.from_batch(batch, n_embd,
+ lctx.sbatch.from_batch(batch, batch.n_embd,
/* simple_split */ !kv_self.recurrent,
/* logits_all */ n_outputs == n_tokens_all);

@@ -17638,7 +18041,7 @@ static int llama_encode_internal(
@@ -18151,7 +18582,7 @@ static int llama_encode_internal(

const int64_t n_embd = hparams.n_embd;

@@ -676,7 +730,7 @@ index 83b80b59..35748488 100644

const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);

@@ -18648,7 +19051,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
@@ -19189,7 +19620,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (llama_model_has_encoder(&model)) {
n_attn_layer *= 3;
}
@@ -687,7 +741,7 @@ index 83b80b59..35748488 100644
}

size_t total_size_org = 0;
@@ -19814,6 +20219,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
@@ -20355,6 +20788,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {

// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
@@ -695,7 +749,7 @@ index 83b80b59..35748488 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
@@ -21230,6 +21636,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
@@ -21782,6 +22216,10 @@ void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
ctx->cparams.causal_attn = causal_attn;
}

@@ -705,8 +759,8 @@ index 83b80b59..35748488 100644
+
struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens,
@@ -21239,6 +21649,7 @@ struct llama_batch llama_batch_get_one(
int32_t n_tokens) {
@@ -21789,6 +22227,7 @@ struct llama_batch llama_batch_get_one(
/*n_tokens =*/ n_tokens,
/*tokens =*/ tokens,
/*embd =*/ nullptr,
@@ -714,7 +768,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21254,6 +21665,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21801,6 +22240,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
/*n_tokens =*/ 0,
/*tokens =*/ nullptr,
/*embd =*/ nullptr,
@@ -722,7 +776,7 @@ index 83b80b59..35748488 100644
/*pos =*/ nullptr,
/*n_seq_id =*/ nullptr,
/*seq_id =*/ nullptr,
@@ -21265,6 +21677,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_
@@ -21809,6 +22249,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
@@ -4,20 +4,21 @@ Date: Thu, 17 Oct 2024 17:19:25 -0700
Subject: [PATCH] add unpad operator

---
ggml/include/ggml.h | 10 ++++
ggml/src/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 +++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal.m | 33 ++++++++++++++
ggml/src/ggml-metal.metal | 45 ++++++++++++++++++
ggml/src/ggml.c | 93 +++++++++++++++++++++++++++++++++++++-
7 files changed, 230 insertions(+), 2 deletions(-)
ggml/include/ggml.h | 10 +++++
ggml/src/ggml-cpu/ggml-cpu.c | 57 ++++++++++++++++++++++++++++
ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++
ggml/src/ggml-cuda/pad.cu | 46 ++++++++++++++++++++++
ggml/src/ggml-cuda/pad.cuh | 1 +
ggml/src/ggml-metal/ggml-metal.m | 33 ++++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 45 ++++++++++++++++++++++
ggml/src/ggml.c | 25 +++++++++++-
8 files changed, 219 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index ce3d92cb..962cb5f7 100644
index 65cb92c4..acbcccc6 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -506,6 +506,7 @@ extern "C" {
@@ -499,6 +499,7 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
@@ -25,7 +26,7 @@ index ce3d92cb..962cb5f7 100644
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@@ -1764,6 +1765,15 @@ extern "C" {
@@ -1695,6 +1696,15 @@ extern "C" {
int p2,
int p3);

@@ -41,11 +42,93 @@ index ce3d92cb..962cb5f7 100644
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index fe77b81c..6e84af56 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2270,6 +2270,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 23ae2e10..111ff3b0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10439,6 +10439,58 @@ static void ggml_compute_forward_pad(
}
}

+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}

// ggml_compute_forward_arange

@@ -12535,6 +12587,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -12877,6 +12933,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cbf4fddf..9ca6cb77 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2085,6 +2085,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -55,7 +138,7 @@ index fe77b81c..6e84af56 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -2992,6 +2995,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
@@ -3012,6 +3015,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
@@ -126,35 +209,35 @@ index 8fd386b0..e2ededc3 100644

void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index 829c5e39..25702d85 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -193,6 +193,7 @@
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 093ae900..cb9a1307 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -310,6 +310,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
GGML_METAL_KERNEL_TYPE_PAD_F32,
+ GGML_METAL_KERNEL_TYPE_UNPAD_F32,
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -689,6 +690,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
@@ -877,6 +878,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONV_TRANSPOSE_1D_F16_F32, conv_transpose_1d_f16_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UNPAD_F32, unpad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -846,6 +848,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
return false;
@@ -1099,6 +1101,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
@@ -2655,6 +2658,36 @@ static void ggml_metal_encode_node(
@@ -3258,6 +3261,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -191,11 +274,11 @@ index 829c5e39..25702d85 100644
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal
index 2b200032..09887511 100644
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -2029,6 +2029,51 @@ kernel void kernel_pad_f32(
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 5caa0846..47038c31 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2897,6 +2897,51 @@ kernel void kernel_pad_f32(
}
}

@@ -248,10 +331,10 @@ index 2b200032..09887511 100644
device char * dst,
constant int64_t & ne0,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index bcbc32d9..f4864ac8 100644
index 1a9a7efa..ea2b259b 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2997,6 +2997,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -950,6 +950,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"POOL_2D_BACK",
"UPSCALE",
"PAD",
@@ -259,16 +342,16 @@ index bcbc32d9..f4864ac8 100644
"ARANGE",
"TIMESTEP_EMBEDDING",
"ARGSORT",
@@ -3030,7 +3031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
@@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};

-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -3091,6 +3092,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1045,6 +1046,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"pool_2d_back(x)",
"upscale(x)",
"pad(x)",
@@ -276,16 +359,16 @@ index bcbc32d9..f4864ac8 100644
"arange(start, stop, step)",
"timestep_embedding(timesteps, dim, max_period)",
"argsort(x)",
@@ -3124,7 +3126,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
@@ -1078,7 +1080,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};

-static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
+static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -6955,6 +6957,32 @@ struct ggml_tensor * ggml_pad(
@@ -4097,6 +4099,25 @@ struct ggml_tensor * ggml_pad(
return result;
}

@@ -295,12 +378,6 @@ index bcbc32d9..f4864ac8 100644
+    struct ggml_context * ctx,
+    struct ggml_tensor * a,
+    int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+
+    if (a->grad) {
+        GGML_ABORT("fatal error"); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
+        a->ne[0] - p0,
@@ -309,7 +386,6 @@ index bcbc32d9..f4864ac8 100644
+        a->ne[3] - p3);
+
+    result->op = GGML_OP_UNPAD;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
@@ -318,92 +394,3 @@ index bcbc32d9..f4864ac8 100644
// ggml_arange

struct ggml_tensor * ggml_arange(
@@ -15312,6 +15340,58 @@ static void ggml_compute_forward_pad(
}
}

+static void ggml_compute_forward_unpad_f32(
+    const struct ggml_compute_params *params,
+    struct ggml_tensor *dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    GGML_ASSERT( dst->nb[0] == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    float * dst_ptr = (float *) dst->data;
+
+    // TODO: optimize
+
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+
+                    const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        dst_ptr[dst_idx] = *src_ptr;
+                    }
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_unpad(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_unpad_f32(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}

// ggml_compute_forward_arange

@@ -17294,6 +17374,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_pad(params, tensor);
} break;
+ case GGML_OP_UNPAD:
+ {
+ ggml_compute_forward_unpad(params, tensor);
+ } break;
case GGML_OP_ARANGE:
{
ggml_compute_forward_arange(params, tensor);
@@ -18369,6 +18453,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ABORT("fatal error"); // TODO: not implemented
}
+ case GGML_OP_UNPAD:
+ {
+ GGML_ABORT("fatal error"); // TODO: not implemented
+ }
case GGML_OP_ARANGE:
{
GGML_ABORT("fatal error"); // TODO: not implemented
@@ -19165,6 +19253,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
+ case GGML_OP_UNPAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
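Note: the hunks above carry the whole unpad operator. A minimal usage sketch (tensor and context setup omitted; the tensor names and padding values are illustrative, only the ggml_unpad signature comes from the patch):

    // Given a 4-D tensor `a`, ggml_unpad trims p0..p3 elements from the end of each
    // dimension, so the result has shape (ne0 - p0, ne1 - p1, ne2 - p2, ne3 - p3).
    struct ggml_tensor * cropped = ggml_unpad(ctx, a, /*p0=*/2, /*p1=*/0, /*p2=*/0, /*p3=*/0);
    // The forward pass copies the surviving region (CPU, CUDA, Metal); the backward
    // pass is not implemented and aborts, as the ggml_compute_backward hunk shows.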
@@ -1,25 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Mon, 30 Sep 2024 16:31:04 -0700
Subject: [PATCH] blas

---
ggml/src/ggml-blas.cpp | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp
index 6d99c6be..8e1ab99d 100644
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -1,3 +1,5 @@
+#ifdef GGML_USE_BLAS
+
#include "ggml-impl.h"
#include "ggml-blas.h"
#include "ggml-backend-impl.h"
@@ -366,3 +368,5 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
+
+#endif
@@ -7,11 +7,11 @@ On windows compiled with gcc the c++ regex library failed to handle
the characters
---
src/llama-vocab.cpp | 2 +-
src/unicode.cpp | 21 +++++++++++++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
src/unicode.cpp | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d2f34ddd..3ef6af19 100644
index d1dc9627..05ef0e71 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -389,7 +389,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -24,7 +24,7 @@ index d2f34ddd..3ef6af19 100644
"\\s+$",
"[一-龥ࠀ-一가-]+",
diff --git a/src/unicode.cpp b/src/unicode.cpp
index f4e941cd..9d78ff16 100644
index 3d459263..51dd81fb 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -2,6 +2,11 @@
@@ -39,7 +39,7 @@ index f4e941cd..9d78ff16 100644
#include "unicode.h"
#include "unicode-data.h"

@@ -201,8 +206,24 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
@@ -201,6 +206,22 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}

static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
@@ -58,7 +58,13 @@ index f4e941cd..9d78ff16 100644
+ free(wbuf);
+ return ret;
+#else
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
@@ -214,6 +235,7 @@ static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#endif

return conv.from_bytes(s);
+#endif
}
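Note: on Windows builds with gcc, the patch above routes the UTF-8 conversion through the Win32 API instead of std::wstring_convert. A rough sketch of that conversion path, using the standard two-call MultiByteToWideChar pattern (the function name and buffer handling here are illustrative; the patch itself uses a malloc'd wbuf, as the free(wbuf) line shows):

    #include <windows.h>
    #include <string>

    static std::wstring wstring_from_utf8_win32(const std::string & s) {
        // First call measures the required length, second call performs the conversion.
        int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
        std::wstring ret(n, L'\0');
        MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &ret[0], n);
        return ret;
    }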
llama/patches/0011-relative-include-paths.patch (new file, 64 lines)
@@ -0,0 +1,64 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 3 Dec 2024 21:30:51 -0800
Subject: [PATCH] relative include paths

---
ggml/src/ggml-cpu/ggml-cpu-aarch64.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.c | 2 +-
ggml/src/ggml-cpu/ggml-cpu.cpp | 2 +-
ggml/src/ggml-quants.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
index 11152385..bbf8934e 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
@@ -4,7 +4,7 @@
#include "ggml-quants.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"

#include <math.h>
#include <string.h>
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 111ff3b0..df0bd3c6 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -10,7 +10,7 @@
#include "ggml-quants.h"
#include "ggml-cpu-quants.h"
#include "ggml-threading.h"
-#include "amx/amx.h"
+#include "amx.h"
#include "ggml.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index 77e5d87a..91476ad0 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -3,7 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-cpu-aarch64.h"
#include "ggml-impl.h"
-#include "amx/amx.h"
+#include "amx.h"
#include <cctype>
#include <string>
#include <vector>
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6..49ab3daf 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,7 +3,7 @@

#include "ggml-quants.h"
#include "ggml-impl.h"
-#include "ggml-cpu/ggml-cpu-impl.h"
+#include "ggml-cpu-impl.h"
#include "ggml-cpu.h"

#include <math.h>