llama: update to commit 2016f07b (#10352)
@@ -65,10 +65,10 @@ index 273075f4..dd11f304 100644
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index cec36b36..4b057973 100644
index e2617b06..242e50a7 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -530,6 +530,7 @@ static void ggml_backend_cann_buffer_free_buffer(
@@ -800,6 +800,7 @@ static void ggml_backend_cann_buffer_free_buffer(
ggml_backend_cann_buffer_context* ctx =
(ggml_backend_cann_buffer_context*)buffer->context;
delete ctx;
@@ -76,7 +76,7 @@ index cec36b36..4b057973 100644
}

/**
@@ -1199,6 +1200,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
@@ -1472,6 +1473,7 @@ static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buf
*/
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
ACL_CHECK(aclrtFreeHost(buffer->context));
@@ -85,10 +85,10 @@ index cec36b36..4b057973 100644

/**
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index fafe9633..59a49560 100644
index a7febef7..31750b6f 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -533,6 +533,7 @@ struct ggml_backend_cuda_buffer_context {
@@ -534,6 +534,7 @@ struct ggml_backend_cuda_buffer_context {
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
delete ctx;
@@ -96,7 +96,7 @@ index fafe9633..59a49560 100644
}

static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
@@ -788,6 +789,7 @@ struct ggml_backend_cuda_split_buffer_context {
@@ -789,6 +790,7 @@ struct ggml_backend_cuda_split_buffer_context {
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
delete ctx;
@@ -104,7 +104,7 @@ index fafe9633..59a49560 100644
}

static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -1061,6 +1063,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_
@@ -1062,6 +1064,7 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_

static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
@@ -125,10 +125,10 @@ index 50579227..2799a0a5 100644

static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 9f1c6c6c..310afe8a 100644
index 266d8af4..12886cd3 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -4641,6 +4641,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
@@ -4759,6 +4759,7 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
}

free(ctx);
@@ -137,10 +137,10 @@ index 9f1c6c6c..310afe8a 100644

static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index b8b5cbd3..14d4561b 100644
index 05a2f4e6..392cc18d 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1443,6 +1443,7 @@ struct ggml_backend_opencl_buffer_context {
@@ -1940,6 +1940,7 @@ struct ggml_backend_opencl_buffer_context {
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
delete ctx;
@@ -149,10 +149,10 @@ index b8b5cbd3..14d4561b 100644

static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp
index 862b9b66..34536681 100644
index a0667b7d..bd83adc5 100644
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -443,6 +443,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -468,6 +468,7 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_FREE_BUFFER, &request, sizeof(request), nullptr, 0);
GGML_ASSERT(status);
delete ctx;
@@ -161,7 +161,7 @@ index 862b9b66..34536681 100644

static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 3e48a924..a3d182fc 100644
index 1de34c96..4600f61e 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -316,6 +316,7 @@ ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
@@ -189,10 +189,10 @@ index 3e48a924..a3d182fc 100644

static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 783a0ff8..8ac1e07e 100644
index 39f3cd34..c569a8a5 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -8639,6 +8639,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -8653,6 +8653,7 @@ static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
ggml_vk_destroy_buffer(ctx->dev_buffer);
delete ctx;
@@ -200,7 +200,7 @@ index 783a0ff8..8ac1e07e 100644
}

static void * ggml_backend_vk_buffer_get_base(ggml_backend_buffer_t buffer) {
@@ -8782,6 +8783,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
@@ -8796,6 +8797,7 @@ static const char * ggml_backend_vk_host_buffer_name(ggml_backend_buffer_t buffe
static void ggml_backend_vk_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
VK_LOG_MEMORY("ggml_backend_vk_host_buffer_free_buffer()");
ggml_vk_host_free(vk_instance.devices[0], buffer->context);
@@ -10,7 +10,7 @@ logs instead of throwing an error
1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 464ff01e..0125ee53 100644
index 48060517..a35b498c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1491,16 +1491,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {

@@ -11,10 +11,10 @@ instead of forcing one or the error
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4735e98e..65135172 100644
index 983385f8..32f59819 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1232,7 +1232,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1236,7 +1236,7 @@ int llama_context::decode(llama_batch & inp_batch) {
int64_t n_outputs_all = 0;

// count outputs
@@ -23,7 +23,7 @@ index 4735e98e..65135172 100644
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs_all += batch.logits[i] != 0;
}
@@ -1344,7 +1344,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1348,7 +1348,7 @@ int llama_context::decode(llama_batch & inp_batch) {
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}

@@ -32,7 +32,7 @@ index 4735e98e..65135172 100644
auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;

if (t_embd && res->get_embd_pooled()) {
@@ -1488,7 +1488,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1492,7 +1492,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -10,12 +10,12 @@ filesystems for paths that include wide characters
1 file changed, 39 insertions(+)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 49c90b75..4b72ea9f 100644
index 75970615..d57b4bd6 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -28,6 +28,19 @@
#include <cinttypes>
@@ -29,6 +29,19 @@
#include <limits>
#include <array>

+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
@@ -33,7 +33,7 @@ index 49c90b75..4b72ea9f 100644
struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL};

//#define CLIP_DEBUG_FUNCTIONS
@@ -1429,7 +1442,29 @@ struct clip_model_loader {
@@ -1430,7 +1443,29 @@ struct clip_model_loader {
{
std::vector<uint8_t> read_buf;

@@ -63,7 +63,7 @@ index 49c90b75..4b72ea9f 100644
if (!fin) {
throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str()));
}
@@ -1456,7 +1491,11 @@ struct clip_model_loader {
@@ -1457,7 +1492,11 @@ struct clip_model_loader {
ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
}
}
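The hunks above splice Windows-specific handling into clip.cpp so model files open correctly when the path contains wide (non-ASCII) characters. A minimal sketch of the usual technique, converting the UTF-8 path to UTF-16 with MultiByteToWideChar before opening the stream; the helper name and its exact placement in clip.cpp are assumptions, not the patch's verbatim code:

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <fstream>
#include <string>

// Hypothetical helper: convert a UTF-8 path to UTF-16 so MSVC's wide
// std::ifstream overload can open files under non-ASCII directories.
static std::wstring utf8_to_utf16(const std::string & s) {
    int n = MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), NULL, 0);
    std::wstring w(n, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, s.c_str(), (int) s.size(), &w[0], n);
    return w;
}

// usage sketch: std::ifstream fin(utf8_to_utf16(fname), std::ios::binary);
#endif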

@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 16:03:51 -0700
Date: Sun, 20 Apr 2025 16:11:09 -0700
Subject: [PATCH] solar-pro

adds support for the Solar Pro architecture
@@ -15,7 +15,7 @@ adds support for the Solar Pro architecture
7 files changed, 248 insertions(+)
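Solar Pro's distinguishing feature is the block skip connection, carried by the new %s.attention.block_skip_connection key and the n_bskcn_arr tables added below. A hedged sketch of how the n_bskcn predicate might be implemented; the names come from this patch, but the body is an illustrative guess, not the patch's verbatim code:

bool llama_hparams::n_bskcn(uint32_t n, uint32_t il) const {
    // n selects one of the four skip-connection tables, il is the layer
    // index; a nonzero entry is read here as "connection active at layer il"
    if (il < n_layer) {
        return n_bskcn_arr[n][il] > 0;
    }
    GGML_ABORT("fatal error");
}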

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a6fddc7f..0b0fedcd 100644
index 62e1480b..f754bc8f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -68,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -31,10 +31,10 @@ index a6fddc7f..0b0fedcd 100644
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+ { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -1478,6 +1480,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1482,6 +1484,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
},
},
@@ -59,7 +59,7 @@ index a6fddc7f..0b0fedcd 100644
{
LLM_ARCH_WAVTOKENIZER_DEC,
{
@@ -1671,6 +1691,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1660,6 +1680,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
@@ -68,7 +68,7 @@ index a6fddc7f..0b0fedcd 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 2c2099b3..74aa3dd0 100644
index 98ca00a1..439aaeab 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -72,6 +72,7 @@ enum llm_arch {
@@ -84,10 +84,10 @@ index 2c2099b3..74aa3dd0 100644
LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE,
+ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -340,6 +342,7 @@ enum llm_tensor {
@@ -344,6 +346,7 @@ enum llm_tensor {
LLM_TENSOR_ENC_OUTPUT_NORM,
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
@@ -115,10 +115,10 @@ index 90dfe7a7..8a667960 100644
if (il < n_layer) {
return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 4e0b5719..c3147cbc 100644
index 80fcd65d..6e278945 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -51,6 +51,8 @@ struct llama_hparams {
@@ -55,6 +55,8 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

@@ -127,7 +127,7 @@ index 4e0b5719..c3147cbc 100644
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
@@ -149,6 +151,9 @@ struct llama_hparams {
@@ -153,6 +155,9 @@ struct llama_hparams {
// dimension of the recurrent state embeddings
uint32_t n_embd_v_s() const;

@@ -150,10 +150,10 @@ index ea73a8a7..a012aeae 100644
llama_model_loader::llama_model_loader(
const std::string & fname,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b74dd72c..5fbd0055 100644
index 6b7bfecf..aba42819 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1372,6 +1372,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1374,6 +1374,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -175,7 +175,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -3701,6 +3716,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -3717,6 +3732,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

@@ -210,7 +210,7 @@ index b74dd72c..5fbd0055 100644
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
@@ -12244,6 +12287,165 @@ struct llm_build_chameleon : public llm_graph_context {
@@ -12296,6 +12339,165 @@ struct llm_build_chameleon : public llm_graph_context {
}
};

@@ -309,14 +309,14 @@ index b74dd72c..5fbd0055 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ cb(cur, "attn_out", il);
+ }
+
@@ -376,7 +376,7 @@ index b74dd72c..5fbd0055 100644
struct llm_build_wavtokenizer_dec : public llm_graph_context {
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
ggml_tensor * cur;
@@ -12993,6 +13195,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13045,6 +13247,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
} break;
@@ -387,7 +387,7 @@ index b74dd72c..5fbd0055 100644
case LLM_ARCH_WAVTOKENIZER_DEC:
{
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
@@ -13139,6 +13345,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13191,6 +13397,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_CHAMELEON:
@@ -396,7 +396,7 @@ index b74dd72c..5fbd0055 100644
return LLAMA_ROPE_TYPE_NORM;

diff --git a/src/llama-model.h b/src/llama-model.h
index 0f18dac1..e08d4ae4 100644
index fd82d106..5865d5e9 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -62,6 +62,7 @@ enum llm_type {
@@ -407,7 +407,7 @@ index 0f18dac1..e08d4ae4 100644
LLM_TYPE_30B,
LLM_TYPE_32B,
LLM_TYPE_34B,
@@ -305,6 +306,8 @@ struct llama_layer {
@@ -307,6 +308,8 @@ struct llama_layer {
struct ggml_tensor * ffn_up_scale = nullptr;
struct ggml_tensor * ffn_down_scale = nullptr;


@@ -1,6 +1,6 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 19:27:12 -0700
Date: Sun, 20 Apr 2025 16:12:36 -0700
Subject: [PATCH] add mllama support

adds support for the llama 3.2 vision architecture
@@ -28,7 +28,7 @@ adds support for the llama 3.2 vision architecture
20 files changed, 475 insertions(+), 22 deletions(-)
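The core addition is the %s.attention.cross_attention_layers key: the list of layer indices that cross-attend to the vision encoder, with every other layer left as ordinary self-attention. The patch's own membership test (its std::find line appears verbatim in a later hunk) amounts to the following sketch; the fixed capacity here stands in for LLAMA_MAX_LAYERS:

#include <algorithm>
#include <array>
#include <cstdint>

// A layer cross-attends iff its index appears in the cross_attention_layers
// array loaded from the GGUF metadata.
static bool is_cross_attn_layer(const std::array<uint32_t, 512> & cross_attn_layers,
                                uint32_t il) {
    return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il)
        != cross_attn_layers.end();
}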

diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp
index 91a07e2a..13127c7b 100644
index 3d566475..654d1358 100644
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -106,7 +106,7 @@ struct decode_embd_batch {
@@ -79,10 +79,10 @@ index 03a22cbb..5eb40bcd 100644
LOG_ERR("%s : failed to eval\n", __func__);
return false;
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 114c274b..a0e649ad 100644
index 3fd5bebc..f0cec596 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -213,7 +213,7 @@ struct decode_embd_batch {
@@ -233,7 +233,7 @@ struct decode_embd_batch {
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
@@ -91,7 +91,7 @@ index 114c274b..a0e649ad 100644
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
@@ -225,6 +225,7 @@ struct decode_embd_batch {
@@ -245,6 +245,7 @@ struct decode_embd_batch {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
@@ -99,9 +99,9 @@ index 114c274b..a0e649ad 100644
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
@@ -291,7 +292,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
@@ -311,7 +312,8 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,

int32_t n_tokens = chunk.tokens_image->n_tokens();
int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
float * embd = mtmd_get_output_embd(ctx);
- decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
+ int n_embd = llama_model_n_embd(llama_get_model(lctx));
@@ -158,7 +158,7 @@ index 5657fbf0..f91896e4 100644
LLAMA_API void llama_free(struct llama_context * ctx);

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0b0fedcd..c1f78618 100644
index f754bc8f..0568565f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -6,6 +6,7 @@
@@ -174,10 +174,10 @@ index 0b0fedcd..c1f78618 100644
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection" },
+ { LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, "%s.attention.cross_attention_layers" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -269,6 +271,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -271,6 +273,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -218,7 +218,7 @@ index 0b0fedcd..c1f78618 100644
{
LLM_ARCH_DECI,
{
@@ -1692,6 +1728,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
@@ -1681,6 +1717,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_BSKCN_TV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -234,7 +234,7 @@ index 0b0fedcd..c1f78618 100644
{LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 74aa3dd0..f987844d 100644
index 439aaeab..6a989034 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -11,6 +11,7 @@
@@ -250,10 +250,10 @@ index 74aa3dd0..f987844d 100644
LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
+ LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -343,6 +345,14 @@ enum llm_tensor {
@@ -347,6 +349,14 @@ enum llm_tensor {
LLM_TENSOR_CLS,
LLM_TENSOR_CLS_OUT,
LLM_TENSOR_BSKCN_TV,
@@ -297,10 +297,10 @@ index 01d5ca57..8682b0e6 100644
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
}
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 65135172..afe6f552 100644
index 32f59819..0343ba8a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -858,7 +858,7 @@ float * llama_context::get_logits_ith(int32_t i) {
@@ -862,7 +862,7 @@ float * llama_context::get_logits_ith(int32_t i) {
throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
}

@@ -309,7 +309,7 @@ index 65135172..afe6f552 100644
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
@@ -979,6 +979,10 @@ void llama_context::set_warmup(bool value) {
@@ -983,6 +983,10 @@ void llama_context::set_warmup(bool value) {
cparams.warmup = value;
}

@@ -320,7 +320,7 @@ index 65135172..afe6f552 100644
void llama_context::set_adapter_lora(
llama_adapter_lora * adapter,
float scale) {
@@ -1054,7 +1058,7 @@ int llama_context::encode(llama_batch & inp_batch) {
@@ -1058,7 +1062,7 @@ int llama_context::encode(llama_batch & inp_batch) {

const int64_t n_embd = hparams.n_embd;

@@ -329,7 +329,7 @@ index 65135172..afe6f552 100644

const llama_ubatch ubatch = sbatch.split_simple(n_tokens);

@@ -1194,10 +1198,9 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1198,10 +1202,9 @@ int llama_context::decode(llama_batch & inp_batch) {

const llama_batch & batch = batch_allocr.batch;

@@ -341,7 +341,7 @@ index 65135172..afe6f552 100644

const int64_t n_tokens_all = batch.n_tokens;
const int64_t n_embd = hparams.n_embd;
@@ -1245,7 +1248,7 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1249,7 +1252,7 @@ int llama_context::decode(llama_batch & inp_batch) {

const bool logits_all = n_outputs_all == n_tokens_all;

@@ -350,7 +350,7 @@ index 65135172..afe6f552 100644
/* simple_split */ !kv_self->recurrent,
/* logits_all */ logits_all);

@@ -1479,12 +1482,11 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1483,12 +1486,11 @@ int llama_context::decode(llama_batch & inp_batch) {

int32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
@@ -364,7 +364,7 @@ index 65135172..afe6f552 100644
const auto n_embd = hparams.n_embd;

// TODO: use a per-batch flag for logits presence instead
@@ -1554,7 +1556,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
@@ -1558,7 +1560,7 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
void llama_context::output_reorder() {
auto & out_ids = sbatch.out_ids;
if (!out_ids.empty()) {
@@ -373,7 +373,7 @@ index 65135172..afe6f552 100644
const uint32_t n_embd = model.hparams.n_embd;

GGML_ASSERT((size_t) n_outputs == out_ids.size());
@@ -2061,7 +2063,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
@@ -2065,7 +2067,7 @@ size_t llama_context::state_write_data(llama_io_write_i & io) {
{
LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

@@ -382,7 +382,7 @@ index 65135172..afe6f552 100644

io.write(&logits_size, sizeof(logits_size));

@@ -2244,6 +2246,7 @@ llama_context_params llama_context_default_params() {
@@ -2248,6 +2250,7 @@ llama_context_params llama_context_default_params() {
/*.offload_kqv =*/ true,
/*.flash_attn =*/ false,
/*.no_perf =*/ true,
@@ -390,7 +390,7 @@ index 65135172..afe6f552 100644
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -2371,6 +2374,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
@@ -2375,6 +2378,10 @@ void llama_set_warmup(llama_context * ctx, bool warmup) {
ctx->set_warmup(warmup);
}

@@ -426,7 +426,7 @@ index 30e550f0..85ad91b9 100644

enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index cd955d63..83f3c5a8 100644
index a85e9728..d740c120 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -546,6 +546,12 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
@@ -442,7 +442,7 @@ index cd955d63..83f3c5a8 100644
//
// llm_graph_context
//
@@ -1495,6 +1501,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
@@ -1506,6 +1512,25 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
return (llm_graph_input_attn_cross *) res->add_input(std::move(inp));
}

@@ -469,7 +469,7 @@ index cd955d63..83f3c5a8 100644
llm_graph_input_attn_cross * inp,
ggml_cgraph * gf,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5b6618f9..51993998 100644
index d192dc14..260a2af2 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -86,6 +86,7 @@ public:
@@ -518,7 +518,7 @@ index 8a667960..6a02de03 100644
+ return std::find(cross_attn_layers.begin(), cross_attn_layers.end(), il) != cross_attn_layers.end();
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c3147cbc..4567a0e9 100644
index 6e278945..c8a34d52 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -2,6 +2,8 @@
@@ -536,9 +536,9 @@ index c3147cbc..4567a0e9 100644
uint32_t n_rel_attn_bkts = 0;
+ uint32_t n_vocab = 0;

// for WavTokenizer
struct llama_hparams_posnet posnet;
@@ -52,6 +55,7 @@ struct llama_hparams {
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
@@ -56,6 +59,7 @@ struct llama_hparams {
std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;

std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr = {};
@@ -546,7 +546,7 @@ index c3147cbc..4567a0e9 100644

uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
@@ -154,6 +158,9 @@ struct llama_hparams {
@@ -158,6 +162,9 @@ struct llama_hparams {
// Block skip connection
bool n_bskcn(uint32_t n, uint32_t il) const;

@@ -557,7 +557,7 @@ index c3147cbc..4567a0e9 100644
};

diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index dbf5f118..9310f262 100644
index 7c9d46d8..69f8d35a 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -95,8 +95,16 @@ bool llama_kv_cache_unified::init(
@@ -593,7 +593,7 @@ index a012aeae..2e11507d 100644
bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
const int kid = gguf_find_key(meta.get(), key.c_str());
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5fbd0055..d5ad466e 100644
index aba42819..d051696c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -419,6 +419,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -650,7 +650,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1548,7 +1562,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1550,7 +1564,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const int64_t n_embd_head_v = hparams.n_embd_head_v;
const int64_t n_ff = hparams.n_ff();
const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -659,7 +659,7 @@ index 5fbd0055..d5ad466e 100644
const int64_t n_token_types = vocab.n_token_types();
const int64_t n_rot = hparams.n_rot;
const int64_t n_expert = hparams.n_expert;
@@ -1801,6 +1815,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
@@ -1803,6 +1817,52 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
}
} break;
@@ -712,7 +712,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -4665,6 +4725,246 @@ struct llm_build_llama : public llm_graph_context {
@@ -4683,6 +4743,246 @@ struct llm_build_llama : public llm_graph_context {
}
};

@@ -893,14 +893,14 @@ index 5fbd0055..d5ad466e 100644
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
@@ -959,7 +959,7 @@ index 5fbd0055..d5ad466e 100644
struct llm_build_deci : public llm_graph_context {
llm_build_deci(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -12965,6 +13265,10 @@ llm_graph_result_ptr llama_model::build_graph(
@@ -13017,6 +13317,10 @@ llm_graph_result_ptr llama_model::build_graph(
{
llm = std::make_unique<llm_build_llama>(*this, params, gf);
} break;
@@ -970,7 +970,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_DECI:
{
llm = std::make_unique<llm_build_deci>(*this, params, gf);
@@ -13325,6 +13629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13377,6 +13681,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
// use what we call a normal RoPE, operating on pairs of consecutive head values
case LLM_ARCH_LLAMA:
case LLM_ARCH_LLAMA4:
@@ -979,7 +979,7 @@ index 5fbd0055..d5ad466e 100644
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
diff --git a/src/llama-model.h b/src/llama-model.h
index e08d4ae4..21c4617b 100644
index 5865d5e9..72bab5be 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -11,6 +11,7 @@
@@ -998,7 +998,7 @@ index e08d4ae4..21c4617b 100644
LLM_TYPE_236B,
LLM_TYPE_314B,
LLM_TYPE_671B,
@@ -308,6 +310,16 @@ struct llama_layer {
@@ -310,6 +312,16 @@ struct llama_layer {

struct ggml_tensor * bskcn_tv = nullptr;
@@ -1,25 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Hiltgen <daniel@ollama.com>
Date: Wed, 9 Oct 2024 17:26:23 -0700
Subject: [PATCH] conditional-fattn

---
ggml/src/ggml-cuda/ggml-cuda.cu | 2 ++
1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 59a49560..b70c6a32 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2338,9 +2338,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
+#if !defined(GGML_DISABLE_FLASH_ATTN)
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
+#endif
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
@@ -147,10 +147,10 @@ index 410a3720..3eca1cf8 100644
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b70c6a32..67208cba 100644
index 31750b6f..0fef9522 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2245,6 +2245,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
@@ -2246,6 +2246,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_PAD:
ggml_cuda_op_pad(ctx, dst);
break;
@@ -160,7 +160,7 @@ index b70c6a32..67208cba 100644
case GGML_OP_ARANGE:
ggml_cuda_op_arange(ctx, dst);
break;
@@ -3223,6 +3226,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
@@ -3222,6 +3225,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_PAD:
@@ -233,7 +233,7 @@ index 8fd386b0..e2ededc3 100644
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 310afe8a..b121ab9e 100644
index 12886cd3..b2e95a66 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -341,6 +341,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
@@ -244,7 +244,7 @@ index 310afe8a..b121ab9e 100644
GGML_METAL_KERNEL_TYPE_ARANGE_F32,
GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32,
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
@@ -998,6 +999,7 @@ @implementation GGMLMetalClass
@@ -1020,6 +1021,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_REFLECT_1D_F32, pad_reflect_1d_f32, true);
@@ -252,7 +252,7 @@ index 310afe8a..b121ab9e 100644
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
@@ -1339,6 +1341,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
@@ -1384,6 +1386,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_OP_POOL_2D:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
@@ -260,7 +260,7 @@ index 310afe8a..b121ab9e 100644
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_ARGSORT:
case GGML_OP_LEAKY_RELU:
@@ -3669,6 +3672,36 @@ static void ggml_metal_encode_node(
@@ -3731,6 +3734,36 @@ static void ggml_metal_encode_node(

const int nth = MIN(1024, ne0);

@@ -298,10 +298,10 @@ index 310afe8a..b121ab9e 100644
} break;
case GGML_OP_ARANGE:
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index b08666e2..e3185e5b 100644
index 8d6e99e6..71f0f97f 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2968,6 +2968,51 @@ kernel void kernel_pad_reflect_1d_f32(
@@ -2975,6 +2975,51 @@ kernel void kernel_pad_reflect_1d_f32(
}
}

@@ -12,7 +12,7 @@ regex
2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 0125ee53..d74919d2 100644
index a35b498c..032019c9 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -296,7 +296,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
@@ -22,10 +22,10 @@ multiple batches of processing until everything is complete.
4 files changed, 51 insertions(+), 106 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index afe6f552..d6e7b3af 100644
index 0343ba8a..4b3e6a83 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -590,13 +590,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
@@ -594,13 +594,12 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(

llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_context * ctx0,
@@ -41,7 +41,7 @@ index afe6f552..d6e7b3af 100644
#if 0
// CPU defrag
//
@@ -668,32 +667,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -672,32 +671,20 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
}
#else
@@ -79,7 +79,7 @@ index afe6f552..d6e7b3af 100644

ggml_tensor * view_v_src;
ggml_tensor * view_v_dst;
@@ -701,34 +688,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -705,34 +692,30 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
if (cparams.flash_attn) {
// NOTE: the V cache is not transposed when using flash attention
view_v_src = ggml_view_2d(ctx0, kv_self->v_l[il],
@@ -122,7 +122,7 @@ index afe6f552..d6e7b3af 100644
#endif

return res;
@@ -737,8 +720,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
@@ -741,8 +724,6 @@ llm_graph_result_ptr llama_context::build_kv_self_defrag(
void llama_context::kv_self_update() {
auto & kv = kv_self;

@@ -131,7 +131,7 @@ index afe6f552..d6e7b3af 100644
if (kv->has_shift) {
if (!kv->get_can_shift()) {
GGML_ABORT("The current context does not support K-shift");
@@ -759,8 +740,6 @@ void llama_context::kv_self_update() {
@@ -763,8 +744,6 @@ void llama_context::kv_self_update() {
res->set_inputs(nullptr);

graph_compute(gf, false);
@@ -140,7 +140,7 @@ index afe6f552..d6e7b3af 100644
}

{
@@ -775,49 +754,28 @@ void llama_context::kv_self_update() {
@@ -779,49 +758,28 @@ void llama_context::kv_self_update() {
// defragment the KV cache if needed
if (kv->do_defrag) {
LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
@@ -202,7 +202,7 @@ index afe6f552..d6e7b3af 100644
}

enum llama_pooling_type llama_context::pooling_type() const {
@@ -1301,9 +1259,12 @@ int llama_context::decode(llama_batch & inp_batch) {
@@ -1305,9 +1263,12 @@ int llama_context::decode(llama_batch & inp_batch) {
// find KV slot
{
if (!kv_self->find_slot(ubatch)) {
@@ -241,7 +241,7 @@ index baa03276..a59ff8fd 100644
// TODO: read/write lora adapters and cvec
size_t state_write_data(llama_io_write_i & io);
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 9310f262..5c941e7c 100644
index 69f8d35a..35a750d3 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -781,17 +781,7 @@ bool llama_kv_cache_unified::defrag_prepare(int32_t n_max_nodes) {
@@ -53,7 +53,7 @@ index 381a9c7d..e45b453d 100644
}

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d74919d2..c90f636c 100644
index 032019c9..ba37df35 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1459,7 +1459,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
@@ -13,7 +13,7 @@ models not supported in llama.cpp
4 files changed, 24 insertions(+)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c1f78618..bdf3d898 100644
index 0568565f..dd01df60 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -73,6 +73,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -24,7 +24,7 @@ index c1f78618..bdf3d898 100644
{ LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -1582,6 +1583,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
@@ -1586,6 +1587,22 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
},
},
@@ -48,7 +48,7 @@ index c1f78618..bdf3d898 100644
LLM_ARCH_UNKNOWN,
{
diff --git a/src/llama-arch.h b/src/llama-arch.h
index f987844d..ee081fbf 100644
index 6a989034..b6227eeb 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -75,6 +75,7 @@ enum llm_arch {
@@ -60,10 +60,10 @@ index f987844d..ee081fbf 100644
LLM_ARCH_BAILINGMOE,
LLM_ARCH_UNKNOWN,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d5ad466e..cd1d239c 100644
index d051696c..c8374159 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1423,6 +1423,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
@@ -1425,6 +1425,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
@@ -71,7 +71,7 @@ index d5ad466e..cd1d239c 100644
default: throw std::runtime_error("unsupported model architecture");
}

@@ -13652,6 +13653,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
@@ -13704,6 +13705,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_CHAMELEON:
case LLM_ARCH_SOLAR:
case LLM_ARCH_BAILINGMOE:
@@ -1,76 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:41:24 -0700
Subject: [PATCH] add op_neg

adds the neg operator to ggml
---
ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++
ggml/src/ggml-metal/ggml-metal.metal | 7 +++++++
2 files changed, 22 insertions(+)
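At the user level the new kernel is reached through ggml's existing unary op. A minimal sketch, assuming the standard ggml C API of this vintage (ggml_init, ggml_new_tensor_1d, ggml_neg, ggml_new_graph, ggml_build_forward_expand, with the CPU compute entry point living in ggml-cpu.h in recent trees); the Metal backend picks up GGML_UNARY_OP_NEG through the additions below:

#include "ggml.h"
#include "ggml-cpu.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // x = [1, 2, 3, 4]
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    for (int i = 0; i < 4; ++i) {
        ((float *) x->data)[i] = (float)(i + 1);
    }

    // y = -x, the unary op this patch wires into the Metal backend
    struct ggml_tensor * y = ggml_neg(ctx, x);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, y);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1); // CPU reference path

    // y->data now holds [-1, -2, -3, -4]
    ggml_free(ctx);
    return 0;
}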

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index b121ab9e..fea50521 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -461,6 +461,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
GGML_METAL_KERNEL_TYPE_SQRT,
GGML_METAL_KERNEL_TYPE_SIN,
GGML_METAL_KERNEL_TYPE_COS,
+ GGML_METAL_KERNEL_TYPE_NEG,
GGML_METAL_KERNEL_TYPE_SUM_ROWS,
GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
@@ -1119,6 +1120,7 @@ @implementation GGMLMetalClass
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true);
+ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
@@ -1280,6 +1282,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
case GGML_UNARY_OP_GELU_QUICK:
case GGML_UNARY_OP_SILU:
case GGML_UNARY_OP_ELU:
+ case GGML_UNARY_OP_NEG:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -1966,6 +1969,18 @@ static void ggml_metal_encode_node(

[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
+ case GGML_UNARY_OP_NEG:
+ {
+ id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = ggml_nelements(dst);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
default:
{
GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op));
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index e3185e5b..ede9d1e6 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -949,6 +949,13 @@ kernel void kernel_cos(
dst[tpig] = cos(src0[tpig]);
}

+kernel void kernel_neg(
+ device const float * src0,
+ device float * dst,
+ uint tpig[[thread_position_in_grid]]) {
+ dst[tpig] = -src0[tpig];
+}
+
kernel void kernel_sum_rows(
device const float * src0,
device float * dst,
@@ -1,39 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Tue, 8 Apr 2025 20:49:50 -0700
Subject: [PATCH] fix compiler error in clip.h

fixes an error that occurs in clip.h when compiling
using CGo
---
examples/llava/clip.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
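The root cause: C, unlike C++, does not treat struct and enum tags as standalone type names, and CGo compiles this header as C. A minimal reproduction sketch (the enum body is a stand-in, not ggml's real definition):

#include <stdbool.h>

enum ggml_log_level { GGML_LOG_LEVEL_INFO = 2 }; // stand-in definition

struct clip_context_params {
    bool use_gpu;
    // "ggml_log_level verbosity;" compiles as C++ but a C compiler rejects it:
    //   error: unknown type name 'ggml_log_level'
    enum ggml_log_level verbosity; // the patched spelling works in both C and C++
};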

diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index cc133a58..5fc45d3e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,12 +30,13 @@ struct clip_image_size {
int height;
};

+struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;

struct clip_context_params {
bool use_gpu;
- ggml_log_level verbosity;
+ enum ggml_log_level verbosity;
};

// deprecated, use clip_init
@@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
+CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data

/**
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
@@ -1,600 +0,0 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 12 Apr 2025 13:06:57 -0700
Subject: [PATCH] Revert "Simplify and improve CUDA graphs through use of
indirect copy pointers (#9017)"

this commit in llama.cpp causes errors when running llama 3.2
vision - temporarily revert it

This reverts commit 3f9da22c2b21a2cef216de50006436ef1cab8764.
---
ggml/src/ggml-cuda/common.cuh | 8 +-
ggml/src/ggml-cuda/cpy.cu | 149 ++++++++++++--------------------
ggml/src/ggml-cuda/cpy.cuh | 2 -
ggml/src/ggml-cuda/ggml-cuda.cu | 93 +++++++++++++++-----
4 files changed, 124 insertions(+), 128 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 8284a001..a718b6a1 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -729,13 +729,7 @@ struct ggml_cuda_graph {
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
- bool use_cpy_indirection = false;
- std::vector<char *> cpy_dest_ptrs;
- char ** dest_ptrs_d;
- int dest_ptrs_size = 0;
- // Index to allow each cpy kernel to be aware of it's position within the graph
- // relative to other cpy nodes.
- int graph_cpynode_index = -1;
+ std::vector<char **> updated_kernel_arg;
#endif
};
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 4f4faa3e..8396df28 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -39,18 +39,16 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
}

template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
// determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
// then combine those indices with the corresponding byte offsets to get the total offsets
const int64_t i03 = i/(ne00 * ne01 * ne02);
@@ -297,18 +295,16 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
}

template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -325,18 +321,16 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int
}

template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
- const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
+ const int nb12, const int nb13) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

if (i >= ne) {
return;
}

- char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
-
const int i03 = i/(ne00 * ne01 * ne02);
const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
@@ -352,97 +346,76 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
cpy_blck(cx + x_offset, cdst + dst_offset);
}
-// Copy destination pointers to GPU to be available when pointer indirection is in use
|
||||
-
|
||||
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
|
||||
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
|
||||
- if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
|
||||
- CUDA_CHECK(cudaStreamSynchronize(stream));
|
||||
- if (cuda_graph->dest_ptrs_d != nullptr) {
|
||||
- CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
|
||||
- }
|
||||
- CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
|
||||
- cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
|
||||
- }
|
||||
- // copy destination pointers to GPU
|
||||
- CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
|
||||
- cuda_graph->graph_cpynode_index = 0; // reset index
|
||||
-#else
|
||||
- GGML_UNUSED(cuda_graph); GGML_UNUSED(host_dest_ptrs);
|
||||
- GGML_UNUSED(host_dest_ptrs_size); GGML_UNUSED(stream);
|
||||
-#endif
|
||||
-}
|
||||
-
|
||||
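
Note: ggml_cuda_cpy_dest_ptrs_copy, deleted above, was the host half of that indirection: it grew a device-side pointer array on demand (synchronizing the stream before cudaFree/cudaMalloc) and then uploaded the fresh destination pointers with cudaMemcpyAsync before every replay. Roughly how it was driven once per token, with collect_cpy_dst_pointers a hypothetical stand-in for the gathering done in check_node_graph_compatibility_and_refresh_copy_ops:

// hypothetical per-token sequence under the old indirection scheme
std::vector<char *> dst_ptrs = collect_cpy_dst_pointers(cgraph);                  // hypothetical helper
ggml_cuda_cpy_dest_ptrs_copy(cuda_graph, dst_ptrs.data(), (int) dst_ptrs.size(), stream);
CUDA_CHECK(cudaGraphLaunch(instance, stream));                                    // kernels read the fresh table
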
static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_bf16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q8_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_0_f32_cuda(
@@ -451,22 +424,22 @@ static void ggml_cpy_q4_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q4_1_f32_cuda(
@@ -475,22 +448,22 @@ static void ggml_cpy_q4_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_0_f32_cuda(
@@ -499,22 +472,22 @@ static void ggml_cpy_q5_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_q5_1_f32_cuda(
@@ -523,32 +496,32 @@ static void ggml_cpy_q5_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    cudaStream_t stream) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
@@ -585,62 +558,48 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

-    char ** dest_ptrs_d = nullptr;
-    int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
-        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
-        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
-    }
-#endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_bf16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    if(ctx.cuda_graph->use_cpy_indirection) {
-        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
-    }
-#endif
-
}

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
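
Note: with indirection gone from cpy.cu, every ggml_cpy_*_cuda wrapper bakes the real src1->data pointer into the kernel launch. The consequence, handled by the ggml-cuda.cu changes below, is that an instantiated CUDA graph keeps replaying the pointer value captured at build time until the kernel node's parameters are rewritten. A sketch of the patching primitive involved (function and argument names hypothetical; CUDA_CHECK as defined in this backend):

// sketch: refresh the cdst argument of a captured copy-kernel node
static void patch_cpy_dst(cudaGraphNode_t node, void * new_dst) {
    cudaKernelNodeParams p;
    CUDA_CHECK(cudaGraphKernelNodeGetParams(node, &p)); // params recorded at capture
    *(void **) p.kernelParams[1] = new_dst;             // argument 1 is cdst in these kernels
    CUDA_CHECK(cudaGraphKernelNodeSetParams(node, &p)); // commit the refreshed argument
}
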
diff --git a/ggml/src/ggml-cuda/cpy.cuh b/ggml/src/ggml-cuda/cpy.cuh
index 6bed0564..28b06cdd 100644
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -7,5 +7,3 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
-
-void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 67208cba..a44788db 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2477,11 +2477,10 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {

#ifdef USE_CUDA_GRAPH
static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool use_cuda_graph) {
+    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {

    // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
-
+    cuda_ctx->cuda_graph->updated_kernel_arg.clear();
    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

@@ -2513,11 +2512,8 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
        }

        if (node->op == GGML_OP_CPY) {
-
-            // Store the pointers which are updated for each token, such that these can be sent
-            // to the device and accessed using indirection from CUDA graph
-            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->src[1]->data);
-
+            // store the copy op parameter which changes with each token.
+            cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
            // store a pointer to each copy op CUDA kernel to identify it later
            void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
            if (!ptr) {
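
Note: updated_kernel_arg stores (char **) &(node->src[1]->data), i.e. the address of the tensor's data field rather than its current value, so dereferencing it at launch time always yields the destination for the token being processed. A minimal illustration of the double-pointer trick, with a hypothetical stand-in struct:

struct tensor_like { void * data; };     // hypothetical stand-in for ggml_tensor

static void double_pointer_demo() {
    static char buf_a[16], buf_b[16];
    tensor_like t;
    t.data = buf_a;                      // value at graph-capture time
    char ** slot = (char **) &t.data;    // what updated_kernel_arg keeps
    t.data = buf_b;                      // the allocator re-points the tensor next token
    char * fresh = *slot;                // == buf_b: the dereference sees the update
    (void) fresh;
}
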
@@ -2525,6 +2521,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#ifndef NDEBUG
                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
#endif
+            } else {
+                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
+                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
+                }
            }
        }

@@ -2533,12 +2533,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
        }
    }

-    if (use_cuda_graph) {
-        cuda_ctx->cuda_graph->use_cpy_indirection = true;
-        // copy pointers to GPU so they can be accessed via indirection within CUDA graph
-        ggml_cuda_cpy_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
-    }
-
    return use_cuda_graph;
}

@@ -2593,6 +2587,51 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
    return true;
}

+static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool cuda_graph_update_required) {
+
+    if (cuda_graph_update_required) {
+        // Extract nodes from graph
+        // First call with null argument gets number of nodes in graph
+        CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
+        // Subsequent call with non-null argument gets nodes
+        cuda_ctx->cuda_graph->nodes.clear();
+        cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
+        cuda_ctx->cuda_graph->params.clear();
+        cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
+        if (cuda_ctx->cuda_graph->num_nodes > 0) {
+            CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
+
+            // Loop over nodes, and extract kernel parameters from each node
+            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+                cudaGraphNodeType node_type;
+                CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
+                if (node_type == cudaGraphNodeTypeKernel) {
+                    cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
+                    if (stat == cudaErrorInvalidDeviceFunction) {
+                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
+                        // We don't need to update blas nodes, so clear error and move on.
+                        (void)cudaGetLastError();
+                    } else {
+                        GGML_ASSERT(stat == cudaSuccess);
+                    }
+                }
+            }
+        }
+    } else {
+        // One of the arguments to the copy kernel is updated for each token, hence we need to
+        // replace that argument with the updated value in the CUDA graph
+        // on update steps, the live parameters will already be captured
+        int k = 0;
+        for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
+            if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
+                char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
+                *(void**)cuda_ctx->cuda_graph->params[i].kernelParams[1] = *(void**)updated_kernel_arg_ptr;
+                CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
+            }
+        }
+    }
+}
+
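
Note: maintain_cuda_graph identifies copy nodes by comparing cudaKernelNodeParams::func, filled in by cudaGraphKernelNodeGetParams, against the kernel handles collected earlier via ggml_cuda_cpy_fn; only matching nodes get kernelParams[1] (the cdst argument) overwritten and committed. The unqualified count(...) above still resolves to std::count through argument-dependent lookup on the std::vector iterators. The membership test reduces to this sketch (hypothetical helper over the same data):

// sketch: is this captured node one of the known copy kernels?
static bool is_cpy_kernel_node(const std::vector<void *> & fn_ptrs,
                               const cudaKernelNodeParams & params) {
    return std::find(fn_ptrs.begin(), fn_ptrs.end(), params.func) != fn_ptrs.end();
}
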
static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) {

    bool cuda_graph_update_required = false;
@@ -2652,7 +2691,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
#endif

static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
+    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
+    bool & cuda_graph_update_required) {

    while (!graph_evaluated_or_captured) {
        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2702,9 +2742,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
        }
-        if (cuda_graph_update_required) { // Update graph executable
-            update_cuda_graph_executable(cuda_ctx);
-        }
+
+        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
+        maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
+
+        // Update graph executable
+        update_cuda_graph_executable(cuda_ctx);
+
        // Launch graph
        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
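
Note: update_cuda_graph_executable now runs for every token rather than only when cuda_graph_update_required is set, because maintain_cuda_graph rewrites kernel parameters in the captured graph each token and the instantiated executable must be re-synced before launch. The per-token sequence above therefore reduces to:

// per-token flow after this change (calls as in the hunk above)
maintain_cuda_graph(cuda_ctx, ggml_cuda_cpy_fn_ptrs, cuda_graph_update_required);
update_cuda_graph_executable(cuda_ctx);
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
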
@@ -2718,6 +2762,10 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

    ggml_cuda_set_device(cuda_ctx->device);

+    // vector of pointers to CUDA cpy kernels, which are required to identify
+    // kernel parameters which need updated in the graph for each token
+    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
+
#ifdef USE_CUDA_GRAPH
    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

@@ -2751,7 +2799,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
    if (use_cuda_graph) {
        cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
+            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);

        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
        if (use_cuda_graph && cuda_graph_update_required) {
@@ -2772,10 +2821,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
    }

-    if (!use_cuda_graph) {
-        cuda_ctx->cuda_graph->use_cpy_indirection = false;
-    }
-
#else
    bool use_cuda_graph = false;
    bool cuda_graph_update_required = false;
@@ -2783,7 +2828,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,

    bool graph_evaluated_or_captured = false;

-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);

    return GGML_STATUS_SUCCESS;
}