* Enable CUDA Graphs for gemma3n. Similar to https://github.com/ggml-org/llama.cpp/pull/14741, though ollama has a slightly different model graph than llama.cpp, which requires different workaround checks.
* Remove the residual check by reshaping differently in the gemma3n model. This should make the heuristics more robust.
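For background, ggml-cuda speeds up decoding by capturing the model's kernel launches into a CUDA graph once and then replaying that graph on later evaluations; nodes whose launch dimensions can change between evaluations (such as batch-size-dependent ADDs) make the backend fall back to normal launches. Below is a minimal sketch of that capture/replay pattern using the standard CUDA runtime API; the kernel, sizes, and function name are illustrative and not taken from the patch.

// Minimal capture/replay sketch (illustrative, not part of the patch).
#include <cuda_runtime.h>

__global__ void scale(float *x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}

int run_with_graph(float *d_x, int n, cudaStream_t stream) {
    cudaGraph_t     graph;
    cudaGraphExec_t exec;

    // Record the kernel launches issued on the stream instead of running them.
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    scale<<<(n + 255) / 256, 256, 0, stream>>>(d_x, 0.5f, n);
    cudaStreamEndCapture(stream, &graph);

    // Instantiate once, then replay cheaply. If a later capture changes the
    // grid size of a kernel (e.g. because the batch size changed), the
    // executable graph must be updated or rebuilt, which is the cost the
    // batch-size heuristic in ggml-cuda tries to avoid.
    cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
    cudaGraphLaunch(exec, stream);
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(exec);
    cudaGraphDestroy(graph);
    return 0;
}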
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 22 Jul 2025 11:02:28 +0200
Subject: [PATCH] Enable CUDA Graphs for gemma3n.

Similar to
https://github.com/ggml-org/llama.cpp/pull/14741,
though ollama has a slightly different model graph
than llama.cpp which requires different workaround
checks.
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b9fabf4..28ccf4be 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2474,6 +2474,9 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
 
+    const std::string gemma3n_per_layer_proj_src1_name = " (reshaped)";
+    const std::string gemma3n_node_name = "node_";
+
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
 
@@ -2495,12 +2498,17 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-            // disable CUDA graphs for batch size > 1 for now.
-            // Changes in batch size or context size can cause changes to the grid size of some kernels.
+        // workarounds to exclude Gemma3n's `project_per_layer_input` operation from the batch-size heuristic, specific to ollama's implementation of gemma3n
+        // number of layers is different for per_layer_proj between gemma3n:2b and gemma3n:4b, which is why we don't check that value here
+        if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1 && !(node->ne[0] == 256
+            && node->ne[2] == 1
+            && node->ne[3] == 1
+            && node->src[0] ? std::string(node->src[0]->name).find(gemma3n_node_name) != std::string::npos : false
+            && node->src[1] ? node->src[1]->name == gemma3n_per_layer_proj_src1_name : false)) {
+            // Generally, changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_INFO("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }
 
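For context on the new condition: rather than keying on the per-layer projection's layer count (which differs between gemma3n:2b and gemma3n:4b), the patch fingerprints the `project_per_layer_input` ADD node by its output shape (ne[0] == 256, ne[2] == ne[3] == 1) and by the names of its two source tensors. Below is a rough standalone sketch of that shape-and-name matching; the struct, tensor names, and values are simplified stand-ins for ggml's types, illustrative only and not the patch itself.

// Illustrative stand-in for ggml_tensor, showing how the patch recognises the
// gemma3n per-layer-projection ADD node by shape plus source names.
#include <cstdint>
#include <cstdio>
#include <string>

struct fake_tensor {
    int64_t      ne[4];   // dimensions, as in ggml_tensor::ne
    const char  *name;    // tensor name, as in ggml_tensor::name
    fake_tensor *src[2];  // source tensors of the op
};

static bool is_gemma3n_per_layer_proj(const fake_tensor *node) {
    const std::string src1_name = " (reshaped)";  // exact src1 name the patch matches
    const std::string node_name = "node_";        // substring the patch looks for in src0's name

    return node->ne[0] == 256 && node->ne[2] == 1 && node->ne[3] == 1
        && node->src[0] && std::string(node->src[0]->name).find(node_name) != std::string::npos
        && node->src[1] && std::string(node->src[1]->name) == src1_name;
}

int main() {
    fake_tensor src0 = {{256, 5, 1, 1}, "node_42", {nullptr, nullptr}};
    fake_tensor src1 = {{256, 5, 1, 1}, " (reshaped)", {nullptr, nullptr}};
    fake_tensor add  = {{256, 5, 1, 1}, "per_layer_proj", {&src0, &src1}};

    // The plain batch-size heuristic would see src[1]->ne[1] > 1 and disable
    // CUDA graphs; matching this fingerprint lets the patch keep them enabled.
    printf("matches gemma3n per-layer proj: %d\n", is_gemma3n_per_layer_proj(&add));
    return 0;
}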