llama: update vendored code to commit 40c6d79f (#7875)

2025-12-20 04:37:00 +00:00 · 2024-12-10 19:21:34 -08:00
parent a37f4a86a7
commit 527cc97899
289 changed files with 58552 additions and 41806 deletions
--- a/llama/ggml-cuda/mmq.cu
+++ b/llama/ggml-cuda/mmq.cu
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
+ * llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
 *
 * MIT License
 *
@@ -34,8 +34,6 @@ void ggml_cuda_op_mul_mat_q(

    const int64_t ne00 = src0->ne[0];

-    const int64_t nb01 = src0->nb[1];
-
    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);
@@ -43,7 +41,7 @@ void ggml_cuda_op_mul_mat_q(
    const int64_t ne0 = dst->ne[0];

    const int64_t row_diff = row_high - row_low;
-    const int64_t stride00 = nb01 / ggml_type_size(src0->type);
+    const int64_t stride00 = ne00 / ggml_blck_size(src0->type);

    int id = ggml_cuda_get_device();
    const int compute_capability = ggml_cuda_info().devices[id].cc;
@@ -176,5 +174,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
        return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
    }

-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }