From 2be9575694c8b99c8e88291a767950fc4ca3dfbd Mon Sep 17 00:00:00 2001 From: Shang Chieh Tseng Date: Fri, 8 Aug 2025 15:15:49 +0800 Subject: [PATCH] Fix BF16 compatibility for Tesla K80 (Compute Capability 3.7) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add runtime check for BF16 support, which requires Compute Capability 8.0+. Tesla K80 and other CC 3.7 GPUs will fall back to FP16/FP32 operations. This ensures the upstream BF16 optimizations work on newer GPUs while maintaining compatibility with legacy hardware. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com> --- ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu index 080e7467..812da076 100644 --- a/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1205,7 +1205,11 @@ static void ggml_cuda_op_mul_mat_cublas( const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT && src0->type != GGML_TYPE_MXFP4; - if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + // BF16 requires compute capability 8.0 (Ampere) or higher for CUDA_R_16BF support + // For older GPUs like Tesla K80 (cc 3.7), we need to fall back to FP16 or FP32 + const bool bf16_supported = GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE; + + if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && bf16_supported) { ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id)); if (src1->type != GGML_TYPE_BF16) { const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type);