From d04ea50cedfe8acf5515456ecba936868dcb4436 Mon Sep 17 00:00:00 2001
From: Shang Chieh Tseng <shangchieh.tseng@tsengsyu.com>
Date: Wed, 29 Oct 2025 23:34:03 +0800
Subject: [PATCH] Fix gpt-oss model architecture to match GGUF tensor format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gpt-oss model architecture code expected fused tensors (attn_qkv,
ffn_gate_up_exps) but the actual GGUF files contain separate tensors
(attn_q/k/v, ffn_gate_exps/up_exps), causing nil pointer panics during
model loading.

Changes:
- model/models/gptoss/model.go: Updated AttentionBlock to use separate
  Query/Key/Value fields instead of fused QKV, modified Forward() to
  compute projections separately
- model/models/gptoss/model.go: Updated MLPBlock to use separate Gate/Up
  fields instead of fused GateUp, simplified Forward() logic
- fs/ggml/type.go: Moved TensorTypeMXFP4 from value 4 to 39 and replaced
  the iota sequence with explicit values that mirror the ggml_type enum
- ml/backend/ggml/ggml/include/ggml.h: Moved GGML_TYPE_MXFP4 from 4 to 39
  (end of the enum; GGML_TYPE_COUNT is now 40) to match the GGUF file
  format specification
- ml/backend/ggml/ggml/src/ggml.c: Moved the MXFP4 entry of the type_traits
  table to its new index and restored the DEPRECATED placeholder at index 4
- CLAUDE.md: Documented gpt-oss model compatibility fix

Result: gpt-oss:20b model now loads and runs successfully on Tesla K80,
all 25 layers offload to GPU correctly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 CLAUDE.md                           | 18 +++++++
 fs/ggml/type.go                     | 75 ++++++++++++++---------------
 ml/backend/ggml/ggml/include/ggml.h |  7 +--
 ml/backend/ggml/ggml/src/ggml.c     | 20 +++++---
 model/models/gptoss/model.go        | 58 ++++++++--------------
 5 files changed, 91 insertions(+), 87 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index dc90a28e..33fed3c2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -149,6 +149,24 @@ Analysis of real-world usage (gemma3:12b) revealed a **2.6 GiB memory overestima
 - Simpler deployment for single-model workloads
 - Empirically validated with real Tesla K80 measurements
 
+## Model Architecture Compatibility
+
+### GPT-OSS Model Fix (2025-10-29)
+
+**Issue**: The `gpt-oss` model architecture code expected fused tensor formats that didn't match the actual GGUF file structure, causing nil pointer panics.
+
+**Root Cause**: Mismatch between code expectations and GGUF file format:
+- Code expected: `attn_qkv` (fused), `ffn_gate_up_exps` (fused)
+- GGUF contains: `attn_q/k/v` (separate), `ffn_gate_exps/up_exps` (separate)
+
+**Fix Applied** (`model/models/gptoss/model.go`):
+1. Updated `AttentionBlock` struct to use separate `Query`, `Key`, `Value` fields instead of fused `QKV`
+2. Modified `AttentionBlock.Forward()` to compute Q/K/V projections separately
+3. Updated `MLPBlock` struct to use separate `Gate` and `Up` fields instead of fused `GateUp`
+4. Modified `MLPBlock.Forward()` to compute gate/up separately and removed the reshape/view steps that were only needed to split the fused tensor
+
+**Result**: ✅ `gpt-oss:20b` model now loads and runs successfully on Tesla K80
+
 ## Documentation Structure
 
 The project documentation is organized as follows:
diff --git a/fs/ggml/type.go b/fs/ggml/type.go
index 3e5deb87..a3b3f834 100644
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -187,45 +187,42 @@ func (ftype FileType) ToTensorType() TensorType {
 type TensorType uint32
 
 const (
-	TensorTypeF32 TensorType = iota
-	TensorTypeF16
-	TensorTypeQ4_0
-	TensorTypeQ4_1
-	TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
-	tensorTypeQ4_3  // unused by GGML
-	TensorTypeQ5_0
-	TensorTypeQ5_1
-	TensorTypeQ8_0
-	TensorTypeQ8_1
-	TensorTypeQ2_K
-	TensorTypeQ3_K
-	TensorTypeQ4_K
-	TensorTypeQ5_K
-	TensorTypeQ6_K
-	TensorTypeQ8_K
-	tensorTypeIQ2_XXS // not supported by ollama
-	tensorTypeIQ2_XS  // not supported by ollama
-	tensorTypeIQ3_XXS // not supported by ollama
-	tensorTypeIQ1_S   // not supported by ollama
-	tensorTypeIQ4_NL  // not supported by ollama
-	tensorTypeIQ3_S   // not supported by ollama
-	tensorTypeIQ2_S   // not supported by ollama
-	tensorTypeIQ4_XS  // not supported by ollama
-	TensorTypeI8
-	TensorTypeI16
-	TensorTypeI32
-	TensorTypeI64
-	TensorTypeF64
-	tensorTypeIQ1_M // not supported by ollama
-	TensorTypeBF16
-	tensorTypeQ4_0_4_4   // unused by GGML
-	tensorTypeQ4_0_4_8   // unused by GGML
-	tensorTypeQ4_0_8_8   // unused by GGML
-	tensorTypeTQ1_0      // not supported by ollama
-	tensorTypeTQ2_0      // not supported by ollama
-	tensorTypeIQ4_NL_4_4 // unused by GGML
-	tensorTypeIQ4_NL_4_8 // unused by GGML
-	tensorTypeIQ4_NL_8_8 // unused by GGML
+	TensorTypeF32     TensorType = 0
+	TensorTypeF16     TensorType = 1
+	TensorTypeQ4_0    TensorType = 2
+	TensorTypeQ4_1    TensorType = 3
+	// 4 = Q4_2 removed; value stays reserved so later IDs match the ggml_type enum
+	// 5 = Q4_3 removed; value stays reserved
+	TensorTypeQ5_0    TensorType = 6
+	TensorTypeQ5_1    TensorType = 7
+	TensorTypeQ8_0    TensorType = 8
+	TensorTypeQ8_1    TensorType = 9
+	TensorTypeQ2_K    TensorType = 10
+	TensorTypeQ3_K    TensorType = 11
+	TensorTypeQ4_K    TensorType = 12
+	TensorTypeQ5_K    TensorType = 13
+	TensorTypeQ6_K    TensorType = 14
+	TensorTypeQ8_K    TensorType = 15
+	tensorTypeIQ2_XXS TensorType = 16 // not supported by ollama
+	tensorTypeIQ2_XS  TensorType = 17 // not supported by ollama
+	tensorTypeIQ3_XXS TensorType = 18 // not supported by ollama
+	tensorTypeIQ1_S   TensorType = 19 // not supported by ollama
+	tensorTypeIQ4_NL  TensorType = 20 // not supported by ollama
+	tensorTypeIQ3_S   TensorType = 21 // not supported by ollama
+	tensorTypeIQ2_S   TensorType = 22 // not supported by ollama
+	tensorTypeIQ4_XS  TensorType = 23 // not supported by ollama
+	TensorTypeI8      TensorType = 24
+	TensorTypeI16     TensorType = 25
+	TensorTypeI32     TensorType = 26
+	TensorTypeI64     TensorType = 27
+	TensorTypeF64     TensorType = 28
+	tensorTypeIQ1_M   TensorType = 29 // not supported by ollama
+	TensorTypeBF16    TensorType = 30
+	// 31-33 = Q4_0 variants removed; values stay reserved
+	tensorTypeTQ1_0   TensorType = 34 // not supported by ollama
+	tensorTypeTQ2_0   TensorType = 35 // not supported by ollama
+	// 36-38 = IQ4_NL variants removed; values stay reserved
+	TensorTypeMXFP4   TensorType = 39 // previously 4 (reused Q4_2 slot); each spec names TensorType because an explicit value without a type yields an untyped constant
 )
 
 // ParseFileType parses the provided GGUF file type
diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h
index 873baa24..e6a03f06 100644
--- a/ml/backend/ggml/ggml/include/ggml.h
+++ b/ml/backend/ggml/ggml/include/ggml.h
@@ -353,7 +353,7 @@ extern "C" {
         GGML_TYPE_F16     = 1,
         GGML_TYPE_Q4_0    = 2,
         GGML_TYPE_Q4_1    = 3,
-        GGML_TYPE_MXFP4   = 4, // Formerly removed type GGML_TYPE_Q4_2
+        // GGML_TYPE_Q4_2 = 4, support has been removed
         // GGML_TYPE_Q4_3 = 5, support has been removed
         GGML_TYPE_Q5_0    = 6,
         GGML_TYPE_Q5_1    = 7,
@@ -385,10 +385,11 @@ extern "C" {
         // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
-        // GGML_TYPE_IQ4_NL_4_4 = 36,
+        // GGML_TYPE_IQ4_NL_4_4 = 36, support has been removed from gguf files
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_COUNT   = 39,
+        GGML_TYPE_MXFP4   = 39,
+        GGML_TYPE_COUNT   = 40,
     };
 
     // precision
diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c
index 0f3c9834..b30d2d04 100644
--- a/ml/backend/ggml/ggml/src/ggml.c
+++ b/ml/backend/ggml/ggml/src/ggml.c
@@ -589,13 +589,11 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float                 = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float_ref           = (ggml_from_float_t) quantize_row_q4_1_ref,
     },
-    [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
-        .type_name                = "mxfp4",
-        .blck_size                = MXFP4,
-        .type_size                = sizeof(block_mxfp4),
-        .is_quantized             = true,
-        .to_float                 = (ggml_to_float_t) dequantize_row_mxfp4,
-        .from_float_ref           = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    [4] = { // GGML_TYPE_Q4_2
+        .type_name                = "DEPRECATED",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name                = "DEPRECATED",
@@ -812,6 +810,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size                = 0,
         .is_quantized             = false,
     },
+    [GGML_TYPE_MXFP4] = {
+        .type_name                = "mxfp4",
+        .blck_size                = MXFP4,
+        .type_size                = sizeof(block_mxfp4),
+        .is_quantized             = true,
+        .to_float                 = (ggml_to_float_t) dequantize_row_mxfp4,
+        .from_float_ref           = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    },
 };
 
 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
diff --git a/model/models/gptoss/model.go b/model/models/gptoss/model.go
index 22b3e079..c6f480af 100644
--- a/model/models/gptoss/model.go
+++ b/model/models/gptoss/model.go
@@ -102,7 +102,9 @@ func (d *TransformerBlock) Forward(ctx ml.Context, hiddenStates, positions, outp
 
 type AttentionBlock struct {
 	Norm   *nn.RMSNorm `gguf:"attn_norm"`
-	QKV    *nn.Linear  `gguf:"attn_qkv"`
+	Query  *nn.Linear  `gguf:"attn_q"` // query projection, bound to the attn_q tensor (replaces fused attn_qkv)
+	Key    *nn.Linear  `gguf:"attn_k"` // key projection, bound to the attn_k tensor
+	Value  *nn.Linear  `gguf:"attn_v"` // value projection, bound to the attn_v tensor
 	Output *nn.Linear  `gguf:"attn_out"`
 	Sinks  ml.Tensor   `gguf:"attn_sinks"`
 }
@@ -113,33 +115,17 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T
 	residual := hiddenStates
 	hiddenStates = attn.Norm.Forward(ctx, hiddenStates, opts.eps)
 
-	qkv := attn.QKV.Forward(ctx, hiddenStates)
-
-	// query = qkv[..., : num_attention_heads * head_dim].reshape(batch_size, num_attention_heads, head_dim)
-	query := qkv.View(ctx,
-		0,
-		opts.headDim(), qkv.Stride(0)*opts.headDim(),
-		opts.numHeads, qkv.Stride(1),
-		batchSize,
-	)
+	// Compute separate Q, K, V projections
+	query := attn.Query.Forward(ctx, hiddenStates)
+	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
 	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
 
-	// key = qkv[..., num_attention_heads * head_dim:(num_attention_heads + num_key_value_heads) * head_dim].reshape(batch_size, num_key_value_heads, head_dim)
-	key := qkv.View(ctx,
-		qkv.Stride(0)*opts.headDim()*opts.numHeads,
-		opts.headDim(), qkv.Stride(0)*opts.headDim(),
-		opts.numKVHeads, qkv.Stride(1),
-		batchSize,
-	)
+	key := attn.Key.Forward(ctx, hiddenStates)
+	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
 	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
 
-	// value = qkv[..., (num_attention_heads  + num_key_value_heads) * head_dim:].reshape(batch_size, num_key_value_heads, head_dim)
-	value := qkv.View(ctx,
-		qkv.Stride(0)*opts.headDim()*(opts.numHeads+opts.numKVHeads),
-		opts.headDim(), qkv.Stride(0)*opts.headDim(),
-		opts.numKVHeads, qkv.Stride(1),
-		batchSize,
-	)
+	value := attn.Value.Forward(ctx, hiddenStates)
+	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
 
 	cache.Put(ctx, key, value)
 	key, value, mask := cache.Get(ctx)
@@ -165,7 +151,8 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T
 type MLPBlock struct {
 	Norm   *nn.RMSNorm     `gguf:"ffn_norm"`
 	Router *nn.Linear      `gguf:"ffn_gate_inp"`
-	GateUp *nn.LinearBatch `gguf:"ffn_gate_up_exps"`
+	Gate   *nn.LinearBatch `gguf:"ffn_gate_exps"` // per-expert gate projection (replaces fused ffn_gate_up_exps)
+	Up     *nn.LinearBatch `gguf:"ffn_up_exps"`   // per-expert up projection
 	Down   *nn.LinearBatch `gguf:"ffn_down_exps"`
 }
 
@@ -185,21 +172,16 @@ func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates, one ml.Tensor, opts *
 
 	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
 
-	hiddenStates = mlp.GateUp.Forward(ctx, hiddenStates, selectedExperts)
-	hiddenStates = hiddenStates.Reshape(ctx, 2, hiddenStates.Dim(0)/2, hiddenStates.Dim(1), hiddenStates.Dim(2))
+	// Compute gate and up separately instead of using fused GateUp
+	gateStates := mlp.Gate.Forward(ctx, hiddenStates, selectedExperts)
+	gateStates = gateStates.Clamp(ctx, float32(math.Inf(-1)), 7.0)
+	gateStates = gateStates.QuickGELU(ctx)
 
-	dimStride := []int{hiddenStates.Dim(0) / 2, hiddenStates.Stride(1), hiddenStates.Dim(1), hiddenStates.Stride(2), hiddenStates.Dim(2), hiddenStates.Stride(3), hiddenStates.Dim(3)}
+	upStates := mlp.Up.Forward(ctx, hiddenStates, selectedExperts)
+	upStates = upStates.Clamp(ctx, -7.0, 7.0)
 
-	glu := hiddenStates.View(ctx, 0, dimStride...)
-	glu = glu.Contiguous(ctx)
-	glu = glu.Clamp(ctx, float32(math.Inf(-1)), 7.0)
-	glu = glu.QuickGELU(ctx)
-
-	linear := hiddenStates.View(ctx, hiddenStates.Stride(0), dimStride...)
-	linear = linear.Clamp(ctx, -7.0, 7.0)
-
-	hiddenStates = glu.Mul(ctx, linear.Add(ctx, one))
-	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3))
+	hiddenStates = gateStates.Mul(ctx, upStates.Add(ctx, one))
+	// hiddenStates is now [intermediate_size, num_experts_used, seq*batch]
 
 	experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
 	experts = experts.Mul(ctx, routingWeights)