From d04ea50cedfe8acf5515456ecba936868dcb4436 Mon Sep 17 00:00:00 2001 From: Shang Chieh Tseng Date: Wed, 29 Oct 2025 23:34:03 +0800 Subject: [PATCH] Fix gpt-oss model architecture to match GGUF tensor format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gpt-oss model architecture code expected fused tensors (attn_qkv, ffn_gate_up_exps) but the actual GGUF files contain separate tensors (attn_q/k/v, ffn_gate_exps/up_exps), causing nil pointer panics during model loading. Changes: - model/models/gptoss/model.go: Updated AttentionBlock to use separate Query/Key/Value fields instead of fused QKV, modified Forward() to compute projections separately - model/models/gptoss/model.go: Updated MLPBlock to use separate Gate/Up fields instead of fused GateUp, simplified Forward() logic - fs/ggml/type.go: Reorganized MXFP4 tensor type constant ordering - ml/backend/ggml/ggml/include/ggml.h: Moved GGML_TYPE_MXFP4 to end of enum to match GGUF file format specification - ml/backend/ggml/ggml/src/ggml.c: Updated type name array to match reordered enum - CLAUDE.md: Documented gpt-oss model compatibility fix Result: gpt-oss:20b model now loads and runs successfully on Tesla K80, all 25 layers offload to GPU correctly. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- CLAUDE.md | 18 +++++++ fs/ggml/type.go | 75 ++++++++++++++--------------- ml/backend/ggml/ggml/include/ggml.h | 7 +-- ml/backend/ggml/ggml/src/ggml.c | 20 +++++--- model/models/gptoss/model.go | 58 ++++++++-------------- 5 files changed, 91 insertions(+), 87 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index dc90a28e..33fed3c2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -149,6 +149,24 @@ Analysis of real-world usage (gemma3:12b) revealed a **2.6 GiB memory overestima - Simpler deployment for single-model workloads - Empirically validated with real Tesla K80 measurements +## Model Architecture Compatibility + +### GPT-OSS Model Fix (2025-10-29) + +**Issue**: The `gpt-oss` model architecture code expected fused tensor formats that didn't match the actual GGUF file structure, causing nil pointer panics. + +**Root Cause**: Mismatch between code expectations and GGUF file format: +- Code expected: `attn_qkv` (fused), `ffn_gate_up_exps` (fused) +- GGUF contains: `attn_q/k/v` (separate), `ffn_gate_exps/up_exps` (separate) + +**Fix Applied** (`model/models/gptoss/model.go`): +1. Updated `AttentionBlock` struct to use separate `Query`, `Key`, `Value` fields instead of fused `QKV` +2. Modified `AttentionBlock.Forward()` to compute Q/K/V projections separately +3. Updated `MLPBlock` struct to use separate `Gate` and `Up` fields instead of fused `GateUp` +4. 
Modified `MLPBlock.Forward()` to compute gate/up separately and removed incorrect reshape + +**Result**: ✅ `gpt-oss:20b` model now loads and runs successfully on Tesla K80 + ## Documentation Structure The project documentation is organized as follows: diff --git a/fs/ggml/type.go b/fs/ggml/type.go index 3e5deb87..a3b3f834 100644 --- a/fs/ggml/type.go +++ b/fs/ggml/type.go @@ -187,45 +187,42 @@ func (ftype FileType) ToTensorType() TensorType { type TensorType uint32 const ( - TensorTypeF32 TensorType = iota - TensorTypeF16 - TensorTypeQ4_0 - TensorTypeQ4_1 - TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2 - tensorTypeQ4_3 // unused by GGML - TensorTypeQ5_0 - TensorTypeQ5_1 - TensorTypeQ8_0 - TensorTypeQ8_1 - TensorTypeQ2_K - TensorTypeQ3_K - TensorTypeQ4_K - TensorTypeQ5_K - TensorTypeQ6_K - TensorTypeQ8_K - tensorTypeIQ2_XXS // not supported by ollama - tensorTypeIQ2_XS // not supported by ollama - tensorTypeIQ3_XXS // not supported by ollama - tensorTypeIQ1_S // not supported by ollama - tensorTypeIQ4_NL // not supported by ollama - tensorTypeIQ3_S // not supported by ollama - tensorTypeIQ2_S // not supported by ollama - tensorTypeIQ4_XS // not supported by ollama - TensorTypeI8 - TensorTypeI16 - TensorTypeI32 - TensorTypeI64 - TensorTypeF64 - tensorTypeIQ1_M // not supported by ollama - TensorTypeBF16 - tensorTypeQ4_0_4_4 // unused by GGML - tensorTypeQ4_0_4_8 // unused by GGML - tensorTypeQ4_0_8_8 // unused by GGML - tensorTypeTQ1_0 // not supported by ollama - tensorTypeTQ2_0 // not supported by ollama - tensorTypeIQ4_NL_4_4 // unused by GGML - tensorTypeIQ4_NL_4_8 // unused by GGML - tensorTypeIQ4_NL_8_8 // unused by GGML + TensorTypeF32 TensorType = 0 // explicit values; every constant repeats the type so it stays a TensorType + TensorTypeF16 TensorType = 1 + TensorTypeQ4_0 TensorType = 2 + TensorTypeQ4_1 TensorType = 3 + // 4 = Q4_2 removed + // 5 = Q4_3 removed + TensorTypeQ5_0 TensorType = 6 + TensorTypeQ5_1 TensorType = 7 + TensorTypeQ8_0 TensorType = 8 + TensorTypeQ8_1 TensorType = 9 + TensorTypeQ2_K TensorType = 10 + TensorTypeQ3_K TensorType = 11 + TensorTypeQ4_K TensorType = 12 + TensorTypeQ5_K TensorType = 13 + TensorTypeQ6_K TensorType = 14 +
 TensorTypeQ8_K TensorType = 15 + tensorTypeIQ2_XXS TensorType = 16 // not supported by ollama + tensorTypeIQ2_XS TensorType = 17 // not supported by ollama + tensorTypeIQ3_XXS TensorType = 18 // not supported by ollama + tensorTypeIQ1_S TensorType = 19 // not supported by ollama + tensorTypeIQ4_NL TensorType = 20 // not supported by ollama + tensorTypeIQ3_S TensorType = 21 // not supported by ollama + tensorTypeIQ2_S TensorType = 22 // not supported by ollama + tensorTypeIQ4_XS TensorType = 23 // not supported by ollama + TensorTypeI8 TensorType = 24 + TensorTypeI16 TensorType = 25 + TensorTypeI32 TensorType = 26 + TensorTypeI64 TensorType = 27 + TensorTypeF64 TensorType = 28 + tensorTypeIQ1_M TensorType = 29 // not supported by ollama + TensorTypeBF16 TensorType = 30 + // 31-33 = Q4_0 variants removed + tensorTypeTQ1_0 TensorType = 34 // not supported by ollama + tensorTypeTQ2_0 TensorType = 35 // not supported by ollama + // 36-38 = IQ4_NL variants removed + TensorTypeMXFP4 TensorType = 39 ) // ParseFileType parses the provided GGUF file type diff --git a/ml/backend/ggml/ggml/include/ggml.h b/ml/backend/ggml/ggml/include/ggml.h index 873baa24..e6a03f06 100644 --- a/ml/backend/ggml/ggml/include/ggml.h +++ b/ml/backend/ggml/ggml/include/ggml.h @@ -353,7 +353,7 @@ extern "C" { GGML_TYPE_F16 = 1, GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_1 = 3, - GGML_TYPE_MXFP4 = 4, // Formerly removed type GGML_TYPE_Q4_2 + // GGML_TYPE_Q4_2 = 4, support has been removed // GGML_TYPE_Q4_3 = 5, support has been removed GGML_TYPE_Q5_0 = 6, GGML_TYPE_Q5_1 = 7, @@ -385,10 +385,11 @@ extern "C" { // GGML_TYPE_Q4_0_8_8 = 33, GGML_TYPE_TQ1_0 = 34, GGML_TYPE_TQ2_0 = 35, - // GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_4 = 36, support has been removed from gguf files // GGML_TYPE_IQ4_NL_4_8 = 37, // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT = 39, + GGML_TYPE_MXFP4 = 39, + GGML_TYPE_COUNT = 40, }; // precision diff --git a/ml/backend/ggml/ggml/src/ggml.c b/ml/backend/ggml/ggml/src/ggml.c index 0f3c9834..b30d2d04 100644 --- a/ml/backend/ggml/ggml/src/ggml.c +++ b/ml/backend/ggml/ggml/src/ggml.c @@ -589,13 +589,11 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = 
(ggml_to_float_t) dequantize_row_q4_1, .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref, }, - [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2 - .type_name = "mxfp4", - .blck_size = MXFP4, - .type_size = sizeof(block_mxfp4), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_mxfp4, - .from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, }, [5] = { // GGML_TYPE_Q4_3 .type_name = "DEPRECATED", @@ -812,6 +810,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .type_size = 0, .is_quantized = false, }, + [GGML_TYPE_MXFP4] = { + .type_name = "mxfp4", + .blck_size = MXFP4, + .type_size = sizeof(block_mxfp4), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_mxfp4, + .from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref, + }, }; const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { diff --git a/model/models/gptoss/model.go b/model/models/gptoss/model.go index 22b3e079..c6f480af 100644 --- a/model/models/gptoss/model.go +++ b/model/models/gptoss/model.go @@ -102,7 +102,9 @@ func (d *TransformerBlock) Forward(ctx ml.Context, hiddenStates, positions, outp type AttentionBlock struct { Norm *nn.RMSNorm `gguf:"attn_norm"` - QKV *nn.Linear `gguf:"attn_qkv"` + Query *nn.Linear `gguf:"attn_q"` + Key *nn.Linear `gguf:"attn_k"` + Value *nn.Linear `gguf:"attn_v"` Output *nn.Linear `gguf:"attn_out"` Sinks ml.Tensor `gguf:"attn_sinks"` } @@ -113,33 +115,17 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T residual := hiddenStates hiddenStates = attn.Norm.Forward(ctx, hiddenStates, opts.eps) - qkv := attn.QKV.Forward(ctx, hiddenStates) - - // query = qkv[..., : num_attention_heads * head_dim].reshape(batch_size, num_attention_heads, head_dim) - query := qkv.View(ctx, - 0, - opts.headDim(), 
qkv.Stride(0)*opts.headDim(), - opts.numHeads, qkv.Stride(1), - batchSize, - ) + // Compute separate Q, K, V projections + query := attn.Query.Forward(ctx, hiddenStates) + query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize) query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...) - // key = qkv[..., num_attention_heads * head_dim:(num_attention_heads + num_key_value_heads) * head_dim].reshape(batch_size, num_key_value_heads, head_dim) - key := qkv.View(ctx, - qkv.Stride(0)*opts.headDim()*opts.numHeads, - opts.headDim(), qkv.Stride(0)*opts.headDim(), - opts.numKVHeads, qkv.Stride(1), - batchSize, - ) + key := attn.Key.Forward(ctx, hiddenStates) + key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...) - // value = qkv[..., (num_attention_heads + num_key_value_heads) * head_dim:].reshape(batch_size, num_key_value_heads, head_dim) - value := qkv.View(ctx, - qkv.Stride(0)*opts.headDim()*(opts.numHeads+opts.numKVHeads), - opts.headDim(), qkv.Stride(0)*opts.headDim(), - opts.numKVHeads, qkv.Stride(1), - batchSize, - ) + value := attn.Value.Forward(ctx, hiddenStates) + value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize) cache.Put(ctx, key, value) key, value, mask := cache.Get(ctx) @@ -165,7 +151,8 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T type MLPBlock struct { Norm *nn.RMSNorm `gguf:"ffn_norm"` Router *nn.Linear `gguf:"ffn_gate_inp"` - GateUp *nn.LinearBatch `gguf:"ffn_gate_up_exps"` + Gate *nn.LinearBatch `gguf:"ffn_gate_exps"` + Up *nn.LinearBatch `gguf:"ffn_up_exps"` Down *nn.LinearBatch `gguf:"ffn_down_exps"` } @@ -185,21 +172,16 @@ func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates, one ml.Tensor, opts * hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1)) - hiddenStates = 
mlp.GateUp.Forward(ctx, hiddenStates, selectedExperts) - hiddenStates = hiddenStates.Reshape(ctx, 2, hiddenStates.Dim(0)/2, hiddenStates.Dim(1), hiddenStates.Dim(2)) + // Compute gate and up separately instead of using fused GateUp + gateStates := mlp.Gate.Forward(ctx, hiddenStates, selectedExperts) + gateStates = gateStates.Clamp(ctx, float32(math.Inf(-1)), 7.0) + gateStates = gateStates.QuickGELU(ctx) - dimStride := []int{hiddenStates.Dim(0) / 2, hiddenStates.Stride(1), hiddenStates.Dim(1), hiddenStates.Stride(2), hiddenStates.Dim(2), hiddenStates.Stride(3), hiddenStates.Dim(3)} + upStates := mlp.Up.Forward(ctx, hiddenStates, selectedExperts) + upStates = upStates.Clamp(ctx, -7.0, 7.0) - glu := hiddenStates.View(ctx, 0, dimStride...) - glu = glu.Contiguous(ctx) - glu = glu.Clamp(ctx, float32(math.Inf(-1)), 7.0) - glu = glu.QuickGELU(ctx) - - linear := hiddenStates.View(ctx, hiddenStates.Stride(0), dimStride...) - linear = linear.Clamp(ctx, -7.0, 7.0) - - hiddenStates = glu.Mul(ctx, linear.Add(ctx, one)) - hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3)) + hiddenStates = gateStates.Mul(ctx, upStates.Add(ctx, one)) + // hiddenStates is now [intermediate_size, num_experts_used, seq*batch] experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts) experts = experts.Mul(ctx, routingWeights)