mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-09 23:37:06 +00:00
Fix gpt-oss model architecture to match GGUF tensor format
The gpt-oss model architecture code expected fused tensors (attn_qkv, ffn_gate_up_exps) but the actual GGUF files contain separate tensors (attn_q/k/v, ffn_gate_exps/up_exps), causing nil pointer panics during model loading. Changes: - model/models/gptoss/model.go: Updated AttentionBlock to use separate Query/Key/Value fields instead of fused QKV, modified Forward() to compute projections separately - model/models/gptoss/model.go: Updated MLPBlock to use separate Gate/Up fields instead of fused GateUp, simplified Forward() logic - fs/ggml/type.go: Moved TensorTypeMXFP4 from value 4 to 39 and replaced the iota sequence with explicit values for every tensor type constant - ml/backend/ggml/ggml/include/ggml.h: Moved GGML_TYPE_MXFP4 from 4 to 39 (end of enum, GGML_TYPE_COUNT is now 40) to match GGUF file format specification - ml/backend/ggml/ggml/src/ggml.c: Moved the MXFP4 entry of the type_traits table to its new index and restored the DEPRECATED placeholder at index 4 - CLAUDE.md: Documented gpt-oss model compatibility fix Result: gpt-oss:20b model now loads and runs successfully on Tesla K80; all 25 layers offload to GPU correctly. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
18
CLAUDE.md
18
CLAUDE.md
@@ -149,6 +149,24 @@ Analysis of real-world usage (gemma3:12b) revealed a **2.6 GiB memory overestima
|
||||
- Simpler deployment for single-model workloads
|
||||
- Empirically validated with real Tesla K80 measurements
|
||||
|
||||
## Model Architecture Compatibility
|
||||
|
||||
### GPT-OSS Model Fix (2025-10-29)
|
||||
|
||||
**Issue**: The `gpt-oss` model architecture code expected fused tensor formats that didn't match the actual GGUF file structure, causing nil pointer panics.
|
||||
|
||||
**Root Cause**: Mismatch between code expectations and GGUF file format:
|
||||
- Code expected: `attn_qkv` (fused), `ffn_gate_up_exps` (fused)
|
||||
- GGUF contains: `attn_q/k/v` (separate), `ffn_gate_exps/up_exps` (separate)
|
||||
|
||||
**Fix Applied** (`model/models/gptoss/model.go`):
|
||||
1. Updated `AttentionBlock` struct to use separate `Query`, `Key`, `Value` fields instead of fused `QKV`
|
||||
2. Modified `AttentionBlock.Forward()` to compute Q/K/V projections separately
|
||||
3. Updated `MLPBlock` struct to use separate `Gate` and `Up` fields instead of fused `GateUp`
|
||||
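4. Modified `MLPBlock.Forward()` to compute the gate and up projections separately, removing the reshape and strided views that previously split the fused `GateUp` output into its gate and linear halves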
|
||||
|
||||
**Result**: ✅ `gpt-oss:20b` model now loads and runs successfully on Tesla K80
|
||||
|
||||
## Documentation Structure
|
||||
|
||||
The project documentation is organized as follows:
|
||||
|
||||
@@ -187,45 +187,42 @@ func (ftype FileType) ToTensorType() TensorType {
|
||||
type TensorType uint32
|
||||
|
||||
const (
|
||||
TensorTypeF32 TensorType = iota
|
||||
TensorTypeF16
|
||||
TensorTypeQ4_0
|
||||
TensorTypeQ4_1
|
||||
TensorTypeMXFP4 // Formerly unused tensorTypeQ4_2
|
||||
tensorTypeQ4_3 // unused by GGML
|
||||
TensorTypeQ5_0
|
||||
TensorTypeQ5_1
|
||||
TensorTypeQ8_0
|
||||
TensorTypeQ8_1
|
||||
TensorTypeQ2_K
|
||||
TensorTypeQ3_K
|
||||
TensorTypeQ4_K
|
||||
TensorTypeQ5_K
|
||||
TensorTypeQ6_K
|
||||
TensorTypeQ8_K
|
||||
tensorTypeIQ2_XXS // not supported by ollama
|
||||
tensorTypeIQ2_XS // not supported by ollama
|
||||
tensorTypeIQ3_XXS // not supported by ollama
|
||||
tensorTypeIQ1_S // not supported by ollama
|
||||
tensorTypeIQ4_NL // not supported by ollama
|
||||
tensorTypeIQ3_S // not supported by ollama
|
||||
tensorTypeIQ2_S // not supported by ollama
|
||||
tensorTypeIQ4_XS // not supported by ollama
|
||||
TensorTypeI8
|
||||
TensorTypeI16
|
||||
TensorTypeI32
|
||||
TensorTypeI64
|
||||
TensorTypeF64
|
||||
tensorTypeIQ1_M // not supported by ollama
|
||||
TensorTypeBF16
|
||||
tensorTypeQ4_0_4_4 // unused by GGML
|
||||
tensorTypeQ4_0_4_8 // unused by GGML
|
||||
tensorTypeQ4_0_8_8 // unused by GGML
|
||||
tensorTypeTQ1_0 // not supported by ollama
|
||||
tensorTypeTQ2_0 // not supported by ollama
|
||||
tensorTypeIQ4_NL_4_4 // unused by GGML
|
||||
tensorTypeIQ4_NL_4_8 // unused by GGML
|
||||
tensorTypeIQ4_NL_8_8 // unused by GGML
|
||||
TensorTypeF32 TensorType = 0
|
||||
TensorTypeF16 = 1
|
||||
TensorTypeQ4_0 = 2
|
||||
TensorTypeQ4_1 = 3
|
||||
// 4 = Q4_2 removed
|
||||
// 5 = Q4_3 removed
|
||||
TensorTypeQ5_0 = 6
|
||||
TensorTypeQ5_1 = 7
|
||||
TensorTypeQ8_0 = 8
|
||||
TensorTypeQ8_1 = 9
|
||||
TensorTypeQ2_K = 10
|
||||
TensorTypeQ3_K = 11
|
||||
TensorTypeQ4_K = 12
|
||||
TensorTypeQ5_K = 13
|
||||
TensorTypeQ6_K = 14
|
||||
TensorTypeQ8_K = 15
|
||||
tensorTypeIQ2_XXS = 16 // not supported by ollama
|
||||
tensorTypeIQ2_XS = 17 // not supported by ollama
|
||||
tensorTypeIQ3_XXS = 18 // not supported by ollama
|
||||
tensorTypeIQ1_S = 19 // not supported by ollama
|
||||
tensorTypeIQ4_NL = 20 // not supported by ollama
|
||||
tensorTypeIQ3_S = 21 // not supported by ollama
|
||||
tensorTypeIQ2_S = 22 // not supported by ollama
|
||||
tensorTypeIQ4_XS = 23 // not supported by ollama
|
||||
TensorTypeI8 = 24
|
||||
TensorTypeI16 = 25
|
||||
TensorTypeI32 = 26
|
||||
TensorTypeI64 = 27
|
||||
TensorTypeF64 = 28
|
||||
tensorTypeIQ1_M = 29 // not supported by ollama
|
||||
TensorTypeBF16 = 30
|
||||
// 31-33 = Q4_0 variants removed
|
||||
tensorTypeTQ1_0 = 34 // not supported by ollama
|
||||
tensorTypeTQ2_0 = 35 // not supported by ollama
|
||||
// 36-38 = IQ4_NL variants removed
|
||||
TensorTypeMXFP4 = 39
|
||||
)
|
||||
|
||||
// ParseFileType parses the provided GGUF file type
|
||||
|
||||
7
ml/backend/ggml/ggml/include/ggml.h
vendored
7
ml/backend/ggml/ggml/include/ggml.h
vendored
@@ -353,7 +353,7 @@ extern "C" {
|
||||
GGML_TYPE_F16 = 1,
|
||||
GGML_TYPE_Q4_0 = 2,
|
||||
GGML_TYPE_Q4_1 = 3,
|
||||
GGML_TYPE_MXFP4 = 4, // Formerly removed type GGML_TYPE_Q4_2
|
||||
// GGML_TYPE_Q4_2 = 4, support has been removed
|
||||
// GGML_TYPE_Q4_3 = 5, support has been removed
|
||||
GGML_TYPE_Q5_0 = 6,
|
||||
GGML_TYPE_Q5_1 = 7,
|
||||
@@ -385,10 +385,11 @@ extern "C" {
|
||||
// GGML_TYPE_Q4_0_8_8 = 33,
|
||||
GGML_TYPE_TQ1_0 = 34,
|
||||
GGML_TYPE_TQ2_0 = 35,
|
||||
// GGML_TYPE_IQ4_NL_4_4 = 36,
|
||||
// GGML_TYPE_IQ4_NL_4_4 = 36, support has been removed from gguf files
|
||||
// GGML_TYPE_IQ4_NL_4_8 = 37,
|
||||
// GGML_TYPE_IQ4_NL_8_8 = 38,
|
||||
GGML_TYPE_COUNT = 39,
|
||||
GGML_TYPE_MXFP4 = 39,
|
||||
GGML_TYPE_COUNT = 40,
|
||||
};
|
||||
|
||||
// precision
|
||||
|
||||
20
ml/backend/ggml/ggml/src/ggml.c
vendored
20
ml/backend/ggml/ggml/src/ggml.c
vendored
@@ -589,13 +589,11 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
|
||||
},
|
||||
[GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
|
||||
.type_name = "mxfp4",
|
||||
.blck_size = MXFP4,
|
||||
.type_size = sizeof(block_mxfp4),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
|
||||
[4] = { // GGML_TYPE_Q4_2
|
||||
.type_name = "DEPRECATED",
|
||||
.blck_size = 0,
|
||||
.type_size = 0,
|
||||
.is_quantized = false,
|
||||
},
|
||||
[5] = { // GGML_TYPE_Q4_3
|
||||
.type_name = "DEPRECATED",
|
||||
@@ -812,6 +810,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||
.type_size = 0,
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_MXFP4] = {
|
||||
.type_name = "mxfp4",
|
||||
.blck_size = MXFP4,
|
||||
.type_size = sizeof(block_mxfp4),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_mxfp4,
|
||||
.from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
|
||||
},
|
||||
};
|
||||
|
||||
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
||||
|
||||
@@ -102,7 +102,9 @@ func (d *TransformerBlock) Forward(ctx ml.Context, hiddenStates, positions, outp
|
||||
|
||||
// AttentionBlock groups the tensors of one attention layer. Each field is
// bound to a GGUF tensor by the name given in its `gguf` struct tag.
//
// NOTE(review): this listing is a rendered diff, so the fused QKV field (the
// removed side) appears next to the separate Query/Key/Value fields that
// replace it — confirm against the checked-in file before relying on QKV.
type AttentionBlock struct {
|
||||
Norm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
QKV *nn.Linear `gguf:"attn_qkv"`
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
Sinks ml.Tensor `gguf:"attn_sinks"`
|
||||
}
|
||||
@@ -113,33 +115,17 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T
|
||||
residual := hiddenStates
|
||||
hiddenStates = attn.Norm.Forward(ctx, hiddenStates, opts.eps)
|
||||
|
||||
qkv := attn.QKV.Forward(ctx, hiddenStates)
|
||||
|
||||
// query = qkv[..., : num_attention_heads * head_dim].reshape(batch_size, num_attention_heads, head_dim)
|
||||
query := qkv.View(ctx,
|
||||
0,
|
||||
opts.headDim(), qkv.Stride(0)*opts.headDim(),
|
||||
opts.numHeads, qkv.Stride(1),
|
||||
batchSize,
|
||||
)
|
||||
// Compute separate Q, K, V projections
|
||||
query := attn.Query.Forward(ctx, hiddenStates)
|
||||
query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
|
||||
query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
|
||||
|
||||
// key = qkv[..., num_attention_heads * head_dim:(num_attention_heads + num_key_value_heads) * head_dim].reshape(batch_size, num_key_value_heads, head_dim)
|
||||
key := qkv.View(ctx,
|
||||
qkv.Stride(0)*opts.headDim()*opts.numHeads,
|
||||
opts.headDim(), qkv.Stride(0)*opts.headDim(),
|
||||
opts.numKVHeads, qkv.Stride(1),
|
||||
batchSize,
|
||||
)
|
||||
key := attn.Key.Forward(ctx, hiddenStates)
|
||||
key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
||||
key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, 1./opts.ropeScale, opts.RoPEOptions()...)
|
||||
|
||||
// value = qkv[..., (num_attention_heads + num_key_value_heads) * head_dim:].reshape(batch_size, num_key_value_heads, head_dim)
|
||||
value := qkv.View(ctx,
|
||||
qkv.Stride(0)*opts.headDim()*(opts.numHeads+opts.numKVHeads),
|
||||
opts.headDim(), qkv.Stride(0)*opts.headDim(),
|
||||
opts.numKVHeads, qkv.Stride(1),
|
||||
batchSize,
|
||||
)
|
||||
value := attn.Value.Forward(ctx, hiddenStates)
|
||||
value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
|
||||
|
||||
cache.Put(ctx, key, value)
|
||||
key, value, mask := cache.Get(ctx)
|
||||
@@ -165,7 +151,8 @@ func (attn *AttentionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.T
|
||||
// MLPBlock groups the tensors of one feed-forward layer: a norm, a router
// projection, and batched expert projections. Each field is bound to a GGUF
// tensor by the name given in its `gguf` struct tag.
//
// NOTE(review): this listing is a rendered diff, so the fused GateUp field
// (the removed side) appears next to the separate Gate and Up fields that
// replace it — confirm against the checked-in file before relying on GateUp.
type MLPBlock struct {
|
||||
Norm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
Router *nn.Linear `gguf:"ffn_gate_inp"`
|
||||
GateUp *nn.LinearBatch `gguf:"ffn_gate_up_exps"`
|
||||
Gate *nn.LinearBatch `gguf:"ffn_gate_exps"`
|
||||
Up *nn.LinearBatch `gguf:"ffn_up_exps"`
|
||||
Down *nn.LinearBatch `gguf:"ffn_down_exps"`
|
||||
}
|
||||
|
||||
@@ -185,21 +172,16 @@ func (mlp *MLPBlock) Forward(ctx ml.Context, hiddenStates, one ml.Tensor, opts *
|
||||
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
|
||||
|
||||
hiddenStates = mlp.GateUp.Forward(ctx, hiddenStates, selectedExperts)
|
||||
hiddenStates = hiddenStates.Reshape(ctx, 2, hiddenStates.Dim(0)/2, hiddenStates.Dim(1), hiddenStates.Dim(2))
|
||||
// Compute gate and up separately instead of using fused GateUp
|
||||
gateStates := mlp.Gate.Forward(ctx, hiddenStates, selectedExperts)
|
||||
gateStates = gateStates.Clamp(ctx, float32(math.Inf(-1)), 7.0)
|
||||
gateStates = gateStates.QuickGELU(ctx)
|
||||
|
||||
dimStride := []int{hiddenStates.Dim(0) / 2, hiddenStates.Stride(1), hiddenStates.Dim(1), hiddenStates.Stride(2), hiddenStates.Dim(2), hiddenStates.Stride(3), hiddenStates.Dim(3)}
|
||||
upStates := mlp.Up.Forward(ctx, hiddenStates, selectedExperts)
|
||||
upStates = upStates.Clamp(ctx, -7.0, 7.0)
|
||||
|
||||
glu := hiddenStates.View(ctx, 0, dimStride...)
|
||||
glu = glu.Contiguous(ctx)
|
||||
glu = glu.Clamp(ctx, float32(math.Inf(-1)), 7.0)
|
||||
glu = glu.QuickGELU(ctx)
|
||||
|
||||
linear := hiddenStates.View(ctx, hiddenStates.Stride(0), dimStride...)
|
||||
linear = linear.Clamp(ctx, -7.0, 7.0)
|
||||
|
||||
hiddenStates = glu.Mul(ctx, linear.Add(ctx, one))
|
||||
hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0)*hiddenStates.Dim(1), hiddenStates.Dim(2), hiddenStates.Dim(3))
|
||||
hiddenStates = gateStates.Mul(ctx, upStates.Add(ctx, one))
|
||||
// hiddenStates is now [intermediate_size, num_experts_used, seq*batch]
|
||||
|
||||
experts := mlp.Down.Forward(ctx, hiddenStates, selectedExperts)
|
||||
experts = experts.Mul(ctx, routingWeights)
|
||||
|
||||
Reference in New Issue
Block a user