Fix gpt-oss model architecture to match GGUF tensor format

The gpt-oss model architecture code expected fused tensors (attn_qkv,
ffn_gate_up_exps), but the actual GGUF files contain separate tensors
(attn_q/k/v, ffn_gate_exps/up_exps). The fused fields were left nil by
the loader, causing nil pointer panics during model loading.

Changes:
- model/models/gptoss/model.go: Updated AttentionBlock to use separate
  Query/Key/Value fields instead of a fused QKV field, and modified
  Forward() to compute the three projections separately (see the sketch
  after this list)
- model/models/gptoss/model.go: Updated MLPBlock to use separate Gate/Up
  fields instead of a fused GateUp field, and simplified the Forward()
  logic
- fs/ggml/type.go: Reordered the MXFP4 tensor type constant to match
  the new enum value (see the Go sketch after the ggml.h hunk below)
- ml/backend/ggml/ggml/include/ggml.h: Moved GGML_TYPE_MXFP4 to end of
  enum to match GGUF file format specification
- ml/backend/ggml/ggml/src/ggml.c: Updated type name array to match
  reordered enum
- CLAUDE.md: Documented gpt-oss model compatibility fix
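
For reference, a minimal sketch of the shape of the model.go change,
assuming ollama's usual nn.Linear fields bound by gguf struct tags; the
exact field set and the simplified helper below are illustrative, not
the literal diff:

    package gptoss

    import (
        "github.com/ollama/ollama/ml"
        "github.com/ollama/ollama/ml/nn"
    )

    // Before: fields bound to fused tensor names that the GGUF never
    // contains, so the loader left them nil and the first matmul in
    // Forward() panicked.
    //
    //     type AttentionBlock struct {
    //         QKV *nn.Linear `gguf:"attn_qkv"`
    //     }

    // After: one field per tensor actually present in the GGUF.
    type AttentionBlock struct {
        Query *nn.Linear `gguf:"attn_q"`
        Key   *nn.Linear `gguf:"attn_k"`
        Value *nn.Linear `gguf:"attn_v"`
    }

    // MLPBlock likewise binds the separate expert tensors instead of a
    // fused ffn_gate_up_exps.
    type MLPBlock struct {
        Gate *nn.Linear `gguf:"ffn_gate_exps"`
        Up   *nn.Linear `gguf:"ffn_up_exps"`
    }

    // qkv is a hypothetical helper showing the reworked projection
    // step: three independent matmuls instead of slicing one fused
    // result (head reshaping, RoPE, and cache handling omitted).
    func (b *AttentionBlock) qkv(ctx ml.Context, hidden ml.Tensor) (q, k, v ml.Tensor) {
        q = b.Query.Forward(ctx, hidden)
        k = b.Key.Forward(ctx, hidden)
        v = b.Value.Forward(ctx, hidden)
        return q, k, v
    }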

Result: the gpt-oss:20b model now loads and runs successfully on a
Tesla K80, with all 25 layers offloaded to the GPU.
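
A quick way to reproduce the check, assuming a local build of this
tree (the exact log wording varies by version):

    ollama run gpt-oss:20b "hello"

then confirm in the server log that all 25 layers report as offloaded
to the GPU rather than falling back to CPU.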

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Shang Chieh Tseng
Date:   2025-10-29 23:34:03 +08:00
parent 241a03402e
commit d04ea50ced
5 changed files with 91 additions and 87 deletions

ml/backend/ggml/ggml/include/ggml.h

@@ -353,7 +353,7 @@ extern "C" {
     GGML_TYPE_F16  = 1,
     GGML_TYPE_Q4_0 = 2,
     GGML_TYPE_Q4_1 = 3,
-    GGML_TYPE_MXFP4 = 4, // Formerly removed type GGML_TYPE_Q4_2
+    // GGML_TYPE_Q4_2 = 4, support has been removed
     // GGML_TYPE_Q4_3 = 5, support has been removed
     GGML_TYPE_Q5_0 = 6,
     GGML_TYPE_Q5_1 = 7,
@@ -385,10 +385,11 @@ extern "C" {
     // GGML_TYPE_Q4_0_8_8 = 33,
     GGML_TYPE_TQ1_0 = 34,
     GGML_TYPE_TQ2_0 = 35,
-    // GGML_TYPE_IQ4_NL_4_4 = 36,
+    // GGML_TYPE_IQ4_NL_4_4 = 36, support has been removed from gguf files
     // GGML_TYPE_IQ4_NL_4_8 = 37,
     // GGML_TYPE_IQ4_NL_8_8 = 38,
-    GGML_TYPE_COUNT = 39,
+    GGML_TYPE_MXFP4 = 39,
+    GGML_TYPE_COUNT = 40,
 };

 // precision
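
The commit's matching change in fs/ggml/type.go is not shown in this
view, but it has to mirror this numbering exactly, since GGUF files
store the raw enum value for each tensor. A minimal sketch of the
intended ordering, with assumed constant names (only the numeric values
come from the enum above):

    package ggml

    // TensorType mirrors ggml's enum ggml_type; GGUF encodes this
    // value per tensor, so the numbering must match the C side exactly.
    type TensorType uint32

    const (
        TensorTypeF32  TensorType = 0
        TensorTypeF16  TensorType = 1
        TensorTypeQ4_0 TensorType = 2
        TensorTypeQ4_1 TensorType = 3
        // 4 (Q4_2) and 5 (Q4_3) were removed upstream; their values
        // stay reserved so later types keep their on-disk IDs.
        TensorTypeQ5_0 TensorType = 6
        TensorTypeQ5_1 TensorType = 7
        // ... remaining quant types keep their existing values ...
        TensorTypeMXFP4 TensorType = 39 // appended at the end, not recycled into slot 4
    )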

ml/backend/ggml/ggml/src/ggml.c

@@ -589,13 +589,11 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_q4_1,
         .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
     },
-    [GGML_TYPE_MXFP4] = { // formerly deprecated GGML_TYPE_Q4_2
-        .type_name = "mxfp4",
-        .blck_size = MXFP4,
-        .type_size = sizeof(block_mxfp4),
-        .is_quantized = true,
-        .to_float = (ggml_to_float_t) dequantize_row_mxfp4,
-        .from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    [4] = { // GGML_TYPE_Q4_2
+        .type_name = "DEPRECATED",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
     },
     [5] = { // GGML_TYPE_Q4_3
         .type_name = "DEPRECATED",
@@ -812,6 +810,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .type_size = 0,
         .is_quantized = false,
     },
+    [GGML_TYPE_MXFP4] = {
+        .type_name = "mxfp4",
+        .blck_size = MXFP4,
+        .type_size = sizeof(block_mxfp4),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_mxfp4,
+        .from_float_ref = (ggml_from_float_t) quantize_row_mxfp4_ref,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
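
Since the whole fix amounts to keeping the Go and C tables in sync, a
small regression test is cheap insurance. A hypothetical sketch,
reusing the assumed TensorTypeMXFP4 name from the sketch above:

    package ggml

    import "testing"

    // TestMXFP4TypeID pins MXFP4 to GGUF type ID 39 so a future enum
    // reshuffle cannot silently desynchronize fs/ggml from ggml.h.
    func TestMXFP4TypeID(t *testing.T) {
        if got := uint32(TensorTypeMXFP4); got != 39 {
            t.Fatalf("TensorTypeMXFP4 = %d, want 39", got)
        }
    }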