ollama37/ml/backend/ggml/mxfp4_test.go
Michael Yang fa7776fd24 gpt-oss (#11672)
* bf16

* tests

* gpt-oss

* enable gptoss for engine

* rough estimate

* convert to mxfp4

* handle safetensors U8

* clamp glu/linear

* update tokenizer

* MXFP4 support

This implements the Open Compute Microscaling (MX) FP4 format
as a tensor type with backend implementations focusing
on mulmat and mulmatid on CPU, CUDA, and Metal.
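
For reference, a block in this format packs 32 elements into 17 bytes: one shared E8M0 (power-of-two) scale byte followed by 16 bytes of packed E2M1 nibbles. A minimal decode sketch, assuming low-nibble-first ordering and the standard bias of 127 (the backend's internal layout may order nibbles differently):

// decodeMXFP4Block is an illustrative sketch, not the backend implementation.
// block[0] is the shared E8M0 scale; block[1:] holds 32 packed E2M1 codes.
func decodeMXFP4Block(block [17]byte) [32]float32 {
	// E2M1 code points from the OCP MX spec (same table as mxfp4_vals below).
	e2m1 := [16]float32{0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6}
	scale := float32(math.Exp2(float64(int(block[0]) - 127))) // E8M0: 2^(e-127)
	var out [32]float32
	for j := 0; j < 16; j++ {
		b := block[1+j]
		out[2*j] = e2m1[b&0x0f] * scale   // low nibble (assumed element order)
		out[2*j+1] = e2m1[b>>4] * scale   // high nibble
	}
	return out
}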

* Unit tests for MXFP4 support

This exercises various operations and shapes on both CPU and GPU (if detected
on the system)

* cuda graph

* unit test adjustments

* cuda: optimize memory access

Read 4 bytes at a time (8 elements) when performing mul_mat_vec_mxfp4
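
The kernel itself is CUDA, but the access pattern can be sketched in Go: load one 32-bit word (four packed bytes) and expand it into eight E2M1 nibble codes before applying the shared scale. The helper below is hypothetical and only illustrates the unpacking:

// unpack8 expands four packed bytes (read as one 32-bit word) into eight
// 4-bit E2M1 codes; a lookup against the E2M1 table then yields the values.
func unpack8(qs []byte, off int) [8]uint8 {
	word := uint32(qs[off]) | uint32(qs[off+1])<<8 | uint32(qs[off+2])<<16 | uint32(qs[off+3])<<24
	var codes [8]uint8
	for i := range codes {
		codes[i] = uint8((word >> (4 * i)) & 0x0f)
	}
	return codes
}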

* mac: fix crash on old macos versions

cblas_sgemm is only supported on v13.3 and up; however, bf16 is only
supported on v14+, so we were falling back to ggml-blas and
crashing on bf16 tensors. Checking whether the function is null
seems to be the simplest way to conditionally avoid registering the
backend.

* server: Minimum context length for gptoss

This model requires a minimum context length of 8192 to function
effectively. Users can set higher values through all normal mechanisms,
but lower values will be silently raised to the minimum.
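
A minimal sketch of that clamp (the constant and function names here are hypothetical, not the server's actual identifiers):

// clampContextLength enforces the model's minimum usable context window.
func clampContextLength(requested int) int {
	const gptossMinContext = 8192
	if requested < gptossMinContext {
		return gptossMinContext // values below the minimum are silently raised
	}
	return requested
}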

* ggml: Multiply by numParallel for gptoss sliding window

When computing the graph size estimate, the context size is already
multiplied by numParallel so estimates reflect that. However, since
sliding window models use a smaller, fixed context size, they need
to manually take numParallel into account.
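
In other words, the KV length used for the estimate looks roughly like the sketch below (hypothetical names; the full context already includes the parallelism factor, while the fixed sliding window must be scaled explicitly):

// effectiveKVLength picks the per-layer KV length used in the size estimate.
func effectiveKVLength(contextLen, slidingWindow, numParallel int) int {
	// contextLen was already multiplied by numParallel by the caller.
	if slidingWindow > 0 && slidingWindow*numParallel < contextLen {
		return slidingWindow * numParallel
	}
	return contextLen
}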

* gpt-oss integration

includes harmony parser and thinking levels, etc.

* fix sync

* fix tests

* fix lint

---------

Co-authored-by: Daniel Hiltgen <daniel@ollama.com>
Co-authored-by: Jesse Gross <jesse@ollama.com>
Co-authored-by: Devon Rifkin <drifkin@drifkin.net>
2025-08-05 12:21:16 -07:00


package ggml

import (
	"math"
	"math/rand"
	"os"
	"testing"

	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml"
)

/*
To get GPUs loading in these tests on windows...
$env:OLLAMA_LIBRARY_PATH="$(pwd)\build\lib\ollama"
$env:PATH="$(pwd)\build\lib\ollama;$env:PATH"
go test .\ml\backend\ggml\... -run TestMXFP4
*/
// MXFP4 reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
// E2M1 values
var mxfp4_vals = []float32{
0.0, // 0 00 0 = 0x0
0.5, // 0 00 1 = 0x1
1.0, // 0 01 0 = 0x2
1.5, // 0 01 1 = 0x3
2.0, // 0 10 0 = 0x4
3.0, // 0 10 1 = 0x5
4.0, // 0 11 0 = 0x6
6.0, // 0 11 1 = 0x7
0.0, // 1 00 0 = 0x8
-0.5, // 1 00 1 = 0x9
-1.0, // 1 01 0 = 0xa
-1.5, // 1 01 1 = 0xb
-2.0, // 1 10 0 = 0xc
-3.0, // 1 10 1 = 0xd
-4.0, // 1 11 0 = 0xe
-6.0, // 1 11 1 = 0xf
}
func TestMXFP4Ops(t *testing.T) {
b := setup(t)
for _, useGPU := range []bool{false, true} {
useGPU := useGPU
var label string
if useGPU {
label = "gpu"
} else {
label = "cpu"
}
t.Run(label, func(t *testing.T) {
t.Run("mulmatid", func(t *testing.T) {
// Use exact values that are supported without scaling so we can compare against an fp32 tensor
t.Run("exact", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s00 = 64
const s01 = 1
const s02 = 2
const s10 = s00
const s11 = 1
const s12 = 1
// const s00 = 2880
// const s01 = 5760
// const s02 = 32
// const s10 = s00
// const s11 = 1
// const s12 = 64
data := [s00 * s01 * s02]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
t1f := ctx.(*Context).FromFloatSlice(data[:], s00, s01, s02)
// for i := range len(data) / 32 { // MXFP4 block size
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
// random 0-1 float
d2 := [s10 * s11 * s12]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s10 {
// vals := [s10]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s10+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
d3 := [4 * s12]int32{}
for i := range d3 {
d3[i] = int32(i) % s02
}
t3 := ctx.(*Context).FromIntSlice(d3[:], 4, s12)
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2)) // lower precision for CPU accuracy
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
t.Run("range", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 64
const s1 = 2
const s2 = 4
const idlen = 4
data := [s0 * s1 * s2]float32{}
inTotal := float32(0)
for i := range data {
data[i] = float32(i)
inTotal += float32(i)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1, s2)
t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1, s2)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s0]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
// TODO - there might be a CUDA bug here...
d3 := [idlen]int32{1, 1, 2, 3}
// for i := range d3 {
// d3[i] = int32(i) % s2
// t.Logf("%d] %d", i, d3[i])
// }
t3 := ctx.(*Context).FromIntSlice(d3[:], idlen)
// t.Log("calling Mulmat")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
// Metal has some drift so use reduced precision for dump comparisons
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("mxfp4 result\n%s", d4)
})
t.Run("random", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s00 = 2880
const s01 = 5760
const s02 = 32
const s10 = s00
const s11 = 1
const s12 = 64
const idlen = 4
data := [s00 * s01 * s02]float32{}
for i := range data {
data[i] = float32(r.Float32() * 10.0)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
t1f := ctx.(*Context).FromFloatSlice(dataf, s00, s01, s02)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s10 * s11 * s12]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)
// arange equiv
d3 := [idlen * s12]int32{}
for i := range d3 {
d3[i] = int32(i) % s02
}
t3 := ctx.(*Context).FromIntSlice(d3[:], idlen, s12)
// t.Log("calling Mulmat")
// t3 := t1.Mulmat(ctx, t2)
// t3f := t1f.Mulmat(ctx, t2)
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
// Metal and CPU have some drift so use reduced precision for dump comparisons
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(1))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(1))
// t.Logf("mxfp4 data: \n%s", d4)
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
})
// Use data file(s) with real data
t.Run("example_7", func(t *testing.T) {
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
data1, err := os.ReadFile("hidden-states-7.bin")
if err != nil {
t.Skip("missing hidden-states.bin file, skipping test")
}
data2, err := os.ReadFile("selected-experts-7.bin")
if err != nil {
t.Skip("missing selected-experts.bin file, skipping test")
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 7)
// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 7)
// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4)
d4f := ml.Dump(ctx, t4f)
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
// Use data file(s) with real data
t.Run("example_384", func(t *testing.T) {
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
data1, err := os.ReadFile("hidden-states-384.bin")
if err != nil {
t.Skip("missing hidden-states.bin file, skipping test")
}
data2, err := os.ReadFile("selected-experts-384.bin")
if err != nil {
t.Skip("missing selected-experts.bin file, skipping test")
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 384)
// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 384)
// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
// Use data file(s) with real data
t.Run("example_1d", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
data1 := [2880]float32{}
for i := range data1 {
data1[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880)
// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
data2 := [4]int32{
12, 30, 17, 7,
// 7, 17, 12, 30,
}
t3 := ctx.(*Context).FromIntSlice(data2[:], 4)
// t.Logf("experts: \n%s", ml.Dump(ctx, t3))
// t.Log("calling MulmatID")
t4 := t1.MulmatID(ctx, t2, t3)
t4f := t1f.MulmatID(ctx, t2, t3)
d4 := ml.Dump(ctx, t4)
d4f := ml.Dump(ctx, t4f)
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("MulmatID results matched:\n%s", d4)
})
})
t.Run("mm", func(t *testing.T) {
t.Run("example", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
data0, err := os.ReadFile("mlp-gateup.bin")
if err != nil {
t.Skip("missing mlp-gateup.bin file, skipping test")
}
data1 := [2880 * 1 * 32]float32{}
for i := range data1 {
data1[i] = float32(r.Float32())
}
dtype := ml.DTypeMXFP4
data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)
// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880, 1, 32)
t4 := t1.Mulmat(ctx, t2)
t4f := t1f.Mulmat(ctx, t2)
d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))
r4 := t4.Floats()
r4f := t4f.Floats()
sim := cosineSimilarity(r4, r4f)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d4 != d4f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
}
// t.Logf("Mulmat results matched:\n%s", d4)
})
t.Run("exact/3x3", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s10 = 64
const s11 = 1
const s12 = 2
const s20 = s10
const s21 = 1
const s22 = 2
data := [s10 * s11 * s12]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s10, s11, s12)
t1f := ctx.(*Context).FromFloatSlice(data[:], s10, s11, s12)
d2 := [s20 * s21 * s22]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], s20, s21, s22)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("exact/2x2", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 32
const s1 = 64
data := [s0 * s1]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range 4 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
d2 := [s0 * s1]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("exact/2x1", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 64
const s1 = 4
data := [s0 * s1]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// for i := range len(mxData) / 17 {
// vals := [17]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
// }
// t.Logf(" %s\n", strings.Join(vals[:], ", "))
// }
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)
d2 := [s0]float32{}
for i := range d2 {
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*32+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(3))
d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(3))
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("range/2d", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
ctx := initContextOrSkip(t, b, useGPU)
const s0 = 32
const s1 = 4
data := [s0 * s1]float32{}
inTotal := float32(0)
for i := range data {
data[i] = float32(i)
inTotal += float32(i)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
dtype := ml.DTypeMXFP4
t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1)
// for i := range len(data) / 32 {
// vals := [32]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
// }
// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
// }
d2 := [s0 * s1]float32{}
for i := range d2 {
// d2[i] = float32(i)
d2[i] = float32(r.Float32())
}
// for i := range len(d2) / s0 {
// vals := [s0]string{}
// for j := range vals {
// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
// }
// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
// }
t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)
// t.Log("calling Mulmat")
t3 := t1.Mulmat(ctx, t2)
t3f := t1f.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(2))
d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(2))
r3 := t3.Floats()
r3f := t3f.Floats()
sim := cosineSimilarity(r3, r3f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
t.Run("range/3d", func(t *testing.T) {
ctx := initContextOrSkip(t, b, useGPU)
data := [32 * 4 * 2]float32{}
inTotal := float32(0)
for i := range data {
data[i] = float32(i)
inTotal += float32(i)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
dtype := ml.DTypeMXFP4
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 4, 2)
t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 4, 2)
d2 := [32 * 4 * 2]float32{}
for i := range d2 {
d2[i] = 2.0
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 4, 2)
// t.Log("calling Mulmat")
t3 := t1.Mulmat(ctx, t2)
t3f := t1f.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
r3 := t3.Floats()
r3f := t3f.Floats()
sim := cosineSimilarity(r3, r3f)
if sim < 0.99 {
t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
})
})
})
}
}
func TestMXFP4Simple(t *testing.T) {
b := setup(t)
t.Run("fixed", func(t *testing.T) {
ctx := initContextOrSkip(t, b, false)
data := [32 * 2]float32{
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
dtype := ml.DTypeMXFP4
// Reconvert back to floats to remove the quantization fidelity loss for comparison
dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 2)
t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 2)
d2 := [32 * 2]float32{
// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}
t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 2)
t.Log("calling Mulmat")
t3f := t1f.Mulmat(ctx, t2)
t3 := t1.Mulmat(ctx, t2)
d3 := ml.Dump(ctx, t3)
d3f := ml.Dump(ctx, t3f)
if d3 != d3f {
t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
}
t.Logf("result (mxfp4): \n%s", d3)
})
}
func TestMXFP4Conversion(t *testing.T) {
t.Run("quantize/exact", func(t *testing.T) {
r := rand.New(rand.NewSource(0))
data := [32 * 4]float32{}
for i := range data {
data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
if len(data) != len(newData) {
t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
}
for i := range data {
if data[i] != newData[i] {
t.Logf("started with: %v", data)
t.Logf("got : %v", newData)
t.Fatalf("mismatched data starting at offset %d started with %f but got %f", i, data[i], newData[i])
}
}
})
t.Run("quantize/arange", func(t *testing.T) {
data := [32 * 8]float32{}
for i := range data {
data[i] = float32(i) // / float32(6.0)
}
mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
if len(data) != len(newData) {
t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
}
sim := cosineSimilarity(data[:], newData)
if sim < 0.99 {
t.Fatalf("failed similarity test: %f", sim)
}
t.Logf("similarity: %f", sim)
})
}
func dotProduct[V float32 | float64](v1, v2 []V) V {
var result V = 0
for i := range v1 {
result += v1[i] * v2[i]
}
return result
}
func magnitude[V float32 | float64](v []V) V {
var result V = 0
for _, val := range v {
result += val * val
}
return V(math.Sqrt(float64(result)))
}
func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
}