package ggml

import (
	"math"
	"math/rand"
	"os"
	"testing"

	"github.com/ollama/ollama/ml"

	fsggml "github.com/ollama/ollama/fs/ggml"
)

/*
To get GPUs loading in these tests on Windows, set the library paths and run:

$env:OLLAMA_LIBRARY_PATH="$(pwd)\build\lib\ollama"
$env:PATH="$(pwd)\build\lib\ollama;$env:PATH"

go test .\ml\backend\ggml\... -run TestMXFP4
*/

// MXFP4 reference: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

// E2M1 values
var mxfp4_vals = []float32{
	0.0,  // 0 00 0 = 0x0
	0.5,  // 0 00 1 = 0x1
	1.0,  // 0 01 0 = 0x2
	1.5,  // 0 01 1 = 0x3
	2.0,  // 0 10 0 = 0x4
	3.0,  // 0 10 1 = 0x5
	4.0,  // 0 11 0 = 0x6
	6.0,  // 0 11 1 = 0x7
	0.0,  // 1 00 0 = 0x8
	-0.5, // 1 00 1 = 0x9
	-1.0, // 1 01 0 = 0xa
	-1.5, // 1 01 1 = 0xb
	-2.0, // 1 10 0 = 0xc
	-3.0, // 1 10 1 = 0xd
	-4.0, // 1 11 0 = 0xe
	-6.0, // 1 11 1 = 0xf
}
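
// dequantMXFP4BlockSketch is an illustrative, unoptimized sketch of how one
// MXFP4 block decodes, for orientation when reading the tests below. Each
// block covers 32 elements (per the OCP MX spec) and, in the ggml-style
// layout assumed here, occupies 17 bytes: one shared E8M0 scale byte
// (decoding as 2^(e-127), NaN encoding ignored) followed by 16 bytes of
// packed 4-bit E2M1 codes whose values are listed in mxfp4_vals above. The
// nibble order (low nibbles hold elements 0-15, high nibbles elements 16-31)
// follows the ggml convention and is an assumption; the tests themselves rely
// only on Quantize and ConvertToF32 for conversions.
func dequantMXFP4BlockSketch(block [17]byte) [32]float32 {
	var out [32]float32
	scale := float32(math.Exp2(float64(int(block[0]) - 127))) // E8M0 shared scale
	for j := 0; j < 16; j++ {
		out[j] = mxfp4_vals[block[1+j]&0x0f] * scale
		out[j+16] = mxfp4_vals[block[1+j]>>4] * scale
	}
	return out
}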

func TestMXFP4Ops(t *testing.T) {
	b := setup(t)
	for _, useGPU := range []bool{false, true} {
		useGPU := useGPU
		var label string
		if useGPU {
			label = "gpu"
		} else {
			label = "cpu"
		}
		t.Run(label, func(t *testing.T) {
			t.Run("mulmatid", func(t *testing.T) {
				// Use exact values that are supported without scaling so we can compare against an fp32 tensor
				t.Run("exact", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s00 = 64
					const s01 = 1
					const s02 = 2
					const s10 = s00
					const s11 = 1
					const s12 = 1
					// const s00 = 2880
					// const s01 = 5760
					// const s02 = 32
					// const s10 = s00
					// const s11 = 1
					// const s12 = 64

					data := [s00 * s01 * s02]float32{}
					for i := range data {
						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
					}
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
					t1f := ctx.(*Context).FromFloatSlice(data[:], s00, s01, s02)
					// for i := range len(data) / 32 { // MXFP4 block size
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
					// }
					// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
					// }

					// random 0-1 float
					d2 := [s10 * s11 * s12]float32{}
					for i := range d2 {
						d2[i] = float32(r.Float32())
					}
					// for i := range len(d2) / s10 {
					// vals := [s10]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", d2[i*s10+j])
					// }
					// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
					// }
					t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)

					d3 := [4 * s12]int32{}
					for i := range d3 {
						d3[i] = int32(i) % s02
					}
					t3 := ctx.(*Context).FromIntSlice(d3[:], 4, s12)

					// t.Log("calling MulmatID")
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)
					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2)) // lower precision for CPU accuracy
					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("MulmatID results matched:\n%s", d4)
				})

				t.Run("range", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s0 = 64
					const s1 = 2
					const s2 = 4
					const idlen = 4
					data := [s0 * s1 * s2]float32{}
					inTotal := float32(0)
					for i := range data {
						data[i] = float32(i)
						inTotal += float32(i)
					}
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// Reconvert back to floats to remove the quantization fidelity loss for comparison
					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1, s2)
					t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1, s2)
					// for i := range len(data) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
					// }
					// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
					// }

					d2 := [s0]float32{}
					for i := range d2 {
						// d2[i] = float32(i)
						d2[i] = float32(r.Float32())
					}
					// for i := range len(d2) / s0 {
					// vals := [s0]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
					// }
					// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
					// }
					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)

					// TODO - there might be a CUDA bug here...
					d3 := [idlen]int32{1, 1, 2, 3}
					// for i := range d3 {
					// d3[i] = int32(i) % s2
					// t.Logf("%d] %d", i, d3[i])
					// }
					t3 := ctx.(*Context).FromIntSlice(d3[:], idlen)

					// t.Log("calling Mulmat")
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)
					// Metal has some drift so use reduced precision for dump comparisons
					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(2))
					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(2))
					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("mxfp4 result\n%s", d4)
				})
				t.Run("random", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s00 = 2880
					const s01 = 5760
					const s02 = 32
					const s10 = s00
					const s11 = 1
					const s12 = 64
					const idlen = 4

					data := [s00 * s01 * s02]float32{}
					for i := range data {
						data[i] = float32(r.Float32() * 10.0)
					}
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// Reconvert back to floats to remove the quantization fidelity loss for comparison
					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s00, s01, s02)
					t1f := ctx.(*Context).FromFloatSlice(dataf, s00, s01, s02)
					// for i := range len(data) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
					// }
					// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
					// }

					d2 := [s10 * s11 * s12]float32{}
					for i := range d2 {
						// d2[i] = float32(i)
						d2[i] = float32(r.Float32())
					}
					// for i := range len(d2) / s0 {
					// vals := [s0]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
					// }
					// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
					// }
					t2 := ctx.(*Context).FromFloatSlice(d2[:], s10, s11, s12)

					// arange equiv
					d3 := [idlen * s12]int32{}
					for i := range d3 {
						d3[i] = int32(i) % s02
					}
					t3 := ctx.(*Context).FromIntSlice(d3[:], idlen, s12)

					// t.Log("calling Mulmat")
					// t3 := t1.Mulmat(ctx, t2)
					// t3f := t1f.Mulmat(ctx, t2)
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)
					// Metal and CPU have some drift so use reduced precision for dump comparisons
					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(1))
					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(1))
					// t.Logf("mxfp4 data: \n%s", d4)
					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
				})

				// Use data file(s) with real data
				t.Run("example_7", func(t *testing.T) {
					ctx := initContextOrSkip(t, b, useGPU)
					data0, err := os.ReadFile("mlp-gateup.bin")
					if err != nil {
						t.Skip("missing mlp-gateup.bin file, skipping test")
					}
					data1, err := os.ReadFile("hidden-states-7.bin")
					if err != nil {
						t.Skip("missing hidden-states-7.bin file, skipping test")
					}
					data2, err := os.ReadFile("selected-experts-7.bin")
					if err != nil {
						t.Skip("missing selected-experts-7.bin file, skipping test")
					}

					dtype := ml.DTypeMXFP4
					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)

					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))

					t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 7)
					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))

					t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 7)
					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))

					// t.Log("calling MulmatID")
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)

					d4 := ml.Dump(ctx, t4)
					d4f := ml.Dump(ctx, t4f)

					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("MulmatID results matched:\n%s", d4)
				})

				// Use data file(s) with real data
				t.Run("example_384", func(t *testing.T) {
					ctx := initContextOrSkip(t, b, useGPU)
					data0, err := os.ReadFile("mlp-gateup.bin")
					if err != nil {
						t.Skip("missing mlp-gateup.bin file, skipping test")
					}
					data1, err := os.ReadFile("hidden-states-384.bin")
					if err != nil {
						t.Skip("missing hidden-states-384.bin file, skipping test")
					}
					data2, err := os.ReadFile("selected-experts-384.bin")
					if err != nil {
						t.Skip("missing selected-experts-384.bin file, skipping test")
					}

					dtype := ml.DTypeMXFP4
					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)

					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))

					t2 := ctx.(*Context).FromBytes(ml.DTypeF32, data1, 2880, 1, 384)
					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))

					t3 := ctx.(*Context).FromBytes(ml.DTypeI32, data2, 4, 384)
					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))

					// t.Log("calling MulmatID")
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)

					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))

					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("MulmatID results matched:\n%s", d4)
				})

				// Use data file(s) with real data
				t.Run("example_1d", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					data0, err := os.ReadFile("mlp-gateup.bin")
					if err != nil {
						t.Skip("missing mlp-gateup.bin file, skipping test")
					}

					dtype := ml.DTypeMXFP4
					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)

					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))
					data1 := [2880]float32{}
					for i := range data1 {
						data1[i] = float32(r.Float32())
					}

					t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880)
					// t.Logf("hidden-state: \n%s", ml.Dump(ctx, t2))
					data2 := [4]int32{
						12, 30, 17, 7,
						// 7, 17, 12, 30,
					}

					t3 := ctx.(*Context).FromIntSlice(data2[:], 4)
					// t.Logf("experts: \n%s", ml.Dump(ctx, t3))

					// t.Log("calling MulmatID")
					t4 := t1.MulmatID(ctx, t2, t3)
					t4f := t1f.MulmatID(ctx, t2, t3)

					d4 := ml.Dump(ctx, t4)
					d4f := ml.Dump(ctx, t4f)

					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("MulmatID results matched:\n%s", d4)
				})
			})

			t.Run("mm", func(t *testing.T) {
				t.Run("example", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					data0, err := os.ReadFile("mlp-gateup.bin")
					if err != nil {
						t.Skip("missing mlp-gateup.bin file, skipping test")
					}
					data1 := [2880 * 1 * 32]float32{}
					for i := range data1 {
						data1[i] = float32(r.Float32())
					}

					dtype := ml.DTypeMXFP4
					data0f := ConvertToF32(data0, uint32(fsggml.TensorTypeMXFP4), 2880*5760*32)
					t1 := ctx.(*Context).FromBytes(dtype, data0, 2880, 5760, 32)
					t1f := ctx.(*Context).FromFloatSlice(data0f, 2880, 5760, 32)

					// t.Logf("f32: \n%s", ml.Dump(ctx, t1f))

					t2 := ctx.(*Context).FromFloatSlice(data1[:], 2880, 1, 32)

					t4 := t1.Mulmat(ctx, t2)
					t4f := t1f.Mulmat(ctx, t2)

					d4 := ml.Dump(ctx, t4, ml.DumpWithPrecision(3))
					d4f := ml.Dump(ctx, t4f, ml.DumpWithPrecision(3))

					r4 := t4.Floats()
					r4f := t4f.Floats()
					sim := cosineSimilarity(r4, r4f)
					if sim < 0.99 {
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)

					if d4 != d4f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d4f, d4)
					}
					// t.Logf("Mulmat results matched:\n%s", d4)
				})

				t.Run("exact/3x3", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s10 = 64
					const s11 = 1
					const s12 = 2
					const s20 = s10
					const s21 = 1
					const s22 = 2

					data := [s10 * s11 * s12]float32{}
					for i := range data {
						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
					}
					// for i := range len(data) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
					// }
					// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
					// }
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// for i := range len(mxData) / 17 {
					// vals := [17]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
					// }
					// t.Logf(" %s\n", strings.Join(vals[:], ", "))
					// }
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s10, s11, s12)
					t1f := ctx.(*Context).FromFloatSlice(data[:], s10, s11, s12)

					d2 := [s20 * s21 * s22]float32{}
					for i := range d2 {
						d2[i] = float32(r.Float32())
					}
					t2 := ctx.(*Context).FromFloatSlice(d2[:], s20, s21, s22)

					t3f := t1f.Mulmat(ctx, t2)
					t3 := t1.Mulmat(ctx, t2)
					d3 := ml.Dump(ctx, t3)
					d3f := ml.Dump(ctx, t3f)
					if d3 != d3f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
					}
				})

				t.Run("exact/2x2", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s0 = 32
					const s1 = 64

					data := [s0 * s1]float32{}
					for i := range data {
						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
					}
					// for i := range 4 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
					// }
					// t.Logf(" [%s]\n", strings.Join(vals[:], ", "))
					// }
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// for i := range len(mxData) / 17 {
					// vals := [17]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
					// }
					// t.Logf(" %s\n", strings.Join(vals[:], ", "))
					// }
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
					t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)

					d2 := [s0 * s1]float32{}
					for i := range d2 {
						d2[i] = float32(r.Float32())
					}
					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)

					t3f := t1f.Mulmat(ctx, t2)
					t3 := t1.Mulmat(ctx, t2)
					d3 := ml.Dump(ctx, t3)
					d3f := ml.Dump(ctx, t3f)
					if d3 != d3f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
					}
				})
				t.Run("exact/2x1", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s0 = 64
					const s1 = 4

					data := [s0 * s1]float32{}
					for i := range data {
						data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
					}
					// for i := range len(data) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", data[i*32+j])
					// }
					// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
					// }
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// for i := range len(mxData) / 17 {
					// vals := [17]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2x", mxData[i*17+j])
					// }
					// t.Logf(" %s\n", strings.Join(vals[:], ", "))
					// }
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
					t1f := ctx.(*Context).FromFloatSlice(data[:], s0, s1)

					d2 := [s0]float32{}
					for i := range d2 {
						d2[i] = float32(r.Float32())
					}
					// for i := range len(d2) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", d2[i*32+j])
					// }
					// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
					// }

					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0)

					t3f := t1f.Mulmat(ctx, t2)
					t3 := t1.Mulmat(ctx, t2)
					d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(3))
					d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(3))
					if d3 != d3f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
					}
				})

				t.Run("range/2d", func(t *testing.T) {
					r := rand.New(rand.NewSource(0))
					ctx := initContextOrSkip(t, b, useGPU)
					const s0 = 32
					const s1 = 4
					data := [s0 * s1]float32{}
					inTotal := float32(0)
					for i := range data {
						data[i] = float32(i)
						inTotal += float32(i)
					}
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					// Reconvert back to floats to remove the quantization fidelity loss for comparison
					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
					dtype := ml.DTypeMXFP4
					t1 := ctx.(*Context).FromBytes(dtype, mxData, s0, s1)
					t1f := ctx.(*Context).FromFloatSlice(dataf, s0, s1)
					// for i := range len(data) / 32 {
					// vals := [32]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", dataf[i*32+j])
					// }
					// t.Logf(" t1[%s]\n", strings.Join(vals[:], ", "))
					// }

					d2 := [s0 * s1]float32{}
					for i := range d2 {
						// d2[i] = float32(i)
						d2[i] = float32(r.Float32())
					}
					// for i := range len(d2) / s0 {
					// vals := [s0]string{}
					// for j := range vals {
					// vals[j] = fmt.Sprintf("%0.2f", d2[i*s0+j])
					// }
					// t.Logf(" t2[%s]\n", strings.Join(vals[:], ", "))
					// }

					t2 := ctx.(*Context).FromFloatSlice(d2[:], s0, s1)

					// t.Log("calling Mulmat")
					t3 := t1.Mulmat(ctx, t2)
					t3f := t1f.Mulmat(ctx, t2)
					d3 := ml.Dump(ctx, t3, ml.DumpWithPrecision(2))
					d3f := ml.Dump(ctx, t3f, ml.DumpWithPrecision(2))
					r3 := t3.Floats()
					r3f := t3f.Floats()
					sim := cosineSimilarity(r3, r3f)
					if sim < 0.99 {
						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)
					if d3 != d3f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
					}
				})

				t.Run("range/3d", func(t *testing.T) {
					ctx := initContextOrSkip(t, b, useGPU)
					data := [32 * 4 * 2]float32{}
					inTotal := float32(0)
					for i := range data {
						data[i] = float32(i)
						inTotal += float32(i)
					}
					mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
					dtype := ml.DTypeMXFP4
					// Reconvert back to floats to remove the quantization fidelity loss for comparison
					dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
					t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 4, 2)
					t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 4, 2)

					d2 := [32 * 4 * 2]float32{}
					for i := range d2 {
						d2[i] = 2.0
					}
					t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 4, 2)

					// t.Log("calling Mulmat")
					t3 := t1.Mulmat(ctx, t2)
					t3f := t1f.Mulmat(ctx, t2)
					d3 := ml.Dump(ctx, t3)
					d3f := ml.Dump(ctx, t3f)
					r3 := t3.Floats()
					r3f := t3f.Floats()
					sim := cosineSimilarity(r3, r3f)
					if sim < 0.99 {
						t.Logf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
						t.Fatalf("failed similarity test: %f", sim)
					}
					t.Logf("similarity: %f", sim)
					if d3 != d3f {
						t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
					}
				})
			})
		})
	}
}
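
// mulmatIDReference is a naive, unused sketch of the expert-routed matmul
// ("mul_mat_id") that the mulmatid subtests above exercise: for each token,
// each of its topK selected experts multiplies the token's activation vector
// by that expert's weight matrix. The flat layouts assumed here (weights as
// experts x n rows of length k, activations as tokens x k, ids as
// tokens x topK, output as tokens x topK x n) are illustrative assumptions
// only and are not the backend's memory layout.
func mulmatIDReference(weights []float32, k, n int, acts []float32, ids []int32, tokens, topK int) []float32 {
	out := make([]float32, tokens*topK*n)
	for t := 0; t < tokens; t++ {
		for s := 0; s < topK; s++ {
			e := int(ids[t*topK+s]) // expert selected for this (token, slot)
			for j := 0; j < n; j++ {
				var sum float32
				for i := 0; i < k; i++ {
					sum += weights[(e*n+j)*k+i] * acts[t*k+i]
				}
				out[(t*topK+s)*n+j] = sum
			}
		}
	}
	return out
}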

func TestMXFP4Simple(t *testing.T) {
	b := setup(t)

	t.Run("fixed", func(t *testing.T) {
		ctx := initContextOrSkip(t, b, false)
		data := [32 * 2]float32{
			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
			2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
		}
		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
		dtype := ml.DTypeMXFP4
		// Reconvert back to floats to remove the quantization fidelity loss for comparison
		dataf := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))
		t1 := ctx.(*Context).FromBytes(dtype, mxData, 32, 2)
		t1f := ctx.(*Context).FromFloatSlice(dataf, 32, 2)

		d2 := [32 * 2]float32{
			// 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
			1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
		}
		t2 := ctx.(*Context).FromFloatSlice(d2[:], 32, 2)

		t.Log("calling Mulmat")
		t3f := t1f.Mulmat(ctx, t2)
		t3 := t1.Mulmat(ctx, t2)
		d3 := ml.Dump(ctx, t3)
		d3f := ml.Dump(ctx, t3f)
		if d3 != d3f {
			t.Fatalf("expected (f32): \n%s\n\n but got (mxfp4): \n%s", d3f, d3)
		}
		t.Logf("result (mxfp4): \n%s", d3)
	})
}

func TestMXFP4Conversion(t *testing.T) {
	t.Run("quantize/exact", func(t *testing.T) {
		r := rand.New(rand.NewSource(0))

		data := [32 * 4]float32{}
		for i := range data {
			data[i] = mxfp4_vals[r.Int()%len(mxfp4_vals)]
		}
		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
		newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))

		if len(data) != len(newData) {
			t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
		}
		for i := range data {
			if data[i] != newData[i] {
				t.Logf("started with: %v", data)
				t.Logf("got : %v", newData)
				t.Fatalf("mismatched data starting at offset %d started with %f but got %f", i, data[i], newData[i])
			}
		}
	})
	t.Run("quantize/arange", func(t *testing.T) {
		data := [32 * 8]float32{}
		for i := range data {
			data[i] = float32(i) // / float32(6.0)
		}
		mxData := Quantize(fsggml.TensorTypeMXFP4, data[:], []uint64{uint64(len(data))})
		newData := ConvertToF32(mxData, uint32(fsggml.TensorTypeMXFP4), uint64(len(data)))

		if len(data) != len(newData) {
			t.Fatalf("length mismatch. started with %d but got %d", len(data), len(newData))
		}
		sim := cosineSimilarity(data[:], newData)
		if sim < 0.99 {
			t.Fatalf("failed similarity test: %f", sim)
		}
		t.Logf("similarity: %f", sim)
	})
}

func dotProduct[V float32 | float64](v1, v2 []V) V {
	var result V = 0
	for i := range v1 {
		result += v1[i] * v2[i]
	}
	return result
}

func magnitude[V float32 | float64](v []V) V {
	var result V = 0
	for _, val := range v {
		result += val * val
	}
	return V(math.Sqrt(float64(result)))
}

func cosineSimilarity[V float32 | float64](v1, v2 []V) V {
	return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2))
}