mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 07:46:59 +00:00
Merge upstream ollama/ollama main branch while preserving CUDA 3.7 support
- Added support for new gpt-oss model from upstream - Preserved CUDA Compute Capability 3.7 (Tesla K80) support - Kept CUDA 11 configuration alongside CUDA 12 - Maintained all documentation specific to ollama37 fork - Integrated new tool parsing improvements - Added new backend methods and patches from upstream
This commit is contained in:
@@ -202,6 +202,8 @@ func ConvertModel(fsys fs.FS, f *os.File) error {
|
||||
conv = &bertModel{}
|
||||
case "CohereForCausalLM":
|
||||
conv = &commandrModel{}
|
||||
case "GptOssForCausalLM":
|
||||
conv = &gptossModel{}
|
||||
default:
|
||||
return fmt.Errorf("unsupported architecture %q", p.Architectures[0])
|
||||
}
|
||||
|
||||
178
convert/convert_gptoss.go
Normal file
178
convert/convert_gptoss.go
Normal file
@@ -0,0 +1,178 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"cmp"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/pdevine/tensor"
|
||||
"github.com/pdevine/tensor/native"
|
||||
)
|
||||
|
||||
type gptossModel struct {
|
||||
ModelParameters
|
||||
HiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
AttentionHeads uint32 `json:"num_attention_heads"`
|
||||
KeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
HeadDim uint32 `json:"head_dim"`
|
||||
Experts uint32 `json:"num_experts"`
|
||||
ExpertsPerToken uint32 `json:"experts_per_token"`
|
||||
RMSNormEpsilon float32 `json:"rms_norm_eps"`
|
||||
InitialContextLength uint32 `json:"initial_context_length"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
RopeScalingFactor float32 `json:"rope_scaling_factor"`
|
||||
SlidingWindow uint32 `json:"sliding_window"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*gptossModel)(nil)
|
||||
|
||||
func (m *gptossModel) KV(t *Tokenizer) ggml.KV {
|
||||
kv := m.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "gptoss"
|
||||
kv["general.file_type"] = uint32(4)
|
||||
kv["gptoss.context_length"] = uint32(m.RopeScalingFactor * float32(m.InitialContextLength))
|
||||
kv["gptoss.block_count"] = m.HiddenLayers
|
||||
kv["gptoss.embedding_length"] = m.HiddenSize
|
||||
kv["gptoss.feed_forward_length"] = m.IntermediateSize
|
||||
kv["gptoss.expert_count"] = m.Experts
|
||||
kv["gptoss.expert_used_count"] = m.ExpertsPerToken
|
||||
kv["gptoss.attention.head_count"] = m.AttentionHeads
|
||||
kv["gptoss.attention.head_count_kv"] = m.KeyValueHeads
|
||||
kv["gptoss.attention.key_length"] = m.HeadDim
|
||||
kv["gptoss.attention.value_length"] = m.HeadDim
|
||||
kv["gptoss.attention.layer_norm_rms_epsilon"] = cmp.Or(m.RMSNormEpsilon, 1e-5)
|
||||
kv["gptoss.attention.sliding_window"] = m.SlidingWindow
|
||||
kv["gptoss.rope.freq_base"] = m.RopeTheta
|
||||
kv["gptoss.rope.scaling.factor"] = m.RopeScalingFactor
|
||||
kv["gptoss.rope.scaling.original_context_length"] = m.InitialContextLength
|
||||
kv["tokenizer.ggml.bos_token_id"] = uint32(199998) // <|startoftext|>
|
||||
kv["tokenizer.ggml.add_bos_token"] = false
|
||||
kv["tokenizer.ggml.eos_token_id"] = uint32(199999) // <|endoftext|>
|
||||
kv["tokenizer.ggml.eos_token_ids"] = []int32{
|
||||
199999, /* <|endoftext|> */
|
||||
200002, /* <|return|> */
|
||||
200012, /* <|call|> */
|
||||
}
|
||||
kv["tokenizer.ggml.add_eos_token"] = false
|
||||
return kv
|
||||
}
|
||||
|
||||
func (m *gptossModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
mxfp4s := make(map[string]*mxfp4)
|
||||
for _, t := range ts {
|
||||
if strings.HasSuffix(t.Name(), ".blocks") || strings.HasSuffix(t.Name(), ".scales") {
|
||||
dot := strings.LastIndex(t.Name(), ".")
|
||||
name, suffix := t.Name()[:dot], t.Name()[dot+1:]
|
||||
if _, ok := mxfp4s[name]; !ok {
|
||||
mxfp4s[name] = &mxfp4{}
|
||||
}
|
||||
|
||||
switch suffix {
|
||||
case "blocks":
|
||||
mxfp4s[name].blocks = t
|
||||
case "scales":
|
||||
mxfp4s[name].scales = t
|
||||
}
|
||||
} else {
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: t.Name(),
|
||||
Kind: t.Kind(),
|
||||
Shape: t.Shape(),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
for name, mxfp4 := range mxfp4s {
|
||||
dims := mxfp4.blocks.Shape()
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: uint32(ggml.TensorTypeMXFP4),
|
||||
Shape: []uint64{dims[0], dims[1], dims[2] * dims[3] * 2},
|
||||
WriterTo: mxfp4,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func (m *gptossModel) Replacements() []string {
|
||||
return []string{
|
||||
// noop replacements so other replacements will not be applied
|
||||
".blocks", ".blocks",
|
||||
".scales", ".scales",
|
||||
// real replacements
|
||||
"block", "blk",
|
||||
"attn.norm", "attn_norm",
|
||||
"attn.qkv", "attn_qkv",
|
||||
"attn.sinks", "attn_sinks",
|
||||
"attn.out", "attn_out",
|
||||
"mlp.norm", "ffn_norm",
|
||||
"mlp.gate", "ffn_gate_inp",
|
||||
"mlp.mlp1_", "ffn_gate_up_exps.",
|
||||
"mlp.mlp2_", "ffn_down_exps.",
|
||||
"embedding", "token_embd",
|
||||
"norm", "output_norm",
|
||||
"unembedding", "output",
|
||||
"scale", "weight",
|
||||
}
|
||||
}
|
||||
|
||||
type mxfp4 struct {
|
||||
blocks, scales Tensor
|
||||
}
|
||||
|
||||
func (m *mxfp4) WriteTo(w io.Writer) (int64, error) {
|
||||
var b bytes.Buffer
|
||||
if _, err := m.blocks.WriteTo(&b); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
blocksDims := make([]int, len(m.blocks.Shape()))
|
||||
for i, d := range m.blocks.Shape() {
|
||||
blocksDims[i] = int(d)
|
||||
}
|
||||
|
||||
var blocks tensor.Tensor = tensor.New(tensor.WithShape(blocksDims...), tensor.WithBacking(b.Bytes()))
|
||||
|
||||
var s bytes.Buffer
|
||||
if _, err := m.scales.WriteTo(&s); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
scalesDims := slices.Repeat([]int{1}, len(m.blocks.Shape()))
|
||||
for i, d := range m.scales.Shape() {
|
||||
scalesDims[i] = int(d)
|
||||
}
|
||||
|
||||
var scales tensor.Tensor = tensor.New(tensor.WithShape(scalesDims...), tensor.WithBacking(s.Bytes()))
|
||||
|
||||
out, err := tensor.Concat(3, scales, blocks)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
out = tensor.Materialize(out)
|
||||
|
||||
if err := out.Reshape(out.Shape().TotalSize()); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
u8s, err := native.VectorU8(out.(*tensor.Dense))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if err := binary.Write(w, binary.LittleEndian, u8s); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return 0, nil
|
||||
}
|
||||
@@ -11,14 +11,13 @@ import (
|
||||
"io"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
@@ -137,9 +136,7 @@ func TestConvertModel(t *testing.T) {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
keys := maps.Keys(expect)
|
||||
slices.Sort(keys)
|
||||
for _, k := range keys {
|
||||
for _, k := range slices.Sorted(maps.Keys(expect)) {
|
||||
if v, ok := actual[k]; !ok {
|
||||
t.Errorf("missing %s", k)
|
||||
} else if v != expect[k] {
|
||||
@@ -343,9 +340,7 @@ func TestConvertAdapter(t *testing.T) {
|
||||
|
||||
actual := generateResultsJSON(t, r, m.KV(), m.Tensors())
|
||||
|
||||
keys := maps.Keys(c.Expected)
|
||||
slices.Sort(keys)
|
||||
for _, k := range keys {
|
||||
for _, k := range slices.Sorted(maps.Keys(c.Expected)) {
|
||||
if v, ok := actual[k]; !ok {
|
||||
t.Errorf("missing %s", k)
|
||||
} else if v != c.Expected[k] {
|
||||
|
||||
@@ -31,8 +31,10 @@ func (t tensorBase) Shape() []uint64 {
|
||||
}
|
||||
|
||||
const (
|
||||
tensorKindF32 uint32 = iota
|
||||
tensorKindF16
|
||||
tensorKindFP32 uint32 = iota
|
||||
tensorKindFP16
|
||||
tensorKindMXFP4 = 4
|
||||
tensorKindBF16 = 30
|
||||
)
|
||||
|
||||
func (t tensorBase) Kind() uint32 {
|
||||
@@ -43,16 +45,16 @@ func (t tensorBase) Kind() uint32 {
|
||||
t.name == "v.pre_tile_position_embd.weight" ||
|
||||
t.name == "v.post_tile_position_embd.weight" {
|
||||
// these tensors are always F32
|
||||
return 0
|
||||
return tensorKindFP32
|
||||
}
|
||||
|
||||
switch len(t.shape) {
|
||||
case 0:
|
||||
panic("invalid tensor shape")
|
||||
case 1:
|
||||
return tensorKindF32
|
||||
return tensorKindFP32
|
||||
default:
|
||||
return tensorKindF16
|
||||
return tensorKindFP16
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,12 +8,12 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"maps"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/d4l3k/go-bfloat16"
|
||||
"github.com/x448/float16"
|
||||
"golang.org/x/exp/maps"
|
||||
)
|
||||
|
||||
type safetensorMetadata struct {
|
||||
@@ -46,8 +46,7 @@ func parseSafetensors(fsys fs.FS, replacer *strings.Replacer, ps ...string) ([]T
|
||||
return nil, err
|
||||
}
|
||||
|
||||
keys := maps.Keys(headers)
|
||||
slices.Sort(keys)
|
||||
keys := slices.Sorted(maps.Keys(headers))
|
||||
|
||||
names := make(map[string]struct{}, len(keys))
|
||||
|
||||
@@ -94,6 +93,15 @@ type safetensor struct {
|
||||
*tensorBase
|
||||
}
|
||||
|
||||
func (st safetensor) Kind() uint32 {
|
||||
kind := st.tensorBase.Kind()
|
||||
if st.dtype == "BF16" && kind != tensorKindFP32 {
|
||||
kind = tensorKindBF16
|
||||
}
|
||||
|
||||
return kind
|
||||
}
|
||||
|
||||
func (st safetensor) Clone() Tensor {
|
||||
return &safetensor{
|
||||
fs: st.fs,
|
||||
@@ -151,6 +159,9 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
|
||||
}
|
||||
|
||||
f32s = bfloat16.DecodeFloat32(u8s)
|
||||
case "U8":
|
||||
// U8 tensors do not support repacking or type conversion.
|
||||
return io.CopyN(w, f, st.size)
|
||||
default:
|
||||
return 0, fmt.Errorf("unknown data type: %s", st.dtype)
|
||||
}
|
||||
@@ -163,15 +174,18 @@ func (st safetensor) WriteTo(w io.Writer) (int64, error) {
|
||||
}
|
||||
|
||||
switch st.Kind() {
|
||||
case tensorKindF32:
|
||||
case tensorKindFP32:
|
||||
return 0, binary.Write(w, binary.LittleEndian, f32s)
|
||||
case tensorKindF16:
|
||||
case tensorKindFP16:
|
||||
f16s := make([]uint16, len(f32s))
|
||||
for i := range f32s {
|
||||
f16s[i] = float16.Fromfloat32(f32s[i]).Bits()
|
||||
}
|
||||
|
||||
return 0, binary.Write(w, binary.LittleEndian, f16s)
|
||||
case tensorKindBF16:
|
||||
u8s := bfloat16.EncodeFloat32(f32s)
|
||||
return 0, binary.Write(w, binary.LittleEndian, u8s)
|
||||
default:
|
||||
return 0, fmt.Errorf("unknown storage type: %d", st.Kind())
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,11 +8,10 @@ import (
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/maps"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -260,11 +259,8 @@ func parseVocabularyFromTokenizer(fsys fs.FS) (*Vocabulary, error) {
|
||||
tokens[token.ID] = token
|
||||
}
|
||||
|
||||
keys := maps.Keys(tokens)
|
||||
slices.Sort(keys)
|
||||
|
||||
v := Vocabulary{Model: "gpt2"}
|
||||
for _, k := range keys {
|
||||
for _, k := range slices.Sorted(maps.Keys(tokens)) {
|
||||
token := tokens[k]
|
||||
v.Tokens = append(v.Tokens, token.Content)
|
||||
v.Scores = append(v.Scores, float32(token.ID))
|
||||
|
||||
Reference in New Issue
Block a user