Move quantization to new backend (#10363)

* Move quantization logic to GGML via new backend

This moves the model-aware logic into Go code and calls GGML's quantization code during model creation.
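
To illustrate the model-aware half, here is a minimal Go sketch of per-tensor type selection of the kind this change moves out of C++. The package, type names, and tensor-name heuristics are assumptions for illustration, not the actual code added in this commit:

```go
package quantization

import "strings"

// tensorType stands in for GGML's tensor type enum; only the values this
// sketch needs are defined here.
type tensorType int

const (
	typeQ4KM tensorType = iota // requested target type for most weights
	typeQ6K                    // higher-precision type for sensitive tensors
)

// pickType sketches a model-aware rule living in Go: most tensors get the
// requested target type, while tensors known to be precision-sensitive
// (output head, token embeddings) are kept at a higher-precision type,
// mirroring the heuristics llama.cpp applies during quantization.
func pickType(name string, target tensorType) tensorType {
	if strings.HasSuffix(name, "output.weight") || strings.Contains(name, "token_embd") {
		return typeQ6K
	}
	return target
}
```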

* Remove "add model quantizations"

This is no longer needed now that quantization is implemented in Go+GGML code directly.
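
For the GGML half, a hedged sketch of how Go code can hand a tensor to GGML's quantization kernel over cgo. `ggml_quantize_chunk` is GGML's real C API; the wrapper name, package, and include path are hypothetical:

```go
package quantization

/*
#cgo CPPFLAGS: -I${SRCDIR}/ggml/include
#include "ggml.h"
*/
import "C"
import "unsafe"

// quantizeChunk converts one row-major FP32 tensor to the requested GGML
// type by calling GGML's C quantization kernel. The Go side decides which
// type each tensor gets; GGML only does the bit-packing. Returns the number
// of bytes written to dst.
func quantizeChunk(dst []byte, src []float32, typ C.enum_ggml_type, nrows, nPerRow int64) int {
	n := C.ggml_quantize_chunk(
		typ,
		(*C.float)(unsafe.Pointer(&src[0])),
		unsafe.Pointer(&dst[0]),
		0,                  // first row of this chunk
		C.int64_t(nrows),   // number of rows to quantize
		C.int64_t(nPerRow), // elements per row
		nil,                // no importance matrix
	)
	return int(n)
}
```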
Author: Daniel Hiltgen
Date: 2025-05-06 11:20:48 -07:00
Committed by: GitHub
parent 95e744beeb
commit 424810450f
39 changed files with 1854 additions and 440 deletions

@@ -1437,7 +1437,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_MISTRAL3: break;
         default: throw std::runtime_error("unsupported model architecture");
     }

@@ -13752,7 +13751,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_SOLAR:
         case LLM_ARCH_BAILINGMOE:
-        case LLM_ARCH_MISTRAL3:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2