model: support for mistral-small in the ollama runner

Mistral is a popular research lab making open-source models. This updates the forward pass of llama-architecture models to support both llama and mistral models by accounting for additional metadata present in mistral models and by finding the correct dimensions for the output projection.
2025-12-16 10:47:01 +00:00 · 2025-03-14 16:56:32 -07:00
parent 1861fbdeb5
commit 6bd0a983cd
27 changed files with 1116 additions and 350 deletions
--- a/llama/llama.cpp/src/llama-quant.cpp
+++ b/llama/llama.cpp/src/llama-quant.cpp
@@ -738,13 +738,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // don't quantize vision stuff
-        quantize &= name.find("v.blk.") == std::string::npos;
-
-        quantize &= name.find("mm.mm_input_projection.weight") == std::string::npos;
-        quantize &= name.find("mm.mm_soft_emb_norm.weight") == std::string::npos;
-        quantize &= name.find("v.patch_embedding.weight") == std::string::npos;
-        quantize &= name.find("v.position_embedding.weight") == std::string::npos;
-        quantize &= name.find("v.post_layernorm.weight") == std::string::npos;
+        quantize &= name.find("v.") == std::string::npos;
+        quantize &= name.find("mm.") == std::string::npos;

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);