llama: update vendored code to commit 46e3556 (#8308)

2025-12-12 00:37:04 +00:00 · 2025-01-08 11:22:01 -08:00
parent 57f038ec7b
commit 1deafd8254
305 changed files with 16048 additions and 12926 deletions
--- a/llama/common.h
+++ b/llama/common.h
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
@@ -28,7 +28,7 @@

 #pragma once

-#include "llama.h"
+#include "llama-cpp.h"

 #include <string>
 #include <vector>
@@ -53,10 +53,8 @@
 struct common_lora_adapter_info {
    std::string path;
    float scale;
-};

-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -106,6 +104,7 @@ enum llama_example {
    LLAMA_EXAMPLE_LLAVA,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,

    LLAMA_EXAMPLE_COUNT,
 };
@@ -121,6 +120,7 @@ enum common_sampler_type {
    COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
    COMMON_SAMPLER_TYPE_XTC         = 8,
    COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +156,6 @@ struct common_params_sampling {
    int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau       = 5.00f; // target entropy
    float   mirostat_eta       = 0.10f; // learning rate
-    bool    penalize_nl        = false; // consider newlines as a repeatable token
    bool    ignore_eos         = false;
    bool    no_perf            = false; // disable performance metrics
    bool    timing_per_token   = false;
@@ -165,6 +164,7 @@ struct common_params_sampling {


    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -184,6 +184,7 @@ struct common_params_sampling {

 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
    int32_t n_ctx        =     0; // draft context size
    int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
    int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -197,6 +198,14 @@ struct common_params_speculative {
    std::string model = ""; // draft model for speculative decoding                          // NOLINT
 };

+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model     = ""; // model path                                                // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+};
+
 struct common_params {
    int32_t n_predict             =    -1; // new tokens to predict
    int32_t n_ctx                 =  4096; // context size
@@ -219,11 +228,13 @@ struct common_params {
    float   defrag_thold          =  0.1f; // KV cache defragmentation threshold

    // offload params
-    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
-    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
@@ -237,8 +248,9 @@ struct common_params {
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
    struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;

    std::string model                = ""; // model path                                                    // NOLINT
    std::string model_alias          = ""; // model alias                                                   // NOLINT
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model   * model   = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };

 struct common_init_result     common_init_from_params(common_params & params);
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
    const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

 //
 // Batch utils
@@ -583,6 +597,9 @@ struct common_chat_msg {
    std::string content;
 };

+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);

@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //

-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: replace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //

-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}