llama: update vendored code to commit 46e3556 (#8308)

This commit is contained in:
Jeffrey Morgan
2025-01-08 11:22:01 -08:00
committed by GitHub
parent 57f038ec7b
commit 1deafd8254
305 changed files with 16048 additions and 12926 deletions

82
llama/common.cpp vendored
View File

@@ -1,5 +1,5 @@
/**
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
*
* MIT License
*
@@ -44,6 +44,7 @@
#include <cstdarg>
#include <cstring>
#include <ctime>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
@@ -88,7 +89,9 @@
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#define PATH_MAX MAX_PATH
# if !defined(PATH_MAX)
# define PATH_MAX MAX_PATH
# endif
#else
#include <sys/syslimits.h>
#endif
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
}
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
llama_free_model(model);
return iparams;
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
params.ctx_shift = false;
}
if (!params.control_vectors.empty()) {
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
common_lora_adapter_container loaded_la;
loaded_la.path = la.path;
loaded_la.scale = la.scale;
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
if (loaded_la.adapter == nullptr) {
llama_lora_adapter_ptr lora;
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
if (lora == nullptr) {
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
llama_free(lctx);
llama_free_model(model);
return iparams;
}
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
la.ptr = lora.get();
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
}
if (!params.lora_init_without_apply) {
common_lora_adapters_apply(lctx, iparams.lora_adapters);
common_lora_adapters_apply(lctx, params.lora_adapters);
}
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
params.sampling.ignore_eos = false;
}
if (params.sampling.ignore_eos) {
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
if (llama_token_is_eog(model, i)) {
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
params.sampling.logit_bias.push_back({i, -INFINITY});
}
}
}
if (params.sampling.penalty_last_n == -1) {
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.penalty_last_n = llama_n_ctx(lctx);
}
if (params.sampling.dry_penalty_last_n == -1) {
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
}
if (params.warmup) {
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
llama_perf_context_reset(lctx);
}
iparams.model = model;
iparams.context = lctx;
iparams.model.reset(model);
iparams.context.reset(lctx);
return iparams;
}
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
llama_lora_adapter_clear(ctx);
for (auto & la : lora_adapters) {
for (auto & la : lora) {
if (la.scale != 0.0f) {
llama_lora_adapter_set(ctx, la.adapter, la.scale);
llama_lora_adapter_set(ctx, la.ptr, la.scale);
}
}
}
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;
while (remaining_attempts > 0) {
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
}
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
if (!curl) {
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
#endif
// Check if the file already exists locally
struct stat model_file_info;
auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
auto file_exists = std::filesystem::exists(path);
// If the file exists, check its JSON metadata companion file.
std::string metadata_path = path + ".json";
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
std::string etag;
std::string last_modified;
};
common_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
static std::regex header_regex("([^:]+): (.*)\r\n");
static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
// Chat template utils
//
std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template";
// call with NULL buffer to get the total size of the string
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
}
bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
break;
case 0: // max absolute
for (int i = 0; i < n; i++) {
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
if (sum < std::abs(inp[i])) {
sum = std::abs(inp[i]);
}
}
sum /= 32760.0; // make an int16 range
break;