mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 08:47:01 +00:00
llama: update vendored code to commit 46e3556 (#8308)
This commit is contained in:
82
llama/common.cpp
vendored
82
llama/common.cpp
vendored
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
|
||||
* llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -44,6 +44,7 @@
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
@@ -88,7 +89,9 @@
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
#define PATH_MAX MAX_PATH
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
}
|
||||
|
||||
if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
|
||||
LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
|
||||
params.ctx_shift = false;
|
||||
}
|
||||
|
||||
if (!params.control_vectors.empty()) {
|
||||
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
common_lora_adapter_container loaded_la;
|
||||
loaded_la.path = la.path;
|
||||
loaded_la.scale = la.scale;
|
||||
loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
|
||||
if (loaded_la.adapter == nullptr) {
|
||||
llama_lora_adapter_ptr lora;
|
||||
lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
|
||||
if (lora == nullptr) {
|
||||
LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
|
||||
llama_free(lctx);
|
||||
llama_free_model(model);
|
||||
return iparams;
|
||||
}
|
||||
iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
|
||||
|
||||
la.ptr = lora.get();
|
||||
iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
|
||||
}
|
||||
|
||||
if (!params.lora_init_without_apply) {
|
||||
common_lora_adapters_apply(lctx, iparams.lora_adapters);
|
||||
common_lora_adapters_apply(lctx, params.lora_adapters);
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
|
||||
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
params.sampling.ignore_eos = false;
|
||||
}
|
||||
|
||||
if (params.sampling.ignore_eos) {
|
||||
for (llama_token i = 0; i < llama_n_vocab(model); i++) {
|
||||
if (llama_token_is_eog(model, i)) {
|
||||
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
||||
params.sampling.logit_bias.push_back({i, -INFINITY});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (params.sampling.penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.sampling.dry_penalty_last_n == -1) {
|
||||
LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
||||
params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
|
||||
}
|
||||
|
||||
if (params.warmup) {
|
||||
LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
|
||||
|
||||
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
|
||||
llama_perf_context_reset(lctx);
|
||||
}
|
||||
|
||||
iparams.model = model;
|
||||
iparams.context = lctx;
|
||||
iparams.model.reset(model);
|
||||
iparams.context.reset(lctx);
|
||||
|
||||
return iparams;
|
||||
}
|
||||
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
|
||||
void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
|
||||
llama_lora_adapter_clear(ctx);
|
||||
for (auto & la : lora_adapters) {
|
||||
for (auto & la : lora) {
|
||||
if (la.scale != 0.0f) {
|
||||
llama_lora_adapter_set(ctx, la.adapter, la.scale);
|
||||
llama_lora_adapter_set(ctx, la.ptr, la.scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
|
||||
}
|
||||
|
||||
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
||||
|
||||
// Initialize libcurl
|
||||
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
if (!curl) {
|
||||
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
struct stat model_file_info;
|
||||
auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
|
||||
// Chat template utils
|
||||
//
|
||||
|
||||
std::string common_get_builtin_chat_template(const struct llama_model * model) {
|
||||
static const char * template_key = "tokenizer.chat_template";
|
||||
// call with NULL buffer to get the total size of the string
|
||||
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
|
||||
if (res > 0) {
|
||||
std::vector<char> model_template(res + 1, 0);
|
||||
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
|
||||
return std::string(model_template.data(), model_template.size() - 1);
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
bool common_chat_verify_template(const std::string & tmpl) {
|
||||
llama_chat_message chat[] = {{"user", "test"}};
|
||||
int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
|
||||
break;
|
||||
case 0: // max absolute
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
|
||||
if (sum < std::abs(inp[i])) {
|
||||
sum = std::abs(inp[i]);
|
||||
}
|
||||
}
|
||||
sum /= 32760.0; // make an int16 range
|
||||
break;
|
||||
|
||||
Reference in New Issue
Block a user