Mirror of https://github.com/dogkeeper886/ollama37.git
subprocess llama.cpp server (#401)
* remove c code
* pack llama.cpp
* use request context for llama_cpp
* let llama_cpp decide the number of threads to use
* stop llama runner when app stops
* remove sample count and duration metrics
* use go generate to get libraries
* tmp dir for running llm
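As a rough sketch of the arrangement this commit message describes (not the actual ollama code), the Go program below starts a packed llama.cpp server binary from a temporary directory, ties its lifetime to a signal-aware context so the runner stops when the app stops, and passes no thread-count flag so llama.cpp decides the number of threads itself. The binary name, model path, and port are placeholder assumptions.

package main

import (
	"context"
	"os"
	"os/exec"
	"os/signal"
	"syscall"
)

func main() {
	// Stop the llama runner when the app stops: cancel the context on
	// SIGINT/SIGTERM, and exec.CommandContext kills the subprocess.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	// "tmp dir for running llm": give the server a scratch working directory.
	dir, err := os.MkdirTemp("", "llm")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	// Placeholder binary, model, and port. No --threads flag is passed,
	// so llama.cpp picks a thread count on its own.
	cmd := exec.CommandContext(ctx, "./llama-server",
		"--model", "/path/to/model.bin",
		"--port", "8080",
	)
	cmd.Dir = dir
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	if err := cmd.Start(); err != nil {
		panic(err)
	}
	_ = cmd.Wait()
}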
llm/llama.cpp/ggml_patch/0001-add-detokenize-endpoint.patch (new file, 51 lines)
@@ -0,0 +1,51 @@
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:12 -0400
Subject: [PATCH] add detokenize endpoint

---
 examples/server/server.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9966045..5014691 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
         {"tokens", tokens}};
 }
 
+static json format_detokenized_response(std::string content)
+{
+    return json{
+        {"content", content}};
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
                 const json data = format_tokenizer_response(tokens);
                 return res.set_content(data.dump(), "application/json"); });
 
+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+             {
+                 auto lock = llama.lock();
+
+                 const json body = json::parse(req.body);
+                 std::string content;
+                 if (body.count("tokens") != 0)
+                 {
+                     const std::vector<llama_token> tokens = body["tokens"];
+                     content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+                 }
+
+                 const json data = format_detokenized_response(content);
+                 return res.set_content(data.dump(), "application/json"); });
+
     svr.Post("/embedding", [&llama](const Request &req, Response &res)
              {
                  auto lock = llama.lock();
--
2.39.2 (Apple Git-143)
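The endpoint this patch adds takes a JSON body with a "tokens" array of token IDs and returns {"content": "..."} with the decoded text. A minimal Go client sketch follows; the request and response shapes come from the patch, while the host, port, and token IDs are illustrative assumptions.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Example token IDs; real IDs depend on the loaded model's vocabulary.
	payload, err := json.Marshal(map[string][]int{"tokens": {1, 15043, 3186}})
	if err != nil {
		panic(err)
	}

	// Assumes the llama.cpp server is listening locally on port 8080.
	resp, err := http.Post("http://127.0.0.1:8080/detokenize", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// The patch returns {"content": "<decoded text>"}.
	var out struct {
		Content string `json:"content"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println(out.Content)
}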
llm/llama.cpp/ggml_patch/0002-34B-model-support.patch (new file, 89 lines)
@@ -0,0 +1,89 @@
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:53 -0400
Subject: [PATCH] 34B model support

---
 llama.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index f2cbe76..62c5cdf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,7 @@ enum e_model {
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
+    MODEL_34B,
     MODEL_65B,
     MODEL_70B,
 };
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
         { MODEL_30B, ((size_t) n_ctx /  9ull + 160ull) * MB },
+        { MODEL_34B, ((size_t) n_ctx /  9ull + 160ull) * MB },
         { MODEL_65B, ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
         { MODEL_70B, ((size_t) n_ctx /  7ull + 164ull) * MB },
     };
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,  160ull * MB },
         { MODEL_13B, 192ull * MB },
         { MODEL_30B, 256ull * MB },
+        { MODEL_34B, 256ull * MB },
         { MODEL_65B, 384ull * MB }, // guess
         { MODEL_70B, 304ull * MB },
     };
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_7B,  10ull * MB },
         { MODEL_13B, 12ull * MB },
         { MODEL_30B, 16ull * MB },
+        { MODEL_34B, 16ull * MB },
         { MODEL_65B, 24ull * MB }, // guess
         { MODEL_70B, 24ull * MB },
     };
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,  512ull * kB },
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
+        { MODEL_34B, 768ull * kB },
         { MODEL_65B, 1280ull * kB },
         { MODEL_70B, 1280ull * kB },
     };
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
+        { MODEL_34B, 208ull },
         { MODEL_65B, 256ull },
         { MODEL_70B, 256ull },
     };
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
                 case 26: model.type = e_model::MODEL_3B; break;
                 case 32: model.type = e_model::MODEL_7B; break;
                 case 40: model.type = e_model::MODEL_13B; break;
+                case 48: model.type = e_model::MODEL_34B; break;
                 case 60: model.type = e_model::MODEL_30B; break;
                 case 80: model.type = e_model::MODEL_65B; break;
                 default:
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
             LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
             model.type = e_model::MODEL_70B;
             hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+        } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
+            hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
         }
 
         hparams.rope_freq_base = rope_freq_base;
--
2.39.2 (Apple Git-143)
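This patch classifies 48-layer checkpoints as MODEL_34B, reuses the MODEL_30B memory-scaling entries for the new type, and sets f_ffn_mult to 1.0 when GQA == 8, per the 34B model's params.json. As a quick sanity check of the scratch-buffer formula the patch copies, the Go snippet below evaluates the MEM_REQ_SCRATCH0 entry for MODEL_34B; the n_ctx value is an arbitrary example.

package main

import "fmt"

func main() {
	// MEM_REQ_SCRATCH0 for MODEL_34B: (n_ctx / 9 + 160) MB,
	// the same formula the table uses for MODEL_30B.
	const nCtx = 2048 // example context size
	scratch0MB := nCtx/9 + 160
	fmt.Printf("MODEL_34B scratch0 at n_ctx=%d: %d MB\n", nCtx, scratch0MB) // 387 MB
}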