mirror of https://github.com/dogkeeper886/ollama37.git
synced 2025-12-12 00:37:04 +00:00
deprecate ggml
- remove ggml runner
- automatically pull gguf models when ggml is detected
- tell users to update to gguf if the automatic pull fails

Co-Authored-By: Jeffrey Morgan <jmorganca@gmail.com>
committed by Daniel Hiltgen
parent ed195f3562
commit 811b1f03c8
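For context, gguf and legacy ggml model files can be told apart by the magic bytes at the start of the file. A minimal Go sketch of the kind of detection the commit message describes (the constants follow the llama.cpp file magics; the function name and CLI shape are illustrative, not this repository's actual code):

```go
package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

const (
	magicGGUF = 0x46554747 // "GGUF" read as a little-endian uint32
	magicGGML = 0x67676d6c // legacy ggml container
	magicGGMF = 0x67676d66 // legacy ggmf container
	magicGGJT = 0x67676a74 // legacy ggjt container
)

// detectFormat reads the first four bytes of a model file and reports
// whether it looks like a gguf model, a legacy ggml model, or neither.
func detectFormat(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	var magic uint32
	if err := binary.Read(f, binary.LittleEndian, &magic); err != nil {
		return "", err
	}

	switch magic {
	case magicGGUF:
		return "gguf", nil
	case magicGGML, magicGGMF, magicGGJT:
		return "ggml", nil
	default:
		return "unknown", nil
	}
}

func main() {
	format, err := detectFormat(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, "error:", err)
		os.Exit(1)
	}
	// Per the commit message, a "ggml" result triggers an automatic gguf
	// pull; if that fails, the user is told to update the model manually.
	fmt.Println(format)
}
```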
@@ -1,51 +0,0 @@
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:12 -0400
Subject: [PATCH] add detokenize endpoint

---
 examples/server/server.cpp | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9966045..5014691 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
         {"tokens", tokens}};
 }
 
+static json format_detokenized_response(std::string content)
+{
+    return json{
+        {"content", content}};
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
     gpt_params default_params;
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
             const json data = format_tokenizer_response(tokens);
             return res.set_content(data.dump(), "application/json"); });
 
+    svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+            {
+                auto lock = llama.lock();
+
+                const json body = json::parse(req.body);
+                std::string content;
+                if (body.count("tokens") != 0)
+                {
+                    const std::vector<llama_token> tokens = body["tokens"];
+                    content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+                }
+
+                const json data = format_detokenized_response(content);
+                return res.set_content(data.dump(), "application/json"); });
+
     svr.Post("/embedding", [&llama](const Request &req, Response &res)
             {
                 auto lock = llama.lock();
--
2.39.2 (Apple Git-143)
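The /detokenize route added in the patch above mirrors the existing /tokenize route: POST a JSON body with a "tokens" array and the server replies with {"content": ...}. A small Go client sketch to exercise it (the port and token IDs are assumptions for illustration; use whatever host and port the example server is actually listening on):

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

func main() {
	// Token IDs here are illustrative; real ones come from a prior
	// POST /tokenize call against the same loaded model.
	reqBody, err := json.Marshal(map[string][]int{"tokens": {1, 15043, 3186}})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://localhost:8080/detokenize",
		"application/json", bytes.NewReader(reqBody))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var out struct {
		Content string `json:"content"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		log.Fatal(err)
	}
	fmt.Println(out.Content) // the detokenized text
}
```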
@@ -1,89 +0,0 @@
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:53 -0400
Subject: [PATCH] 34B model support

---
 llama.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index f2cbe76..62c5cdf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,7 @@ enum e_model {
     MODEL_7B,
     MODEL_13B,
     MODEL_30B,
+    MODEL_34B,
     MODEL_65B,
     MODEL_70B,
 };
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
         { MODEL_30B, ((size_t) n_ctx /  9ull + 160ull) * MB },
+        { MODEL_34B, ((size_t) n_ctx /  9ull + 160ull) * MB },
         { MODEL_65B, ((size_t) n_ctx /  6ull + 256ull) * MB }, // guess
         { MODEL_70B, ((size_t) n_ctx /  7ull + 164ull) * MB },
     };
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,  160ull * MB },
         { MODEL_13B, 192ull * MB },
         { MODEL_30B, 256ull * MB },
+        { MODEL_34B, 256ull * MB },
         { MODEL_65B, 384ull * MB }, // guess
         { MODEL_70B, 304ull * MB },
     };
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_7B,  10ull * MB },
         { MODEL_13B, 12ull * MB },
         { MODEL_30B, 16ull * MB },
+        { MODEL_34B, 16ull * MB },
         { MODEL_65B, 24ull * MB }, // guess
         { MODEL_70B, 24ull * MB },
     };
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
         { MODEL_7B,  512ull * kB },
         { MODEL_13B, 640ull * kB },
         { MODEL_30B, 768ull * kB },
+        { MODEL_34B, 768ull * kB },
         { MODEL_65B, 1280ull * kB },
         { MODEL_70B, 1280ull * kB },
     };
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
         { MODEL_7B,  128ull },
         { MODEL_13B, 160ull },
         { MODEL_30B, 208ull },
+        { MODEL_34B, 208ull },
         { MODEL_65B, 256ull },
         { MODEL_70B, 256ull },
     };
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";
         default: LLAMA_ASSERT(false);
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
             case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
             case 40: model.type = e_model::MODEL_13B; break;
+            case 48: model.type = e_model::MODEL_34B; break;
             case 60: model.type = e_model::MODEL_30B; break;
             case 80: model.type = e_model::MODEL_65B; break;
             default:
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
                 LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
                 model.type = e_model::MODEL_70B;
                 hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+            } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
+                hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
             }
 
             hparams.rope_freq_base = rope_freq_base;
--
2.39.2 (Apple Git-143)
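The MEM_REQ_SCRATCH0 table in the patch above scales with context length; the new 34B row reuses the 30B formula, n_ctx/9 + 160 MB. A Go sketch of that lookup with the values transcribed from the patch (these are the patch's estimates, not measured requirements):

```go
package main

import "fmt"

const mb = 1 << 20 // bytes per MB, matching the patch's MB constant

// scratch0 mirrors the MEM_REQ_SCRATCH0 table from the patch: the
// per-model scratch buffer size in bytes as a function of context length.
func scratch0(model string, nCtx int) int {
	switch model {
	case "7B":
		return (nCtx/16 + 100) * mb
	case "13B":
		return (nCtx/12 + 120) * mb
	case "30B", "34B": // the patch reuses the 30B formula for 34B
		return (nCtx/9 + 160) * mb
	case "65B":
		return (nCtx/6 + 256) * mb // marked "guess" upstream
	case "70B":
		return (nCtx/7 + 164) * mb
	}
	return 0
}

func main() {
	// A 34B model at a 4096-token context: 4096/9 + 160 = 615 MB.
	fmt.Printf("scratch0(34B, 4096) = %d MB\n", scratch0("34B", 4096)/mb)
}
```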
@@ -1,30 +0,0 @@
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
 kernel (#2686)

---
 ggml-metal.metal | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
             threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                           + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
             for (int i = 0; i < 8; i++) {
+                threadgroup_barrier(mem_flags::mem_device);
                 simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
             }
 
-            threadgroup_barrier(mem_flags::mem_threadgroup);
+            threadgroup_barrier(mem_flags::mem_device);
             device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
             if (sgitg==0) {
                 for (int i = 0; i < n_rows; i++) {
--
2.41.0
@@ -1,41 +0,0 @@
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Tue, 22 Aug 2023 02:18:40 -0400
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)

---
 ggml-metal.metal | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-metal.metal b/ggml-metal.metal
index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
         //load data and store to threadgroup memory
         half4x4 temp_a;
         dequantize_func(x, il, temp_a);
+        threadgroup_barrier(mem_flags::mem_threadgroup);
         #pragma unroll(16)
         for (int i = 0; i < 16; i++) {
             *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
             }
         } else {
             // block is smaller than 64x32, we should avoid writing data outside of the matrix
+            threadgroup_barrier(mem_flags::mem_threadgroup);
             threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
                                           + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
             for (int i = 0; i < 8; i++) {
-                threadgroup_barrier(mem_flags::mem_device);
                 simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
             }
 
-            threadgroup_barrier(mem_flags::mem_device);
+            threadgroup_barrier(mem_flags::mem_threadgroup);
             device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
             if (sgitg==0) {
                 for (int i = 0; i < n_rows; i++) {
--
2.41.0
@@ -1,32 +0,0 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)

* ggml: support CUDA's half type for aarch64(#1455)

  support CUDA's half type for aarch64 in ggml_fp16_t definition

* ggml: use __CUDACC__ to recognise nvcc compiler
---
 ggml.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
 extern "C" {
 #endif
 
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)