embed libraries using cmake

This commit is contained in:
Michael Yang
2023-09-20 12:15:23 -07:00
parent fc6ec356fc
commit 6c6a31a1e8
12 changed files with 43 additions and 58 deletions

View File

@@ -0,0 +1,51 @@
From 032ef7ff2423f5117bb59d42fb71be9cebf0a2de Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:12 -0400
Subject: [PATCH] add detokenize endpoint
---
examples/server/server.cpp | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9966045..5014691 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1075,6 +1075,12 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}};
}
+static json format_detokenized_response(std::string content)
+{
+ return json{
+ {"content", content}};
+}
+
static void parse_options_completion(const json &body, llama_server_context &llama)
{
gpt_params default_params;
@@ -1361,6 +1367,21 @@ int main(int argc, char **argv)
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); });
+ svr.Post("/detokenize", [&llama](const Request &req, Response &res)
+ {
+ auto lock = llama.lock();
+
+ const json body = json::parse(req.body);
+ std::string content;
+ if (body.count("tokens") != 0)
+ {
+ const std::vector<llama_token> tokens = body["tokens"];
+ content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend());
+ }
+
+ const json data = format_detokenized_response(content);
+ return res.set_content(data.dump(), "application/json"); });
+
svr.Post("/embedding", [&llama](const Request &req, Response &res)
{
auto lock = llama.lock();
--
2.39.2 (Apple Git-143)

View File

@@ -0,0 +1,27 @@
From 5dd02993e8cc2ce309157736b95bb572f274a3fd Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Wed, 20 Sep 2023 14:19:52 -0700
Subject: [PATCH] copy cuda runtime libraries
---
CMakeLists.txt | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 824d9f2..dd24137 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -274,6 +274,10 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcudart.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcudart.so.${CUDAToolkit_VERSION_MAJOR}.0 COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublas.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublas.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+ configure_file(${CUDAToolkit_LIBRARY_DIR}/libcublasLt.so ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/libcublasLt.so.${CUDAToolkit_VERSION_MAJOR} COPYONLY)
+
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
--
2.42.0

View File

@@ -0,0 +1,89 @@
From 6145068a6613c37bb43a7408b5496524bdcfc402 Mon Sep 17 00:00:00 2001
From: Bruce MacDonald <brucewmacdonald@gmail.com>
Date: Mon, 28 Aug 2023 18:08:53 -0400
Subject: [PATCH] 34B model support
---
llama.cpp | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/llama.cpp b/llama.cpp
index f2cbe76..62c5cdf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -79,6 +79,7 @@ enum e_model {
MODEL_7B,
MODEL_13B,
MODEL_30B,
+ MODEL_34B,
MODEL_65B,
MODEL_70B,
};
@@ -122,6 +123,7 @@ static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
{ MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB },
{ MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
{ MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB },
+ { MODEL_34B, ((size_t) n_ctx / 9ull + 160ull) * MB },
{ MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess
{ MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB },
};
@@ -135,6 +137,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
{ MODEL_7B, 160ull * MB },
{ MODEL_13B, 192ull * MB },
{ MODEL_30B, 256ull * MB },
+ { MODEL_34B, 256ull * MB },
{ MODEL_65B, 384ull * MB }, // guess
{ MODEL_70B, 304ull * MB },
};
@@ -149,6 +152,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
{ MODEL_7B, 10ull * MB },
{ MODEL_13B, 12ull * MB },
{ MODEL_30B, 16ull * MB },
+ { MODEL_34B, 16ull * MB },
{ MODEL_65B, 24ull * MB }, // guess
{ MODEL_70B, 24ull * MB },
};
@@ -164,6 +168,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
{ MODEL_7B, 512ull * kB },
{ MODEL_13B, 640ull * kB },
{ MODEL_30B, 768ull * kB },
+ { MODEL_34B, 768ull * kB },
{ MODEL_65B, 1280ull * kB },
{ MODEL_70B, 1280ull * kB },
};
@@ -179,6 +184,7 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
{ MODEL_7B, 128ull },
{ MODEL_13B, 160ull },
{ MODEL_30B, 208ull },
+ { MODEL_34B, 208ull },
{ MODEL_65B, 256ull },
{ MODEL_70B, 256ull },
};
@@ -1027,6 +1033,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_7B: return "7B";
case MODEL_13B: return "13B";
case MODEL_30B: return "30B";
+ case MODEL_34B: return "34B";
case MODEL_65B: return "65B";
case MODEL_70B: return "70B";
default: LLAMA_ASSERT(false);
@@ -1074,6 +1081,7 @@ static void llama_model_load_internal(
case 26: model.type = e_model::MODEL_3B; break;
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
+ case 48: model.type = e_model::MODEL_34B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default:
@@ -1094,6 +1102,8 @@ static void llama_model_load_internal(
LLAMA_LOG_WARN("%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
model.type = e_model::MODEL_70B;
hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model
+ } else if (model.type == e_model::MODEL_34B && n_gqa == 8) {
+ hparams.f_ffn_mult = 1.0f; // from the params.json of the 34B model
}
hparams.rope_freq_base = rope_freq_base;
--
2.39.2 (Apple Git-143)

View File

@@ -0,0 +1,30 @@
From dadbed99e65252d79f81101a392d0d6497b86caa Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Mon, 21 Aug 2023 06:59:29 -0400
Subject: [PATCH] metal : fix synchronization in new matrix multiplication
kernel (#2686)
---
ggml-metal.metal | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 3f31252..88d48f6 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1898,10 +1898,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
+ threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_threadgroup);
+ threadgroup_barrier(mem_flags::mem_device);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0

View File

@@ -0,0 +1,41 @@
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
From: Shouzheng Liu <lshzh.hi@gmail.com>
Date: Tue, 22 Aug 2023 02:18:40 -0400
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
---
ggml-metal.metal | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 88d48f6..ce3541f 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
//load data and store to threadgroup memory
half4x4 temp_a;
dequantize_func(x, il, temp_a);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
#pragma unroll(16)
for (int i = 0; i < 16; i++) {
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
} else {
// block is smaller than 64x32, we should avoid writing data outside of the matrix
+ threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
for (int i = 0; i < 8; i++) {
- threadgroup_barrier(mem_flags::mem_device);
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
}
- threadgroup_barrier(mem_flags::mem_device);
+ threadgroup_barrier(mem_flags::mem_threadgroup);
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
if (sgitg==0) {
for (int i = 0; i < n_rows; i++) {
--
2.41.0

View File

@@ -0,0 +1,32 @@
From 1e3bc523d8053a77df3ac7126a84d0297ee97ef6 Mon Sep 17 00:00:00 2001
From: Kylin <56434533+KyL0N@users.noreply.github.com>
Date: Tue, 22 Aug 2023 15:14:23 +0800
Subject: [PATCH] ggml : support CUDA's half type for aarch64(#1455) (#2670)
* ggml: support CUDA's half type for aarch64(#1455)
support CUDA's half type for aarch64 in ggml_fp16_t definition
* ggml: use __CUDACC__ to recognise nvcc compiler
---
ggml.h | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/ggml.h b/ggml.h
index 544ad2d..0ec7ec5 100644
--- a/ggml.h
+++ b/ggml.h
@@ -259,8 +259,9 @@
extern "C" {
#endif
-#ifdef __ARM_NEON
- // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+ typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
--
2.39.2 (Apple Git-143)