mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-15 18:27:08 +00:00
embed libraries using cmake
This commit is contained in:
@@ -0,0 +1,41 @@
|
||||
From 14b1d7e6f720dee41ce5a826376df738096d9033 Mon Sep 17 00:00:00 2001
|
||||
From: Shouzheng Liu <lshzh.hi@gmail.com>
|
||||
Date: Tue, 22 Aug 2023 02:18:40 -0400
|
||||
Subject: [PATCH] metal : add missing barriers for mul-mat (#2699)
|
||||
|
||||
---
|
||||
ggml-metal.metal | 5 +++--
|
||||
1 file changed, 3 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/ggml-metal.metal b/ggml-metal.metal
|
||||
index 88d48f6..ce3541f 100644
|
||||
--- a/ggml-metal.metal
|
||||
+++ b/ggml-metal.metal
|
||||
@@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
//load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
@@ -1895,14 +1896,14 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
- threadgroup_barrier(mem_flags::mem_device);
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
- threadgroup_barrier(mem_flags::mem_device);
|
||||
+ threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
--
|
||||
2.41.0
|
||||
|
||||
Reference in New Issue
Block a user