mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-20 20:57:01 +00:00
llama: update vendor code to commit ba1cb19c (#8101)
This commit is contained in:
190
llama/ggml-metal-embed.metal
vendored
190
llama/ggml-metal-embed.metal
vendored
@@ -1,5 +1,5 @@
|
||||
/**
|
||||
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
|
||||
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -28,7 +28,7 @@
|
||||
#define GGML_COMMON_IMPL_METAL
|
||||
#if defined(GGML_METAL_EMBED_LIBRARY)
|
||||
/**
|
||||
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
|
||||
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -61,7 +61,20 @@
|
||||
typedef uint16_t ggml_half;
|
||||
typedef uint32_t ggml_half2;
|
||||
|
||||
#define GGML_COMMON_AGGR
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_CPP)
|
||||
#include <cstdint>
|
||||
|
||||
typedef uint16_t ggml_half;
|
||||
typedef uint32_t ggml_half2;
|
||||
|
||||
// std-c++ allow anonymous unions but some compiler warn on it
|
||||
#define GGML_COMMON_AGGR_U data
|
||||
// std-c++ do not allow it.
|
||||
#define GGML_COMMON_AGGR_S data
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_METAL)
|
||||
@@ -70,7 +83,8 @@ typedef uint32_t ggml_half2;
|
||||
typedef half ggml_half;
|
||||
typedef half2 ggml_half2;
|
||||
|
||||
#define GGML_COMMON_AGGR
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_CUDA)
|
||||
@@ -84,7 +98,8 @@ typedef half2 ggml_half2;
|
||||
typedef half ggml_half;
|
||||
typedef half2 ggml_half2;
|
||||
|
||||
#define GGML_COMMON_AGGR data
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S data
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_HIP)
|
||||
@@ -94,7 +109,8 @@ typedef half2 ggml_half2;
|
||||
typedef half ggml_half;
|
||||
typedef half2 ggml_half2;
|
||||
|
||||
#define GGML_COMMON_AGGR data
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S data
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#elif defined(GGML_COMMON_DECL_SYCL)
|
||||
@@ -104,7 +120,8 @@ typedef half2 ggml_half2;
|
||||
typedef sycl::half ggml_half;
|
||||
typedef sycl::half2 ggml_half2;
|
||||
|
||||
#define GGML_COMMON_AGGR data
|
||||
#define GGML_COMMON_AGGR_U
|
||||
#define GGML_COMMON_AGGR_S data
|
||||
|
||||
#define GGML_COMMON_DECL
|
||||
#endif
|
||||
@@ -209,9 +226,9 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 dm;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
@@ -230,9 +247,9 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 dm;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
||||
} block_q5_1;
|
||||
@@ -251,37 +268,13 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half s; // d * sum(qs[i])
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 ds;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
int8_t qs[QK8_1]; // quants
|
||||
} block_q8_1;
|
||||
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
|
||||
} block_q4_0x4;
|
||||
static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q4_0 blocks
|
||||
uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
|
||||
} block_q4_0x8;
|
||||
static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 4]; // quants for 4 q8_0 blocks
|
||||
} block_q8_0x4;
|
||||
static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[8]; // deltas for 8 q8_0 blocks
|
||||
int8_t qs[QK8_0 * 8]; // quants for 8 q8_0 blocks
|
||||
} block_q8_0x8;
|
||||
static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
|
||||
|
||||
//
|
||||
// Ternary quantization
|
||||
//
|
||||
@@ -316,9 +309,9 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 dm;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
} block_q2_K;
|
||||
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
||||
|
||||
@@ -343,9 +336,9 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 dm;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
@@ -360,9 +353,9 @@ typedef struct {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
} GGML_COMMON_AGGR;
|
||||
} GGML_COMMON_AGGR_S;
|
||||
ggml_half2 dm;
|
||||
};
|
||||
} GGML_COMMON_AGGR_U;
|
||||
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
@@ -473,12 +466,6 @@ typedef struct {
|
||||
} block_iq4_xs;
|
||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||
|
||||
typedef struct {
|
||||
ggml_half d[4]; // deltas for 4 iq4_nl blocks
|
||||
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
|
||||
} block_iq4_nlx4;
|
||||
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
|
||||
|
||||
#endif // GGML_COMMON_DECL
|
||||
#endif // GGML_COMMON_DECL
|
||||
|
||||
@@ -492,6 +479,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
|
||||
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
||||
#define GGML_TABLE_END() };
|
||||
|
||||
#define GGML_COMMON_IMPL
|
||||
#elif defined(GGML_COMMON_IMPL_CPP)
|
||||
#include <cstdint>
|
||||
|
||||
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
|
||||
#define GGML_TABLE_END() };
|
||||
|
||||
#define GGML_COMMON_IMPL
|
||||
#elif defined(GGML_COMMON_IMPL_METAL)
|
||||
#include <metal_stdlib>
|
||||
@@ -534,7 +528,7 @@ GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
|
||||
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
||||
GGML_TABLE_END()
|
||||
|
||||
//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
//#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
|
||||
0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
|
||||
0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
|
||||
@@ -1917,7 +1911,7 @@ GGML_TABLE_END()
|
||||
#include "../ggml-common.h"
|
||||
#endif
|
||||
/**
|
||||
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
|
||||
* llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
|
||||
*
|
||||
* MIT License
|
||||
*
|
||||
@@ -2046,6 +2040,21 @@ typedef struct {
|
||||
uint64_t nb3;
|
||||
} ggml_metal_kargs_cpy;
|
||||
|
||||
typedef struct {
|
||||
int64_t ne10;
|
||||
int64_t ne11;
|
||||
int64_t ne12;
|
||||
uint64_t nb10;
|
||||
uint64_t nb11;
|
||||
uint64_t nb12;
|
||||
uint64_t nb13;
|
||||
uint64_t nb1;
|
||||
uint64_t nb2;
|
||||
uint64_t nb3;
|
||||
uint64_t offs;
|
||||
bool inplace;
|
||||
} ggml_metal_kargs_set;
|
||||
|
||||
typedef struct {
|
||||
int32_t ne00;
|
||||
int32_t ne01;
|
||||
@@ -5105,6 +5114,53 @@ kernel void kernel_pad_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_pad_reflect_1d_f32(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant int64_t & ne0,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & p0,
|
||||
constant int32_t & p1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
|
||||
if (i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
if (i0 < p0) {
|
||||
dst_ptr[i0] = src0_ptr[p0 - i0];
|
||||
} else if (i0 < ne0 - p1) {
|
||||
dst_ptr[i0] = src0_ptr[i0 - p0];
|
||||
} else {
|
||||
dst_ptr[i0] = src0_ptr[(ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_unpad_f32(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
@@ -6133,6 +6189,38 @@ template [[host_name("kernel_flash_attn_ext_vec_q8_0_h256")]] kernel flash_attn_
|
||||
|
||||
#undef FA_TYPES
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_set(
|
||||
constant ggml_metal_kargs_set & args,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
const int i13 = tgpig[2];
|
||||
const int i12 = tgpig[1];
|
||||
const int i11 = tgpig[0];
|
||||
|
||||
const int64_t n = i13*args.ne12*args.ne11*args.ne10 + i12*args.ne11*args.ne10 + i11*args.ne10;
|
||||
|
||||
const int64_t i3 = n / (args.ne12*args.ne11*args.ne10);
|
||||
const int64_t i2 = (n - i3*args.ne12*args.ne11*args.ne10) / (args.ne11*args.ne10);
|
||||
const int64_t i1 = (n - i3*args.ne12*args.ne11*args.ne10 - i2*args.ne11*args.ne10) / args.ne10;
|
||||
|
||||
device T * dst_data = (device T *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + args.offs);
|
||||
|
||||
for (int64_t i10 = tpitg.x; i10 < args.ne10; i10 += ntg.x) {
|
||||
device const T * src = (device T *) (src1 + i13*args.nb13 + i12*args.nb12 + i11*args.nb11 + i10*args.nb10);
|
||||
dst_data[i10] = (T) src[0];
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_set<float>) kernel_set_t;
|
||||
|
||||
template [[host_name("kernel_set_f32")]] kernel kernel_set_t kernel_set<float>;
|
||||
template [[host_name("kernel_set_i32")]] kernel kernel_set_t kernel_set<int32_t>;
|
||||
|
||||
template<typename T0, typename T1>
|
||||
kernel void kernel_cpy(
|
||||
constant ggml_metal_kargs_cpy & args,
|
||||
|
||||
Reference in New Issue
Block a user