mirror of https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Merge branch 'ollama:main' into main
@@ -190,6 +190,7 @@ func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn fu
 		"seed":         predict.Options.Seed,
 		"stop":         predict.Options.Stop,
 		"image_data":   imageData,
+		"cache_prompt": true,
 	}

 	if predict.Format == "json" {
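Note: cache_prompt is passed through to the llama.cpp ext server, which keeps the prompt's KV cache alive between requests, so a follow-up request sharing a prefix skips re-evaluating those tokens. As a hedged illustration only, a client hitting a llama.cpp-style /completion endpoint could opt in like this (the URL and prompt are assumptions, not part of this change):

package main

import (
	"bytes"
	"encoding/json"
	"log"
	"net/http"
)

func main() {
	// Only the cache_prompt field comes from the hunk above; the endpoint
	// and prompt are illustrative assumptions.
	body, err := json.Marshal(map[string]any{
		"prompt":       "Why is the sky blue?",
		"cache_prompt": true,
	})
	if err != nil {
		log.Fatal(err)
	}

	resp, err := http.Post("http://127.0.0.1:8080/completion", "application/json", bytes.NewReader(body))
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
}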
@@ -39,6 +39,9 @@ init_vars() {
     *)
         ;;
     esac
+    if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
+        CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    fi
 }

 git_module_setup() {
@@ -61,6 +64,17 @@ apply_patches() {
     if ! grep ollama ${LLAMACPP_DIR}/examples/server/CMakeLists.txt; then
         echo 'include (../../../ext_server/CMakeLists.txt) # ollama' >>${LLAMACPP_DIR}/examples/server/CMakeLists.txt
     fi
+
+    # apply temporary patches until fix is upstream
+    for patch in ../patches/*.diff; do
+        for file in $(grep "^+++ " ${patch} | cut -f2 -d' ' | cut -f2- -d/); do
+            (cd ${LLAMACPP_DIR}; git checkout ${file})
+        done
+    done
+    for patch in ../patches/*.diff; do
+        (cd ${LLAMACPP_DIR} && git apply ${patch})
+    done
+
     # Avoid duplicate main symbols when we link into the cgo binary
     sed -e 's/int main(/int __main(/g' <${LLAMACPP_DIR}/examples/server/server.cpp >${LLAMACPP_DIR}/examples/server/server.cpp.tmp &&
         mv ${LLAMACPP_DIR}/examples/server/server.cpp.tmp ${LLAMACPP_DIR}/examples/server/server.cpp
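Note: the pipeline grep "^+++ " | cut -f2 -d' ' | cut -f2- -d/ lists each file a patch touches ("+++ b/examples/server/server.cpp" becomes "examples/server/server.cpp"), so the loop can git checkout those files back to pristine before git apply runs; that keeps repeated builds idempotent. A rough Go sketch of the same path extraction, for illustration only (the patch filename is an assumption):

package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"
)

func main() {
	f, err := os.Open("../patches/01-cache.diff") // example path
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		line := sc.Text()
		if !strings.HasPrefix(line, "+++ ") {
			continue
		}
		// "+++ b/examples/server/server.cpp" -> "examples/server/server.cpp"
		target := strings.TrimPrefix(line, "+++ ")
		if i := strings.IndexByte(target, '/'); i >= 0 {
			target = target[i+1:]
		}
		fmt.Println(target)
	}
}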
@@ -140,7 +140,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     if [ -n "${CUDA_MAJOR}" ]; then
         CUDA_VARIANT=_v${CUDA_MAJOR}
     fi
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
     BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
@@ -25,6 +25,11 @@ function init_vars {
     }
     $script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
     $script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
+    if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
+        $script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
+    } else {
+        $script:CMAKE_CUDA_ARCHITECTURES=$env:CMAKE_CUDA_ARCHITECTURES
+    }
 }

 function git_module_setup {
@@ -40,6 +45,29 @@ function apply_patches {
     if (!(Select-String -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Pattern 'ollama')) {
         Add-Content -Path "${script:llamacppDir}/examples/server/CMakeLists.txt" -Value 'include (../../../ext_server/CMakeLists.txt) # ollama'
     }
+
+    # Apply temporary patches until fix is upstream
+    $patches = Get-ChildItem "../patches/*.diff"
+    foreach ($patch in $patches) {
+        # Extract file paths from the patch file
+        $filePaths = Get-Content $patch.FullName | Where-Object { $_ -match '^\+\+\+ ' } | ForEach-Object {
+            $parts = $_ -split ' '
+            ($parts[1] -split '/', 2)[1]
+        }
+
+        # Checkout each file
+        foreach ($file in $filePaths) {
+            Set-Location -Path ${script:llamacppDir}
+            git checkout $file
+        }
+    }
+
+    # Apply each patch
+    foreach ($patch in $patches) {
+        Set-Location -Path ${script:llamacppDir}
+        git apply $patch.FullName
+    }
+
     # Avoid duplicate main symbols when we link into the cgo binary
     $content = Get-Content -Path "${script:llamacppDir}/examples/server/server.cpp"
     $content = $content -replace 'int main\(', 'int __main('
@@ -128,7 +156,7 @@ if ($null -ne $script:CUDA_LIB_DIR) {
     }
     init_vars
    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
-    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
+    $script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
     build
     install
     cp "${script:CUDA_LIB_DIR}/cudart64_*.dll" "${script:buildDir}/lib"
115 llm/gguf.go
@@ -69,12 +69,65 @@ type tensor struct {
 	name   string
 	kind   uint32
 	offset uint64
-	size   uint64

 	// shape is the number of elements in each dimension
 	shape [4]uint64
 }

+func (t tensor) blockSize() uint64 {
+	switch {
+	case t.kind < 2:
+		return 1
+	case t.kind < 10:
+		return 32
+	default:
+		return 256
+	}
+}
+
+func (t tensor) typeSize() uint64 {
+	blockSize := t.blockSize()
+
+	switch t.kind {
+	case 0: // FP32
+		return 4
+	case 1: // FP16
+		return 2
+	case 2: // Q4_0
+		return 2 + blockSize/2
+	case 3: // Q4_1
+		return 2 + 2 + blockSize/2
+	case 6: // Q5_0
+		return 2 + 4 + blockSize/2
+	case 7: // Q5_1
+		return 2 + 2 + 4 + blockSize/2
+	case 8: // Q8_0
+		return 2 + blockSize
+	case 9: // Q8_1
+		return 4 + 4 + blockSize
+	case 10: // Q2_K
+		return blockSize/16 + blockSize/4 + 2 + 2
+	case 11: // Q3_K
+		return blockSize/8 + blockSize/4 + 12 + 2
+	case 12: // Q4_K
+		return 2 + 2 + 12 + blockSize/2
+	case 13: // Q5_K
+		return 2 + 2 + 12 + blockSize/8 + blockSize/2
+	case 14: // Q6_K
+		return blockSize/2 + blockSize/4 + blockSize/16 + 2
+	default:
+		return 0
+	}
+}
+
+func (t tensor) parameters() uint64 {
+	return t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3]
+}
+
+func (t tensor) size() uint64 {
+	return t.parameters() * t.typeSize() / t.blockSize()
+}
+
 type ggufModel struct {
 	*containerGGUF
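Note: as a worked example of the sizes these methods encode, take Q4_0 (kind 2): blockSize() is 32, and typeSize() is 2 + 32/2 = 18 bytes per block, i.e. one fp16 scale plus 32 packed four-bit weights. size() then charges 18 bytes per 32 parameters. A standalone sketch of that arithmetic (the 4096x4096 shape is an arbitrary assumption):

package main

import "fmt"

func main() {
	// Q4_0: 32-element blocks, 2-byte scale + 16 bytes of packed 4-bit weights.
	const blockSize, typeSize uint64 = 32, 18
	parameters := uint64(4096) * uint64(4096) // one hypothetical 4096x4096 matrix

	// Mirrors size() above: parameters * typeSize / blockSize.
	fmt.Println(parameters * typeSize / blockSize) // 9437184 bytes
}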
@@ -201,61 +254,15 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
 			shape[i] = llm.readU64(rso)
 		}

-		kind := llm.readU32(rso)
-		offset := llm.readU64(rso)
-
-		var blockSize uint64
-		switch {
-		case kind < 2:
-			blockSize = 1
-		case kind < 10:
-			blockSize = 32
-		default:
-			blockSize = 256
-		}
-
-		var typeSize uint64
-		switch kind {
-		case 0: // FP32
-			typeSize = 4
-		case 1: // FP16
-			typeSize = 2
-		case 2: // Q4_0
-			typeSize = 2 + blockSize/2
-		case 3: // Q4_1
-			typeSize = 2 + 2 + blockSize/2
-		case 6: // Q5_0
-			typeSize = 2 + 4 + blockSize/2
-		case 7: // Q5_1
-			typeSize = 2 + 2 + 4 + blockSize/2
-		case 8: // Q8_0
-			typeSize = 2 + blockSize
-		case 9: // Q8_1
-			typeSize = 4 + 4 + blockSize
-		case 10: // Q2_K
-			typeSize = blockSize/16 + blockSize/4 + 2 + 2
-		case 11: // Q3_K
-			typeSize = blockSize/8 + blockSize/4 + 12 + 2
-		case 12: // Q4_K
-			typeSize = 2 + 2 + 12 + blockSize/2
-		case 13: // Q5_K
-			typeSize = 2 + 2 + 12 + blockSize/8 + blockSize/2
-		case 14: // Q6_K
-			typeSize = blockSize/2 + blockSize/4 + blockSize/16 + 2
-		}
-
-		parameters := shape[0] * shape[1] * shape[2] * shape[3]
-		size := parameters * typeSize / blockSize
-
-		llm.tensors = append(llm.tensors, tensor{
+		tensor := tensor{
 			name:   name,
-			kind:   kind,
-			offset: offset,
-			size:   size,
+			kind:   llm.readU32(rso),
+			offset: llm.readU64(rso),
 			shape:  shape,
-		})
+		}

-		llm.parameters += parameters
+		llm.tensors = append(llm.tensors, tensor)
+		llm.parameters += tensor.parameters()
 	}

 	alignment, ok := llm.kv["general.alignment"].(uint32)
@@ -265,7 +272,7 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {

 	rso.Seek(int64(alignment)-rso.offset%int64(alignment), io.SeekCurrent)
 	for _, tensor := range llm.tensors {
-		padded := (int64(tensor.size) + int64(alignment) - 1) & ^(int64(alignment) - 1)
+		padded := (int64(tensor.size()) + int64(alignment) - 1) & ^(int64(alignment) - 1)
 		rso.Seek(padded, io.SeekCurrent)
 	}
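Note: the padded expression assumes general.alignment is a power of two; adding alignment-1 and then masking with ^(alignment-1) clears the low bits, which rounds the tensor size up to the next multiple without a division. A small standalone check:

package main

import "fmt"

func main() {
	// Round up to the next multiple of a power-of-two alignment, exactly
	// as the padded calculation in the hunk above.
	size, alignment := int64(9437190), int64(32)
	padded := (size + alignment - 1) & ^(alignment - 1)
	fmt.Println(padded) // 9437216, the next multiple of 32 >= 9437190
}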
Submodule llm/llama.cpp updated: 011e8ec577...cd4fddb29f
30 llm/patches/01-cache.diff (new file)
@@ -0,0 +1,30 @@
+diff --git a/examples/server/server.cpp b/examples/server/server.cpp
+index 0462fbd2..4fa7b57f 100644
+--- a/examples/server/server.cpp
++++ b/examples/server/server.cpp
+@@ -1857,12 +1857,6 @@ struct llama_server_context
+                     LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+                 }
+
+-                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+-
+-                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+-
+-                slot.cache_tokens = prompt_tokens;
+-
+                 if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
+                 {
+                     // we have to evaluate at least 1 token to generate logits.
+@@ -1870,6 +1864,12 @@ struct llama_server_context
+                     slot.n_past--;
+                 }
+
++                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
++
++                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
++
++                slot.cache_tokens = prompt_tokens;
++
+                 LOG_VERBOSE("prompt ingested", {
+                     {"n_past", slot.n_past},
+                     {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},