mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-10 15:57:04 +00:00
Merge remote-tracking branch 'upstream/main' into pr3702
This commit is contained in:
17
llm/ext_server/server.cpp
vendored
17
llm/ext_server/server.cpp
vendored
@@ -1032,7 +1032,7 @@ struct llama_server_context
|
||||
slot.has_next_token = false;
|
||||
}
|
||||
|
||||
if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model))
|
||||
if (!slot.cache_tokens.empty() && llama_token_is_eog(model, result.tok))
|
||||
{
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
@@ -1144,12 +1144,15 @@ struct llama_server_context
|
||||
|
||||
res.result_json = json
|
||||
{
|
||||
{"content", tkn.text_to_send},
|
||||
{"stop", false},
|
||||
{"slot_id", slot.id},
|
||||
{"multimodal", multimodal}
|
||||
};
|
||||
|
||||
if (!llama_token_is_eog(model, tkn.tok)) {
|
||||
res.result_json["content"] = tkn.text_to_send;
|
||||
}
|
||||
|
||||
if (slot.sparams.n_probs > 0)
|
||||
{
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
@@ -1183,8 +1186,6 @@ struct llama_server_context
|
||||
{"model", params.model_alias},
|
||||
{"tokens_predicted", slot.n_decoded},
|
||||
{"tokens_evaluated", slot.n_prompt_tokens},
|
||||
{"generation_settings", get_formated_generation(slot)},
|
||||
{"prompt", slot.prompt},
|
||||
{"truncated", slot.truncated},
|
||||
{"stopped_eos", slot.stopped_eos},
|
||||
{"stopped_word", slot.stopped_word},
|
||||
@@ -2644,18 +2645,18 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
if (strncmp(sep, "int:", 4) == 0) {
|
||||
sep += 4;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
|
||||
kvo.int_value = std::atol(sep);
|
||||
kvo.val_i64 = std::atol(sep);
|
||||
} else if (strncmp(sep, "float:", 6) == 0) {
|
||||
sep += 6;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
|
||||
kvo.float_value = std::atof(sep);
|
||||
kvo.val_f64 = std::atof(sep);
|
||||
} else if (strncmp(sep, "bool:", 5) == 0) {
|
||||
sep += 5;
|
||||
kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
|
||||
if (std::strcmp(sep, "true") == 0) {
|
||||
kvo.bool_value = true;
|
||||
kvo.val_bool = true;
|
||||
} else if (std::strcmp(sep, "false") == 0) {
|
||||
kvo.bool_value = false;
|
||||
kvo.val_bool = false;
|
||||
} else {
|
||||
fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
|
||||
invalid_param = true;
|
||||
|
||||
140
llm/filetype.go
Normal file
140
llm/filetype.go
Normal file
@@ -0,0 +1,140 @@
|
||||
package llm
|
||||
|
||||
import "fmt"
|
||||
|
||||
type fileType uint32
|
||||
|
||||
const (
|
||||
fileTypeF32 fileType = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ4_2 // unused
|
||||
fileTypeQ4_3 // unused
|
||||
fileTypeQ8_0
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeQ3_K_XS
|
||||
fileTypeIQ3_XXS
|
||||
|
||||
fileTypeUnknown
|
||||
)
|
||||
|
||||
func ParseFileType(s string) (fileType, error) {
|
||||
switch s {
|
||||
case "F32":
|
||||
return fileTypeF32, nil
|
||||
case "F16":
|
||||
return fileTypeF16, nil
|
||||
case "Q4_0":
|
||||
return fileTypeQ4_0, nil
|
||||
case "Q4_1":
|
||||
return fileTypeQ4_1, nil
|
||||
case "Q4_1_F16":
|
||||
return fileTypeQ4_1_F16, nil
|
||||
case "Q8_0":
|
||||
return fileTypeQ8_0, nil
|
||||
case "Q5_0":
|
||||
return fileTypeQ5_0, nil
|
||||
case "Q5_1":
|
||||
return fileTypeQ5_1, nil
|
||||
case "Q2_K":
|
||||
return fileTypeQ2_K, nil
|
||||
case "Q3_K_S":
|
||||
return fileTypeQ3_K_S, nil
|
||||
case "Q3_K_M":
|
||||
return fileTypeQ3_K_M, nil
|
||||
case "Q3_K_L":
|
||||
return fileTypeQ3_K_L, nil
|
||||
case "Q4_K_S":
|
||||
return fileTypeQ4_K_S, nil
|
||||
case "Q4_K_M":
|
||||
return fileTypeQ4_K_M, nil
|
||||
case "Q5_K_S":
|
||||
return fileTypeQ5_K_S, nil
|
||||
case "Q5_K_M":
|
||||
return fileTypeQ5_K_M, nil
|
||||
case "Q6_K":
|
||||
return fileTypeQ6_K, nil
|
||||
case "IQ2_XXS":
|
||||
return fileTypeIQ2_XXS, nil
|
||||
case "IQ2_XS":
|
||||
return fileTypeIQ2_XS, nil
|
||||
case "Q2_K_S":
|
||||
return fileTypeQ2_K_S, nil
|
||||
case "Q3_K_XS":
|
||||
return fileTypeQ3_K_XS, nil
|
||||
case "IQ3_XXS":
|
||||
return fileTypeIQ3_XXS, nil
|
||||
default:
|
||||
return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
|
||||
}
|
||||
}
|
||||
|
||||
func (t fileType) String() string {
|
||||
switch t {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case fileTypeIQ2_XXS:
|
||||
return "IQ2_XXS"
|
||||
case fileTypeIQ2_XS:
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeQ3_K_XS:
|
||||
return "Q3_K_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
func (t fileType) Value() uint32 {
|
||||
return uint32(t)
|
||||
}
|
||||
@@ -21,7 +21,7 @@ init_vars() {
|
||||
# TODO - add additional optimization flags...
|
||||
CMAKE_DEFS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_SERVER_VERBOSE=off ${CMAKE_DEFS}"
|
||||
fi
|
||||
case $(uname -s) in
|
||||
case $(uname -s) in
|
||||
"Darwin")
|
||||
LIB_EXT="dylib"
|
||||
WHOLE_ARCHIVE="-Wl,-force_load"
|
||||
|
||||
@@ -57,21 +57,21 @@ init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
init_vars
|
||||
if [ -z "${OLLAMA_SKIP_STATIC_GENERATE}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
|
||||
# Builds by default, allows skipping, forces build if OLLAMA_CPU_TARGET="static"
|
||||
# Enables optimized Dockerfile builds using a blanket skip and targeted overrides
|
||||
# Static build for linking into the Go binary
|
||||
init_vars
|
||||
CMAKE_TARGETS="--target llama --target ggml"
|
||||
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}_static"
|
||||
echo "Building static library"
|
||||
build
|
||||
fi
|
||||
|
||||
init_vars
|
||||
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||
|
||||
if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
|
||||
# Static build for linking into the Go binary
|
||||
init_vars
|
||||
CMAKE_TARGETS="--target llama --target ggml"
|
||||
CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}_static"
|
||||
echo "Building static library"
|
||||
build
|
||||
fi
|
||||
|
||||
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring
|
||||
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
|
||||
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
|
||||
@@ -165,14 +165,22 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
|
||||
fi
|
||||
if [ "${ARCH}" == "arm64" ]; then
|
||||
echo "ARM CPU detected - disabling unsupported AVX instructions"
|
||||
|
||||
|
||||
# ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
|
||||
#
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
# CUDA compute < 6.0 lacks proper FP16 support on ARM.
|
||||
# Disabling has minimal performance effect while maintaining compatibility.
|
||||
ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
|
||||
fi
|
||||
CMAKE_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
|
||||
echo "Building custom CUDA GPU"
|
||||
else
|
||||
CMAKE_CUDA_DEFS="-DLLAMA_CUDA=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
|
||||
fi
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS}"
|
||||
BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
|
||||
EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
|
||||
build
|
||||
@@ -217,6 +225,12 @@ if [ -d "${ROCM_PATH}" ]; then
|
||||
fi
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
|
||||
if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
|
||||
CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
|
||||
echo "Building custom ROCM GPU"
|
||||
fi
|
||||
BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
|
||||
EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
|
||||
build
|
||||
|
||||
@@ -26,15 +26,25 @@ function amdGPUs {
|
||||
$GPU_LIST -join ';'
|
||||
}
|
||||
|
||||
|
||||
function init_vars {
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
if (!$script:SRC_DIR) {
|
||||
$script:SRC_DIR = $(resolve-path "..\..\")
|
||||
}
|
||||
if (!$script:llamacppDir) {
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
}
|
||||
if (!$script:cmakeTargets) {
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
}
|
||||
$script:cmakeDefs = @(
|
||||
"-DBUILD_SHARED_LIBS=on",
|
||||
"-DLLAMA_NATIVE=off"
|
||||
)
|
||||
$script:cmakeTargets = @("ollama_llama_server")
|
||||
$script:ARCH = "amd64" # arm not yet supported.
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
$script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower()
|
||||
$script:DIST_BASE = "${script:SRC_DIR}\dist\windows-${script:ARCH}\ollama_runners"
|
||||
md "$script:DIST_BASE" -ea 0 > $null
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
|
||||
$script:config = "RelWithDebInfo"
|
||||
@@ -55,7 +65,6 @@ function init_vars {
|
||||
} else {
|
||||
$script:CUDA_LIB_DIR=$env:CUDA_LIB_DIR
|
||||
}
|
||||
$script:GZIP=(get-command -ea 'silentlycontinue' gzip).path
|
||||
$script:DUMPBIN=(get-command -ea 'silentlycontinue' dumpbin).path
|
||||
if ($null -eq $env:CMAKE_CUDA_ARCHITECTURES) {
|
||||
$script:CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
|
||||
@@ -134,21 +143,18 @@ function sign {
|
||||
}
|
||||
}
|
||||
|
||||
function compress {
|
||||
if ($script:GZIP -eq $null) {
|
||||
write-host "gzip not installed, not compressing files"
|
||||
return
|
||||
}
|
||||
write-host "Compressing binaries..."
|
||||
function install {
|
||||
write-host "Installing binaries to dist dir ${script:distDir}"
|
||||
mkdir ${script:distDir} -ErrorAction SilentlyContinue
|
||||
$binaries = dir "${script:buildDir}/bin/*.exe"
|
||||
foreach ($file in $binaries) {
|
||||
& "$script:GZIP" --best -f $file
|
||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
||||
}
|
||||
|
||||
write-host "Compressing dlls..."
|
||||
write-host "Installing dlls to dist dir ${script:distDir}"
|
||||
$dlls = dir "${script:buildDir}/bin/*.dll"
|
||||
foreach ($file in $dlls) {
|
||||
& "$script:GZIP" --best -f $file
|
||||
copy-item -Path $file -Destination ${script:distDir} -Force
|
||||
}
|
||||
}
|
||||
|
||||
@@ -169,123 +175,195 @@ function cleanup {
|
||||
}
|
||||
}
|
||||
|
||||
init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
|
||||
|
||||
if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
|
||||
function build_static() {
|
||||
if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) {
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with itx
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$oldTargets = $script:cmakeTargets
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
$script:cmakeTargets = $oldTargets
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu($gen_arch) {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) {
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
$script:distDir="$script:DIST_BASE\cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cpu_avx2() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx2"))) {
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
$script:distDir="$script:DIST_BASE\cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
install
|
||||
} else {
|
||||
write-host "Skipping CPU AVX2 generation step as requested"
|
||||
}
|
||||
}
|
||||
|
||||
function build_cuda() {
|
||||
if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${script:CUDA_LIB_DIR}")) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_CUDA_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_CUDA_DEFS=`"${env:OLLAMA_CUSTOM_CUDA_DEFS}`""
|
||||
$script:cmakeDefs +=@("${env:OLLAMA_CUSTOM_CUDA_DEFS}")
|
||||
write-host "building custom CUDA GPU"
|
||||
}
|
||||
build
|
||||
sign
|
||||
install
|
||||
|
||||
write-host "copying CUDA dependencies to ${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cudart64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublas64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
cp "${script:CUDA_LIB_DIR}\cublasLt64_*.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\"
|
||||
} else {
|
||||
write-host "Skipping CUDA generation step"
|
||||
}
|
||||
}
|
||||
|
||||
function build_rocm() {
|
||||
if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
if ($null -ne $env:OLLAMA_CUSTOM_ROCM_DEFS) {
|
||||
write-host "OLLAMA_CUSTOM_ROCM_DEFS=`"${env:OLLAMA_CUSTOM_ROCM_DEFS}`""
|
||||
$script:cmakeDefs += @("${env:OLLAMA_CUSTOM_ROCM_DEFS}")
|
||||
write-host "building custom ROCM GPU"
|
||||
}
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
install
|
||||
|
||||
# Assumes v5.7, may need adjustments for v6
|
||||
rm -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
md "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\" -ea 0 > $null
|
||||
cp "${env:HIP_PATH}\bin\hipblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
cp "${env:HIP_PATH}\bin\rocblas.dll" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\"
|
||||
# amdhip64.dll dependency comes from the driver and must be installed on the host to use AMD GPUs
|
||||
cp "${env:HIP_PATH}\bin\rocblas\library\*" "${script:SRC_DIR}\dist\windows-${script:ARCH}\rocm\rocblas\library\"
|
||||
} else {
|
||||
write-host "Skipping ROCm generation step"
|
||||
}
|
||||
}
|
||||
|
||||
# GCC build for direct linking into the Go binary
|
||||
init_vars
|
||||
# cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast
|
||||
# as we need this to be compiled by gcc for golang to be able to link with itx
|
||||
write-host "Checking for MinGW..."
|
||||
# error action ensures we exit on failure
|
||||
get-command gcc
|
||||
get-command mingw32-make
|
||||
$script:cmakeTargets = @("llama", "ggml")
|
||||
$script:cmakeDefs = @(
|
||||
"-G", "MinGW Makefiles"
|
||||
"-DCMAKE_C_COMPILER=gcc.exe",
|
||||
"-DCMAKE_CXX_COMPILER=g++.exe",
|
||||
"-DBUILD_SHARED_LIBS=off",
|
||||
"-DLLAMA_NATIVE=off",
|
||||
"-DLLAMA_AVX=off",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DLLAMA_AVX512=off",
|
||||
"-DLLAMA_F16C=off",
|
||||
"-DLLAMA_FMA=off")
|
||||
$script:buildDir="../build/windows/${script:ARCH}_static"
|
||||
write-host "Building static library"
|
||||
build
|
||||
if ($($args.count) -eq 0) {
|
||||
git_module_setup
|
||||
apply_patches
|
||||
build_static
|
||||
if ($script:ARCH -eq "arm64") {
|
||||
build_cpu("ARM64")
|
||||
} else { # amd64
|
||||
build_cpu("x64")
|
||||
build_cpu_avx
|
||||
build_cpu_avx2
|
||||
build_cuda
|
||||
build_rocm
|
||||
}
|
||||
|
||||
# remaining llama.cpp builds use MSVC
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
|
||||
init_vars
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
sign
|
||||
compress
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path $script:DIST_BASE)"
|
||||
} else {
|
||||
write-host "Skipping CPU generation step as requested"
|
||||
}
|
||||
|
||||
if ($null -ne $script:CUDA_LIB_DIR) {
|
||||
# Then build cuda as a dynamically loaded library
|
||||
$nvcc = "$script:CUDA_LIB_DIR\nvcc.exe"
|
||||
$script:CUDA_VERSION=(get-item ($nvcc | split-path | split-path)).Basename
|
||||
if ($null -ne $script:CUDA_VERSION) {
|
||||
$script:CUDA_VARIANT="_"+$script:CUDA_VERSION
|
||||
}
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
|
||||
$script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUDA=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
|
||||
build
|
||||
sign
|
||||
compress
|
||||
}
|
||||
|
||||
if ($null -ne $env:HIP_PATH) {
|
||||
$script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
|
||||
if ($null -ne $script:ROCM_VERSION) {
|
||||
$script:ROCM_VARIANT="_v"+$script:ROCM_VERSION
|
||||
}
|
||||
|
||||
init_vars
|
||||
$script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
|
||||
$script:cmakeDefs += @(
|
||||
"-G", "Ninja",
|
||||
"-DCMAKE_C_COMPILER=clang.exe",
|
||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||
"-DLLAMA_HIPBLAS=on",
|
||||
"-DHIP_PLATFORM=amd",
|
||||
"-DLLAMA_AVX=on",
|
||||
"-DLLAMA_AVX2=off",
|
||||
"-DCMAKE_POSITION_INDEPENDENT_CODE=on",
|
||||
"-DAMDGPU_TARGETS=$(amdGPUs)",
|
||||
"-DGPU_TARGETS=$(amdGPUs)"
|
||||
)
|
||||
|
||||
# Make sure the ROCm binary dir is first in the path
|
||||
$env:PATH="$env:HIP_PATH\bin;$env:PATH"
|
||||
|
||||
# We have to clobber the LIB var from the developer shell for clang to work properly
|
||||
$env:LIB=""
|
||||
|
||||
write-host "Building ROCm"
|
||||
build
|
||||
# Ninja doesn't prefix with config name
|
||||
${script:config}=""
|
||||
if ($null -ne $script:DUMPBIN) {
|
||||
& "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
|
||||
}
|
||||
sign
|
||||
compress
|
||||
}
|
||||
|
||||
|
||||
cleanup
|
||||
write-host "`ngo generate completed. LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"
|
||||
for ( $i = 0; $i -lt $args.count; $i++ ) {
|
||||
write-host "performing $($args[$i])"
|
||||
& $($args[$i])
|
||||
}
|
||||
}
|
||||
107
llm/ggml.go
107
llm/ggml.go
@@ -13,82 +13,6 @@ type GGML struct {
|
||||
model
|
||||
}
|
||||
|
||||
const (
|
||||
fileTypeF32 uint32 = iota
|
||||
fileTypeF16
|
||||
fileTypeQ4_0
|
||||
fileTypeQ4_1
|
||||
fileTypeQ4_1_F16
|
||||
fileTypeQ8_0 uint32 = iota + 2
|
||||
fileTypeQ5_0
|
||||
fileTypeQ5_1
|
||||
fileTypeQ2_K
|
||||
fileTypeQ3_K_S
|
||||
fileTypeQ3_K_M
|
||||
fileTypeQ3_K_L
|
||||
fileTypeQ4_K_S
|
||||
fileTypeQ4_K_M
|
||||
fileTypeQ5_K_S
|
||||
fileTypeQ5_K_M
|
||||
fileTypeQ6_K
|
||||
fileTypeIQ2_XXS
|
||||
fileTypeIQ2_XS
|
||||
fileTypeQ2_K_S
|
||||
fileTypeQ3_K_XS
|
||||
fileTypeIQ3_XXS
|
||||
)
|
||||
|
||||
func fileType(fileType uint32) string {
|
||||
switch fileType {
|
||||
case fileTypeF32:
|
||||
return "F32"
|
||||
case fileTypeF16:
|
||||
return "F16"
|
||||
case fileTypeQ4_0:
|
||||
return "Q4_0"
|
||||
case fileTypeQ4_1:
|
||||
return "Q4_1"
|
||||
case fileTypeQ4_1_F16:
|
||||
return "Q4_1_F16"
|
||||
case fileTypeQ8_0:
|
||||
return "Q8_0"
|
||||
case fileTypeQ5_0:
|
||||
return "Q5_0"
|
||||
case fileTypeQ5_1:
|
||||
return "Q5_1"
|
||||
case fileTypeQ2_K:
|
||||
return "Q2_K"
|
||||
case fileTypeQ3_K_S:
|
||||
return "Q3_K_S"
|
||||
case fileTypeQ3_K_M:
|
||||
return "Q3_K_M"
|
||||
case fileTypeQ3_K_L:
|
||||
return "Q3_K_L"
|
||||
case fileTypeQ4_K_S:
|
||||
return "Q4_K_S"
|
||||
case fileTypeQ4_K_M:
|
||||
return "Q4_K_M"
|
||||
case fileTypeQ5_K_S:
|
||||
return "Q5_K_S"
|
||||
case fileTypeQ5_K_M:
|
||||
return "Q5_K_M"
|
||||
case fileTypeQ6_K:
|
||||
return "Q6_K"
|
||||
case fileTypeIQ2_XXS:
|
||||
return "IQ2_XXS"
|
||||
case fileTypeIQ2_XS:
|
||||
return "IQ2_XS"
|
||||
case fileTypeQ2_K_S:
|
||||
return "Q2_K_S"
|
||||
case fileTypeQ3_K_XS:
|
||||
return "Q3_K_XS"
|
||||
case fileTypeIQ3_XXS:
|
||||
return "IQ3_XXS"
|
||||
default:
|
||||
return "unknown"
|
||||
}
|
||||
}
|
||||
|
||||
type model interface {
|
||||
KV() KV
|
||||
Tensors() Tensors
|
||||
@@ -121,12 +45,12 @@ func (kv KV) ParameterCount() uint64 {
|
||||
return kv.u64("general.parameter_count")
|
||||
}
|
||||
|
||||
func (kv KV) FileType() string {
|
||||
func (kv KV) FileType() fileType {
|
||||
if u64 := kv.u64("general.file_type"); u64 > 0 {
|
||||
return fileType(uint32(u64))
|
||||
}
|
||||
|
||||
return "unknown"
|
||||
return fileTypeUnknown
|
||||
}
|
||||
|
||||
func (kv KV) BlockCount() uint64 {
|
||||
@@ -286,6 +210,23 @@ const (
|
||||
|
||||
var ErrUnsupportedFormat = errors.New("unsupported model format")
|
||||
|
||||
func DetectGGMLType(b []byte) string {
|
||||
switch binary.LittleEndian.Uint32(b[:4]) {
|
||||
case FILE_MAGIC_GGML:
|
||||
return "ggml"
|
||||
case FILE_MAGIC_GGMF:
|
||||
return "ggmf"
|
||||
case FILE_MAGIC_GGJT:
|
||||
return "ggjt"
|
||||
case FILE_MAGIC_GGLA:
|
||||
return "ggla"
|
||||
case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
|
||||
return "gguf"
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
|
||||
var magic uint32
|
||||
if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
|
||||
@@ -343,7 +284,15 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
4*batch*(embedding+vocab)+embedding*vocab*105/128,
|
||||
)
|
||||
|
||||
if ffnGateWeight, ok := layers["0"]["ffn_gate.0.weight"]; ok {
|
||||
if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
|
||||
// mixtral 8x22b
|
||||
ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
|
||||
partialOffload = max(
|
||||
3*ffnGateExpsWeight.size()+4*batch*(2*ff+headsKV+embedding+context+embedding/heads*headsKV),
|
||||
4*(context*batch*heads+context*embedding/heads*headsKV+batch*1024+embedding/heads*headsKV*batch),
|
||||
)
|
||||
} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
|
||||
// mixtral 8x7b
|
||||
ffnGateWeight1 := ffnGateWeight.Shape[1]
|
||||
fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
|
||||
partialOffload = max(
|
||||
|
||||
16
llm/gguf.go
16
llm/gguf.go
@@ -190,8 +190,6 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
|
||||
llm.kv[k] = v
|
||||
}
|
||||
|
||||
slog.Debug(fmt.Sprintf("general.architecture = %s", llm.kv["general.architecture"]))
|
||||
|
||||
// decode tensors
|
||||
for i := 0; uint64(i) < llm.numTensor(); i++ {
|
||||
name, err := readGGUFString(llm, rs)
|
||||
@@ -465,11 +463,13 @@ var ggufKVOrder = map[string][]string{
|
||||
"llama.embedding_length",
|
||||
"llama.block_count",
|
||||
"llama.feed_forward_length",
|
||||
"llama.rope.dimension_count",
|
||||
"llama.attention.head_count",
|
||||
"llama.attention.head_count_kv",
|
||||
"llama.attention.layer_norm_rms_epsilon",
|
||||
"llama.rope.freq_base",
|
||||
"llama.rope.dimension_count",
|
||||
"llama.expert_count",
|
||||
"llama.expert_used_count",
|
||||
"gemma.context_length",
|
||||
"gemma.embedding_length",
|
||||
"gemma.block_count",
|
||||
@@ -577,6 +577,8 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
|
||||
return err
|
||||
}
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("improper type for '%s'", k)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -598,9 +600,11 @@ func (llm *gguf) Encode(ws io.WriteSeeker, kv KV, tensors []Tensor) error {
|
||||
return err
|
||||
}
|
||||
|
||||
dims := 1
|
||||
if tensor.Shape[1] > 0 {
|
||||
dims = 2
|
||||
dims := 0
|
||||
for cnt := 0; cnt < len(tensor.Shape); cnt++ {
|
||||
if tensor.Shape[cnt] > 0 {
|
||||
dims++
|
||||
}
|
||||
}
|
||||
|
||||
if err := binary.Write(ws, llm.ByteOrder, uint32(dims)); err != nil {
|
||||
|
||||
Submodule llm/llama.cpp updated: 7593639ce3...952d03dbea
57
llm/llm.go
57
llm/llm.go
@@ -4,6 +4,7 @@ package llm
|
||||
// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
|
||||
// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
|
||||
// #cgo windows,arm64 LDFLAGS: ${SRCDIR}/build/windows/arm64_static/libllama.a -static -lstdc++
|
||||
// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
|
||||
// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
|
||||
// #include <stdlib.h>
|
||||
@@ -19,7 +20,7 @@ func SystemInfo() string {
|
||||
return C.GoString(C.llama_print_system_info())
|
||||
}
|
||||
|
||||
func Quantize(infile, outfile, filetype string) error {
|
||||
func Quantize(infile, outfile string, ftype fileType) error {
|
||||
cinfile := C.CString(infile)
|
||||
defer C.free(unsafe.Pointer(cinfile))
|
||||
|
||||
@@ -28,58 +29,10 @@ func Quantize(infile, outfile, filetype string) error {
|
||||
|
||||
params := C.llama_model_quantize_default_params()
|
||||
params.nthread = -1
|
||||
params.ftype = ftype.Value()
|
||||
|
||||
switch filetype {
|
||||
case "F32":
|
||||
params.ftype = fileTypeF32
|
||||
case "F16":
|
||||
params.ftype = fileTypeF16
|
||||
case "Q4_0":
|
||||
params.ftype = fileTypeQ4_0
|
||||
case "Q4_1":
|
||||
params.ftype = fileTypeQ4_1
|
||||
case "Q4_1_F16":
|
||||
params.ftype = fileTypeQ4_1_F16
|
||||
case "Q8_0":
|
||||
params.ftype = fileTypeQ8_0
|
||||
case "Q5_0":
|
||||
params.ftype = fileTypeQ5_0
|
||||
case "Q5_1":
|
||||
params.ftype = fileTypeQ5_1
|
||||
case "Q2_K":
|
||||
params.ftype = fileTypeQ2_K
|
||||
case "Q3_K_S":
|
||||
params.ftype = fileTypeQ3_K_S
|
||||
case "Q3_K_M":
|
||||
params.ftype = fileTypeQ3_K_M
|
||||
case "Q3_K_L":
|
||||
params.ftype = fileTypeQ3_K_L
|
||||
case "Q4_K_S":
|
||||
params.ftype = fileTypeQ4_K_S
|
||||
case "Q4_K_M":
|
||||
params.ftype = fileTypeQ4_K_M
|
||||
case "Q5_K_S":
|
||||
params.ftype = fileTypeQ5_K_S
|
||||
case "Q5_K_M":
|
||||
params.ftype = fileTypeQ5_K_M
|
||||
case "Q6_K":
|
||||
params.ftype = fileTypeQ6_K
|
||||
case "IQ2_XXS":
|
||||
params.ftype = fileTypeIQ2_XXS
|
||||
case "IQ2_XS":
|
||||
params.ftype = fileTypeIQ2_XS
|
||||
case "Q2_K_S":
|
||||
params.ftype = fileTypeQ2_K_S
|
||||
case "Q3_K_XS":
|
||||
params.ftype = fileTypeQ3_K_XS
|
||||
case "IQ3_XXS":
|
||||
params.ftype = fileTypeIQ3_XXS
|
||||
default:
|
||||
return fmt.Errorf("unknown filetype: %s", filetype)
|
||||
}
|
||||
|
||||
if retval := C.llama_model_quantize(cinfile, coutfile, ¶ms); retval != 0 {
|
||||
return fmt.Errorf("llama_model_quantize: %d", retval)
|
||||
if rc := C.llama_model_quantize(cinfile, coutfile, ¶ms); rc != 0 {
|
||||
return fmt.Errorf("llama_model_quantize: %d", rc)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -2,5 +2,5 @@ package llm
|
||||
|
||||
import "embed"
|
||||
|
||||
//go:embed build/windows/*/*/bin/*
|
||||
// unused on windows
|
||||
var libEmbed embed.FS
|
||||
|
||||
185
llm/memory.go
Normal file
185
llm/memory.go
Normal file
@@ -0,0 +1,185 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log/slog"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
// This algorithm looks for a complete fit to determine if we need to unload other models
|
||||
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
|
||||
var estimatedVRAM uint64
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
// Split up the GPUs by type and try them
|
||||
for _, gpus := range allGpus.ByLibrary() {
|
||||
var layerCount int
|
||||
layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
if opts.NumGPU < 0 {
|
||||
if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
|
||||
return true, estimatedVRAM
|
||||
}
|
||||
} else {
|
||||
if layerCount > 0 && layerCount >= opts.NumGPU {
|
||||
return true, estimatedVRAM
|
||||
}
|
||||
}
|
||||
}
|
||||
return false, estimatedVRAM
|
||||
}
|
||||
|
||||
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
|
||||
// The GPUs provided must all be the same Library
|
||||
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
|
||||
var memoryAvailable uint64
|
||||
for _, info := range gpus {
|
||||
memoryAvailable += info.FreeMemory
|
||||
}
|
||||
if envconfig.MaxVRAM > 0 {
|
||||
memoryAvailable = envconfig.MaxVRAM
|
||||
}
|
||||
|
||||
slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
|
||||
|
||||
// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
|
||||
memoryMinimum := gpus[0].MinimumMemory
|
||||
|
||||
for _, projector := range projectors {
|
||||
memoryMinimum += projectorMemoryRequirements(projector)
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
|
||||
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
graphFullOffload *= uint64(len(gpus))
|
||||
graphPartialOffload *= uint64(len(gpus))
|
||||
|
||||
// on metal there's no partial offload overhead
|
||||
if gpus[0].Library == "metal" {
|
||||
graphPartialOffload = graphFullOffload
|
||||
}
|
||||
|
||||
layers := ggml.Tensors().Layers()
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
if layer, ok := layers["output_norm"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if layer, ok := layers["output"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
} else if layer, ok := layers["token_embd"]; ok {
|
||||
memoryLayerOutput += layer.size()
|
||||
}
|
||||
|
||||
if gpus[0].Library == "metal" && opts.UseMMap {
|
||||
// memory is preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
memoryRequiredPartial += memoryLayerOutput
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
memoryLayer += kv / ggml.KV().BlockCount()
|
||||
|
||||
memoryRequiredTotal += memoryLayer
|
||||
if memoryAvailable > memoryRequiredPartial+memoryLayer {
|
||||
memoryRequiredPartial += memoryLayer
|
||||
layerCount++
|
||||
}
|
||||
}
|
||||
|
||||
if gpus[0].Library != "metal" || !opts.UseMMap {
|
||||
// memory was not preallocated for output tensors
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
}
|
||||
|
||||
if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
}
|
||||
|
||||
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
|
||||
|
||||
slog.Info(
|
||||
"offload to gpu",
|
||||
slog.Group(
|
||||
"layers",
|
||||
// actual number of layers offloaded
|
||||
"real", opts.NumGPU,
|
||||
// estimated number of layers that can be offloaded
|
||||
"estimate", layerCount,
|
||||
),
|
||||
slog.Group(
|
||||
"memory",
|
||||
// memory available for offloading
|
||||
"available", format.HumanBytes2(memoryAvailable),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
"full", format.HumanBytes2(memoryRequiredTotal),
|
||||
// memory required to offload layers.estimate layers
|
||||
"partial", format.HumanBytes2(memoryRequiredPartial),
|
||||
// memory of KV cache
|
||||
"kv", format.HumanBytes2(kv),
|
||||
),
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
|
||||
),
|
||||
slog.Group(
|
||||
"graph",
|
||||
// memory of graph when fully offloaded
|
||||
"full", format.HumanBytes2(graphFullOffload),
|
||||
// memory of graph when not fully offloaded
|
||||
"partial", format.HumanBytes2(graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
if gpus[0].Library == "cpu" {
|
||||
return 0, 0, memoryRequiredTotal
|
||||
}
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
slog.Debug("insufficient VRAM to load any model layers")
|
||||
return 0, 0, memoryRequiredTotal
|
||||
}
|
||||
|
||||
return layerCount, memoryRequiredPartial, memoryRequiredTotal
|
||||
}
|
||||
12
llm/patches/02-clip-log.diff
Normal file
12
llm/patches/02-clip-log.diff
Normal file
@@ -0,0 +1,12 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e431c7f7..f077e688 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -3,6 +3,7 @@
|
||||
// I'll gradually clean and extend it
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
+#include "common.h"
|
||||
#include "log.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
45
llm/patches/04-metal.diff
Normal file
45
llm/patches/04-metal.diff
Normal file
@@ -0,0 +1,45 @@
|
||||
diff --git a/ggml-metal.m b/ggml-metal.m
|
||||
index 0207b787..b5e9884b 100644
|
||||
--- a/ggml-metal.m
|
||||
+++ b/ggml-metal.m
|
||||
@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
-#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
- if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
- switch (src0t) {
|
||||
- case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
- case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q4_0:
|
||||
- case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
- case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
- case GGML_TYPE_Q5_0: // not tested yet
|
||||
- case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
- case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
- case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
- default: ne11_mm_min = 1; break;
|
||||
- }
|
||||
+ switch (src0t) {
|
||||
+ case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
+ case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q4_0:
|
||||
+ case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
+ case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
+ case GGML_TYPE_Q5_0: // not tested yet
|
||||
+ case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
+ case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
+ case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
+ default: ne11_mm_min = 1; break;
|
||||
}
|
||||
-#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
24
llm/patches/05-clip-fix.diff
Normal file
24
llm/patches/05-clip-fix.diff
Normal file
@@ -0,0 +1,24 @@
|
||||
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
|
||||
index e3c9bcd4..b43f892d 100644
|
||||
--- a/examples/llava/clip.cpp
|
||||
+++ b/examples/llava/clip.cpp
|
||||
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
struct ggml_tensor * embeddings = inp;
|
||||
if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
|
||||
+ }
|
||||
+ ggml_set_name(embeddings, "embeddings");
|
||||
+ ggml_set_input(embeddings);
|
||||
+
|
||||
+ if (ctx->has_class_embedding) {
|
||||
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
|
||||
embeddings = ggml_acc(ctx0, embeddings, inp,
|
||||
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
|
||||
}
|
||||
- ggml_set_name(embeddings, "embeddings");
|
||||
- ggml_set_input(embeddings);
|
||||
-
|
||||
|
||||
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
|
||||
ggml_set_name(positions, "positions");
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/exp/slices"
|
||||
@@ -17,7 +18,7 @@ import (
|
||||
"github.com/ollama/ollama/gpu"
|
||||
)
|
||||
|
||||
var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
|
||||
var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
|
||||
|
||||
func Init() error {
|
||||
payloadsDir, err := gpu.PayloadsDir()
|
||||
@@ -25,13 +26,15 @@ func Init() error {
|
||||
return err
|
||||
}
|
||||
|
||||
slog.Info("extracting embedded files", "dir", payloadsDir)
|
||||
binGlob := "build/*/*/*/bin/*"
|
||||
if runtime.GOOS != "windows" {
|
||||
slog.Info("extracting embedded files", "dir", payloadsDir)
|
||||
binGlob := "build/*/*/*/bin/*"
|
||||
|
||||
// extract server libraries
|
||||
err = extractFiles(payloadsDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
// extract server libraries
|
||||
err = extractFiles(payloadsDir, binGlob)
|
||||
if err != nil {
|
||||
return fmt.Errorf("extract binaries: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
var variants []string
|
||||
@@ -138,6 +141,23 @@ func serversForGpu(info gpu.GpuInfo) []string {
|
||||
return servers
|
||||
}
|
||||
|
||||
// Return the optimal server for this CPU architecture
|
||||
func serverForCpu() string {
|
||||
if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
|
||||
return "metal"
|
||||
}
|
||||
variant := gpu.GetCPUVariant()
|
||||
availableServers := availableServers()
|
||||
if variant != "" {
|
||||
for cmp := range availableServers {
|
||||
if cmp == "cpu_"+variant {
|
||||
return cmp
|
||||
}
|
||||
}
|
||||
}
|
||||
return "cpu"
|
||||
}
|
||||
|
||||
// extract extracts the embedded files to the target directory
|
||||
func extractFiles(targetDir string, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
|
||||
624
llm/server.go
624
llm/server.go
@@ -21,21 +21,47 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/gpu"
|
||||
"github.com/ollama/ollama/server/envconfig"
|
||||
)
|
||||
|
||||
// LlamaServer is an instance of the llama.cpp server
|
||||
type LlamaServer struct {
|
||||
type LlamaServer interface {
|
||||
Ping(ctx context.Context) error
|
||||
WaitUntilRunning(ctx context.Context) error
|
||||
Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
|
||||
Embedding(ctx context.Context, prompt string) ([]float64, error)
|
||||
Tokenize(ctx context.Context, content string) ([]int, error)
|
||||
Detokenize(ctx context.Context, tokens []int) (string, error)
|
||||
Close() error
|
||||
EstimatedVRAM() uint64
|
||||
}
|
||||
|
||||
// llmServer is an instance of the llama.cpp server
|
||||
type llmServer struct {
|
||||
port int
|
||||
cmd *exec.Cmd
|
||||
done chan error // Channel to signal when the process exits
|
||||
status *StatusWriter
|
||||
options api.Options
|
||||
|
||||
// TODO - this should be broken down by GPU
|
||||
estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
|
||||
estimatedTotal uint64 // Total size of model
|
||||
totalLayers uint64
|
||||
gpuCount int
|
||||
|
||||
sem *semaphore.Weighted
|
||||
}
|
||||
|
||||
func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
|
||||
func LoadModel(model string) (*GGML, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f, err := os.Open(model)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -43,144 +69,69 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
defer f.Close()
|
||||
|
||||
ggml, _, err := DecodeGGML(f)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return ggml, err
|
||||
}
|
||||
|
||||
// NewLlamaServer will run a server for the given GPUs
|
||||
// The gpu list must be a single family.
|
||||
func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
|
||||
var err error
|
||||
if opts.NumCtx > int(ggml.KV().ContextLength()) {
|
||||
slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
|
||||
opts.NumCtx = int(ggml.KV().ContextLength())
|
||||
slog.Warn("requested context length is greater than the model's training context window size", "requested", opts.NumCtx, "training size", ggml.KV().ContextLength())
|
||||
}
|
||||
|
||||
if opts.NumCtx < 4 {
|
||||
opts.NumCtx = 4
|
||||
}
|
||||
|
||||
memoryAvailable, _ := gpu.CheckVRAM()
|
||||
info := gpu.GetGPUInfo()
|
||||
cpuRunner := ""
|
||||
var estimatedVRAM uint64
|
||||
var estimatedTotal uint64
|
||||
var systemMemory uint64
|
||||
gpuCount := len(gpus)
|
||||
if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
|
||||
|
||||
memoryMinimum := info.MinimumMemory
|
||||
for _, projector := range projectors {
|
||||
memoryMinimum += projectorMemoryRequirements(projector)
|
||||
// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
|
||||
|
||||
// multimodal models require at least 2048 context
|
||||
opts.NumCtx = max(opts.NumCtx, 2048)
|
||||
}
|
||||
cpuRunner = serverForCpu()
|
||||
gpuCount = 0
|
||||
} else {
|
||||
if gpus[0].Library == "metal" {
|
||||
memInfo, err := gpu.GetCPUMem()
|
||||
if err != nil {
|
||||
slog.Error("failed to lookup system memory", "error", err)
|
||||
} else {
|
||||
systemMemory = memInfo.TotalMemory
|
||||
slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
|
||||
}
|
||||
}
|
||||
var layers int
|
||||
layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
|
||||
|
||||
// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
|
||||
var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
|
||||
|
||||
graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
|
||||
if graphPartialOffload == 0 {
|
||||
graphPartialOffload = ggml.KV().GQA() * kv / 6
|
||||
}
|
||||
|
||||
if graphFullOffload == 0 {
|
||||
graphFullOffload = graphPartialOffload
|
||||
}
|
||||
|
||||
graphFullOffload *= uint64(info.DeviceCount)
|
||||
graphPartialOffload *= uint64(info.DeviceCount)
|
||||
|
||||
// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
|
||||
memoryRequiredTotal := memoryMinimum + graphFullOffload
|
||||
|
||||
// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
|
||||
memoryRequiredPartial := memoryMinimum + graphPartialOffload
|
||||
|
||||
if info.Library != "metal" {
|
||||
if memoryRequiredPartial > memoryAvailable {
|
||||
info.Library = "cpu"
|
||||
if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
|
||||
// disable partial offloading when model is greater than total system memory as this
|
||||
// can lead to locking up the system
|
||||
opts.NumGPU = 0
|
||||
} else if opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu" {
|
||||
opts.NumGPU = layers
|
||||
}
|
||||
}
|
||||
|
||||
var layerCount int
|
||||
layers := ggml.Tensors().Layers()
|
||||
for i := 0; i < int(ggml.KV().BlockCount()); i++ {
|
||||
memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
|
||||
|
||||
// KV is proportional to the number of layers
|
||||
memoryLayer += kv / ggml.KV().BlockCount()
|
||||
|
||||
memoryRequiredTotal += memoryLayer
|
||||
if memoryAvailable > memoryRequiredPartial+memoryLayer {
|
||||
memoryRequiredPartial += memoryLayer
|
||||
layerCount++
|
||||
}
|
||||
}
|
||||
|
||||
var memoryLayerOutput uint64
|
||||
for k, v := range layers {
|
||||
if !strings.HasPrefix(k, "blk.") {
|
||||
memoryLayerOutput += v.size()
|
||||
}
|
||||
}
|
||||
|
||||
memoryRequiredTotal += memoryLayerOutput
|
||||
|
||||
if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
|
||||
// disable partial offloading when model is greater than total system memory
|
||||
opts.NumGPU = 0
|
||||
} else if memoryAvailable > memoryRequiredTotal {
|
||||
layerCount = int(ggml.KV().BlockCount()) + 1
|
||||
memoryRequiredPartial = memoryRequiredTotal
|
||||
}
|
||||
|
||||
if opts.NumGPU < 0 {
|
||||
opts.NumGPU = layerCount
|
||||
}
|
||||
|
||||
memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
|
||||
|
||||
slog.Info(
|
||||
"offload to gpu",
|
||||
slog.Group(
|
||||
"layers",
|
||||
// actual number of layers offloaded
|
||||
"real", opts.NumGPU,
|
||||
// estimated number of layers that can be offloaded
|
||||
"estimate", layerCount,
|
||||
),
|
||||
slog.Group(
|
||||
"memory",
|
||||
// memory available for offloading
|
||||
"available", format.HumanBytes2(memoryAvailable),
|
||||
slog.Group(
|
||||
"required",
|
||||
// memory required for full offloading
|
||||
"full", format.HumanBytes2(memoryRequiredTotal),
|
||||
// memory required to offload layers.estimate layers
|
||||
"partial", format.HumanBytes2(memoryRequiredPartial),
|
||||
// memory of KV cache
|
||||
"kv", format.HumanBytes2(kv),
|
||||
),
|
||||
slog.Group(
|
||||
"weights",
|
||||
// memory of the weights
|
||||
"total", format.HumanBytes2(memoryWeights),
|
||||
// memory of repeating layers
|
||||
"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
|
||||
// memory of non-repeating layers
|
||||
"nonrepeating", format.HumanBytes2(memoryLayerOutput),
|
||||
),
|
||||
slog.Group(
|
||||
"graph",
|
||||
// memory of graph when fully offloaded
|
||||
"full", format.HumanBytes2(graphFullOffload),
|
||||
// memory of graph when not fully offloaded
|
||||
"partial", format.HumanBytes2(graphPartialOffload),
|
||||
),
|
||||
),
|
||||
)
|
||||
// Loop through potential servers
|
||||
finalErr := fmt.Errorf("no suitable llama servers found")
|
||||
|
||||
if len(adapters) > 1 {
|
||||
return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
|
||||
}
|
||||
|
||||
availableServers := availableServers()
|
||||
servers := serversForGpu(info)
|
||||
|
||||
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
|
||||
var servers []string
|
||||
if cpuRunner != "" {
|
||||
servers = []string{cpuRunner}
|
||||
} else {
|
||||
servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
|
||||
}
|
||||
demandLib := envconfig.LLMLibrary
|
||||
if demandLib != "" {
|
||||
serverPath := availableServers[demandLib]
|
||||
if serverPath == "" {
|
||||
@@ -188,11 +139,15 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
} else {
|
||||
slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
|
||||
servers = []string{demandLib}
|
||||
if strings.HasPrefix(demandLib, "cpu") {
|
||||
// Omit the GPU flag to silence the warning
|
||||
opts.NumGPU = -1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(servers) == 0 {
|
||||
return nil, fmt.Errorf("no servers found for %v", info)
|
||||
return nil, fmt.Errorf("no servers found for %v", gpus)
|
||||
}
|
||||
|
||||
params := []string{
|
||||
@@ -201,7 +156,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
|
||||
"--embedding",
|
||||
}
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
if envconfig.Debug {
|
||||
params = append(params, "--log-format", "json")
|
||||
} else {
|
||||
params = append(params, "--log-disable")
|
||||
@@ -211,7 +166,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
|
||||
}
|
||||
|
||||
if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
|
||||
if envconfig.Debug {
|
||||
params = append(params, "--verbose")
|
||||
}
|
||||
|
||||
@@ -249,10 +204,30 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
params = append(params, "--numa")
|
||||
}
|
||||
|
||||
// Loop through potential servers
|
||||
var finalErr error
|
||||
numParallel := envconfig.NumParallel
|
||||
|
||||
// TODO (jmorganca): multimodal models don't support parallel yet
|
||||
// see https://github.com/ollama/ollama/issues/4165
|
||||
if len(projectors) > 0 {
|
||||
numParallel = 1
|
||||
slog.Warn("multimodal models don't support parallel requests yet")
|
||||
}
|
||||
|
||||
params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
|
||||
|
||||
for i := 0; i < len(servers); i++ {
|
||||
dir := availableServers[servers[i]]
|
||||
if dir == "" {
|
||||
// Shouldn't happen
|
||||
finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
|
||||
slog.Error("sever list inconsistent", "error", finalErr)
|
||||
continue
|
||||
}
|
||||
|
||||
if strings.HasPrefix(servers[i], "cpu") {
|
||||
// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
|
||||
gpuCount = 0
|
||||
}
|
||||
|
||||
// Find an availableServers port, retry on each iterration in case the failure was a port conflict race
|
||||
port := 0
|
||||
@@ -273,12 +248,21 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
if runtime.GOOS == "windows" {
|
||||
pathEnv = "PATH"
|
||||
}
|
||||
// append the server directory to LD_LIBRARY_PATH/PATH
|
||||
// prepend the server directory to LD_LIBRARY_PATH/PATH
|
||||
libraryPaths := []string{dir}
|
||||
|
||||
if libraryPath, ok := os.LookupEnv(pathEnv); ok {
|
||||
// Append our runner directory to the path
|
||||
// This will favor system libraries over our bundled library dependencies
|
||||
libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
|
||||
libraryPaths = append(libraryPaths, filepath.SplitList(libraryPath)...)
|
||||
}
|
||||
|
||||
// Note: we always put the dependency path first
|
||||
// since this was the exact version we verified for AMD GPUs
|
||||
// and we favor what the user had in their path
|
||||
if gpus[0].DependencyPath != "" {
|
||||
// TODO refine for multi-gpu support
|
||||
libraryPaths = append([]string{gpus[0].DependencyPath}, libraryPaths...)
|
||||
}
|
||||
|
||||
server := filepath.Join(dir, "ollama_llama_server")
|
||||
@@ -286,21 +270,66 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
server = server + ".exe"
|
||||
}
|
||||
|
||||
s := &LlamaServer{
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
// Detect tmp cleaners wiping out the file
|
||||
_, err := os.Stat(server)
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
|
||||
err = Init()
|
||||
if err != nil {
|
||||
slog.Warn("failed to reinitialize payloads", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
|
||||
slog.Debug(libEnv)
|
||||
s.cmd.Env = append(os.Environ(), libEnv)
|
||||
|
||||
s := &llmServer{
|
||||
port: port,
|
||||
cmd: exec.Command(server, finalParams...),
|
||||
status: NewStatusWriter(os.Stderr),
|
||||
options: opts,
|
||||
estimatedVRAM: estimatedVRAM,
|
||||
estimatedTotal: estimatedTotal,
|
||||
sem: semaphore.NewWeighted(int64(numParallel)),
|
||||
totalLayers: ggml.KV().BlockCount() + 1,
|
||||
gpuCount: gpuCount,
|
||||
}
|
||||
|
||||
s.cmd.Env = os.Environ()
|
||||
s.cmd.Stdout = os.Stdout
|
||||
s.cmd.Stderr = s.status
|
||||
|
||||
visibleDevicesEnv, visibleDevicesEnvVal := gpu.GpuInfoList(gpus).GetVisibleDevicesEnv()
|
||||
pathEnvVal := strings.Join(libraryPaths, string(filepath.ListSeparator))
|
||||
|
||||
// Update or add the path and visible devices variable with our adjusted version
|
||||
pathNeeded := true
|
||||
devicesNeeded := visibleDevicesEnv != ""
|
||||
for i := range s.cmd.Env {
|
||||
cmp := strings.SplitN(s.cmd.Env[i], "=", 2)
|
||||
if strings.EqualFold(cmp[0], pathEnv) {
|
||||
s.cmd.Env[i] = pathEnv + "=" + pathEnvVal
|
||||
pathNeeded = false
|
||||
} else if devicesNeeded && strings.EqualFold(cmp[0], visibleDevicesEnv) {
|
||||
s.cmd.Env[i] = visibleDevicesEnv + "=" + visibleDevicesEnvVal
|
||||
devicesNeeded = false
|
||||
}
|
||||
}
|
||||
if pathNeeded {
|
||||
s.cmd.Env = append(s.cmd.Env, pathEnv+"="+pathEnvVal)
|
||||
}
|
||||
if devicesNeeded {
|
||||
s.cmd.Env = append(s.cmd.Env, visibleDevicesEnv+"="+visibleDevicesEnvVal)
|
||||
}
|
||||
|
||||
slog.Info("starting llama server", "cmd", s.cmd.String())
|
||||
// Log at debug as the environment is inherited and might contain sensitive information
|
||||
slog.Debug("subprocess", "environment", s.cmd.Env)
|
||||
|
||||
if err = s.cmd.Start(); err != nil {
|
||||
// Detect permission denied and augment them essage about noexec
|
||||
if errors.Is(err, os.ErrPermission) {
|
||||
finalErr = fmt.Errorf("unable to start server %w. %s may have noexec set. Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
|
||||
continue
|
||||
}
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
@@ -310,12 +339,6 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
|
||||
continue
|
||||
}
|
||||
|
||||
// reap subprocess when it exits
|
||||
go func() {
|
||||
// Exit status managed via getServerStatus
|
||||
_ = s.cmd.Wait()
|
||||
}()
|
||||
|
||||
return s, nil
|
||||
}
|
||||
|
||||
@@ -347,12 +370,27 @@ type ServerStatus int
|
||||
|
||||
const ( // iota is reset to 0
|
||||
ServerStatusReady ServerStatus = iota
|
||||
ServerStatusNoSlotsAvaialble
|
||||
ServerStatusNoSlotsAvailable
|
||||
ServerStatusLoadingModel
|
||||
ServerStatusNotResponding
|
||||
ServerStatusError
|
||||
)
|
||||
|
||||
func (s ServerStatus) ToString() string {
|
||||
switch s {
|
||||
case ServerStatusReady:
|
||||
return "llm server ready"
|
||||
case ServerStatusNoSlotsAvailable:
|
||||
return "llm busy - no slots available"
|
||||
case ServerStatusLoadingModel:
|
||||
return "llm server loading model"
|
||||
case ServerStatusNotResponding:
|
||||
return "llm server not responding"
|
||||
default:
|
||||
return "llm server error"
|
||||
}
|
||||
}
|
||||
|
||||
type ServerStatusResp struct {
|
||||
Status string `json:"status"`
|
||||
SlotsIdle int `json:"slots_idle"`
|
||||
@@ -360,13 +398,17 @@ type ServerStatusResp struct {
|
||||
Error string `json:"error"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
func (s *llmServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
|
||||
// Fail fast if its exited
|
||||
if s.cmd.ProcessState != nil {
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
if s.cmd.ProcessState.ExitCode() == -1 {
|
||||
// Most likely a signal killed it, log some more details to try to help troubleshoot
|
||||
slog.Warn("llama runner process no longer running", "sys", s.cmd.ProcessState.Sys(), "string", s.cmd.ProcessState.String())
|
||||
}
|
||||
return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
|
||||
}
|
||||
|
||||
@@ -399,7 +441,7 @@ func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error)
|
||||
case "ok":
|
||||
return ServerStatusReady, nil
|
||||
case "no slot available":
|
||||
return ServerStatusNoSlotsAvaialble, nil
|
||||
return ServerStatusNoSlotsAvailable, nil
|
||||
case "loading model":
|
||||
return ServerStatusLoadingModel, nil
|
||||
default:
|
||||
@@ -407,7 +449,30 @@ func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Ping(ctx context.Context) error {
|
||||
// getServerStatusRetry will retry if ServerStatusNoSlotsAvailable is received
|
||||
func (s *llmServer) getServerStatusRetry(ctx context.Context) (ServerStatus, error) {
|
||||
var retries int
|
||||
for {
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return status, err
|
||||
}
|
||||
|
||||
if status == ServerStatusNoSlotsAvailable {
|
||||
if retries >= 10 {
|
||||
return status, fmt.Errorf("no slots available after %d retries", retries)
|
||||
}
|
||||
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
retries++
|
||||
continue
|
||||
}
|
||||
|
||||
return status, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *llmServer) Ping(ctx context.Context) error {
|
||||
_, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
slog.Debug("server unhealthy", "error", err)
|
||||
@@ -416,13 +481,25 @@ func (s *LlamaServer) Ping(ctx context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *LlamaServer) WaitUntilRunning() error {
|
||||
func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
start := time.Now()
|
||||
expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
|
||||
|
||||
slog.Info("waiting for llama runner to start responding")
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
slog.Info("context expired before server started")
|
||||
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
||||
case err := <-s.done:
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
|
||||
default:
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
|
||||
defer cancel()
|
||||
status, err := s.getServerStatus(ctx)
|
||||
@@ -487,7 +564,6 @@ ws ::= ([ \t\n] ws)?
|
||||
`
|
||||
|
||||
const maxBufferSize = 512 * format.KiloByte
|
||||
const maxRetries = 3
|
||||
|
||||
type ImageData struct {
|
||||
Data []byte `json:"data"`
|
||||
@@ -524,7 +600,19 @@ type CompletionResponse struct {
|
||||
EvalDuration time.Duration
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// only allow maximum 10 "context shifts" to avoid infinite generation
|
||||
if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
|
||||
req.Options.NumPredict = 10 * s.options.NumCtx
|
||||
slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
|
||||
}
|
||||
|
||||
request := map[string]any{
|
||||
"prompt": req.Prompt,
|
||||
"stream": true,
|
||||
@@ -551,11 +639,11 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
|
||||
}
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if status != ServerStatusReady {
|
||||
return fmt.Errorf("unexpected server status: %d", status)
|
||||
return fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
if req.Format == "json" {
|
||||
@@ -565,133 +653,113 @@ func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn
|
||||
}
|
||||
}
|
||||
|
||||
retryDelay := 100 * time.Microsecond
|
||||
for retries := 0; retries < maxRetries; retries++ {
|
||||
if retries > 0 {
|
||||
time.Sleep(retryDelay) // wait before retrying
|
||||
retryDelay *= 2 // exponential backoff
|
||||
}
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
enc.SetEscapeHTML(false)
|
||||
|
||||
// Handling JSON marshaling with special characters unescaped.
|
||||
buffer := &bytes.Buffer{}
|
||||
enc := json.NewEncoder(buffer)
|
||||
enc.SetEscapeHTML(false)
|
||||
if err := enc.Encode(request); err != nil {
|
||||
return fmt.Errorf("failed to marshal data: %v", err)
|
||||
}
|
||||
|
||||
if err := enc.Encode(request); err != nil {
|
||||
return fmt.Errorf("failed to marshal data: %v", err)
|
||||
}
|
||||
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
|
||||
serverReq, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating POST request: %v", err)
|
||||
}
|
||||
serverReq.Header.Set("Content-Type", "application/json")
|
||||
|
||||
endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
|
||||
res, err := http.DefaultClient.Do(serverReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("POST predict: %v", err)
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode >= 400 {
|
||||
bodyBytes, err := io.ReadAll(res.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating POST request: %v", err)
|
||||
return fmt.Errorf("failed reading llm error response: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
log.Printf("llm predict error: %s", bodyBytes)
|
||||
return fmt.Errorf("%s", bodyBytes)
|
||||
}
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("POST predict: %v", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
scanner := bufio.NewScanner(res.Body)
|
||||
buf := make([]byte, 0, maxBufferSize)
|
||||
scanner.Buffer(buf, maxBufferSize)
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading llm error response: %w", err)
|
||||
// keep track of the last token generated, this is used to abort if the model starts looping
|
||||
var lastToken string
|
||||
var tokenRepeat int
|
||||
|
||||
for scanner.Scan() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
return ctx.Err()
|
||||
default:
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
log.Printf("llm predict error: %s", bodyBytes)
|
||||
return fmt.Errorf("%s", bodyBytes)
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
buf := make([]byte, 0, maxBufferSize)
|
||||
scanner.Buffer(buf, maxBufferSize)
|
||||
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
||||
if !ok {
|
||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||
}
|
||||
|
||||
retryNeeded := false
|
||||
// keep track of the last token generated, this is used to abort if the model starts looping
|
||||
var lastToken string
|
||||
var tokenRepeat int
|
||||
var c completion
|
||||
if err := json.Unmarshal(evt, &c); err != nil {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||
}
|
||||
|
||||
for scanner.Scan() {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
return ctx.Err()
|
||||
switch {
|
||||
case strings.TrimSpace(c.Content) == lastToken:
|
||||
tokenRepeat++
|
||||
default:
|
||||
line := scanner.Bytes()
|
||||
if len(line) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// try again on slot unavailable
|
||||
if bytes.Contains(line, []byte("slot unavailable")) {
|
||||
retryNeeded = true
|
||||
break
|
||||
}
|
||||
|
||||
evt, ok := bytes.CutPrefix(line, []byte("data: "))
|
||||
if !ok {
|
||||
return fmt.Errorf("error parsing llm response stream: %s", line)
|
||||
}
|
||||
|
||||
var c completion
|
||||
if err := json.Unmarshal(evt, &c); err != nil {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
|
||||
}
|
||||
|
||||
switch {
|
||||
case strings.TrimSpace(c.Content) == lastToken:
|
||||
tokenRepeat++
|
||||
default:
|
||||
lastToken = strings.TrimSpace(c.Content)
|
||||
tokenRepeat = 0
|
||||
}
|
||||
|
||||
// 30 picked as an arbitrary max token repeat limit, modify as needed
|
||||
if tokenRepeat > 30 {
|
||||
slog.Debug("prediction aborted, token repeat limit reached")
|
||||
return ctx.Err()
|
||||
}
|
||||
|
||||
if c.Content != "" {
|
||||
fn(CompletionResponse{
|
||||
Content: c.Content,
|
||||
})
|
||||
}
|
||||
|
||||
if c.Stop {
|
||||
fn(CompletionResponse{
|
||||
Done: true,
|
||||
PromptEvalCount: c.Timings.PromptN,
|
||||
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
|
||||
EvalCount: c.Timings.PredictedN,
|
||||
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
|
||||
})
|
||||
return nil
|
||||
}
|
||||
lastToken = strings.TrimSpace(c.Content)
|
||||
tokenRepeat = 0
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
if strings.Contains(err.Error(), "unexpected EOF") {
|
||||
s.Close()
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
|
||||
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
|
||||
// 30 picked as an arbitrary max token repeat limit, modify as needed
|
||||
if tokenRepeat > 30 {
|
||||
slog.Debug("prediction aborted, token repeat limit reached")
|
||||
return ctx.Err()
|
||||
}
|
||||
return fmt.Errorf("error reading llm response: %v", err)
|
||||
}
|
||||
|
||||
if !retryNeeded {
|
||||
return nil // success
|
||||
if c.Content != "" {
|
||||
fn(CompletionResponse{
|
||||
Content: c.Content,
|
||||
})
|
||||
}
|
||||
|
||||
if c.Stop {
|
||||
fn(CompletionResponse{
|
||||
Done: true,
|
||||
PromptEvalCount: c.Timings.PromptN,
|
||||
PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
|
||||
EvalCount: c.Timings.PredictedN,
|
||||
EvalDuration: parseDurationMs(c.Timings.PredictedMS),
|
||||
})
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here ideally
|
||||
return fmt.Errorf("max retries exceeded")
|
||||
if err := scanner.Err(); err != nil {
|
||||
if strings.Contains(err.Error(), "unexpected EOF") {
|
||||
s.Close()
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
|
||||
}
|
||||
|
||||
return fmt.Errorf("error reading llm response: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
type EmbeddingRequest struct {
|
||||
@@ -702,13 +770,19 @@ type EmbeddingResponse struct {
|
||||
Embedding []float64 `json:"embedding"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
||||
func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
|
||||
if err := s.sem.Acquire(ctx, 1); err != nil {
|
||||
slog.Error("Failed to acquire semaphore", "error", err)
|
||||
return nil, err
|
||||
}
|
||||
defer s.sem.Release(1)
|
||||
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
status, err := s.getServerStatusRetry(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, fmt.Errorf("unexpected server status: %d", status)
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(TokenizeRequest{Content: prompt})
|
||||
@@ -754,13 +828,13 @@ type TokenizeResponse struct {
|
||||
Tokens []int `json:"tokens"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else if status != ServerStatusReady {
|
||||
return nil, fmt.Errorf("unexpected server status: %d", status)
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
|
||||
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(TokenizeRequest{Content: content})
|
||||
@@ -806,13 +880,13 @@ type DetokenizeResponse struct {
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
|
||||
// Make sure the server is ready
|
||||
status, err := s.getServerStatus(ctx)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else if status != ServerStatusReady {
|
||||
return "", fmt.Errorf("unexpected server status: %d", status)
|
||||
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
|
||||
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
|
||||
}
|
||||
|
||||
data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
|
||||
@@ -850,15 +924,25 @@ func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, err
|
||||
return decoded.Content, nil
|
||||
}
|
||||
|
||||
func (s *LlamaServer) Close() error {
|
||||
func (s *llmServer) Close() error {
|
||||
if s.cmd != nil {
|
||||
slog.Debug("stopping llama server")
|
||||
return s.cmd.Process.Kill()
|
||||
if err := s.cmd.Process.Kill(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_ = s.cmd.Wait()
|
||||
|
||||
slog.Debug("llama server stopped")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *llmServer) EstimatedVRAM() uint64 {
|
||||
return s.estimatedVRAM
|
||||
}
|
||||
|
||||
func parseDurationMs(ms float64) time.Duration {
|
||||
dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user