Build multiple CPU variants and pick the best

This reduces the built-in Linux version so it uses no vector extensions,
which enables the resulting builds to run under Rosetta on macOS in
Docker. At runtime it then checks for the actual CPU vector
extensions and loads the best CPU library available.
Daniel Hiltgen
2024-01-07 15:48:05 -08:00
parent 052b33b81b
commit d88c527be3
15 changed files with 202 additions and 66 deletions
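
The runtime half of this change lives in gpu.GetCPUVariant() (not shown in this diff). A minimal sketch of the detection idea, assuming golang.org/x/sys/cpu and using the same variant names as the build directories below (illustrative only, not the project's actual implementation):

package gpu

import "golang.org/x/sys/cpu"

// cpuVariantSketch reports the most capable CPU variant this host can run,
// named to match the build directories (cpu, cpu_avx, cpu_avx2).
// An empty string means only the no-vector LCD build is safe.
func cpuVariantSketch() string {
    if cpu.X86.HasAVX2 {
        return "avx2"
    }
    if cpu.X86.HasAVX {
        return "avx"
    }
    return ""
}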

View File

@@ -1,3 +1,5 @@
//go:build !darwin
package llm
import (
@@ -7,9 +9,9 @@ import (
)
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
// On windows and linux we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
// This ensures we can update the PATH at runtime to get everything loaded
// This should never happen as we'll always try to load one or more cpu dynamic libraries before hitting default
return nil, fmt.Errorf("no available default llm library on windows")
return nil, fmt.Errorf("no available default llm library")
}

View File

@@ -15,12 +15,6 @@ package llm
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
@@ -43,6 +37,8 @@ import (
"github.com/jmorganca/ollama/api"
)
// TODO switch Linux to always be dynamic
// If that works out, then look at the impact of doing the same for Mac
type extServer interface {
LLM
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)

View File

@@ -1,4 +1,4 @@
//go:build !windows
//go:build darwin
package llm
@@ -14,6 +14,8 @@ import (
"github.com/jmorganca/ollama/api"
)
// TODO - explore shifting Darwin to a dynamic loading pattern for consistency with Linux and Windows
type llamaExtServer struct {
api.Options
}

View File

@@ -51,6 +51,16 @@ install() {
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
}
link_server_lib() {
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
-Wl,--no-whole-archive \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
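
link_server_lib folds the static archives into a single libext_server.so; -Wl,--whole-archive is what keeps the otherwise-unreferenced ext_server entry points in the shared object so they can be resolved at load time. A minimal cgo sketch of loading such a library and checking one symbol (Linux-only; the path and symbol name are hypothetical, and this is not the project's actual loader):

package main

/*
#cgo LDFLAGS: -ldl
#include <dlfcn.h>
#include <stdlib.h>
*/
import "C"

import (
    "fmt"
    "unsafe"
)

// loadLib dlopen()s a built variant and verifies an expected entry point
// survived the --whole-archive link.
func loadLib(path, symbol string) error {
    cPath := C.CString(path)
    defer C.free(unsafe.Pointer(cPath))
    handle := C.dlopen(cPath, C.RTLD_NOW|C.RTLD_GLOBAL)
    if handle == nil {
        return fmt.Errorf("dlopen %s: %s", path, C.GoString(C.dlerror()))
    }
    cSym := C.CString(symbol)
    defer C.free(unsafe.Pointer(cSym))
    if C.dlsym(handle, cSym) == nil {
        return fmt.Errorf("dlsym %s: %s", symbol, C.GoString(C.dlerror()))
    }
    return nil
}

func main() {
    // Hypothetical path and symbol, for illustration only.
    if err := loadLib("/tmp/ollama/cpu_avx2/libext_server.so", "llama_server_init"); err != nil {
        fmt.Println(err)
    }
}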

View File

@@ -49,17 +49,68 @@ git_module_setup
apply_patches
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
#
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
echo "Building custom CPU"
build
install
link_server_lib
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
build
install
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
echo "Building LCD CPU"
build
install
link_server_lib
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
touch ${BUILD_DIR}/lib/dummy.so
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx"
echo "Building AVX CPU"
build
install
link_server_lib
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2"
echo "Building AVX2 CPU"
build
install
link_server_lib
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
-Wl,--no-whole-archive \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a
fi
else
echo "Skipping CPU generation step as requested"
fi
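
The per-variant lib/ directories produced above are what the Go side embeds and extracts at startup, and the dummy.so touched earlier exists only so the embed glob always matches even when no dynamic variants get built. A rough sketch of enumerating the embedded variants (the embed pattern and helper are illustrative; the real directives live elsewhere in the llm package):

package llm

import (
    "embed"
    "io/fs"
    "path"
    "strings"
)

// Illustrative pattern; an empty match set is a compile error, which is
// why the build script touches a dummy.so placeholder.
//go:embed llama.cpp/build/linux/*/lib/*.so
var cpuLibEmbed embed.FS

// listVariants walks the embedded tree and reports the variant directory
// names, e.g. cpu, cpu_avx, cpu_avx2.
func listVariants() []string {
    var variants []string
    fs.WalkDir(cpuLibEmbed, ".", func(p string, d fs.DirEntry, err error) error {
        if err != nil || d.IsDir() || !strings.HasSuffix(p, "libext_server.so") {
            return err
        }
        // .../build/linux/<variant>/lib/libext_server.so -> <variant>
        variants = append(variants, path.Base(path.Dir(path.Dir(p))))
        return nil
    })
    return variants
}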

View File

@@ -139,7 +139,22 @@ func Init(workdir string) error {
}
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
for _, shim := range getShims(gpuInfo) {
shims := getShims(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableShims[demandLib]
if libPath == "" {
log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
} else {
log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
shims = []string{libPath}
}
}
for _, shim := range shims {
// TODO - only applies on Darwin (switch to fully dynamic there too...)
if shim == "default" {
break
}
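
A condensed sketch of the override logic this hunk adds (an illustrative helper, not part of the change): a valid OLLAMA_LLM_LIBRARY value replaces auto-detection outright, anything else is logged and ignored.

package llm

import (
    "log"
    "os"
)

// selectShims applies the environment override on top of the detected list.
func selectShims(detected []string, available map[string]string) []string {
    if demand := os.Getenv("OLLAMA_LLM_LIBRARY"); demand != "" {
        if libPath, ok := available[demand]; ok {
            log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demand)
            return []string{libPath}
        }
        log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demand)
    }
    return detected
}

For example, OLLAMA_LLM_LIBRARY=cpu_avx2 forces the AVX2 CPU build even when a GPU library is available.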

View File

@@ -15,14 +15,20 @@ import (
"github.com/jmorganca/ollama/gpu"
)
// Shims names may contain an optional variant separated by '_'
// Library names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableShims = map[string]string{}
const pathComponentCount = 6
// getShims returns an ordered list of shims to try, starting with the best
func getShims(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
exactMatch := ""
shims := []string{}
altShims := []string{}
@@ -30,30 +36,18 @@ func getShims(gpuInfo gpu.GpuInfo) []string {
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// First try to find an exact match
// Try to find an exact match
for cmp := range availableShims {
if requested == cmp {
exactMatch = cmp
shims = append(shims, availableShims[cmp])
shims = []string{availableShims[cmp]}
break
}
}
// Then load alternates and sort the list for consistent load ordering
for cmp := range availableShims {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altShims = append(altShims, cmp)
}
}
slices.Sort(altShims)
for _, altShim := range altShims {
shims = append(shims, availableShims[altShim])
}
// Load up the CPU alternates if not primary requested
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
altShims = []string{}
for cmp := range availableShims {
if strings.Split(cmp, "_")[0] == "cpu" {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altShims = append(altShims, cmp)
}
}
@@ -62,8 +56,30 @@ func getShims(gpuInfo gpu.GpuInfo) []string {
shims = append(shims, availableShims[altShim])
}
}
// default is always last as the lowest common denominator
shims = append(shims, "default")
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableShims {
if cmp == "cpu_"+variant {
shims = append(shims, availableShims[cmp])
break
}
}
} else {
shims = append(shims, availableShims["cpu"])
}
}
// Finally, if we didn't find any matches, LCD CPU FTW
if len(shims) == 0 {
shims = []string{availableShims["cpu"]}
}
return shims
}
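
A worked example of the resulting order, assuming the host CPU reports the avx2 variant (the inventory and paths below are hypothetical):

// Hypothetical inventory for this example:
availableShims = map[string]string{
    "rocm_v5":  "/tmp/ollama/rocm_v5/libext_server.so",
    "rocm_v6":  "/tmp/ollama/rocm_v6/libext_server.so",
    "cpu":      "/tmp/ollama/cpu/libext_server.so",
    "cpu_avx2": "/tmp/ollama/cpu_avx2/libext_server.so",
}
// getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"}) returns the paths for
//   rocm_v6, rocm_v5, cpu_avx2: exact match first, then sorted same-library
//   alternates, then the best CPU variant.
// getShims(gpu.GpuInfo{Library: "cuda"}) returns only the cpu_avx2 path: there
//   is no cuda library in the inventory, so the best CPU variant is all that
//   remains (the plain cpu entry is used only when no variant matches).
// getShims(gpu.GpuInfo{Library: "cpu"}) returns only the cpu path: an exact
//   match on a CPU request gets no extra fallbacks.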
@@ -116,7 +132,8 @@ func nativeInit(workdir string) error {
variants[i] = variant
i++
}
log.Printf("Dynamic LLM variants %v", variants)
log.Printf("Dynamic LLM libraries %v", variants)
log.Printf("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}

View File

@@ -11,13 +11,13 @@ import (
var libEmbed embed.FS
func updatePath(dir string) {
pathComponents := strings.Split(os.Getenv("PATH"), ":")
pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
for _, comp := range pathComponents {
if comp == dir {
return
}
}
newPath := strings.Join(append(pathComponents, dir), ":")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
os.Setenv("LD_LIBRARY_PATH", newPath)
}

View File

@@ -13,9 +13,8 @@ func TestGetShims(t *testing.T) {
}
assert.Equal(t, false, rocmShimPresent())
res := getShims(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 2)
assert.Len(t, res, 1)
assert.Equal(t, availableShims["cpu"], res[0])
assert.Equal(t, "default", res[1])
availableShims = map[string]string{
"rocm_v5": "X_rocm_v5",
@@ -24,28 +23,24 @@ func TestGetShims(t *testing.T) {
}
assert.Equal(t, true, rocmShimPresent())
res = getShims(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 4)
assert.Len(t, res, 3)
assert.Equal(t, availableShims["rocm_v5"], res[0])
assert.Equal(t, availableShims["rocm_v6"], res[1])
assert.Equal(t, availableShims["cpu"], res[2])
assert.Equal(t, "default", res[3])
res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 4)
assert.Len(t, res, 3)
assert.Equal(t, availableShims["rocm_v6"], res[0])
assert.Equal(t, availableShims["rocm_v5"], res[1])
assert.Equal(t, availableShims["cpu"], res[2])
assert.Equal(t, "default", res[3])
res = getShims(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 2)
assert.Len(t, res, 1)
assert.Equal(t, availableShims["cpu"], res[0])
assert.Equal(t, "default", res[1])
res = getShims(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 2)
assert.Equal(t, availableShims["cpu"], res[0])
assert.Equal(t, "default", res[1])
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableShims = map[string]string{
"rocm": "X_rocm_v5",
@@ -53,9 +48,7 @@ func TestGetShims(t *testing.T) {
}
assert.Equal(t, true, rocmShimPresent())
res = getShims(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Len(t, res, 2)
assert.Equal(t, availableShims["rocm"], res[0])
assert.Equal(t, availableShims["cpu"], res[1])
assert.Equal(t, "default", res[2])
}