Merge pull request #1819 from dhiltgen/multi_variant

Support multiple LLM libs; ROCm v5 and v6; Rosetta, AVX, and AVX2 compatible CPU builds
This commit is contained in:
Daniel Hiltgen
2024-01-11 14:00:48 -08:00
committed by GitHub
33 changed files with 822 additions and 626 deletions

View File

@@ -1,4 +1,4 @@
#include "dynamic_shim.h"
#include "dyn_ext_server.h"
#include <stdio.h>
#include <string.h>
@@ -33,7 +33,7 @@ inline char *LOAD_ERR() {
#define UNLOAD_LIBRARY(handle) dlclose(handle)
#endif
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err) {
int i = 0;
struct lookup {
@@ -58,7 +58,7 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
{"", NULL},
};
printf("Lazy loading %s library\n", libPath);
printf("loading %s library\n", libPath);
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
if (!s->handle) {
err->id = -1;
@@ -83,63 +83,63 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
}
}
inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
inline void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err) {
s.llama_server_init(sparams, err);
}
inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
s.llama_server_start();
}
inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
s.llama_server_stop();
}
inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp) {
s.llama_server_completion(json_req, resp);
}
inline void dynamic_shim_llama_server_completion_next_result(
inline void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result) {
s.llama_server_completion_next_result(task_id, result);
}
inline void dynamic_shim_llama_server_completion_cancel(
inline void dyn_llama_server_completion_cancel(
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
s.llama_server_completion_cancel(task_id, err);
}
inline void dynamic_shim_llama_server_release_task_result(
inline void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result) {
s.llama_server_release_task_result(result);
}
inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_tokenize(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_detokenize(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err) {
s.llama_server_embedding(json_req, json_resp, err);
}
inline void dynamic_shim_llama_server_release_json_resp(
inline void dyn_llama_server_release_json_resp(
struct dynamic_llama_server s, char **json_resp) {
s.llama_server_release_json_resp(json_resp);
}

View File

@@ -10,31 +10,25 @@ package llm
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread
#include <stdlib.h>
#include "ext_server.h"
#include "dyn_ext_server.h"
*/
import "C"
import (
"bytes"
"context"
"encoding/json"
"fmt"
"log"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"time"
@@ -43,19 +37,9 @@ import (
"github.com/jmorganca/ollama/api"
)
type extServer interface {
LLM
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
llama_server_start()
llama_server_stop()
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
llama_server_release_task_result(result *C.ext_server_task_result_t)
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
llama_server_release_json_resp(json_resp **C.char)
type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
@@ -80,11 +64,30 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
// Note: current implementation does not support concurrent instantiations
var llm *dynExtServer
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if !mutex.TryLock() {
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()
}
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dyn_init(libPath, &srv, &resp)
if resp.id < 0 {
mutex.Unlock()
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &dynExtServer{
s: srv,
options: opts,
}
log.Printf("Loading Dynamic llm server: %s", library)
var sparams C.ext_server_params_t
sparams.model = C.CString(model)
@@ -133,20 +136,20 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
sparams.n_threads = C.uint(opts.NumThread)
log.Printf("Initializing internal llama server")
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
server.llama_server_init(&sparams, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
log.Printf("Initializing llama server")
initResp := newExtServerResp(128)
defer freeExtServerResp(initResp)
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
if initResp.id < 0 {
return nil, extServerResponseToErr(initResp)
}
log.Printf("Starting internal llama main loop")
server.llama_server_start()
return server, nil
log.Printf("Starting llama main loop")
C.dyn_llama_server_start(llm.s)
return llm, nil
}
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var imageData []ImageData
@@ -204,7 +207,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
req := C.CString(buffer.String())
defer C.free(unsafe.Pointer(req))
llm.llama_server_completion(req, &resp)
C.dyn_llama_server_completion(llm.s, req, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
}
@@ -215,7 +218,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
select {
case <-ctx.Done():
// This handles the request cancellation
llm.llama_server_completion_cancel(resp.id, &resp)
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return extServerResponseToErr(resp)
} else {
@@ -223,13 +226,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
}
default:
var result C.ext_server_task_result_t
llm.llama_server_completion_next_result(resp.id, &result)
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
json_resp := C.GoString(result.json_resp)
llm.llama_server_release_task_result(&result)
C.dyn_llama_server_release_task_result(llm.s, &result)
var p prediction
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
llm.llama_server_completion_cancel(resp.id, &resp)
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
if resp.id < 0 {
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
} else {
@@ -270,7 +273,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
return fmt.Errorf("max retries exceeded")
}
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
data, err := json.Marshal(TokenizeRequest{Content: prompt})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
@@ -280,11 +283,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_tokenize(req, &json_resp, &resp)
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var encoded TokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
@@ -294,7 +297,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
return encoded.Tokens, err
}
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
if len(tokens) == 0 {
return "", nil
}
@@ -308,11 +311,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_detokenize(req, &json_resp, &resp)
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return "", extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var decoded DetokenizeResponse
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
@@ -322,7 +325,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
return decoded.Content, err
}
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
data, err := json.Marshal(TokenizeRequest{Content: input})
if err != nil {
return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -333,11 +336,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
var json_resp *C.char
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
llm.llama_server_embedding(req, &json_resp, &resp)
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
if resp.id < 0 {
return nil, extServerResponseToErr(resp)
}
defer llm.llama_server_release_json_resp(&json_resp)
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
var embedding EmbeddingResponse
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
@@ -347,7 +350,38 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
return embedding.Embedding, nil
}
func close(llm extServer) {
llm.llama_server_stop()
func (llm *dynExtServer) Close() {
C.dyn_llama_server_stop(llm.s)
mutex.Unlock()
}
func updatePath(dir string) {
if runtime.GOOS == "windows" {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
} else {
pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
for _, comp := range pathComponents {
if comp == dir {
return
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
os.Setenv("LD_LIBRARY_PATH", newPath)
}
}

View File

@@ -27,46 +27,46 @@ struct dynamic_llama_server {
void (*llama_server_release_json_resp)(char **json_resp);
};
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
ext_server_resp_t *err);
// No good way to call C function pointers from Go so inline the indirection
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
void dyn_llama_server_init(struct dynamic_llama_server s,
ext_server_params_t *sparams,
ext_server_resp_t *err);
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
void dyn_llama_server_start(struct dynamic_llama_server s);
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
void dyn_llama_server_stop(struct dynamic_llama_server s);
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
void dyn_llama_server_completion(struct dynamic_llama_server s,
const char *json_req,
ext_server_resp_t *resp);
void dynamic_shim_llama_server_completion_next_result(
void dyn_llama_server_completion_next_result(
struct dynamic_llama_server s, const int task_id,
ext_server_task_result_t *result);
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
const int task_id,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_task_result(
void dyn_llama_server_release_task_result(
struct dynamic_llama_server s, ext_server_task_result_t *result);
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
const char *json_req,
char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
void dyn_llama_server_embedding(struct dynamic_llama_server s,
const char *json_req, char **json_resp,
ext_server_resp_t *err);
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
char **json_resp);
#ifdef __cplusplus

View File

@@ -1,4 +1,18 @@
# Extern C Server
This directory contains a thin facade we layer on top of the Llama.cpp server
to expose `extern C` interfaces to access the functionality through direct API calls in-process
This directory contains a thin facade we layer on top of the Llama.cpp server to
expose `extern C` interfaces to access the functionality through direct API
calls in-process. The llama.cpp code uses compile time macros to configure GPU
type along with other settings. During the `go generate ./...` execution, the
build will generate one or more copies of the llama.cpp `extern C` server based
on what GPU libraries are detected to support multiple GPU types as well as CPU
only support. The Ollama go build then embeds these different servers to support
different GPUs and settings at runtime.
If you are making changes to the code in this directory, make sure to disable
caching during your go build to ensure you pick up your changes. A typical
iteration cycle from the top of the source tree looks like:
```
go generate ./... && go build -a .
```

View File

@@ -1,80 +0,0 @@
//go:build !windows
package llm
/*
#include <stdlib.h>
#include "ext_server.h"
*/
import "C"
import (
"context"
"github.com/jmorganca/ollama/api"
)
type llamaExtServer struct {
api.Options
}
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.llama_server_init(sparams, err)
}
func (llm *llamaExtServer) llama_server_start() {
C.llama_server_start()
}
func (llm *llamaExtServer) llama_server_stop() {
C.llama_server_stop()
}
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.llama_server_completion(json_req, resp)
}
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.llama_server_completion_next_result(task_id, resp)
}
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.llama_server_completion_cancel(task_id, err)
}
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.llama_server_release_task_result(result)
}
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_tokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_detokenize(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.llama_server_embedding(json_req, json_resp, err)
}
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.llama_server_release_json_resp(json_resp)
}
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
server := &llamaExtServer{opts}
return newExtServer(server, model, adapters, projectors, opts)
}
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(ctx, llm, pred, fn)
}
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *llamaExtServer) Close() {
close(llm)
}

View File

@@ -1,12 +0,0 @@
package llm
import (
"github.com/jmorganca/ollama/api"
)
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
// This ensures we can update the PATH at runtime to get everything loaded
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts)
}

View File

@@ -51,6 +51,16 @@ install() {
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
}
link_server_lib() {
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,--whole-archive \
${BUILD_DIR}/lib/libext_server.a \
-Wl,--no-whole-archive \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a
}
# Keep the local tree clean after we're done with the build
cleanup() {
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)

View File

@@ -29,4 +29,16 @@ git_module_setup
apply_patches
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
-Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \
${BUILD_DIR}/lib/libcommon.a \
${BUILD_DIR}/lib/libllama.a \
${BUILD_DIR}/lib/libggml_static.a \
-lpthread -ldl -lm -lc++ \
-framework Accelerate \
-framework Foundation \
-framework Metal \
-framework MetalKit \
-framework MetalPerformanceShaders
cleanup

View File

@@ -48,23 +48,76 @@ init_vars
git_module_setup
apply_patches
#
# CPU first for the default library
#
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
# Users building from source can tune the exact flags we pass to cmake for configuring
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
echo "Building custom CPU"
build
install
link_server_lib
else
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
# Note: the following seem to yield slower results than AVX2 - ymmv
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
build
install
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
#
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
#
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
echo "Building LCD CPU"
build
install
link_server_lib
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
touch ${BUILD_DIR}/lib/dummy.so
#
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
# Approximately 400% faster than LCD on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx"
echo "Building AVX CPU"
build
install
link_server_lib
#
# ~2013 CPU Dynamic library
# Approximately 10% faster than AVX on same CPU
#
init_vars
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2"
echo "Building AVX2 CPU"
build
install
link_server_lib
fi
else
echo "Skipping CPU generation step as requested"
fi
if [ -d /usr/local/cuda/lib64/ ]; then
echo "CUDA libraries detected - building dynamic CUDA library"
init_vars
CUDA_MAJOR=$(ls /usr/local/cuda/lib64/libcudart.so.* | head -1 | cut -f3 -d. || true)
if [ -n "${CUDA_MAJOR}" ]; then
CUDA_VARIANT=_v${CUDA_MAJOR}
fi
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}"
CUDA_LIB_DIR=/usr/local/cuda/lib64
build
install
@@ -96,9 +149,12 @@ fi
if [ -d "${ROCM_PATH}" ]; then
echo "ROCm libraries detected - building dynamic ROCm library"
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
fi
init_vars
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}"
build
install
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \

View File

@@ -4,7 +4,7 @@ $ErrorActionPreference = "Stop"
function init_vars {
$script:llamacppDir = "../llama.cpp"
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64")
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
if ($env:CGO_CFLAGS -contains "-g") {
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
@@ -63,16 +63,36 @@ init_vars
git_module_setup
apply_patches
# first build CPU based
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
write-host "Building LCD CPU"
build
install
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx"
write-host "Building AVX CPU"
build
install
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2"
write-host "Building AVX2 CPU"
build
install
# Then build cuda as a dynamically loaded library
# TODO figure out how to detect cuda version
init_vars
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
build
install

View File

@@ -18,8 +18,6 @@ type LLM interface {
Close()
}
var AvailableShims = map[string]string{}
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
if _, err := os.Stat(model); err != nil {
return nil, err
@@ -112,7 +110,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
opts.RopeFrequencyBase = 0.0
opts.RopeFrequencyScale = 0.0
return newLlmServer(library, model, adapters, projectors, opts)
gpuInfo := gpu.GetGPUInfo()
return newLlmServer(gpuInfo, model, adapters, projectors, opts)
}
// Give any native cgo implementations an opportunity to initialize
@@ -120,15 +119,30 @@ func Init(workdir string) error {
return nativeInit(workdir)
}
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
dynLibs := getDynLibs(gpuInfo)
// Check to see if the user has requested a specific library instead of auto-detecting
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
if demandLib != "" {
libPath := availableDynLibs[demandLib]
if libPath == "" {
log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
} else {
log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
dynLibs = []string{libPath}
}
}
err2 := fmt.Errorf("unable to locate suitable llm library")
for _, dynLib := range dynLibs {
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
if err == nil {
return srv, nil
}
log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
log.Printf("Failed to load dynamic library %s %s", dynLib, err)
err2 = err
}
return newDefaultExtServer(model, adapters, projectors, opts)
return nil, err2
}

244
llm/payload_common.go Normal file
View File

@@ -0,0 +1,244 @@
package llm
import (
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"runtime"
"slices"
"strings"
"github.com/jmorganca/ollama/gpu"
)
// Libraries names may contain an optional variant separated by '_'
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
// Any library without a variant is the lowest common denominator
var availableDynLibs = map[string]string{}
const pathComponentCount = 6
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
// Short circuit if we know we're using the default built-in (darwin only)
if gpuInfo.Library == "default" {
return []string{"default"}
}
exactMatch := ""
dynLibs := []string{}
altDynLibs := []string{}
requested := gpuInfo.Library
if gpuInfo.Variant != "" {
requested += "_" + gpuInfo.Variant
}
// Try to find an exact match
for cmp := range availableDynLibs {
if requested == cmp {
exactMatch = cmp
dynLibs = []string{availableDynLibs[cmp]}
break
}
}
// Then for GPUs load alternates and sort the list for consistent load ordering
if gpuInfo.Library != "cpu" {
for cmp := range availableDynLibs {
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
altDynLibs = append(altDynLibs, cmp)
}
}
slices.Sort(altDynLibs)
for _, altDynLib := range altDynLibs {
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
}
}
// Load up the best CPU variant if not primary requested
if gpuInfo.Library != "cpu" {
variant := gpu.GetCPUVariant()
// If no variant, then we fall back to default
// If we have a variant, try that if we find an exact match
// Attempting to run the wrong CPU instructions will panic the
// process
if variant != "" {
for cmp := range availableDynLibs {
if cmp == "cpu_"+variant {
dynLibs = append(dynLibs, availableDynLibs[cmp])
break
}
}
} else {
dynLibs = append(dynLibs, availableDynLibs["cpu"])
}
}
// Finaly, if we didn't find any matches, LCD CPU FTW
if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]}
}
return dynLibs
}
func rocmDynLibPresent() bool {
for dynLibName := range availableDynLibs {
if strings.HasPrefix(dynLibName, "rocm") {
return true
}
}
return false
}
func nativeInit(workdir string) error {
if runtime.GOOS == "darwin" {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
log.Printf("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
}
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
if err != nil {
if err == payloadMissing {
log.Printf("%s", payloadMissing)
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
availableDynLibs[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(availableDynLibs))
i := 0
for variant := range availableDynLibs {
variants[i] = variant
i++
}
log.Printf("Dynamic LLM libraries %v", variants)
log.Printf("Override detection logic by setting OLLAMA_LLM_LIBRARY")
return nil
}
func extractDynamicLibs(workDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
log.Printf("unexpected payload components: %v", pathComps)
continue
}
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return nil, fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(targetDir, filepath.Base(file))
if strings.Contains(destFile, "server") {
libs = append(libs, destFile)
}
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return nil, fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return nil, fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return nil, fmt.Errorf("stat payload %s: %v", file, err)
}
}
return libs, nil
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(file))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
}
func verifyDriverAccess() error {
if runtime.GOOS != "linux" {
return nil
}
// Only check ROCm access if we have the dynamic lib loaded
if rocmDynLibPresent() {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

8
llm/payload_darwin.go Normal file
View File

@@ -0,0 +1,8 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so
var libEmbed embed.FS

8
llm/payload_linux.go Normal file
View File

@@ -0,0 +1,8 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/linux/*/lib/*.so
var libEmbed embed.FS

54
llm/payload_test.go Normal file
View File

@@ -0,0 +1,54 @@
package llm
import (
"testing"
"github.com/jmorganca/ollama/gpu"
"github.com/stretchr/testify/assert"
)
func TestGetDynLibs(t *testing.T) {
availableDynLibs = map[string]string{
"cpu": "X_cpu",
}
assert.Equal(t, false, rocmDynLibPresent())
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
availableDynLibs = map[string]string{
"rocm_v5": "X_rocm_v5",
"rocm_v6": "X_rocm_v6",
"cpu": "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
assert.Equal(t, availableDynLibs["cpu"], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 3)
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
assert.Equal(t, availableDynLibs["cpu"], res[2])
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
assert.Len(t, res, 1)
assert.Equal(t, availableDynLibs["cpu"], res[0])
res = getDynLibs(gpu.GpuInfo{Library: "default"})
assert.Len(t, res, 1)
assert.Equal(t, "default", res[0])
availableDynLibs = map[string]string{
"rocm": "X_rocm_v5",
"cpu": "X_cpu",
}
assert.Equal(t, true, rocmDynLibPresent())
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
assert.Len(t, res, 2)
assert.Equal(t, availableDynLibs["rocm"], res[0])
assert.Equal(t, availableDynLibs["cpu"], res[1])
}

8
llm/payload_windows.go Normal file
View File

@@ -0,0 +1,8 @@
package llm
import (
"embed"
)
//go:embed llama.cpp/build/windows/*/lib/*.dll
var libEmbed embed.FS

View File

@@ -1,71 +0,0 @@
package llm
import (
"embed"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"github.com/jmorganca/ollama/api"
)
//go:embed llama.cpp/ggml-metal.metal
var libEmbed embed.FS
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
// should never happen...
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
}
func nativeInit(workdir string) error {
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
if err != nil {
if err == payloadMissing {
// TODO perhaps consider this a hard failure on arm macs?
log.Printf("ggml-meta.metal payload missing")
return nil
}
return err
}
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
return nil
}
func extractPayloadFiles(workDir, glob string) error {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return payloadMissing
}
for _, file := range files {
srcFile, err := libEmbed.Open(file)
if err != nil {
return fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(workDir, 0o755); err != nil {
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(workDir, filepath.Base(file))
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return fmt.Errorf("stat payload %s: %v", file, err)
}
}
return nil
}

View File

@@ -1,193 +0,0 @@
//go:build !darwin
package llm
/*
#include <stdlib.h>
#include "dynamic_shim.h"
*/
import "C"
import (
"context"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"strings"
"sync"
"unsafe"
"github.com/jmorganca/ollama/api"
)
type shimExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
}
// Note: current implementation does not support concurrent instantiations
var shimMutex sync.Mutex
var llm *shimExtServer
const pathComponentCount = 6
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
}
func (llm *shimExtServer) llama_server_start() {
C.dynamic_shim_llama_server_start(llm.s)
}
func (llm *shimExtServer) llama_server_stop() {
C.dynamic_shim_llama_server_stop(llm.s)
}
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
}
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
}
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
}
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
C.dynamic_shim_llama_server_release_task_result(llm.s, result)
}
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
}
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
}
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
shimMutex.Lock()
defer shimMutex.Unlock()
updatePath(filepath.Dir(library))
libPath := C.CString(library)
defer C.free(unsafe.Pointer(libPath))
resp := newExtServerResp(128)
defer freeExtServerResp(resp)
var srv C.struct_dynamic_llama_server
C.dynamic_shim_init(libPath, &srv, &resp)
if resp.id < 0 {
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
}
llm = &shimExtServer{
s: srv,
options: opts,
}
log.Printf("Loading Dynamic Shim llm server: %s", library)
return newExtServer(llm, model, adapters, projectors, opts)
}
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
return predict(ctx, llm, pred, fn)
}
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
return encode(llm, ctx, prompt)
}
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
return decode(llm, ctx, tokens)
}
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
return embedding(llm, ctx, input)
}
func (llm *shimExtServer) Close() {
close(llm)
}
func nativeInit(workdir string) error {
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
if err != nil {
if err == payloadMissing {
log.Printf("%s", payloadMissing)
return nil
}
return err
}
for _, lib := range libs {
// The last dir component is the variant name
variant := filepath.Base(filepath.Dir(lib))
AvailableShims[variant] = lib
}
if err := verifyDriverAccess(); err != nil {
return err
}
// Report which dynamic libraries we have loaded to assist troubleshooting
variants := make([]string, len(AvailableShims))
i := 0
for variant := range AvailableShims {
variants[i] = variant
i++
}
log.Printf("Dynamic LLM variants %v", variants)
return nil
}
func extractDynamicLibs(workDir, glob string) ([]string, error) {
files, err := fs.Glob(libEmbed, glob)
if err != nil || len(files) == 0 {
return nil, payloadMissing
}
libs := []string{}
for _, file := range files {
pathComps := strings.Split(file, "/")
if len(pathComps) != pathComponentCount {
log.Printf("unexpected payload components: %v", pathComps)
continue
}
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
// Include the variant in the path to avoid conflicts between multiple server libs
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
srcFile, err := libEmbed.Open(file)
if err != nil {
return nil, fmt.Errorf("read payload %s: %v", file, err)
}
defer srcFile.Close()
if err := os.MkdirAll(targetDir, 0o755); err != nil {
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
}
destFile := filepath.Join(targetDir, filepath.Base(file))
if strings.Contains(destFile, "server") {
libs = append(libs, destFile)
}
_, err = os.Stat(destFile)
switch {
case errors.Is(err, os.ErrNotExist):
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
if err != nil {
return nil, fmt.Errorf("write payload %s: %v", file, err)
}
defer destFile.Close()
if _, err := io.Copy(destFile, srcFile); err != nil {
return nil, fmt.Errorf("copy payload %s: %v", file, err)
}
case err != nil:
return nil, fmt.Errorf("stat payload %s: %v", file, err)
}
}
return libs, nil
}

View File

@@ -1,46 +0,0 @@
package llm
import (
"embed"
"errors"
"fmt"
"io/fs"
"log"
"os"
"strings"
)
//go:embed llama.cpp/build/*/*/lib/*.so
var libEmbed embed.FS
func updatePath(dir string) {
pathComponents := strings.Split(os.Getenv("PATH"), ":")
for _, comp := range pathComponents {
if comp == dir {
return
}
}
newPath := strings.Join(append(pathComponents, dir), ":")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
}
func verifyDriverAccess() error {
// Only check ROCm access if we have the dynamic lib loaded
if _, rocmPresent := AvailableShims["rocm"]; rocmPresent {
// Verify we have permissions - either running as root, or we have group access to the driver
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
} else if errors.Is(err, fs.ErrNotExist) {
// expected behavior without a radeon card
return nil
}
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
}
fd.Close()
}
return nil
}

View File

@@ -1,36 +0,0 @@
package llm
import (
"embed"
"log"
"os"
"path/filepath"
"strings"
)
//go:embed llama.cpp/build/windows/*/lib/*.dll
var libEmbed embed.FS
func updatePath(dir string) {
tmpDir := filepath.Dir(dir)
pathComponents := strings.Split(os.Getenv("PATH"), ";")
i := 0
for _, comp := range pathComponents {
if strings.EqualFold(comp, dir) {
return
}
// Remove any other prior paths to our temp dir
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
pathComponents[i] = comp
i++
}
}
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
log.Printf("Updating PATH to %s", newPath)
os.Setenv("PATH", newPath)
}
func verifyDriverAccess() error {
// TODO if applicable
return nil
}