mirror of
https://github.com/dogkeeper886/ollama37.git
synced 2025-12-11 08:17:03 +00:00
Merge pull request #1819 from dhiltgen/multi_variant
Support multiple LLM libs; ROCm v5 and v6; Rosetta, AVX, and AVX2 compatible CPU builds
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
#include "dynamic_shim.h"
|
||||
#include "dyn_ext_server.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@@ -33,7 +33,7 @@ inline char *LOAD_ERR() {
|
||||
#define UNLOAD_LIBRARY(handle) dlclose(handle)
|
||||
#endif
|
||||
|
||||
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
ext_server_resp_t *err) {
|
||||
int i = 0;
|
||||
struct lookup {
|
||||
@@ -58,7 +58,7 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
{"", NULL},
|
||||
};
|
||||
|
||||
printf("Lazy loading %s library\n", libPath);
|
||||
printf("loading %s library\n", libPath);
|
||||
s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
|
||||
if (!s->handle) {
|
||||
err->id = -1;
|
||||
@@ -83,63 +83,63 @@ void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
}
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||
inline void dyn_llama_server_init(struct dynamic_llama_server s,
|
||||
ext_server_params_t *sparams,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_init(sparams, err);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
|
||||
inline void dyn_llama_server_start(struct dynamic_llama_server s) {
|
||||
s.llama_server_start();
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
|
||||
inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
|
||||
s.llama_server_stop();
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||
inline void dyn_llama_server_completion(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
ext_server_resp_t *resp) {
|
||||
s.llama_server_completion(json_req, resp);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_completion_next_result(
|
||||
inline void dyn_llama_server_completion_next_result(
|
||||
struct dynamic_llama_server s, const int task_id,
|
||||
ext_server_task_result_t *result) {
|
||||
s.llama_server_completion_next_result(task_id, result);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_completion_cancel(
|
||||
inline void dyn_llama_server_completion_cancel(
|
||||
struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
|
||||
s.llama_server_completion_cancel(task_id, err);
|
||||
}
|
||||
inline void dynamic_shim_llama_server_release_task_result(
|
||||
inline void dyn_llama_server_release_task_result(
|
||||
struct dynamic_llama_server s, ext_server_task_result_t *result) {
|
||||
s.llama_server_release_task_result(result);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_tokenize(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_detokenize(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||
inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err) {
|
||||
s.llama_server_embedding(json_req, json_resp, err);
|
||||
}
|
||||
|
||||
inline void dynamic_shim_llama_server_release_json_resp(
|
||||
inline void dyn_llama_server_release_json_resp(
|
||||
struct dynamic_llama_server s, char **json_resp) {
|
||||
s.llama_server_release_json_resp(json_resp);
|
||||
}
|
||||
@@ -10,31 +10,25 @@ package llm
|
||||
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
|
||||
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
|
||||
#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
|
||||
#cgo linux CFLAGS: -D_GNU_SOURCE
|
||||
#cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
|
||||
#cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libext_server.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libcommon.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libllama.a
|
||||
#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/build/linux/cpu/lib/libggml_static.a
|
||||
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
|
||||
#cgo linux windows LDFLAGS: -lpthread
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "ext_server.h"
|
||||
#include "dyn_ext_server.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -43,19 +37,9 @@ import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type extServer interface {
|
||||
LLM
|
||||
llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
|
||||
llama_server_start()
|
||||
llama_server_stop()
|
||||
llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
|
||||
llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
|
||||
llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
|
||||
llama_server_release_task_result(result *C.ext_server_task_result_t)
|
||||
llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
|
||||
llama_server_release_json_resp(json_resp **C.char)
|
||||
type dynExtServer struct {
|
||||
s C.struct_dynamic_llama_server
|
||||
options api.Options
|
||||
}
|
||||
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
@@ -80,11 +64,30 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
|
||||
return fmt.Errorf(C.GoString(resp.msg))
|
||||
}
|
||||
|
||||
func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
var llm *dynExtServer
|
||||
|
||||
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
if !mutex.TryLock() {
|
||||
log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
|
||||
mutex.Lock()
|
||||
}
|
||||
updatePath(filepath.Dir(library))
|
||||
libPath := C.CString(library)
|
||||
defer C.free(unsafe.Pointer(libPath))
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var srv C.struct_dynamic_llama_server
|
||||
C.dyn_init(libPath, &srv, &resp)
|
||||
if resp.id < 0 {
|
||||
mutex.Unlock()
|
||||
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
|
||||
}
|
||||
llm = &dynExtServer{
|
||||
s: srv,
|
||||
options: opts,
|
||||
}
|
||||
log.Printf("Loading Dynamic llm server: %s", library)
|
||||
|
||||
var sparams C.ext_server_params_t
|
||||
sparams.model = C.CString(model)
|
||||
@@ -133,20 +136,20 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
|
||||
|
||||
sparams.n_threads = C.uint(opts.NumThread)
|
||||
|
||||
log.Printf("Initializing internal llama server")
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
server.llama_server_init(&sparams, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
log.Printf("Initializing llama server")
|
||||
initResp := newExtServerResp(128)
|
||||
defer freeExtServerResp(initResp)
|
||||
C.dyn_llama_server_init(llm.s, &sparams, &initResp)
|
||||
if initResp.id < 0 {
|
||||
return nil, extServerResponseToErr(initResp)
|
||||
}
|
||||
|
||||
log.Printf("Starting internal llama main loop")
|
||||
server.llama_server_start()
|
||||
return server, nil
|
||||
log.Printf("Starting llama main loop")
|
||||
C.dyn_llama_server_start(llm.s)
|
||||
return llm, nil
|
||||
}
|
||||
|
||||
func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
|
||||
func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var imageData []ImageData
|
||||
@@ -204,7 +207,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
|
||||
req := C.CString(buffer.String())
|
||||
defer C.free(unsafe.Pointer(req))
|
||||
|
||||
llm.llama_server_completion(req, &resp)
|
||||
C.dyn_llama_server_completion(llm.s, req, &resp)
|
||||
if resp.id < 0 {
|
||||
return extServerResponseToErr(resp)
|
||||
}
|
||||
@@ -215,7 +218,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
// This handles the request cancellation
|
||||
llm.llama_server_completion_cancel(resp.id, &resp)
|
||||
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
|
||||
if resp.id < 0 {
|
||||
return extServerResponseToErr(resp)
|
||||
} else {
|
||||
@@ -223,13 +226,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
|
||||
}
|
||||
default:
|
||||
var result C.ext_server_task_result_t
|
||||
llm.llama_server_completion_next_result(resp.id, &result)
|
||||
C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
|
||||
json_resp := C.GoString(result.json_resp)
|
||||
llm.llama_server_release_task_result(&result)
|
||||
C.dyn_llama_server_release_task_result(llm.s, &result)
|
||||
|
||||
var p prediction
|
||||
if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
|
||||
llm.llama_server_completion_cancel(resp.id, &resp)
|
||||
C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
|
||||
if resp.id < 0 {
|
||||
return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
|
||||
} else {
|
||||
@@ -270,7 +273,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
|
||||
return fmt.Errorf("max retries exceeded")
|
||||
}
|
||||
|
||||
func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
|
||||
func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
data, err := json.Marshal(TokenizeRequest{Content: prompt})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("marshaling encode data: %w", err)
|
||||
@@ -280,11 +283,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
llm.llama_server_tokenize(req, &json_resp, &resp)
|
||||
C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
}
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
|
||||
var encoded TokenizeResponse
|
||||
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
|
||||
@@ -294,7 +297,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
|
||||
return encoded.Tokens, err
|
||||
}
|
||||
|
||||
func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
|
||||
func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
if len(tokens) == 0 {
|
||||
return "", nil
|
||||
}
|
||||
@@ -308,11 +311,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
llm.llama_server_detokenize(req, &json_resp, &resp)
|
||||
C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return "", extServerResponseToErr(resp)
|
||||
}
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
|
||||
var decoded DetokenizeResponse
|
||||
if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
|
||||
@@ -322,7 +325,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
|
||||
return decoded.Content, err
|
||||
}
|
||||
|
||||
func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
|
||||
func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
data, err := json.Marshal(TokenizeRequest{Content: input})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error marshaling embed data: %w", err)
|
||||
@@ -333,11 +336,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
|
||||
var json_resp *C.char
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
llm.llama_server_embedding(req, &json_resp, &resp)
|
||||
C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, extServerResponseToErr(resp)
|
||||
}
|
||||
defer llm.llama_server_release_json_resp(&json_resp)
|
||||
defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
|
||||
|
||||
var embedding EmbeddingResponse
|
||||
if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
|
||||
@@ -347,7 +350,38 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
|
||||
return embedding.Embedding, nil
|
||||
}
|
||||
|
||||
func close(llm extServer) {
|
||||
llm.llama_server_stop()
|
||||
func (llm *dynExtServer) Close() {
|
||||
C.dyn_llama_server_stop(llm.s)
|
||||
mutex.Unlock()
|
||||
}
|
||||
|
||||
func updatePath(dir string) {
|
||||
if runtime.GOOS == "windows" {
|
||||
tmpDir := filepath.Dir(dir)
|
||||
pathComponents := strings.Split(os.Getenv("PATH"), ";")
|
||||
i := 0
|
||||
for _, comp := range pathComponents {
|
||||
if strings.EqualFold(comp, dir) {
|
||||
return
|
||||
}
|
||||
// Remove any other prior paths to our temp dir
|
||||
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
|
||||
pathComponents[i] = comp
|
||||
i++
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
|
||||
log.Printf("Updating PATH to %s", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
} else {
|
||||
pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
|
||||
for _, comp := range pathComponents {
|
||||
if comp == dir {
|
||||
return
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
|
||||
log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
|
||||
os.Setenv("LD_LIBRARY_PATH", newPath)
|
||||
}
|
||||
}
|
||||
@@ -27,46 +27,46 @@ struct dynamic_llama_server {
|
||||
void (*llama_server_release_json_resp)(char **json_resp);
|
||||
};
|
||||
|
||||
void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
void dyn_init(const char *libPath, struct dynamic_llama_server *s,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
// No good way to call C function pointers from Go so inline the indirection
|
||||
void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_init(struct dynamic_llama_server s,
|
||||
ext_server_params_t *sparams,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
|
||||
void dyn_llama_server_start(struct dynamic_llama_server s);
|
||||
|
||||
void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
|
||||
void dyn_llama_server_stop(struct dynamic_llama_server s);
|
||||
|
||||
void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_completion(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
ext_server_resp_t *resp);
|
||||
|
||||
void dynamic_shim_llama_server_completion_next_result(
|
||||
void dyn_llama_server_completion_next_result(
|
||||
struct dynamic_llama_server s, const int task_id,
|
||||
ext_server_task_result_t *result);
|
||||
|
||||
void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
|
||||
const int task_id,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dynamic_shim_llama_server_release_task_result(
|
||||
void dyn_llama_server_release_task_result(
|
||||
struct dynamic_llama_server s, ext_server_task_result_t *result);
|
||||
|
||||
void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_tokenize(struct dynamic_llama_server s,
|
||||
const char *json_req, char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_detokenize(struct dynamic_llama_server s,
|
||||
const char *json_req,
|
||||
char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
|
||||
void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_embedding(struct dynamic_llama_server s,
|
||||
const char *json_req, char **json_resp,
|
||||
ext_server_resp_t *err);
|
||||
void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
|
||||
void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
|
||||
char **json_resp);
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -1,4 +1,18 @@
|
||||
# Extern C Server
|
||||
|
||||
This directory contains a thin facade we layer on top of the Llama.cpp server
|
||||
to expose `extern C` interfaces to access the functionality through direct API calls in-process
|
||||
This directory contains a thin facade we layer on top of the Llama.cpp server to
|
||||
expose `extern C` interfaces to access the functionality through direct API
|
||||
calls in-process. The llama.cpp code uses compile time macros to configure GPU
|
||||
type along with other settings. During the `go generate ./...` execution, the
|
||||
build will generate one or more copies of the llama.cpp `extern C` server based
|
||||
on what GPU libraries are detected to support multiple GPU types as well as CPU
|
||||
only support. The Ollama go build then embeds these different servers to support
|
||||
different GPUs and settings at runtime.
|
||||
|
||||
If you are making changes to the code in this directory, make sure to disable
|
||||
caching during your go build to ensure you pick up your changes. A typical
|
||||
iteration cycle from the top of the source tree looks like:
|
||||
|
||||
```
|
||||
go generate ./... && go build -a .
|
||||
```
|
||||
@@ -1,80 +0,0 @@
|
||||
//go:build !windows
|
||||
|
||||
package llm
|
||||
|
||||
/*
|
||||
#include <stdlib.h>
|
||||
#include "ext_server.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type llamaExtServer struct {
|
||||
api.Options
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||
C.llama_server_init(sparams, err)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_start() {
|
||||
C.llama_server_start()
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_stop() {
|
||||
C.llama_server_stop()
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
||||
C.llama_server_completion(json_req, resp)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
||||
C.llama_server_completion_next_result(task_id, resp)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
||||
C.llama_server_completion_cancel(task_id, err)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
||||
C.llama_server_release_task_result(result)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_tokenize(json_req, json_resp, err)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_detokenize(json_req, json_resp, err)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.llama_server_embedding(json_req, json_resp, err)
|
||||
}
|
||||
func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.llama_server_release_json_resp(json_resp)
|
||||
}
|
||||
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
server := &llamaExtServer{opts}
|
||||
return newExtServer(server, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
return encode(llm, ctx, prompt)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
return decode(llm, ctx, tokens)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
return embedding(llm, ctx, input)
|
||||
}
|
||||
|
||||
func (llm *llamaExtServer) Close() {
|
||||
close(llm)
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// On windows we always load the llama.cpp libraries dynamically to avoid startup DLL dependencies
|
||||
// This ensures we can update the PATH at runtime to get everything loaded
|
||||
|
||||
return newDynamicShimExtServer(AvailableShims["cpu"], model, adapters, projectors, opts)
|
||||
}
|
||||
@@ -51,6 +51,16 @@ install() {
|
||||
cp ${BUILD_DIR}/libggml_static.a ${BUILD_DIR}/lib
|
||||
}
|
||||
|
||||
link_server_lib() {
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
||||
-Wl,--whole-archive \
|
||||
${BUILD_DIR}/lib/libext_server.a \
|
||||
-Wl,--no-whole-archive \
|
||||
${BUILD_DIR}/lib/libcommon.a \
|
||||
${BUILD_DIR}/lib/libllama.a
|
||||
|
||||
}
|
||||
|
||||
# Keep the local tree clean after we're done with the build
|
||||
cleanup() {
|
||||
(cd ${LLAMACPP_DIR}/examples/server/ && git checkout CMakeLists.txt server.cpp)
|
||||
|
||||
@@ -29,4 +29,16 @@ git_module_setup
|
||||
apply_patches
|
||||
build
|
||||
install
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
||||
-Wl,-force_load ${BUILD_DIR}/lib/libext_server.a \
|
||||
${BUILD_DIR}/lib/libcommon.a \
|
||||
${BUILD_DIR}/lib/libllama.a \
|
||||
${BUILD_DIR}/lib/libggml_static.a \
|
||||
-lpthread -ldl -lm -lc++ \
|
||||
-framework Accelerate \
|
||||
-framework Foundation \
|
||||
-framework Metal \
|
||||
-framework MetalKit \
|
||||
-framework MetalPerformanceShaders
|
||||
|
||||
cleanup
|
||||
|
||||
@@ -48,23 +48,76 @@ init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
#
|
||||
# CPU first for the default library
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
|
||||
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
|
||||
# Users building from source can tune the exact flags we pass to cmake for configuring
|
||||
# llama.cpp, and we'll build only 1 CPU variant in that case as the default.
|
||||
if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
|
||||
echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
|
||||
CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
|
||||
echo "Building custom CPU"
|
||||
build
|
||||
install
|
||||
link_server_lib
|
||||
else
|
||||
# Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
# Note: the following seem to yield slower results than AVX2 - ymmv
|
||||
# -DLLAMA_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
|
||||
# -DLLAMA_AVX512_VBMI -- 2018 Intel Cannon Lake
|
||||
# -DLLAMA_AVX512_VNNI -- 2021 Intel Alder Lake
|
||||
|
||||
build
|
||||
install
|
||||
COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off"
|
||||
#
|
||||
# CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
|
||||
#
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu"
|
||||
echo "Building LCD CPU"
|
||||
build
|
||||
install
|
||||
link_server_lib
|
||||
|
||||
# Placeholder to keep go embed happy until we start building dynamic CPU lib variants
|
||||
touch ${BUILD_DIR}/lib/dummy.so
|
||||
#
|
||||
# ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
|
||||
# Approximately 400% faster than LCD on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx"
|
||||
echo "Building AVX CPU"
|
||||
build
|
||||
install
|
||||
link_server_lib
|
||||
|
||||
#
|
||||
# ~2013 CPU Dynamic library
|
||||
# Approximately 10% faster than AVX on same CPU
|
||||
#
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cpu_avx2"
|
||||
echo "Building AVX2 CPU"
|
||||
build
|
||||
install
|
||||
link_server_lib
|
||||
fi
|
||||
else
|
||||
echo "Skipping CPU generation step as requested"
|
||||
fi
|
||||
|
||||
if [ -d /usr/local/cuda/lib64/ ]; then
|
||||
echo "CUDA libraries detected - building dynamic CUDA library"
|
||||
init_vars
|
||||
CUDA_MAJOR=$(ls /usr/local/cuda/lib64/libcudart.so.* | head -1 | cut -f3 -d. || true)
|
||||
if [ -n "${CUDA_MAJOR}" ]; then
|
||||
CUDA_VARIANT=_v${CUDA_MAJOR}
|
||||
fi
|
||||
CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/cuda${CUDA_VARIANT}"
|
||||
CUDA_LIB_DIR=/usr/local/cuda/lib64
|
||||
build
|
||||
install
|
||||
@@ -96,9 +149,12 @@ fi
|
||||
|
||||
if [ -d "${ROCM_PATH}" ]; then
|
||||
echo "ROCm libraries detected - building dynamic ROCm library"
|
||||
if [ -f ${ROCM_PATH}/lib/librocm_smi64.so.? ]; then
|
||||
ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocm_smi64.so.? | cut -f3 -d. || true)
|
||||
fi
|
||||
init_vars
|
||||
CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm"
|
||||
BUILD_DIR="${LLAMACPP_DIR}/build/linux/rocm${ROCM_VARIANT}"
|
||||
build
|
||||
install
|
||||
gcc -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.so \
|
||||
|
||||
@@ -4,7 +4,7 @@ $ErrorActionPreference = "Stop"
|
||||
|
||||
function init_vars {
|
||||
$script:llamacppDir = "../llama.cpp"
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-DLLAMA_F16C=off", "-DLLAMA_FMA=off", "-DLLAMA_AVX512=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX=on", "-A","x64")
|
||||
$script:cmakeDefs = @("-DBUILD_SHARED_LIBS=on", "-DLLAMA_NATIVE=off", "-A","x64")
|
||||
$script:cmakeTargets = @("ggml", "ggml_static", "llama", "build_info", "common", "ext_server_shared", "llava_static")
|
||||
if ($env:CGO_CFLAGS -contains "-g") {
|
||||
$script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on")
|
||||
@@ -63,16 +63,36 @@ init_vars
|
||||
git_module_setup
|
||||
apply_patches
|
||||
|
||||
# first build CPU based
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
|
||||
# -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
|
||||
# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
|
||||
# -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
|
||||
# -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
|
||||
|
||||
$script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on", "-DLLAMA_NATIVE=off")
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu"
|
||||
write-host "Building LCD CPU"
|
||||
build
|
||||
install
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx"
|
||||
write-host "Building AVX CPU"
|
||||
build
|
||||
install
|
||||
|
||||
$script:cmakeDefs = $script:commonCpuDefs + @("-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cpu_avx2"
|
||||
write-host "Building AVX2 CPU"
|
||||
build
|
||||
install
|
||||
|
||||
# Then build cuda as a dynamically loaded library
|
||||
# TODO figure out how to detect cuda version
|
||||
init_vars
|
||||
$script:buildDir="${script:llamacppDir}/build/windows/cuda"
|
||||
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON")
|
||||
$script:cmakeDefs += @("-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on")
|
||||
build
|
||||
install
|
||||
|
||||
|
||||
32
llm/llm.go
32
llm/llm.go
@@ -18,8 +18,6 @@ type LLM interface {
|
||||
Close()
|
||||
}
|
||||
|
||||
var AvailableShims = map[string]string{}
|
||||
|
||||
func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
if _, err := os.Stat(model); err != nil {
|
||||
return nil, err
|
||||
@@ -112,7 +110,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
|
||||
|
||||
opts.RopeFrequencyBase = 0.0
|
||||
opts.RopeFrequencyScale = 0.0
|
||||
return newLlmServer(library, model, adapters, projectors, opts)
|
||||
gpuInfo := gpu.GetGPUInfo()
|
||||
return newLlmServer(gpuInfo, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
// Give any native cgo implementations an opportunity to initialize
|
||||
@@ -120,15 +119,30 @@ func Init(workdir string) error {
|
||||
return nativeInit(workdir)
|
||||
}
|
||||
|
||||
func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
|
||||
srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
|
||||
func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
|
||||
dynLibs := getDynLibs(gpuInfo)
|
||||
|
||||
// Check to see if the user has requested a specific library instead of auto-detecting
|
||||
demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
|
||||
if demandLib != "" {
|
||||
libPath := availableDynLibs[demandLib]
|
||||
if libPath == "" {
|
||||
log.Printf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib)
|
||||
} else {
|
||||
log.Printf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib)
|
||||
dynLibs = []string{libPath}
|
||||
}
|
||||
}
|
||||
|
||||
err2 := fmt.Errorf("unable to locate suitable llm library")
|
||||
for _, dynLib := range dynLibs {
|
||||
srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
|
||||
if err == nil {
|
||||
return srv, nil
|
||||
}
|
||||
log.Printf("Failed to load dynamic library %s - falling back to CPU mode %s", library, err)
|
||||
// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
|
||||
log.Printf("Failed to load dynamic library %s %s", dynLib, err)
|
||||
err2 = err
|
||||
}
|
||||
|
||||
return newDefaultExtServer(model, adapters, projectors, opts)
|
||||
return nil, err2
|
||||
}
|
||||
|
||||
244
llm/payload_common.go
Normal file
244
llm/payload_common.go
Normal file
@@ -0,0 +1,244 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
)
|
||||
|
||||
// Libraries names may contain an optional variant separated by '_'
|
||||
// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
|
||||
// Any library without a variant is the lowest common denominator
|
||||
var availableDynLibs = map[string]string{}
|
||||
|
||||
const pathComponentCount = 6
|
||||
|
||||
// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
|
||||
func getDynLibs(gpuInfo gpu.GpuInfo) []string {
|
||||
// Short circuit if we know we're using the default built-in (darwin only)
|
||||
if gpuInfo.Library == "default" {
|
||||
return []string{"default"}
|
||||
}
|
||||
|
||||
exactMatch := ""
|
||||
dynLibs := []string{}
|
||||
altDynLibs := []string{}
|
||||
requested := gpuInfo.Library
|
||||
if gpuInfo.Variant != "" {
|
||||
requested += "_" + gpuInfo.Variant
|
||||
}
|
||||
// Try to find an exact match
|
||||
for cmp := range availableDynLibs {
|
||||
if requested == cmp {
|
||||
exactMatch = cmp
|
||||
dynLibs = []string{availableDynLibs[cmp]}
|
||||
break
|
||||
}
|
||||
}
|
||||
// Then for GPUs load alternates and sort the list for consistent load ordering
|
||||
if gpuInfo.Library != "cpu" {
|
||||
for cmp := range availableDynLibs {
|
||||
if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
|
||||
altDynLibs = append(altDynLibs, cmp)
|
||||
}
|
||||
}
|
||||
slices.Sort(altDynLibs)
|
||||
for _, altDynLib := range altDynLibs {
|
||||
dynLibs = append(dynLibs, availableDynLibs[altDynLib])
|
||||
}
|
||||
}
|
||||
|
||||
// Load up the best CPU variant if not primary requested
|
||||
if gpuInfo.Library != "cpu" {
|
||||
variant := gpu.GetCPUVariant()
|
||||
// If no variant, then we fall back to default
|
||||
// If we have a variant, try that if we find an exact match
|
||||
// Attempting to run the wrong CPU instructions will panic the
|
||||
// process
|
||||
if variant != "" {
|
||||
for cmp := range availableDynLibs {
|
||||
if cmp == "cpu_"+variant {
|
||||
dynLibs = append(dynLibs, availableDynLibs[cmp])
|
||||
break
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dynLibs = append(dynLibs, availableDynLibs["cpu"])
|
||||
}
|
||||
}
|
||||
|
||||
// Finaly, if we didn't find any matches, LCD CPU FTW
|
||||
if len(dynLibs) == 0 {
|
||||
dynLibs = []string{availableDynLibs["cpu"]}
|
||||
}
|
||||
return dynLibs
|
||||
}
|
||||
|
||||
func rocmDynLibPresent() bool {
|
||||
for dynLibName := range availableDynLibs {
|
||||
if strings.HasPrefix(dynLibName, "rocm") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
if runtime.GOOS == "darwin" {
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
// TODO perhaps consider this a hard failure on arm macs?
|
||||
log.Printf("ggml-meta.metal payload missing")
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
|
||||
}
|
||||
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
log.Printf("%s", payloadMissing)
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, lib := range libs {
|
||||
// The last dir component is the variant name
|
||||
variant := filepath.Base(filepath.Dir(lib))
|
||||
availableDynLibs[variant] = lib
|
||||
}
|
||||
|
||||
if err := verifyDriverAccess(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Report which dynamic libraries we have loaded to assist troubleshooting
|
||||
variants := make([]string, len(availableDynLibs))
|
||||
i := 0
|
||||
for variant := range availableDynLibs {
|
||||
variants[i] = variant
|
||||
i++
|
||||
}
|
||||
log.Printf("Dynamic LLM libraries %v", variants)
|
||||
log.Printf("Override detection logic by setting OLLAMA_LLM_LIBRARY")
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, payloadMissing
|
||||
}
|
||||
libs := []string{}
|
||||
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != pathComponentCount {
|
||||
log.Printf("unexpected payload components: %v", pathComps)
|
||||
continue
|
||||
}
|
||||
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
|
||||
// Include the variant in the path to avoid conflicts between multiple server libs
|
||||
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(targetDir, filepath.Base(file))
|
||||
if strings.Contains(destFile, "server") {
|
||||
libs = append(libs, destFile)
|
||||
}
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return nil, fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return nil, fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return libs, nil
|
||||
}
|
||||
|
||||
func extractPayloadFiles(workDir, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return payloadMissing
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(workDir, filepath.Base(file))
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func verifyDriverAccess() error {
|
||||
if runtime.GOOS != "linux" {
|
||||
return nil
|
||||
}
|
||||
// Only check ROCm access if we have the dynamic lib loaded
|
||||
if rocmDynLibPresent() {
|
||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrPermission) {
|
||||
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
||||
} else if errors.Is(err, fs.ErrNotExist) {
|
||||
// expected behavior without a radeon card
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||
}
|
||||
fd.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
8
llm/payload_darwin.go
Normal file
8
llm/payload_darwin.go
Normal file
@@ -0,0 +1,8 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/*/lib/*.so
|
||||
var libEmbed embed.FS
|
||||
8
llm/payload_linux.go
Normal file
8
llm/payload_linux.go
Normal file
@@ -0,0 +1,8 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/linux/*/lib/*.so
|
||||
var libEmbed embed.FS
|
||||
54
llm/payload_test.go
Normal file
54
llm/payload_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/jmorganca/ollama/gpu"
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestGetDynLibs(t *testing.T) {
|
||||
availableDynLibs = map[string]string{
|
||||
"cpu": "X_cpu",
|
||||
}
|
||||
assert.Equal(t, false, rocmDynLibPresent())
|
||||
res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[0])
|
||||
|
||||
availableDynLibs = map[string]string{
|
||||
"rocm_v5": "X_rocm_v5",
|
||||
"rocm_v6": "X_rocm_v6",
|
||||
"cpu": "X_cpu",
|
||||
}
|
||||
assert.Equal(t, true, rocmDynLibPresent())
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
|
||||
assert.Len(t, res, 3)
|
||||
assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
|
||||
assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[2])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
|
||||
assert.Len(t, res, 3)
|
||||
assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
|
||||
assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[2])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[0])
|
||||
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "default"})
|
||||
assert.Len(t, res, 1)
|
||||
assert.Equal(t, "default", res[0])
|
||||
|
||||
availableDynLibs = map[string]string{
|
||||
"rocm": "X_rocm_v5",
|
||||
"cpu": "X_cpu",
|
||||
}
|
||||
assert.Equal(t, true, rocmDynLibPresent())
|
||||
res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
|
||||
assert.Len(t, res, 2)
|
||||
assert.Equal(t, availableDynLibs["rocm"], res[0])
|
||||
assert.Equal(t, availableDynLibs["cpu"], res[1])
|
||||
}
|
||||
8
llm/payload_windows.go
Normal file
8
llm/payload_windows.go
Normal file
@@ -0,0 +1,8 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/windows/*/lib/*.dll
|
||||
var libEmbed embed.FS
|
||||
@@ -1,71 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/ggml-metal.metal
|
||||
var libEmbed embed.FS
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
// should never happen...
|
||||
return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
err := extractPayloadFiles(workdir, "llama.cpp/ggml-metal.metal")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
// TODO perhaps consider this a hard failure on arm macs?
|
||||
log.Printf("ggml-meta.metal payload missing")
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
os.Setenv("GGML_METAL_PATH_RESOURCES", workdir)
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractPayloadFiles(workDir, glob string) error {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return payloadMissing
|
||||
}
|
||||
|
||||
for _, file := range files {
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(workDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(workDir, filepath.Base(file))
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,193 +0,0 @@
|
||||
//go:build !darwin
|
||||
|
||||
package llm
|
||||
|
||||
/*
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "dynamic_shim.h"
|
||||
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
|
||||
"github.com/jmorganca/ollama/api"
|
||||
)
|
||||
|
||||
type shimExtServer struct {
|
||||
s C.struct_dynamic_llama_server
|
||||
options api.Options
|
||||
}
|
||||
|
||||
// Note: current implementation does not support concurrent instantiations
|
||||
var shimMutex sync.Mutex
|
||||
var llm *shimExtServer
|
||||
|
||||
const pathComponentCount = 6
|
||||
|
||||
func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_init(llm.s, sparams, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_start() {
|
||||
C.dynamic_shim_llama_server_start(llm.s)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_stop() {
|
||||
C.dynamic_shim_llama_server_stop(llm.s)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
|
||||
C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
|
||||
C.dynamic_shim_llama_server_release_task_result(llm.s, result)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
|
||||
C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
|
||||
}
|
||||
func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
|
||||
C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
|
||||
}
|
||||
|
||||
func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
|
||||
shimMutex.Lock()
|
||||
defer shimMutex.Unlock()
|
||||
updatePath(filepath.Dir(library))
|
||||
libPath := C.CString(library)
|
||||
defer C.free(unsafe.Pointer(libPath))
|
||||
resp := newExtServerResp(128)
|
||||
defer freeExtServerResp(resp)
|
||||
var srv C.struct_dynamic_llama_server
|
||||
C.dynamic_shim_init(libPath, &srv, &resp)
|
||||
if resp.id < 0 {
|
||||
return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
|
||||
}
|
||||
llm = &shimExtServer{
|
||||
s: srv,
|
||||
options: opts,
|
||||
}
|
||||
log.Printf("Loading Dynamic Shim llm server: %s", library)
|
||||
return newExtServer(llm, model, adapters, projectors, opts)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {
|
||||
return predict(ctx, llm, pred, fn)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
|
||||
return encode(llm, ctx, prompt)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
|
||||
return decode(llm, ctx, tokens)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
|
||||
return embedding(llm, ctx, input)
|
||||
}
|
||||
|
||||
func (llm *shimExtServer) Close() {
|
||||
close(llm)
|
||||
}
|
||||
|
||||
func nativeInit(workdir string) error {
|
||||
libs, err := extractDynamicLibs(workdir, "llama.cpp/build/*/*/lib/*")
|
||||
if err != nil {
|
||||
if err == payloadMissing {
|
||||
log.Printf("%s", payloadMissing)
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, lib := range libs {
|
||||
// The last dir component is the variant name
|
||||
variant := filepath.Base(filepath.Dir(lib))
|
||||
AvailableShims[variant] = lib
|
||||
}
|
||||
|
||||
if err := verifyDriverAccess(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Report which dynamic libraries we have loaded to assist troubleshooting
|
||||
variants := make([]string, len(AvailableShims))
|
||||
i := 0
|
||||
for variant := range AvailableShims {
|
||||
variants[i] = variant
|
||||
i++
|
||||
}
|
||||
log.Printf("Dynamic LLM variants %v", variants)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func extractDynamicLibs(workDir, glob string) ([]string, error) {
|
||||
files, err := fs.Glob(libEmbed, glob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return nil, payloadMissing
|
||||
}
|
||||
libs := []string{}
|
||||
|
||||
for _, file := range files {
|
||||
pathComps := strings.Split(file, "/")
|
||||
if len(pathComps) != pathComponentCount {
|
||||
log.Printf("unexpected payload components: %v", pathComps)
|
||||
continue
|
||||
}
|
||||
// llama.cpp/build/$OS/$VARIANT/lib/$LIBRARY
|
||||
// Include the variant in the path to avoid conflicts between multiple server libs
|
||||
targetDir := filepath.Join(workDir, pathComps[pathComponentCount-3])
|
||||
srcFile, err := libEmbed.Open(file)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read payload %s: %v", file, err)
|
||||
}
|
||||
defer srcFile.Close()
|
||||
if err := os.MkdirAll(targetDir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
|
||||
}
|
||||
|
||||
destFile := filepath.Join(targetDir, filepath.Base(file))
|
||||
if strings.Contains(destFile, "server") {
|
||||
libs = append(libs, destFile)
|
||||
}
|
||||
|
||||
_, err = os.Stat(destFile)
|
||||
switch {
|
||||
case errors.Is(err, os.ErrNotExist):
|
||||
destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("write payload %s: %v", file, err)
|
||||
}
|
||||
defer destFile.Close()
|
||||
if _, err := io.Copy(destFile, srcFile); err != nil {
|
||||
return nil, fmt.Errorf("copy payload %s: %v", file, err)
|
||||
}
|
||||
case err != nil:
|
||||
return nil, fmt.Errorf("stat payload %s: %v", file, err)
|
||||
}
|
||||
}
|
||||
return libs, nil
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/*/*/lib/*.so
|
||||
var libEmbed embed.FS
|
||||
|
||||
func updatePath(dir string) {
|
||||
pathComponents := strings.Split(os.Getenv("PATH"), ":")
|
||||
for _, comp := range pathComponents {
|
||||
if comp == dir {
|
||||
return
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append(pathComponents, dir), ":")
|
||||
log.Printf("Updating PATH to %s", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
}
|
||||
|
||||
func verifyDriverAccess() error {
|
||||
// Only check ROCm access if we have the dynamic lib loaded
|
||||
if _, rocmPresent := AvailableShims["rocm"]; rocmPresent {
|
||||
// Verify we have permissions - either running as root, or we have group access to the driver
|
||||
fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrPermission) {
|
||||
return fmt.Errorf("Radeon card detected, but permissions not set up properly. Either run ollama as root, or add you user account to the render group.")
|
||||
} else if errors.Is(err, fs.ErrNotExist) {
|
||||
// expected behavior without a radeon card
|
||||
return nil
|
||||
}
|
||||
|
||||
return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
|
||||
}
|
||||
fd.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1,36 +0,0 @@
|
||||
package llm
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
//go:embed llama.cpp/build/windows/*/lib/*.dll
|
||||
var libEmbed embed.FS
|
||||
|
||||
func updatePath(dir string) {
|
||||
tmpDir := filepath.Dir(dir)
|
||||
pathComponents := strings.Split(os.Getenv("PATH"), ";")
|
||||
i := 0
|
||||
for _, comp := range pathComponents {
|
||||
if strings.EqualFold(comp, dir) {
|
||||
return
|
||||
}
|
||||
// Remove any other prior paths to our temp dir
|
||||
if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
|
||||
pathComponents[i] = comp
|
||||
i++
|
||||
}
|
||||
}
|
||||
newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
|
||||
log.Printf("Updating PATH to %s", newPath)
|
||||
os.Setenv("PATH", newPath)
|
||||
}
|
||||
|
||||
func verifyDriverAccess() error {
|
||||
// TODO if applicable
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user