update memory calculations

count each layer independently when deciding gpu offloading
This commit is contained in:
Michael Yang
2024-03-18 10:45:22 +01:00
parent d338d70492
commit 91b3e4d282
7 changed files with 125 additions and 89 deletions

View File

@@ -39,7 +39,7 @@ import (
type dynExtServer struct {
s C.struct_dynamic_llama_server
options api.Options
options *api.Options
}
// Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
return fmt.Errorf(C.GoString(resp.msg))
}
func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
if !mutex.TryLock() {
slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
mutex.Lock()