ml/backend/ggml: update model loading for hybrid/multi backends

use a strategy similar to llama.cpp's for deciding where tensors should be
allocated. this will be improved later to be aware of usable memory
before assigning tensors.
Author: Michael Yang
Date:   2025-02-19 14:26:40 -08:00
Parent: 0682dae027
Commit: bab6f34dc0
3 changed files with 249 additions and 147 deletions
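
In ggml terms, "deciding where tensors should be allocated" comes down to choosing a device buffer type for each tensor before allocation, walking the registered devices in priority order. As a rough illustration only — a minimal C++ sketch against ggml's public registry API, where pick_buft is a hypothetical helper, not the commit's actual Go-side code:

    #include <cstdio>
    #include "ggml-backend.h"

    // Hypothetical helper: take the first registered GPU device, falling
    // back to the CPU device. Devices are enumerated in registry order,
    // which the diff below turns into a descending sort by score.
    static ggml_backend_buffer_type_t pick_buft(void) {
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                return ggml_backend_dev_buffer_type(dev);
            }
        }
        // no GPU registered: allocate in host memory via the CPU device
        return ggml_backend_dev_buffer_type(
            ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
    }

    int main() {
        std::printf("allocating on: %s\n",
            ggml_backend_buft_name(pick_buft()));
        return 0;
    }

The memory-awareness the commit message anticipates would slot into that loop: ggml_backend_dev_memory() reports free and total bytes per device, so a later version can skip devices that cannot fit the tensor.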


@@ -207,13 +207,7 @@ struct ggml_backend_registry {
         for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
             register_device(ggml_backend_reg_dev_get(reg, i), score);
         }
-    }
-
-    void register_device(ggml_backend_dev_t device, int score = -1) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
-#endif
-        devices.push_back({device, score});
+
         std::stable_sort(devices.begin(), devices.end(),
             [](const auto & a, const auto & b) {
                 return a.second > b.second;
@@ -221,6 +215,21 @@ struct ggml_backend_registry {
         );
     }
 
+    void register_device(ggml_backend_dev_t device, int score = -1) {
+        switch (ggml_backend_dev_type(device)) {
+            case GGML_BACKEND_DEVICE_TYPE_CPU:
+            case GGML_BACKEND_DEVICE_TYPE_GPU:
+                score += 1 << 16;
+            case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                score += 1 << 20;
+        }
+
+#ifndef NDEBUG
+        GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
+#endif
+        devices.push_back({device, score});
+    }
+
     ggml_backend_reg_t load_backend(const std::filesystem::path & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
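
Note that the switch in the new register_device() falls through (no break), which appears deliberate: a CPU or GPU device collects both score bumps, while an ACCEL device only gets the second, and the stable_sort moved up into register_backend() then orders devices by descending score. A worked example of the resulting scores and ordering — a sketch assuming the default score of -1, against the registry in ggml's ggml-backend-reg.cpp:

    #include <cstdio>
    #include "ggml-backend.h"

    int main() {
        // bumps from register_device(), with the default score = -1:
        int gpu_or_cpu = -1 + (1 << 16) + (1 << 20); // 1114111
        int accel      = -1 + (1 << 20);             // 1048575
        std::printf("cpu/gpu=%d accel=%d\n", gpu_or_cpu, accel);

        // register_backend() stable-sorts by descending score, so CPU and
        // GPU devices come before ACCEL devices, and devices with equal
        // scores keep their registration order. Consumers of the registry
        // simply see the sorted result:
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            std::printf("%zu: %s (%s)\n", i,
                ggml_backend_dev_name(dev),
                ggml_backend_dev_description(dev));
        }
        return 0;
    }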