llmodel: skip attempting Metal if model+kvcache >= 53% of system RAM

This commit is contained in:
Aaron Miller 2023-06-26 14:53:17 -07:00 committed by AT
parent 57fa8644d6
commit db34a2f670
2 changed files with 17 additions and 1 deletions

View File

@ -178,7 +178,9 @@ int32_t LLamaModel::threadCount() const {
/// Destroys the model, releasing the llama context held in d_ptr->ctx when
/// one is present.
LLamaModel::~LLamaModel()
{
    // Free the context exactly once, and only when it is non-null. An
    // additional unconditional llama_free() here would release the same
    // context twice and would also be invoked with a null pointer.
    // NOTE(review): presumably ctx stays null when the instance is destroyed
    // without a model ever being loaded — confirm against the loading code.
    if (d_ptr->ctx) {
        llama_free(d_ptr->ctx);
    }
}
bool LLamaModel::isModelLoaded() const

View File

@ -1,5 +1,6 @@
#include "llmodel.h"
#include "dlhandle.h"
#include "sysinfo.h"
#include <iostream>
#include <string>
@ -129,7 +130,20 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
impl = implementation(f, "metal");
if(impl) {
LLModel* metalimpl = impl->construct();
size_t req_mem = metalimpl->requiredMem(modelPath);
float req_to_total = (float) req_mem / (float) total_mem;
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
}
}
}
#endif