llmodel: skip attempting Metal if model+kvcache >= 53% of system RAM

2024-10-01 01:06:10 -04:00 · 2023-06-26 14:53:17 -07:00 · 2023-06-26 14:53:17 -07:00 · db34a2f670
commit db34a2f670
parent 57fa8644d6
2 changed files with 17 additions and 1 deletions
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -178,7 +178,9 @@ int32_t LLamaModel::threadCount() const {

 LLamaModel::~LLamaModel()
 {
-    llama_free(d_ptr->ctx);
+    if(d_ptr->ctx) {
+        llama_free(d_ptr->ctx);
+    }
 }

 bool LLamaModel::isModelLoaded() const
--- a/gpt4all-backend/llmodel.cpp
+++ b/gpt4all-backend/llmodel.cpp
@@ -1,5 +1,6 @@
 #include "llmodel.h"
 #include "dlhandle.h"
+#include "sysinfo.h"

 #include <iostream>
 #include <string>
@@ -129,7 +130,20 @@ LLModel *LLModel::construct(const std::string &modelPath, std::string buildVaria

    #if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
        if (buildVariant == "auto") {
+            size_t total_mem = getSystemTotalRAMInBytes();
            impl = implementation(f, "metal");
+            if(impl) {
+                LLModel* metalimpl = impl->construct();
+                size_t req_mem = metalimpl->requiredMem(modelPath);
+                float req_to_total = (float) req_mem / (float) total_mem;
+                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
+                if (req_to_total >= 0.53) {
+                    delete metalimpl;
+                    impl = nullptr;
+                } else {
+                    return metalimpl;
+                }
+            }
        }
    #endif