non-llama: explicitly greedy sampling for temp<=0 (#901)

Copied directly from llama.cpp. Without this, temp=0.0 just scales all the logits to infinity and gives bad output.
Aaron Miller 2023-06-08 11:08:30 -07:00 committed by GitHub
parent b14953e136
commit 47fbc0e309

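For context, the failure mode is easy to reproduce: the usual sampling path scales each logit by 1/temp, so temp == 0 turns the scale factor into +inf and pushes every nonzero logit to +/-inf. A minimal standalone sketch (illustrative only, not code from this repository):

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> logits = {1.2f, -0.4f, 3.1f};
    float temp = 0.0f;
    float scale = 1.0f / temp;        // IEEE-754: 1/0 == +inf
    for (float & l : logits) {
        l *= scale;                   // nonzero logits saturate to +/-inf
    }
    for (float l : logits) {
        std::printf("%f\n", l);       // prints inf, -inf, inf
    }
    return 0;
}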

@@ -232,6 +232,19 @@ gpt_vocab::id gpt_sample_top_k_top_p(
    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
    const auto * plogits = logits.data();

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);
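The greedy branch above is the hand-rolled argmax loop copied from llama.cpp. An equivalent standalone sketch (with gpt_vocab::id replaced by plain int so it compiles on its own) can use std::max_element:

#include <algorithm>
#include <cstdio>
#include <vector>

// Same result as the loop in the diff: the id of the highest logit.
int sample_greedy(const std::vector<float> & logits) {
    return (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
}

int main() {
    std::vector<float> logits = {1.2f, -0.4f, 3.1f};
    std::printf("greedy token id: %d\n", sample_greedy(logits)); // prints 2
    return 0;
}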