From 96cee4f9ace334e237b385a73c624d8ca063db00 Mon Sep 17 00:00:00 2001
From: AT
Date: Wed, 3 Jan 2024 13:06:08 -0600
Subject: [PATCH] Explicitly clear the kv cache each time we eval tokens to match n_past. (#1808)

---
 gpt4all-backend/llamamodel.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gpt4all-backend/llamamodel.cpp b/gpt4all-backend/llamamodel.cpp
index cc566b43..4a73e46b 100644
--- a/gpt4all-backend/llamamodel.cpp
+++ b/gpt4all-backend/llamamodel.cpp
@@ -298,6 +298,8 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
+    llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
+
     llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
     batch.n_tokens = tokens.size();
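
For context, llama_kv_cache_seq_rm(ctx, 0, ctx.n_past, -1) removes every cached entry for sequence 0 at positions n_past and beyond (the -1 upper bound means "to the end"), so the KV cache is trimmed to exactly match n_past before new tokens are decoded and cannot carry stale state from a longer earlier evaluation. Below is a minimal sketch of how such a call typically sits in a decode step against the llama.cpp C API; the eval_from helper and the batch-filling details are assumptions for illustration, not gpt4all code beyond the two lines this patch adds.

#include "llama.h"

#include <cstdint>
#include <vector>

// Hypothetical helper (illustration only): decode `tokens` as a continuation
// of the first `n_past` positions of sequence 0.
static bool eval_from(llama_context *ctx, int32_t n_past,
                      const std::vector<llama_token> &tokens) {
    // Trim the cache so it covers exactly [0, n_past) for sequence 0 before
    // the new tokens are evaluated; -1 means "up to the end of the sequence".
    llama_kv_cache_seq_rm(ctx, 0, n_past, -1);

    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
    batch.n_tokens = tokens.size();

    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = n_past + i;                 // positions continue from n_past
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;                          // single sequence, id 0
        batch.logits[i]    = (i == batch.n_tokens - 1);  // logits only for the last token
    }

    const int rc = llama_decode(ctx, batch);
    llama_batch_free(batch);
    return rc == 0;
}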