Fix off-by-one error in exllama_hf caching logic (#4145)

2024-10-01 01:26:03 -04:00 · 2023-10-05 10:20:56 -05:00 · 2023-10-05 10:20:56 -05:00 · cb26163a20
commit cb26163a20
parent b04c08378d
2 changed files with 8 additions and 0 deletions
--- a/modules/exllama_hf.py
+++ b/modules/exllama_hf.py
@@ -94,6 +94,10 @@ class ExllamaHF(PreTrainedModel):
                    ex_cache.current_seq_len = longest_prefix
                    if len(seq_tensor) - longest_prefix > 1:
                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
+                    elif len(seq_tensor) == longest_prefix:
+                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
+                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
+                        ex_cache.current_seq_len -= 1

            if reset:
                ex_cache.current_seq_len = 0
--- a/modules/exllamav2_hf.py
+++ b/modules/exllamav2_hf.py
@@ -98,6 +98,10 @@ class Exllamav2HF(PreTrainedModel):
                    ex_cache.current_seq_len = longest_prefix
                    if len(seq_tensor) - longest_prefix > 1:
                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True)
+                    elif len(seq_tensor) == longest_prefix:
+                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
+                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
+                        ex_cache.current_seq_len -= 1

            if reset:
                ex_cache.current_seq_len = 0