Fix off-by-one error in exllama_hf caching logic (#4145)

Author: tdrussell
Date: 2023-10-05 10:20:56 -05:00
Committed by: GitHub
Parent: b04c08378d
Commit: cb26163a20
2 changed files with 8 additions and 0 deletions

modules/exllama_hf.py
@@ -94,6 +94,10 @@ class ExllamaHF(PreTrainedModel):
                     ex_cache.current_seq_len = longest_prefix
                     if len(seq_tensor) - longest_prefix > 1:
                         self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, lora=self.lora)
+                    elif len(seq_tensor) == longest_prefix:
+                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
+                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
+                        ex_cache.current_seq_len -= 1
 
             if reset:
                 ex_cache.current_seq_len = 0

modules/exllamav2_hf.py
@@ -98,6 +98,10 @@ class Exllamav2HF(PreTrainedModel):
                     ex_cache.current_seq_len = longest_prefix
                     if len(seq_tensor) - longest_prefix > 1:
                         self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True)
+                    elif len(seq_tensor) == longest_prefix:
+                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
+                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
+                        ex_cache.current_seq_len -= 1
 
             if reset:
                 ex_cache.current_seq_len = 0
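
The logic added to the two backends is identical, so one illustration covers both. Below is a minimal, self-contained sketch of the bookkeeping, using a hypothetical plan_cache helper and plain Python lists in place of seq_tensor and ex_cache.current_seq_len; it is not code from the repo, only the off-by-one case shown in isolation.

def plan_cache(seq, longest_prefix):
    # cache_len stands in for ex_cache.current_seq_len after the prefix is reused.
    cache_len = longest_prefix
    prefill = []
    if len(seq) - longest_prefix > 1:
        # Several new tokens: prefill everything but the last one; the last
        # token is the one later fed to forward() to produce logits.
        prefill = list(seq[longest_prefix:-1])
    elif len(seq) == longest_prefix:
        # The reused prefix *is* the whole input, but forward() is still called
        # with seq[-1]; back the pointer up so that token is not counted twice.
        cache_len -= 1
    return cache_len, prefill

# The case the commit fixes: every token of the call is already cached.
# Without the elif branch the pointer stayed at 4 while forward() appended
# seq[-1] again, leaving the cache one slot ahead of the real sequence.
print(plan_cache([11, 12, 13, 14], longest_prefix=4))  # -> (3, [])

Backing the pointer up, rather than special-casing the final forward() call, keeps the call path uniform: the last token of input_ids is always the one passed to forward() below for the new logits.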