text-generation-webui/modules/cache_utils.py

import torch
from numba import njit

from modules import shared


def process_llamacpp_cache(model, new_sequence, past_sequence):
    if len(past_sequence) == 0 or len(new_sequence) == 0:
        return past_sequence

    i1, i2, j1, j2 = find_longest_common_substring_indices(past_sequence, new_sequence)
    overlap_length = i2 - i1 + 1

    # Do StreamingLLM if i1 > 0 (ie the longest common subsequence is not a prefix)
    # and the overlap length is sufficiently long.
    if i1 > 0 and overlap_length > 0.2 * len(new_sequence):

        new_sequence = torch.tensor(new_sequence)
        past_sequence = torch.tensor(past_sequence)

        prefix_length = find_prefix_length(past_sequence[:i1], new_sequence[:j1])
        sink_length = max(prefix_length, shared.args.attention_sink_size)
        removed_length = i1 - sink_length

        if removed_length <= 0:
            return past_sequence.tolist()

        matching_prefix = past_sequence[:prefix_length]
        removed_chunk = past_sequence[sink_length:i1]
        overlapping_sequence = new_sequence[j1:j2 + 1]
        added_chunk = new_sequence[j2 + 1:]

        # print(past_sequence.tolist())
        # print(new_sequence.tolist())

        print()
        print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix)))
        print('ADDED CHUNK=', repr(shared.tokenizer.decode(added_chunk)))
        print('REMOVED CHUNK=', repr(shared.tokenizer.decode(removed_chunk)))
        print('REMOVED LENGTH=', removed_length)
        print()

        # Remove interval [sink_length, sink_length + removed_length) from the context
        # Update model.n_tokens
        model._ctx.kv_cache_seq_rm(0, sink_length, sink_length + removed_length)
        model._ctx.kv_cache_seq_shift(0, sink_length + removed_length, -1, -removed_length)

        new_sequence = new_sequence.tolist()
        model.input_ids[:j2 + 1] = new_sequence[:j2 + 1]
        model.n_tokens = j2 + 1

        return new_sequence[:j2 + 1]
    else:
        return past_sequence


def find_prefix_length(past_seq, seq_tensor):
    '''
    Given two torch tensors, finds the length of the longest
    common prefix between the two.
    '''
    min_length = min(past_seq.shape[0], seq_tensor.shape[0])
    indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
    if len(indices) > 0:
        prefix_length = indices[0].item()
    else:
        prefix_length = min_length

    return prefix_length


@njit
def find_longest_common_substring_indices(list1, list2):
    '''
    Given two lists, solves the Longest Common Substring problem.

    It returns the indices where the substring starts and ends in
    s1 and s2.

    Example:

    ir, jr, ir2, jr2 = find_longest_common_substring_indices(s1, s2)
    print(s1[ir:jr + 1])
    print(s2[ir2:jr2 + 1])

    Adapted from
    https://rosettacode.org/wiki/Longest_common_substring#Python
    '''

    len_list1, len_list2 = len(list1), len(list2)
    start_index_list1, end_index_list1 = 0, -1
    start_index_list2, end_index_list2 = 0, -1

    # for index1 in tqdm(range(0, len_list1), desc="StreamingLLM prompt comparison", leave=False):
    for index1 in range(0, len_list1):
        try:
            index2 = list2.index(list1[index1])
        except:
            continue

        while index2 >= 0:
            temp_index1, temp_index2 = index1, index2
            while temp_index1 < len_list1 and temp_index2 < len_list2 and list2[temp_index2] == list1[temp_index1]:
                if temp_index1 - index1 >= end_index_list1 - start_index_list1:
                    start_index_list1, end_index_list1 = index1, temp_index1
                    start_index_list2, end_index_list2 = index2, temp_index2

                temp_index1 += 1
                temp_index2 += 1
            try:
                index2 = list2.index(list1[index1], index2 + 1)
            except:
                break

    return start_index_list1, end_index_list1, start_index_list2, end_index_list2
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`import torch`
Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`from numba import njit`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00
			`from modules import shared`


			`def process_llamacpp_cache(model, new_sequence, past_sequence):`
Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`if len(past_sequence) == 0 or len(new_sequence) == 0:`
			`return past_sequence`

Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`i1, i2, j1, j2 = find_longest_common_substring_indices(past_sequence, new_sequence)`
			`overlap_length = i2 - i1 + 1`

			`# Do StreamingLLM if i1 > 0 (ie the longest common subsequence is not a prefix)`
			`# and the overlap length is sufficiently long.`
			`if i1 > 0 and overlap_length > 0.2 * len(new_sequence):`

			`new_sequence = torch.tensor(new_sequence)`
			`past_sequence = torch.tensor(past_sequence)`

			`prefix_length = find_prefix_length(past_sequence[:i1], new_sequence[:j1])`
Fix StreamingLLM when content is removed from the beginning of the prompt 2024-03-14 12:18:54 -04:00			`sink_length = max(prefix_length, shared.args.attention_sink_size)`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`removed_length = i1 - sink_length`

Fix StreamingLLM when content is removed from the beginning of the prompt 2024-03-14 12:18:54 -04:00			`if removed_length <= 0:`
			`return past_sequence.tolist()`

Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`matching_prefix = past_sequence[:prefix_length]`
			`removed_chunk = past_sequence[sink_length:i1]`
			`overlapping_sequence = new_sequence[j1:j2 + 1]`
			`added_chunk = new_sequence[j2 + 1:]`

Minor logging change 2024-03-11 19:31:13 -04:00			`# print(past_sequence.tolist())`
			`# print(new_sequence.tolist())`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00
			`print()`
			`print('MATCHING PREFIX=', repr(shared.tokenizer.decode(matching_prefix)))`
			`print('ADDED CHUNK=', repr(shared.tokenizer.decode(added_chunk)))`
			`print('REMOVED CHUNK=', repr(shared.tokenizer.decode(removed_chunk)))`
Fix StreamingLLM when content is removed from the beginning of the prompt 2024-03-14 12:18:54 -04:00			`print('REMOVED LENGTH=', removed_length)`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`print()`

			`# Remove interval [sink_length, sink_length + removed_length) from the context`
Fix StreamingLLM when content is removed from the beginning of the prompt 2024-03-14 12:18:54 -04:00			`# Update model.n_tokens`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`model._ctx.kv_cache_seq_rm(0, sink_length, sink_length + removed_length)`
			`model._ctx.kv_cache_seq_shift(0, sink_length + removed_length, -1, -removed_length)`

			`new_sequence = new_sequence.tolist()`
			`model.input_ids[:j2 + 1] = new_sequence[:j2 + 1]`
			`model.n_tokens = j2 + 1`

			`return new_sequence[:j2 + 1]`
			`else:`
			`return past_sequence`


			`def find_prefix_length(past_seq, seq_tensor):`
			`'''`
			`Given two torch tensors, finds the length of the longest`
			`common prefix between the two.`
			`'''`
			`min_length = min(past_seq.shape[0], seq_tensor.shape[0])`
			`indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))`
			`if len(indices) > 0:`
			`prefix_length = indices[0].item()`
			`else:`
			`prefix_length = min_length`

			`return prefix_length`


Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`@njit`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`def find_longest_common_substring_indices(list1, list2):`
			`'''`
			`Given two lists, solves the Longest Common Substring problem.`

			`It returns the indices where the substring starts and ends in`
			`s1 and s2.`

			`Example:`

			`ir, jr, ir2, jr2 = find_longest_common_substring_indices(s1, s2)`
			`print(s1[ir:jr + 1])`
			`print(s2[ir2:jr2 + 1])`

			`Adapted from`
			`https://rosettacode.org/wiki/Longest_common_substring#Python`
			`'''`

			`len_list1, len_list2 = len(list1), len(list2)`
			`start_index_list1, end_index_list1 = 0, -1`
			`start_index_list2, end_index_list2 = 0, -1`

Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`# for index1 in tqdm(range(0, len_list1), desc="StreamingLLM prompt comparison", leave=False):`
			`for index1 in range(0, len_list1):`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`try:`
			`index2 = list2.index(list1[index1])`
Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`except:`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`continue`
Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`while index2 >= 0:`
			`temp_index1, temp_index2 = index1, index2`
			`while temp_index1 < len_list1 and temp_index2 < len_list2 and list2[temp_index2] == list1[temp_index1]:`
			`if temp_index1 - index1 >= end_index_list1 - start_index_list1:`
			`start_index_list1, end_index_list1 = index1, temp_index1`
			`start_index_list2, end_index_list2 = index2, temp_index2`

			`temp_index1 += 1`
			`temp_index2 += 1`
			`try:`
			`index2 = list2.index(list1[index1], index2 + 1)`
Optimize StreamingLLM by over 10x 2024-03-09 00:39:02 -05:00			`except:`
Add StreamingLLM for llamacpp & llamacpp_HF (2nd attempt) (#5669) 2024-03-08 22:25:33 -05:00			`break`

			`return start_index_list1, end_index_list1, start_index_list2, end_index_list2`