text-generation-webui/modules/RWKV.py

'''
This loader is not currently maintained as RWKV can now be loaded
through the transformers library.
'''

import copy
import os
from pathlib import Path

import numpy as np
from tokenizers import Tokenizer
from transformers import is_torch_xpu_available

import modules.shared as shared
from modules.callbacks import Iteratorize

# Compact numpy output: 4 decimal places, fixed-point notation, 200-character lines.
np.set_printoptions(precision=4, suppress=True, linewidth=200)

# Both variables are set before the `rwkv` imports that follow.
# NOTE(review): presumably the rwkv package reads them at import time, which
# is why these assignments sit above the imports -- confirm against rwkv.
os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0'  # use CUDA kernel for seq mode (much faster)

from rwkv.model import RWKV
from rwkv.utils import PIPELINE, PIPELINE_ARGS


class RWKVModel:
    """Wrap an ``rwkv`` model and its ``PIPELINE``, caching the RNN state between calls.

    Instances are meant to be created through :meth:`from_pretrained`, which
    sets these attributes:

    * ``pipeline`` -- the ``PIPELINE`` used to encode, decode and sample.
    * ``model`` -- the underlying ``RWKV`` model.
    * ``cached_context`` -- prompt text that has already been fed to the model.
    * ``cached_model_state`` -- model state obtained after ``cached_context``.
    * ``cached_output_logits`` -- logits obtained after ``cached_context``.
    """

    def __init__(self):
        pass

    @classmethod
    def from_pretrained(cls, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):
        """Load the model file at ``path`` and return a ready-to-use instance.

        Args:
            path: Location of the model file (``pathlib.Path`` or ``str``).
                ``20B_tokenizer.json`` is looked up in the parent directory.
            dtype: Data-type half of the default strategy string.
            device: Device half of the default strategy string.

        The strategy passed to ``RWKV`` is ``shared.args.rwkv_strategy`` when
        it is set, otherwise ``f'{device} {dtype}'``.
        """
        path = Path(path)
        tokenizer_path = path.parent / "20B_tokenizer.json"

        strategy = shared.args.rwkv_strategy
        if strategy is None:
            strategy = f'{device} {dtype}'

        model = RWKV(model=str(path), strategy=strategy)
        pipeline = PIPELINE(model, str(tokenizer_path))

        result = cls()
        result.pipeline = pipeline
        result.model = model
        result._reset_cache()
        return result

    def _reset_cache(self):
        """Forget the cached context together with its state and logits."""
        self.cached_context = ""
        self.cached_model_state = None
        self.cached_output_logits = None

    def generate(self, prompt, state, callback=None):
        """Generate a completion for ``prompt`` and return it as a string.

        Args:
            prompt: Full prompt text. If it starts with the cached context,
                only the remaining suffix is fed to the model.
            state: Mapping that provides ``'temperature'``, ``'top_p'``,
                ``'top_k'`` and ``'max_new_tokens'``.
            callback: Optional callable invoked with each decoded text piece.
        """
        args = PIPELINE_ARGS(
            temperature=state['temperature'],
            top_p=state['top_p'],
            top_k=state['top_k'],
            alpha_frequency=0.1,  # Frequency Penalty (as in GPT-3)
            alpha_presence=0.1,  # Presence Penalty (as in GPT-3)
            token_ban=[0],  # ban the generation of some tokens
            token_stop=[]
        )

        # An empty cached context is a prefix of every prompt and strips
        # nothing, so the "no cache yet" case needs no special handling.
        if prompt.startswith(self.cached_context):
            prompt = prompt[len(self.cached_context):]
        else:
            # The prompt diverged from what was cached: start from scratch.
            self._reset_cache()

        return self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)

    def generate_with_streaming(self, *args, **kwargs):
        """Yield the reply accumulated so far each time a new piece arrives.

        ``args`` and ``kwargs`` are forwarded to :meth:`generate` through
        ``Iteratorize``.
        """
        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply

    # Similar to the PIPELINE.generate, but lets us maintain the cached_model_state
    def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):
        """Sample up to ``token_count`` tokens, continuing from the cached state.

        Args:
            ctx: Text not yet seen by the model (the prompt minus the cached
                context). May be empty, in which case the cached logits are
                reused.
            token_count: Maximum number of tokens to sample.
            args: ``PIPELINE_ARGS`` with the sampling settings; a default
                ``PIPELINE_ARGS()`` is used when omitted.
            callback: Optional callable invoked with each decoded text piece.

        Returns:
            The generated text. An empty string is returned when there is
            nothing to sample from (no context tokens and no cached logits).
        """
        if args is None:
            args = PIPELINE_ARGS()

        all_tokens = []
        out_str = ''
        occurrence = {}
        # Work on a copy so the cached state itself is never modified here
        # (deepcopy(None) is simply None).
        state = copy.deepcopy(self.cached_model_state)

        # if we ended up with an empty context, just reuse the cached logits
        # this can happen if a user undoes a message and then sends the exact message again
        # in that case the full context ends up being the same as the cached_context, so the remaining context is empty.
        out = self.cached_output_logits if ctx == "" else None

        token = None
        emitted_upto = 0  # index in all_tokens of the first token not yet emitted as text
        for i in range(token_count):
            # forward
            tokens = self.pipeline.encode(ctx) if i == 0 else [token]
            while len(tokens) > 0:
                out, state = self.model.forward(tokens[:args.chunk_len], state)
                tokens = tokens[args.chunk_len:]

            if i == 0:
                if out is None:
                    # No context tokens and no cached logits: there is nothing
                    # to sample from, so stop instead of failing on `out[n]`.
                    return out_str

                # cache the model state after scanning the context
                # we don't cache the state after processing our own generated tokens because
                # the output string might be post-processed arbitrarily. Therefore, what's fed into the model
                # on the next round of chat might be slightly different from what it output on the previous round
                self.cached_context += ctx
                self.cached_model_state = copy.deepcopy(state)
                self.cached_output_logits = copy.deepcopy(out)

            # adjust probabilities
            for n in args.token_ban:
                out[n] = -float('inf')

            for n in occurrence:
                out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)

            # sampler
            token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)
            if token in args.token_stop:
                break

            all_tokens.append(token)
            occurrence[token] = occurrence.get(token, 0) + 1

            # output
            tmp = self.pipeline.decode(all_tokens[emitted_upto:])
            if '\ufffd' not in tmp:  # is valid utf-8 string?
                if callback:
                    callback(tmp)

                out_str += tmp
                emitted_upto = len(all_tokens)
            # otherwise keep the pending tokens and retry the decode once more tokens arrive

        return out_str


class RWKVTokenizer:
    """Minimal encode/decode facade over a tokenizer loaded from ``20B_tokenizer.json``.

    The wrapped tokenizer is stored on the ``tokenizer`` attribute, which is
    set by :meth:`from_pretrained`.
    """

    def __init__(self):
        pass

    @classmethod
    def from_pretrained(cls, path):
        """Build an instance from the ``20B_tokenizer.json`` file inside ``path``.

        ``path`` must support the ``/`` operator (e.g. a ``pathlib.Path``).
        """
        loaded = Tokenizer.from_file(str(path / "20B_tokenizer.json"))
        instance = cls()
        instance.tokenizer = loaded
        return instance

    def encode(self, prompt):
        """Return the token ids the wrapped tokenizer produces for ``prompt``."""
        encoding = self.tokenizer.encode(prompt)
        return encoding.ids

    def decode(self, ids):
        """Return the text the wrapped tokenizer produces for ``ids``."""
        text = self.tokenizer.decode(ids)
        return text
Add a note about RWKV loader 2023-09-26 20:43:39 -04:00			`'''`
			`This loader is not currently maintained as RWKV can now be loaded`
			`through the transformers library.`
			`'''`

Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`import copy`
Add cpu, bf16 options 2023-02-27 22:09:11 -05:00			`import os`
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00			`from pathlib import Path`
Add cpu, bf16 options 2023-02-27 22:09:11 -05:00
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00			`import numpy as np`
Add RWKV tokenizer 2023-03-06 06:45:49 -05:00			`from tokenizers import Tokenizer`
Intel Gpu support initialization (#4340) 2023-10-26 22:39:51 -04:00			`from transformers import is_torch_xpu_available`
Add cpu, bf16 options 2023-02-27 22:09:11 -05:00
			`import modules.shared as shared`
Remove redeclaration of a function 2023-03-08 00:50:49 -05:00			`from modules.callbacks import Iteratorize`
Add cpu, bf16 options 2023-02-27 22:09:11 -05:00
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00			`np.set_printoptions(precision=4, suppress=True, linewidth=200)`

			`os.environ['RWKV_JIT_ON'] = '1'`
Make the code more like PEP8 for readability (#862) 2023-04-06 23:15:45 -04:00			`os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster)`
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00
			`from rwkv.model import RWKV`
			`from rwkv.utils import PIPELINE, PIPELINE_ARGS`

Sort the imports 2023-03-01 10:18:17 -05:00
Add RWKVModel class 2023-03-01 10:08:55 -05:00			`class RWKVModel:`
			`def __init__(self):`
			`pass`
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00
Add RWKVModel class 2023-03-01 10:08:55 -05:00			`@classmethod`
Intel Gpu support initialization (#4340) 2023-10-26 22:39:51 -04:00			`def from_pretrained(self, path, dtype="bf16" if is_torch_xpu_available() else "fp16", device="xpu" if is_torch_xpu_available() else "cuda"):`
Add RWKVModel class 2023-03-01 10:08:55 -05:00			`tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json")`
Add --rwkv-strategy parameter 2023-03-01 18:02:48 -05:00			`if shared.args.rwkv_strategy is None:`
Use str(Path) instead of os.path.abspath(Path) 2023-03-12 23:08:01 -04:00			`model = RWKV(model=str(path), strategy=f'{device} {dtype}')`
Add --rwkv-strategy parameter 2023-03-01 18:02:48 -05:00			`else:`
Use str(Path) instead of os.path.abspath(Path) 2023-03-12 23:08:01 -04:00			`model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy)`
Move RWKV loader into a separate file 2023-02-27 21:50:16 -05:00
Style improvements (#1957) 2023-05-09 21:49:39 -04:00			`pipeline = PIPELINE(model, str(tokenizer_path))`
Add RWKVModel class 2023-03-01 10:08:55 -05:00			`result = self()`
Rename a variable 2023-03-01 10:33:09 -05:00			`result.pipeline = pipeline`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`result.model = model`
			`result.cached_context = ""`
			`result.cached_model_state = None`
			`result.cached_output_logits = None`
Add RWKVModel class 2023-03-01 10:08:55 -05:00			`return result`

Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`def generate(self, prompt, state, callback=None):`
Add a generate() function for RWKV 2023-03-01 10:16:11 -05:00			`args = PIPELINE_ARGS(`
Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`temperature=state['temperature'],`
			`top_p=state['top_p'],`
			`top_k=state['top_k'],`
			`alpha_frequency=0.1, # Frequency Penalty (as in GPT-3)`
			`alpha_presence=0.1, # Presence Penalty (as in GPT-3)`
			`token_ban=[0], # ban the generation of some tokens`
			`token_stop=[]`
Add a generate() function for RWKV 2023-03-01 10:16:11 -05:00			`)`

Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`if self.cached_context != "":`
Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`if prompt.startswith(self.cached_context):`
			`prompt = prompt[len(self.cached_context):]`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`else:`
			`self.cached_context = ""`
			`self.cached_model_state = None`
			`self.cached_output_logits = None`

Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`# out = self.pipeline.generate(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)`
			`out = self.generate_from_cached_state(prompt, token_count=state['max_new_tokens'], args=args, callback=callback)`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`return out`
Add RWKV tokenizer 2023-03-06 06:45:49 -05:00
Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`def generate_with_streaming(self, args, *kwargs):`
			`with Iteratorize(self.generate, args, kwargs, callback=None) as generator:`
Stop the bot from talking for you in chat mode 2023-03-23 20:38:20 -04:00			`reply = ''`
Use 'with' statement to better handle streaming memory 2023-03-12 00:04:28 -05:00			`for token in generator:`
			`reply += token`
			`yield reply`
Add proper streaming to RWKV 2023-03-07 16:17:56 -05:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# Similar to the PIPELINE.generate, but lets us maintain the cached_model_state`
			`def generate_from_cached_state(self, ctx="", token_count=20, args=None, callback=None):`
			`all_tokens = []`
			`out_str = ''`
			`occurrence = {}`
			`state = copy.deepcopy(self.cached_model_state) if self.cached_model_state is not None else None`

			`# if we ended up with an empty context, just reuse the cached logits`
			`# this can happen if a user undoes a message and then sends the exact message again`
			`# in that case the full context ends up being the same as the cached_context, so the remaining context is empty.`
			`if ctx == "":`
			`out = self.cached_output_logits`

Add ExLlama support (#2444) 2023-06-16 19:35:38 -04:00			`token = None`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`for i in range(token_count):`
			`# forward`
			`tokens = self.pipeline.encode(ctx) if i == 0 else [token]`
			`while len(tokens) > 0:`
			`out, state = self.model.forward(tokens[:args.chunk_len], state)`
			`tokens = tokens[args.chunk_len:]`
Fix the missing Chinese character bug (#2497) 2023-06-02 12:45:41 -04:00			`if i == 0:`
Reorganize model loading UI completely (#2720) 2023-06-16 18:00:37 -04:00			`begin_token = len(all_tokens)`
			`last_token_posi = begin_token`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# cache the model state after scanning the context`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00			`# we don't cache the state after processing our own generated tokens because`
			`# the output string might be post-processed arbitrarily. Therefore, what's fed into the model`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# on the next round of chat might be slightly different what what it output on the previous round`
			`if i == 0:`
			`self.cached_context += ctx`
			`self.cached_model_state = copy.deepcopy(state)`
			`self.cached_output_logits = copy.deepcopy(out)`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# adjust probabilities`
			`for n in args.token_ban:`
			`out[n] = -float('inf')`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`for n in occurrence:`
			`out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# sampler`
			`token = self.pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p, top_k=args.top_k)`
			`if token in args.token_stop:`
			`break`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`all_tokens += [token]`
			`if token not in occurrence:`
			`occurrence[token] = 1`
			`else:`
			`occurrence[token] += 1`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`# output`
Fix the missing Chinese character bug (#2497) 2023-06-02 12:45:41 -04:00			`tmp = self.pipeline.decode(all_tokens[last_token_posi:])`
Style improvements (#1957) 2023-05-09 21:49:39 -04:00			`if '\ufffd' not in tmp: # is valid utf-8 string?`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`if callback:`
			`callback(tmp)`
Reorganize model loading UI completely (#2720) 2023-06-16 18:00:37 -04:00
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`out_str += tmp`
Fix the missing Chinese character bug (#2497) 2023-06-02 12:45:41 -04:00			`last_token_posi = begin_token + i + 1`
Make the RWKV model cache the RNN state between messages (#1354) 2023-05-09 10:12:53 -04:00			`return out_str`

Make the code more like PEP8 for readability (#862) 2023-04-06 23:15:45 -04:00
Add RWKV tokenizer 2023-03-06 06:45:49 -05:00			`class RWKVTokenizer:`
			`def __init__(self):`
			`pass`

			`@classmethod`
			`def from_pretrained(self, path):`
			`tokenizer_path = path / "20B_tokenizer.json"`
Use str(Path) instead of os.path.abspath(Path) 2023-03-12 23:08:01 -04:00			`tokenizer = Tokenizer.from_file(str(tokenizer_path))`
Add RWKV tokenizer 2023-03-06 06:45:49 -05:00			`result = self()`
			`result.tokenizer = tokenizer`
			`return result`

			`def encode(self, prompt):`
			`return self.tokenizer.encode(prompt).ids`

			`def decode(self, ids):`
			`return self.tokenizer.decode(ids)`