Merge branch 'main' of https://github.com/xanthousm/text-generation-webui

2024-10-01 01:26:03 -04:00 · 2023-03-08 22:08:54 +11:00 · 2023-03-08 22:08:54 +11:00 · 5648a41a27
commit 5648a41a27
parent ad6b699503 c09f416adb
16 changed files with 352 additions and 166 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 cache/*
 characters/*
 extensions/silero_tts/outputs/*
 extensions/elevenlabs_tts/outputs/*
 logs/*
 models/*
 softprompts/*
--- a/README.md
+++ b/README.md
@ -21,12 +21,13 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 * Advanced chat features (send images, get audio responses with TTS).
 * Stream the text output in real time.
 * Load parameter presets from text files.
-* Load large models in 8-bit mode (see [here](https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652) and [here](https://www.reddit.com/r/PygmalionAI/comments/1115gom/running_pygmalion_6b_with_8gb_of_vram/) if you are on Windows).
+* Load large models in 8-bit mode (see [here](https://github.com/oobabooga/text-generation-webui/issues/147#issuecomment-1456040134), [here](https://github.com/oobabooga/text-generation-webui/issues/20#issuecomment-1411650652) and [here](https://www.reddit.com/r/PygmalionAI/comments/1115gom/running_pygmalion_6b_with_8gb_of_vram/) if you are on Windows).
 * Split large models across your GPU(s), CPU, and disk.
 * CPU mode.
 * [FlexGen offload](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen).
 * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed).
-* [Get responses via API](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py).
+* Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming.
 * [Supports the RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model).
 * Supports softprompts.
 * [Supports extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions).
 * [Works on Google Colab](https://github.com/oobabooga/text-generation-webui/wiki/Running-on-Colab).
@ -82,8 +83,8 @@ Models should be placed under `models/model-name`. For instance, `models/gpt-j-6
 * [Pythia](https://huggingface.co/models?search=eleutherai/pythia)
 * [OPT](https://huggingface.co/models?search=facebook/opt)
 * [GALACTICA](https://huggingface.co/models?search=facebook/galactica)
-* [\*-Erebus](https://huggingface.co/models?search=erebus)
+* [\*-Erebus](https://huggingface.co/models?search=erebus) (NSFW)
-* [Pygmalion](https://huggingface.co/models?search=pygmalion)
+* [Pygmalion](https://huggingface.co/models?search=pygmalion) (NSFW)
 You can automatically download a model from HF using the script `download-model.py`:
@ -149,9 +150,10 @@ Optionally, you can use the following command-line flags:
 | `--deepspeed`    | Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. |
 | `--nvme-offload-dir NVME_OFFLOAD_DIR`    | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. |
 | `--local_rank LOCAL_RANK`    | DeepSpeed: Optional argument for distributed setups. |
-| `--rwkv-strategy RWKV_STRATEGY`         |    The strategy to use while loading RWKV models. Examples: `"cpu fp32"`, `"cuda fp16"`, `"cuda fp16 *30 -> cpu fp32"`. |
+|  `--rwkv-strategy RWKV_STRATEGY`         |    RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
 |  `--rwkv-cuda-on`                        |   RWKV: Compile the CUDA kernel for better performance. |
 | `--no-stream`   | Don't stream the text output in real time. This improves the text generation performance.|
-| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example.|
+| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag.|
 |  `--extensions EXTENSIONS [EXTENSIONS ...]` |  The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. |
 | `--listen`   | Make the web UI reachable from your local network.|
 |  `--listen-port LISTEN_PORT` | The listening port that the server will use. |
--- a/api-example-stream.py
+++ b/api-example-stream.py
@ -0,0 +1,90 @@
 '''
 Contributed by SagsMug. Thank you SagsMug.
 https://github.com/oobabooga/text-generation-webui/pull/175
 '''
 import asyncio
 import json
 import random
 import string
 import websockets
 def random_hash():
    letters = string.ascii_lowercase + string.digits
    return ''.join(random.choice(letters) for i in range(9))
 async def run(context):
    server = "127.0.0.1"
    params = {
        'max_new_tokens': 200,
        'do_sample': True,
        'temperature': 0.5,
        'top_p': 0.9,
        'typical_p': 1,
        'repetition_penalty': 1.05,
        'top_k': 0,
        'min_length': 0,
        'no_repeat_ngram_size': 0,
        'num_beams': 1,
        'penalty_alpha': 0,
        'length_penalty': 1,
        'early_stopping': False,
    }
    session = random_hash()
    async with websockets.connect(f"ws://{server}:7860/queue/join") as websocket:
        while content := json.loads(await websocket.recv()):
            #Python3.10 syntax, replace with if elif on older
            match content["msg"]:
                case "send_hash":
                    await websocket.send(json.dumps({
                        "session_hash": session,
                        "fn_index": 7
                    }))
                case "estimation":
                    pass
                case "send_data":
                    await websocket.send(json.dumps({
                        "session_hash": session,
                        "fn_index": 7,
                        "data": [
                            context,
                            params['max_new_tokens'],
                            params['do_sample'],
                            params['temperature'],
                            params['top_p'],
                            params['typical_p'],
                            params['repetition_penalty'],
                            params['top_k'],
                            params['min_length'],
                            params['no_repeat_ngram_size'],
                            params['num_beams'],
                            params['penalty_alpha'],
                            params['length_penalty'],
                            params['early_stopping'],
                        ]
                    }))
                case "process_starts":
                    pass
                case "process_generating" | "process_completed":
                    yield content["output"]["data"][0]
                    # You can search for your desired end indicator and 
                    #  stop generation by closing the websocket here
                    if (content["msg"] == "process_completed"):
                        break
 prompt = "What I would like to say is the following: "
 async def get_result():
    async for response in run(prompt):
        # Print intermediate steps
        print(response)
    # Print final result
    print(response)
 asyncio.run(get_result())
--- a/extensions/elevenlabs_tts/outputs/outputs-will-be-saved-here.txt
+++ b/extensions/elevenlabs_tts/outputs/outputs-will-be-saved-here.txt
--- a/extensions/elevenlabs_tts/requirements.txt
+++ b/extensions/elevenlabs_tts/requirements.txt
@ -0,0 +1,3 @@
 elevenlabslib
 soundfile
 sounddevice
--- a/extensions/elevenlabs_tts/script.py
+++ b/extensions/elevenlabs_tts/script.py
@ -0,0 +1,113 @@
 from pathlib import Path
 import gradio as gr
 from elevenlabslib import *
 from elevenlabslib.helpers import *
 params = {
    'activate': True,
    'api_key': '12345',
    'selected_voice': 'None',
 }
 initial_voice = ['None']
 wav_idx = 0
 user = ElevenLabsUser(params['api_key'])
 user_info = None
 # Check if the API is valid and refresh the UI accordingly.
 def check_valid_api():
    global user, user_info, params
    user = ElevenLabsUser(params['api_key'])
    user_info = user._get_subscription_data()
    print('checking api')
    if params['activate'] == False:
        return gr.update(value='Disconnected')
    elif user_info is None:
        print('Incorrect API Key')
        return gr.update(value='Disconnected')
    else:
        print('Got an API Key!')
        return gr.update(value='Connected')
 # Once the API is verified, get the available voices and update the dropdown list
 def refresh_voices():
    global user, user_info
    your_voices = [None]
    if user_info is not None:
        for voice in user.get_available_voices():
            your_voices.append(voice.initialName)
        return  gr.Dropdown.update(choices=your_voices)
    else:
        return
 def remove_surrounded_chars(string):
    new_string = ""
    in_star = False
    for char in string:
        if char == '*':
            in_star = not in_star
        elif not in_star:
            new_string += char
    return new_string
 def input_modifier(string):
    """
    This function is applied to your text inputs before
    they are fed into the model.
    """
    return string
 def output_modifier(string):
    """
    This function is applied to the model outputs.
    """
    global params, wav_idx, user, user_info
    if params['activate'] == False:
        return string
    elif user_info == None:
        return string
    string = remove_surrounded_chars(string)
    string = string.replace('"', '')
    string = string.replace('“', '')
    string = string.replace('\n', ' ')
    string = string.strip()
    if string == '':
        string = 'empty reply, try regenerating'
    output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.wav'.format(wav_idx))
    voice = user.get_voices_by_name(params['selected_voice'])[0]
    audio_data = voice.generate_audio_bytes(string)
    save_bytes_to_path(Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.wav'), audio_data)
    string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
    wav_idx += 1
    return string
 def ui():
    # Gradio elements
    with gr.Row():
        activate = gr.Checkbox(value=params['activate'], label='Activate TTS')
        connection_status = gr.Textbox(value='Disconnected', label='Connection Status')
    voice = gr.Dropdown(value=params['selected_voice'], choices=initial_voice, label='TTS Voice')
    with gr.Row():
        api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key')
        connect = gr.Button(value='Connect')
    # Event functions to update the parameters in the backend
    activate.change(lambda x: params.update({'activate': x}), activate, None)
    voice.change(lambda x: params.update({'selected_voice': x}), voice, None)
    api_key.change(lambda x: params.update({'api_key': x}), api_key, None)
    connect.click(check_valid_api, [], connection_status)
    connect.click(refresh_voices, [], voice)
--- a/extensions/silero_tts/script.py
+++ b/extensions/silero_tts/script.py
@ -1,4 +1,3 @@
 import asyncio
 from pathlib import Path
 import gradio as gr
@ -94,7 +93,7 @@ def output_modifier(string):
    string ='<speak>'+prosody+xmlesc(string)+'</prosody></speak>'
    output_file = Path(f'extensions/silero_tts/outputs/{wav_idx:06d}.wav')
-    audio = model.save_wav(ssml_text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
+    model.save_wav(text=string, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file))
    string = f'<audio src="file/{output_file.as_posix()}" controls></audio>'
    #reset if too many wavs. set max to -1 for unlimited.
--- a/modules/LLaMA.py
+++ b/modules/LLaMA.py
@ -1,96 +0,0 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the GNU General Public License version 3.
 import json
 import os
 import sys
 import time
 from pathlib import Path
 from typing import Tuple
 import fire
 import torch
 from fairscale.nn.model_parallel.initialize import initialize_model_parallel
 from llama import LLaMA, ModelArgs, Tokenizer, Transformer
 os.environ['RANK'] = '0'
 os.environ['WORLD_SIZE'] = '1'
 os.environ['MP'] = '1'
 os.environ['MASTER_ADDR'] = '127.0.0.1'
 os.environ['MASTER_PORT'] = '2223'
 def setup_model_parallel() -> Tuple[int, int]:
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))
    torch.distributed.init_process_group("gloo")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)
    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size
 def load(
    ckpt_dir: str,
    tokenizer_path: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
 ) -> LLaMA:
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())
    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)
    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator
 class LLaMAModel:
    def __init__(self):
        pass
    @classmethod
    def from_pretrained(self, path, max_seq_len=2048, max_batch_size=1):
        tokenizer_path = path / "tokenizer.model"
        path = os.path.abspath(path)
        tokenizer_path = os.path.abspath(tokenizer_path)
        local_rank, world_size = setup_model_parallel()
        if local_rank > 0:
            sys.stdout = open(os.devnull, "w")
        generator = load(
            path, tokenizer_path, local_rank, world_size, max_seq_len, max_batch_size
        )
        result = self()
        result.pipeline = generator
        return result
    def generate(self, prompt, token_count=512, temperature=0.8, top_p=0.95):
        results = self.pipeline.generate(
            [prompt], max_gen_len=token_count, temperature=temperature, top_p=top_p
        )
        return results[0]
--- a/modules/RWKV.py
+++ b/modules/RWKV.py
@ -1,14 +1,17 @@
 import os
 from pathlib import Path
 from queue import Queue
 from threading import Thread
 import numpy as np
 from tokenizers import Tokenizer
 import modules.shared as shared
 np.set_printoptions(precision=4, suppress=True, linewidth=200)
 os.environ['RWKV_JIT_ON'] = '1'
-os.environ["RWKV_CUDA_ON"] = '0' #  '1' : use CUDA kernel for seq mode (much faster)
+os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster)
 from rwkv.model import RWKV
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
@ -32,10 +35,11 @@ class RWKVModel:
        result.pipeline = pipeline
        return result
-    def generate(self, context, token_count=20, temperature=1, top_p=1, alpha_frequency=0.25, alpha_presence=0.25, token_ban=[0], token_stop=[], callback=None):
+    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None):
        args = PIPELINE_ARGS(
            temperature = temperature,
            top_p = top_p,
            top_k = top_k,
            alpha_frequency = alpha_frequency, # Frequency Penalty (as in GPT-3)
            alpha_presence = alpha_presence, # Presence Penalty (as in GPT-3)
            token_ban = token_ban, # ban the generation of some tokens
@ -43,3 +47,64 @@ class RWKVModel:
        )
        return context+self.pipeline.generate(context, token_count=token_count, args=args, callback=callback)
    def generate_with_streaming(self, **kwargs):
        iterable = Iteratorize(self.generate, kwargs, callback=None)
        reply = kwargs['context']
        for token in iterable:
            reply += token
            yield reply
 class RWKVTokenizer:
    def __init__(self):
        pass
    @classmethod
    def from_pretrained(self, path):
        tokenizer_path = path / "20B_tokenizer.json"
        tokenizer = Tokenizer.from_file(os.path.abspath(tokenizer_path))
        result = self()
        result.tokenizer = tokenizer
        return result
    def encode(self, prompt):
        return self.tokenizer.encode(prompt).ids
    def decode(self, ids):
        return self.tokenizer.decode(ids)
 class Iteratorize:
    """
    Transforms a function that takes a callback
    into a lazy iterator (generator).
    """
    def __init__(self, func, kwargs={}, callback=None):
        self.mfunc=func
        self.c_callback=callback
        self.q = Queue(maxsize=1)
        self.sentinel = object()
        self.kwargs = kwargs
        def _callback(val):
            self.q.put(val)
        def gentask():
            ret = self.mfunc(callback=_callback, **self.kwargs)
            self.q.put(self.sentinel)
            if self.c_callback:
                self.c_callback(ret)
        Thread(target=gentask).start()
    def __iter__(self):
        return self
    def __next__(self):
        obj = self.q.get(True,None)
        if obj is self.sentinel:
            raise StopIteration
        else:
            return obj
--- a/modules/chat.py
+++ b/modules/chat.py
@ -51,23 +51,29 @@ def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat
    prompt = ''.join(rows)
    return prompt
-def extract_message_from_reply(question, reply, current, other, check, extensions=False):
+def extract_message_from_reply(question, reply, name1, name2, check, impersonate=False):
    next_character_found = False
    substring_found = False
-    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(current)}:", question)]
+    asker = name1 if not impersonate else name2
-    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(current)}:", reply)]
+    replier = name2 if not impersonate else name1
    idx = idx[len(previous_idx)-1]
-    if extensions:
+    previous_idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", question)]
-        reply = reply[idx + 1 + len(apply_extensions(f"{current}:", "bot_prefix")):]
+    idx = [m.start() for m in re.finditer(f"(^|\n){re.escape(replier)}:", reply)]
    idx = idx[max(len(previous_idx)-1, 0)]
    if not impersonate:
        reply = reply[idx + 1 + len(apply_extensions(f"{replier}:", "bot_prefix")):]
    else:
-        reply = reply[idx + 1 + len(f"{current}:"):]
+        reply = reply[idx + 1 + len(f"{replier}:"):]
    if check:
-        reply = reply.split('\n')[0].strip()
+        lines = reply.split('\n')
        reply = lines[0].strip()
        if len(lines) > 1:
            next_character_found = True
    else:
-        idx = reply.find(f"\n{other}:")
+        idx = reply.find(f"\n{asker}:")
        if idx != -1:
            reply = reply[:idx]
            next_character_found = True
@ -75,7 +81,7 @@ def extract_message_from_reply(question, reply, current, other, check, extension
        # Detect if something like "\nYo" is generated just before
        # "\nYou:" is completed
-        tmp = f"\n{other}:"
+        tmp = f"\n{asker}:"
        for j in range(1, len(tmp)):
            if reply[-j:] == tmp[:j]:
                substring_found = True
@ -89,6 +95,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
    shared.stop_everything = False
    just_started = True
    eos_token = '\n' if check else None
    name1_original = name1
    if 'pygmalion' in shared.model_name.lower():
        name1 = "You"
@ -119,8 +126,9 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
        for reply in generate_reply(f"{prompt}{' ' if len(reply) > 0 else ''}{reply}", max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name1}:"):
            # Extracting the reply
-            reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name2, name1, check, extensions=True)
+            reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check)
-            visible_reply = apply_extensions(reply, "output")
+            visible_reply = re.sub("(<USER>|<user>|{{user}})", name1_original, reply)
            visible_reply = apply_extensions(visible_reply, "output")
            if shared.args.chat:
                visible_reply = visible_reply.replace('\n', '<br>')
@ -139,6 +147,7 @@ def chatbot_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical
                yield shared.history['visible']
            if next_character_found:
                break
    yield shared.history['visible']
 def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, name1, name2, context, check, chat_prompt_size, chat_generation_attempts=1):
@ -152,7 +161,7 @@ def impersonate_wrapper(text, max_new_tokens, do_sample, temperature, top_p, typ
    reply = ''
    for i in range(chat_generation_attempts):
        for reply in generate_reply(prompt+reply, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=eos_token, stopping_string=f"\n{name2}:"):
-            reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, extensions=False)
+            reply, next_character_found, substring_found = extract_message_from_reply(prompt, reply, name1, name2, check, impersonate=True)
            if not substring_found:
                yield reply
            if next_character_found:
--- a/modules/models.py
+++ b/modules/models.py
@ -39,10 +39,9 @@ def load_model(model_name):
    t0 = time.time()
    shared.is_RWKV = model_name.lower().startswith('rwkv-')
    shared.is_LLaMA = model_name.lower().startswith('llama-')
    # Default settings
-    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV or shared.is_LLaMA):
+    if not (shared.args.cpu or shared.args.load_in_8bit or shared.args.auto_devices or shared.args.disk or shared.args.gpu_memory is not None or shared.args.cpu_memory is not None or shared.args.deepspeed or shared.args.flexgen or shared.is_RWKV):
        if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')):
            model = AutoModelForCausalLM.from_pretrained(Path(f"models/{shared.model_name}"), device_map='auto', load_in_8bit=True)
        else:
@ -80,20 +79,12 @@ def load_model(model_name):
    # RMKV model (not on HuggingFace)
    elif shared.is_RWKV:
-        from modules.RWKV import RWKVModel
+        from modules.RWKV import RWKVModel, RWKVTokenizer
        model = RWKVModel.from_pretrained(Path(f'models/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda")
        tokenizer = RWKVTokenizer.from_pretrained(Path('models'))
-        return model, None
+        return model, tokenizer
    # LLaMA model (not on HuggingFace)
    elif shared.is_LLaMA:
        import modules.LLaMA
        from modules.LLaMA import LLaMAModel
        model = LLaMAModel.from_pretrained(Path(f'models/{model_name}'))
        return model, None
    # Custom
    else:
--- a/modules/shared.py
+++ b/modules/shared.py
@ -6,7 +6,6 @@ model_name = ""
 soft_prompt_tensor = None
 soft_prompt = False
 is_RWKV = False
 is_LLaMA = False
 # Chat variables
 history = {'internal': [], 'visible': []}
@ -44,7 +43,6 @@ settings = {
        'default': 'NovelAI-Sphinx Moth',
        'pygmalion-*': 'Pygmalion',
        'RWKV-*': 'Naive',
        'llama-*': 'Naive',
        '(rosey|chip|joi)_.*_instruct.*': 'Instruct Joi (Contrastive Search)'
    },
    'prompts': {
@ -84,9 +82,10 @@ parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, defaul
 parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.')
 parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.')
 parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.')
-parser.add_argument('--rwkv-strategy', type=str, default=None, help='The strategy to use while loading RWKV models. Examples: "cpu fp32", "cuda fp16", "cuda fp16 *30 -> cpu fp32".')
+parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".')
 parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.')
 parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time. This improves the text generation performance.')
-parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example.')
+parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example. If you create a file called settings.json, this file will be loaded by default without the need to use the --settings flag.')
 parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.')
 parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.')
 parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.')
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@ -21,12 +21,11 @@ def get_max_prompt_length(tokens):
    return max_length
 def encode(prompt, tokens_to_generate=0, add_special_tokens=True):
-
+    if shared.is_RWKV:
-    # These models do not have explicit tokenizers for now, so
+        input_ids = shared.tokenizer.encode(str(prompt))
-    # we return an estimate for the number of tokens
+        input_ids = np.array(input_ids).reshape(1, len(input_ids))
-    if shared.is_RWKV or shared.is_LLaMA:
+        return input_ids
-        return np.zeros((1, len(prompt)//4))
+    else:
        input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', truncation=True, max_length=get_max_prompt_length(tokens_to_generate), add_special_tokens=add_special_tokens)
        if shared.args.cpu:
            return input_ids
@ -81,26 +80,30 @@ def formatted_outputs(reply, model_name):
    else:
        return reply
-def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
+def clear_torch_cache():
    gc.collect()
    if not shared.args.cpu:
        torch.cuda.empty_cache()
 def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typical_p, repetition_penalty, top_k, min_length, no_repeat_ngram_size, num_beams, penalty_alpha, length_penalty, early_stopping, eos_token=None, stopping_string=None):
    clear_torch_cache()
    t0 = time.time()
    # These models are not part of Hugging Face, so we handle them
    # separately and terminate the function call earlier
-    if shared.is_RWKV or shared.is_LLaMA:
+    if shared.is_RWKV:
        if shared.args.no_stream:
-            reply = shared.model.generate(question, token_count=max_new_tokens, temperature=temperature, top_p=top_p)
+            reply = shared.model.generate(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k)
            t1 = time.time()
            print(f"Output generated in {(t1-t0):.2f} seconds.")
            yield formatted_outputs(reply, shared.model_name)
        else:
-            for i in tqdm(range(max_new_tokens//8+1)):
+            yield formatted_outputs(question, shared.model_name)
-                reply = shared.model.generate(question, token_count=8, temperature=temperature, top_p=top_p)
+            # RWKV has proper streaming, which is very nice.
            # No need to generate 8 tokens at a time.
            for reply in shared.model.generate_with_streaming(context=question, token_count=max_new_tokens, temperature=temperature, top_p=top_p, top_k=top_k):
                yield formatted_outputs(reply, shared.model_name)
-                question = reply
+
        t1 = time.time()
        print(f"Output generated in {(t1-t0):.2f} seconds.")
        return
    original_question = question
@ -111,8 +114,7 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
    input_ids = encode(question, max_new_tokens)
    cuda = "" if (shared.args.cpu or shared.args.deepspeed or shared.args.flexgen) else ".cuda()"
-    n = shared.tokenizer.eos_token_id if eos_token is None else encode(eos_token)[0][-1]
+    n = shared.tokenizer.eos_token_id if eos_token is None else int(encode(eos_token)[0][-1])
    if stopping_string is not None:
        # The stopping_criteria code below was copied from
        # https://github.com/PygmalionAI/gradio-ui/blob/master/src/model.py
@ -149,14 +151,12 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
            f"temperature={temperature}",
            f"stop={n}",
        ]
    if shared.args.deepspeed:
        generate_params.append("synced_gpus=True")
    if shared.args.no_stream:
        generate_params.append("max_new_tokens=max_new_tokens")
    else:
        generate_params.append("max_new_tokens=8")
    if shared.soft_prompt:
        inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
        generate_params.insert(0, "inputs_embeds=inputs_embeds")
@ -184,6 +184,8 @@ def generate_reply(question, max_new_tokens, do_sample, temperature, top_p, typi
        yield formatted_outputs(original_question, shared.model_name)
        shared.still_streaming = True
        for i in tqdm(range(max_new_tokens//8+1)):
            clear_torch_cache()
            with torch.no_grad():
                output = eval(f"shared.model.generate({', '.join(generate_params)}){cuda}")[0]
            if shared.soft_prompt:
--- a/presets/Naive.txt
+++ b/presets/Naive.txt
@ -1,3 +1,4 @@
 do_sample=True
-top_p=0.95
+temperature=0.7
-temperature=0.8
+top_p=0.85
 top_k=50
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,8 @@ bitsandbytes==0.37.0
 flexgen==0.1.7
 gradio==3.18.0
 numpy
-rwkv==0.0.6
+rwkv==0.1.0
 safetensors==0.2.8
 git+https://github.com/huggingface/transformers
 tensorboard
 sentencepiece
 git+https://github.com/oobabooga/transformers@llama_push
--- a/server.py
+++ b/server.py
@ -22,8 +22,14 @@ if (shared.args.chat or shared.args.cai_chat) and not shared.args.no_stream:
    print('Warning: chat mode currently becomes somewhat slower with text streaming on.\nConsider starting the web UI with the --no-stream option.\n')
 # Loading custom settings
 settings_file = None
 if shared.args.settings is not None and Path(shared.args.settings).exists():
-    new_settings = json.loads(open(Path(shared.args.settings), 'r').read())
+    settings_file = Path(shared.args.settings)
 elif Path('settings.json').exists():
    settings_file = Path('settings.json')
 if settings_file is not None:
    print(f"Loading settings from {settings_file}...")
    new_settings = json.loads(open(settings_file, 'r').read())
    for item in new_settings:
        shared.settings[item] = new_settings[item]