Generalize multimodality (llava/minigpt4 7b and 13b now supported) (#1741)

2024-10-01 01:26:03 -04:00 · 2023-05-10 01:18:02 +02:00 · 2023-05-10 01:18:02 +02:00 · e9e75a9ec7
commit e9e75a9ec7
parent a2b25322f0
22 changed files with 812 additions and 371 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,6 +4,7 @@ training/datasets
 extensions/silero_tts/outputs
 extensions/elevenlabs_tts/outputs
 extensions/sd_api_pictures/outputs
 extensions/multimodal/pipelines
 logs
 loras
 models
--- a/README.md
+++ b/README.md
@ -31,6 +31,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 * [llama.cpp](docs/llama.cpp-models.md)
 * [RWKV model](docs/RWKV-model.md)
 * [LoRA (loading and training)](docs/Using-LoRAs.md)
 * [Multimodal pipelines, including LLaVA and MiniGPT-4](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal)
 * Softprompts
 * [Extensions](docs/Extensions.md) - see the [user extensions list](https://github.com/oobabooga/text-generation-webui-extensions)
@ -281,6 +282,12 @@ Optionally, you can use the following command-line flags:
 | `--api`                               | Enable the API extension. |
 | `--public-api`                        | Create a public URL for the API using Cloudfare. |
 #### Multimodal
 | Flag                                  | Description |
 |---------------------------------------|-------------|
 | `--multimodal-pipeline PIPELINE`      | The multimodal pipeline to use. Examples: `llava-7b`, `llava-13b`. |
 Out of memory errors? [Check the low VRAM guide](docs/Low-VRAM-guide.md).
 ## Presets
--- a/characters/instruction-following/LLaVA.yaml
+++ b/characters/instruction-following/LLaVA.yaml
@ -1,4 +1,4 @@
-user: "### Human"
+user: "### Human:"
-bot: "### Assistant"
+bot: "### Assistant:"
-turn_template: "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n"
+turn_template: "<|user|> <|user-message|><|bot|> <|bot-message|>\n"
-context: "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.\n### Human: \nHi!\n### Assistant: \nHi there! How can I help you today?\n"
+context: "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.### Human: Hi!### Assistant: Hi there! How can I help you today?\n"
--- a/docs/Extensions.md
+++ b/docs/Extensions.md
@ -33,7 +33,7 @@ Most of these have been created by the extremely talented contributors that you
 |[send_pictures](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/send_pictures/)| Creates an image upload field that can be used to send images to the bot in chat mode. Captions are automatically generated using BLIP. |
 |[whisper_stt](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/whisper_stt)| Allows you to enter your inputs in chat mode using your microphone. |
 |[sd_api_pictures](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/sd_api_pictures)| Allows you to request pictures from the bot in chat mode, which will be generated using the AUTOMATIC1111 Stable Diffusion API. See examples [here](https://github.com/oobabooga/text-generation-webui/pull/309). |
-|[llava](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/llava) | Adds LLaVA multimodal model support. For detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/llava/README.md) in the extension directory. |
+|[multimodal](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal) | Adds multimodality support (text+images). For detailed description see [README.md](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal/README.md) in the extension directory. |
 |[openai](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/openai)| Creates an API that mimics the OpenAI API and can be used as a drop-in replacement. |
 |[superbooga](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/superbooga)| An extension that uses ChromaDB to create an arbitrarily large pseudocontext, taking as input text files, URLs, or pasted text. Based on https://github.com/kaiokendev/superbig. |
@ -50,7 +50,8 @@ Most of these have been created by the extremely talented contributors that you
 | `def bot_prefix_modifier(string)`  | Applied in chat mode to the prefix for the bot's reply (more on that below). |
 | `def custom_generate_reply(...)` | Overrides the main text generation function. |
 | `def custom_generate_chat_prompt(...)` | Overrides the prompt generator in chat mode. |
-| `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See `llava` extension for an example |
+| `def tokenizer_modifier(state, prompt, input_ids, input_embeds)` | Modifies the `input_ids`/`input_embeds` fed to the model. Should return `prompt`, `input_ids`, `input_embeds`. See `multimodal` extension for an example |
 | `def custom_tokenized_length(prompt)` | Used in conjunction with `tokenizer_modifier`, returns the length in tokens of `prompt`. See `multimodal` extension for an example |
 Additionally, the script may define two special global variables:
@ -78,7 +79,7 @@ input_hijack = {
 ```
 This is only relevant in chat mode. If your extension sets `input_hijack['state']` to `True` at any moment, the next call to `modules.chat.chatbot_wrapper` will use the values inside `input_hijack['value']` as the user input for text generation. See the `send_pictures` extension above for an example. 
-Additionally, your extension can set the value to be a callback, in the form of `def cb(text: str, visible_text: str) -> [str, str]`. See the `llava` extension above for an example.
+Additionally, your extension can set the value to be a callback, in the form of `def cb(text: str, visible_text: str) -> [str, str]`. See the `multimodal` extension above for an example.
 ## The `bot_prefix_modifier`
@ -100,13 +101,22 @@ Marie Antoinette will become very enthusiastic in all her messages.
 In order to use your extension, you must start the web UI with the `--extensions` flag followed by the name of your extension (the folder under `text-generation-webui/extension` where `script.py` resides).
-You can activate more than one extension at a time by providing their names separated by spaces. The input, output and bot prefix modifiers will be applied in the specified order. For `custom_generate_chat_prompt`, only the first declaration encountered will be used and the rest will be ignored.
+You can activate more than one extension at a time by providing their names separated by spaces. The input, output and bot prefix modifiers will be applied in the specified order. 
 ```
 python server.py --extensions enthusiasm translate # First apply enthusiasm, then translate
 python server.py --extensions translate enthusiasm # First apply translate, then enthusiasm
 ```
 Do note, that for:
 - `custom_generate_chat_prompt`
 - `custom_generate_reply`
 - `tokenizer_modifier`
 - `custom_tokenized_length`
 only the first declaration encountered will be used and the rest will be ignored. 
 ## `custom_generate_reply` example
 Once defined in a `script.py`, this function is executed in place of the main generation functions. You can use it to connect the web UI to an external API, or to load a custom model that is not supported yet.
@ -167,7 +177,7 @@ def custom_generate_chat_prompt(user_input, state, **kwargs):
    # Building the prompt
    i = len(shared.history['internal']) - 1
-    while i >= 0 and len(encode(''.join(rows))[0]) < max_length:
+    while i >= 0 and get_encoded_length(''.join(rows)) < max_length:
        if _continue and i == len(shared.history['internal']) - 1:
            rows.insert(1, bot_turn_stripped + shared.history['internal'][i][1].strip())
        else:
@ -190,7 +200,7 @@ def custom_generate_chat_prompt(user_input, state, **kwargs):
        # Adding the Character prefix
        rows.append(apply_extensions("bot_prefix", bot_turn_stripped.rstrip(' ')))
-    while len(rows) > min_rows and len(encode(''.join(rows))[0]) >= max_length:
+    while len(rows) > min_rows and get_encoded_length(''.join(rows)) >= max_length:
        rows.pop(1)
    prompt = ''.join(rows)
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@ -3,7 +3,7 @@ import traceback
 from threading import Thread
 from typing import Callable, Optional
-from modules.text_generation import encode
+from modules.text_generation import get_encoded_length
 def build_parameters(body):
@ -11,7 +11,7 @@ def build_parameters(body):
    prompt_lines = [k.strip() for k in prompt.split('\n')]
    max_context = body.get('max_context_length', 2048)
-    while len(prompt_lines) >= 0 and len(encode('\n'.join(prompt_lines))) > max_context:
+    while len(prompt_lines) >= 0 and get_encoded_length('\n'.join(prompt_lines)) > max_context:
        prompt_lines.pop(0)
    prompt = '\n'.join(prompt_lines)
--- a/extensions/llava/README.md
+++ b/extensions/llava/README.md
@ -1,71 +0,0 @@
 # LLaVA
 ## Description
 Adds [LLaVA 13B](https://github.com/haotian-liu/LLaVA) multimodality support to text-generation-webui.
 https://user-images.githubusercontent.com/3718215/233817203-69b57e77-0c55-4fd6-b742-3204bb13b8fc.mp4
 ## LLaVA-7B
 7B version currently isn't supported. It will be supported if/when [more generic multimodality support](https://github.com/oobabooga/text-generation-webui/discussions/1687) gets implemented.
 ## Usage
 To run this extension, download LLaVA weights, for example from [here](https://huggingface.co/wojtab/llava-13b-v0-4bit-128g) (note: it's a 4-bit [GPTQ quantization](https://github.com/oobabooga/text-generation-webui/tree/main/docs/GPTQ-models-(4-bit-mode).md), done on "old CUDA" branch), and then start server.py with `--extensions llava` argument.
 Do note, that each image takes up 258 tokens, so adjust max_new_tokens to be at most 1700 (recommended value is between 200 to 500), so the images don't get truncated.
 To send an image, just upload it to the extension field below chat, and send a prompt as always. The image will be added to the end of your message. If you wish to modify the placement, include a string `<image>` in your prompt.
 Additionally, there is *Embed all images, not only the last one* checkbox. It modifies the image embeddings, by default (if it's unchecked), all but the most recent images have their embeddings empty, so they are not fed to the network. From initial testing, it seems as LLaVA considers the features in all images at the same time, so by default the extension skips previous images. If you want to include them anyway, just tick this checkbox.
 ## Extension config
 This extension uses following parameters (from settings.json):
 |Parameter|Description|
 |---------|-----------|
 |`llava-clip_bits`|Number of bits to load CLIP feature extractor in (either 32 or 16, default=32)|
 |`llava-clip_device`|Torch device to run the extractor on, for example `cpu` or `cuda:0`, by default `cuda:0` if available|
 |`llava-clip_repo`|Huggingface repository of CLIP model, `openai/clip-vit-large-patch14` by default. There should be no need to change it|
 |`llava-projector_bits`|Number of bits to load CLIP->LLaMA feature projector in (either 32 or 16, default=32)|
 |`llava-projector_device`|Torch device to run the CLIP->LLaMA feature projector on, for example `cpu` or `cuda:0`, by default `cuda:0` if available|
 |`llava-projector_repo`|Huggingface repository of multimodal projector, `liuhaotian/LLaVA-13b-delta-v0` by default. There should be no need to change it|
 |`llava-projector_filename`|The filename of multimodal projector weights, `mm_projector.bin` by default. There should be no need to change it|
 |`llava-add_all_images_to_prompt`|Default value of "Embed all images, not only the last one" checkbox|
 ## Technical description
 ### Original LLaVA
 The default LLaVA implementation uses modified `transformers` library, however this extension forgoes this requirement. The transformers are modified in LLaVA in such a way, that the entire LLaVA model gets loaded, and the inference now looks as follows:
 ```
 images --> CLIP --> projector --> input embeddings for images --> | 
                                                                  | --> LLaMA
 prompt -------------------------> input embeddings for text ----> |
 ```
 The images are represented in the prompt by the following token IDs:
 - 32000 - `<im_patch>` - placeholder token for embeddings from projector
 - 32001 - `<im_start>` - token marking start of an image
 - 32002 - `<im_end>` - token marking end of an image
 By default, image will be represented as `<im_start><im_patch>*256<im_end>`. The input embeddings for an image are converted with a single linear layer of the projector, then they are placed instead of `<im_patch>` tokens.
 The concatenated prompt then gets fed to fine-tuned LLaMA.
 ### In this extension
 Using default transformers, they only load the LLaMA part of LLaVA, ignoring the added projector weights, and not loading CLIP. We then reconstruct the `images -> CLIP -> projector` pipeline ourselves, then concatenate the input embeddings, and feed it to LLaMA loaded by transformers. This allows us to use normal flow from webui to load this model, and just hijack the model input with additional features.
 Splitting it to 3 separate models, allows us to configure each of them, and to move them to different devices(for example we can run CLIP+projector on CPU and LLaMA on GPU). Also, it enables us to use 4-bit GPTQ quantization for LLaVA, massively cutting down the VRAM requirement (it should be possible to fit on 12GB of VRAM with full context size by moving CLIP and projector to CPU).
 ### Usage through API
 You can run the multimodal inference through API, by inputting the images to prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base-64 jpeg data. Python example:
 ```Python
 import base64
 import requests
 CONTEXT = "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.\n### Human: \nHi!\n### Assistant: \nHi there! How can I help you today?\n"
 with open('extreme_ironing.jpg', 'rb') as f:
    img_str = base64.b64encode(f.read()).decode('utf-8')
    prompt = CONTEXT + f'### Human: \nWhat is unusual about this image: \n<img src="data:image/jpeg;base64,{img_str}">\n### Assistant: \n'
    print(requests.post('http://127.0.0.1:5000/api/v1/generate', json={'prompt': prompt, 'stopping_strings': ['\n###']}).json())
 ```
 script output:
 ```Python
 {'results': [{'text': "The unusual aspect of this image is that a man is standing on top of a yellow minivan while doing his laundry. He has set up a makeshift clothes line using the car's rooftop as an outdoor drying area. This scene is uncommon because people typically do their laundry indoors, in a dedicated space like a laundromat or a room in their home, rather than on top of a moving vehicle. Additionally, hanging clothes on the car could be potentially hazardous or illegal in some jurisdictions due to the risk of damaging the vehicle or causing accidents on the road.\n##"}]}
 ```
--- a/extensions/llava/script.py
+++ b/extensions/llava/script.py
@ -1,272 +1,6 @@
 import base64
 import re
 import time
 from dataclasses import dataclass
 from functools import partial
 from io import BytesIO
 import gradio as gr
-import torch
+import logging
 from huggingface_hub import hf_hub_download
 from PIL import Image
 from transformers import CLIPImageProcessor, CLIPVisionModel
 from modules import shared
 from modules.extensions import apply_extensions
 from modules.text_generation import encode, get_max_prompt_length
 params = {
    "add_all_images_to_prompt": False,
    # device to run CLIP on
    "clip_device": None,
    # bits to load clip in either 32 or 16 (it doesn't support 8-bit)
    "clip_bits": 32,
    # clip repository
    "clip_repo": "openai/clip-vit-large-patch14",
    # device to run projector on
    "projector_device": None,
    # projector bits, either 32 or 16
    "projector_bits": 32,
    # projector repository
    "projector_repo": "liuhaotian/LLaVA-13b-delta-v0",
    # file with the projector weights
    "projector_file": "mm_projector.bin"
 }
 # If 'state' is True, will hijack the next chat generation
 input_hijack = {
    'state': False,
    'value': ["", ""]
 }
 # initialized in ui, so that params are loaded from settings
 llava_embedder = None
@dataclass
 class Token:
    token: str
    id: int
 class LLaVAEmbedder:
    IM_PATCH = Token("<im_patch>", 32000)
    IM_START = Token("<im_start>", 32001)
    IM_END = Token("<im_end>", 32002)
    def __init__(self):
        self.clip_device = self._get_device("clip_device")
        self.clip_dtype = self._get_dtype("clip_bits")
        self.projector_device = self._get_device("projector_device")
        self.projector_dtype = self._get_dtype("projector_bits")
        self.image_processor, self.vision_tower, self.mm_projector = self._load_models()
    def _get_device(self, setting_name):
        if params[setting_name] is None:
            return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        return torch.device(params[setting_name])
    def _get_dtype(self, setting_name):
        return torch.float32 if int(params[setting_name]) == 32 else torch.float16
    def _load_models(self):
        start_ts = time.time()
        print(f"LLaVA - Loading CLIP from {params['clip_repo']} as {self.clip_dtype} on {self.clip_device}...")
        image_processor = CLIPImageProcessor.from_pretrained(params["clip_repo"], torch_dtype=self.clip_dtype)
        vision_tower = CLIPVisionModel.from_pretrained(params["clip_repo"], torch_dtype=self.clip_dtype).to(self.clip_device)
        print(f"LLaVA - Loading projector from {params['projector_repo']} as {self.projector_dtype} on {self.projector_device}...")
        projector_path = hf_hub_download(params["projector_repo"], params["projector_file"])
        mm_projector = torch.nn.Linear(1024, 5120)
        projector_data = torch.load(projector_path)
        mm_projector.weight = torch.nn.Parameter(projector_data['model.mm_projector.weight'].to(dtype=self.projector_dtype), False)
        mm_projector.bias = torch.nn.Parameter(projector_data['model.mm_projector.bias'].to(dtype=self.projector_dtype), False)
        mm_projector = mm_projector.to(self.projector_device)
        print(f"LLaVA supporting models loaded, took {time.time() - start_ts:.2f} seconds")
        return image_processor, vision_tower, mm_projector
    def _update_prompt(self, prompt, images):
        for _ in images:
            # replace the image token with the image patch token in the prompt (each occurrence)
            replace_token = LLaVAEmbedder.IM_PATCH.token * 256
            replace_token = LLaVAEmbedder.IM_START.token + replace_token + LLaVAEmbedder.IM_END.token
            prompt = re.sub(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">', replace_token, prompt, 1)
        return prompt
    def _extract_image_features(self, images):
        images = self.image_processor(images, return_tensors='pt')['pixel_values']
        images = images.to(self.clip_device, dtype=self.clip_dtype)
        with torch.no_grad():
            image_forward_outs = self.vision_tower(images, output_hidden_states=True)
            select_hidden_state_layer = -2
            select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
            image_features = select_hidden_state[:, 1:].to(self.projector_device, dtype=self.projector_dtype)
            image_features = self.mm_projector(image_features)
        return image_features
    def forward(self, prompt, images, state):
        prompt = self._update_prompt(prompt, images)
        input_ids = encode(prompt, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))[0]
        input_embeds = shared.model.model.embed_tokens(input_ids).to(self.projector_device)
        if input_ids[0] == LLaVAEmbedder.IM_PATCH.id:
            # prompt got truncated in the middle of an image, remove the image data
            im_end = torch.where(input_ids == LLaVAEmbedder.IM_END.id)[0][0]
            input_ids = input_ids[im_end+1:]
            input_embeds = input_embeds[im_end+1:]
            leftover_images = torch.where(input_ids == LLaVAEmbedder.IM_START.id)[0].shape[0]
            print(f"LLaVA - WARNING: removed {len(images) - leftover_images} image(s) from prompt. The generation might be broken, try decreasing max_new_tokens")
            images = images[-leftover_images:]
            if len(images) == 0:
                return prompt, input_ids, input_embeds, 0
        total_embedded = 0
        image_features = self._extract_image_features(images).to(self.projector_device)
        image_start_tokens = torch.where(input_ids == LLaVAEmbedder.IM_START.id)[0]
        if not torch.any(input_ids == LLaVAEmbedder.IM_PATCH.id) or len(image_start_tokens) == 0:
            # multimodal LLM, but the current prompt is not multimodal/truncated
            return prompt, input_ids, input_embeds, total_embedded
        cur_image_idx = 0
        if not params['add_all_images_to_prompt']:
            image_start_tokens = [image_start_tokens[-1]]
            cur_image_idx = -1
        for image_start_token_pos in image_start_tokens:
            cur_image_features = image_features[cur_image_idx]
            num_patches = cur_image_features.shape[0]
            input_embeds = torch.cat((input_embeds[:image_start_token_pos+1], cur_image_features, input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
            cur_image_idx += 1
            total_embedded += 1
        return prompt, input_ids, input_embeds, total_embedded
    @staticmethod
    def len_in_tokens(text):
        images = re.findall(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', text)
        image_tokens = 0
        for _ in images:
            image_tokens += 258
        return len(encode(re.sub(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', '', text))[0]) + image_tokens
 def add_chat_picture(picture, text, visible_text):
    # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
    max_hw, min_hw = max(picture.size), min(picture.size)
    aspect_ratio = max_hw / min_hw
    shortest_edge = int(max(300 / aspect_ratio, 224))
    longest_edge = int(shortest_edge * aspect_ratio)
    w = shortest_edge if picture.width < picture.height else longest_edge
    h = shortest_edge if picture.width >= picture.height else longest_edge
    picture = picture.resize((w,h))
    buffer = BytesIO()
    picture.save(buffer, format="JPEG")
    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
    image = f'<img src="data:image/jpeg;base64,{img_str}">'
    if '<image>' in text:
        text = text.replace('<image>', image)
    else:
        text = text + '\n' + image
    if visible_text == '' or visible_text is None:
        visible_text = text
    elif '<image>' in visible_text:
        visible_text = visible_text.replace('<image>', image)
    else:
        visible_text = visible_text + '\n' + image
    return text, visible_text
 def custom_generate_chat_prompt(user_input, state, **kwargs):
    impersonate = kwargs['impersonate'] if 'impersonate' in kwargs else False
    _continue = kwargs['_continue'] if '_continue' in kwargs else False
    also_return_rows = kwargs['also_return_rows'] if 'also_return_rows' in kwargs else False
    rows = [f"{state['context'].strip()}\n"]
    min_rows = 3
    # Finding the maximum prompt size
    chat_prompt_size = state['chat_prompt_size']
    if shared.soft_prompt:
        chat_prompt_size -= shared.soft_prompt_tensor.shape[1]
    max_length = min(get_max_prompt_length(state), chat_prompt_size)
    prefix1 = f"{state['name1']}: "
    prefix2 = f"{state['name2']}: "
    i = len(shared.history['internal']) - 1
    while i >= 0 and LLaVAEmbedder.len_in_tokens(''.join(rows)) < max_length:
        if _continue and i == len(shared.history['internal']) - 1:
            rows.insert(1, f"{prefix2}{shared.history['internal'][i][1]}")
        else:
            rows.insert(1, f"{prefix2}{shared.history['internal'][i][1].strip()}\n")
        string = shared.history['internal'][i][0]
        if string != '':
            rows.insert(1, f"{prefix1}{string.strip()}\n")
        i -= 1
    if impersonate:
        min_rows = 2
        rows.append(f"{prefix1}")
    elif not _continue:
        # Adding the user message
        if len(user_input) > 0:
            rows.append(f"{prefix1}{user_input}\n")
        # Adding the Character prefix
        rows.append(apply_extensions("bot_prefix", f"{prefix2}"))
    while len(rows) > min_rows and LLaVAEmbedder.len_in_tokens(''.join(rows)) >= max_length:
        rows.pop(1)
    prompt = ''.join(rows)
    if also_return_rows:
        return prompt, rows
    else:
        return prompt
 def tokenizer_modifier(state, prompt, input_ids, input_embeds):
    global params
    start_ts = time.time()
    image_matches = re.finditer(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">', prompt)
    images = [Image.open(BytesIO(base64.b64decode(match.group(1)))) for match in image_matches]
    if len(images) == 0:
        return prompt, input_ids, input_embeds
    prompt, input_ids, input_embeds, total_embedded = llava_embedder.forward(prompt, images, state)
    print(f'LLaVA - Embedded {total_embedded} image(s) in {time.time()-start_ts:.2f}s')
    return (prompt,
        input_ids.unsqueeze(0).to(shared.model.device, dtype=torch.int64),
        input_embeds.unsqueeze(0).to(shared.model.device, dtype=shared.model.dtype))
 def ui():
-    global llava_embedder
+    gr.Markdown("### This extension is deprecated, use \"multimodal\" extension instead")
-    llava_embedder = LLaVAEmbedder()
+    logging.error("LLaVA extension is deprecated, use \"multimodal\" extension instead")
    with gr.Column():
        picture_select = gr.Image(label='Send a picture', type='pil')
        # I found that it doesn't deal super well with multiple images, and demo ui had a bug where it included only the last image anyway
        single_image_checkbox = gr.Checkbox(False, label='Embed all images, not only the last one')
    # Prepare the input hijack
    picture_select.upload(
        lambda picture: input_hijack.update({"state": True, "value": partial(add_chat_picture, picture)}),
        [picture_select],
        None
    )
    picture_select.clear(lambda: input_hijack.update({"state": False, "value": ["",""]}), None, None)
    single_image_checkbox.change(lambda x: params.update({"add_all_images_to_prompt": x}), single_image_checkbox, None)
    shared.gradio['Generate'].click(lambda: None, None, picture_select)
    shared.gradio['textbox'].submit(lambda: None, None, picture_select)
--- a/extensions/multimodal/DOCS.md
+++ b/extensions/multimodal/DOCS.md
@ -0,0 +1,85 @@
 # Technical description of multimodal extension
 ## Working principle
 Multimodality extension does most of the stuff which is required for any image input:
 - adds the UI
 - saves the images as base64 JPEGs to history
 - provides the hooks to the UI
 - if there are images in the prompt, it:
    - splits the prompt to text and image parts
    - adds image start/end markers to text parts, then encodes and embeds the text parts
    - calls the vision pipeline to embed the images
    - stitches the embeddings together, and returns them to text generation
 - loads the appropriate vision pipeline, selected either from model name, or by specifying --multimodal-pipeline parameter
 Now, for the pipelines, they:
 - load the required vision models
 - return some consts, for example the number of tokens taken up by image
 - and most importantly: return the embeddings for LLM, given a list of images
 ## Prompts/history
 To save images in prompt/history, this extension is using a base64 JPEG, wrapped in a HTML tag, like so:
 ```
 <img src="data:image/jpeg;base64,{img_str}">
 ```
 where `{img_str}` is the actual image data. This format makes displaying them in the UI for free. Do note, that this format is required to be exactly the same, the regex used to find the images is: `<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">`.
 ## LLM input
 To describe the input, let's see it on an example prompt:
 ```
 text1<image1>text2<image2>text3
 ```
 where `textN` is N-th text, `<imageN>` is N-th image, in HTML format specified above.
 **The first step is to split the prompt into image/text parts**, so we get:
 ```
 ['text1', '<image1>', 'text2', '<image2>', 'text3']
 ```
 this is done in `MultimodalEmbedder._split_prompt(...)` function, which returns a list of `PromptPart`s - dataclasses wrapping the separate parts.
 This function also appends the image start/end markers to text, which are provided by `AbstractMultimodalPipeline.image_start()` / `AbstractMultimodalPipeline.image_end()` functions. If image start is `<Img>`, and end is `</Img>`, this function will return:
 ```
 ['text1<Img>', '<image1>', '</Img>text2<Img>', '<image2>', '</Img>text3']
 ```
 **The returned prompt parts are then turned into token embeddings.**
 First, they are modified to token IDs, for the text it is done using standard `modules.text_generation.encode()` function, and for the images the returned token IDs are changed to placeholders. The placeholder is a list of `N` times `placeholder token id`, where `N` is specified using `AbstractMultimodalPipeline.num_image_embeds()`, and placeholder token IDs using  `AbstractMultimodalPipeline.placeholder_token_id()`.
 Now, based on the token IDs, the prompt might get truncated, especially if `max_new_tokens` are unreasonably high. Unfortunately, it can't be done simply, just by trimming the prompt to be short enough. This way will lead to sometimes splitting the prompt in the middle of an image embedding, which usually breaks the generation. Therefore, in this case, the entire image needs to be removed from input. This is done inside `MultimodalEmbedder._encode_text(...)` function.
 **After the tokenization, the tokens need to get embedded**, the text and images are once again treated separately.
 The text parts are turned to embeddings, using `AbstractMultimodalPipeline.embed_tokens(...)` function. It uses standard embedding function from the model, but to support many LLMs, the actual function is returned by the pipeline (as it might be different for different LLMs), for LLaMA it is `shared.model.model.embed_tokens(...)`.
 The image parts are turned to embeddings, using `AbstractMultimodalPipeline.embed_images(...)` function. This function is specific for a given pipeline, it takes the images as input, forwards them through vision model/projector, and returns the embeddings.
 **Now, the returned embeddings are stitched together**, using `torch.cat()`, this is creating the final input to the LLM.
 ## Pipelines
 All of the pipelines should subclass `AbstractMultimodalPipeline` class. The idea is to allow for new pipelines to be added in the same way as user extensions - git clone into `extensions/multimodal/pipelines`.
 The pipelines are the description of the vision part, containing vision model/multimodal projector. All of the pipelines should have an unique `name()`, which is then selected by user, in `--multimodal-pipeline` CLI argument. For an example, see `pipelines/llava/llava.py`.
 ## Pipeline modules
 Pipelines are organized into "pipeline modules" - subdirectories in `pipelines` directory. The pipeline modules should contain a file called `pipelines.py`, that should contain the following fields:
 - `available_pipelines: List[str]` - list of pipelines provided by this module, shown as the list of available pipelines to the user
 - `def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]`: - a function to get a concrete pipeline by `name`, if `name` doesn't match any, should return `None`. `params` is the user settings for multimodal extension
 - `def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]`: - a function to get a pipeline from `model_name`, should be eager to return `None`, unless the determination can be done clearly (for example: minigpt-4 bases on vicuna - it should never return the pipeline, but llava can, as it has its own specific LLM finetune)
 **NOTE**: A pipeline module should lazy-import the pipelines only when necessary, and it should keep its imports to minimum
 ## Pipeline params
 The pipelines will get the extension `params` in the constructor. They should honor the following fields:
 - `vision_device` - string, specifying `torch.device` to run the vision model (CLIP/ViT) on
 - `vision_bits` - int, number of fp bits to load the vision model(s) in
 - `projector_device` - string, specifying `torch.device` to run the projector models (Linear layers, QFormer, etc.) on
 - `projector_bits` - int, number of fp bits to load the projector models in
 As a helper, `AbstractMultimodalPipeline` has `_get_device(self, setting_name: str, params: dict)` and `_get_dtype(self, setting_name: str, params: dict)` helper functions, which parse string/int and return `torch.device` / `torch.dtype`.
--- a/extensions/multimodal/README.md
+++ b/extensions/multimodal/README.md
@ -0,0 +1,78 @@
 # Multimodal
 ## Description
 Adds support for multimodality (text+images) to text-generation-webui.
 https://user-images.githubusercontent.com/3718215/233817203-69b57e77-0c55-4fd6-b742-3204bb13b8fc.mp4
 ## Usage
 To run this extension, download a LLM that supports multimodality, and then start server.py with the appropriate `--multimodal-pipeline` argument. Examples:
 ```
 python server.py --model wojtab_llava-7b-v0-4bit-128g --multimodal-pipeline llava-7b --chat
 python3 server.py --model wojtab_llava-13b-v0-4bit-128g --multimodal-pipeline llava-13b --chat
 python server.py --model anon8231489123_vicuna-13b-GPTQ-4bit-128g --multimodal-pipeline minigpt4-13b --chat
 python server.py --model llama-7b-4bit --multimodal-pipeline minigpt4-7b --chat
 ```
 There is built-in support for LLaVA-v0-13B and LLaVA-v0-7b. To install `minigpt4`:
 - clone https://github.com/Wojtab/minigpt-4-pipeline into `extensions/multimodal/pipelines`
 - install the requirements.txt
 The same procedure should be used to install other pipelines, which can then me used with `--multimodal-pipeline [pipeline name]`. For additional multimodal pipelines refer to compatibility section below.
 Do note, that each image takes up a considerable amount of tokens, so adjust `max_new_tokens` to be at most 1700 (recommended value is between 200 to 500), so the images don't get truncated.
 To send an image, just upload it to the extension field below chat, and send a prompt as always. The image will be added to the end of your message. If you wish to modify the placement, include a string `<image>` in your prompt.
 Additionally, there is *Embed all images, not only the last one* checkbox. It modifies the image embeddings, by default (if it's unchecked), all but the most recent images have their embeddings empty, so they are not fed to the network. It seems as some multimodal networks consider the features in all images at the same time as if they were a single image. Due to this behavior, by default the extension skips previous images. However, it can lead to sub-par generation on other pipelines. If you want to include all images, just tick this checkbox.
 ## Compatibility
 As of now, the following multimodal pipelines are supported:
 |Pipeline|`--multimodal-pipeline`|Default LLM|LLM info(for the linked model)|Pipeline repository|
 |-|-|-|-|-|
 |[LLaVA 13B](https://github.com/haotian-liu/LLaVA)|`llava-13b`|[LLaVA 13B](https://huggingface.co/wojtab/llava-13b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
 |[LLaVA 7B](https://github.com/haotian-liu/LLaVA)|`llava-7b`|[LLaVA 7B](https://huggingface.co/wojtab/llava-7b-v0-4bit-128g)|GPTQ 4-bit quant, old CUDA|built-in|
 |[MiniGPT-4 7B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-7b`|[Vicuna v0 7B](https://huggingface.co/TheBloke/vicuna-7B-GPTQ-4bit-128g)|GPTQ 4-bit quant, new format|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
 |[MiniGPT-4 13B](https://github.com/Vision-CAIR/MiniGPT-4)|`minigpt4-13b`|[Vicuna v0 13B](https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g)|GPTQ 4-bit quant, old CUDA|[Wojtab/minigpt-4-pipeline](https://github.com/Wojtab/minigpt-4-pipeline)|
 Some pipelines could support different LLMs, but do note that while it might work, it isn't a supported configuration.
 DO NOT report bugs if you are using a different LLM.
 DO NOT report bugs with pipelines in this repository (unless they are built-in)
 ## Extension config
 This extension uses following parameters (from settings.json):
 |Parameter|Description|
 |---------|-----------|
 |`multimodal-vision_bits`|Number of bits to load vision models (CLIP/ViT) feature extractor in (most pipelines should support either 32 or 16, default=32)|
 |`multimodal-vision_device`|Torch device to run the feature extractor on, for example `cpu` or `cuda:0`, by default `cuda:0` if available|
 |`multimodal-projector_bits`|Number of bits to load feature projector model(s) in (most pipelines should support either 32 or 16, default=32)|
 |`multimodal-projector_device`|Torch device to run the feature projector model(s) on, for example `cpu` or `cuda:0`, by default `cuda:0` if available|
 |`multimodal-add_all_images_to_prompt`|Default value of "Embed all images, not only the last one" checkbox|
 ## Usage through API
 You can run the multimodal inference through API, by inputting the images to prompt. Images are embedded like so: `f'<img src="data:image/jpeg;base64,{img_str}">'`, where `img_str` is base-64 jpeg data. Python example:
 ```Python
 import base64
 import requests
 CONTEXT = "You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language. Follow the instructions carefully and explain your answers in detail.### Human: Hi!### Assistant: Hi there! How can I help you today?\n"
 with open('extreme_ironing.jpg', 'rb') as f:
    img_str = base64.b64encode(f.read()).decode('utf-8')
    prompt = CONTEXT + f'### Human: What is unusual about this image: \n<img src="data:image/jpeg;base64,{img_str}">### Assistant: '
    print(requests.post('http://127.0.0.1:5000/api/v1/generate', json={'prompt': prompt, 'stopping_strings': ['\n###']}).json())
 ```
 script output:
 ```Python
 {'results': [{'text': "The unusual aspect of this image is that a man is standing on top of a yellow minivan while doing his laundry. He has set up a makeshift clothes line using the car's rooftop as an outdoor drying area. This scene is uncommon because people typically do their laundry indoors, in a dedicated space like a laundromat or a room in their home, rather than on top of a moving vehicle. Additionally, hanging clothes on the car could be potentially hazardous or illegal in some jurisdictions due to the risk of damaging the vehicle or causing accidents on the road.\n##"}]}
 ```
 ## For pipeline developers/technical description
 see [DOCS.md](https://github.com/oobabooga/text-generation-webui/blob/main/extensions/multimodal/DOCS.md)
--- a/extensions/multimodal/abstract_pipeline.py
+++ b/extensions/multimodal/abstract_pipeline.py
@ -0,0 +1,62 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional
 import torch
 from PIL import Image
 class AbstractMultimodalPipeline(ABC):
    @staticmethod
    @abstractmethod
    def name() -> str:
        'name of the pipeline, should be same as in --multimodal-pipeline'
        pass
    @staticmethod
    @abstractmethod
    def image_start() -> Optional[str]:
        'return image start string, string representation of image start token, or None if not applicable'
        pass
    @staticmethod
    @abstractmethod
    def image_end() -> Optional[str]:
        'return image end string, string representation of image end token, or None if not applicable'
        pass
    @staticmethod
    @abstractmethod
    def placeholder_token_id() -> int:
        'return placeholder token id'
        pass
    @staticmethod
    @abstractmethod
    def num_image_embeds() -> int:
        'return the number of embeds used by a single image (for example: 256 for LLaVA)'
        pass
    @abstractmethod
    def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
        'forward the images through vision pipeline, and return their embeddings'
        pass
    @staticmethod
    @abstractmethod
    def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
        'embed tokens, the exact function varies by LLM, for LLaMA it is `shared.model.model.embed_tokens`'
        pass
    @staticmethod
    @abstractmethod
    def placeholder_embeddings() -> torch.Tensor:
        'get placeholder embeddings if there are multiple images, and `add_all_images_to_prompt` is False'
        pass
    def _get_device(self, setting_name: str, params: dict):
        if params[setting_name] is None:
            return torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        return torch.device(params[setting_name])
    def _get_dtype(self, setting_name: str, params: dict):
        return torch.float32 if int(params[setting_name]) == 32 else torch.float16
--- a/extensions/multimodal/multimodal_embedder.py
+++ b/extensions/multimodal/multimodal_embedder.py
@ -0,0 +1,177 @@
 import base64
 import logging
 import re
 from dataclasses import dataclass
 from io import BytesIO
 from typing import Any, List, Optional
 import torch
 from extensions.multimodal.pipeline_loader import load_pipeline
 from modules import shared
 from modules.text_generation import encode, get_max_prompt_length
 from PIL import Image
@dataclass
 class PromptPart:
    text: str
    image: Optional[Image.Image] = None
    is_image: bool = False
    input_ids: Optional[torch.Tensor] = None
    embedding: Optional[torch.Tensor] = None
 class MultimodalEmbedder:
    def __init__(self, params: dict):
        pipeline, source = load_pipeline(params)
        self.pipeline = pipeline
        logging.info(f'Multimodal: loaded pipeline {self.pipeline.name()} from pipelines/{source} ({self.pipeline.__class__.__name__})')
    def _split_prompt(self, prompt: str, load_images: bool = False) -> List[PromptPart]:
        """Splits a prompt into a list of `PromptParts` to separate image data from text.
        It will also append `image_start` and `image_end` before and after the image, and optionally parse and load the images,
        if `load_images` is `True`.
        """
        parts: List[PromptPart] = []
        curr = 0
        while True:
            match = re.search(r'<img src="data:image/jpeg;base64,([A-Za-z0-9+/=]+)">', prompt[curr:])
            if match is None:
                # no more image tokens, append the rest of the prompt
                if curr > 0:
                    # add image end token after last image
                    parts.append(PromptPart(text=self.pipeline.image_end() + prompt[curr:]))
                else:
                    parts.append(PromptPart(text=prompt))
                break
            # found an image, append image start token to the text
            if match.start() > 0:
                parts.append(PromptPart(text=prompt[curr:curr+match.start()]+self.pipeline.image_start()))
            else:
                parts.append(PromptPart(text=self.pipeline.image_start()))
            # append the image
            parts.append(PromptPart(
                text=match.group(0), 
                image=Image.open(BytesIO(base64.b64decode(match.group(1)))) if load_images else None,
                is_image=True
            ))
            curr += match.end()
        return parts
    def _len_in_tokens_prompt_parts(self, parts: List[PromptPart]) -> int:
        """Total length in tokens of all `parts`"""
        tokens = 0
        for part in parts:
            if part.is_image:
                tokens += self.pipeline.num_image_embeds()
            elif part.input_ids is not None:
                tokens += len(part.input_ids)
            else:
                tokens += len(encode(part.text)[0])
        return tokens
    def len_in_tokens(self, prompt: str) -> int:
        """Total length in tokens for a given text `prompt`"""
        parts = self._split_prompt(prompt, False)
        return self._len_in_tokens_prompt_parts(parts)
    def _encode_single_text(self, part: PromptPart, add_bos_token: bool) -> PromptPart:
        """Encode a single prompt `part` to `input_ids`. Returns a `PromptPart`"""
        if part.is_image:
            placeholders = torch.ones((self.pipeline.num_image_embeds())) * self.pipeline.placeholder_token_id()
            part.input_ids = placeholders.to(shared.model.device, dtype=torch.int64)
        else:
            part.input_ids = encode(part.text, add_bos_token=add_bos_token)[0].to(shared.model.device, dtype=torch.int64)
        return part
    @staticmethod
    def _num_images(parts: List[PromptPart]) -> int:
        count = 0
        for part in parts:
            if part.is_image:
                count += 1
        return count
    def _encode_text(self, state, parts: List[PromptPart]) -> List[PromptPart]:
        """Encode text to token_ids, also truncate the prompt, if necessary.
        The chat/instruct mode should make prompts that fit in get_max_prompt_length, but if max_new_tokens are set
        such that the context + min_rows don't fit, we can get a prompt which is too long.
        We can't truncate image embeddings, as it leads to broken generation, so remove the images instead and warn the user
        """
        encoded: List[PromptPart] = []
        for i, part in enumerate(parts):
            encoded.append(self._encode_single_text(part, i==0 and state['add_bos_token']))
        # truncation:
        max_len = get_max_prompt_length(state)
        removed_images = 0
        # 1. remove entire text/image blocks
        while self._len_in_tokens_prompt_parts(encoded[1:]) > max_len:
            if encoded[0].is_image:
                removed_images += 1
            encoded = encoded[1:]
        # 2. check if the last prompt part doesn't need to get truncated
        if self._len_in_tokens_prompt_parts(encoded) > max_len:
            if encoded[0].is_image:
                # don't truncate image embeddings, just remove the image, otherwise generation will be broken
                removed_images += 1
                encoded = encoded[1:]
            elif len(encoded) > 1 and encoded[0].text.endswith(self.pipeline.image_start()):
                # see if we can keep image_start token
                len_image_start = len(encode(self.pipeline.image_start(), add_bos_token=state['add_bos_token'])[0])
                if self._len_in_tokens_prompt_parts(encoded[1:]) + len_image_start > max_len:
                    # we can't -> remove this text, and the image
                    encoded = encoded[2:]
                    removed_images += 1
                else:
                    # we can -> just truncate the text
                    trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
                    encoded[0].input_ids = encoded[0].input_ids[trunc_len:]
            elif len(encoded) > 0:
                # only one text left, truncate it normally
                trunc_len = self._len_in_tokens_prompt_parts(encoded) - max_len
                encoded[0].input_ids = encoded[0].input_ids[trunc_len:]
        # notify user if we truncated an image
        if removed_images > 0:
            logging.warning(f"Multimodal: removed {removed_images} image(s) from prompt. Try decreasing max_new_tokens if generation is broken")
        return encoded
    def _embed(self, parts: List[PromptPart]) -> List[PromptPart]:
        # batch images
        image_indicies = [i for i, part in enumerate(parts) if part.is_image]
        embedded = self.pipeline.embed_images([parts[i].image for i in image_indicies])
        for i, embeds in zip(image_indicies, embedded):
            parts[i].embedding = embeds
        # embed text
        for (i, part) in enumerate(parts):
            if not part.is_image:
                parts[i].embedding = self.pipeline.embed_tokens(part.input_ids)
        return parts
    def _remove_old_images(self, parts: List[PromptPart], params: dict) -> List[PromptPart]:
        if params['add_all_images_to_prompt']:
            return parts
        already_added = False
        for i, part in reversed(list(enumerate(parts))):
            if part.is_image:
                if already_added:
                    parts[i].embedding = self.pipeline.placeholder_embeddings()
                else:
                    already_added = True
        return parts
    def forward(self, prompt: str, state: Any, params: dict):
        prompt_parts = self._split_prompt(prompt, True)
        prompt_parts = self._encode_text(state, prompt_parts)
        prompt_parts = self._embed(prompt_parts)
        prompt_parts = self._remove_old_images(prompt_parts, params)
        embeds = tuple(part.embedding for part in prompt_parts)
        ids = tuple(part.input_ids for part in prompt_parts)
        input_embeds = torch.cat(embeds, dim=0)
        input_ids = torch.cat(ids, dim=0)
        return prompt, input_ids, input_embeds, self._num_images(prompt_parts)
--- a/extensions/multimodal/pipeline_loader.py
+++ b/extensions/multimodal/pipeline_loader.py
@ -0,0 +1,52 @@
 import logging
 import traceback
 from importlib import import_module
 from pathlib import Path
 from typing import Tuple
 from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 from modules import shared
 def _get_available_pipeline_modules():
    pipeline_path = Path(__file__).parent / 'pipelines'
    modules = [p for p in pipeline_path.iterdir() if p.is_dir()]
    return [m.name for m in modules if (m / 'pipelines.py').exists()]
 def load_pipeline(params: dict) -> Tuple[AbstractMultimodalPipeline, str]:
    pipeline_modules = {}
    available_pipeline_modules = _get_available_pipeline_modules()
    for name in available_pipeline_modules:
        try:
            pipeline_modules[name] = import_module(f'extensions.multimodal.pipelines.{name}.pipelines')
        except:
            logging.warning(f'Failed to get multimodal pipelines from {name}')
            logging.warning(traceback.format_exc())
    if shared.args.multimodal_pipeline is not None:
        for k in pipeline_modules:
             if hasattr(pipeline_modules[k], 'get_pipeline'):
                pipeline = getattr(pipeline_modules[k], 'get_pipeline')(shared.args.multimodal_pipeline, params)
                if pipeline is not None:
                    return (pipeline, k)
    else:
        model_name = shared.args.model.lower()
        for k in pipeline_modules:
            if hasattr(pipeline_modules[k], 'get_pipeline_from_model_name'):
                pipeline = getattr(pipeline_modules[k], 'get_pipeline_from_model_name')(model_name, params)
                if pipeline is not None:
                    return (pipeline, k)
    available = []
    for k in pipeline_modules:
        if hasattr(pipeline_modules[k], 'available_pipelines'):
            pipelines = getattr(pipeline_modules[k], 'available_pipelines')
            available += pipelines
    if shared.args.multimodal_pipeline is not None:
        log = f'Multimodal - ERROR: Failed to load multimodal pipeline "{shared.args.multimodal_pipeline}", available pipelines are: {available}.'
    else:
        log = f'Multimodal - ERROR: Failed to determine multimodal pipeline for model {shared.args.model}, please select one manually using --multimodal-pipeline [PIPELINE]. Available pipelines are: {available}.'
    logging.critical(f'{log} Please specify a correct pipeline, or disable the extension')
    raise RuntimeError(f'{log} Please specify a correct pipeline, or disable the extension')
--- a/extensions/multimodal/pipelines/llava/README.md
+++ b/extensions/multimodal/pipelines/llava/README.md
@ -0,0 +1,9 @@
 ## LLaVA pipeline
 This module provides 2 pipelines:
 - `llava-7b` - for use with LLaVA v0 7B model (finetuned LLaMa 7B)
 - `llava-13b` - for use with LLaVA v0 13B model (finetuned LLaMa 13B)
 [LLaVA](https://github.com/haotian-liu/LLaVA) uses CLIP `openai/clip-vit-large-patch14` as the vision model, and then a single linear layer. For 13B the projector weights are in `liuhaotian/LLaVA-13b-delta-v0`, and for 7B they are in `liuhaotian/LLaVA-7b-delta-v0`.
 The supported parameter combinations for both the vision model, and the projector are: CUDA/32bit, CUDA/16bit, CPU/32bit
--- a/extensions/multimodal/pipelines/llava/llava.py
+++ b/extensions/multimodal/pipelines/llava/llava.py
@ -0,0 +1,139 @@
 import logging
 import time
 from abc import abstractmethod
 from typing import List, Tuple
 import torch
 from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 from huggingface_hub import hf_hub_download
 from modules import shared
 from modules.text_generation import encode
 from PIL import Image
 from transformers import CLIPImageProcessor, CLIPVisionModel
 class LLaVA_v0_Pipeline(AbstractMultimodalPipeline):
    CLIP_REPO = "openai/clip-vit-large-patch14"
    def __init__(self, params: dict) -> None:
        super().__init__()
        self.clip_device = self._get_device("vision_device", params)
        self.clip_dtype = self._get_dtype("vision_bits", params)
        self.projector_device = self._get_device("projector_device", params)
        self.projector_dtype = self._get_dtype("projector_bits", params)
        self.image_processor, self.vision_tower, self.mm_projector = self._load_models()
    def _load_models(self):
        start_ts = time.time()
        logging.info(f"LLaVA - Loading CLIP from {LLaVA_v0_Pipeline.CLIP_REPO} as {self.clip_dtype} on {self.clip_device}...")
        image_processor = CLIPImageProcessor.from_pretrained(LLaVA_v0_Pipeline.CLIP_REPO, torch_dtype=self.clip_dtype)
        vision_tower = CLIPVisionModel.from_pretrained(LLaVA_v0_Pipeline.CLIP_REPO, torch_dtype=self.clip_dtype).to(self.clip_device)
        logging.info(f"LLaVA - Loading projector from {self.llava_projector_repo()} as {self.projector_dtype} on {self.projector_device}...")
        projector_path = hf_hub_download(self.llava_projector_repo(), self.llava_projector_filename())
        mm_projector = torch.nn.Linear(*self.llava_projector_shape())
        projector_data = torch.load(projector_path)
        mm_projector.weight = torch.nn.Parameter(projector_data['model.mm_projector.weight'].to(dtype=self.projector_dtype), False)
        mm_projector.bias = torch.nn.Parameter(projector_data['model.mm_projector.bias'].to(dtype=self.projector_dtype), False)
        mm_projector = mm_projector.to(self.projector_device)
        logging.info(f"LLaVA supporting models loaded, took {time.time() - start_ts:.2f} seconds")
        return image_processor, vision_tower, mm_projector
    @staticmethod
    def image_start() -> str:
        return "<im_start>"
    @staticmethod
    def image_end() -> str:
        return "<im_end>"
    @staticmethod
    def num_image_embeds() -> int:
        return 256
    @staticmethod
    def embed_tokens(input_ids: torch.Tensor) -> torch.Tensor:
        return shared.model.model.embed_tokens(input_ids).to(shared.model.device, dtype=shared.model.dtype)
    @staticmethod
    def placeholder_embeddings() -> torch.Tensor:
        return LLaVA_v0_Pipeline.embed_tokens(encode("<im_patch>"*256, add_bos_token=False)[0])
    def embed_images(self, images: List[Image.Image]) -> torch.Tensor:
        images = self.image_processor(images, return_tensors='pt')['pixel_values']
        images = images.to(self.clip_device, dtype=self.clip_dtype)
        with torch.no_grad():
            image_forward_outs = self.vision_tower(images, output_hidden_states=True)
            select_hidden_state_layer = -2
            select_hidden_state = image_forward_outs.hidden_states[select_hidden_state_layer]
            image_features = select_hidden_state[:, 1:].to(self.projector_device, dtype=self.projector_dtype)
            image_features = self.mm_projector(image_features)
        return image_features.to(shared.model.device, dtype=shared.model.dtype)
    @staticmethod
    @abstractmethod
    def llava_projector_repo() -> str:
        pass
    @staticmethod
    @abstractmethod
    def llava_projector_filename() -> str:
        pass
    @staticmethod
    @abstractmethod
    def llava_projector_shape() -> Tuple[int, int]:
        pass
 class LLaVA_v0_13B_Pipeline(LLaVA_v0_Pipeline):
    def __init__(self, params: dict) -> None:
        super().__init__(params)
    @staticmethod
    def name() -> str:
        return "llava-13b"
    @staticmethod
    def placeholder_token_id() -> int:
        return 32000
    @staticmethod
    def llava_projector_shape() -> Tuple[int, int]:
        return (1024, 5120)
    @staticmethod
    def llava_projector_filename() -> str:
        return "mm_projector.bin"
    @staticmethod
    def llava_projector_repo() -> str:
        return "liuhaotian/LLaVA-13b-delta-v0"
 class LLaVA_v0_7B_Pipeline(LLaVA_v0_Pipeline):
    def __init__(self, params: dict) -> None:
        super().__init__(params)
    @staticmethod
    def name() -> str:
        return "llava-7b"
    @staticmethod
    def placeholder_token_id() -> int:
        return 32001
    @staticmethod
    def llava_projector_shape() -> Tuple[int, int]:
        return (1024, 4096)
    @staticmethod
    def llava_projector_filename() -> str:
        return "mm_projector.bin"
    @staticmethod
    def llava_projector_repo() -> str:
        return "liuhaotian/LLaVA-7b-delta-v0"
--- a/extensions/multimodal/pipelines/llava/pipelines.py
+++ b/extensions/multimodal/pipelines/llava/pipelines.py
@ -0,0 +1,27 @@
 from typing import Optional
 from extensions.multimodal.abstract_pipeline import AbstractMultimodalPipeline
 available_pipelines = ['llava-7b', 'llava-13b']
 def get_pipeline(name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
    if name == 'llava-7b':
        from .llava import LLaVA_v0_7B_Pipeline
        return LLaVA_v0_7B_Pipeline(params)
    if name == 'llava-13b':
        from .llava import LLaVA_v0_13B_Pipeline
        return LLaVA_v0_13B_Pipeline(params)
    return None
 def get_pipeline_from_model_name(model_name: str, params: dict) -> Optional[AbstractMultimodalPipeline]:
    if 'llava' not in model_name.lower():
        return None
    if '7b' in model_name.lower():
        from .llava import LLaVA_v0_7B_Pipeline
        return LLaVA_v0_7B_Pipeline(params)
    if '13b' in model_name.lower():
        from .llava import LLaVA_v0_13B_Pipeline
        return LLaVA_v0_13B_Pipeline(params)
    return None
--- a/extensions/multimodal/pipelines/place-additional-pipelines-here.txt
+++ b/extensions/multimodal/pipelines/place-additional-pipelines-here.txt
--- a/extensions/multimodal/script.py
+++ b/extensions/multimodal/script.py
@ -0,0 +1,103 @@
 import base64
 import logging
 import re
 import time
 from functools import partial
 from io import BytesIO
 import gradio as gr
 import torch
 from extensions.multimodal.multimodal_embedder import MultimodalEmbedder
 from modules import shared
 params = {
    "add_all_images_to_prompt": False,
    # device to run vision encoder on
    "vision_device": None,
    # bits to load vision encoder in, either 16 or 32
    "vision_bits": 32,
    # device to run multimodal projector on
    "projector_device": None,
    # multimodal projector bits, either 32 or 16
    "projector_bits": 32
 }
 # If 'state' is True, will hijack the next chat generation
 input_hijack = {
    'state': False,
    'value': ["", ""]
 }
 # initialized in ui, so that params are loaded from settings
 multimodal_embedder: MultimodalEmbedder = None
 def add_chat_picture(picture, text, visible_text):
    # resize the image, so that shortest edge is at least 224 (size for CLIP), and at most 300 (to keep history manageable)
    max_hw, min_hw = max(picture.size), min(picture.size)
    aspect_ratio = max_hw / min_hw
    shortest_edge = int(max(300 / aspect_ratio, 224))
    longest_edge = int(shortest_edge * aspect_ratio)
    w = shortest_edge if picture.width < picture.height else longest_edge
    h = shortest_edge if picture.width >= picture.height else longest_edge
    picture = picture.resize((w,h))
    buffer = BytesIO()
    picture.save(buffer, format="JPEG")
    img_str = base64.b64encode(buffer.getvalue()).decode('utf-8')
    image = f'<img src="data:image/jpeg;base64,{img_str}">'
    if '<image>' in text:
        text = text.replace('<image>', image)
    else:
        text = text + '\n' + image
    if visible_text == '' or visible_text is None:
        visible_text = text
    elif '<image>' in visible_text:
        visible_text = visible_text.replace('<image>', image)
    else:
        visible_text = visible_text + '\n' + image
    return text, visible_text
 def custom_tokenized_length(prompt):
    return multimodal_embedder.len_in_tokens(prompt)
 def tokenizer_modifier(state, prompt, input_ids, input_embeds):
    global params
    start_ts = time.time()
    image_match = re.search(r'<img src="data:image/jpeg;base64,[A-Za-z0-9+/=]+">', prompt)
    if image_match is None:
        return prompt, input_ids, input_embeds
    prompt, input_ids, input_embeds, total_embedded = multimodal_embedder.forward(prompt, state, params)
    logging.info(f'Embedded {total_embedded} image(s) in {time.time()-start_ts:.2f}s')
    return (prompt,
        input_ids.unsqueeze(0).to(shared.model.device, dtype=torch.int64),
        input_embeds.unsqueeze(0).to(shared.model.device, dtype=shared.model.dtype))
 def ui():
    global multimodal_embedder
    multimodal_embedder = MultimodalEmbedder(params)
    with gr.Column():
        picture_select = gr.Image(label='Send a picture', type='pil')
        # The models don't seem to deal well with multiple images
        single_image_checkbox = gr.Checkbox(False, label='Embed all images, not only the last one')
    # Prepare the input hijack
    picture_select.upload(
        lambda picture: input_hijack.update({"state": True, "value": partial(add_chat_picture, picture)}),
        [picture_select],
        None
    )
    picture_select.clear(lambda: input_hijack.update({"state": False, "value": ["",""]}), None, None)
    single_image_checkbox.change(lambda x: params.update({"add_all_images_to_prompt": x}), single_image_checkbox, None)
    shared.gradio['Generate'].click(lambda: None, None, picture_select)
    shared.gradio['textbox'].submit(lambda: None, None, picture_select)
--- a/modules/chat.py
+++ b/modules/chat.py
@ -14,7 +14,7 @@ from PIL import Image
 import modules.shared as shared
 from modules.extensions import apply_extensions
 from modules.html_generator import chat_html_wrapper, make_thumbnail
-from modules.text_generation import (encode, generate_reply,
+from modules.text_generation import (generate_reply, get_encoded_length,
                                     get_max_prompt_length)
@ -67,7 +67,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
    # Building the prompt
    i = len(history) - 1
-    while i >= 0 and len(encode(''.join(rows))[0]) < max_length:
+    while i >= 0 and get_encoded_length(''.join(rows)) < max_length:
        if _continue and i == len(history) - 1:
            rows.insert(1, bot_turn_stripped + history[i][1].strip())
        else:
@ -90,7 +90,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
        # Adding the Character prefix
        rows.append(apply_extensions("bot_prefix", bot_turn_stripped.rstrip(' ')))
-    while len(rows) > min_rows and len(encode(''.join(rows))[0]) >= max_length:
+    while len(rows) > min_rows and get_encoded_length(''.join(rows)) >= max_length:
        rows.pop(1)
    prompt = ''.join(rows)
--- a/modules/extensions.py
+++ b/modules/extensions.py
@ -7,6 +7,7 @@ import gradio as gr
 import extensions
 import modules.shared as shared
 state = {}
 available_extensions = []
 setup_called = set()
@ -73,15 +74,12 @@ def _apply_input_hijack(text, visible_text):
    return text, visible_text
-# custom_generate_chat_prompt handling
+# custom_generate_chat_prompt handling - currently only the first one will work
 def _apply_custom_generate_chat_prompt(text, state, **kwargs):
    custom_generate_chat_prompt = None
    for extension, _ in iterator():
-        if custom_generate_chat_prompt is None and hasattr(extension, 'custom_generate_chat_prompt'):
+        if hasattr(extension, 'custom_generate_chat_prompt'):
-            custom_generate_chat_prompt = extension.custom_generate_chat_prompt
+            return custom_generate_chat_prompt(text, state, **kwargs)
    if custom_generate_chat_prompt is not None:
        return custom_generate_chat_prompt(text, state, **kwargs)
    return None
@ -95,16 +93,26 @@ def _apply_state_modifier_extensions(state):
    return state
-# Extension functions that override the default tokenizer output
+# Extension functions that override the default tokenizer output - currently only the first one will work
 def _apply_tokenizer_extensions(function_name, state, prompt, input_ids, input_embeds):
    for extension, _ in iterator():
        if hasattr(extension, function_name):
-            prompt, input_ids, input_embeds = getattr(extension, function_name)(state, prompt, input_ids, input_embeds)
+            return getattr(extension, function_name)(state, prompt, input_ids, input_embeds)
    return prompt, input_ids, input_embeds
-# Custom generate reply handling
+# Get prompt length in tokens after applying extension functions which override the default tokenizer output
 # currently only the first one will work
 def _apply_custom_tokenized_length(prompt):
    for extension, _ in iterator():
        if hasattr(extension, 'custom_tokenized_length'):
            return getattr(extension, 'custom_tokenized_length')(prompt)
    return None
 # Custom generate reply handling - currently only the first one will work
 def _apply_custom_generate_reply():
    for extension, _ in iterator():
        if hasattr(extension, 'custom_generate_reply'):
@ -121,7 +129,8 @@ EXTENSION_MAP = {
    "tokenizer": partial(_apply_tokenizer_extensions, "tokenizer_modifier"),
    "input_hijack": _apply_input_hijack,
    "custom_generate_chat_prompt": _apply_custom_generate_chat_prompt,
-    "custom_generate_reply": _apply_custom_generate_reply
+    "custom_generate_reply": _apply_custom_generate_reply,
    "tokenized_length": _apply_custom_tokenized_length
 }
--- a/modules/shared.py
+++ b/modules/shared.py
@ -166,6 +166,8 @@ parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authent
 parser.add_argument('--api', action='store_true', help='Enable the API extension.')
 parser.add_argument('--public-api', action='store_true', help='Create a public URL for the API using Cloudfare.')
 # Multimodal
 parser.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.')
 args = parser.parse_args()
 args_defaults = parser.parse_args([])
@ -183,12 +185,21 @@ if args.trust_remote_code:
 if args.share:
    logging.warning("The gradio \"share link\" feature downloads a proprietary and unaudited blob to create a reverse tunnel. This is potentially dangerous.")
 def add_extension(name):
    if args.extensions is None:
        args.extensions = [name]
    elif 'api' not in args.extensions:
        args.extensions.append(name)
 # Activating the API extension
 if args.api or args.public_api:
-    if args.extensions is None:
+    add_extension('api')
-        args.extensions = ['api']
+
-    elif 'api' not in args.extensions:
+# Activating the multimodal extension
-        args.extensions.append('api')
+if args.multimodal_pipeline is not None:
    add_extension('multimodal')
 def is_chat():
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@ -59,6 +59,14 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
        return input_ids.cuda()
 def get_encoded_length(prompt):
    length_after_extensions = apply_extensions('tokenized_length', prompt)
    if length_after_extensions is not None:
        return length_after_extensions
    return len(encode(prompt)[0])
 def decode(output_ids, skip_special_tokens=True):
    return shared.tokenizer.decode(output_ids, skip_special_tokens)
--- a/server.py
+++ b/server.py
@ -48,7 +48,7 @@ from modules import chat, shared, training, ui, utils
 from modules.html_generator import chat_html_wrapper
 from modules.LoRA import add_lora_to_model
 from modules.models import load_model, load_soft_prompt, unload_model
-from modules.text_generation import encode, generate_reply, stop_everything_event
+from modules.text_generation import generate_reply, get_encoded_length, stop_everything_event
 def load_model_wrapper(selected_model, autoload=False):
@ -140,7 +140,7 @@ def load_prompt(fname):
 def count_tokens(text):
-    tokens = len(encode(text)[0])
+    tokens = get_encoded_length(text)
    return f'{tokens} tokens in the input.'