gpt4all/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py

#!/usr/bin/env python3
# Convert a Hugging Face MPT model to GGUF format
#
# Usage:
#
#   python3 convert_mpt_hf_to_gguf.py model-name dir-output [ftype]
#
# This script is similar to "convert-pt-to-ggml.py"
#

from __future__ import annotations

import json
import struct
import sys
from pathlib import Path

import gguf
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from transformers.models.gpt2 import tokenization_gpt2


# Command line: <script> model-name dir-output [ftype]
# Anything other than two or three arguments prints the usage text and fails.
if not 3 <= len(sys.argv) < 5:
    print("Usage: {} model-name dir-output [ftype]".format(Path(__file__).name))
    print("  model-name: name of the model to convert. Example: 'mosaicml/mpt-7b'")
    print("  dir-output: directory where the output file will be written")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

model_name = sys.argv[1]
dir_out = Path(sys.argv[2])

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# float16 is the default when no ftype argument is given
ftype = 1
if len(sys.argv) > 3:
    try:
        ftype = int(sys.argv[3])
    except ValueError:
        # report a non-numeric ftype the same way as an out-of-range one
        print("Invalid ftype: " + sys.argv[3])
        sys.exit(1)
    if not 0 <= ftype < len(ftype_str):
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)

# make sure the output directory exists (including missing parents); this is
# done only after the arguments validated so a bad call leaves nothing behind
dir_out.mkdir(parents=True, exist_ok=True)

# only the last path component of the model name is used in the file name
fname_out = dir_out / f"ggml-model-{Path(model_name).name}-{ftype_str[ftype]}.gguf"


# The writer is created for the MPT architecture and targets the output path
# computed from the command line.
ARCH = gguf.MODEL_ARCH.MPT
arch_name = gguf.MODEL_ARCH_NAMES[ARCH]
gguf_writer = gguf.GGUFWriter(fname_out, arch_name)

print("gguf: get model metadata")

config = AutoConfig.from_pretrained(model_name)
attn_config = config.attn_config
block_count = config.n_layers
embedding_length = config.d_model

# Hyperparameters are handed to the writer in this fixed order.
gguf_writer.add_name("MPT")
gguf_writer.add_context_length(config.max_seq_len)
gguf_writer.add_embedding_length(embedding_length)
gguf_writer.add_block_count(block_count)
# the feed-forward length is recorded as four times the embedding length
gguf_writer.add_feed_forward_length(4 * embedding_length)
gguf_writer.add_head_count(config.n_heads)
gguf_writer.add_max_alibi_bias(attn_config.alibi_bias_max)
gguf_writer.add_layer_norm_eps(config.layer_norm_epsilon)
gguf_writer.add_file_type(ftype)

# The clamp value is optional: it is only recorded when the config sets it.
clip_qkv = attn_config.clip_qkv
if clip_qkv is not None:
    gguf_writer.add_clamp_kqv(clip_qkv)

print("gguf: get gpt2 tokenizer vocab")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both collections are probed once per vocabulary entry in the loop below,
# so they are held as sets for constant-time membership tests.
special_ids = set(tokenizer.all_special_ids)
added_tokens = set(tokenizer.get_added_vocab().values())

# token id -> token string as stored by the tokenizer
reverse_vocab = {token_id: encoded_tok for encoded_tok, token_id in tokenizer.vocab.items()}
# GPT-2 byte <-> unicode-character table; the inverse turns a stored token
# string back into its raw bytes
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

tokens: list[bytearray] = []
toktypes: list[gguf.TokenType] = []

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
for i in range(config.vocab_size):
    if i not in reverse_vocab:
        # id has no tokenizer entry: emit a placeholder so the token list
        # still has exactly config.vocab_size entries
        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
        pad_token = f"[PAD{i}]".encode("utf8")
        text = bytearray(pad_token)
    elif i in added_tokens:
        # these tokens are not encoded, for some reason
        # (so they are taken as plain UTF-8 text instead of being byte-decoded)
        text = bytearray(reverse_vocab[i].encode('utf-8'))
    else:
        # regular token: map every character back to the byte it stands for
        text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])

    tokens.append(text)

    # TODO(cebtenzzre): is there a better way to do this?
    toktypes.append(gguf.TokenType.CONTROL if i in special_ids else gguf.TokenType.NORMAL)

gguf_writer.add_tokenizer_model("gpt2")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_types(toktypes)

print("gguf: get tensor metadata")

# The model weights are loaded only now, after the metadata and vocabulary
# have been handed to the writer.
print("Loading model:", model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    torch_dtype=torch.float16 if ftype == 1 else torch.float32,
    low_cpu_mem_usage=True,
)
print("Model loaded:", model_name)

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

list_vars = model.state_dict()
for name, tensor in list_vars.items():
    data = tensor.squeeze().numpy()
    print("Processing variable:", name, "with shape:", data.shape)

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    # In float16 mode only 2-D ".weight" tensors are stored as float16.
    # Keep token embeddings in fp32
    if ftype == 1 and name.endswith(".weight") and n_dims == 2 and ".wte" not in name:
        print("  Converting to float16")
        data = data.astype(np.float16)
    elif ftype == 1 or data.dtype != np.float32:
        # everything else is stored as float32
        print("  Converting to float32")
        data = data.astype(np.float32)

    # map tensor names
    new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
    if new_name is None:
        print("Can not map tensor '" + name + "'")
        # exit with a failure status: a bare sys.exit() would report success
        sys.exit(1)

    gguf_writer.add_tensor(new_name, data)


# Write the file in three steps: header, key/value metadata, tensor data.
# The writer is closed even when one of the steps raises.
try:
    print("gguf: write header")
    gguf_writer.write_header_to_file()
    print("gguf: write metadata")
    gguf_writer.write_kv_data_to_file()
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()
finally:
    gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print()
convert scripts: make them directly executable 2023-09-29 13:39:35 -04:00			`#!/usr/bin/env python3`
backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`# Convert Hugging Face fine-tuned bloom-like models to ggml format`
			`#`
			`# Usage:`
			`#`
			`# python3 models/convert-h5-to-ggml.py`
			`#`
			`# This script is similar to "convert-pt-to-ggml.py"`
			`#`

			`from __future__ import annotations`

			`import json`
			`import struct`
			`import sys`
			`from pathlib import Path`

			`import gguf`
			`import numpy as np`
			`import torch`
convert scripts: use bytes_to_unicode from transformers 2023-09-29 17:39:49 -04:00			`from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig`
			`from transformers.models.gpt2 import tokenization_gpt2`
backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00

			`if not 3 <= len(sys.argv) < 5:`
convert scripts: make them directly executable 2023-09-29 13:39:35 -04:00			`print("Usage: {} model-name dir-output [ftype]".format(Path(__file__).name))`
backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`print(" model-name: name of the model to convert. Example: 'bigscience/bloomz-560m'")`
			`print(" dir-output: directory where the output file will be written")`
			`print(" ftype == 0 -> float32")`
			`print(" ftype == 1 -> float16")`
			`sys.exit(1)`

			`model_name = sys.argv[1]`
			`dir_out = Path(sys.argv[2])`

			`# make sure the output directory exists`
			`dir_out.mkdir(exist_ok=True)`

			`# possible data types`
			`# ftype == 0 -> float32`
			`# ftype == 1 -> float16`
			`#`
			`# map from ftype to string`
			`ftype_str = ["f32", "f16"]`
conversion scripts: cleanup 2023-09-28 15:32:43 -04:00
backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`ftype = 1`
			`if len(sys.argv) > 3:`
			`ftype = int(sys.argv[3])`
			`if ftype < 0 or ftype > 1:`
			`print("Invalid ftype: " + str(ftype))`
			`sys.exit(1)`

			`fname_out = dir_out / f"ggml-model-{Path(model_name).name}-{ftype_str[ftype]}.gguf"`


			`ARCH = gguf.MODEL_ARCH.MPT`
			`gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])`

			`print("gguf: get model metadata")`

			`config = AutoConfig.from_pretrained(model_name)`

			`block_count = config.n_layers`
			`gguf_writer.add_name("MPT")`
			`gguf_writer.add_context_length(config.max_seq_len)`
			`gguf_writer.add_embedding_length(config.d_model)`
			`gguf_writer.add_block_count(block_count)`
convert scripts: add feed-forward length for better compatiblilty This GGUF key is used by all llama.cpp models with upstream support. 2023-09-30 18:03:23 -04:00			`gguf_writer.add_feed_forward_length(4 * config.d_model)`
backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`gguf_writer.add_head_count(config.n_heads)`
			`gguf_writer.add_max_alibi_bias(config.attn_config.alibi_bias_max)`
			`gguf_writer.add_layer_norm_eps(config.layer_norm_epsilon)`
			`gguf_writer.add_file_type(ftype)`

			`clip_qkv = config.attn_config.clip_qkv`
			`if clip_qkv is not None:`
			`gguf_writer.add_clamp_kqv(clip_qkv)`

			`print("gguf: get gpt2 tokenizer vocab")`

			`tokenizer = AutoTokenizer.from_pretrained(model_name)`

			`special_ids = tokenizer.all_special_ids`

convert_mpt_hf_to_gguf.py: better tokenizer decoding 2023-09-29 10:02:04 -04:00			`reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}`
			`added_tokens = tokenizer.get_added_vocab().values()`
convert scripts: use bytes_to_unicode from transformers 2023-09-29 17:39:49 -04:00			`byte_encoder = tokenization_gpt2.bytes_to_unicode()`
convert_mpt_hf_to_gguf.py: better tokenizer decoding 2023-09-29 10:02:04 -04:00			`byte_decoder = {v: k for k, v in byte_encoder.items()}`

backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`tokens: list[bytearray] = []`
			`toktypes: list[gguf.TokenType] = []`

			`# The number of tokens in tokenizer.json can differ from the expected vocab size.`
			`# This causes downstream issues with mismatched tensor sizes when running the inference`
			`for i in range(config.vocab_size):`
convert_mpt_hf_to_gguf.py: better tokenizer decoding 2023-09-29 10:02:04 -04:00			`if i not in reverse_vocab:`
			`print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")`
			`pad_token = f"[PAD{i}]".encode("utf8")`
			`text = bytearray(pad_token)`
			`elif i in added_tokens:`
			`# these tokens are not encoded, for some reason`
			`text = bytearray(reverse_vocab[i].encode('utf-8'))`
			`else:`
			`text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])`

backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`tokens.append(text)`

			`# TODO(cebtenzzre): is there a better way to do this?`
			`toktypes.append(gguf.TokenType.CONTROL if i in special_ids else gguf.TokenType.NORMAL)`

			`gguf_writer.add_tokenizer_model("gpt2")`
			`gguf_writer.add_token_list(tokens)`
			`gguf_writer.add_token_types(toktypes)`

			`print("gguf: get tensor metadata")`

convert scripts: load model as late as possible 2023-09-29 09:59:11 -04:00			`print("Loading model:", model_name)`
			`model = AutoModelForCausalLM.from_pretrained(`
			`model_name, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True,`
			`)`
			`print("Model loaded:", model_name)`

backend: port MPT to GGUF 2023-09-28 09:33:39 -04:00			`tensor_map = gguf.get_tensor_name_map(ARCH, block_count)`

			`list_vars = model.state_dict()`
			`for name in list_vars.keys():`
			`data = list_vars[name].squeeze().numpy()`
			`print("Processing variable:", name, "with shape:", data.shape)`

			`n_dims = len(data.shape)`

			`# ftype == 0 -> float32, ftype == 1 -> float16`
			`ftype_cur = 0`
			`# Keep token embeddings in fp32`
			`if ftype == 1 and name[-7:] == ".weight" and n_dims == 2 and ".wte" not in name:`
			`print(" Converting to float16")`
			`data = data.astype(np.float16)`
			`ftype_cur = 1`
			`elif ftype == 1 or data.dtype != np.float32:`
			`print(" Converting to float32")`
			`data = data.astype(np.float32)`
			`ftype_cur = 0`

			`# map tensor names`
			`new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))`
			`if new_name is None:`
			`print("Can not map tensor '" + name + "'")`
			`sys.exit()`

			`gguf_writer.add_tensor(new_name, data)`


			`print("gguf: write header")`
			`gguf_writer.write_header_to_file()`
			`print("gguf: write metadata")`
			`gguf_writer.write_kv_data_to_file()`
			`print("gguf: write tensors")`
			`gguf_writer.write_tensors_to_file()`

			`gguf_writer.close()`

			`print(f"gguf: model successfully exported to '{fname_out}'")`
			`print()`