convert scripts: use bytes_to_unicode from transformers

2024-10-01 01:06:10 -04:00 · 2023-09-29 17:39:49 -04:00 · 2023-09-29 17:39:49 -04:00 · 0493e6eb07
commit 0493e6eb07
parent a49a1dcdf4
2 changed files with 5 additions and 48 deletions
--- a/gpt4all-backend/scripts/convert_gptj_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_gptj_to_gguf.py
@@ -27,28 +27,7 @@ from pathlib import Path
 import gguf
 import numpy as np
 from transformers import AutoTokenizer, GPTJConfig, GPTJForCausalLM
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    return dict(zip(bs, (chr(n) for n in cs)))
+from transformers.models.gpt2 import tokenization_gpt2


 if not 2 <= len(sys.argv) < 4:
@@ -100,7 +79,7 @@ print("gguf: get gpt2 tokenizer vocab")
 tokenizer = AutoTokenizer.from_pretrained(dir_model)

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}

 tokens: list[bytearray] = []
--- a/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_mpt_hf_to_gguf.py
@@ -18,30 +18,8 @@ from pathlib import Path
 import gguf
 import numpy as np
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM
-
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+from transformers.models.gpt2 import tokenization_gpt2


 if not 3 <= len(sys.argv) < 5:
@@ -104,7 +82,7 @@ special_ids = tokenizer.all_special_ids

 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 added_tokens = tokenizer.get_added_vocab().values()
-byte_encoder = bytes_to_unicode()
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}

 tokens: list[bytearray] = []