# CodeT5/tokenizer/train_tokenizer.py

import os

from tokenizers import ByteLevelBPETokenizer

# Training corpora: code and natural-language text files
paths = ['train_code.txt', 'train_doc.txt']

# Initialize a byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

# Train a 32k-token vocabulary (pairs must occur at least 3 times to be merged)
# and reserve the special tokens the model expects
tokenizer.train(files=paths, vocab_size=32000, min_frequency=3, special_tokens=[
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save vocab/merges to disk; save_model expects the output directory to exist
os.makedirs("./salesforce", exist_ok=True)
tokenizer.save_model("./salesforce", "codet5")

# Sanity check: encode a string containing the special tokens
print(
tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)
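
# A quick reload check, as a minimal sketch: read the trained tokenizer back from
# the files save_model just wrote, assuming the library's default
# "<prefix>-vocab.json" / "<prefix>-merges.txt" naming under ./salesforce.
reloaded = ByteLevelBPETokenizer(
    "./salesforce/codet5-vocab.json",
    "./salesforce/codet5-merges.txt",
)
print(reloaded.encode("def add(a, b): return a + b").tokens)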