mirror of
https://github.com/salesforce/CodeT5.git
synced 2024-10-01 06:35:38 -04:00
23 lines
497 B
Python
23 lines
497 B
Python
import os

from tokenizers import ByteLevelBPETokenizer
|
|
|
|
# Training corpus files, resolved relative to the current working directory.
paths = ['train_code.txt', 'train_doc.txt']

# Target directory and file-name prefix handed to save_model() below.
output_dir = "./salesforce"
model_prefix = "codet5"

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training: request a 32000-entry vocabulary, pass min_frequency=3,
# and register the listed special tokens.
tokenizer.train(
    files=paths,
    vocab_size=32000,
    min_frequency=3,
    special_tokens=[
        "<pad>",
        "<s>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk. save_model() does not create the target directory, so
# make sure it exists first; otherwise the script would fail only after the
# (expensive) training step has already completed.
os.makedirs(output_dir, exist_ok=True)
tokenizer.save_model(output_dir, model_prefix)

# Sanity check: print the tokens produced for a sample sentence that mixes
# special tokens, punctuation and an emoji.
print(
    tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)
|