# CodeT5/tokenizer/apply_tokenizer.py
# 2021-09-03 22:14:17 +08:00
#
# 17 lines
# 371 B
# Python

from tokenizers import ByteLevelBPETokenizer
# Demo script: load the CodeT5 byte-level BPE tokenizer from its vocab and
# merges files, register the special tokens, and print how a sample sentence
# is split into tokens.

# Vocabulary and merge-rule files, given as paths relative to the current
# working directory.
VOCAB_PATH = "./salesforce/codet5-vocab.json"
MERGES_PATH = "./salesforce/codet5-merges.txt"

# Tokens registered with the tokenizer as special tokens after loading.
SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]

# Sample input mixing special tokens, an apostrophe, and an emoji.
SAMPLE_TEXT = "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"

tokenizer = ByteLevelBPETokenizer.from_file(VOCAB_PATH, MERGES_PATH)
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Encode the sample and show the resulting token strings.
encoding = tokenizer.encode(SAMPLE_TEXT)
print(encoding.tokens)