mirror of
https://github.com/salesforce/CodeT5.git
synced 2024-10-01 06:35:38 -04:00
17 lines
371 B
Python
17 lines
371 B
Python
from tokenizers import ByteLevelBPETokenizer

# Vocabulary and merge-rule files, given relative to the working directory.
VOCAB_FILE = "./salesforce/codet5-vocab.json"
MERGES_FILE = "./salesforce/codet5-merges.txt"

# Tokens registered as special tokens on the loaded tokenizer.
SPECIAL_TOKENS = ["<pad>", "<s>", "</s>", "<unk>", "<mask>"]

# Sample input that mixes the special tokens with ordinary text and an emoji.
SAMPLE_TEXT = "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"

tokenizer = ByteLevelBPETokenizer.from_file(VOCAB_FILE, MERGES_FILE)
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Encode the sample and print the resulting token strings.
encoding = tokenizer.encode(SAMPLE_TEXT)
print(encoding.tokens)