# Train a byte-level BPE tokenizer on the code and documentation corpora,
# save the trained model files to disk, then print a sample encoding as a
# quick sanity check.
import os

from tokenizers import ByteLevelBPETokenizer

# Plain-text training corpora, resolved relative to the working directory.
paths = ['train_code.txt', 'train_doc.txt']

# NOTE(review): the original list held five identical empty strings, which
# looks like angle-bracket tokens stripped as markup somewhere upstream.
# Restored to the RoBERTa-style special-token set; confirm these match the
# tokens the downstream model expects.
SPECIAL_TOKENS = [
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<mask>",
]

# Directory and file-name prefix for the saved model files.
OUTPUT_DIR = "./salesforce"
MODEL_PREFIX = "codet5"

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=32000,
    min_frequency=3,
    special_tokens=SPECIAL_TOKENS,
)

# Save files to disk. save_model() does not create the target directory, so
# make sure it exists first instead of failing after a long training run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
tokenizer.save_model(OUTPUT_DIR, MODEL_PREFIX)

# Sanity check: the sample sentence includes special tokens so the printed
# token list shows whether they are kept intact rather than split into bytes.
# NOTE(review): the special tokens inside this sentence were restored together
# with SPECIAL_TOKENS above (the original had bare gaps in their place).
print(
    tokenizer.encode(
        "<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>"
    ).tokens
)