diff --git a/tokenization_arcade100k.py b/tokenization_arcade100k.py
index ade4c84..be91425 100644
--- a/tokenization_arcade100k.py
+++ b/tokenization_arcade100k.py
@@ -126,6 +126,9 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
         self.decoder.update({i: n for n, i in self.tokenizer._special_tokens.items()})
         self.eos_token = self.decoder[self.tokenizer.eot_token]
         self.pad_token = self.decoder[self.tokenizer.eot_token]
+        # Expose the underlying BPE merge ranks and special-token map for convenience.
+        self.mergeable_ranks = self.tokenizer._mergeable_ranks
+        self.special_tokens = self.tokenizer._special_tokens
 
     def __len__(self):
         return self.tokenizer.n_vocab
@@ -270,4 +273,4 @@ class Arcade100kTokenizer(PreTrainedTokenizer):
             token_ids = [token_ids]
         if skip_special_tokens:
             token_ids = [i for i in token_ids if i < self.tokenizer.eot_token]
-        return self.tokenizer.decode(token_ids)
+        return self.tokenizer.decode(token_ids)
\ No newline at end of file
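
For reviewers, a minimal sketch of how the newly exposed attributes might be consumed downstream. The checkpoint path and the loading route via AutoTokenizer with trust_remote_code are assumptions for illustration, not part of this patch; only the mergeable_ranks and special_tokens attributes come from the diff above.

# Usage sketch (hypothetical): load the tokenizer through transformers' remote-code
# path and read the attributes exposed by this patch. The checkpoint name below is
# a placeholder, not taken from this repository.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/arcade100k-checkpoint", trust_remote_code=True)

# mergeable_ranks: dict mapping BPE byte sequences to integer ranks (the regular vocab).
# special_tokens: dict mapping special-token strings to their token ids.
print(len(tok.mergeable_ranks), "BPE merge ranks")
print(len(tok.special_tokens), "special tokens")
print(tok.special_tokens.get("<|endoftext|>"))  # id of the EOT token, if it uses this name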