diff --git a/special_tokens_map.json b/special_tokens_map.json index f55d8b6..02ee80b 100644 --- a/special_tokens_map.json +++ b/special_tokens_map.json @@ -1,4 +1,16 @@ { - "bos_token": "<|begin_of_text|>", - "eos_token": "<|eot_id|>" + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } } diff --git a/tokenizer.json b/tokenizer.json index 4128065..5cc5f00 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -2329,10 +2329,69 @@ ] }, "post_processor": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": false, - "use_regex": true + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] }, "decoder": { "type": "ByteLevel",