From 339ce92d052f002cdbac4a4bd551d1c61dd8345e Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Tue, 23 Apr 2024 11:33:49 +0000 Subject: [PATCH] Update post-processor to add bos (#42) - Update post-processor to add bos (4d3ac242e1d717fbebaa94154be38077f4e1623b) --- tokenizer.json | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/tokenizer.json b/tokenizer.json index 94eacd0..b197f72 100644 --- a/tokenizer.json +++ b/tokenizer.json @@ -2329,10 +2329,69 @@ ] }, "post_processor": { - "type": "ByteLevel", - "add_prefix_space": true, - "trim_offsets": false, - "use_regex": true + "type": "Sequence", + "processors": [ + { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": false, + "use_regex": true + }, + { + "type": "TemplateProcessing", + "single": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + } + ], + "pair": [ + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 0 + } + }, + { + "Sequence": { + "id": "A", + "type_id": 0 + } + }, + { + "SpecialToken": { + "id": "<|begin_of_text|>", + "type_id": 1 + } + }, + { + "Sequence": { + "id": "B", + "type_id": 1 + } + } + ], + "special_tokens": { + "<|begin_of_text|>": { + "id": "<|begin_of_text|>", + "ids": [ + 128000 + ], + "tokens": [ + "<|begin_of_text|>" + ] + } + } + } + ] }, "decoder": { "type": "ByteLevel",