diff --git a/1_Pooling/config.json b/1_Pooling/config.json
new file mode 100644
index 0000000..9bd8592
--- /dev/null
+++ b/1_Pooling/config.json
@@ -0,0 +1,7 @@
+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": true,
+  "pooling_mode_mean_tokens": false,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 32897cd..73c7dd6 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,56 @@
 ---
-license: mit
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+
 ---
+
+# {MODEL_NAME}
+
+This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+<!--- Describe your model here -->
+
+## Usage (Sentence-Transformers)
+
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+
+```
+pip install -U sentence-transformers
+```
+
+Then you can use the model like this:
+
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+
+model = SentenceTransformer('{MODEL_NAME}')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+
+## Evaluation Results
+
+<!--- Describe how your model was evaluated -->
+
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+
+## Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 8192, 'do_lower_case': False}) with Transformer model: XLMRobertaModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+  (2): Normalize()
+)
+```
+
+## Citing & Authors
+
+<!--- Describe where people can find more information -->
\ No newline at end of file
diff --git a/colbert_linear.pt b/colbert_linear.pt
new file mode 100644
index 0000000..4a8e512
--- /dev/null
+++ b/colbert_linear.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19bfbae397c2b7524158c919d0e9b19393c5639d098f0a66932c91ed8f5f9abb
+size 2100674
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..1a5ff43
--- /dev/null
+++ b/config.json
@@ -0,0 +1,28 @@
+{
+  "_name_or_path": "/home/baaiks/jianlv/models/bge-m3/bge-m3",
+  "architectures": [
+    "XLMRobertaModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "xlm-roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.33.0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
diff --git a/config_sentence_transformers.json b/config_sentence_transformers.json
new file mode 100644
index 0000000..1fba91c
--- /dev/null
+++ b/config_sentence_transformers.json
@@ -0,0 +1,7 @@
+{
+  "__version__": {
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.33.0",
+    "pytorch": "2.1.2+cu121"
+  }
+}
\ No newline at end of file
diff --git a/model.safetensors b/model.safetensors
new file mode 100644
index 0000000..62bb301
--- /dev/null
+++ b/model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:993b2248881724788dcab8c644a91dfd63584b6e5604ff2037cb5541e1e38e7e
+size 2271064456
diff --git a/modules.json b/modules.json
new file mode 100644
index 0000000..952a9b8
--- /dev/null
+++ b/modules.json
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]
\ No newline at end of file
diff --git a/pytorch_model.bin b/pytorch_model.bin
new file mode 100644
index 0000000..88e71f3
--- /dev/null
+++ b/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5e0ce3470abf5ef3831aa1bd5553b486803e83251590ab7ff35a117cf6aad38
+size 2271145830
diff --git a/sentence_bert_config.json b/sentence_bert_config.json
new file mode 100644
index 0000000..0140ba1
--- /dev/null
+++ b/sentence_bert_config.json
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 8192,
+  "do_lower_case": false
+}
\ No newline at end of file
diff --git a/sentencepiece.bpe.model b/sentencepiece.bpe.model
new file mode 100644
index 0000000..7a3f40a
--- /dev/null
+++ b/sentencepiece.bpe.model
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
diff --git a/sparse_linear.pt b/sparse_linear.pt
new file mode 100644
index 0000000..19175a7
--- /dev/null
+++ b/sparse_linear.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45c93804d2142b8f6d7ec6914ae23a1eee9c6a1d27d83d908a20d2afb3595ad9
+size 3516
diff --git a/special_tokens_map.json b/special_tokens_map.json
new file mode 100644
index 0000000..b1879d7
--- /dev/null
+++ b/special_tokens_map.json
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
diff --git a/tokenizer_config.json b/tokenizer_config.json
new file mode 100644
index 0000000..4019fcb
--- /dev/null
+++ b/tokenizer_config.json
@@ -0,0 +1,62 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "__type": "AddedToken",
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 8192,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "unk_token": "<unk>"
+}