From 9fd6e32e8d229e2f4fff880c9a12fa5f71100793 Mon Sep 17 00:00:00 2001
From: system
Date: Tue, 25 Jan 2022 16:34:04 +0100
Subject: [PATCH] Update files from the datasets library (from 1.0.0)

Release notes: https://github.com/huggingface/datasets/releases/tag/1.0.0
---
 .gitattributes                        |  27 ++++++
 dataset_infos.json                    |   1 +
 dummy/plain_text/1.0.0/dummy_data.zip |   3 +
 imdb.py                               | 122 ++++++++++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 dataset_infos.json
 create mode 100644 dummy/plain_text/1.0.0/dummy_data.zip
 create mode 100644 imdb.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..957b257
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/dataset_infos.json b/dataset_infos.json
new file mode 100644
index 0000000..b5f2589
--- /dev/null
+++ b/dataset_infos.json
@@ -0,0 +1 @@
+{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 32660064, "num_examples": 25000, "dataset_name": "imdb"}, "train": {"name": "train", "num_bytes": 33442202, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67125548, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "dataset_size": 133227814, "size_in_bytes": 217353639}}
\ No newline at end of file
diff --git a/dummy/plain_text/1.0.0/dummy_data.zip b/dummy/plain_text/1.0.0/dummy_data.zip
new file mode 100644
index 0000000..748977e
--- /dev/null
+++ b/dummy/plain_text/1.0.0/dummy_data.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:570a8f885827a2f340aec4a9f8b3452d037ee361ae00aa97c12d85bf3fc59e6a
+size 4699
diff --git a/imdb.py b/imdb.py
new file mode 100644
index 0000000..b205b83
--- /dev/null
+++ b/imdb.py
@@ -0,0 +1,122 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""IMDB movie reviews dataset."""
+
+from __future__ import absolute_import, division, print_function
+
+import os
+
+import datasets
+
+
+_DESCRIPTION = """\
+Large Movie Review Dataset.
+This is a dataset for binary sentiment classification containing substantially \
+more data than previous benchmark datasets. We provide a set of 25,000 highly \
+polar movie reviews for training, and 25,000 for testing. There is additional \
+unlabeled data for use as well.\
+"""
+
+_CITATION = """\
+@InProceedings{maas-EtAl:2011:ACL-HLT2011,
+ author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
+ title = {Learning Word Vectors for Sentiment Analysis},
+ booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
+ month = {June},
+ year = {2011},
+ address = {Portland, Oregon, USA},
+ publisher = {Association for Computational Linguistics},
+ pages = {142--150},
+ url = {http://www.aclweb.org/anthology/P11-1015}
+}
+"""
+
+_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+
+class IMDBReviewsConfig(datasets.BuilderConfig):
+    """BuilderConfig for IMDBReviews."""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for IMDBReviews.
+
+        Args:
+          **kwargs: keyword arguments forwarded to super.
+        """
+        super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
+
+
+class Imdb(datasets.GeneratorBasedBuilder):
+    """IMDB movie reviews dataset."""
+
+    BUILDER_CONFIGS = [
+        IMDBReviewsConfig(
+            name="plain_text",
+            description="Plain text",
+        )
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
+            ),
+            supervised_keys=None,
+            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
+            citation=_CITATION,
+        )
+
+    def _vocab_text_gen(self, archive):
+        for _, ex in self._generate_examples(archive, os.path.join("aclImdb", "train")):
+            yield ex["text"]
+
+    def _split_generators(self, dl_manager):
+        arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
+        data_dir = os.path.join(arch_path, "aclImdb")
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split("unsupervised"),
+                gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
+            ),
+        ]
+
+    def _generate_examples(self, directory, labeled=True):
+        """Generate IMDB examples."""
+        # For labeled examples, extract the label from the path.
+        if labeled:
+            files = {
+                "pos": sorted(os.listdir(os.path.join(directory, "pos"))),
+                "neg": sorted(os.listdir(os.path.join(directory, "neg"))),
+            }
+            for key in files:
+                for id_, file in enumerate(files[key]):
+                    filepath = os.path.join(directory, key, file)
+                    with open(filepath, encoding="UTF-8") as f:
+                        yield key + "_" + str(id_), {"text": f.read(), "label": key}
+        else:
+            unsup_files = sorted(os.listdir(os.path.join(directory, "unsup")))
+            for id_, file in enumerate(unsup_files):
+                filepath = os.path.join(directory, "unsup", file)
+                with open(filepath, encoding="UTF-8") as f:
+                    yield id_, {"text": f.read(), "label": -1}