diff --git a/README.md b/README.md index 1c6d572..9efdff4 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ --- -pretty_name: IMDB annotations_creators: - expert-generated language_creators: @@ -19,6 +18,40 @@ task_categories: task_ids: - sentiment-classification paperswithcode_id: imdb-movie-reviews +pretty_name: IMDB +dataset_info: + config_name: plain_text + features: + - name: text + dtype: string + - name: label + dtype: + class_label: + names: + '0': neg + '1': pos + splits: + - name: train + num_bytes: 33432823 + num_examples: 25000 + - name: test + num_bytes: 32650685 + num_examples: 25000 + - name: unsupervised + num_bytes: 67106794 + num_examples: 50000 + download_size: 83446840 + dataset_size: 133190302 +configs: +- config_name: plain_text + data_files: + - split: train + path: plain_text/train-* + - split: test + path: plain_text/test-* + - split: unsupervised + path: plain_text/unsupervised-* + default: true train-eval-index: - config: plain_text task: text-classification @@ -68,29 +101,6 @@ train-eval-index: name: Recall weighted args: average: weighted -dataset_info: - features: - - name: text - dtype: string - - name: label - dtype: - class_label: - names: - 0: neg - 1: pos - config_name: plain_text - splits: - - name: train - num_bytes: 33432835 - num_examples: 25000 - - name: test - num_bytes: 32650697 - num_examples: 25000 - - name: unsupervised - num_bytes: 67106814 - num_examples: 50000 - download_size: 84125825 - dataset_size: 133190346 --- # Dataset Card for "imdb" diff --git a/dataset_infos.json b/dataset_infos.json deleted file mode 100644 index 967864f..0000000 --- a/dataset_infos.json +++ /dev/null @@ -1 +0,0 @@ -{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "label", "labels": ["neg", "pos"]}], "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 33432835, "num_examples": 25000, "dataset_name": "imdb"}, "test": {"name": "test", "num_bytes": 32650697, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67106814, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "post_processing_size": null, "dataset_size": 133190346, "size_in_bytes": 217316171}} \ No newline at end of file diff --git a/imdb.py b/imdb.py deleted file mode 100644 index 674ad65..0000000 --- a/imdb.py +++ /dev/null @@ -1,111 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -"""IMDB movie reviews dataset.""" - -import datasets -from datasets.tasks import TextClassification - - -_DESCRIPTION = """\ -Large Movie Review Dataset. -This is a dataset for binary sentiment classification containing substantially \ -more data than previous benchmark datasets. We provide a set of 25,000 highly \ -polar movie reviews for training, and 25,000 for testing. There is additional \ -unlabeled data for use as well.\ -""" - -_CITATION = """\ -@InProceedings{maas-EtAl:2011:ACL-HLT2011, - author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, - title = {Learning Word Vectors for Sentiment Analysis}, - booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, - month = {June}, - year = {2011}, - address = {Portland, Oregon, USA}, - publisher = {Association for Computational Linguistics}, - pages = {142--150}, - url = {http://www.aclweb.org/anthology/P11-1015} -} -""" - -_DOWNLOAD_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" - - -class IMDBReviewsConfig(datasets.BuilderConfig): - """BuilderConfig for IMDBReviews.""" - - def __init__(self, **kwargs): - """BuilderConfig for IMDBReviews. - - Args: - **kwargs: keyword arguments forwarded to super. - """ - super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs) - - -class Imdb(datasets.GeneratorBasedBuilder): - """IMDB movie reviews dataset.""" - - BUILDER_CONFIGS = [ - IMDBReviewsConfig( - name="plain_text", - description="Plain text", - ) - ] - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])} - ), - supervised_keys=None, - homepage="http://ai.stanford.edu/~amaas/data/sentiment/", - citation=_CITATION, - task_templates=[TextClassification(text_column="text", label_column="label")], - ) - - def _split_generators(self, dl_manager): - archive = dl_manager.download(_DOWNLOAD_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"} - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"} - ), - datasets.SplitGenerator( - name=datasets.Split("unsupervised"), - gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False}, - ), - ] - - def _generate_examples(self, files, split, labeled=True): - """Generate aclImdb examples.""" - # For labeled examples, extract the label from the path. - if labeled: - label_mapping = {"pos": 1, "neg": 0} - for path, f in files: - if path.startswith(f"aclImdb/{split}"): - label = label_mapping.get(path.split("/")[2]) - if label is not None: - yield path, {"text": f.read().decode("utf-8"), "label": label} - else: - for path, f in files: - if path.startswith(f"aclImdb/{split}"): - if path.split("/")[2] == "unsup": - yield path, {"text": f.read().decode("utf-8"), "label": -1} diff --git a/plain_text/test-00000-of-00001.parquet b/plain_text/test-00000-of-00001.parquet new file mode 100644 index 0000000..f79b447 --- /dev/null +++ b/plain_text/test-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52e26e2f872d282ffac460bf9770b25ac6f102cda0e6ca7158df98c94e8b3da +size 20470363 diff --git a/plain_text/train-00000-of-00001.parquet b/plain_text/train-00000-of-00001.parquet new file mode 100644 index 0000000..04b6b41 --- /dev/null +++ b/plain_text/train-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db47d16b5c297cc0dd625e519c81319c24c9149e70e8496de5475f6fa928342c +size 20979968 diff --git a/plain_text/unsupervised-00000-of-00001.parquet b/plain_text/unsupervised-00000-of-00001.parquet new file mode 100644 index 0000000..86163ca --- /dev/null +++ b/plain_text/unsupervised-00000-of-00001.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d14fbfcbb39fb7d299c38ca9f0ae6d231bf97108da85d620027ba437b6d52e +size 41996509