7 changed files with 126 additions and 116 deletions
--- a/README.md
+++ b/README.md
@ -1,106 +1,8 @@
 ---
 annotations_creators:
 - expert-generated
 language_creators:
 - expert-generated
 language:
 - en
 license:
 - other
 multilinguality:
 - monolingual
 size_categories:
 - 10K<n<100K
 source_datasets:
 - original
 task_categories:
 - text-classification
 task_ids:
 - sentiment-classification
 paperswithcode_id: imdb-movie-reviews
 pretty_name: IMDB
-dataset_info:
+languages:
-  config_name: plain_text
+- en
-  features:
+paperswithcode_id: imdb-movie-reviews
  - name: text
    dtype: string
  - name: label
    dtype:
      class_label:
        names:
          '0': neg
          '1': pos
  splits:
  - name: train
    num_bytes: 33432823
    num_examples: 25000
  - name: test
    num_bytes: 32650685
    num_examples: 25000
  - name: unsupervised
    num_bytes: 67106794
    num_examples: 50000
  download_size: 83446840
  dataset_size: 133190302
 configs:
 - config_name: plain_text
  data_files:
  - split: train
    path: plain_text/train-*
  - split: test
    path: plain_text/test-*
  - split: unsupervised
    path: plain_text/unsupervised-*
  default: true
 train-eval-index:
 - config: plain_text
  task: text-classification
  task_id: binary_classification
  splits:
    train_split: train
    eval_split: test
  col_mapping:
    text: text
    label: target
  metrics:
  - type: accuracy
  - name: Accuracy
  - type: f1
    name: F1 macro
    args:
      average: macro
  - type: f1
    name: F1 micro
    args:
      average: micro
  - type: f1
    name: F1 weighted
    args:
      average: weighted
  - type: precision
    name: Precision macro
    args:
      average: macro
  - type: precision
    name: Precision micro
    args:
      average: micro
  - type: precision
    name: Precision weighted
    args:
      average: weighted
  - type: recall
    name: Recall macro
    args:
      average: macro
  - type: recall
    name: Recall micro
    args:
      average: micro
  - type: recall
    name: Recall weighted
    args:
      average: weighted
 ---
 # Dataset Card for "imdb"
@ -135,9 +37,9 @@ train-eval-index:
 - **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Size of downloaded dataset files:** 84.13 MB
+- **Size of downloaded dataset files:** 80.23 MB
- **Size of the generated dataset:** 133.23 MB
+- **Size of the generated dataset:** 127.06 MB
- **Total amount of disk used:** 217.35 MB
+- **Total amount of disk used:** 207.28 MB
 ### Dataset Summary
@ -154,13 +56,15 @@ This is a dataset for binary sentiment classification containing substantially m
 ## Dataset Structure
 We show detailed information for up to 5 configurations of the dataset.
 ### Data Instances
 #### plain_text
- **Size of downloaded dataset files:** 84.13 MB
+- **Size of downloaded dataset files:** 80.23 MB
- **Size of the generated dataset:** 133.23 MB
+- **Size of the generated dataset:** 127.06 MB
- **Total amount of disk used:** 217.35 MB
+- **Total amount of disk used:** 207.28 MB
 An example of 'train' looks as follows.
 ```
--- a/dataset_infos.json
+++ b/dataset_infos.json
@ -0,0 +1 @@
 {"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "label", "labels": ["neg", "pos"]}], "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 33432835, "num_examples": 25000, "dataset_name": "imdb"}, "test": {"name": "test", "num_bytes": 32650697, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67106814, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "post_processing_size": null, "dataset_size": 133190346, "size_in_bytes": 217316171}}
--- a/dummy/plain_text/1.0.0/dummy_data.zip
+++ b/dummy/plain_text/1.0.0/dummy_data.zip
--- a/imdb.py
+++ b/imdb.py
@ -0,0 +1,111 @@
 # coding=utf-8
 # Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Lint as: python3
 """IMDB movie reviews dataset."""
 import datasets
 from datasets.tasks import TextClassification
 _DESCRIPTION = """\
 Large Movie Review Dataset.
 This is a dataset for binary sentiment classification containing substantially \
 more data than previous benchmark datasets. We provide a set of 25,000 highly \
 polar movie reviews for training, and 25,000 for testing. There is additional \
 unlabeled data for use as well.\
 """
 _CITATION = """\
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
 }
 """
 _DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
 class IMDBReviewsConfig(datasets.BuilderConfig):
    """BuilderConfig for IMDBReviews."""
    def __init__(self, **kwargs):
        """BuilderConfig for IMDBReviews.
        Args:
          **kwargs: keyword arguments forwarded to super.
        """
        super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
 class Imdb(datasets.GeneratorBasedBuilder):
    """IMDB movie reviews dataset."""
    BUILDER_CONFIGS = [
        IMDBReviewsConfig(
            name="plain_text",
            description="Plain text",
        )
    ]
    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
            ),
            supervised_keys=None,
            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
            citation=_CITATION,
            task_templates=[TextClassification(text_column="text", label_column="label")],
        )
    def _split_generators(self, dl_manager):
        archive = dl_manager.download(_DOWNLOAD_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split("unsupervised"),
                gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
            ),
        ]
    def _generate_examples(self, files, split, labeled=True):
        """Generate aclImdb examples."""
        # For labeled examples, extract the label from the path.
        if labeled:
            label_mapping = {"pos": 1, "neg": 0}
            for path, f in files:
                if path.startswith(f"aclImdb/{split}"):
                    label = label_mapping.get(path.split("/")[2])
                    if label is not None:
                        yield path, {"text": f.read().decode("utf-8"), "label": label}
        else:
            for path, f in files:
                if path.startswith(f"aclImdb/{split}"):
                    if path.split("/")[2] == "unsup":
                        yield path, {"text": f.read().decode("utf-8"), "label": -1}
--- a/plain_text/test-00000-of-00001.parquet
+++ b/plain_text/test-00000-of-00001.parquet
--- a/plain_text/train-00000-of-00001.parquet
+++ b/plain_text/train-00000-of-00001.parquet
--- a/plain_text/unsupervised-00000-of-00001.parquet
+++ b/plain_text/unsupervised-00000-of-00001.parquet
		`@ -0,0 +1 @@`
							{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "label", "labels": ["neg", "pos"]}], "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 33432835, "num_examples": 25000, "dataset_name": "imdb"}, "test": {"name": "test", "num_bytes": 32650697, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67106814, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "post_processing_size": null, "dataset_size": 133190346, "size_in_bytes": 217316171}}