Convert dataset to Parquet (#5 )

- Convert dataset to Parquet (610bdae015434d0a02e81468da0abb51c2164bc8)
- Delete loading script (1087fc1e105aad8fa3a6730edb6619b43ec420b5)
- Delete legacy dataset_infos.json (2fa85f972ae01fb2956c9925f7c420be7214de12)
Convert dataset sizes from base 2 to base 10 in the dataset card (#4 )
2024-01-04 12:09:45 +00:00 · 2023-04-05 10:07:38 +00:00 · 2023-02-16 15:28:09 +00:00 · 2022-11-18 20:09:16 +00:00 · 2022-11-03 17:47:48 +01:00 · 2022-10-28 16:35:23 +00:00
7 changed files with 115 additions and 125 deletions
--- a/README.md
+++ b/README.md
@@ -1,8 +1,106 @@
 ---
-pretty_name: IMDB
-languages:
+annotations_creators:
+- expert-generated
+language_creators:
+- expert-generated
+language:
 - en
+license:
+- other
+multilinguality:
+- monolingual
+size_categories:
+- 10K<n<100K
+source_datasets:
+- original
+task_categories:
+- text-classification
+task_ids:
+- sentiment-classification
 paperswithcode_id: imdb-movie-reviews
+pretty_name: IMDB
+dataset_info:
+  config_name: plain_text
+  features:
+  - name: text
+    dtype: string
+  - name: label
+    dtype:
+      class_label:
+        names:
+          '0': neg
+          '1': pos
+  splits:
+  - name: train
+    num_bytes: 33432823
+    num_examples: 25000
+  - name: test
+    num_bytes: 32650685
+    num_examples: 25000
+  - name: unsupervised
+    num_bytes: 67106794
+    num_examples: 50000
+  download_size: 83446840
+  dataset_size: 133190302
+configs:
+- config_name: plain_text
+  data_files:
+  - split: train
+    path: plain_text/train-*
+  - split: test
+    path: plain_text/test-*
+  - split: unsupervised
+    path: plain_text/unsupervised-*
+  default: true
+train-eval-index:
+- config: plain_text
+  task: text-classification
+  task_id: binary_classification
+  splits:
+    train_split: train
+    eval_split: test
+  col_mapping:
+    text: text
+    label: target
+  metrics:
+  - type: accuracy
+    name: Accuracy
+  - type: f1
+    name: F1 macro
+    args:
+      average: macro
+  - type: f1
+    name: F1 micro
+    args:
+      average: micro
+  - type: f1
+    name: F1 weighted
+    args:
+      average: weighted
+  - type: precision
+    name: Precision macro
+    args:
+      average: macro
+  - type: precision
+    name: Precision micro
+    args:
+      average: micro
+  - type: precision
+    name: Precision weighted
+    args:
+      average: weighted
+  - type: recall
+    name: Recall macro
+    args:
+      average: macro
+  - type: recall
+    name: Recall micro
+    args:
+      average: micro
+  - type: recall
+    name: Recall weighted
+    args:
+      average: weighted
 ---

 # Dataset Card for "imdb"
@@ -37,9 +135,9 @@ paperswithcode_id: imdb-movie-reviews
 - **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
 - **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
-- **Size of downloaded dataset files:** 80.23 MB
-- **Size of the generated dataset:** 127.06 MB
-- **Total amount of disk used:** 207.28 MB
+- **Size of downloaded dataset files:** 84.13 MB
+- **Size of the generated dataset:** 133.23 MB
+- **Total amount of disk used:** 217.35 MB

 ### Dataset Summary

@@ -56,15 +154,13 @@ This is a dataset for binary sentiment classification containing substantially m

 ## Dataset Structure

-We show detailed information for up to 5 configurations of the dataset.
-
 ### Data Instances

 #### plain_text

-- **Size of downloaded dataset files:** 80.23 MB
-- **Size of the generated dataset:** 127.06 MB
-- **Total amount of disk used:** 207.28 MB
+- **Size of downloaded dataset files:** 84.13 MB
+- **Size of the generated dataset:** 133.23 MB
+- **Total amount of disk used:** 217.35 MB

 An example of 'train' looks as follows.
 ```
--- a/dataset_infos.json
+++ b/dataset_infos.json
@@ -1 +0,0 @@
-{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "label", "labels": ["neg", "pos"]}], "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 33432835, "num_examples": 25000, "dataset_name": "imdb"}, "test": {"name": "test", "num_bytes": 32650697, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67106814, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": 
"c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "post_processing_size": null, "dataset_size": 133190346, "size_in_bytes": 217316171}}
--- a/dummy/plain_text/1.0.0/dummy_data.zip
+++ b/dummy/plain_text/1.0.0/dummy_data.zip
--- a/imdb.py
+++ b/imdb.py
@@ -1,111 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""IMDB movie reviews dataset."""
-
-import datasets
-from datasets.tasks import TextClassification
-
-
-_DESCRIPTION = """\
-Large Movie Review Dataset.
-This is a dataset for binary sentiment classification containing substantially \
-more data than previous benchmark datasets. We provide a set of 25,000 highly \
-polar movie reviews for training, and 25,000 for testing. There is additional \
-unlabeled data for use as well.\
-"""
-
-_CITATION = """\
-@InProceedings{maas-EtAl:2011:ACL-HLT2011,
-  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
-  title     = {Learning Word Vectors for Sentiment Analysis},
-  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
-  month     = {June},
-  year      = {2011},
-  address   = {Portland, Oregon, USA},
-  publisher = {Association for Computational Linguistics},
-  pages     = {142--150},
-  url       = {http://www.aclweb.org/anthology/P11-1015}
-}
-"""
-
-_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
-
-
-class IMDBReviewsConfig(datasets.BuilderConfig):
-    """BuilderConfig for IMDBReviews."""
-
-    def __init__(self, **kwargs):
-        """BuilderConfig for IMDBReviews.
-
-        Args:
-          **kwargs: keyword arguments forwarded to super.
-        """
-        super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)
-
-
-class Imdb(datasets.GeneratorBasedBuilder):
-    """IMDB movie reviews dataset."""
-
-    BUILDER_CONFIGS = [
-        IMDBReviewsConfig(
-            name="plain_text",
-            description="Plain text",
-        )
-    ]
-
-    def _info(self):
-        return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=datasets.Features(
-                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
-            ),
-            supervised_keys=None,
-            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
-            citation=_CITATION,
-            task_templates=[TextClassification(text_column="text", label_column="label")],
-        )
-
-    def _split_generators(self, dl_manager):
-        archive = dl_manager.download(_DOWNLOAD_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split("unsupervised"),
-                gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
-            ),
-        ]
-
-    def _generate_examples(self, files, split, labeled=True):
-        """Generate aclImdb examples."""
-        # For labeled examples, extract the label from the path.
-        if labeled:
-            label_mapping = {"pos": 1, "neg": 0}
-            for path, f in files:
-                if path.startswith(f"aclImdb/{split}"):
-                    label = label_mapping.get(path.split("/")[2])
-                    if label is not None:
-                        yield path, {"text": f.read().decode("utf-8"), "label": label}
-        else:
-            for path, f in files:
-                if path.startswith(f"aclImdb/{split}"):
-                    if path.split("/")[2] == "unsup":
-                        yield path, {"text": f.read().decode("utf-8"), "label": -1}
--- a/plain_text/test-00000-of-00001.parquet
+++ b/plain_text/test-00000-of-00001.parquet
--- a/plain_text/train-00000-of-00001.parquet
+++ b/plain_text/train-00000-of-00001.parquet
--- a/plain_text/unsupervised-00000-of-00001.parquet
+++ b/plain_text/unsupervised-00000-of-00001.parquet
Author	SHA1	Message	Date
Albert Villanova	e6281661ce	Convert dataset to Parquet (#5 ) - Convert dataset to Parquet (610bdae015434d0a02e81468da0abb51c2164bc8) - Delete loading script (1087fc1e105aad8fa3a6730edb6619b43ec420b5) - Delete legacy dataset_infos.json (2fa85f972ae01fb2956c9925f7c420be7214de12)	2024-01-04 12:09:45 +00:00
Albert Villanova	9c6ede893f	Convert dataset sizes from base 2 to base 10 in the dataset card (#4 ) - Convert dataset sizes from base 2 to base 10 in the dataset card (ffcad612ef8b84500389a498e9046f72ce776524)	2023-04-05 10:07:38 +00:00
Albert Villanova	234e99c22b	Change download link from http to https (#2 ) - Change download link from http to https (dd3f73e25da17138d20d3bf4c41351e54870e632) Co-authored-by: Syed Haque <syedarehaq@users.noreply.huggingface.co>	2023-02-16 15:28:09 +00:00
Albert Villanova	94a9845cc8	Reorder split names (#1 ) - Reorder split names (b6740dccc681412d840918cdeefd3449c5e393f4)	2022-11-18 20:09:16 +00:00
Quentin Lhoest	3be66bfb24	add dataset_info in dataset metadata	2022-11-03 17:47:48 +01:00
mariosasko	b1e50541a3	remove dummmy data	2022-10-28 16:35:23 +00:00
Julien Chaumond	de29c68072	Align more metadata with other repo types (models,spaces) (#4607 ) * language(s) and license(s) for consistency * `licenses:` => `license:` * `languages:` => `language:` * backward-compat * make style * set default values for deprecated languages and licenses Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com> Commit from `a5192964dc`	2022-07-01 11:52:46 +00:00
Nazneen Rajani	92d6c44499	Autoeval config (#4234 ) * autoeval config added * autoeval config added * Added autonlp config changes https://github.com/huggingface/autonlp-backend/issues/414 * multi-input text classification as task id instead of category * improve metadata validation: - support YAML keys with dashes - add train-eval-index validation * revert debugging stuff * fix tests * style * Update metadata.py * Update metadata.py Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com> Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Commit from `6af556b64d`	2022-05-05 18:21:25 +00:00
Albert Villanova del Moral	f04bc7e946	Remove a copy-paste sentence in dataset cards (#4281 ) Commit from `bf6108b8e0`	2022-05-04 18:34:26 +00:00
Nazneen Rajani	d00115d50d	task id update (#4244 ) * autoeval config added * autoeval config added * multi-input text classification as task id instead of category * Added task id * removing autoeval config for now * Added required config Commit from `175b0953fc`	2022-05-04 10:37:08 +00:00
				`@ -1 +0,0 @@`
				{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "task_templates": [{"task": "text-classification", "text_column": "text", "label_column": "label", "labels": ["neg", "pos"]}], "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 33432835, "num_examples": 25000, "dataset_name": "imdb"}, "test": {"name": "test", "num_bytes": 32650697, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67106814, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": 
"c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "post_processing_size": null, "dataset_size": 133190346, "size_in_bytes": 217316171}}