From 9fd6e32e8d229e2f4fff880c9a12fa5f71100793 Mon Sep 17 00:00:00 2001
From: system <system@huggingface.co>
Date: Tue, 25 Jan 2022 16:34:04 +0100
Subject: [PATCH]  Update files from the datasets library (from 1.0.0)

 Release notes: https://github.com/huggingface/datasets/releases/tag/1.0.0
---
 .gitattributes                        |  27 ++++++
 dataset_infos.json                    |   1 +
 dummy/plain_text/1.0.0/dummy_data.zip |   3 +
 imdb.py                               | 122 ++++++++++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 .gitattributes
 create mode 100644 dataset_infos.json
 create mode 100644 dummy/plain_text/1.0.0/dummy_data.zip
 create mode 100644 imdb.py

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..957b257
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,27 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/dataset_infos.json b/dataset_infos.json
new file mode 100644
index 0000000..b5f2589
--- /dev/null
+++ b/dataset_infos.json
@@ -0,0 +1 @@
+{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 32660064, "num_examples": 25000, "dataset_name": "imdb"}, "train": {"name": "train", "num_bytes": 33442202, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67125548, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "dataset_size": 133227814, "size_in_bytes": 217353639}}
\ No newline at end of file
diff --git a/dummy/plain_text/1.0.0/dummy_data.zip b/dummy/plain_text/1.0.0/dummy_data.zip
new file mode 100644
index 0000000..748977e
--- /dev/null
+++ b/dummy/plain_text/1.0.0/dummy_data.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:570a8f885827a2f340aec4a9f8b3452d037ee361ae00aa97c12d85bf3fc59e6a
+size 4699
diff --git a/imdb.py b/imdb.py
new file mode 100644
index 0000000..b205b83
--- /dev/null
+++ b/imdb.py
@@ -0,0 +1,122 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""IMDB movie reviews dataset."""
+
+from __future__ import absolute_import, division, print_function
+
+import os
+
+import datasets
+
+
+_DESCRIPTION = """\
+Large Movie Review Dataset.
+This is a dataset for binary sentiment classification containing substantially \
+more data than previous benchmark datasets. We provide a set of 25,000 highly \
+polar movie reviews for training, and 25,000 for testing. There is additional \
+unlabeled data for use as well.\
+"""
+# BibTeX entry passed as the `citation` field of the `DatasetInfo` built in `Imdb._info`.
+_CITATION = """\
+@InProceedings{maas-EtAl:2011:ACL-HLT2011,
+  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
+  title     = {Learning Word Vectors for Sentiment Analysis},
+  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
+  month     = {June},
+  year      = {2011},
+  address   = {Portland, Oregon, USA},
+  publisher = {Association for Computational Linguistics},
+  pages     = {142--150},
+  url       = {http://www.aclweb.org/anthology/P11-1015}
+}
+"""
+# Archive that `Imdb._split_generators` downloads and extracts; the code expects an `aclImdb` folder inside it.
+_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+
+class IMDBReviewsConfig(datasets.BuilderConfig):
+    """Builder configuration for the IMDB reviews dataset, with the version fixed at 1.0.0."""
+
+    def __init__(self, **kwargs):
+        """Create the configuration with `version` set to `datasets.Version("1.0.0", "")`.
+
+        Args:
+          **kwargs: remaining keyword arguments, passed unchanged to `datasets.BuilderConfig`.
+        """
+        super().__init__(version=datasets.Version("1.0.0", ""), **kwargs)
+
+
+class Imdb(datasets.GeneratorBasedBuilder):
+    """IMDB movie reviews dataset builder: labeled train/test splits plus an unlabeled split."""
+
+    BUILDER_CONFIGS = [IMDBReviewsConfig(name="plain_text", description="Plain text")]
+
+    def _info(self):
+        """Return the dataset metadata: a string `text` feature and a `neg`/`pos` class `label`."""
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
+            ),
+            supervised_keys=None,
+            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
+            citation=_CITATION,
+        )
+
+    def _vocab_text_gen(self, archive):
+        """Yield the text of each labeled training review under the extracted `archive` root."""
+        # Join into ONE directory argument: a second positional argument would bind to `labeled`.
+        for _, ex in self._generate_examples(os.path.join(archive, "aclImdb", "train")):
+            yield ex["text"]
+
+    def _split_generators(self, dl_manager):
+        """Download and extract the archive, then declare the train, test and unsupervised splits."""
+        data_dir = os.path.join(dl_manager.download_and_extract(_DOWNLOAD_URL), "aclImdb")
+        train_dir = os.path.join(data_dir, "train")
+        test_dir = os.path.join(data_dir, "test")
+        return [
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"directory": train_dir}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"directory": test_dir}),
+            # The unlabeled reviews are read from the `unsup` subfolder of the train directory.
+            datasets.SplitGenerator(
+                name=datasets.Split("unsupervised"), gen_kwargs={"directory": train_dir, "labeled": False}
+            ),
+        ]
+
+    def _generate_examples(self, directory, labeled=True):
+        """Yield `(key, example)` pairs for the review files found under `directory`.
+
+        Args:
+          directory: split directory holding `pos` and `neg` subfolders (labeled) or `unsup` (unlabeled).
+          labeled: if True, each label is the name of the file's subfolder; otherwise every label is -1.
+
+        Yields:
+          `("pos_<i>" or "neg_<i>", example)` when labeled, `(<i>, example)` otherwise, where `<i>` is the
+          file's index in the sorted directory listing and `example` has `text` and `label` entries.
+        """
+        if labeled:
+            # List both folders up front so a missing one fails before any example is produced.
+            files = {key: sorted(os.listdir(os.path.join(directory, key))) for key in ("pos", "neg")}
+            for key, filenames in files.items():
+                for id_, filename in enumerate(filenames):
+                    with open(os.path.join(directory, key, filename), encoding="UTF-8") as f:
+                        yield key + "_" + str(id_), {"text": f.read(), "label": key}
+        else:
+            unsup_dir = os.path.join(directory, "unsup")
+            for id_, filename in enumerate(sorted(os.listdir(unsup_dir))):
+                with open(os.path.join(unsup_dir, filename), encoding="UTF-8") as f:
+                    yield id_, {"text": f.read(), "label": -1}