Update files from the datasets library (from 1.0.0)
Release notes: https://github.com/huggingface/datasets/releases/tag/1.0.0
This commit is contained in:
commit
9fd6e32e8d
27
.gitattributes
vendored
Normal file
27
.gitattributes
vendored
Normal file
@ -0,0 +1,27 @@
|
||||
*.7z filter=lfs diff=lfs merge=lfs -text
|
||||
*.arrow filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin filter=lfs diff=lfs merge=lfs -text
|
||||
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
||||
*.ftz filter=lfs diff=lfs merge=lfs -text
|
||||
*.gz filter=lfs diff=lfs merge=lfs -text
|
||||
*.h5 filter=lfs diff=lfs merge=lfs -text
|
||||
*.joblib filter=lfs diff=lfs merge=lfs -text
|
||||
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.model filter=lfs diff=lfs merge=lfs -text
|
||||
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
||||
*.onnx filter=lfs diff=lfs merge=lfs -text
|
||||
*.ot filter=lfs diff=lfs merge=lfs -text
|
||||
*.parquet filter=lfs diff=lfs merge=lfs -text
|
||||
*.pb filter=lfs diff=lfs merge=lfs -text
|
||||
*.pt filter=lfs diff=lfs merge=lfs -text
|
||||
*.pth filter=lfs diff=lfs merge=lfs -text
|
||||
*.rar filter=lfs diff=lfs merge=lfs -text
|
||||
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
||||
*.tflite filter=lfs diff=lfs merge=lfs -text
|
||||
*.tgz filter=lfs diff=lfs merge=lfs -text
|
||||
*.xz filter=lfs diff=lfs merge=lfs -text
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
||||
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
||||
1
dataset_infos.json
Normal file
1
dataset_infos.json
Normal file
@ -0,0 +1 @@
|
||||
{"plain_text": {"description": "Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.", "citation": "@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},\n title = {Learning Word Vectors for Sentiment Analysis},\n booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n month = {June},\n year = {2011},\n address = {Portland, Oregon, USA},\n publisher = {Association for Computational Linguistics},\n pages = {142--150},\n url = {http://www.aclweb.org/anthology/P11-1015}\n}\n", "homepage": "http://ai.stanford.edu/~amaas/data/sentiment/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["neg", "pos"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "supervised_keys": null, "builder_name": "imdb", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 32660064, "num_examples": 25000, "dataset_name": "imdb"}, "train": {"name": "train", "num_bytes": 33442202, "num_examples": 25000, "dataset_name": "imdb"}, "unsupervised": {"name": "unsupervised", "num_bytes": 67125548, "num_examples": 50000, "dataset_name": "imdb"}}, "download_checksums": {"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz": {"num_bytes": 84125825, "checksum": "c40f74a18d3b61f90feba1e17730e0d38e8b97c05fde7008942e91923d1658fe"}}, "download_size": 84125825, "dataset_size": 133227814, "size_in_bytes": 217353639}}
|
||||
BIN
dummy/plain_text/1.0.0/dummy_data.zip
(Stored with Git LFS)
Normal file
BIN
dummy/plain_text/1.0.0/dummy_data.zip
(Stored with Git LFS)
Normal file
Binary file not shown.
122
imdb.py
Normal file
122
imdb.py
Normal file
@ -0,0 +1,122 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Lint as: python3
|
||||
"""IMDB movie reviews dataset."""
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import os
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
_DESCRIPTION = """\
|
||||
Large Movie Review Dataset.
|
||||
This is a dataset for binary sentiment classification containing substantially \
|
||||
more data than previous benchmark datasets. We provide a set of 25,000 highly \
|
||||
polar movie reviews for training, and 25,000 for testing. There is additional \
|
||||
unlabeled data for use as well.\
|
||||
"""
|
||||
|
||||
_CITATION = """\
|
||||
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
|
||||
author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
|
||||
title = {Learning Word Vectors for Sentiment Analysis},
|
||||
booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
|
||||
month = {June},
|
||||
year = {2011},
|
||||
address = {Portland, Oregon, USA},
|
||||
publisher = {Association for Computational Linguistics},
|
||||
pages = {142--150},
|
||||
url = {http://www.aclweb.org/anthology/P11-1015}
|
||||
}
|
||||
"""
|
||||
|
||||
# Gzipped tarball fetched and unpacked by `Imdb._split_generators`, which then
# looks for a top-level `aclImdb` directory inside the extracted result.
_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
|
||||
|
||||
|
||||
class IMDBReviewsConfig(datasets.BuilderConfig):
    """Builder configuration for the IMDB reviews dataset, pinned to version 1.0.0."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: keyword arguments passed through unchanged to
                `datasets.BuilderConfig`. The version is set here, so it
                must not also be supplied by the caller.
        """
        pinned_version = datasets.Version("1.0.0", "")
        super().__init__(version=pinned_version, **kwargs)
|
||||
|
||||
|
||||
class Imdb(datasets.GeneratorBasedBuilder):
    """IMDB movie reviews dataset."""

    BUILDER_CONFIGS = [
        IMDBReviewsConfig(
            name="plain_text",
            description="Plain text",
        )
    ]

    def _info(self):
        """Return the dataset metadata: a string `text` and a `neg`/`pos` class `label`."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
            ),
            supervised_keys=None,
            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
            citation=_CITATION,
        )

    def _vocab_text_gen(self, archive):
        """Yield the review text of every labeled training example.

        Args:
            archive: presumably the root of the extracted archive, i.e. the
                directory that contains `aclImdb` (no caller is visible in
                this file -- confirm before relying on it).
        """
        # `_generate_examples` accepts one directory plus a `labeled` flag, so
        # the train path has to be joined into a single argument; passing the
        # sub-path separately would bind it to `labeled` instead.
        train_dir = os.path.join(archive, "aclImdb", "train")
        for _, ex in self._generate_examples(train_dir):
            yield ex["text"]

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then describe the three splits.

        Args:
            dl_manager: download manager providing `download_and_extract`.

        Returns:
            A list of `datasets.SplitGenerator` for the train, test and
            "unsupervised" splits.
        """
        arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
        data_dir = os.path.join(arch_path, "aclImdb")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}
            ),
            # The unlabeled reviews are read from the `unsup` folder under the train directory.
            datasets.SplitGenerator(
                name=datasets.Split("unsupervised"),
                gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
            ),
        ]

    def _generate_examples(self, directory, labeled=True):
        """Generate IMDB examples.

        Args:
            directory: split directory holding `pos` and `neg` sub-folders
                (labeled) or an `unsup` sub-folder (unlabeled).
            labeled: when true, read `pos`/`neg` and use the folder name as
                the label; otherwise read `unsup` and use label -1.

        Yields:
            `(key, example)` pairs where `example` has `text` and `label`.
            Keys are `"<label>_<index>"` for labeled data and the plain index
            for unlabeled data; the index is the position in the sorted
            file listing of the folder.
        """
        # For labeled examples, extract the label from the path.
        if labeled:
            # Both folders are listed up front, before the first example is yielded.
            files = {
                "pos": sorted(os.listdir(os.path.join(directory, "pos"))),
                "neg": sorted(os.listdir(os.path.join(directory, "neg"))),
            }
            for key in files:
                for id_, filename in enumerate(files[key]):
                    filepath = os.path.join(directory, key, filename)
                    with open(filepath, encoding="UTF-8") as f:
                        yield key + "_" + str(id_), {"text": f.read(), "label": key}
        else:
            unsup_files = sorted(os.listdir(os.path.join(directory, "unsup")))
            for id_, filename in enumerate(unsup_files):
                filepath = os.path.join(directory, "unsup", filename)
                with open(filepath, encoding="UTF-8") as f:
                    yield id_, {"text": f.read(), "label": -1}
|
||||
Loading…
Reference in New Issue
Block a user