Update files from the datasets library (from 1.16.0)
Release notes: https://github.com/huggingface/datasets/releases/tag/1.16.0
This commit is contained in:
parent 0c5d6316a3
commit 9a403d6ee7
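Context for the change below: datasets 1.16.0 reworked many dataset scripts to support streaming, replacing `download_and_extract` plus local filesystem access with `download` plus `dl_manager.iter_archive`, which iterates over archive members without extracting them. A minimal usage sketch of what this enables (assuming a `datasets` release with streaming support; "imdb" is the Hub script this commit updates):

# Minimal sketch of what the streaming-friendly script enables.
# Assumes datasets>=1.16.0 is installed; streaming=True yields examples
# straight out of the remote archive instead of downloading and
# extracting it first.
from itertools import islice

from datasets import load_dataset

imdb = load_dataset("imdb", split="train", streaming=True)
for example in islice(imdb, 3):  # peek at the first three examples
    print(example["label"], example["text"][:80])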
@@ -1,4 +1,5 @@
 ---
+pretty_name: IMDB
 languages:
 - en
 paperswithcode_id: imdb-movie-reviews
imdb.py (44 changed lines)
@@ -16,9 +16,6 @@
 # Lint as: python3
 """IMDB movie reviews dataset."""
 
-
-import os
-
 import datasets
 from datasets.tasks import TextClassification
 
@@ -82,42 +79,33 @@ class Imdb(datasets.GeneratorBasedBuilder):
             task_templates=[TextClassification(text_column="text", label_column="label")],
         )
 
-    def _vocab_text_gen(self, archive):
-        for _, ex in self._generate_examples(archive, os.path.join("aclImdb", "train")):
-            yield ex["text"]
-
     def _split_generators(self, dl_manager):
-        arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
-        data_dir = os.path.join(arch_path, "aclImdb")
+        archive = dl_manager.download(_DOWNLOAD_URL)
         return [
             datasets.SplitGenerator(
-                name=datasets.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}
+                name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
             ),
             datasets.SplitGenerator(
-                name=datasets.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}
+                name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
             ),
             datasets.SplitGenerator(
                 name=datasets.Split("unsupervised"),
-                gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
+                gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
             ),
         ]
 
-    def _generate_examples(self, directory, labeled=True):
-        """Generate IMDB examples."""
+    def _generate_examples(self, files, split, labeled=True):
+        """Generate aclImdb examples."""
         # For labeled examples, extract the label from the path.
         if labeled:
-            files = {
-                "pos": sorted(os.listdir(os.path.join(directory, "pos"))),
-                "neg": sorted(os.listdir(os.path.join(directory, "neg"))),
-            }
-            for key in files:
-                for id_, file in enumerate(files[key]):
-                    filepath = os.path.join(directory, key, file)
-                    with open(filepath, encoding="UTF-8") as f:
-                        yield key + "_" + str(id_), {"text": f.read(), "label": key}
+            label_mapping = {"pos": 1, "neg": 0}
+            for path, f in files:
+                if path.startswith(f"aclImdb/{split}"):
+                    label = label_mapping.get(path.split("/")[2])
+                    if label is not None:
+                        yield path, {"text": f.read().decode("utf-8"), "label": label}
         else:
-            unsup_files = sorted(os.listdir(os.path.join(directory, "unsup")))
-            for id_, file in enumerate(unsup_files):
-                filepath = os.path.join(directory, "unsup", file)
-                with open(filepath, encoding="UTF-8") as f:
-                    yield id_, {"text": f.read(), "label": -1}
+            for path, f in files:
+                if path.startswith(f"aclImdb/{split}"):
+                    if path.split("/")[2] == "unsup":
+                        yield path, {"text": f.read().decode("utf-8"), "label": -1}
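The key move in `_split_generators` above is swapping `download_and_extract` for `download` plus `dl_manager.iter_archive(archive)`, which yields `(path, file_object)` pairs for each archive member, with paths like `aclImdb/train/pos/0_9.txt` and file objects that return bytes on read (hence the `.decode("utf-8")` in `_generate_examples`). A rough standard-library emulation of that contract, usable against a local copy of the archive (the filename below is illustrative, not from this commit):

# Rough local emulation of what dl_manager.iter_archive provides, using
# only the standard library. This sketches the (path, file) contract the
# new _generate_examples relies on; the real DownloadManager streams
# members without extracting the archive to disk.
import tarfile

def iter_archive(archive_path):
    """Yield (member_path, file_object) pairs, like dl_manager.iter_archive."""
    with tarfile.open(archive_path, "r:gz") as tar:
        for member in tar:  # members are visited in archive order
            if member.isfile():
                yield member.name, tar.extractfile(member)  # e.g. "aclImdb/train/pos/0_9.txt"

# Usage against a local copy of the IMDB archive (hypothetical path):
# for path, f in iter_archive("aclImdb_v1.tar.gz"):
#     print(path, f.read(100))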
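The rewritten `_generate_examples` derives labels from archive paths instead of directory listings: `path.split("/")[2]` is the folder directly under `aclImdb/{split}/`, mapped to 1 for `pos`, 0 for `neg`, and `None` (skipped) for anything else, which also filters out non-review members of the archive. A self-contained check of that logic with illustrative paths:

# Self-contained check of the path-based labeling logic used above.
# The paths mirror the aclImdb archive layout; the ones listed are illustrative.
label_mapping = {"pos": 1, "neg": 0}

sample_paths = [
    "aclImdb/train/pos/0_9.txt",    # positive review -> label 1
    "aclImdb/train/neg/0_3.txt",    # negative review -> label 0
    "aclImdb/train/unsup/0_0.txt",  # unsupervised -> None, skipped when labeled=True
    "aclImdb/train/urls_pos.txt",   # index file: split("/")[2] is not "pos"/"neg"
]

for path in sample_paths:
    if path.startswith("aclImdb/train"):
        label = label_mapping.get(path.split("/")[2])
        print(path, "->", label)  # None means the member is skipped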