Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NLP -> HF Datasets #1137

Merged
merged 5 commits into from
Oct 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion jiant/scripts/download_data/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Directly download tasks when nlp format is different than original dataset
# Directly download tasks when not available in HF Datasets, or HF Datasets version
# is not suitable
SQUAD_TASKS = {"squad_v1", "squad_v2"}
DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
"wsc": f"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Use this for tasks that can be obtained from NLP without further/special processing"""
"""Use this for tasks that can be obtained from HF Datasets without further/special processing"""

import jiant.scripts.download_data.utils as download_utils
import jiant.utils.python.io as py_io
Expand All @@ -21,7 +21,7 @@
)


NLP_CONVERSION_DICT = {
HF_DATASETS_CONVERSION_DICT = {
# === GLUE === #
"cola": {
"path": "glue",
Expand Down Expand Up @@ -152,24 +152,24 @@
},
}

# NLP uses "validation", we use "val"
# HF Datasets uses "validation", we use "val"
DEFAULT_PHASE_MAP = {"validation": "val"}


def download_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
nlp_conversion_metadata = NLP_CONVERSION_DICT[task_name]
examples_dict = download_utils.convert_nlp_dataset_to_examples(
path=nlp_conversion_metadata["path"],
name=nlp_conversion_metadata.get("name"),
field_map=nlp_conversion_metadata.get("field_map"),
label_map=nlp_conversion_metadata.get("label_map"),
phase_map=nlp_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
phase_list=nlp_conversion_metadata.get("phase_list"),
hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
examples_dict = download_utils.convert_hf_dataset_to_examples(
path=hf_datasets_conversion_metadata["path"],
name=hf_datasets_conversion_metadata.get("name"),
field_map=hf_datasets_conversion_metadata.get("field_map"),
label_map=hf_datasets_conversion_metadata.get("label_map"),
phase_map=hf_datasets_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
phase_list=hf_datasets_conversion_metadata.get("phase_list"),
)
paths_dict = download_utils.write_examples_to_jsonls(
examples_dict=examples_dict, task_data_path=task_data_path,
)
jiant_task_name = nlp_conversion_metadata.get("jiant_task_name", task_name)
jiant_task_name = hf_datasets_conversion_metadata.get("jiant_task_name", task_name)
py_io.write_json(
data={"task": jiant_task_name, "paths": paths_dict, "name": task_name},
path=task_config_path,
Expand Down
16 changes: 8 additions & 8 deletions jiant/scripts/download_data/runscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@
import argparse

import jiant.utils.python.io as py_io
import jiant.scripts.download_data.datasets.nlp_tasks as nlp_tasks_download
import jiant.scripts.download_data.datasets.hf_datasets_tasks as hf_datasets_tasks_download
import jiant.scripts.download_data.datasets.xtreme as xtreme_download
import jiant.scripts.download_data.datasets.files_tasks as files_tasks_download
from jiant.tasks.constants import (
GLUE_TASKS,
SUPERGLUE_TASKS,
OTHER_NLP_TASKS,
OTHER_HF_DATASETS_TASKS,
XTREME_TASKS,
BENCHMARKS,
)
from jiant.scripts.download_data.constants import SQUAD_TASKS, DIRECT_DOWNLOAD_TASKS

# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the nlp
# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the HF Datasets
# implementation differs from the original dataset format
NLP_DOWNLOADER_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_NLP_TASKS) - DIRECT_DOWNLOAD_TASKS
SUPPORTED_TASKS = NLP_DOWNLOADER_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS
HF_DATASETS_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_HF_DATASETS_TASKS) - DIRECT_DOWNLOAD_TASKS
SUPPORTED_TASKS = HF_DATASETS_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS


# noinspection PyUnusedLocal
Expand Down Expand Up @@ -50,8 +50,8 @@ def download_data(task_names, output_base_path):
for i, task_name in enumerate(task_names):
task_data_path = os.path.join(task_data_base_path, task_name)

if task_name in NLP_DOWNLOADER_TASKS:
nlp_tasks_download.download_data_and_write_config(
if task_name in HF_DATASETS_TASKS:
hf_datasets_tasks_download.download_data_and_write_config(
task_name=task_name,
task_data_path=task_data_path,
task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
Expand All @@ -74,7 +74,7 @@ def download_data(task_names, output_base_path):


def main():
parser = argparse.ArgumentParser(description="Download NLP datasets and generate task configs")
parser = argparse.ArgumentParser(description="Download datasets and generate task configs")
subparsers = parser.add_subparsers()
sp_list = subparsers.add_parser("list", help="list supported tasks in downloader")
sp_download = subparsers.add_parser("download", help="download data command")
Expand Down
14 changes: 7 additions & 7 deletions jiant/scripts/download_data/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import nlp
import datasets
import os
import tarfile
import urllib
Expand All @@ -8,15 +8,15 @@
from jiant.utils.python.datastructures import replace_key


def convert_nlp_dataset_to_examples(
def convert_hf_dataset_to_examples(
path, name=None, version=None, field_map=None, label_map=None, phase_map=None, phase_list=None
):
"""Helper function for reading from nlp.load_dataset and converting to examples
"""Helper function for reading from datasets.load_dataset and converting to examples

Args:
path: path argument (from nlp.load_dataset)
name: name argument (from nlp.load_dataset)
version: version argument (from nlp.load_dataset)
path: path argument (from datasets.load_dataset)
name: name argument (from datasets.load_dataset)
version: version argument (from datasets.load_dataset)
field_map: dictionary for renaming fields, non-exhaustive
label_map: dictionary for replacing labels, non-exhaustive
phase_map: dictionary for replacing phase names, non-exhaustive
Expand All @@ -25,7 +25,7 @@ def convert_nlp_dataset_to_examples(
Returns:
Dict[phase] -> list[examples]
"""
dataset = nlp.load_dataset(path=path, name=name, version=version)
dataset = datasets.load_dataset(path=path, name=name, version=version)
if phase_map:
for old_phase_name, new_phase_name in phase_map.items():
replace_key(dataset, old_key=old_phase_name, new_key=new_phase_name)
Expand Down
2 changes: 1 addition & 1 deletion jiant/tasks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"superglue_winogender_diagnostics",
}

OTHER_NLP_TASKS = {
OTHER_HF_DATASETS_TASKS = {
"snli",
"commonsenseqa",
"hellaswag",
Expand Down
8 changes: 4 additions & 4 deletions jiant/tasks/lib/commonsenseqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def _create_examples(cls, lines, set_type):

@classmethod
def _create_example(cls, raw_example, set_type, i):
# Use heuristic for determining original or NLP format
# Use heuristic for determining original or HF Datasets format
if isinstance(raw_example["question"], dict):
return cls._create_example_from_original_format(
raw_example=raw_example, set_type=set_type, i=i,
)
elif isinstance(raw_example["question"], str):
return cls._create_example_from_nlp_format(
return cls._create_example_from_hf_datasets_format(
raw_example=raw_example, set_type=set_type, i=i,
)
else:
Expand All @@ -80,8 +80,8 @@ def _create_example_from_original_format(cls, raw_example, set_type, i):
)

@classmethod
def _create_example_from_nlp_format(cls, raw_example, set_type, i):
"""Return question and choices from NLP example format"""
def _create_example_from_hf_datasets_format(cls, raw_example, set_type, i):
"""Return question and choices from HF Datasets example format"""
return Example(
guid="%s-%s" % (set_type, i),
prompt=raw_example["question"],
Expand Down
2 changes: 1 addition & 1 deletion jiant/tasks/lib/snli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _create_examples(cls, lines, set_type):
)
)
else:
# Loading from NLP data
# Loading from HF Datasets data
if line["label"] == -1:
continue
examples.append(
Expand Down
6 changes: 3 additions & 3 deletions jiant/tasks/lib/socialiqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ def get_test_examples(self):
def _create_examples(cls, lines, set_type):
examples = []
answer_key_ls = ["answerA", "answerB", "answerC"]
nlp_label_map = {
hf_datasets_label_map = {
"1\n": "A",
"2\n": "B",
"3\n": "C",
}
for i, line in enumerate(lines):
if "label" in line:
# Loading from NLP data
label = nlp_label_map[line["label"]]
# Loading from HF Datasets data
label = hf_datasets_label_map[line["label"]]
else:
# Loading from original data
label = line["correct"]
Expand Down
2 changes: 1 addition & 1 deletion requirements-no-torch.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ attrs==19.3.0
bs4==0.0.1
jsonnet==0.15.0
lxml==4.5.1
nlp==0.4.0
datasets==1.1.2
nltk>=3.5
numexpr==2.7.1
numpy==1.18.4
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
"bs4 == 0.0.1",
"jsonnet == 0.15.0",
"lxml == 4.5.1",
"nlp == 0.4.0",
"datasets == 1.1.2",
"nltk >= 3.5",
"numexpr == 2.7.1",
"numpy == 1.18.4",
Expand Down