Skip to content

Commit

Permalink
nlp to datasets (nyu-mll#1137)
Browse files Browse the repository at this point in the history
Co-authored-by: jeswan <57466294+jeswan@users.noreply.github.com>
  • Loading branch information
zphang and jeswan committed Oct 19, 2020
1 parent 7329cc5 commit dea7947
Show file tree
Hide file tree
Showing 10 changed files with 40 additions and 39 deletions.
3 changes: 2 additions & 1 deletion jiant/scripts/download_data/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Directly download tasks when nlp format is different than original dataset
# Directly download tasks when they are not available in HF Datasets, or when the
# HF Datasets version is not suitable
SQUAD_TASKS = {"squad_v1", "squad_v2"}
DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
"wsc": f"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Use this for tasks that can be obtained from NLP without further/special processing"""
"""Use this for tasks that can be obtained from HF-Datasets without further/special processing"""

import jiant.scripts.download_data.utils as download_utils
import jiant.utils.python.io as py_io
Expand All @@ -21,7 +21,7 @@
)


NLP_CONVERSION_DICT = {
HF_DATASETS_CONVERSION_DICT = {
# === GLUE === #
"cola": {
"path": "glue",
Expand Down Expand Up @@ -152,24 +152,24 @@
},
}

# NLP uses "validation", we use "val"
# HF Datasets names this phase "validation"; we use "val"
DEFAULT_PHASE_MAP = {"validation": "val"}


def download_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
nlp_conversion_metadata = NLP_CONVERSION_DICT[task_name]
examples_dict = download_utils.convert_nlp_dataset_to_examples(
path=nlp_conversion_metadata["path"],
name=nlp_conversion_metadata.get("name"),
field_map=nlp_conversion_metadata.get("field_map"),
label_map=nlp_conversion_metadata.get("label_map"),
phase_map=nlp_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
phase_list=nlp_conversion_metadata.get("phase_list"),
hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
examples_dict = download_utils.convert_hf_dataset_to_examples(
path=hf_datasets_conversion_metadata["path"],
name=hf_datasets_conversion_metadata.get("name"),
field_map=hf_datasets_conversion_metadata.get("field_map"),
label_map=hf_datasets_conversion_metadata.get("label_map"),
phase_map=hf_datasets_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
phase_list=hf_datasets_conversion_metadata.get("phase_list"),
)
paths_dict = download_utils.write_examples_to_jsonls(
examples_dict=examples_dict, task_data_path=task_data_path,
)
jiant_task_name = nlp_conversion_metadata.get("jiant_task_name", task_name)
jiant_task_name = hf_datasets_conversion_metadata.get("jiant_task_name", task_name)
py_io.write_json(
data={"task": jiant_task_name, "paths": paths_dict, "name": task_name},
path=task_config_path,
Expand Down
16 changes: 8 additions & 8 deletions jiant/scripts/download_data/runscript.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@
import argparse

import jiant.utils.python.io as py_io
import jiant.scripts.download_data.datasets.nlp_tasks as nlp_tasks_download
import jiant.scripts.download_data.datasets.hf_datasets_tasks as hf_datasets_tasks_download
import jiant.scripts.download_data.datasets.xtreme as xtreme_download
import jiant.scripts.download_data.datasets.files_tasks as files_tasks_download
from jiant.tasks.constants import (
GLUE_TASKS,
SUPERGLUE_TASKS,
OTHER_NLP_TASKS,
OTHER_HF_DATASETS_TASKS,
XTREME_TASKS,
BENCHMARKS,
)
from jiant.scripts.download_data.constants import SQUAD_TASKS, DIRECT_DOWNLOAD_TASKS

# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the nlp
# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the HF Datasets
# implementation differs from the original dataset format
NLP_DOWNLOADER_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_NLP_TASKS) - DIRECT_DOWNLOAD_TASKS
SUPPORTED_TASKS = NLP_DOWNLOADER_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS
HF_DATASETS_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_HF_DATASETS_TASKS) - DIRECT_DOWNLOAD_TASKS
SUPPORTED_TASKS = HF_DATASETS_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS


# noinspection PyUnusedLocal
Expand Down Expand Up @@ -50,8 +50,8 @@ def download_data(task_names, output_base_path):
for i, task_name in enumerate(task_names):
task_data_path = os.path.join(task_data_base_path, task_name)

if task_name in NLP_DOWNLOADER_TASKS:
nlp_tasks_download.download_data_and_write_config(
if task_name in HF_DATASETS_TASKS:
hf_datasets_tasks_download.download_data_and_write_config(
task_name=task_name,
task_data_path=task_data_path,
task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
Expand All @@ -74,7 +74,7 @@ def download_data(task_names, output_base_path):


def main():
parser = argparse.ArgumentParser(description="Download NLP datasets and generate task configs")
parser = argparse.ArgumentParser(description="Download datasets and generate task configs")
subparsers = parser.add_subparsers()
sp_list = subparsers.add_parser("list", help="list supported tasks in downloader")
sp_download = subparsers.add_parser("download", help="download data command")
Expand Down
14 changes: 7 additions & 7 deletions jiant/scripts/download_data/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import nlp
import datasets
import os
import tarfile
import urllib
Expand All @@ -8,15 +8,15 @@
from jiant.utils.python.datastructures import replace_key


def convert_nlp_dataset_to_examples(
def convert_hf_dataset_to_examples(
path, name=None, version=None, field_map=None, label_map=None, phase_map=None, phase_list=None
):
"""Helper function for reading from nlp.load_dataset and converting to examples
"""Helper function for reading from datasets.load_dataset and converting to examples
Args:
path: path argument (from nlp.load_dataset)
name: name argument (from nlp.load_dataset)
version: version argument (from nlp.load_dataset)
path: path argument (from datasets.load_dataset)
name: name argument (from datasets.load_dataset)
version: version argument (from datasets.load_dataset)
field_map: dictionary for renaming fields, non-exhaustive
label_map: dictionary for replacing labels, non-exhaustive
phase_map: dictionary for replacing phase names, non-exhaustive
Expand All @@ -25,7 +25,7 @@ def convert_nlp_dataset_to_examples(
Returns:
Dict[phase] -> list[examples]
"""
dataset = nlp.load_dataset(path=path, name=name, version=version)
dataset = datasets.load_dataset(path=path, name=name, version=version)
if phase_map:
for old_phase_name, new_phase_name in phase_map.items():
replace_key(dataset, old_key=old_phase_name, new_key=new_phase_name)
Expand Down
2 changes: 1 addition & 1 deletion jiant/tasks/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"superglue_winogender_diagnostics",
}

OTHER_NLP_TASKS = {
OTHER_HF_DATASETS_TASKS = {
"snli",
"commonsenseqa",
"hellaswag",
Expand Down
8 changes: 4 additions & 4 deletions jiant/tasks/lib/commonsenseqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ def _create_examples(cls, lines, set_type):

@classmethod
def _create_example(cls, raw_example, set_type, i):
# Use heuristic for determining original or NLP format
# Use heuristic for determining original or HF Datasets format
if isinstance(raw_example["question"], dict):
return cls._create_example_from_original_format(
raw_example=raw_example, set_type=set_type, i=i,
)
elif isinstance(raw_example["question"], str):
return cls._create_example_from_nlp_format(
return cls._create_example_from_hf_datasets_format(
raw_example=raw_example, set_type=set_type, i=i,
)
else:
Expand All @@ -80,8 +80,8 @@ def _create_example_from_original_format(cls, raw_example, set_type, i):
)

@classmethod
def _create_example_from_nlp_format(cls, raw_example, set_type, i):
"""Return question and choices from NLP example format"""
def _create_example_from_hf_datasets_format(cls, raw_example, set_type, i):
"""Return question and choices from HF Datasets example format"""
return Example(
guid="%s-%s" % (set_type, i),
prompt=raw_example["question"],
Expand Down
2 changes: 1 addition & 1 deletion jiant/tasks/lib/snli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def _create_examples(cls, lines, set_type):
)
)
else:
# Loading from NLP data
# Loading from HF Datasets data
if line["label"] == -1:
continue
examples.append(
Expand Down
6 changes: 3 additions & 3 deletions jiant/tasks/lib/socialiqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ def get_test_examples(self):
def _create_examples(cls, lines, set_type):
examples = []
answer_key_ls = ["answerA", "answerB", "answerC"]
nlp_label_map = {
hf_datasets_label_map = {
"1\n": "A",
"2\n": "B",
"3\n": "C",
}
for i, line in enumerate(lines):
if "label" in line:
# Loading from NLP data
label = nlp_label_map[line["label"]]
# Loading from HF Datasets data
label = hf_datasets_label_map[line["label"]]
else:
# Loading from original data
label = line["correct"]
Expand Down
2 changes: 1 addition & 1 deletion requirements-no-torch.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ attrs==19.3.0
bs4==0.0.1
jsonnet==0.15.0
lxml==4.5.1
nlp==0.4.0
datasets==1.1.2
nltk>=3.5
numexpr==2.7.1
numpy==1.18.4
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
"bs4 == 0.0.1",
"jsonnet == 0.15.0",
"lxml == 4.5.1",
"nlp == 0.4.0",
"datasets == 1.1.2",
"nltk >= 3.5",
"numexpr == 2.7.1",
"numpy == 1.18.4",
Expand Down

0 comments on commit dea7947

Please sign in to comment.