From dea7947eb816ce6c4c2423c069b8f8478d693e5e Mon Sep 17 00:00:00 2001
From: Jason Phang
Date: Mon, 19 Oct 2020 14:01:27 -0400
Subject: [PATCH] nlp to datasets (#1137)

Co-authored-by: jeswan <57466294+jeswan@users.noreply.github.com>
---
 jiant/scripts/download_data/constants.py    |  3 ++-
 .../{nlp_tasks.py => hf_datasets_tasks.py}  | 24 +++++++++----------
 jiant/scripts/download_data/runscript.py    | 16 ++++++-------
 jiant/scripts/download_data/utils.py        | 14 +++++------
 jiant/tasks/constants.py                    |  2 +-
 jiant/tasks/lib/commonsenseqa.py            |  8 +++----
 jiant/tasks/lib/snli.py                     |  2 +-
 jiant/tasks/lib/socialiqa.py                |  6 ++---
 requirements-no-torch.txt                   |  2 +-
 setup.py                                    |  2 +-
 10 files changed, 40 insertions(+), 39 deletions(-)
 rename jiant/scripts/download_data/datasets/{nlp_tasks.py => hf_datasets_tasks.py} (87%)

diff --git a/jiant/scripts/download_data/constants.py b/jiant/scripts/download_data/constants.py
index 868728a88..8110be95e 100644
--- a/jiant/scripts/download_data/constants.py
+++ b/jiant/scripts/download_data/constants.py
@@ -1,4 +1,5 @@
-# Directly download tasks when nlp format is different than original dataset
+# Directly download tasks when not available in HF Datasets, or HF Datasets version
+# is not suitable
 SQUAD_TASKS = {"squad_v1", "squad_v2"}
 DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
     "wsc": f"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",
diff --git a/jiant/scripts/download_data/datasets/nlp_tasks.py b/jiant/scripts/download_data/datasets/hf_datasets_tasks.py
similarity index 87%
rename from jiant/scripts/download_data/datasets/nlp_tasks.py
rename to jiant/scripts/download_data/datasets/hf_datasets_tasks.py
index 1d99dcf9b..8dd0c0271 100644
--- a/jiant/scripts/download_data/datasets/nlp_tasks.py
+++ b/jiant/scripts/download_data/datasets/hf_datasets_tasks.py
@@ -1,4 +1,4 @@
-"""Use this for tasks that can be obtained from NLP without further/special processing"""
+"""Use this for tasks that can be obtained from HF Datasets without further/special processing"""

 import jiant.scripts.download_data.utils as download_utils
 import jiant.utils.python.io as py_io
@@ -21,7 +21,7 @@
 )


-NLP_CONVERSION_DICT = {
+HF_DATASETS_CONVERSION_DICT = {
     # === GLUE === #
     "cola": {
         "path": "glue",
@@ -152,24 +152,24 @@
     },
 }

-# NLP uses "validation", we use "val"
+# HF Datasets uses "validation"; we use "val"
 DEFAULT_PHASE_MAP = {"validation": "val"}


 def download_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
-    nlp_conversion_metadata = NLP_CONVERSION_DICT[task_name]
-    examples_dict = download_utils.convert_nlp_dataset_to_examples(
-        path=nlp_conversion_metadata["path"],
-        name=nlp_conversion_metadata.get("name"),
-        field_map=nlp_conversion_metadata.get("field_map"),
-        label_map=nlp_conversion_metadata.get("label_map"),
-        phase_map=nlp_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
-        phase_list=nlp_conversion_metadata.get("phase_list"),
+    hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
+    examples_dict = download_utils.convert_hf_dataset_to_examples(
+        path=hf_datasets_conversion_metadata["path"],
+        name=hf_datasets_conversion_metadata.get("name"),
+        field_map=hf_datasets_conversion_metadata.get("field_map"),
+        label_map=hf_datasets_conversion_metadata.get("label_map"),
+        phase_map=hf_datasets_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
+        phase_list=hf_datasets_conversion_metadata.get("phase_list"),
     )
     paths_dict = download_utils.write_examples_to_jsonls(
         examples_dict=examples_dict, task_data_path=task_data_path,
     )
-    jiant_task_name = nlp_conversion_metadata.get("jiant_task_name", task_name)
+    jiant_task_name = hf_datasets_conversion_metadata.get("jiant_task_name", task_name)
     py_io.write_json(
         data={"task": jiant_task_name, "paths": paths_dict, "name": task_name},
         path=task_config_path,
diff --git a/jiant/scripts/download_data/runscript.py b/jiant/scripts/download_data/runscript.py
index 5d21cc6b9..ebf988deb 100644
--- a/jiant/scripts/download_data/runscript.py
+++ b/jiant/scripts/download_data/runscript.py
@@ -2,22 +2,22 @@
 import argparse

 import jiant.utils.python.io as py_io
-import jiant.scripts.download_data.datasets.nlp_tasks as nlp_tasks_download
+import jiant.scripts.download_data.datasets.hf_datasets_tasks as hf_datasets_tasks_download
 import jiant.scripts.download_data.datasets.xtreme as xtreme_download
 import jiant.scripts.download_data.datasets.files_tasks as files_tasks_download
 from jiant.tasks.constants import (
     GLUE_TASKS,
     SUPERGLUE_TASKS,
-    OTHER_NLP_TASKS,
+    OTHER_HF_DATASETS_TASKS,
     XTREME_TASKS,
     BENCHMARKS,
 )
 from jiant.scripts.download_data.constants import SQUAD_TASKS, DIRECT_DOWNLOAD_TASKS

-# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the nlp
+# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the HF Datasets
 # implementation differs from the original dataset format
-NLP_DOWNLOADER_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_NLP_TASKS) - DIRECT_DOWNLOAD_TASKS
-SUPPORTED_TASKS = NLP_DOWNLOADER_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS
+HF_DATASETS_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_HF_DATASETS_TASKS) - DIRECT_DOWNLOAD_TASKS
+SUPPORTED_TASKS = HF_DATASETS_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS


 # noinspection PyUnusedLocal
@@ -50,8 +50,8 @@ def download_data(task_names, output_base_path):

     for i, task_name in enumerate(task_names):
         task_data_path = os.path.join(task_data_base_path, task_name)
-        if task_name in NLP_DOWNLOADER_TASKS:
-            nlp_tasks_download.download_data_and_write_config(
+        if task_name in HF_DATASETS_TASKS:
+            hf_datasets_tasks_download.download_data_and_write_config(
                 task_name=task_name,
                 task_data_path=task_data_path,
                 task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
@@ -74,7 +74,7 @@


 def main():
-    parser = argparse.ArgumentParser(description="Download NLP datasets and generate task configs")
+    parser = argparse.ArgumentParser(description="Download datasets and generate task configs")
     subparsers = parser.add_subparsers()
     sp_list = subparsers.add_parser("list", help="list supported tasks in downloader")
     sp_download = subparsers.add_parser("download", help="download data command")
diff --git a/jiant/scripts/download_data/utils.py b/jiant/scripts/download_data/utils.py
index 2bcb3d895..25b795b3c 100644
--- a/jiant/scripts/download_data/utils.py
+++ b/jiant/scripts/download_data/utils.py
@@ -1,4 +1,4 @@
-import nlp
+import datasets
 import os
 import tarfile
 import urllib
@@ -8,15 +8,15 @@
 from jiant.utils.python.datastructures import replace_key


-def convert_nlp_dataset_to_examples(
+def convert_hf_dataset_to_examples(
     path, name=None, version=None, field_map=None, label_map=None, phase_map=None, phase_list=None
 ):
-    """Helper function for reading from nlp.load_dataset and converting to examples
+    """Helper function for reading from datasets.load_dataset and converting to examples

     Args:
-        path: path argument (from nlp.load_dataset)
-        name: name argument (from nlp.load_dataset)
-        version: version argument (from nlp.load_dataset)
+        path: path argument (from datasets.load_dataset)
+        name: name argument (from datasets.load_dataset)
+        version: version argument (from datasets.load_dataset)
         field_map: dictionary for renaming fields, non-exhaustive
         label_map: dictionary for replacing labels, non-exhaustive
         phase_map: dictionary for replacing phase names, non-exhaustive
@@ -25,7 +25,7 @@
     Returns:
         Dict[phase] -> list[examples]
     """
-    dataset = nlp.load_dataset(path=path, name=name, version=version)
+    dataset = datasets.load_dataset(path=path, name=name, version=version)
     if phase_map:
         for old_phase_name, new_phase_name in phase_map.items():
             replace_key(dataset, old_key=old_phase_name, new_key=new_phase_name)
diff --git a/jiant/tasks/constants.py b/jiant/tasks/constants.py
index 32f51aa05..f2783d733 100644
--- a/jiant/tasks/constants.py
+++ b/jiant/tasks/constants.py
@@ -25,7 +25,7 @@
     "superglue_winogender_diagnostics",
 }

-OTHER_NLP_TASKS = {
+OTHER_HF_DATASETS_TASKS = {
     "snli",
     "commonsenseqa",
     "hellaswag",
diff --git a/jiant/tasks/lib/commonsenseqa.py b/jiant/tasks/lib/commonsenseqa.py
index b362002db..c03506eb1 100644
--- a/jiant/tasks/lib/commonsenseqa.py
+++ b/jiant/tasks/lib/commonsenseqa.py
@@ -55,13 +55,13 @@ def _create_examples(cls, lines, set_type):

     @classmethod
     def _create_example(cls, raw_example, set_type, i):
-        # Use heuristic for determining original or NLP format
+        # Use heuristic for determining original or HF Datasets format
         if isinstance(raw_example["question"], dict):
             return cls._create_example_from_original_format(
                 raw_example=raw_example, set_type=set_type, i=i,
             )
         elif isinstance(raw_example["question"], str):
-            return cls._create_example_from_nlp_format(
+            return cls._create_example_from_hf_datasets_format(
                 raw_example=raw_example, set_type=set_type, i=i,
             )
         else:
@@ -80,8 +80,8 @@ def _create_example_from_original_format(cls, raw_example, set_type, i):
         )

     @classmethod
-    def _create_example_from_nlp_format(cls, raw_example, set_type, i):
-        """Return question and choices from NLP example format"""
+    def _create_example_from_hf_datasets_format(cls, raw_example, set_type, i):
+        """Return question and choices from HF Datasets example format"""
         return Example(
             guid="%s-%s" % (set_type, i),
             prompt=raw_example["question"],
diff --git a/jiant/tasks/lib/snli.py b/jiant/tasks/lib/snli.py
index 197087b94..ccc1fbb32 100644
--- a/jiant/tasks/lib/snli.py
+++ b/jiant/tasks/lib/snli.py
@@ -105,7 +105,7 @@ def _create_examples(cls, lines, set_type):
                     )
                 )
             else:
-                # Loading from NLP data
+                # Loading from HF Datasets data
                 if line["label"] == -1:
                     continue
                 examples.append(
diff --git a/jiant/tasks/lib/socialiqa.py b/jiant/tasks/lib/socialiqa.py
index dc3fe3692..71af81e58 100644
--- a/jiant/tasks/lib/socialiqa.py
+++ b/jiant/tasks/lib/socialiqa.py
@@ -50,15 +50,15 @@ def get_test_examples(self):
     def _create_examples(cls, lines, set_type):
         examples = []
         answer_key_ls = ["answerA", "answerB", "answerC"]
-        nlp_label_map = {
+        hf_datasets_label_map = {
             "1\n": "A",
             "2\n": "B",
             "3\n": "C",
         }
         for i, line in enumerate(lines):
             if "label" in line:
-                # Loading from NLP data
-                label = nlp_label_map[line["label"]]
+                # Loading from HF Datasets data
+                label = hf_datasets_label_map[line["label"]]
             else:
                 # Loading from original data
                 label = line["correct"]
diff --git a/requirements-no-torch.txt b/requirements-no-torch.txt
index 3ed44cb7a..33619155d 100644
--- a/requirements-no-torch.txt
+++ b/requirements-no-torch.txt
@@ -2,7 +2,7 @@ attrs==19.3.0
 bs4==0.0.1
 jsonnet==0.15.0
 lxml==4.5.1
-nlp==0.4.0
+datasets==1.1.2
 nltk>=3.5
 numexpr==2.7.1
 numpy==1.18.4
diff --git a/setup.py b/setup.py
index 8f699cdb4..1a627b8b2 100644
--- a/setup.py
+++ b/setup.py
@@ -56,7 +56,7 @@
         "bs4 == 0.0.1",
         "jsonnet == 0.15.0",
         "lxml == 4.5.1",
-        "nlp == 0.4.0",
+        "datasets == 1.1.2",
         "nltk >= 3.5",
         "numexpr == 2.7.1",
         "numpy == 1.18.4",
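
Context for reviewers: nlp is the former name of Hugging Face's datasets library
(renamed at its 1.0 release), and load_dataset is call-compatible between the two
for the usage here, so this patch is almost entirely a mechanical rename. The
sketch below illustrates the conversion flow that convert_hf_dataset_to_examples
implements: load a phase-to-split mapping via datasets.load_dataset, rename
phases, then apply the optional field and label maps. It is a simplified,
hypothetical illustration rather than the jiant code; convert_examples_sketch and
the maps in the usage example are made up for this note, and the real per-task
maps live in HF_DATASETS_CONVERSION_DICT.

    # Hypothetical sketch of the conversion flow (illustrative, not the jiant code)
    import datasets

    def convert_examples_sketch(path, name=None, field_map=None, label_map=None,
                                phase_map=None, phase_list=None):
        # load_dataset returns a dict-like mapping of phase name -> Dataset
        dataset = datasets.load_dataset(path=path, name=name)
        examples_dict = {}
        for phase, data in dataset.items():
            # Rename phases, e.g. HF Datasets "validation" -> jiant "val"
            phase = (phase_map or {}).get(phase, phase)
            if phase_list is not None and phase not in phase_list:
                continue
            examples = []
            for raw_example in data:
                example = dict(raw_example)
                # Non-exhaustive field rename: only mapped fields change
                for old_field, new_field in (field_map or {}).items():
                    example[new_field] = example.pop(old_field)
                # Non-exhaustive label replacement: unmapped labels pass through
                if label_map and example.get("label") in label_map:
                    example["label"] = label_map[example["label"]]
                examples.append(example)
            examples_dict[phase] = examples
        return examples_dict

    # Hypothetical usage, mirroring DEFAULT_PHASE_MAP above:
    mrpc_examples = convert_examples_sketch(
        path="glue", name="mrpc", phase_map={"validation": "val"},
    )

The non-exhaustive behavior matches the docstring in utils.py: only listed fields,
labels, and phases are rewritten; everything else passes through unchanged.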