Migrate from the `nlp` package to HF `datasets` (nyu-mll#1137)

Co-authored-by: jeswan <57466294+jeswan@users.noreply.github.com>
leo-liuzy · Oct 19, 2020 · dea7947 · dea7947
1 parent 7329cc5
commit dea7947
Show file tree

Hide file tree

Showing 10 changed files with 40 additions and 39 deletions.
diff --git a/jiant/scripts/download_data/constants.py b/jiant/scripts/download_data/constants.py
@@ -1,4 +1,5 @@
-# Directly download tasks when nlp format is different than original dataset
+# Directly download tasks that are not available in HF Datasets, or whose HF Datasets
+# version is not suitable
 SQUAD_TASKS = {"squad_v1", "squad_v2"}
 DIRECT_SUPERGLUE_TASKS_TO_DATA_URLS = {
  "wsc": f"https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",

diff --git a/...ripts/download_data/datasets/nlp_tasks.py → ...wnload_data/datasets/hf_datasets_tasks.py b/...ripts/download_data/datasets/nlp_tasks.py → ...wnload_data/datasets/hf_datasets_tasks.py
@@ -1,4 +1,4 @@
-"""Use this for tasks that can be obtained from NLP without further/special processing"""
+"""Use this for tasks that can be obtained from HF-Datasets without further/special processing"""
 
 import jiant.scripts.download_data.utils as download_utils
 import jiant.utils.python.io as py_io
@@ -21,7 +21,7 @@
 )
 
 
-NLP_CONVERSION_DICT = {
+HF_DATASETS_CONVERSION_DICT = {
  # === GLUE === #
  "cola": {
  "path": "glue",
@@ -152,24 +152,24 @@
  },
 }
 
-# NLP uses "validation", we use "val"
+# HF-Datasets uses "validation"; we use "val"
 DEFAULT_PHASE_MAP = {"validation": "val"}
 
 
 def download_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
- nlp_conversion_metadata = NLP_CONVERSION_DICT[task_name]
- examples_dict = download_utils.convert_nlp_dataset_to_examples(
- path=nlp_conversion_metadata["path"],
- name=nlp_conversion_metadata.get("name"),
- field_map=nlp_conversion_metadata.get("field_map"),
- label_map=nlp_conversion_metadata.get("label_map"),
- phase_map=nlp_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
- phase_list=nlp_conversion_metadata.get("phase_list"),
+ hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
+ examples_dict = download_utils.convert_hf_dataset_to_examples(
+ path=hf_datasets_conversion_metadata["path"],
+ name=hf_datasets_conversion_metadata.get("name"),
+ field_map=hf_datasets_conversion_metadata.get("field_map"),
+ label_map=hf_datasets_conversion_metadata.get("label_map"),
+ phase_map=hf_datasets_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
+ phase_list=hf_datasets_conversion_metadata.get("phase_list"),
  )
  paths_dict = download_utils.write_examples_to_jsonls(
  examples_dict=examples_dict, task_data_path=task_data_path,
  )
- jiant_task_name = nlp_conversion_metadata.get("jiant_task_name", task_name)
+ jiant_task_name = hf_datasets_conversion_metadata.get("jiant_task_name", task_name)
  py_io.write_json(
  data={"task": jiant_task_name, "paths": paths_dict, "name": task_name},
  path=task_config_path,

diff --git a/jiant/scripts/download_data/runscript.py b/jiant/scripts/download_data/runscript.py
@@ -2,22 +2,22 @@
 import argparse
 
 import jiant.utils.python.io as py_io
-import jiant.scripts.download_data.datasets.nlp_tasks as nlp_tasks_download
+import jiant.scripts.download_data.datasets.hf_datasets_tasks as hf_datasets_tasks_download
 import jiant.scripts.download_data.datasets.xtreme as xtreme_download
 import jiant.scripts.download_data.datasets.files_tasks as files_tasks_download
 from jiant.tasks.constants import (
  GLUE_TASKS,
  SUPERGLUE_TASKS,
- OTHER_NLP_TASKS,
+ OTHER_HF_DATASETS_TASKS,
  XTREME_TASKS,
  BENCHMARKS,
 )
 from jiant.scripts.download_data.constants import SQUAD_TASKS, DIRECT_DOWNLOAD_TASKS
 
-# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the nlp
+# DIRECT_DOWNLOAD_TASKS need to be directly downloaded because the HF Datasets
 # implementation differs from the original dataset format
-NLP_DOWNLOADER_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_NLP_TASKS) - DIRECT_DOWNLOAD_TASKS
-SUPPORTED_TASKS = NLP_DOWNLOADER_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS
+HF_DATASETS_TASKS = (GLUE_TASKS | SUPERGLUE_TASKS | OTHER_HF_DATASETS_TASKS) - DIRECT_DOWNLOAD_TASKS
+SUPPORTED_TASKS = HF_DATASETS_TASKS | XTREME_TASKS | SQUAD_TASKS | DIRECT_DOWNLOAD_TASKS
 
 
 # noinspection PyUnusedLocal
@@ -50,8 +50,8 @@ def download_data(task_names, output_base_path):
  for i, task_name in enumerate(task_names):
  task_data_path = os.path.join(task_data_base_path, task_name)
 
- if task_name in NLP_DOWNLOADER_TASKS:
- nlp_tasks_download.download_data_and_write_config(
+ if task_name in HF_DATASETS_TASKS:
+ hf_datasets_tasks_download.download_data_and_write_config(
  task_name=task_name,
  task_data_path=task_data_path,
  task_config_path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
@@ -74,7 +74,7 @@ def download_data(task_names, output_base_path):
 
 
 def main():
- parser = argparse.ArgumentParser(description="Download NLP datasets and generate task configs")
+ parser = argparse.ArgumentParser(description="Download datasets and generate task configs")
  subparsers = parser.add_subparsers()
  sp_list = subparsers.add_parser("list", help="list supported tasks in downloader")
  sp_download = subparsers.add_parser("download", help="download data command")

diff --git a/jiant/scripts/download_data/utils.py b/jiant/scripts/download_data/utils.py
@@ -1,4 +1,4 @@
-import nlp
+import datasets
 import os
 import tarfile
 import urllib
@@ -8,15 +8,15 @@
 from jiant.utils.python.datastructures import replace_key
 
 
-def convert_nlp_dataset_to_examples(
+def convert_hf_dataset_to_examples(
  path, name=None, version=None, field_map=None, label_map=None, phase_map=None, phase_list=None
 ):
- """Helper function for reading from nlp.load_dataset and converting to examples
+ """Helper function for reading from datasets.load_dataset and converting to examples
 
  Args:
- path: path argument (from nlp.load_dataset)
- name: name argument (from nlp.load_dataset)
- version: version argument (from nlp.load_dataset)
+ path: path argument (from datasets.load_dataset)
+ name: name argument (from datasets.load_dataset)
+ version: version argument (from datasets.load_dataset)
  field_map: dictionary for renaming fields, non-exhaustive
  label_map: dictionary for replacing labels, non-exhaustive
  phase_map: dictionary for replacing phase names, non-exhaustive
@@ -25,7 +25,7 @@ def convert_nlp_dataset_to_examples(
  Returns:
  Dict[phase] -> list[examples]
  """
- dataset = nlp.load_dataset(path=path, name=name, version=version)
+ dataset = datasets.load_dataset(path=path, name=name, version=version)
  if phase_map:
  for old_phase_name, new_phase_name in phase_map.items():
  replace_key(dataset, old_key=old_phase_name, new_key=new_phase_name)

diff --git a/jiant/tasks/constants.py b/jiant/tasks/constants.py
@@ -25,7 +25,7 @@
  "superglue_winogender_diagnostics",
 }
 
-OTHER_NLP_TASKS = {
+OTHER_HF_DATASETS_TASKS = {
  "snli",
  "commonsenseqa",
  "hellaswag",

diff --git a/jiant/tasks/lib/commonsenseqa.py b/jiant/tasks/lib/commonsenseqa.py
@@ -55,13 +55,13 @@ def _create_examples(cls, lines, set_type):
 
  @classmethod
  def _create_example(cls, raw_example, set_type, i):
- # Use heuristic for determining original or NLP format
+ # Use heuristic for determining original or HF Datasets format
  if isinstance(raw_example["question"], dict):
  return cls._create_example_from_original_format(
  raw_example=raw_example, set_type=set_type, i=i,
  )
  elif isinstance(raw_example["question"], str):
- return cls._create_example_from_nlp_format(
+ return cls._create_example_from_hf_datasets_format(
  raw_example=raw_example, set_type=set_type, i=i,
  )
  else:
@@ -80,8 +80,8 @@ def _create_example_from_original_format(cls, raw_example, set_type, i):
  )
 
  @classmethod
- def _create_example_from_nlp_format(cls, raw_example, set_type, i):
- """Return question and choices from NLP example format"""
+ def _create_example_from_hf_datasets_format(cls, raw_example, set_type, i):
+ """Return question and choices from HF Datasets example format"""
  return Example(
  guid="%s-%s" % (set_type, i),
  prompt=raw_example["question"],

diff --git a/jiant/tasks/lib/snli.py b/jiant/tasks/lib/snli.py
@@ -105,7 +105,7 @@ def _create_examples(cls, lines, set_type):
  )
  )
  else:
- # Loading from NLP data
+ # Loading from HF Datasets data
  if line["label"] == -1:
  continue
  examples.append(

diff --git a/jiant/tasks/lib/socialiqa.py b/jiant/tasks/lib/socialiqa.py
@@ -50,15 +50,15 @@ def get_test_examples(self):
  def _create_examples(cls, lines, set_type):
  examples = []
  answer_key_ls = ["answerA", "answerB", "answerC"]
- nlp_label_map = {
+ hf_datasets_label_map = {
  "1\n": "A",
  "2\n": "B",
  "3\n": "C",
  }
  for i, line in enumerate(lines):
  if "label" in line:
- # Loading from NLP data
- label = nlp_label_map[line["label"]]
+ # Loading from HF Datasets data
+ label = hf_datasets_label_map[line["label"]]
  else:
  # Loading from original data
  label = line["correct"]

diff --git a/requirements-no-torch.txt b/requirements-no-torch.txt
@@ -2,7 +2,7 @@ attrs==19.3.0
 bs4==0.0.1
 jsonnet==0.15.0
 lxml==4.5.1
-nlp==0.4.0
+datasets==1.1.2
 nltk>=3.5
 numexpr==2.7.1
 numpy==1.18.4

diff --git a/setup.py b/setup.py
@@ -56,7 +56,7 @@
  "bs4 == 0.0.1",
  "jsonnet == 0.15.0",
  "lxml == 4.5.1",
- "nlp == 0.4.0",
+ "datasets == 1.1.2",
  "nltk >= 3.5",
  "numexpr == 2.7.1",
  "numpy == 1.18.4",