facebookresearch · klshuster · Mar 29, 2022 · Mar 25, 2022 · Mar 25, 2022 · Mar 25, 2022
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -222,26 +222,26 @@ commands:
             - setupcuda
       - fixgit
       - restore_cache:
-          key: deps-20220227-<< parameters.cachename >>-{{ checksum "requirements.txt" }}
+          key: deps-20220328-<< parameters.cachename >>-{{ checksum "requirements.txt" }}
       - setup
       - installdeps
       - << parameters.more_installs >>
       - save_cache:
-          key: deps-20220227-<< parameters.cachename >>-{{ checksum "requirements.txt" }}
+          key: deps-20220328-<< parameters.cachename >>-{{ checksum "requirements.txt" }}
           paths:
             - "~/venv/bin"
             - "~/venv/lib"
       - findtests:
           marker: << parameters.marker >>
       - restore_cache:
-          key: data-20220227-<< parameters.cachename >>-{{ checksum "teststorun.txt" }}
+          key: data-20220328-<< parameters.cachename >>-{{ checksum "teststorun.txt" }}
       - run:
           name: Run tests
           no_output_timeout: 60m
           command: |
             coverage run -m pytest -m << parameters.marker >> << parameters.pytest_flags >> --junitxml=test-results/junit.xml
       - save_cache:
-          key: data-20220227-<< parameters.cachename >>-{{ checksum "teststorun.txt" }}
+          key: data-20220328-<< parameters.cachename >>-{{ checksum "teststorun.txt" }}
           paths:
             - "~/ParlAI/data"
       - codecov
@@ -258,12 +258,12 @@ commands:
       - checkout
       - fixgit
       - restore_cache:
-          key: deps-20220227-bw-{{ checksum "requirements.txt" }}
+          key: deps-20220328-bw-{{ checksum "requirements.txt" }}
       - setup
       - installdeps
       - installtorchgpu
       - save_cache:
-          key: deps-20220227-bw-{{ checksum "requirements.txt" }}
+          key: deps-20220328-bw-{{ checksum "requirements.txt" }}
           paths:
             - "~/venv/bin"
             - "~/venv/lib"

diff --git a/parlai/tasks/natural_questions/agents.py b/parlai/tasks/natural_questions/agents.py
@@ -15,7 +15,7 @@
 from typing import List, Optional, Tuple
 
 from parlai.core.teachers import ChunkTeacher, DialogTeacher
-from .build import build, DATASET_NAME_LOCAL
+from .build import build, DATASET_NAME_LOCAL, build_sample
 from .build_open import build as build_
 from .utils.text_utils import simplify_nq_example
 
@@ -195,6 +195,18 @@ def _get_short_answers(self, example):
                 short_answers.append(annotation['yes_no_answer'])
         return short_answers
 
+    def _get_fname(self, chunk_idx: int) -> str:
+        """
+        Get the filename of the data chunk.
+
+        :param chunk_idx:
+            index of the chunk; zero-padded to at least two digits in the name
+
+        :return chunk_name:
+            the chunk file name, of the form ``nq-<dtype>-<NN>.jsonl``
+        """
+        return f'nq-{self.dtype}-{str(chunk_idx).zfill(2)}.jsonl'
+
     def load_from_chunk(self, chunk_idx: int):
         """
         Loads from a chunk of the dataset, given the chunk index.
@@ -213,7 +225,7 @@ def _extract_labels_indices(example, candidate_labels):
                 labels.append(candidate_labels[label_ind])
             return labels
 
-        fname = f'nq-{self.dtype}-{str(chunk_idx).zfill(2)}.jsonl'
+        fname = self._get_fname(chunk_idx)
         fpath = os.path.join(self.dpath, fname)
         output = []
         with jsonlines.open(fpath, 'r') as fi:
@@ -256,6 +268,38 @@ def create_message(self, example_components, entry_idx=0):
         return message_dict
 
 
+class NaturalQuestionsSampleTeacher(NaturalQuestionsTeacher):
+    """
+    Loads the NQ sample data (a single file per split) for testing purposes.
+    """
+
+    def __init__(self, opt, shared=None):
+        build_sample(opt)  # build the sample data under "<dataset>_sample"
+        self.use_html = opt.get('use_html', False)
+        self.use_long_answer = opt.get('use_long_answer', False)
+        self.use_context = opt.get('use_context', False)
+        self.id = 'natural_questions'
+        self.opt = copy.deepcopy(opt)
+        self.dtype = DatatypeHelper.fold(self.opt['datatype'])
+        if self.dtype == 'test':
+            logging.error("No test split for this teacher; overriding to valid")
+            self.dtype = 'valid'
+        self.dpath = os.path.join(
+            self.opt['datapath'], f"{DATASET_NAME_LOCAL}_sample", self.dtype
+        )
+        self.n_samples = None
+        ChunkTeacher.__init__(self, self.opt, shared)  # not super().__init__()
+
+    def _get_fname(self, chunk_idx: int) -> str:
+        return f'nq-{self.dtype}-sample.jsonl'  # one file; chunk_idx unused
+
+    def get_fold_chunks(self, opt) -> List[int]:
+        return list(range(1))  # only chunk index 0
+
+    def get_num_samples(self, opt) -> Tuple[int, int]:
+        return (200, 200)  # presumably (examples, episodes) - TODO confirm
+
+
 class InMetric(AverageMetric):
     @staticmethod
     def compute(guess: str, answers: List[str]) -> Optional["InMetric"]:
@@ -271,7 +315,7 @@ def compute(guess: str, answers: List[str]) -> Optional["InMetric"]:
 class NaturalQuestionsOpenTeacher(DialogTeacher):
     def __init__(self, opt: Opt, shared=None):
         self.fold = opt["datatype"].split(":")[0]
-        self.dpath = os.path.join(opt["datapath"], "NaturalQuestions_retrieval")
+        self.dpath = os.path.join(opt["datapath"], "NaturalQuestionsOpen")
         self.opt = opt
         self.opt['datafile'] = os.path.join(self.dpath, self.fold + ".csv")
         if shared is None:
@@ -283,7 +327,7 @@ def add_cmdline_args(
         cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
     ) -> ParlaiParser:
         super().add_cmdline_args(parser, partial_opt)
-        group = parser.add_argument_group("Natural Questions retrieval")
+        group = parser.add_argument_group("Natural Questions Open")
         group.add_argument(
             "--normalize-everything",
             default=False,

diff --git a/parlai/tasks/natural_questions/build.py b/parlai/tasks/natural_questions/build.py
@@ -27,7 +27,7 @@ def _import_google_cloud_client():
     return storage
 
 
-def _download_with_cloud_storage_client(dpath):
+def _download_with_cloud_storage_client(dpath, sample: bool = False):
     # Initiating the Cloud Storage Client with anonymous credentials
     stm = _import_google_cloud_client()
     storage_client = stm.Client.create_anonymous_client()
@@ -54,9 +54,14 @@ def _download_blobs_from_list(blobs_list, target_path):
         if not blob_name.endswith('.gz'):  # Not a zipped file
             continue
 
-        if blob_name.startswith('v1.0/train'):
+        if sample and blob_name.startswith('v1.0/sample'):
+            if 'train' in blob_name:
+                train_blobs.append(blob)
+            else:
+                valid_blobs.append(blob)
+        elif not sample and blob_name.startswith('v1.0/train'):
             train_blobs.append(blob)
-        elif blob_name.startswith('v1.0/dev'):
+        elif not sample and blob_name.startswith('v1.0/dev'):
             valid_blobs.append(blob)
 
     # Downloading the blobs to their respective dtype directory
@@ -94,8 +99,10 @@ def _move_valid_files_from_dev_to_valid(dpath):
             os.rename(os.path.join(valid_path, f), os.path.join(valid_path, new))
 
 
-def build(opt):
+def build(opt, sample: bool = False):
     dpath = os.path.join(opt['datapath'], DATASET_NAME_LOCAL)
+    if sample:
+        dpath = f"{dpath}_sample"
     version = 'v1.0'
 
     if not build_data.built(dpath, version_string=version):
@@ -105,7 +112,11 @@ def build(opt):
             build_data.remove_dir(dpath)
             logging.info('Removed the existing data (old version).')
         build_data.make_dir(dpath)
-        _download_with_cloud_storage_client(dpath)
+        _download_with_cloud_storage_client(dpath, sample)
         _untar_dataset_files(dpath)
         _move_valid_files_from_dev_to_valid(dpath)
         build_data.mark_done(dpath, version_string=version)
+
+
+def build_sample(opt):
+    build(opt, True)  # sample=True: build under the "<dataset>_sample" dir
diff --git a/parlai/tasks/natural_questions/test.py b/parlai/tasks/natural_questions/test.py
@@ -4,14 +4,12 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from parlai.utils.testing import AutoTeacherTest  # noqa: F401
+from parlai.utils.testing import AutoTeacherTest
 
 
-class TestNaturalQuestionsTeacher(AutoTeacherTest):
-    task = 'natural_questions'  # replace with your teacher name
+class TestNaturalQuestionsSampleTeacher(AutoTeacherTest):
+    task = 'natural_questions:NaturalQuestionsSampleTeacher'
 
 
 class TestNaturalQuestionsOpenTeacher(AutoTeacherTest):
-    task = (
-        'natural_questions:NaturalQuestionsOpenTeacher'
-    )  # replace with your teacher name
+    task = 'natural_questions:NaturalQuestionsOpenTeacher'
diff --git a/parlai/tasks/natural_questions/test/natural_questions_NaturalQuestionsOpenTeacher_valid.yml b/parlai/tasks/natural_questions/test/natural_questions_NaturalQuestionsOpenTeacher_valid.yml
@@ -127,5 +127,5 @@ acts:
     id: natural_questions:NaturalQuestionsOpenTeacher
     text: what is the smallest prime number that is greater than 30
     title: List of prime numbers
-num_episodes: 1223
-num_examples: 1223
+num_episodes: 8757
+num_examples: 8757