From d42c9703708765ca7a127dcb73eeb91cd0f7bef7 Mon Sep 17 00:00:00 2001
From: Moya Chen <72097364+moyapchen@users.noreply.github.com>
Date: Wed, 22 Dec 2021 20:46:51 -0600
Subject: [PATCH] [TOD][Datasets][Easy] Taskmaster2 to TOD Conversations format
 (#4188)

---
 parlai/tasks/taskmaster2/README.md            |   2 +-
 parlai/tasks/taskmaster2/agents.py            | 447 ++++++------------
 parlai/tasks/taskmaster2/build.py             |  90 ++--
 parlai/tasks/taskmaster2/test.py              |  15 +
 .../taskmaster2_UserSimulatorTeacher_test.yml |  46 ++
 ...taskmaster2_UserSimulatorTeacher_train.yml |  43 ++
 ...taskmaster2_UserSimulatorTeacher_valid.yml |  45 ++
 .../taskmaster2/test/taskmaster2_test.yml     |  55 +++
 .../taskmaster2/test/taskmaster2_train.yml    |  48 ++
 .../taskmaster2/test/taskmaster2_valid.yml    |  43 ++
 10 files changed, 494 insertions(+), 340 deletions(-)
 create mode 100644 parlai/tasks/taskmaster2/test.py
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_test.yml
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_train.yml
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_valid.yml
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_test.yml
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_train.yml
 create mode 100644 parlai/tasks/taskmaster2/test/taskmaster2_valid.yml

diff --git a/parlai/tasks/taskmaster2/README.md b/parlai/tasks/taskmaster2/README.md
index ac138930708..a64ce6fd321 100644
--- a/parlai/tasks/taskmaster2/README.md
+++ b/parlai/tasks/taskmaster2/README.md
@@ -1,5 +1,5 @@
 # Taskmaster 2
 
 Originally from the
-[Google Research Datasets](https://github.com/google-research-datasets/Taskmaster/blob/main/TM-2-2020/README.md).
+[Google Research Datasets](https://github.com/google-research-datasets/Taskmaster/blob/master/TM-2-2020/README.md).
 See that page for details.
diff --git a/parlai/tasks/taskmaster2/agents.py b/parlai/tasks/taskmaster2/agents.py
index ce761afc47c..f5cb8d81fa0 100644
--- a/parlai/tasks/taskmaster2/agents.py
+++ b/parlai/tasks/taskmaster2/agents.py
@@ -14,36 +14,31 @@
 from parlai.core.params import ParlaiParser
 import os
 import pandas as pd
-import hashlib
 from collections import Counter
 from parlai.core.opt import Opt
-from parlai.core.teachers import DialogTeacher
-from parlai.core.metrics import AverageMetric, F1Metric, BleuMetric
+import parlai.core.tod.tod_core as tod
 from parlai.utils.misc import warn_once
 import json
-import parlai.utils.logging as logging
-from typing import Optional, Tuple
-from parlai.core.message import Message
+from typing import Optional
+from parlai.utils.data import DatatypeHelper
 from parlai.utils.io import PathManager
 
 import parlai.tasks.taskmaster2.build as build_
+import parlai.core.tod.tod_agents as tod_agents
+
 
 DOMAINS = [
-    'flights',
-    'food-ordering',
-    'hotels',
-    'movies',
-    'restaurant-search',
-    'sports',
-    'music',
+    "flights",
+    "food-ordering",
+    "hotels",
+    "movies",
+    "restaurant-search",
+    "sports",
+    "music",
 ]
 
-ONTO_TOKEN = "Onto:"
-CALL_TOKEN = "Call:"
-RESP_TOKEN = "Result:"
-
 
-class _Abstract(DialogTeacher):
+class Taskmaster2Parser(tod_agents.TodStructuredDataParser):
     """
     Abstract data loader.
     """
@@ -52,21 +47,26 @@ class _Abstract(DialogTeacher):
     def add_cmdline_args(
         cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
     ) -> ParlaiParser:
-        super().add_cmdline_args(parser, partial_opt)
-        parser.add_argument('--include-ontology', type=bool, default=False)
         parser.add_argument(
-            '--domains',
-            nargs='+',
+            "--taskmaster2-domains",
+            nargs="+",
             default=DOMAINS,
             choices=DOMAINS,
-            help='Uses last passed in configuration.',
+            help="Uses last passed in configuration.",
+        )
+        parser.add_argument(
+            "--use-cumulative-api-calls",
+            type=bool,
+            default=True,
+            help="Have API Call/API response turns only when an API response "
+            "slot exists. Accumulate all API call slots with the same API call name",
         )
-        return parser
+        return super().add_cmdline_args(parser, partial_opt)
 
     def __init__(self, opt: Opt, shared=None):
-        self.fold = opt['datatype'].split(':')[0]
-        opt['datafile'] = self.fold
-        self.dpath = os.path.join(opt['datapath'], 'taskmaster-2')
+        self.fold = DatatypeHelper.fold(opt["datatype"])
+        opt["datafile"] = self.fold
+        self.dpath = os.path.join(opt["datapath"], "taskmaster-2")
         if shared is None:
             warn_once(
                 "Taskmaster2 is a beta dataset, and format may significantly change."
@@ -74,298 +74,157 @@ def __init__(self, opt: Opt, shared=None):
             build_.build(opt)
         super().__init__(opt, shared)
 
-    def _h(self, x):
-        """
-        Hash function.
-        """
-        h = int(hashlib.sha1(x.encode('utf-8')).hexdigest(), 16) % 10
-        if h == 0:
-            return 'valid'
-        elif h == 1:
-            return 'test'
-        else:
-            return 'train'
-
-    def _normalize_annotation(self, anno):
-        return anno
-
     def _load_data(self, fold, domains):
         # load up the ontology
-        ontology = {}
+        ontologies = {}
         for section in domains:
-            parts = []
-            fn = os.path.join(self.dpath, section + '.onto.json')
-            with PathManager.open(fn, 'r') as f:
-                o = json.load(f)
-            assert len(o) == 1
-            o = list(o.values())[0]
-            for sub in o:
-                prefix = sub['prefix']
-                parts += [
-                    self._normalize_annotation(f'{prefix}.{a}')
-                    for a in sub['annotations']
-                ]
-            ontology[section] = ' ; '.join(parts)
+            fn = os.path.join(self.dpath, section + ".onto.json")
+            with PathManager.open(fn, "r") as f:
+                ontologies.update(json.load(f))
 
         chunks = []
         for section in domains:
-            with PathManager.open(os.path.join(self.dpath, section + '.json')) as f:
+            with PathManager.open(os.path.join(self.dpath, section + ".json")) as f:
                 subset = pd.read_json(f)
-            subset['domain'] = section
+            subset["domain"] = section
             chunks.append(subset)
         chunks = pd.concat(chunks, axis=0)
-        # shuffle deterministically for randomness in few-shot training
+        # fixed-seed shuffle so the 80/10/10 fold slices below are reproducible
         chunks = chunks.sample(frac=1.0, random_state=42)
-        chunks['fold'] = self._label_fold(chunks)
-        # only the fold we need here
-        chunks = chunks[chunks.fold == fold].reset_index()
-        chunks['ontology'] = chunks['domain'].apply(ontology.get)
-        return chunks
-
-    def _segments2text(self, segments):
-        output = []
+        split_size = len(chunks) // 10
+        if fold == "train":
+            chunks = chunks[: split_size * 8]
+        elif fold == "valid":
+            chunks = chunks[split_size * 8 : split_size * 9]
+        elif fold == "test":
+            chunks = chunks[split_size * 9 :]
+        return chunks, ontologies
+
+    def _parse_segment_to_slots(self, segment_list):
+        result = {}
+        for segment in segment_list:
+            slot_name = segment["annotations"][0]["name"]
+            slot_value = segment["text"]
+            prefix_split_idx = slot_name.find(".")
+            api_name = slot_name[:prefix_split_idx]
+            slot_name = slot_name[prefix_split_idx + 1 :]
+            result[slot_name] = slot_value
+            result[tod.STANDARD_API_NAME_SLOT] = api_name
+        return result
+
+    def _get_utterance_and_api_call_for_speaker(self, speaker, utterances, idx):
+        utts = []
         slots = {}
-        for segment in segments:
-            val = segment['text']
-            for anno_ in segment['annotations']:
-                anno = anno_['name']
-                anno = self._normalize_annotation(anno)
-                output.append(f'{anno} = {val}')
-                slots[anno] = val
-        return " ; ".join(output), slots
-
-    def custom_evaluation(
-        self,
-        teacher_action: Message,
-        labels: Optional[Tuple[str]],
-        model_response: Message,
-    ):
-        if 'metrics' in model_response and 'type' in teacher_action:
-            # keep copies of metrics across both api calls/responses
-            prefix = teacher_action['type']
-            keys = list(model_response['metrics'].keys())
-            for k in keys:
-                self.metrics.add(f'{prefix}_{k}', model_response['metrics'][k])
-
-        if 'text' not in model_response or not labels or 'type' not in teacher_action:
-            return
-
-        domain = teacher_action['domain']
-
-        if teacher_action['type'] == 'apicall':
-            # also count slot accuracy
-            text = model_response['text']
-            slot_guesses = set(
-                text.replace(CALL_TOKEN + " ", "").split(' ; ')
-            )  # prevent cheating via repeated guesses
-            correct = 0
-            for slot_guess in slot_guesses:
-                if ' = ' not in slot_guess:
-                    continue
-                try:
-                    slot, guess = slot_guess.split(' = ')
-                except ValueError:
-                    continue
-                if teacher_action['slots'].get(slot) == guess:
-                    self.metrics.add('slot_p', AverageMetric(1))
-                    self.metrics.add(f'{domain}_slot_p', AverageMetric(1))
-                    correct += 1
-                else:
-                    self.metrics.add('slot_p', AverageMetric(0))
-                    self.metrics.add(f'{domain}_slot_p', AverageMetric(0))
-                    logging.debug(
-                        f"Bad slot guess '{slot_guess}' != {teacher_action['slots']}"
-                    )
-            if teacher_action['slots']:
-                self.metrics.add(
-                    'slot_r', AverageMetric(correct, len(teacher_action['slots']))
-                )
-                self.metrics.add(
-                    f'{domain}_slot_r',
-                    AverageMetric(correct, len(teacher_action['slots'])),
-                )
-                self.metrics.add(
-                    'jga', AverageMetric(correct == len(teacher_action['slots']))
-                )
-
-        elif teacher_action['type'] == 'apiresp':
-            # keep track of statistics by domain
-            f1_metric = F1Metric.compute(model_response['text'], labels)
-            bleu_metric = BleuMetric.compute(model_response['text'], labels)
-            self.metrics.add(f'{domain}_lex_f1', f1_metric)
-            self.metrics.add(f'{domain}_lex_bleu', bleu_metric)
-
-            delex_text = model_response['text']
-            delex_label = labels[0]
-            # compute delexicalized string metrics
-            for slot, value in teacher_action['slots'].items():
-                delex_text = delex_text.replace(value, slot)
-                delex_label = delex_label.replace(value, slot)
-            f1_metric = F1Metric.compute(delex_text, (delex_label,))
-            self.metrics.add('delex_f1', f1_metric)
-            self.metrics.add(f'{domain}_delex_f1', f1_metric)
-            bleu_metric = BleuMetric.compute(delex_text, [delex_label])
-            self.metrics.add('delex_bleu', bleu_metric)
-            self.metrics.add(f'{domain}_delex_bleu', bleu_metric)
-
-    def setup_data(self, fold):
-        domains = self.opt.get('domains', DOMAINS)
-        chunks = self._load_data(fold, domains)
-        domains_cnt = Counter()
-        for _, row in chunks.iterrows():
-            domains_cnt[row['domain']] += 1
-            first = True
-            utterances = row['utterances'][:]
-            if (
-                len(utterances) >= 3
-                and utterances[0]['speaker'] == 'USER'
-                and utterances[1]['speaker'] == 'ASSISTANT'
-                and utterances[2]['speaker'] == 'ASSISTANT'
-                and "help you?" in utterances[1]['text']
-            ):
-                # skip this one
-                utterances.pop(1)
-            if self.opt['include_ontology']:
-                yield {'text': f"{ONTO_TOKEN} {row['ontology']}", 'label': ''}, True
-                first = False
-            while utterances:
-                utt = utterances.pop(0)
-                segtxt, slots = self._segments2text(utt.get('segments', []))
-                if utt['speaker'] == 'USER':
-                    yield {
-                        'text': utt['text'],
-                        'label': f'{CALL_TOKEN} {segtxt}',
-                        'domain': row['domain'],
-                        'slots': slots,
-                        'type': 'apicall',
-                    }, first
-                    first = False
-                elif utt['speaker'] == 'ASSISTANT':
-                    yield {
-                        'text': f'{RESP_TOKEN} {segtxt}',
-                        'label': utt['text'],
-                        'domain': row['domain'],
-                        'slots': slots,
-                        'type': 'apiresp',
-                    }, first
-                    first = False
-        logging.debug(f"Fold {fold} domains: {domains_cnt}")
-
-
-class DelexTeacher(_Abstract):
-    def _label_fold(self, chunks):
-        return chunks.conversation_id.apply(self._h)
-
-    def _delexicalize(self, text, slots):
-        for key, value in slots.items():
-            text = text.replace(value, key)
-        return text
-
-    def setup_data(self, fold):
+        while idx < len(utterances):
+            here = utterances[idx]
+            if here["speaker"] != speaker:
+                break
+            utts.append(here["text"])
+            slots.update(self._parse_segment_to_slots(here.get("segments", [])))
+            idx += 1
+        return idx, "\n".join(utts), slots
+
+    def _get_onto_list(self, onto_map, domain):
+        results = []
+        domain = domain.replace(
+            "-", "_"
+        )  # ontology keys presumably use "_" where domain names use "-" (e.g. restaurant-search)
+        for data in onto_map[domain]:
+            call = {}
+            call[tod.STANDARD_API_NAME_SLOT] = data["prefix"]
+            call[tod.STANDARD_OPTIONAL_KEY] = data[
+                "annotations"
+            ]  # required vs. optional is not specified, so treat every slot as optional
+            results.append(call)
+        return results
+
+    def setup_episodes(self, fold):
+        """
+        Parses each conversation of the given fold into a TodStructuredEpisode.
+        """
+        domains = self.opt.get("taskmaster2_domains", DOMAINS)
+        chunks, ontologies = self._load_data(fold, domains)
         domains_cnt = Counter()
-        chunks = self._load_data(fold)
+        episodes = []
         for _, row in chunks.iterrows():
-            domains_cnt[row['domain']] += 1
-            first = True
-            utterances = row['utterances'][:]
-            if (
-                len(utterances) >= 3
-                and utterances[0]['speaker'] == 'USER'
-                and utterances[1]['speaker'] == 'ASSISTANT'
-                and utterances[2]['speaker'] == 'ASSISTANT'
-                and "help you?" in utterances[1]['text']
-            ):
-                # skip this one
-                utterances.pop(1)
-
-            user_utterances = []
-            asst_utterances = []
-            while utterances:
-                utt = utterances.pop(0)
-                _, slots = self._segments2text(utt.get('segments', []))
-                if utt['speaker'] == 'USER':
-                    if asst_utterances:
-                        yield {
-                            'text': ' __BREAK__ '.join(user_utterances),
-                            'label': ' __BREAK__ '.join(asst_utterances),
-                            'domain': row['domain'],
-                        }, first
-                        first = False
-                        user_utterances = []
-                        asst_utterances = []
-                    user_utterances.append(self._delexicalize(utt['text'], slots))
-                elif utt['speaker'] == 'ASSISTANT':
-                    asst_utterances.append(self._delexicalize(utt['text'], slots))
-                    if not user_utterances:
-                        user_utterances.append('__SILENCE__')
-            if asst_utterances:
-                yield {
-                    'text': ' __BREAK__ '.join(user_utterances),
-                    'label': ' __BREAK__ '.join(asst_utterances),
-                    'domain': row['domain'],
-                }, first
+            domains_cnt[row["domain"]] += 1
+            utterances = row["utterances"][:]
+
+            idx = 0
+            rounds = []
+            goal_calls = []
+            if len(utterances) > 0 and utterances[0]["speaker"] == "ASSISTANT":
+                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
+                    "ASSISTANT", utterances, idx
+                )
+                r = tod.TodStructuredRound(api_resp_machine=api_resp, sys_utt=sys_utt)
+                rounds.append(r)
 
+            cum_api_call = {}
+            while idx < len(utterances):
+                idx, user_utt, api_call = self._get_utterance_and_api_call_for_speaker(
+                    "USER", utterances, idx
+                )
+                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
+                    "ASSISTANT", utterances, idx
+                )
+                if not self.opt["use_cumulative_api_calls"]:
+                    r = tod.TodStructuredRound(
+                        user_utt=user_utt,
+                        api_call_machine=api_call,
+                        api_resp_machine=api_resp,
+                        sys_utt=sys_utt,
+                    )
+                else:
+                    cum_api_call = self.process_call_for_cumlative_standalone_api(
+                        api_call, cum_api_call
+                    )
+                    r = tod.TodStructuredRound(
+                        user_utt=user_utt,
+                        api_call_machine=cum_api_call if len(api_resp) > 0 else {},
+                        api_resp_machine=api_resp if len(api_resp) > 0 else {},
+                        sys_utt=sys_utt,
+                    )
 
-class TextOnlyTeacher(DelexTeacher):
-    def _delexicalize(self, text, slots):
-        return text
+                rounds.append(r)
+                if len(api_call) > 0:
+                    goal_calls.append(api_call)
 
+            episode = tod.TodStructuredEpisode(
+                domain=tod.SerializationHelpers.inner_list_join(row["domain"]),
+                api_schemas_machine=self._get_onto_list(ontologies, row["domain"]),
+                goal_calls_machine=goal_calls,
+                rounds=rounds,
+                delex=self.opt.get("delex", False),
+            )
+            episodes.append(episode)
+        return episodes
 
-class FullShotTeacher(_Abstract):
-    """
-    The full shot teacher uses a standard 80-10-10 split, without regarding domain.
-    """
+    def get_id_task_prefix(self):
+        return "Taskmaster2"
 
     def _label_fold(self, chunks):
         return chunks.conversation_id.apply(self._h)
 
+    def process_call_for_cumlative_standalone_api(self, new_call, cum_calls):
+        if (
+            len(new_call) > 0
+            and len(cum_calls) > 0
+            and new_call[tod.STANDARD_API_NAME_SLOT]
+            != cum_calls[tod.STANDARD_API_NAME_SLOT]
+        ):
+            cum_calls = {}
+        cum_calls.update(new_call)
+        return cum_calls
 
-class FewShotTeacher(_Abstract):
-    """
-    Few shot teacher tests for generalization to new domains.
-    """
 
-    @classmethod
-    def add_cmdline_args(
-        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
-    ) -> ParlaiParser:
-        super().add_cmdline_args(parser, partial_opt)
-        parser.add_argument(
-            '--holdout',
-            default=DOMAINS[0],
-            choices=DOMAINS,
-            help='Domain which is held out from test',
-        )
-        parser.add_argument(
-            '--n-shot',
-            default=100,
-            type=int,
-            help='Number of few shot examples to provide in training fold.',
-        )
-        return super().add_cmdline_args(parser, partial_opt=partial_opt)
+class UserSimulatorTeacher(Taskmaster2Parser, tod_agents.TodUserSimulatorTeacher):
+    pass
 
-    def _label_fold(self, chunks):
-        folds = []
-        num_shots = 0
-        for _, row in chunks.iterrows():
-            if row['domain'] != self.opt['holdout']:
-                # if it's not in the holdout, always mark it train
-                folds.append('train')
-            else:
-                # keep the same valid/test sets as in fullshot, and only leak
-                # a small number of the training examples (i.e. throw away the
-                # vast majority of our data but keep test sets the same)
 
-                f = self._h(row['conversation_id'])
-                if f != 'train':
-                    folds.append(f)
-                elif num_shots < self.opt['n_shot']:
-                    folds.append('train')
-                    num_shots += 1
-                else:
-                    folds.append('throwaway')
-        return folds
+class SystemTeacher(Taskmaster2Parser, tod_agents.TodSystemTeacher):
+    pass
 
 
-class DefaultTeacher(FullShotTeacher):
+class DefaultTeacher(SystemTeacher):
     pass
diff --git a/parlai/tasks/taskmaster2/build.py b/parlai/tasks/taskmaster2/build.py
index 23b7a4845e8..1f71a2ae8ed 100644
--- a/parlai/tasks/taskmaster2/build.py
+++ b/parlai/tasks/taskmaster2/build.py
@@ -9,93 +9,93 @@
 import os
 from parlai.core.build_data import DownloadableFile
 
-ROOT_URL = 'https://github.com/google-research-datasets/Taskmaster/raw/master/TM-2-2020'
+ROOT_URL = "https://github.com/google-research-datasets/Taskmaster/raw/master/TM-2-2020"
 
 RESOURCES = [
     # raw data files
     DownloadableFile(
-        f'{ROOT_URL}/data/flights.json',
-        'flights.json',
-        '86b37b5ae25f530fd18ced78800d30c3b54f7b34bb208ecb51842718f04e760b',
+        f"{ROOT_URL}/data/flights.json",
+        "flights.json",
+        "86b37b5ae25f530fd18ced78800d30c3b54f7b34bb208ecb51842718f04e760b",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/food-ordering.json',
-        'food-ordering.json',
-        '0a042e566a816a5d0abebe6f7e8cfd6abaa89729ffc42f433d327df7342b12f8',
+        f"{ROOT_URL}/data/food-ordering.json",
+        "food-ordering.json",
+        "0a042e566a816a5d0abebe6f7e8cfd6abaa89729ffc42f433d327df7342b12f8",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/hotels.json',
-        'hotels.json',
-        '975b0242f1e37ea1ab94ccedd7e0d6ee5831599d5df1f16143e71110d6c6006a',
+        f"{ROOT_URL}/data/hotels.json",
+        "hotels.json",
+        "975b0242f1e37ea1ab94ccedd7e0d6ee5831599d5df1f16143e71110d6c6006a",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/movies.json',
-        'movies.json',
-        '6f67c9a1f04abc111186e5bcfbe3050be01d0737fd6422901402715bc1f3dd0d',
+        f"{ROOT_URL}/data/movies.json",
+        "movies.json",
+        "6f67c9a1f04abc111186e5bcfbe3050be01d0737fd6422901402715bc1f3dd0d",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/music.json',
-        'music.json',
-        'e5db60d6576fa010bef87a70a8b371d293d48cde8524c1d3ed7c3022f079d95d',
+        f"{ROOT_URL}/data/music.json",
+        "music.json",
+        "e5db60d6576fa010bef87a70a8b371d293d48cde8524c1d3ed7c3022f079d95d",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/restaurant-search.json',
-        'restaurant-search.json',
-        'fb9735f89e7ebc7c877f976da4c30391af6a6277991b597c0755564657ff8f47',
+        f"{ROOT_URL}/data/restaurant-search.json",
+        "restaurant-search.json",
+        "fb9735f89e7ebc7c877f976da4c30391af6a6277991b597c0755564657ff8f47",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/data/sports.json',
-        'sports.json',
-        '8191531bfa5a8426b1508c396ab9886a19c7c620b443c436ec10d8d4708d0eac',
+        f"{ROOT_URL}/data/sports.json",
+        "sports.json",
+        "8191531bfa5a8426b1508c396ab9886a19c7c620b443c436ec10d8d4708d0eac",
         zipped=False,
     ),
     # ontology data files
     DownloadableFile(
-        f'{ROOT_URL}/ontology/flights.json',
-        'flights.onto.json',
-        '1ebc5c982339d24b2dcf50677883fed65b7fcb95f01edbbd3be6357090893c33',
+        f"{ROOT_URL}/ontology/flights.json",
+        "flights.onto.json",
+        "1ebc5c982339d24b2dcf50677883fed65b7fcb95f01edbbd3be6357090893c33",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/food-ordering.json',
-        'food-ordering.onto.json',
-        '79c1189c16f0ab937bad558c70a0b9b99358f9ed91ea65ce4af37c4b7d999063',
+        f"{ROOT_URL}/ontology/food-ordering.json",
+        "food-ordering.onto.json",
+        "79c1189c16f0ab937bad558c70a0b9b99358f9ed91ea65ce4af37c4b7d999063",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/hotels.json',
-        'hotels.onto.json',
-        '22ae51ba546ee7ca03143097782817c4cdd0de74ac84893eaf40b8254aa866d3',
+        f"{ROOT_URL}/ontology/hotels.json",
+        "hotels.onto.json",
+        "22ae51ba546ee7ca03143097782817c4cdd0de74ac84893eaf40b8254aa866d3",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/movies.json',
-        'movies.onto.json',
-        '8403283526bb314e871850b98bb86a7987ef0af6fbbe4fb5a089ee9498584476',
+        f"{ROOT_URL}/ontology/movies.json",
+        "movies.onto.json",
+        "8403283526bb314e871850b98bb86a7987ef0af6fbbe4fb5a089ee9498584476",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/music.json',
-        'music.onto.json',
-        '4bcd6dcf1cdc6bdb717e5fdc08b3472dc3d1f4da8a0f8aee917494d79a7fe338',
+        f"{ROOT_URL}/ontology/music.json",
+        "music.onto.json",
+        "4bcd6dcf1cdc6bdb717e5fdc08b3472dc3d1f4da8a0f8aee917494d79a7fe338",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/restaurant-search.json',
-        'restaurant-search.onto.json',
-        'c9ead7985695b3feba1fb955e8407d806e4095f5459485adc5448ae89989e609',
+        f"{ROOT_URL}/ontology/restaurant-search.json",
+        "restaurant-search.onto.json",
+        "c9ead7985695b3feba1fb955e8407d806e4095f5459485adc5448ae89989e609",
         zipped=False,
     ),
     DownloadableFile(
-        f'{ROOT_URL}/ontology/sports.json',
-        'sports.onto.json',
-        '52f9bbb86ebd9e2b3916185ad4e3e9b8b77d2164d96bd3b98ad67cbaa653757d',
+        f"{ROOT_URL}/ontology/sports.json",
+        "sports.onto.json",
+        "52f9bbb86ebd9e2b3916185ad4e3e9b8b77d2164d96bd3b98ad67cbaa653757d",
         zipped=False,
     ),
 ]
@@ -103,13 +103,13 @@
 
 def build(opt):
     # get path to data directory
-    dpath = os.path.join(opt['datapath'], 'taskmaster-2')
+    dpath = os.path.join(opt["datapath"], "taskmaster-2")
     # define version if any
     version = "1.1"
 
     # check if data had been previously built
     if not build_data.built(dpath, version_string=version):
-        print('[building data: ' + dpath + ']')
+        print("[building data: " + dpath + "]")
 
         # make a clean directory if needed
         if build_data.built(dpath):
diff --git a/parlai/tasks/taskmaster2/test.py b/parlai/tasks/taskmaster2/test.py
new file mode 100644
index 00000000000..e0d2e79a87c
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from parlai.utils.testing import AutoTeacherTest
+
+
+class TestDefaultTeacher(AutoTeacherTest):
+    task = "taskmaster2"
+
+
+class TestUserSimulatorTeacher(AutoTeacherTest):
+    task = "taskmaster2:UserSimulatorTeacher"
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_test.yml b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_test.yml
new file mode 100644
index 00000000000..cabdc76d438
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_test.yml
@@ -0,0 +1,46 @@
+acts:
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: Hey, what''s the Denver Broncos record?'
+    id: Taskmaster2_UserSimulatorTeacher
+    text: 'GOAL: api_name = nfl ; name.team = Denver Broncos'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: What Conference are they in?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: The Denver Broncos are in currently fourth in the AFC West with
+      a record of four wins and nine losses.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: Who do they play against next week?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: The Denver Broncos played in the American Football Conference in
+      the west division.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: When did they play last?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: In next week the Denver Broncos will be playing against the Indianapolis
+      Colts.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: How many games back from first place are they?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: Their last game was yesterday, they beat the New York Jets by 23
+      to 0.'
+    type: 'USER: '
+num_episodes: 1734
+num_examples: 17425
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_train.yml b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_train.yml
new file mode 100644
index 00000000000..d4ad22a81c3
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_train.yml
@@ -0,0 +1,43 @@
+acts:
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_UserSimulatorTeacher
+    labels:
+    - 'USER: Hey. How are the Denver Nuggets doing this year?'
+    text: 'GOAL: api_name = nba ; name.team = Denver Nuggets | api_name = nba ; name.player
+      = Nikola Jokic'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_UserSimulatorTeacher
+    labels:
+    - 'USER: Okay. And what division are they in?'
+    slots: {}
+    text: 'SYSTEM: Hello, They''re currently six place in the Western Conference.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_UserSimulatorTeacher
+    labels:
+    - 'USER: Okay. And how they did last game?'
+    slots: {}
+    text: 'SYSTEM: There in the Northwest division.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_UserSimulatorTeacher
+    labels:
+    - 'USER: Okay. And I need to start report that.'
+    slots: {}
+    text: 'SYSTEM: They lost the last game against the 76ers.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_UserSimulatorTeacher
+    labels:
+    - 'USER: Okay, And how many points is the college York average in?'
+    slots: {}
+    text: 'SYSTEM: Starting point guard is Gary Harris.'
+    type: 'USER: '
+num_episodes: 13840
+num_examples: 138596
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_valid.yml b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_valid.yml
new file mode 100644
index 00000000000..95ef7136d19
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_UserSimulatorTeacher_valid.yml
@@ -0,0 +1,45 @@
+acts:
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: '
+    id: Taskmaster2_UserSimulatorTeacher
+    text: 'GOAL: api_name = mls ; name.team = Vancouver Whitecaps FC? | api_name =
+      mls ; day.match = last Saturday | api_name = mls ; position.player = goalkeeper'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: Hi Assistant. How are you?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: Hello.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: I''m great. I''m a big fan of Major League Soccer, And my favorite team
+      is Vancouver Whitecaps FC. And I would love to know what place they are in,
+      the Vancouver Whitecaps FC?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: I''m good and yourself?'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: Thank you. And are they playing right now?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: Currently in 7th place in the Western Conference.'
+    type: 'USER: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'USER: Okay, thank you. And who did they play last Saturday?'
+    id: Taskmaster2_UserSimulatorTeacher
+    slots: {}
+    text: 'SYSTEM: No, they''re not scheduled to play today.'
+    type: 'USER: '
+num_episodes: 1730
+num_examples: 17337
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_test.yml b/parlai/tasks/taskmaster2/test/taskmaster2_test.yml
new file mode 100644
index 00000000000..eae250af098
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_test.yml
@@ -0,0 +1,55 @@
+acts:
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APIS: '
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'APIS: '
+    type: 'APIS: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APICALL: api_name = nfl ; name.team = Denver Broncos'
+    id: Taskmaster2_SystemTeacher
+    slots:
+      api_name: nfl
+      name.team: Denver Broncos
+    text: 'USER: Hey, what''s the Denver Broncos record?'
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'SYSTEM: The Denver Broncos are in currently fourth in the AFC West with a record
+      of four wins and nine losses.'
+    id: Taskmaster2_SystemTeacher
+    slots:
+      api_name: nfl
+      name.team: Denver Broncos
+      record.team: four wins and nine losses
+    text: 'APIRESP: api_name = nfl ; name.team = Denver Broncos ; record.team = four
+      wins and nine losses'
+    type: 'SYSTEM: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APICALL: api_name = nfl ; name.team = Denver Broncos'
+    id: Taskmaster2_SystemTeacher
+    slots:
+      api_name: nfl
+      name.team: Denver Broncos
+    text: 'USER: What Conference are they in?'
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'SYSTEM: The Denver Broncos played in the American Football Conference in the
+      west division.'
+    id: Taskmaster2_SystemTeacher
+    slots:
+      api_name: nfl
+      name.team: Denver Broncos
+    text: 'APIRESP: api_name = nfl ; name.team = Denver Broncos'
+    type: 'SYSTEM: '
+num_episodes: 1734
+num_examples: 36584
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_train.yml b/parlai/tasks/taskmaster2/test/taskmaster2_train.yml
new file mode 100644
index 00000000000..5676f053795
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_train.yml
@@ -0,0 +1,48 @@
+acts:
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_SystemTeacher
+    labels:
+    - 'APIS: '
+    slots: {}
+    text: 'APIS: '
+    type: 'APIS: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_SystemTeacher
+    labels:
+    - 'APICALL: api_name = nba ; name.team = Denver Nuggets'
+    slots:
+      api_name: nba
+      name.player: Nikola Jokic
+      name.team: Denver Nuggets
+    text: 'USER: Hey. How are the Denver Nuggets doing this year?'
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_SystemTeacher
+    labels:
+    - 'SYSTEM: Hello, They''re currently six place in the Western Conference.'
+    slots:
+      api_name: nba
+      place.team: six place
+    text: 'APIRESP: api_name = nba ; place.team = six place'
+    type: 'SYSTEM: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_SystemTeacher
+    labels:
+    - 'APICALL: '
+    slots: {}
+    text: 'USER: Okay. And what division are they in?'
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    id: Taskmaster2_SystemTeacher
+    labels:
+    - 'SYSTEM: There in the Northwest division.'
+    slots: {}
+    text: 'APIRESP: '
+    type: 'SYSTEM: '
+num_episodes: 13840
+num_examples: 291032
diff --git a/parlai/tasks/taskmaster2/test/taskmaster2_valid.yml b/parlai/tasks/taskmaster2/test/taskmaster2_valid.yml
new file mode 100644
index 00000000000..3a1c36b2bdb
--- /dev/null
+++ b/parlai/tasks/taskmaster2/test/taskmaster2_valid.yml
@@ -0,0 +1,43 @@
+acts:
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APIS: '
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'APIS: '
+    type: 'APIS: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APICALL: '
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'USER: '
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'SYSTEM: Hello.'
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'APIRESP: '
+    type: 'SYSTEM: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'APICALL: '
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'USER: Hi Assistant. How are you?'
+    type: 'APICALL: '
+- - domain: sports
+    episode_done: false
+    eval_labels:
+    - 'SYSTEM: I''m good and yourself?'
+    id: Taskmaster2_SystemTeacher
+    slots: {}
+    text: 'APIRESP: '
+    type: 'SYSTEM: '
+num_episodes: 1730
+num_examples: 36404