From 14a2b3f99f6fae09b79026d65921f4056d549098 Mon Sep 17 00:00:00 2001 From: Megan Ung <20617868+meganung@users.noreply.github.com> Date: Sun, 7 Mar 2021 01:32:47 -0800 Subject: [PATCH 1/4] How to write a PR contributing (moving to a fork) tutorial (#3490) * added instructions for moving to a fork if people directly cloned and committed changes * update the link to section --- CONTRIBUTING.md | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8711fb33271..b4c0a47bb88 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,22 +4,46 @@ While we are seeding this project with an initial set of popular tasks and a few models and examples, ongoing contributions from the research community are desired to increase the pool of tasks, models, and baselines. + ## Pull Requests We actively welcome your pull requests. -1. Fork the repo and create your branch from `master`. Set up your environment - and run `pre-commit install` once. -2. Link [CircleCI](https://circleci.com/vcs-authorize/) to your github account - if you haven't done so previously (and make sure the CircleCI tests run - successfully). -3. If you've added code that should be tested, [add tests](http://parl.ai/docs/tutorial_tests.html). -4. If you've changed APIs, update the documentation. -5. Autoformat and lint your code (`bash autoformat.sh`) -6. Ensure the test suite passes. Run `python -m pytest -m unit`. -7. If you've added a new dataset, you should also run +1. Fork the repo and then clone the forked repository. (See this [github guide](https://guides.github.com/activities/forking/) on forking for more info). + **If you have already cloned the repo directly and committed changes, follow the steps in the [section below](#moving-changes-youve-committed-to-a-fork)** +2. Create your branch from `master`. Set up your environment + and run `pre-commit install` once. +3. Make your changes +4. 
If you've added code that should be tested, [add tests](http://parl.ai/docs/tutorial_tests.html). +5. If you've changed APIs, update the documentation. +6. Autoformat and lint your code (`bash autoformat.sh`) +7. Ensure the test suite passes. Run `python -m pytest -m unit`. +8. If you've added a new dataset, you should also run `python -m pytest -m data`. Copy-paste the output into a comment in your PR. -8. If you haven't already, complete the Contributor License Agreement ("CLA"). -9. Once the PR is accepted and CI is passing, we will merge the PR for you. +9. If you haven't already, complete the Contributor License Agreement ("CLA"). +10. Link [CircleCI](https://circleci.com/vcs-authorize/) to your github account + if you haven't done so previously (and make sure the CircleCI tests run + successfully on the PR after you push your changes). +11. Push your changes! +12. Once the PR is accepted and CI is passing, we will merge the PR for you. + +### Moving changes you've committed to a fork +1. Fork the repo +2. In your local repo, rename your origin remote to upstream + ``` + git remote rename origin upstream + ``` +3. Point origin to the forked repo (instead of to the original repo) + ``` + git remote add origin git@github... + ``` +4. Fetch from the new origin + ``` + git fetch origin + ``` +5. Make your local branch track the remote branch (of the forked repo) + ``` + git branch --set-upstream-to origin/master master + ``` ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. 
You only need From c110f731c955e8c1927a955904ab86dcf6a22d1f Mon Sep 17 00:00:00 2001 From: Hyunwoo Kim Date: Mon, 8 Mar 2021 22:52:17 +0900 Subject: [PATCH 2/4] Messages instead of dicts for get() in ED, DNLI (#3496) --- parlai/tasks/dialogue_nli/agents.py | 3 ++- parlai/tasks/empathetic_dialogues/agents.py | 30 ++++++++++++--------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/parlai/tasks/dialogue_nli/agents.py b/parlai/tasks/dialogue_nli/agents.py index 854d0ee825d..d7bf7190aac 100644 --- a/parlai/tasks/dialogue_nli/agents.py +++ b/parlai/tasks/dialogue_nli/agents.py @@ -13,6 +13,7 @@ import json import os +from parlai.core.message import Message from parlai.core.teachers import FixedDialogTeacher from .build import build from parlai.tasks.multinli.agents import convert_to_dialogData @@ -134,7 +135,7 @@ def get(self, episode_idx, entry_idx=0): binary_classes=self.binary_classes, ) new_entry = {k: entry[k] for k in ENTRY_FIELDS if k in entry} - return new_entry + return Message(new_entry) class ExtrasTeacher(DialogueNliTeacher): diff --git a/parlai/tasks/empathetic_dialogues/agents.py b/parlai/tasks/empathetic_dialogues/agents.py index 8371683730b..a17c10b8134 100644 --- a/parlai/tasks/empathetic_dialogues/agents.py +++ b/parlai/tasks/empathetic_dialogues/agents.py @@ -13,6 +13,7 @@ import numpy as np from parlai.utils.io import PathManager +from parlai.core.message import Message from parlai.core.teachers import FixedDialogTeacher from .build import build @@ -220,18 +221,21 @@ def get(self, episode_idx, entry_idx=0): ep = self.data[episode_idx] ep_i = ep[entry_idx] episode_done = entry_idx >= (len(ep) - 1) - action = { - 'situation': ep_i[3], - 'emotion': ep_i[2], - 'text': ep_i[0], - 'labels': [ep_i[1]], - 'prepend_ctx': ep_i[6], - 'prepend_cand': ep_i[7], - 'deepmoji_ctx': ep_i[4], - 'deepmoji_cand': ep_i[5], - 'episode_done': episode_done, - 'label_candidates': ep_i[8], - } + action = Message( + { + 'situation': ep_i[3], + 'emotion': 
ep_i[2], + 'text': ep_i[0], + 'labels': [ep_i[1]], + 'prepend_ctx': ep_i[6], + 'prepend_cand': ep_i[7], + 'deepmoji_ctx': ep_i[4], + 'deepmoji_cand': ep_i[5], + 'episode_done': episode_done, + 'label_candidates': ep_i[8], + } + ) + return action def share(self): @@ -268,7 +272,7 @@ def get(self, episode_idx, entry_idx=0): ex = self.data[episode_idx] episode_done = True - return {'labels': [ex[2]], 'text': ex[3], 'episode_done': episode_done} + return Message({'labels': [ex[2]], 'text': ex[3], 'episode_done': episode_done}) class DefaultTeacher(EmpatheticDialoguesTeacher): From 7224486a82e67a16e30441b04dbe22965e64fa0f Mon Sep 17 00:00:00 2001 From: Stephen Roller Date: Mon, 8 Mar 2021 12:09:54 -0500 Subject: [PATCH 3/4] [docs] Add docs explaining metrics. (#3498) * Add docs explaining metrics. * Slightly change title --- docs/source/tutorial_metrics.md | 44 ++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/source/tutorial_metrics.md b/docs/source/tutorial_metrics.md index 79787f7e920..f94b8cbabc6 100644 --- a/docs/source/tutorial_metrics.md +++ b/docs/source/tutorial_metrics.md @@ -1,9 +1,13 @@ -# Understanding and adding new metrics +# Understanding and adding metrics Author: Stephen Roller ## Introduction and Standard Metrics +:::{tip} List of metrics +If you're not sure what a metric means, refer to our [List of metrics](#list-of-metrics). +::: + ParlAI contains a number of built-in metrics that are automatically computed when we train and evaluate models. Some of these metrics are _text generation_ metrics, which happen any time we generate a text: this includes F1, BLEU and Accuracy. @@ -53,6 +57,7 @@ One nice thing about metrics is that they are automatically logged to the statements into your code. + ### Agent-specific metrics Some agents include their own metrics that are computed for them. 
For example, @@ -402,3 +407,40 @@ __Under the hood__: Local metrics work by including a "metrics" field in the return message. This is a dictionary which maps field name to a metric value. When the teacher receives the response from the model, it utilizes the metrics field to update counters on its side. + +## List of Metrics + +Below is a list of metrics and a brief explanation of each. + +:::{note} List of metrics +If you find a metric not listed here, +please [file an issue on GitHub](https://github.com/facebookresearch/ParlAI/issues/new?assignees=&labels=Docs,Metrics&template=other.md). +::: + +| Metric | Explanation | +| ----------------------- | ------------ | +| `accuracy` | Exact match text accuracy | +| `bleu-4` | BLEU-4 of the generation, under a standardized (model-independent) tokenizer | +| `clip` | Fraction of batches with clipped gradients | +| `ctpb` | Context tokens per batch | +| `ctps` | Context tokens per second | +| `exps` | Examples per second | +| `exs` | Number of examples processed since last print | +| `f1` | Unigram F1 overlap, under a standardized (model-independent) tokenizer | +| `gnorm` | Gradient norm | +| `gpu_mem` | Fraction of GPU memory used. May slightly underestimate true value. | +| `hits@1`, `hits@5`, ... | Fraction of correct choices in K guesses. (Similar to recall@K) | +| `interdistinct-1`, `interdistinct-2` | Fraction of n-grams unique across _all_ generations | +| `intradistinct-1`, `intradistinct-2` | Fraction of n-grams unique _within_ each utterance | +| `jga` | Joint Goal Accuracy | +| `loss` | Loss | +| `lr` | The most recent learning rate applied | +| `ltpb` | Label tokens per batch | +| `ltps` | Label tokens per second | +| `rouge-1`, `rouge-2`, `rouge-L` | ROUGE metrics | +| `token_acc` | Token-wise accuracy (generative only) | +| `token_em` | Utterance-level token accuracy. 
Roughly corresponds to perfection under greedy search (generative only) | +| `total_train_updates` | Number of SGD steps taken across all batches | +| `tpb` | Total tokens (context + label) per batch | +| `tps` | Total tokens (context + label) per second | +| `ups` | Updates per second (approximate) | From 540cb369a8a0694fb33280e4528665319743a2b0 Mon Sep 17 00:00:00 2001 From: Stephen Roller Date: Mon, 8 Mar 2021 13:22:47 -0500 Subject: [PATCH 4/4] Fix task list (#3495) --- docs/source/generate_task_list.py | 17 ++- parlai/tasks/task_list.py | 224 +++++++++++++++--------------- 2 files changed, 124 insertions(+), 117 deletions(-) diff --git a/docs/source/generate_task_list.py b/docs/source/generate_task_list.py index 867b1a746c0..b6a7e0909cd 100755 --- a/docs/source/generate_task_list.py +++ b/docs/source/generate_task_list.py @@ -7,18 +7,25 @@ MASTER = "https://github.com/facebookresearch/ParlAI/tree/master" -category_order = ['QA', 'Cloze', 'Goal', 'ChitChat', 'Negotiation', 'Visual', 'decanlp'] -category_task_list = {x: [] for x in category_order} +categories = set() +for task_dict in task_list: + categories.update(task_dict.get('tags', [])) +categories = sorted(categories) +category_task_list = {x: [] for x in categories} fout = open('task_list.inc', 'w') s = "They consist of: " -for t in category_order: +for t in categories: fout.write(f"1. 
{t} tasks\n") fout.write("\n") for task_dict in task_list: - tags = task_dict.get('tags', None) + tags = task_dict.get('tags', []) + if not tags: + if 'Uncategorized' not in category_task_list: + category_task_list['Uncategorized'] = [] + category_task_list['Uncategorized'].append(task_dict) for tag in tags: if tag in category_task_list: category_task_list[tag].append(task_dict) @@ -44,7 +51,7 @@ urls.append(("code", code_url)) urls_md = ", ".join(f"[{k}]({v})" for k, v in urls) - fout.write(f"### {display_name}\n") + fout.write(f"### {display_name.title().replace('_', ' ')}\n") fout.write(f"_Usage_: `--task {task}`\n\n") fout.write(f"_Links_: {urls_md}\n\n") if description: diff --git a/parlai/tasks/task_list.py b/parlai/tasks/task_list.py index 16c025be4b5..f5249fc8c0e 100644 --- a/parlai/tasks/task_list.py +++ b/parlai/tasks/task_list.py @@ -13,7 +13,7 @@ "id": "AmazonQA", "display_name": "AmazonQA", "task": "amazon_qa", - "tags": ["All", "QA"], + "tags": ["QA"], "links": {"website": "http://jmcauley.ucsd.edu/data/amazon/qa/"}, "description": ( "This dataset contains Question and Answer data from Amazon, " @@ -24,7 +24,7 @@ "id": "AQuA", "display_name": "AQuA", "task": "aqua", - "tags": ["All", "QA"], + "tags": ["QA"], "links": {"arXiv": "https://arxiv.org/abs/1705.04146"}, "description": ( "Dataset containing algebraic word problems with rationales for " @@ -35,7 +35,7 @@ "id": "bAbI-1k", "display_name": "bAbI 1k", "task": "babi:All1k", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "20 synthetic tasks that each test a unique aspect of text and " "reasoning, and hence test different capabilities of learning " @@ -51,7 +51,7 @@ "id": "bAbI-10k", "display_name": "bAbI 10k", "task": "babi:All10k", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "20 synthetic tasks that each test a unique aspect of text and " "reasoning, and hence test different capabilities of learning " @@ -67,7 +67,7 @@ "id": "BlendedSkillTalk", "display_name": "Blended 
Skill Talk", "task": "blended_skill_talk", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "A dataset of 7k conversations explicitly designed to exhibit multiple " "conversation modes: displaying personality, having empathy, and " @@ -78,7 +78,7 @@ "id": "BookTest", "display_name": "BookTest", "task": "booktest", - "tags": ["All", "Cloze"], + "tags": ["Cloze"], "description": ( "Sentence completion given a few sentences as context from a book. " "A larger version of CBT." @@ -89,19 +89,19 @@ "id": "BotAdversarialDialogue", "display_name": "Bot Adversarial Dialogue ", "task": "bot_adversarial_dialogue", - "tags": ["All"], + "tags": [], "description": ( "Datasets described in the paper Recipes for Safety in Open-domain Chatbots." "Datasets consist of classification tasks in which the goal is to " "determine if the utterance is offensive or not given a dialogue context. " ), - "links": {"arXiv": ""}, + "links": {"arXiv": "https://arxiv.org/abs/2010.07079"}, }, { "id": "CBT", "display_name": "Children's Book Test (CBT)", "task": "cbt", - "tags": ["All", "Cloze"], + "tags": ["Cloze"], "description": ( "Sentence completion given a few sentences as context from a " "children's book." 
@@ -112,7 +112,7 @@ "id": "CCPE", "display_name": "Coached Conversational Preference Elicitation", "task": "ccpe", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "A dataset consisting of 502 dialogs with 12,000 annotated " "utterances between a user and an assistant discussing movie " @@ -129,7 +129,7 @@ "id": "COPA", "display_name": "Choice of Plausible Alternatives", "task": "copa", - "tags": ["All", "Reasoning"], + "tags": ["Reasoning"], "description": ( "The Choice Of Plausible Alternatives (COPA) evaluation provides " "researchers with a tool for assessing progress in open-domain " @@ -142,7 +142,7 @@ "id": "COQA", "display_name": "Conversational Question Answering Challenge", "task": "coqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "CoQA is a large-scale dataset for building Conversational " "Question Answering systems. The goal of the CoQA challenge " @@ -156,7 +156,7 @@ "id": "CornellMovie", "display_name": "Cornell Movie", "task": "cornell_movie", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ("Fictional conversations extracted from raw movie scripts."), "links": {"arXiv": "https://arxiv.org/abs/1106.3077"}, }, @@ -164,7 +164,7 @@ "id": "DBLL-bAbI", "display_name": "Dialog Based Language Learning: bAbI Task", "task": "dbll_babi", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "Short dialogs based on the bAbI tasks, but in the form of a " "question from a teacher, the answer from the student, and finally a " @@ -183,7 +183,7 @@ "id": "DBLL-Movie", "display_name": "Dialog Based Language Learning: WikiMovies Task", "task": "dbll_movie", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "Short dialogs based on WikiMovies, but in the form of a question " "from a teacher, the answer from the student, and finally a comment " @@ -196,7 +196,7 @@ "id": "dialog-bAbI", "display_name": "Dialog bAbI", "task": "dialog_babi", - "tags": ["All", "Goal"], + "tags": 
["Goal"], "description": "Simulated dialogs of restaurant booking", "links": {"arXiv": "https://arxiv.org/abs/1605.07683"}, }, @@ -204,7 +204,7 @@ "id": "dialog-bAbI-plus", "display_name": "Dialog bAbI+", "task": "dialog_babi_plus", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "bAbI+ is an extension of the bAbI Task 1 dialogues with everyday " "incremental dialogue phenomena (hesitations, restarts, and " @@ -224,7 +224,7 @@ "id": "dialogue-nli", "display_name": "Dialogue NLI", "task": "dialogue_nli", - "tags": ["All", "ChitChat", "NLI"], + "tags": ["ChitChat", "NLI"], "description": ( "Dialogue NLI is a dataset that addresses the issue of consistency in " "dialogue models." @@ -238,7 +238,7 @@ "id": "dstc7", "display_name": "DSTC7 subtrack 1 - ubuntu", "task": "dstc7", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "DSTC7 is a competition which provided a dataset of dialogs very " "similar to the ubuntu dataset. In particular, the subtrack 1 " @@ -250,7 +250,7 @@ "id": "FVQA", "display_name": "FVQA", "task": "fvqa", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "The FVQA, a VQA dataset which requires, and supports, much deeper " "reasoning. We extend a conventional visual question answering " @@ -265,7 +265,7 @@ "id": "DealNoDeal", "display_name": "Deal or No Deal", "task": "dealnodeal", - "tags": ["All", "Negotiation"], + "tags": ["Negotiation"], "description": ( "End-to-end negotiation task which requires two agents to agree on " "how to divide a set of items, with each agent assigning different " @@ -277,7 +277,7 @@ "id": "HotpotQA", "display_name": "HotpotQA", "task": "hotpotqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "HotpotQA is a dataset for multi-hop question answering." 
"The overall setting is that given some context paragraphs" @@ -292,7 +292,7 @@ "id": "LIGHT-Dialogue", "display_name": "LIGHT-Dialogue", "task": "light_dialog", - "tags": ["All", "Grounded", "Dodeca"], + "tags": ["Grounded", "Dodeca"], "description": ( "LIGHT is a text adventure game with actions and dialogue collected." "The source data is collected between crowdworkers playing the game." @@ -306,7 +306,7 @@ "id": "LIGHT-Dialogue-Wild", "display_name": "LIGHT-Dialogue-Wild", "task": "light_dialog_wild", - "tags": ["All", "Grounded", "LIGHT"], + "tags": ["Grounded", "LIGHT"], "description": ( " LIGHT is a text adventure game with actions and dialogue." "The WILD dataset here features 41,131+ training episodes of dialogue " @@ -321,7 +321,7 @@ "id": "MutualFriends", "display_name": "MutualFriends", "task": "mutualfriends", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "Task where two agents must discover which friend of theirs is " "mutual based on the friends's attributes." @@ -332,7 +332,7 @@ "id": "MCTest", "display_name": "MCTest", "task": "mctest", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ("Questions about short children's stories."), "links": { "website": ( @@ -345,7 +345,7 @@ "id": "MovieDD-QA", "display_name": "Movie Dialog QA", "task": "moviedialog:Task:1", - "tags": ["All", "QA", "MovieDD"], + "tags": ["QA", "MovieDD"], "description": ( "Closed-domain QA dataset asking templated questions about movies, " "answerable from Wikipedia, similar to WikiMovies." @@ -356,7 +356,7 @@ "id": "MovieDD-QARecs", "display_name": "Movie Dialog QA Recommendations", "task": "moviedialog:Task:3", - "tags": ["All", "Goal", "MovieDD"], + "tags": ["Goal", "MovieDD"], "description": ( "Dialogs discussing questions about movies as well as recommendations." 
), @@ -366,7 +366,7 @@ "id": "MovieDD-Recs", "display_name": "Movie Dialog Recommendations", "task": "moviedialog:Task:2", - "tags": ["All", "QA", "MovieDD"], + "tags": ["QA", "MovieDD"], "description": ("Questions asking for movie recommendations."), "links": {"arXiv": "https://arxiv.org/abs/1511.06931"}, }, @@ -374,7 +374,7 @@ "id": "MovieDD-Reddit", "display_name": "Movie Dialog Reddit", "task": "moviedialog:Task:4", - "tags": ["All", "ChitChat", "MovieDD"], + "tags": ["ChitChat", "MovieDD"], "description": ( "Dialogs discussing Movies from Reddit (the Movies SubReddit)." ), @@ -384,7 +384,7 @@ "id": "MTurkWikiMovies", "display_name": "MTurk WikiMovies", "task": "mturkwikimovies", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Closed-domain QA dataset asking MTurk-derived questions about " "movies, answerable from Wikipedia." @@ -395,7 +395,7 @@ "id": "MultiNLI", "display_name": "MultiNLI", "task": "multinli", - "tags": ["All", "Entailment", "decanlp"], + "tags": ["Entailment", "decanlp"], "description": ( "A dataset designed for use in the development and evaluation of " "machine learning models for sentence understanding. Each example " @@ -409,7 +409,7 @@ "id": "NarrativeQA", "display_name": "NarrativeQA", "task": "narrative_qa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "A dataset and set of tasks in which the reader must answer " "questions about stories by reading entire books or movie scripts. " @@ -424,7 +424,7 @@ "id": "NaturalQuestions", "display_name": "Natural Questions", "task": "natural_questions", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "An open domain question answering dataset. 
" "Each example contains real questions that people searched " @@ -450,7 +450,7 @@ "id": "OpenSubtitles", "display_name": "Open Subtitles", "task": "opensubtitles", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": "Dataset of dialogs from movie scripts.", "links": { "version 2018 website": "http://opus.lingfil.uu.se/OpenSubtitles2018.php", @@ -462,7 +462,7 @@ "id": "personalized-dialog-full", "display_name": "Personalized Dialog Full Set", "task": "personalized_dialog:AllFull", - "tags": ["All", "Goal", "Personalization"], + "tags": ["Goal", "Personalization"], "description": ( "Simulated dataset of restaurant booking focused on personalization " "based on user profiles." @@ -473,7 +473,7 @@ "id": "personalized-dialog-small", "display_name": "Personalized Dialog Small Set", "task": "personalized_dialog:AllSmall", - "tags": ["All", "Goal", "Personalization"], + "tags": ["Goal", "Personalization"], "description": ( "Simulated dataset of restaurant booking focused on personalization " "based on user profiles." @@ -484,7 +484,7 @@ "id": "QACNN", "display_name": "QA CNN", "task": "qacnn", - "tags": ["All", "Cloze"], + "tags": ["Cloze"], "description": ( "Cloze dataset based on a missing (anonymized) entity phrase from a " "CNN article" @@ -495,7 +495,7 @@ "id": "QADailyMail", "display_name": "QA Daily Mail", "task": "qadailymail", - "tags": ["All", "Cloze"], + "tags": ["Cloze"], "description": ( "Cloze dataset based on a missing (anonymized) entity phrase from a " "Daily Mail article." @@ -506,7 +506,7 @@ "id": "QuAC", "display_name": "Question Answering in Context", "task": "quac", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Question Answering in Context is a dataset for modeling, " "understanding, and participating in information seeking dialog. 
Data " @@ -524,7 +524,7 @@ "id": "SelfFeedingChatbot", "display_name": "Self-Feeding Chatbot", "task": "self_feeding", - "tags": ["diaexp", "diasen", "All"], + "tags": [], "description": ( "Learning from Dialogue after Deployment. Leveraging user textual " "feedback to improve the chatbot's abilities." @@ -535,7 +535,7 @@ "id": "SimpleQuestions", "display_name": "Simple Questions", "task": "simplequestions", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ("Open-domain QA dataset based on Freebase triples."), "links": {"arXiv": "https://arxiv.org/abs/1506.02075"}, }, @@ -543,7 +543,7 @@ "id": "SNLI", "display_name": "The Stanford Natural Language Inference (SNLI) Corpus", "task": "snli", - "tags": ["All", "Entailment"], + "tags": ["Entailment"], "description": ( "The SNLI corpus (version 1.0) is a collection of 570k " "human-written English sentence pairs manually labeled for balanced " @@ -557,7 +557,7 @@ "id": "SQuAD2", "display_name": "SQuAD2", "task": "squad2", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Open-domain QA dataset answerable from a given paragraph from " "Wikipedia." @@ -568,7 +568,7 @@ "id": "SQuAD", "display_name": "SQuAD", "task": "squad", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Open-domain QA dataset answerable from a given paragraph from " "Wikipedia." @@ -579,7 +579,7 @@ "id": "TriviaQA", "display_name": "TriviaQA", "task": "triviaqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Open-domain QA dataset with question-answer-evidence triples." ), @@ -589,7 +589,7 @@ "id": "TaskNTalk", "display_name": "Task N' Talk", "task": "taskntalk", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "Dataset of synthetic shapes described by attributes, for agents to " "play a cooperative QA game." 
@@ -600,7 +600,7 @@ "id": "Ubuntu", "display_name": "Ubuntu", "task": "ubuntu", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ( "Dialogs between an Ubuntu user and an expert trying to fix issue, " "we use the V2 version, which cleaned the data to some extent. " @@ -611,7 +611,7 @@ "id": "WebQuestions", "display_name": "Web Questions", "task": "webquestions", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ("Open-domain QA dataset from Web queries."), "links": {"paper": "http://www.aclweb.org/anthology/D13-1160"}, }, @@ -619,7 +619,7 @@ "id": "WikiMovies", "display_name": "WikiMovies", "task": "wikimovies", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Closed-domain QA dataset asking templated questions about movies, " "answerable from Wikipedia." @@ -630,7 +630,7 @@ "id": "WikiQA", "display_name": "WikiQA", "task": "wikiqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ("Open domain QA from Wikipedia dataset"), "links": { "website": ( @@ -643,7 +643,7 @@ "id": "VQAv1", "display_name": "VQAv1", "task": "vqa_v1", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ("Open-ended question answering about visual content."), "links": {"arXiv": "https://arxiv.org/abs/1505.00468"}, }, @@ -651,7 +651,7 @@ "id": "VQAv2", "display_name": "VQAv2", "task": "vqa_v2", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ("Bigger, more balanced version of the original VQA dataset."), "links": {"arXiv": "https://arxiv.org/abs/1612.00837"}, }, @@ -659,7 +659,7 @@ "id": "VisDial", "display_name": "VisDial", "task": "visdial", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "Task which requires agents to hold a meaningful dialog about " "visual content." 
@@ -670,7 +670,7 @@ "id": "MNIST_QA", "display_name": "MNIST_QA", "task": "mnist_qa", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "Task which requires agents to identify which number they are " "seeing. From the MNIST dataset." @@ -680,7 +680,7 @@ "id": "InsuranceQA", "display_name": "InsuranceQA", "task": "insuranceqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Task which requires agents to identify high quality answers " "composed by professionals with deep domain knowledge." @@ -691,7 +691,7 @@ "id": "MS_MARCO", "display_name": "MS_MARCO", "task": "ms_marco", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "A large scale Machine Reading Comprehension Dataset with questions " "sampled from real anonymized user queries and contexts from web " @@ -703,7 +703,7 @@ "id": "CLEVR", "display_name": "CLEVR", "task": "clevr", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "A visual reasoning dataset that tests abilities such as attribute " "identification, counting, comparison, spatial relationships, and " @@ -715,7 +715,7 @@ "id": "nlvr", "display_name": "nlvr", "task": "nlvr", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "Cornell Natural Language Visual Reasoning (NLVR) is a language " "grounding dataset based on pairs of natural language statements " @@ -727,7 +727,7 @@ "id": "WMT", "display_name": "WMT", "task": "wmt", - "tags": ["All", "MT"], + "tags": ["MT"], "description": ( "Workshop on Machine Translation task, currently only includes en_de." ), @@ -736,7 +736,7 @@ "id": "IWSLT14", "display_name": "IWSLT14", "task": "iwslt14", - "tags": ["All", "MT", "decanlp"], + "tags": ["MT", "decanlp"], "description": ( "2014 International Workshop on Spoken Language task, currently " "only includes en_de and de_en." 
@@ -747,7 +747,7 @@ "id": "ConvAI2", "display_name": "ConvAI2", "task": "convai2", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ( "A chit-chat dataset based on PersonaChat for a NIPS 2018 competition. " ), @@ -760,7 +760,7 @@ "id": "ConvAI_ChitChat", "display_name": "ConvAI_ChitChat", "task": "convai_chitchat", - "tags": ["All", "ChitChat", "decanlp"], + "tags": ["ChitChat", "decanlp"], "description": ( "Human-bot dialogues containing free discussions of randomly chosen " "paragraphs from SQuAD." @@ -771,7 +771,7 @@ "id": "Dialogue_QE", "display_name": "Dialogue_QE", "task": "dialogue_qe", - "tags": ["All"], + "tags": [], "description": ( "Human-bot dialogues labelled for quality at the level of " "dialogues. Can be used to train dialogue-level metric for dialogue " @@ -782,7 +782,7 @@ "id": "QAngaroo", "display_name": "QAngaroo", "task": "qangaroo", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "Reading Comprehension with Multiple Hop. Including two datasets: " "WIKIHOP built on on wikipedia, MEDHOP built on paper abstracts from " @@ -794,7 +794,7 @@ "id": "SCAN", "display_name": "SCAN", "task": "scan", - "tags": ["Goal", "All"], + "tags": ["Goal"], "description": ( "SCAN is a set of simple language-driven navigation tasks for " "studying compositional learning and zero-shot generalization. The " @@ -811,7 +811,7 @@ "id": "Persona-Chat", "display_name": "Persona-Chat", "task": "personachat", - "tags": ["ChitChat", "All"], + "tags": ["ChitChat"], "description": ( "A chit-chat dataset where paired Turkers are given assigned " "personas and chat to try to get to know each other." 
@@ -822,7 +822,7 @@ "id": "TaskMaster", "display_name": "TaskMaster-1-2019", "task": "taskmaster", - "tags": ["ChitChat", "All"], + "tags": ["ChitChat"], "description": ( "A chit-chat dataset by GoogleAI providing high quality goal-oriented conversations" "The dataset hopes to provoke interest in written vs spoken language" @@ -837,7 +837,7 @@ "id": "Twitter", "display_name": "Twitter", "task": "twitter", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ( "Twitter data found on GitHub. No " "train/valid/test split was provided so 10k for valid and 10k for " @@ -849,7 +849,7 @@ "id": "Wikipedia", "display_name": "Wikipedia", "task": 'wikipedia', - "tags": ["All"], + "tags": [], "description": ("Dump of Wikipedia articles from 2/3/18"), "notes": ( "Specify ':full' for the full articles to be returned, otherwise " @@ -863,7 +863,7 @@ "id": "Flickr30k", "display_name": "Flickr30k", "task": "flickr30k", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ("30k captioned images pulled from Flickr compiled by UIUC. "), "links": { "website": "http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/", @@ -875,7 +875,7 @@ "id": "COCO_Captions", "display_name": "COCO_Captions", "task": "coco_caption", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "COCO annotations derived from the 2015 COCO Caption Competition. " ), @@ -885,14 +885,14 @@ "id": "integration_tests", "display_name": "Integration Tests", "task": "integration_tests", - "tags": ["All", "Debug"], + "tags": ["Debug"], "description": ("Artificial tasks for ensuring models perform as expected"), }, { "id": "ConvAI2_wild_evaluation", "display_name": "ConvAI2_wild_evaluation", "task": "convai2_wild_evaluation", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "Dataset collected during the wild evaluation of ConvaAI2 participants " "bots. 
60% train, 20% valid and 20% test is chosen at " @@ -904,7 +904,7 @@ "id": "sst", "display_name": "SST Sentiment Analysis", "task": "sst", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Dataset containing sentiment trees of movie reviews. We use the modified " "binary sentence analysis subtask given by the DecaNLP paper here." @@ -918,7 +918,7 @@ "id": "cnn_dm", "display_name": "CNN/DM Summarisation", "task": "cnn_dm", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Dataset collected from CNN and the Daily Mail with summaries as labels, " "Implemented as part of the DecaNLP task." @@ -929,7 +929,7 @@ "id": "qasrl", "display_name": "QA-SRL Semantic Role Labeling", "task": "qasrl", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ("QA dataset implemented as part of the DecaNLP task."), "links": {"website": "https://dada.cs.washington.edu/qasrl/"}, }, @@ -937,7 +937,7 @@ "id": "qazre", "display_name": "QA-ZRE Relation Extraction", "task": "qazre", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Zero Shot relation extraction task implemented as part of the DecaNLP " "task." @@ -948,7 +948,7 @@ "id": "woz", "display_name": "WOZ restuarant reservation (Goal-Oriented Dialogue)", "task": "woz", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Dataset containing dialogues dengotiating a resturant reservation. " "Implemented as part of the DecaNLP task, focused on the change " @@ -960,7 +960,7 @@ "id": "wikisql", "display_name": "WikiSQL semantic parsing task", "task": "wikisql", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Dataset for parsing sentences to SQL code, given a table. " "Implemented as part of the DecaNLP task." @@ -971,7 +971,7 @@ "id": "mwsc", "display_name": "MWSC pronoun resolution", "task": "mwsc", - "tags": ["All", "decanlp"], + "tags": ["decanlp"], "description": ( "Resolving possible ambiguous pronouns. 
" "Implemented as part of the DecaNLP " @@ -983,7 +983,7 @@ "id": "decanlp", "display_name": "DecaNLP: The Natural Language Decathlon", "task": "decanlp", - "tags": ["All"], + "tags": [], "description": ( "A collection of 10 tasks (SQuAD, IWSLT, CNN/DM, MNLI, SST, QA‑SRL," "QA‑ZRE, WOZ, WikiSQL and MWSC) designed to challenge a model with a range " @@ -999,7 +999,7 @@ "id": "Personality_Captions", "display_name": "Personality_Captions", "task": "personality_captions", - "tags": ["All", "Visual"], + "tags": ["Visual"], "description": ( "200k images from the YFCC100m dataset " "with captions conditioned on one of 215 personalities." @@ -1018,7 +1018,7 @@ "id": "Image_Chat", "display_name": "Image_Chat", "task": "image_chat", - "tags": ["All", "Visual", "ChitChat"], + "tags": ["Visual", "ChitChat"], "description": ( "202k dialogues and 401k utterances over 202k images from " "the YFCC100m dataset " @@ -1038,14 +1038,14 @@ "id": "Image_Chat_Generation", "display_name": "Image_Chat_Generation", "task": "image_chat:Generation", - "tags": ["All", "Visual", "ChitChat", "Dodeca"], + "tags": ["Visual", "ChitChat", "Dodeca"], "description": ("Image Chat task to train generative model"), }, { "id": "Wizard_of_Wikipedia", "display_name": "Wizard_of_Wikipedia", "task": "wizard_of_wikipedia", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "A dataset with conversations directly grounded with knowledge " "retrieved from Wikipedia. 
Contains 201k utterances from 22k " @@ -1066,14 +1066,14 @@ "id": "Wizard_of_Wikipedia_Generator", "display_name": "Wizard_of_Wikipedia_Generator", "task": "wizard_of_wikipedia:Generator", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ("Wizard of Wikipedia task to train generative models"), }, { "id": "DailyDialog", "display_name": "Daily Dialog", "task": "dailydialog", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ( "A dataset of chitchat dialogues with strong annotations for " "topic, emotion and utterance act. This version contains both sides " @@ -1086,7 +1086,7 @@ "id": "EmpatheticDialogues", "display_name": "Empathetic Dialogues", "task": "empathetic_dialogues", - "tags": ["All", "ChitChat", "Dodeca"], + "tags": ["ChitChat", "Dodeca"], "description": ( "A dataset of 25k conversations grounded in emotional situations " "to facilitate training and evaluating dialogue systems." @@ -1111,7 +1111,7 @@ "id": "DialogueSafety", "display_name": "Dialogue Safety", "task": "dialogue_safety", - "tags": ["All"], + "tags": [], "description": ( "Several datasets described in the paper Built it Break it Fix it " "for Dialogue Safety: Robustness from Adversarial Human Attack. " @@ -1124,7 +1124,7 @@ "id": "MultiWOZv2.0", "display_name": "MultiWOZ 2.0", "task": "multiwoz_v20", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "A fully labeled collection of human-written conversations spanning" "over multiple domains and topics." @@ -1135,7 +1135,7 @@ "id": "MultiWOZv2.1", "display_name": "MultiWOZ 2.1", "task": "multiwoz_v21", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "A fully labeled collection of human-written conversations spanning" "over multiple domains and topics." 
@@ -1153,7 +1153,7 @@ "id": "OneCommon", "display_name": "OneCommon", "task": "onecommon", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "A collaborative referring task which requires advanced skills " "of common grounding under continuous and partially-observable context. " @@ -1165,7 +1165,7 @@ "id": "IGC", "display_name": "Image Grounded Conversations", "task": "igc", - "tags": ["All", "Visual", "ChitChat", "Dodeca"], + "tags": ["Visual", "ChitChat", "Dodeca"], "description": ( "A dataset of (image, context, question, answer) tuples, comprised " "of eventful images taken from Bing, Flickr, and COCO." @@ -1176,7 +1176,7 @@ "id": "ANLI", "display_name": "Adversarial Natural Language Inference (ANLI) Corpus", "task": "anli", - "tags": ["All", "Entailment", "NLI"], + "tags": ["Entailment", "NLI"], "description": ( "The ANLI corpus (version 1.0) is a new large-scale NLI benchmark dataset," "collected via an iterative, adversarial human-and-model-in-the-loop procedure" @@ -1192,7 +1192,7 @@ "id": "NLI", "display_name": "Natural Language Inference (NLI) Corpus", "task": "nli", - "tags": ["All", "Entailment"], + "tags": ["Entailment"], "description": ( "A collection of 3 popular Natural Language Inference(NLI) benchmark tasks: " "ANLI v0.1, MultiNLI 1.0, SNLI 1.0." @@ -1202,7 +1202,7 @@ "id": "Funpedia", "display_name": "Funpedia", "task": "funpedia", - "tags": ["All"], + "tags": [], "description": ( "Task for rephrasing sentences from Wikipedia conditioned on a persona." 
), @@ -1211,7 +1211,7 @@ "id": "LIGHTGenderBias", "display_name": "LIGHT Gender Bias", "task": "light_genderation_bias", - "tags": ["All"], + "tags": [], "description": ("Task for debiasing the LIGHT dataset."), "links": {"arXiv": "https://arxiv.org/abs/1911.03842"}, }, @@ -1219,7 +1219,7 @@ "id": "AirDialogue", "display_name": "AirDialogue", "task": "airdialogue", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "Task for goal-oriented dialogue using airplane booking conversations " "between agents and customers." @@ -1230,7 +1230,7 @@ "id": "HollE", "display_name": "Holl-E", "task": "holl_e", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "Sequence of utterances and responses with background knowledge about" "movies. From the Holl-E dataset." @@ -1241,7 +1241,7 @@ "id": "ELI5", "display_name": "ELI5", "task": "eli5", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "This dataset contains Question and Answer data from Reddit " "explainlikeimfive posts and comments." @@ -1252,7 +1252,7 @@ "id": "ReDial", "display_name": "ReDial", "task": "redial", - "tags": ["All", "ChitChat", "Goal"], + "tags": ["ChitChat", "Goal"], "description": ( "Annotated dataset of dialogues where users recommend movies to each other." ), @@ -1262,7 +1262,7 @@ "id": "DREAM", "display_name": "DREAM", "task": "dream", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "A multiple-choice answering dataset based on multi-turn, multi-party dialogue." ), @@ -1272,7 +1272,7 @@ "id": "C3", "display_name": "C3", "task": "c3", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "A multiple-choice answering dataset in Chinese based on a prior passage." ), @@ -1282,7 +1282,7 @@ "id": "CommonSenseQA", "display_name": "CommonSenseQA", "task": "commonsenseqa", - "tags": ["All", "QA"], + "tags": ["QA"], "description": ( "CommonSenseQA is a multiple-choice Q-A dataset that relies on commonsense " "knowlegde to predict correct answers." 
@@ -1293,7 +1293,7 @@ "id": "StyleGen", "display_name": "Style-Controlled Generation", "task": "style_gen", - "tags": ["All", "ChitChat"], + "tags": ["ChitChat"], "description": ( "Dialogue datasets (BlendedSkillTalk, ConvAI2, EmpatheticDialogues, and " "Wizard of Wikipedia) labeled with personalities taken from the Image-Chat " @@ -1304,7 +1304,7 @@ "id": "GoogleSGD", "display_name": "GoogleSGD", "task": "google_sgd", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "The Schema-Guided Dialogue (SGD) dataset consists of over 20k " "annotated multi-domain, task-oriented conversations between a " @@ -1315,7 +1315,7 @@ "id": "TaskMaster2", "display_name": "TaskMaster2", "task": "taskmaster2", - "tags": ["All", "Goal"], + "tags": ["Goal"], "description": ( "The second version of TaskMaster, containing Wizard-of-Oz dialogues " "for task oriented dialogue in 7 domains." @@ -1325,7 +1325,7 @@ "id": "GenderationBiasControlTask", "display_name": "GenderationBiasControlTask", "task": "genderation_bias:controllable_task", - "tags": ["All"], + "tags": [], "description": ( "A teacher that wraps other ParlAI tasks and appends control tokens to the " "text field indicating the presence of gender words in the label(s)." @@ -1335,7 +1335,7 @@ "id": "MDGender", "display_name": "MD Gender", "task": "md_gender", - "tags": ["All"], + "tags": [], "description": ( "Tasks for the multi-dimensional gender bias classifier training." 
), @@ -1345,7 +1345,7 @@ "id": "Sensitive Topics Evaluation Topics Valid Teacher", "display_name": "Sensitive Topics Evaluation Topics Valid Teacher", "task": "sensitive_topics_evaluation", - "tags": ["All"], + "tags": [], "description": ( "Task for evaluating a classifier trained to identify conversational messages " "on the following sensitive topics: Politics, Drugs, Medical Advice, Religion, " @@ -1357,7 +1357,7 @@ "id": "decode", "display_name": "DialoguE COntradiction DEteCtion (DECODE)", "task": "decode", - "tags": ["All", "ChitChat", "Entailment"], + "tags": ["ChitChat", "Entailment"], "description": "Task for detect whether the last utterance contradicts previous dialogue history.", "links": {"arXiv": "https://arxiv.org/abs/2012.13391"}, },