diff --git a/parlai/core/teachers.py b/parlai/core/teachers.py
index 8a698403b4f..35e18e27cf3 100644
--- a/parlai/core/teachers.py
+++ b/parlai/core/teachers.py
@@ -1611,11 +1611,11 @@ def setup_data(self, datafile):
             yield act, next_episode_new
 
 
-class ConversationTeacher(FixedDialogTeacher):
+class ConversationTeacher(DialogTeacher):
     """
     This module provides access to data in the Conversations format.
 
-    Subclasses ``FixedDialogTeacher`` for functionality and provides an
+    Subclasses ``DialogTeacher`` for functionality and provides an
     implementation of ``setup_data()`` which iterates over datasets in the
     "Conversations" format. If your data is in the format below, use this class to
     handle file parsing for you.
@@ -1649,61 +1649,46 @@ class ConversationTeacher(FixedDialogTeacher):
     A set of examples X1 => Y1, X2 => Y2, and X3 => Y3 will be generated,
     forming one episode. However, Y1 => X2 and Y2 => X3 are not created as
     separate examples by default.
-    To change this behavior, you can set opt['label_turns']. The default
-    value is 'secondspeaker' (i.e., the second speaker's utterances are
+    To change this behavior, you can set ``opt['label_turns']`` or the ``--label-turns`` flag.
+    The default value is 'secondspeaker' (i.e., the second speaker's utterances are
     used as labels), but 'firstspeaker' and 'both' are also options. In the
     case of 'both', two episodes are generated for each conversation.
     """
 
+    @classmethod
+    def add_cmdline_args(
+        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
+    ) -> ParlaiParser:
+        agent = super().add_cmdline_args(parser, partial_opt)
+        agent.add_argument(
+            '--label-turns',
+            type=str,
+            help='which speaker to use as label',
+            choices=['firstspeaker', 'secondspeaker', 'both'],
+            default='secondspeaker',
+        )
+        return parser
+
     def __init__(self, opt, shared=None):
-        super().__init__(opt, shared)
-        if not shared:
-            self.episodes = []
-            self.num_exs = 0
-            self.label_turns = opt.get('label_turns')
-            if opt.get('conversationteacher_datafile') is not None:
-                self._setup_data(opt.get('conversationteacher_datafile'))
-        else:
-            self.episodes = shared['episodes']
-            self.num_exs = sum(len(e) for e in self.episodes)
+        if not opt.get('conversationteacher_datafile'):
+            raise RuntimeError('conversationteacher_datafile not specified')
+        opt = copy.deepcopy(opt)
+        opt['datafile'] = opt.get('conversationteacher_datafile')
+        self.label_turns = opt.get('label_turns')
+        super().__init__(opt, shared)
         self.id = opt['task']
-        self.reset()
 
+    def _return_episode_examples(self, episode):
+        for idx, example in enumerate(episode):
+            episode_begin = idx == 0
+            if 'episode_done' in example:
+                example.pop('episode_done')
+            yield example, episode_begin
 
-    def share(self):
-        """
-        Share the episodes.
-        """
-        shared = super().share()
-        shared['episodes'] = self.episodes
-        return shared
-
-    def num_examples(self):
-        """
-        Return the number of examples from the data.
-        """
-        return self.num_exs
-
-    def num_episodes(self):
-        """
-        Return the number of episodes from the data.
-        """
-        return len(self.episodes)
-
-    def get(self, episode_idx, entry_idx=None):
-        """
-        Get a specific example from the dataset.
-        """
-        return Message(self.episodes[episode_idx][entry_idx])
-
-    def _setup_data(self, path):
-        logging.info("[loading data from json file into task:" + path + "]")
-        self.episodes = []
-        self.num_exs = 0
-        eps = []
+    def setup_data(self, path):
+        logging.info(f"[loading data from json file into task: {path} ]")
         conversations = Conversations(path)
-        self.num_exs = 0
         for conv in conversations:
             if conv.context:
                 warn_once(
@@ -1719,15 +1704,15 @@ def _setup_data(self, path):
             if self.label_turns in ['firstspeaker', 'both']:
                 eps = self._get_ep_from_turns(turns[::2], turns[1::2])
                 if eps:
-                    self.episodes.append(eps)
-                    self.num_exs += len(eps)
+                    for example, example_begins in self._return_episode_examples(eps):
+                        yield example, example_begins
 
             # train on even turns as labels (turns w/ second speaker)
             if self.label_turns in ['secondspeaker', 'both']:
                 eps = self._get_ep_from_turns(turns[1::2], turns[2::2])
                 if eps:
-                    self.episodes.append(eps)
-                    self.num_exs += len(eps)
+                    for example, example_begins in self._return_episode_examples(eps):
+                        yield example, example_begins
 
     def _get_ep_from_turns(self, xturns, yturns):
         eps = []
@@ -1735,11 +1720,8 @@ def _get_ep_from_turns(self, xturns, yturns):
             turn = {}
             turn['text'] = xturn.get('text').strip()
             turn['labels'] = [yturn.get('text').strip()]
-            turn['episode_done'] = False
             eps.append(turn)
-        if eps:
-            eps[-1]['episode_done'] = True
-        return eps
+        return eps
 
 
 class AbstractImageTeacher(FixedDialogTeacher):
@@ -1930,9 +1912,9 @@ def get_image_features_path(self, task, image_model_name, dt):
         """
         Image features for the dataset images are stored here.
 
-        Can be overridden in subclass to use custom paths. Image features can be manually
-        copied into this directory or in the case of ImageLoader eligible models, they
-        will be built and stored here if not already there.
+        Can be overridden in subclass to use custom paths. Image features can be
+        manually copied into this directory or in the case of ImageLoader eligible
+        models, they will be built and stored here if not already there.
         """
         # In default implementation, self.data_path already has task name added
         image_features_path = os.path.join(self.data_path, 'image_features')
diff --git a/parlai/crowdsourcing/README.md b/parlai/crowdsourcing/README.md
index 4ddc6a9a5ea..368f8e0bf5e 100644
--- a/parlai/crowdsourcing/README.md
+++ b/parlai/crowdsourcing/README.md
@@ -6,7 +6,7 @@ Code for crowdsourcing tasks that use Mephisto. See the [Mephisto quick start gu
 
 ## Running tasks
 
-Tasks are launched by calling the appropriate run script: for instance, an ACUTE-Eval run can be launched with `python parlai/crowdsourcing/tasks/acute_eval/run.py`, followed by any appropriate flags. All run parameters are set using [Hydra](https://github.com/facebookresearch/hydra): append the flag `-c job` to your run command to see a list of all available parameters, grouped by their package name (`mephisto.blueprint`, `mephisto.task`, etc.), which determines how they are called. Each run script has a YAML file of default parameters that will be loaded, found in the `hydra_configs/conf/` subfolder of each task.
+Tasks are launched by calling the appropriate run script: for instance, an ACUTE-Eval run can be launched with `python parlai/crowdsourcing/tasks/acute_eval/run.py`, followed by any appropriate flags. All run parameters are set using [Hydra](https://github.com/facebookresearch/hydra): append the flag `-c job` to your run command to see a list of all available parameters, grouped by their package name (`mephisto.blueprint`, `mephisto.task`, etc.), which determines how they are called. Each run script points to a YAML file of default parameters (including the blueprints) that will be loaded, found in the `hydra_configs/conf/` subfolder of each task. You can specify a different blueprint in a YAML file and use that YAML file to run your task (see below).
 
 ### Specifying your own YAML file
 
@@ -24,6 +24,8 @@ mephisto:
 ```
 
 If you want to quickly modify this parameter to, say, 0.6 without changing the YAML file, you can add a `mephisto.task.task_reward=0.6` string to your launch command.
 
+You can also specify a blueprint on the command line by adding `mephisto/blueprint=my_blueprint_type` (this will override the default blueprint type if defined in the config file).
+
 ### MTurk-specific task configuration
 
 Here is a partial list of MTurk-specific parameters that can be set in YAML files or on the command line:
diff --git a/parlai/crowdsourcing/tasks/model_chat/README.md b/parlai/crowdsourcing/tasks/model_chat/README.md
index 653e761160f..875c4c5341c 100644
--- a/parlai/crowdsourcing/tasks/model_chat/README.md
+++ b/parlai/crowdsourcing/tasks/model_chat/README.md
@@ -6,7 +6,7 @@ This task will collect conversations between a human and a model. After each res
 
 ## Launching
 
-Call `run.py` to run this task with the default parameters, as set by `conf/example.yaml`. Some parameters that you can adjust include where to save data, lists of workers to soft-block, the maximum response time, etc.
+Call `run.py` to run this task with the default parameters, as set by `hydra_configs/conf/example.yaml`. Some parameters that you can adjust include where to save data, lists of workers to soft-block, the maximum response time, etc.
 
 Set `mephisto.blueprint.model_opt_path` to specify a path to a YAML file listing all models to be chatted with, as well as the ParlAI flags for running each one. See `task_config/model_opts.yaml` for an example.
@@ -26,7 +26,7 @@ In `worlds.py`, modify `ModelChatOnboardWorld.check_onboarding_answers()` to cha
 
 ## Human+model image chat
 
-`run_image_chat.py` can be run to chat with a model about an image: each conversation will begin with a selected image, and then the human and model will chat about it.
+Call `run.py conf=example_image_chat` to chat with a model about an image: each conversation will begin with a selected image, and then the human and model will chat about it. This task is run with the parameters defined in `hydra_configs/conf/example_image_chat.yaml`.
 
 This code replaces the old `parlai/mturk/tasks/image_chat/` and `parlai/mturk/tasks/personality_captions/` tasks, which are deprecated and can be accessed with `git checkout v0.10.0`. Those tasks also featured the ability to compare two possible captions to an image and rate which one is more engaging: this functionality has now been replaced by the [ACUTE-Eval](https://github.com/facebookresearch/ParlAI/tree/main/parlai/crowdsourcing/tasks/acute_eval) task.
diff --git a/parlai/crowdsourcing/tasks/model_chat/run_image_chat.py b/parlai/crowdsourcing/tasks/model_chat/run_image_chat.py
deleted file mode 100644
index f27c91f185e..00000000000
--- a/parlai/crowdsourcing/tasks/model_chat/run_image_chat.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) Facebook, Inc. and its affiliates.
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import os
-from dataclasses import dataclass, field
-from typing import Any, List
-
-import hydra
-from mephisto.operations.hydra_config import register_script_config
-from omegaconf import DictConfig
-
-from parlai.crowdsourcing.tasks.model_chat.impl import run_task
-from parlai.crowdsourcing.utils.mturk import MTurkRunScriptConfig
-import parlai.crowdsourcing.tasks.model_chat.worlds as world_module
-
-
-TASK_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
-
-
-defaults = ["_self_", {"conf": "example_image_chat"}]
-
-
-@dataclass
-class ScriptConfig(MTurkRunScriptConfig):
-    defaults: List[Any] = field(default_factory=lambda: defaults)
-    task_dir: str = TASK_DIRECTORY
-    monitoring_log_rate: int = field(
-        default=30,
-        metadata={
-            'help': 'Frequency in seconds of logging the monitoring of the crowdsourcing task'
-        },
-    )
-
-
-register_script_config(name='scriptconfig', module=ScriptConfig)
-
-
-@hydra.main(config_path="hydra_configs", config_name="scriptconfig")
-def main(cfg: DictConfig) -> None:
-    run_task(cfg=cfg, task_directory=TASK_DIRECTORY, world_module=world_module)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/README.md b/parlai/crowdsourcing/tasks/turn_annotations_static/README.md
index cf9aa75bdf8..cf7fcee8abc 100644
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/README.md
+++ b/parlai/crowdsourcing/tasks/turn_annotations_static/README.md
@@ -5,9 +5,11 @@ Two variants of the blueprint are supported:
 
 - `TurnAnnotationStaticBlueprint`
   - The base static turn-annotations task
   - Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run.py`
+  - (the task runs with the default parameters set in `hydra_configs/conf/example.yaml`)
 - `TurnAnnotationStaticInFlightQABlueprint`
   - Includes the ability to add an additional in-flight (i.e. mid-HIT) quality assurance check
-  - Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run_in_flight_qa.py`
+  - Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run.py conf=example_in_flight_qa`
+  - (the task runs with the parameters set in `hydra_configs/conf/example_in_flight_qa.yaml`)
 
 For both variants of the blueprint, it is required to pass in your own file of conversations with `mephisto.blueprint.data_jsonl=${PATH_TO_CONVERSATIONS}`.
diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/run_in_flight_qa.py b/parlai/crowdsourcing/tasks/turn_annotations_static/run_in_flight_qa.py
deleted file mode 100644
index b3918651bbd..00000000000
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/run_in_flight_qa.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright (c) Facebook, Inc. and its affiliates.
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import os
-from dataclasses import dataclass, field
-from typing import Any, List
-
-import hydra
-from mephisto.operations.hydra_config import register_script_config
-from omegaconf import DictConfig
-
-from parlai.crowdsourcing.tasks.turn_annotations_static.util import run_static_task
-from parlai.crowdsourcing.utils.mturk import MTurkRunScriptConfig
-
-
-# TODO: merge this with run.py once Hydra supports recursive defaults
-# (https://github.com/facebookresearch/hydra/issues/171)
-
-
-TASK_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
-
-
-defaults = ["_self_", {"conf": "example_in_flight_qa"}]
-
-
-@dataclass
-class ScriptConfig(MTurkRunScriptConfig):
-    defaults: List[Any] = field(default_factory=lambda: defaults)
-    task_dir: str = TASK_DIRECTORY
-    monitoring_log_rate: int = field(
-        default=30,
-        metadata={
-            'help': 'Frequency in seconds of logging the monitoring of the crowdsourcing task'
-        },
-    )
-
-
-register_script_config(name='scriptconfig', module=ScriptConfig)
-
-
-@hydra.main(config_path="hydra_configs", config_name="scriptconfig")
-def main(cfg: DictConfig) -> None:
-    run_static_task(cfg=cfg, task_directory=TASK_DIRECTORY)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/parlai/tasks/jsonfile/agents.py b/parlai/tasks/jsonfile/agents.py
index 430a53ca86b..1ce9bf9837d 100644
--- a/parlai/tasks/jsonfile/agents.py
+++ b/parlai/tasks/jsonfile/agents.py
@@ -47,19 +47,18 @@ def add_cmdline_args(
         return parser
 
     def __init__(self, opt, shared=None):
-        super().__init__(opt, shared)
         opt = copy.deepcopy(opt)
         if not opt.get('jsonfile_datapath'):
             raise RuntimeError('jsonfile_datapath not specified')
         datafile = opt['jsonfile_datapath']
-        if self.opt['jsonfile_datatype_extension']:
-            datafile += "_" + self.opt['datatype'].split(':')[0] + '.jsonl'
-        if shared is None:
-            self._setup_data(datafile)
+        if opt['jsonfile_datatype_extension']:
+            datafile += "_" + opt['datatype'].split(':')[0] + '.jsonl'
+        opt['conversationteacher_datafile'] = datafile
+        super().__init__(opt, shared)
+
         # Truncate datafile to just the immediate enclosing folder name and file name
         dirname, basename = os.path.split(datafile)
         self.id = os.path.join(os.path.split(dirname)[1], basename)
-        self.reset()
 
 
 class DefaultTeacher(JsonTeacher):
diff --git a/projects/blenderbot2/agents/modules.py b/projects/blenderbot2/agents/modules.py
index f53b59d109b..cc8924f43f9 100644
--- a/projects/blenderbot2/agents/modules.py
+++ b/projects/blenderbot2/agents/modules.py
@@ -437,6 +437,9 @@ def retrieve_and_concat(
             input_lengths = input_lengths.repeat_interleave(
                 input_turns_cnt, dim=0
             )  # type: ignore
+
+        # Filtering empty doc_scores added due to dynamic batching (if used)
+        doc_scores = [[s for s in ds if s is not None] for ds in doc_scores if ds]
         top_doc_scores = torch.stack(
             [torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores]
         )
diff --git a/tests/crowdsourcing/tasks/model_chat/test_model_image_chat.py b/tests/crowdsourcing/tasks/model_chat/test_model_image_chat.py
index 076d0f1de32..feb6800535f 100644
--- a/tests/crowdsourcing/tasks/model_chat/test_model_image_chat.py
+++ b/tests/crowdsourcing/tasks/model_chat/test_model_image_chat.py
@@ -41,7 +41,7 @@ try:
 
     import parlai.crowdsourcing.tasks.model_chat.worlds_image_chat as world_module
-    from parlai.crowdsourcing.tasks.model_chat.run_image_chat import TASK_DIRECTORY
+    from parlai.crowdsourcing.tasks.model_chat.run import TASK_DIRECTORY
     from parlai.crowdsourcing.tasks.model_chat.model_chat_blueprint import (
         SharedModelImageChatTaskState,
     )
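For readers tracing the `ConversationTeacher` change above: the class now streams examples through `setup_data()` instead of materializing `self.episodes`. The following standalone sketch (illustrative only, not the ParlAI implementation; `pair_turns` and `example_stream` are invented names) mirrors the `turns[::2]`/`turns[1::2]` and `turns[1::2]`/`turns[2::2]` pairing and the episode-start flag yielded by `_return_episode_examples()`:

```python
def pair_turns(xturns, yturns):
    # Mirrors _get_ep_from_turns(): each x utterance becomes the text,
    # the following y utterance becomes its label.
    return [
        {'text': x.strip(), 'labels': [y.strip()]}
        for x, y in zip(xturns, yturns)  # zip() drops a trailing unpaired turn
    ]


def example_stream(turns, label_turns='secondspeaker'):
    """Yield (example, new_episode) pairs the way the patched setup_data() does."""
    episodes = []
    if label_turns in ('firstspeaker', 'both'):
        episodes.append(pair_turns(turns[::2], turns[1::2]))
    if label_turns in ('secondspeaker', 'both'):
        episodes.append(pair_turns(turns[1::2], turns[2::2]))
    for episode in episodes:
        for idx, example in enumerate(episode):
            # new_episode is True only for the first example of an episode,
            # replacing the old per-example 'episode_done' bookkeeping.
            yield example, idx == 0


if __name__ == '__main__':
    turns = ['Hi!', 'Hello, how are you?', 'Fine, thanks.', 'Glad to hear it.']
    for example, new_episode in example_stream(turns, label_turns='both'):
        print(new_episode, example)
```

With `label_turns='both'`, two episodes are generated per conversation, matching the class docstring above.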
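The BlenderBot2 `retrieve_and_concat` hunk guards the `torch.stack`/`torch.cat` call against score lists left empty (or containing `None`) by dynamic batching. A small self-contained sketch of the failure mode, using made-up scores (the tensor values and shapes here are assumptions for illustration only):

```python
import torch

# Hypothetical per-example retrieval scores; with dynamic batching some rows
# can end up empty or contain None placeholders.
doc_scores = [
    [torch.tensor([0.9, 0.1])],        # normal row
    [],                                # empty row left over by dynamic batching
    [torch.tensor([0.7, 0.3]), None],  # row with a None placeholder
]

# Same filtering as in the patch: drop None entries and skip empty rows.
doc_scores = [[s for s in ds if s is not None] for ds in doc_scores if ds]

# Without the filter, torch.cat() would raise on the empty row.
top_doc_scores = torch.stack(
    [torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores]
)
print(top_doc_scores.shape)  # torch.Size([2, 2])
```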