This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

ConversationTeacher parent class changed #4256

Merged: 9 commits, Dec 16, 2021
98 changes: 40 additions & 58 deletions parlai/core/teachers.py
@@ -1611,11 +1611,11 @@ def setup_data(self, datafile):
yield act, next_episode_new


class ConversationTeacher(FixedDialogTeacher):
class ConversationTeacher(DialogTeacher):
"""
This module provides access to data in the Conversations format.

Subclasses ``FixedDialogTeacher`` for functionality and provides an
Subclasses ``DialogTeacher`` for functionality and provides an
implementation of ``setup_data()`` which iterates over datasets in the
"Conversations" format. If your data is in the format below, use this class to
handle file parsing for you.
@@ -1649,61 +1649,46 @@ class ConversationTeacher(FixedDialogTeacher):
A set of examples X1 => Y1, X2 => Y2, and X3 => Y3 will be generated,
forming one episode. However, Y1 => X2 and Y2 => X3 are not created as
separate examples by default.
To change this behavior, you can set opt['label_turns']. The default
value is 'secondspeaker' (i.e., the second speaker's utterances are
To change this behavior, you can set ``opt['label_turns']`` or pass the ``--label-turns`` flag.
The default value is 'secondspeaker' (i.e., the second speaker's utterances are
used as labels), but 'firstspeaker' and 'both' are also options. In the
case of 'both', two episodes are generated for each conversation.
"""

@classmethod
def add_cmdline_args(
cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
) -> ParlaiParser:
agent = super().add_cmdline_args(parser, partial_opt)
agent.add_argument(
'--label-turns',
type=str,
help='which speaker to use as label',
choices=['firstspeaker', 'secondspeaker', 'both'],
default='secondspeaker',
)
return parser

def __init__(self, opt, shared=None):
super().__init__(opt, shared)
if not shared:
self.episodes = []
self.num_exs = 0
self.label_turns = opt.get('label_turns')
if opt.get('conversationteacher_datafile') is not None:
self._setup_data(opt.get('conversationteacher_datafile'))
else:
self.episodes = shared['episodes']
self.num_exs = sum(len(e) for e in self.episodes)
if not opt.get('conversationteacher_datafile'):
raise RuntimeError('conversationteacher_datafile not specified')

opt = copy.deepcopy(opt)
opt['datafile'] = opt.get('conversationteacher_datafile')
self.label_turns = opt.get('label_turns')
super().__init__(opt, shared)
self.id = opt['task']

self.reset()
def _return_episode_examples(self, episode):
for idx, example in enumerate(episode):
episode_begin = idx == 0
if 'episode_done' in example:
example.pop('episode_done')
yield example, episode_begin

def share(self):
"""
Share the episodes.
"""
shared = super().share()
shared['episodes'] = self.episodes
return shared

def num_examples(self):
"""
Return the number of examples from the data.
"""
return self.num_exs

def num_episodes(self):
"""
Return the number of episodes from the data.
"""
return len(self.episodes)

def get(self, episode_idx, entry_idx=None):
"""
Get a specific example from the dataset.
"""
return Message(self.episodes[episode_idx][entry_idx])

def _setup_data(self, path):
logging.info("[loading data from json file into task:" + path + "]")
self.episodes = []
self.num_exs = 0
eps = []
def setup_data(self, path):
logging.info(f"[loading data from json file into task: {path} ]")
conversations = Conversations(path)
self.num_exs = 0
for conv in conversations:
if conv.context:
warn_once(
@@ -1719,27 +1704,24 @@ def _setup_data(self, path):
if self.label_turns in ['firstspeaker', 'both']:
eps = self._get_ep_from_turns(turns[::2], turns[1::2])
if eps:
self.episodes.append(eps)
self.num_exs += len(eps)
for example, example_begins in self._return_episode_examples(eps):
yield example, example_begins

# train on even turns as labels (turns w/ second speaker)
if self.label_turns in ['secondspeaker', 'both']:
eps = self._get_ep_from_turns(turns[1::2], turns[2::2])
if eps:
self.episodes.append(eps)
self.num_exs += len(eps)
for example, example_begins in self._return_episode_examples(eps):
yield example, example_begins

def _get_ep_from_turns(self, xturns, yturns):
eps = []
for xturn, yturn in zip(xturns, yturns):
turn = {}
turn['text'] = xturn.get('text').strip()
turn['labels'] = [yturn.get('text').strip()]
turn['episode_done'] = False
eps.append(turn)
if eps:
eps[-1]['episode_done'] = True
return eps
return eps


class AbstractImageTeacher(FixedDialogTeacher):
@@ -1930,9 +1912,9 @@ def get_image_features_path(self, task, image_model_name, dt):
"""
Image features for the dataset images are stored here.

Can be overridden in subclass to use custom paths. Image features can be manually
copied into this directory or in the case of ImageLoader eligible models, they
will be built and stored here if not already there.
Can be overridden in subclass to use custom paths. Image features can be
manually copied into this directory or in the case of ImageLoader eligible
models, they will be built and stored here if not already there.
"""
# In default implementation, self.data_path already has task name added
image_features_path = os.path.join(self.data_path, 'image_features')
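To make the new data flow concrete, here is a minimal, self-contained sketch (plain Python with toy data, not ParlAI code; the standalone helpers only mirror the `_get_ep_from_turns` and `_return_episode_examples` methods above) of how one speaker's turns are paired with the other's as labels and yielded as `(example, new_episode)` tuples for `DialogTeacher.setup_data`:

```python
# Toy sketch of the episode-building logic; no ParlAI imports required.

def get_ep_from_turns(xturns, yturns):
    # Pair each prompt turn with the following response turn as its label.
    episode = []
    for xturn, yturn in zip(xturns, yturns):
        episode.append(
            {
                'text': xturn['text'].strip(),
                'labels': [yturn['text'].strip()],
                'episode_done': False,
            }
        )
    if episode:
        episode[-1]['episode_done'] = True
    return episode


def return_episode_examples(episode):
    # DialogTeacher tracks episode boundaries from the new_episode flag,
    # so 'episode_done' is dropped from each example.
    for idx, example in enumerate(episode):
        example.pop('episode_done', None)
        yield example, idx == 0


# Toy conversation: first speaker says X*, second speaker says Y*.
first = [{'text': t} for t in ('X1', 'X2', 'X3')]
second = [{'text': t} for t in ('Y1', 'Y2', 'Y3')]

# label_turns='secondspeaker': the second speaker's utterances become labels.
for example, new_episode in return_episode_examples(get_ep_from_turns(first, second)):
    print(new_episode, example['text'], '=>', example['labels'][0])
# True X1 => Y1
# False X2 => Y2
# False X3 => Y3
```

With `label_turns='both'`, the same conversation is processed twice, producing a second episode with the labeling direction reversed.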
4 changes: 3 additions & 1 deletion parlai/crowdsourcing/README.md
@@ -6,7 +6,7 @@ Code for crowdsourcing tasks that use Mephisto. See the [Mephisto quick start gu

## Running tasks

Tasks are launched by calling the appropriate run script: for instance, an ACUTE-Eval run can be launched with `python parlai/crowdsourcing/tasks/acute_eval/run.py`, followed by any appropriate flags. All run parameters are set using [Hydra](https://github.com/facebookresearch/hydra): append the flag `-c job` to your run command to see a list of all available parameters, grouped by their package name (`mephisto.blueprint`, `mephisto.task`, etc.), which determines how they are called. Each run script has a YAML file of default parameters that will be loaded, found in the `hydra_configs/conf/` subfolder of each task.
Tasks are launched by calling the appropriate run script: for instance, an ACUTE-Eval run can be launched with `python parlai/crowdsourcing/tasks/acute_eval/run.py`, followed by any appropriate flags. All run parameters are set using [Hydra](https://github.com/facebookresearch/hydra): append the flag `-c job` to your run command to see a list of all available parameters, grouped by their package name (`mephisto.blueprint`, `mephisto.task`, etc.), which determines how they are called. Each run script points to a YAML file of default parameters (including the blueprints) that will be loaded, found in the `hydra_configs/conf/` subfolder of each task. You can specify a different blueprint in a YAML file and use that YAML file to run your task (see below).

### Specifying your own YAML file

@@ -24,6 +24,8 @@ mephisto:
```
If you want to quickly modify this parameter to, say, 0.6 without changing the YAML file, you can add a `mephisto.task.task_reward=0.6` string to your launch command.

You can also specify a blueprint on the command line by adding `mephisto/blueprint=my_blueprint_type` (this will override the default blueprint type if defined in the config file).

### MTurk-specific task configuration

Here is a partial list of MTurk-specific parameters that can be set in YAML files or on the command line:
4 changes: 2 additions & 2 deletions parlai/crowdsourcing/tasks/model_chat/README.md
@@ -6,7 +6,7 @@ This task will collect conversations between a human and a model. After each res

## Launching

Call `run.py` to run this task with the default parameters, as set by `conf/example.yaml`. Some parameters that you can adjust include where to save data, lists of workers to soft-block, the maximum response time, etc.
Call `run.py` to run this task with the default parameters, as set by `hydra_configs/conf/example.yaml`. Some parameters that you can adjust include where to save data, lists of workers to soft-block, the maximum response time, etc.

Set `mephisto.blueprint.model_opt_path` to specify a path to a YAML file listing all models to be chatted with, as well as the ParlAI flags for running each one. See `task_config/model_opts.yaml` for an example.

Expand All @@ -26,7 +26,7 @@ In `worlds.py`, modify `ModelChatOnboardWorld.check_onboarding_answers()` to cha

## Human+model image chat

`run_image_chat.py` can be run to chat with a model about an image: each conversation will begin with a selected image, and then the human and model will chat about it.
Call `run.py conf=example_image_chat` to chat with a model about an image: each conversation will begin with a selected image, and then the human and model will chat about it. This task is run with the parameters defined in `hydra_configs/conf/example_image_chat.yaml`.

This code replaces the old `parlai/mturk/tasks/image_chat/` and `parlai/mturk/tasks/personality_captions/` tasks, which are deprecated and can be accessed with `git checkout v0.10.0`. Those tasks also featured the ability to compare two possible captions to an image and rate which one is more engaging: this functionality has now been replaced by the [ACUTE-Eval](https://github.com/facebookresearch/ParlAI/tree/main/parlai/crowdsourcing/tasks/acute_eval) task.

47 changes: 0 additions & 47 deletions parlai/crowdsourcing/tasks/model_chat/run_image_chat.py

This file was deleted.

4 changes: 3 additions & 1 deletion parlai/crowdsourcing/tasks/turn_annotations_static/README.md
@@ -5,9 +5,11 @@ Two variants of the blueprint are supported:
- `TurnAnnotationStaticBlueprint`
- The base static turn-annotations task
- Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run.py`
- (the task runs with the default parameters set in `hydra_configs/conf/example.yaml`)
- `TurnAnnotationStaticInFlightQABlueprint`
- Includes the ability to add an additional in-flight (i.e. mid-HIT) quality assurance check
- Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run_in_flight_qa.py`
- Called with `python parlai/crowdsourcing/tasks/turn_annotations_static/run.py conf=example_in_flight_qa`
- (the task runs with the parameters set in `hydra_configs/conf/example_in_flight_qa.yaml`)

For both variants of the blueprint, it is required to pass in your own file of conversations with `mephisto.blueprint.data_jsonl=${PATH_TO_CONVERSATIONS}`.


This file was deleted.

11 changes: 5 additions & 6 deletions parlai/tasks/jsonfile/agents.py
@@ -47,19 +47,18 @@ def add_cmdline_args(
return parser

def __init__(self, opt, shared=None):
super().__init__(opt, shared)
opt = copy.deepcopy(opt)
if not opt.get('jsonfile_datapath'):
raise RuntimeError('jsonfile_datapath not specified')
datafile = opt['jsonfile_datapath']
if self.opt['jsonfile_datatype_extension']:
datafile += "_" + self.opt['datatype'].split(':')[0] + '.jsonl'
if shared is None:
self._setup_data(datafile)
if opt['jsonfile_datatype_extension']:
datafile += "_" + opt['datatype'].split(':')[0] + '.jsonl'
opt['conversationteacher_datafile'] = datafile
super().__init__(opt, shared)

# Truncate datafile to just the immediate enclosing folder name and file name
dirname, basename = os.path.split(datafile)
self.id = os.path.join(os.path.split(dirname)[1], basename)
self.reset()


class DefaultTeacher(JsonTeacher):
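As a quick illustration of the datafile path construction in the rewritten `JsonTeacher.__init__` above, here is a standalone sketch; the paths and opt values are hypothetical:

```python
import copy
import os

# Hypothetical opt values, mirroring the path logic in JsonTeacher.__init__.
opt = {
    'jsonfile_datapath': '/data/convs',
    'jsonfile_datatype_extension': True,
    'datatype': 'train:stream',
}

opt = copy.deepcopy(opt)
datafile = opt['jsonfile_datapath']
if opt['jsonfile_datatype_extension']:
    # Only the base datatype ('train', 'valid', 'test') is appended.
    datafile += "_" + opt['datatype'].split(':')[0] + '.jsonl'
opt['conversationteacher_datafile'] = datafile  # handed to ConversationTeacher
print(datafile)  # /data/convs_train.jsonl

# The teacher id is truncated to the enclosing folder name plus the file name.
dirname, basename = os.path.split(datafile)
print(os.path.join(os.path.split(dirname)[1], basename))  # data/convs_train.jsonl
```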
3 changes: 3 additions & 0 deletions projects/blenderbot2/agents/modules.py
@@ -437,6 +437,9 @@ def retrieve_and_concat(
input_lengths = input_lengths.repeat_interleave(
input_turns_cnt, dim=0
) # type: ignore

# Filtering empty doc_scores added due to dynamic batching (if used)
doc_scores = [[s for s in ds if s is not None] for ds in doc_scores if ds]
top_doc_scores = torch.stack(
[torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores]
)
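The intent of the added filtering line can be seen in a small standalone PyTorch snippet (toy tensors, not BlenderBot2 code): with dynamic batching, some per-example `doc_scores` entries may be empty or contain `None`, and the subsequent `torch.cat`/`torch.stack` would fail on them.

```python
import torch

# doc_scores: one list of retrieval-score tensors per batch item.
doc_scores = [
    [torch.tensor([0.9, 0.7])],
    [],                                # empty entry left behind by dynamic batching
    [torch.tensor([0.8, 0.6]), None],  # stray None entry
]

# Same filtering pattern as the line added in modules.py.
doc_scores = [[s for s in ds if s is not None] for ds in doc_scores if ds]

top_doc_scores = torch.stack(
    [torch.cat([s_i for s_i in scores_i]) for scores_i in doc_scores]
)
print(top_doc_scores)  # tensor([[0.9000, 0.7000], [0.8000, 0.6000]])
```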
@@ -41,7 +41,7 @@
try:

import parlai.crowdsourcing.tasks.model_chat.worlds_image_chat as world_module
from parlai.crowdsourcing.tasks.model_chat.run_image_chat import TASK_DIRECTORY
from parlai.crowdsourcing.tasks.model_chat.run import TASK_DIRECTORY
from parlai.crowdsourcing.tasks.model_chat.model_chat_blueprint import (
SharedModelImageChatTaskState,
)