From 307d7bff880b7c12e28019e4942933382ef389dd Mon Sep 17 00:00:00 2001
From: Mojtaba Komeili
Date: Mon, 27 Mar 2023 12:31:10 -0400
Subject: [PATCH 1/5] _initialization_data_dicts type

---
 parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py      | 2 +-
 .../tasks/turn_annotations_static/turn_annotations_blueprint.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
index ec6674e55f3..002c752cdfe 100644
--- a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
+++ b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
@@ -67,7 +67,7 @@ def __init__(
                 f'subtasks_per_unit must be greater than zero but was {self.subtasks_per_unit}'
             )

-        self.raw_data = self._initialization_data_dicts
+        self.raw_data = self._initialization_data_dicts: Iterable[Dict[str, Any]]

         # Now chunk the data into groups of
         grouped_data = []
diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
index d6cd7804f3d..098c7319881 100644
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
+++ b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
@@ -131,7 +131,7 @@ def __init__(
                 f'subtasks_per_unit must be greater than zero but was {self.subtasks_per_unit}'
             )

-        self.raw_data = self._initialization_data_dicts
+        self.raw_data = self._initialization_data_dicts: Iterable[Dict[str, Any]]

         # Load from file if needed specifying which utterances within each
         # conversation to annotate

From d6de583258d14629ee8742879b78bce4acfbd96d Mon Sep 17 00:00:00 2001
From: Mojtaba Komeili
Date: Mon, 27 Mar 2023 12:37:00 -0400
Subject: [PATCH 2/5] right place for the type

---
 parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py      | 2 +-
 .../tasks/turn_annotations_static/turn_annotations_blueprint.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
index 002c752cdfe..bff983ce9c1 100644
--- a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
+++ b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
@@ -67,7 +67,7 @@ def __init__(
                 f'subtasks_per_unit must be greater than zero but was {self.subtasks_per_unit}'
             )

-        self.raw_data = self._initialization_data_dicts: Iterable[Dict[str, Any]]
+        self.raw_data: Iterable[Dict[str, Any]] = self._initialization_data_dicts

         # Now chunk the data into groups of
         grouped_data = []
diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
index 098c7319881..601ca157938 100644
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
+++ b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
@@ -131,7 +131,7 @@ def __init__(
                 f'subtasks_per_unit must be greater than zero but was {self.subtasks_per_unit}'
            )

-        self.raw_data = self._initialization_data_dicts: Iterable[Dict[str, Any]]
+        self.raw_data: Iterable[Dict[str, Any]] = self._initialization_data_dicts

         # Load from file if needed specifying which utterances within each
         # conversation to annotate

From ed883c604a0aa4a0223eb5943ef81026943c84f0 Mon Sep 17 00:00:00 2001
From: Mojtaba Komeili
Date: Mon, 27 Mar 2023 12:43:55 -0400
Subject: [PATCH 3/5] added the iterable

---
 parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
index bff983ce9c1..1d8c23b344b 100644
--- a/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
+++ b/parlai/crowdsourcing/tasks/dialcrowd/dialcrowd_blueprint.py
@@ -8,7 +8,7 @@
 import logging
 import os
 from dataclasses import dataclass, field
-from typing import Any, Dict, TYPE_CHECKING
+from typing import Any, Dict, Iterable, TYPE_CHECKING

 from mephisto.operations.registry import register_mephisto_abstraction
 from mephisto.abstractions.blueprint import SharedTaskState

From 321b0e0c5fc16df4f4a36adaa2975b00ee49f6c0 Mon Sep 17 00:00:00 2001
From: Mojtaba Komeili
Date: Mon, 27 Mar 2023 12:45:00 -0400
Subject: [PATCH 4/5] added the iterable 2

---
 .../tasks/turn_annotations_static/turn_annotations_blueprint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
index 601ca157938..06a7649605f 100644
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
+++ b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
@@ -10,7 +10,7 @@
 import os
 import random
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, TYPE_CHECKING
+from typing import Any, Dict, Iterable, List, Optional, TYPE_CHECKING

 import numpy as np
 from mephisto.operations.registry import register_mephisto_abstraction

From 1998e452eb6b5e57317388139a934b1f8beebe7a Mon Sep 17 00:00:00 2001
From: Mojtaba
Date: Mon, 27 Mar 2023 10:01:49 -0700
Subject: [PATCH 5/5] long lines

---
 .../turn_annotations_blueprint.py | 36 +++++++++++++-------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
index 06a7649605f..389e68d5913 100644
--- a/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
+++ b/parlai/crowdsourcing/tasks/turn_annotations_static/turn_annotations_blueprint.py
@@ -57,7 +57,9 @@ class TurnAnnotationsStaticBlueprintArgs(StaticReactBlueprintArgs):
     annotation_indices_jsonl: Optional[str] = field(
         default=None,
         metadata={
-            "help": "Specify which utterance indices to annotate per conversation in a JSONL file. Must be same length as conversations data-jsonl file. See example file in task_config/annotation_indices_example.jsonl"
+            "help": "Specify which utterance indices to annotate per conversation in a JSONL file. "
+            "Must be same length as conversations data-jsonl file. "
+            "See example file in task_config/annotation_indices_example.jsonl"
         },
     )
     annotation_last_only: Optional[bool] = field(
@@ -87,8 +89,8 @@ class TurnAnnotationsStaticBlueprintArgs(StaticReactBlueprintArgs):
     annotations_config_path: str = field(
         default="",
         metadata={
-            "help": "As per Turn Annotations task, path to annotation buckets which will be checkboxes in the frontend for worker to annotate an utterance. Set to "
-            " to disable checkboxes."
+            "help": "As per Turn Annotations task, path to annotation buckets which will be checkboxes in"
+            " the frontend for worker to annotate an utterance. Set to disable checkboxes."
         },
     )
     response_field: bool = field(
@@ -152,7 +154,8 @@ def __init__(
                 line = f.readline()
             if len(self.annotation_indices) != len(self.raw_data):
                 raise Exception(
-                    f'Cannot specify a different length of annotation indices ({len(self.annotation_indices)}) than conversations ({len(self.raw_data)}).'
+                    f'Cannot specify a different length of annotation indices '
+                    f'({len(self.annotation_indices)}) than conversations ({len(self.raw_data)}).'
                 )
             # TODO: should check that utterances specified are all bot
             # utterances (agent_idx == 1)
@@ -237,7 +240,9 @@ def process_data(self, data_dicts, annotation_indices=None):
                 # or bias the turkers
                 if len(annotation_indices[conv_idx]) > 1:
                     logging.info(
-                        f'Splitting {len(annotation_indices[conv_idx])} separate problematic utterance annotations in the same conversation into two separate conversations for this task. This avoids biasing the turkers with utterances that may come after one of the annotations.'
+                        f'Splitting {len(annotation_indices[conv_idx])} separate problematic utterance '
+                        'annotations in the same conversation into two separate conversations for this task. '
+                        'This avoids biasing the turkers with utterances that may come after one of the annotations.'
                     )
                     for a in annotation_indices[conv_idx]:
                         processed_dialog = self._process_conversation(d, [a])
@@ -259,7 +264,9 @@ def process_data(self, data_dicts, annotation_indices=None):
                 output.append(processed_dialog)

         print(
-            f'process_data: Processed {len(data_dicts)} total conversations into {len(output)} conversations in the full data with {total_annotation_count} total turn annotations. (Does not account for units per assignment value - i.e. multiple annotations.)'
+            f'process_data: Processed {len(data_dicts)} total conversations into {len(output)} '
+            f'conversations in the full data with {total_annotation_count} total turn annotations. '
+            f'(Does not account for units per assignment value - i.e. multiple annotations.)'
         )

         np.random.shuffle(output)
@@ -326,9 +333,12 @@ def _process_conversation(self, d, annotation_indices: Optional[List[int]] = Non
             raise Exception(
                 f'Conversation had {adjusted_turn_idx} but max_turn_to_show was {max_turn_to_show}'
             )
-        assert any(
-            nd['do_annotate'] for nd in new_dialogue
-        ), f'Have to annotate at least one index in the conversation! But new_dialogue was: {new_dialogue}, raw dialogue was: {d["dialog"]}, annotation_indices was: {annotation_indices}, length of dialogue was {len(new_dialogue)}, adjusted_turn_idx was: {adjusted_turn_idx}, max_turn_to_show: {max_turn_to_show}'
+        assert any(nd['do_annotate'] for nd in new_dialogue), (
+            f'Have to annotate at least one index in the conversation! But new_dialogue was: {new_dialogue},'
+            f' raw dialogue was: {d["dialog"]}, annotation_indices was: {annotation_indices}, '
+            f'length of dialogue was {len(new_dialogue)}, adjusted_turn_idx was: {adjusted_turn_idx}, '
+            f'max_turn_to_show: {max_turn_to_show}'
+        )

         return new_dialogue

@@ -339,7 +349,10 @@ class TurnAnnotationsStaticInFlightQABlueprintArgs(TurnAnnotationsStaticBlueprin
     _group: str = field(
         default="TurnAnnotationsStaticInFlightQABlueprint",
         metadata={
-            'help': """This task mixes in a live onboarding as the last subtask (in addition to an onboarding at the start), and actually increases the number of subtasks per unit by 1."""
+            'help': (
+                "This task mixes in a live onboarding as the last subtask (in addition to an onboarding at the start),"
+                " and actually increases the number of subtasks per unit by 1."
+            )
         },
     )
     onboarding_in_flight_data: str = field(
@@ -401,5 +414,6 @@ def __init__(
         self.subtasks_per_unit = len(chunk)

         print(
-            f'{self.__class__.__name__}: Grouped data into {len(self._initialization_data_dicts)} tasks with {self.subtasks_per_unit} subtasks each (added in-flight qualification task).'
+            f'{self.__class__.__name__}: Grouped data into {len(self._initialization_data_dicts)}'
+            f'tasks with {self.subtasks_per_unit} subtasks each (added in-flight qualification task).'
         )
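
The form these patches converge on is standard PEP 526: the type annotation goes on the assignment target, not after the right-hand expression (the form introduced in patch 1, `x = value: Iterable[...]`, is a SyntaxError, which patch 2 fixes), and any name used in the annotation, here Iterable, must be imported from typing (patches 3 and 4). The sketch below is a minimal, self-contained illustration of the two idioms used in this series, the annotated assignment and implicit string concatenation for wrapping long literals; the sample value assigned to raw_data is made up for the example and is not taken from the patches.

    from typing import Any, Dict, Iterable

    # PEP 526: the annotation sits between the target and the '=' sign.
    # Writing `raw_data = value: Iterable[...]` (the patch-1 form) fails to parse;
    # patch 2 moves the annotation to the correct position.
    raw_data: Iterable[Dict[str, Any]] = [{"dialog": []}]  # made-up sample value

    # Long messages are wrapped with implicit string concatenation inside
    # parentheses, as in patch 5; adjacent string literals are joined at compile
    # time, so each piece needs its own leading or trailing space.
    message = (
        "Specify which utterance indices to annotate per conversation in a JSONL file. "
        "Must be same length as conversations data-jsonl file."
    )
    print(message)

Any Python 3 interpreter accepts the annotated form above, whereas the patch-1 form is rejected at parse time (flake8 reports it as E999).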