Generalize model chat analysis code (#3844)

* Start to fix some analysis issues
* Dealing with static turn annotations
* Start to disable some problem stuff
* Rename var
* Work on fixes
* Add more bucket flags
* Update worker columns
* Hook for new function
* Add in passthrough method
* Fix
* Add none bucket
* Cosmetic
* Fix test
* Update test
* Parametrize test
* Dump in new cases
* Start of new test
* Finish new tests
* Start revising test
* Format JSONs
* More fixes
facebookresearch · Jul 26, 2021 · e89a77e · e89a77e
1 parent 32945a3
commit e89a77e
Show file tree

Hide file tree

Showing 26 changed files with 3,063 additions and 372 deletions.
diff --git a/parlai/crowdsourcing/tasks/model_chat/analysis/compile_results.py b/parlai/crowdsourcing/tasks/model_chat/analysis/compile_results.py
diff --git a/parlai/crowdsourcing/tasks/model_chat/task_config/annotations_config.json b/parlai/crowdsourcing/tasks/model_chat/task_config/annotations_config.json
@@ -23,5 +23,10 @@
         "value": "bucket_4",
         "name": "Bucket 4",
         "description": "this response implies something...4"
+    },
+    {
+        "value": "none_all_good",
+        "name": "None, all good",
+        "description": "This response implies that there are no problems with the data"
     }
 ]
diff --git a/parlai/crowdsourcing/tasks/turn_annotations_static/analysis/compile_results.py b/parlai/crowdsourcing/tasks/turn_annotations_static/analysis/compile_results.py
@@ -53,6 +53,10 @@ def __init__(self, opt: Dict[str, Any]):
         super().__init__(opt)
         self.onboarding_in_flight_data_file = opt.get('onboarding_in_flight_data_file')
         self.gold_annotations_file = opt.get('gold_annotations_file')
+        if not self.use_problem_buckets:
+            raise ValueError(
+                'Problem buckets must be used when analyzing results from the static turn annotations task!'
+            )
 
     def get_data_paths_mephisto(self, task_run_id_folder):
         """

diff --git a/parlai/crowdsourcing/utils/analysis.py b/parlai/crowdsourcing/utils/analysis.py
@@ -109,7 +109,7 @@ def setup_args(cls):
         parser.add_argument(
             '--problem-buckets',
             type=str,
-            help='Comma-separated list of buckets used for annotation',
+            help='Comma-separated list of buckets used for annotation. Set to an empty string to not analyze problem buckets.',
             default='bucket_0,bucket_1,bucket_2,bucket_3,bucket_4,none_all_good',
         )
         return parser
@@ -123,10 +123,15 @@ def __init__(self, opt: Dict[str, Any]):
             self.results_folders = opt['results_folders'].split(',')
         else:
             self.results_folders = None
-        self.problem_buckets = opt['problem_buckets'].split(',')
+        if opt['problem_buckets'].lower() not in ['', 'none']:
+            self.use_problem_buckets = True
+            self.problem_buckets = opt['problem_buckets'].split(',')
+        else:
+            self.use_problem_buckets = False
+            self.problem_buckets = []
 
         # Validate problem buckets
-        if 'none_all_good' not in self.problem_buckets:
+        if self.use_problem_buckets and 'none_all_good' not in self.problem_buckets:
             # The code relies on a catchall "none" category if the user selects no other
             # annotation bucket
             raise ValueError(

diff --git a/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_1_live.json b/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_1_live.json
diff --git a/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_2_live.json b/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_2_live.json
diff --git a/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_3_live.json b/tests/crowdsourcing/tasks/model_chat/analysis_samples/2020_12_29/20201229_173200_3_live.json
diff --git a/tests/crowdsourcing/tasks/model_chat/analysis_samples/basic/2020_12_29/20201229_173200_1_live.json b/tests/crowdsourcing/tasks/model_chat/analysis_samples/basic/2020_12_29/20201229_173200_1_live.json
@@ -0,0 +1,253 @@
+{
+    "personas": null,
+    "context_dataset": null,
+    "person1_seed_utterance": null,
+    "person2_seed_utterance": null,
+    "additional_context": null,
+    "dialog": [
+        {
+            "episode_done": false,
+            "id": "Worker",
+            "text": "Hi!",
+            "fake_start": true,
+            "agent_idx": 0,
+            "message_id": "MESSAGE_ID"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 0",
+            "id": "TransformerGenerator"
+        },
+        {
+            "agent_idx": 0,
+            "text": "Utterance placeholder line 0 1",
+            "id": "Worker"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 1",
+            "id": "TransformerGenerator"
+        },
+        {
+            "agent_idx": 0,
+            "text": "Utterance placeholder line 0 2",
+            "id": "Worker"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 2",
+            "id": "TransformerGenerator"
+        },
+        {
+            "agent_idx": 0,
+            "text": "Utterance placeholder line 0 3",
+            "id": "Worker"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 3",
+            "id": "TransformerGenerator"
+        },
+        {
+            "agent_idx": 0,
+            "text": "Utterance placeholder line 0 4",
+            "id": "Worker"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 4",
+            "id": "TransformerGenerator"
+        },
+        {
+            "agent_idx": 0,
+            "text": "Utterance placeholder line 0 5",
+            "id": "Worker"
+        },
+        {
+            "agent_idx": 1,
+            "text": "Utterance placeholder line 1 5",
+            "id": "TransformerGenerator",
+            "final_rating": "1"
+        }
+    ],
+    "workers": [
+        "WORKER_1"
+    ],
+    "bad_workers": [],
+    "acceptability_violations": [
+        null
+    ],
+    "hit_ids": [
+        "HIT_ID_1"
+    ],
+    "assignment_ids": [
+        "ASSIGNMENT_ID_1"
+    ],
+    "task_description": {
+        "annotations_config": null,
+        "model_nickname": "blender_90M",
+        "model_file": "/private/home/user/GitHub/facebookresearch/ParlAI/data/models/blender/blender_90M/model",
+        "model_opt": {
+            "init_opt": null,
+            "show_advanced_args": false,
+            "task": "internal:blended_skill_talk,wizard_of_wikipedia,convai2,empathetic_dialogues",
+            "datatype": "train",
+            "image_mode": "raw",
+            "numthreads": 1,
+            "hide_labels": false,
+            "multitask_weights": [
+                1.0,
+                3.0,
+                3.0,
+                3.0
+            ],
+            "batchsize": 16,
+            "model": "transformer/generator",
+            "model_file": "/private/home/user/GitHub/facebookresearch/ParlAI/data/models/blender/blender_90M/model",
+            "init_model": "/checkpoint/parlai/zoo/new_reddit/newreddit_trained20190909_usedfordodeca/model",
+            "dict_class": "parlai.core.dict:DictionaryAgent",
+            "evaltask": null,
+            "eval_batchsize": null,
+            "display_examples": false,
+            "num_epochs": -1,
+            "max_train_time": -1,
+            "validation_every_n_secs": -1,
+            "save_every_n_secs": 60.0,
+            "save_after_valid": true,
+            "validation_every_n_epochs": 0.25,
+            "validation_max_exs": 20000,
+            "short_final_eval": false,
+            "validation_patience": 15,
+            "validation_metric": "ppl",
+            "validation_metric_mode": "min",
+            "validation_cutoff": 1.0,
+            "validation_share_agent": false,
+            "aggregate_micro": false,
+            "metrics": "default",
+            "tensorboard_log": false,
+            "dict_maxexs": -1,
+            "dict_include_valid": false,
+            "dict_include_test": false,
+            "log_every_n_secs": 2,
+            "image_size": 256,
+            "image_cropsize": 224,
+            "label_type": "response",
+            "include_knowledge": true,
+            "include_checked_sentence": true,
+            "include_knowledge_separator": false,
+            "num_topics": 5,
+            "train_experiencer_only": false,
+            "embedding_size": 512,
+            "n_layers": 8,
+            "ffn_size": 2048,
+            "dropout": 0.1,
+            "attention_dropout": 0.0,
+            "relu_dropout": 0.0,
+            "n_heads": 16,
+            "learn_positional_embeddings": true,
+            "embeddings_scale": true,
+            "n_positions": 512,
+            "n_segments": 0,
+            "variant": "xlm",
+            "activation": "gelu",
+            "output_scaling": 1.0,
+            "share_word_embeddings": true,
+            "beam_size": 10,
+            "beam_min_length": 20,
+            "beam_context_block_ngram": 3,
+            "beam_block_ngram": 3,
+            "beam_length_penalty": 0.65,
+            "skip_generation": false,
+            "inference": "beam",
+            "topk": 10,
+            "topp": 0.9,
+            "compute_tokenized_bleu": false,
+            "embedding_type": "random",
+            "embedding_projection": "random",
+            "fp16": true,
+            "fp16_impl": "safe",
+            "force_fp16_tokens": true,
+            "optimizer": "adamax",
+            "learningrate": 7.5e-06,
+            "gradient_clip": 0.1,
+            "adam_eps": 1e-08,
+            "adafactor_eps": [
+                1e-30,
+                0.001
+            ],
+            "momentum": 0,
+            "nesterov": true,
+            "nus": [
+                0.7
+            ],
+            "betas": [
+                0.9,
+                0.999
+            ],
+            "weight_decay": null,
+            "rank_candidates": false,
+            "truncate": -1,
+            "text_truncate": 512,
+            "label_truncate": 128,
+            "history_size": -1,
+            "person_tokens": false,
+            "split_lines": false,
+            "use_reply": "label",
+            "add_p1_after_newln": false,
+            "delimiter": "\n",
+            "gpu": -1,
+            "no_cuda": false,
+            "dict_file": "/private/home/user/GitHub/facebookresearch/ParlAI/data/models/blender/blender_90M/model.dict",
+            "dict_initpath": null,
+            "dict_language": "english",
+            "dict_max_ngram_size": -1,
+            "dict_minfreq": 0,
+            "dict_maxtokens": -1,
+            "dict_nulltoken": "__null__",
+            "dict_starttoken": "__start__",
+            "dict_endtoken": "__end__",
+            "dict_unktoken": "__unk__",
+            "dict_tokenizer": "bpe",
+            "dict_lower": true,
+            "bpe_debug": false,
+            "dict_textfields": "text,labels",
+            "lr_scheduler": "reduceonplateau",
+            "lr_scheduler_patience": 3,
+            "lr_scheduler_decay": 0.5,
+            "max_lr_steps": -1,
+            "invsqrt_lr_decay_gamma": -1,
+            "warmup_updates": -1,
+            "warmup_rate": 0.0001,
+            "update_freq": 1,
+            "parlai_home": "/private/home/user/ParlAI",
+            "starttime": "Feb10_07-25",
+            "model_parallel": true,
+            "beam_block_full_context": false,
+            "allow_missing_init_opts": false,
+            "download_path": null,
+            "loglevel": "info",
+            "dynamic_batching": null,
+            "verbose": false,
+            "is_debug": false,
+            "datapath": "/private/home/user/GitHub/facebookresearch/ParlAI/data",
+            "n_encoder_layers": -1,
+            "n_decoder_layers": -1,
+            "beam_delay": 30,
+            "beam_block_list_filename": null,
+            "temperature": 1.0,
+            "interactive_mode": false,
+            "history_reversed": false,
+            "history_add_global_end_token": null,
+            "special_tok_lst": null,
+            "bpe_vocab": null,
+            "bpe_merge": null,
+            "bpe_add_prefix_space": null,
+            "bpe_dropout": null,
+            "override": {
+                "model_parallel": true,
+                "model_file": "/private/home/user/GitHub/facebookresearch/ParlAI/data/models/blender/blender_90M/model"
+            },
+            "dict_loaded": true
+        }
+    }
+}