From 150aaac19b56965ca24d4df1e136cbafa6c82f74 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Wed, 18 Sep 2019 21:34:13 -0700 Subject: [PATCH 01/13] fix logdir --- train.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/train.py b/train.py index 21027bc..2978bea 100644 --- a/train.py +++ b/train.py @@ -231,19 +231,19 @@ def inference(engine, batch): tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) - checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3) + checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=3) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation - torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin') - getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME)) - tokenizer.save_vocabulary(tb_logger.writer.log_dir) + torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.logdir, CONFIG_NAME)) + tokenizer.save_vocabulary(tb_logger.writer.logdir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: - os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close() if __name__ == "__main__": From 68f926b2a44c75a625c1ec7dc4c9792064c59f9b Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 11:33:57 -0700 Subject: [PATCH 02/13] Migrate eval code --- README.md | 2 +- convai_evaluation.py | 3 +-- interact.py | 11 ++++------- requirements.txt | 6 +++--- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 089844e..b904fb3 100644 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ This model should give a Hits@1 over 79, perplexity of 20.5 and F1 of 16.5 using These numbers are slightly lower than the number we obtained in the ConvAI2 competition. Here is what you can tweak to reach the same results: -- in the ConvAI2 competition we also used tweaked position emebddings so that the history of the dialog always start at with the same embeddings. This is easy to add with pytorch-pretrained-bert and should improve the hits@1 metric. +- in the ConvAI2 competition we also used tweaked position emebddings so that the history of the dialog always start at with the same embeddings. This is easy to add with pytorch-transformers and should improve the hits@1 metric. - in the ConvAI2 competition we used a beam search decoder. 
While the results are better in term of f1 metric, our feeling is that the human experience is les compelling with beam search versus the nucleus sampling detector which is provided in the present repository. ## Using the interaction script diff --git a/convai_evaluation.py b/convai_evaluation.py index 5a28de9..abafdce 100644 --- a/convai_evaluation.py +++ b/convai_evaluation.py @@ -17,7 +17,7 @@ from projects.convai2.eval_f1 import eval_f1, setup_args as setup_args_f1 from projects.convai2.eval_ppl import eval_ppl, setup_args as setup_args_ppl from projects.convai2.build_dict import build_dict -from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer +from pytorch_transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer from train import build_input_from_segments, pad_dataset, SPECIAL_TOKENS from utils import download_pretrained_model, AttrDict @@ -64,7 +64,6 @@ def __init__(self, opt, shared=None): else: self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint) self.model_checkpoint.to(args.device) - self.model_checkpoint.eval() self.logger.info("Build BPE prefix dictionary") convai_dict = build_dict() diff --git a/interact.py b/interact.py index 169bcd7..7e233a8 100644 --- a/interact.py +++ b/interact.py @@ -11,7 +11,7 @@ import torch import torch.nn.functional as F -from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer +from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer from train import SPECIAL_TOKENS, build_input_from_segments from utils import get_dataset_personalities, download_pretrained_model @@ -60,14 +60,13 @@ def sample_sequence(personality, history, tokenizer, model, args, current_output current_output = [] for i in range(args.max_length): - instance, sequence = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False) + instance, _ = build_input_from_segments(personality, history, current_output, tokenizer, with_eos=False) input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) - logits = model(input_ids, token_type_ids=token_type_ids) - - if "gpt2" == args.model: + logits, = model(input_ids, token_type_ids=token_type_ids) + if isinstance(logits, tuple): # for gpt2 and maybe others logits = logits[0] logits = logits[0, -1, :] / args.temperature logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) @@ -118,9 +117,7 @@ def run(): tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained(args.model_checkpoint) - model.to(args.device) - model.eval() logger.info("Sample a personality") personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) diff --git a/requirements.txt b/requirements.txt index a96c023..af30e3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch pytorch-ignite -pytorch-pretrained-bert >= 0.6.2 -tensorboardX -tensorflow # for tensorboardX \ No newline at end of file +pytorch-transformers>=1.2 +git+https://github.com/lanpa/tensorboardX +tensorflow # for tensorboardX From bfdd032f101303a15e8ce10fa385bebeb7100290 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 11:34:51 -0700 Subject: [PATCH 03/13] 
Utils import

---
 utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils.py b/utils.py
index 6061889..91a5c12 100644
--- a/utils.py
+++ b/utils.py
@@ -9,7 +9,7 @@
 
 import torch
 
-from pytorch_pretrained_bert import cached_path
+from pytorch_transformers import cached_path
 
 PERSONACHAT_URL = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
 HF_FINETUNED_MODEL = "https://s3.amazonaws.com/models.huggingface.co/transfer-learning-chatbot/finetuned_chatbot_gpt.tar.gz"

From e7d6e7b68316e2733422ef64a155865388672647 Mon Sep 17 00:00:00 2001
From: sshleifer
Date: Thu, 19 Sep 2019 11:35:21 -0700
Subject: [PATCH 04/13] Train.py and tokenizer test

---
 test_special_tokens.py | 32 ++++++++++++++++++++++++++++++++
 train.py               | 54 ++++++++++++++++++++++++++++--------------------
 2 files changed, 66 insertions(+), 20 deletions(-)
 create mode 100644 test_special_tokens.py

diff --git a/test_special_tokens.py b/test_special_tokens.py
new file mode 100644
index 0000000..fb39ce7
--- /dev/null
+++ b/test_special_tokens.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+import shutil
+import unittest
+
+from pytorch_transformers import OpenAIGPTTokenizer, GPT2Tokenizer
+from train import ATTR_TO_SPECIAL_TOKEN, SPECIAL_TOKENS
+
+class TestSpecialTokenTreatment(unittest.TestCase):
+
+    def setUp(self):
+        self.save_dir = Path('utest_save_dir')
+        self.save_dir.mkdir(exist_ok=True)
+
+    def tearDown(self):
+        shutil.rmtree(self.save_dir)
+
+    def test_special_tokens_checkpoint_behavior(self):
+        toks = [OpenAIGPTTokenizer.from_pretrained('openai-gpt'), GPT2Tokenizer.from_pretrained('gpt2')]
+        for tok in toks:
+            self.assertEqual(len(tok.added_tokens_encoder), 0)
+            tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
+            self.assertEqual(len(tok.added_tokens_encoder), 5)
+            # Make sure we never split the special tokens
+            self.assertEqual(len(tok.tokenize("<speaker1> <speaker2>")), 2)
+            ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
+            self.assertTrue(all([x > 0 for x in ids]),
+                            f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
+            # Need to maintain indices through save. (this is also tested in pytorch-transformers)
+            tok.save_pretrained(self.save_dir)
+            tok_loaded = tok.from_pretrained(str(self.save_dir))
+            ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
+            self.assertListEqual(ids, ids2)
diff --git a/train.py b/train.py
index 2978bea..c2889f3 100644
--- a/train.py
+++ b/train.py
@@ -16,12 +16,14 @@
 from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
 from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
 from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
-from pytorch_pretrained_bert import (OpenAIAdam, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
+from pytorch_transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+                                  GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)
 
 from utils import get_dataset
 
 SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
+ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
+                         'additional_special_tokens': ('<speaker1>', '<speaker2>')}
 MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
 PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
 
@@ -37,7 +39,7 @@ def average_distributed_scalar(scalar, args):
 
 
 def pad_dataset(dataset, padding=0):
-    """ Pad the dataset. This could be optimized by defining a Dataset class and padd only batches but this is simpler. 
""" + """ Pad the dataset. This could be optimized by defining a Dataset class and padding at the batch level, but this is simpler. """ max_l = max(len(x) for x in dataset["input_ids"]) for name in PADDED_INPUTS: dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]] @@ -45,7 +47,7 @@ def pad_dataset(dataset, padding=0): def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True): - """ Build a sequence of input from 3 segments: persona, history and last reply """ + """ Build a sequence of input from 3 segments: persona, history and last reply. """ bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) instance = {} @@ -58,7 +60,7 @@ def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=Fals instance["lm_labels"] = [-1] * len(instance["input_ids"]) if lm_labels: instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:] - return instance, sequence + return instance, sequence # TODO: second arg is never used, delete it def get_data_loaders(args, tokenizer): @@ -141,15 +143,20 @@ def train(): args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') - logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") - tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer + logger.info("Prepare tokenizer, pretrained model and optimizer.") + tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) + + model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(args.model_checkpoint) - tokenizer.set_special_tokens(SPECIAL_TOKENS) - model.set_num_special_tokens(len(SPECIAL_TOKENS)) model.to(args.device) - optimizer = OpenAIAdam(model.parameters(), lr=args.lr) + # Add special tokens if they are not already added + orig_num_tokens = len(tokenizer.encoder) + num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # returns 0 and doesn't add if they are already loaded + if num_added_tokens > 0: + model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) # use vocab_size after PR + optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) if args.fp16: @@ -165,7 +172,11 @@ def train(): def update(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) - lm_loss, mc_loss = model(*batch) + input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch + (lm_loss), (mc_loss), *_ = model( + input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, + mc_labels=mc_labels, lm_labels=lm_labels + ) loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: @@ -187,8 +198,10 @@ def inference(engine, batch): batch = tuple(input_tensor.to(args.device) for input_tensor in batch) input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch logger.info(tokenizer.decode(input_ids[0, -1, :].tolist())) - model_outputs = model(input_ids, 
mc_token_ids, token_type_ids=token_type_ids) - lm_logits, mc_logits = model_outputs[0], model_outputs[1] # So we can also use GPT2 outputs + # if we dont send labels to model, it doesnt return losses + lm_logits, mc_logits, *_ = model( + input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, + ) lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1)) lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1) return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels) @@ -210,7 +223,7 @@ def inference(engine, batch): scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) - # Prepare metrics - note how we compute distributed metrics + # Prepare metrics - note how we compute distributed metrics RunningAverage(output_transform=lambda x: x).attach(trainer, "loss") metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])), "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))} @@ -227,23 +240,24 @@ def inference(engine, batch): evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) tb_logger = TensorboardLogger(log_dir=None) + log_dir = tb_logger.writer.logdir # to save typing tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) - checkpoint_handler = ModelCheckpoint(tb_logger.writer.logdir, 'checkpoint', save_interval=1, n_saved=3) - trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation + checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3) + trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" takes care of distributed encapsulation - torch.save(args, tb_logger.writer.logdir + '/model_training_args.bin') - getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.logdir, CONFIG_NAME)) - tokenizer.save_vocabulary(tb_logger.writer.logdir) + torch.save(args, log_dir + '/model_training_args.bin') + getattr(model, 'module', model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME)) + tokenizer.save_pretrained(log_dir) # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method) if args.local_rank in [-1, 0] and args.n_epochs > 0: - os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.logdir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) + os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close() if __name__ == "__main__": From 517eb77f3fd0fd74a231cf82670d4b218fdeb4ea Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 11:46:48 -0700 
Subject: [PATCH 05/13] Pin tensorboardx --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index af30e3a..1005759 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ torch pytorch-ignite pytorch-transformers>=1.2 -git+https://github.com/lanpa/tensorboardX +tensorboardX==1.8 tensorflow # for tensorboardX From 2f475cb5cb00fe8b2990ca9f9cf77601a22b1a8d Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 14:11:50 -0700 Subject: [PATCH 06/13] Add compatibility comment to eval --- convai_evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convai_evaluation.py b/convai_evaluation.py index abafdce..89d1fee 100644 --- a/convai_evaluation.py +++ b/convai_evaluation.py @@ -27,7 +27,7 @@ class TransformerAgent(Agent): @staticmethod def add_cmdline_args(argparser): agent_args = argparser.add_argument_group('Agent parameters') - agent_args.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") + agent_args.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model. Must be OpenAIGPT.") agent_args.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history") agent_args.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") agent_args.add_argument("--eval_type", type=str, default="hits@1", help="hits@1, ppl or f1") From 0f56c4af36953ec5a6361ba8e5983ce4df3bf7a5 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 14:42:00 -0700 Subject: [PATCH 07/13] Partial GPT2 compatibility fix --- interact.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interact.py b/interact.py index 7e233a8..8ab6461 100644 --- a/interact.py +++ b/interact.py @@ -65,7 +65,7 @@ def sample_sequence(personality, history, tokenizer, model, args, current_output input_ids = torch.tensor(instance["input_ids"], device=args.device).unsqueeze(0) token_type_ids = torch.tensor(instance["token_type_ids"], device=args.device).unsqueeze(0) - logits, = model(input_ids, token_type_ids=token_type_ids) + logits = model(input_ids, token_type_ids=token_type_ids) if isinstance(logits, tuple): # for gpt2 and maybe others logits = logits[0] logits = logits[0, -1, :] / args.temperature From 2f1207a0d05e908ecc16d96d77a2929506ea1eae Mon Sep 17 00:00:00 2001 From: sshleifer Date: Thu, 19 Sep 2019 22:54:32 +0000 Subject: [PATCH 08/13] add special tokens before interact --- interact.py | 4 +++- train.py | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/interact.py b/interact.py index 8ab6461..df4bfe6 100644 --- a/interact.py +++ b/interact.py @@ -12,7 +12,7 @@ import torch.nn.functional as F from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, GPT2LMHeadModel, GPT2Tokenizer -from train import SPECIAL_TOKENS, build_input_from_segments +from train import SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_ from utils import get_dataset_personalities, download_pretrained_model def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')): @@ -76,6 +76,7 @@ def sample_sequence(personality, history, tokenizer, model, args, current_output if i < args.min_length and prev.item() in special_tokens_ids: while prev.item() in special_tokens_ids: prev = torch.multinomial(probs, num_samples=1) + if probs.max().item() == 1: 
break if prev.item() in special_tokens_ids: break @@ -118,6 +119,7 @@ def run(): model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) + add_special_tokens_(model, tokenizer) logger.info("Sample a personality") personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) diff --git a/train.py b/train.py index c2889f3..52affb2 100644 --- a/train.py +++ b/train.py @@ -46,6 +46,15 @@ def pad_dataset(dataset, padding=0): return dataset +def add_special_tokens_(model, tokenizer): + """ Add special tokens to the tokenizer and the model if they have not already been added. """ + orig_num_tokens = len(tokenizer.encoder) + num_added_tokens = tokenizer.add_special_tokens( + ATTR_TO_SPECIAL_TOKEN) # returns 0 and doesn't add if they are already there + if num_added_tokens > 0: + model.resize_token_embeddings( + new_num_tokens=orig_num_tokens + num_added_tokens) + def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True): """ Build a sequence of input from 3 segments: persona, history and last reply. """ bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1]) @@ -152,10 +161,7 @@ def train(): model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) # Add special tokens if they are not already added - orig_num_tokens = len(tokenizer.encoder) - num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # returns 0 and doesn't add if they are already loaded - if num_added_tokens > 0: - model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens) # use vocab_size after PR + add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) @@ -260,5 +266,7 @@ def inference(engine, batch): os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close() + + if __name__ == "__main__": train() From 8477c2e439014e3c59b8f1be6df68b0b08fcffe9 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Sun, 22 Sep 2019 18:41:24 -0700 Subject: [PATCH 09/13] Convai eval for GPT2 --- convai_evaluation.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/convai_evaluation.py b/convai_evaluation.py index 89d1fee..909eb1c 100644 --- a/convai_evaluation.py +++ b/convai_evaluation.py @@ -17,9 +17,10 @@ from projects.convai2.eval_f1 import eval_f1, setup_args as setup_args_f1 from projects.convai2.eval_ppl import eval_ppl, setup_args as setup_args_ppl from projects.convai2.build_dict import build_dict -from pytorch_transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer +from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, + GPT2DoubleHeadsModel, GPT2LMHeadModel, GPT2Tokenizer) -from train import build_input_from_segments, pad_dataset, SPECIAL_TOKENS +from train import build_input_from_segments, pad_dataset, SPECIAL_TOKENS, add_special_tokens_ from utils import download_pretrained_model, AttrDict from interact import sample_sequence @@ -57,12 +58,14 @@ def __init__(self, opt, shared=None): self.logger.info("Get pretrained model and tokenizer") if args.model_checkpoint == "": args.model_checkpoint = 
download_pretrained_model() - - self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint) - if self.args.eval_type == "hits@1": - self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_checkpoint) + if 'gpt2' in args.model_checkpoint: + self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint) + model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel else: - self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint) + self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint) + model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel + + self.model_checkpoint = model_class.from_pretrained(args.model_checkpoint) self.model_checkpoint.to(args.device) self.logger.info("Build BPE prefix dictionary") @@ -73,7 +76,7 @@ def __init__(self, opt, shared=None): self.model_checkpoint = shared['model'] self.tokenizer = shared['tokenizer'] self.prefix2words = shared['prefix2words'] - + add_special_tokens_(self.model_checkpoint, self.tokenizer) self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) self.persona = [] @@ -135,7 +138,7 @@ def act(self): tensor_inputs[input_name] = tensor with torch.no_grad(): - _, mc_logits = self.model_checkpoint(**tensor_inputs) + mc_logits = self.model_checkpoint(**tensor_inputs)[1] val, ind = torch.sort(mc_logits[0], descending=True) From 1ba3929cc59855caa1ee98120966d8290b193c6b Mon Sep 17 00:00:00 2001 From: sshleifer Date: Sun, 22 Sep 2019 19:50:21 -0700 Subject: [PATCH 10/13] add args.model_checkpoint to logdir path --- train.py | 7 ++++--- utils.py | 12 ++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index 52affb2..655a97a 100644 --- a/train.py +++ b/train.py @@ -19,7 +19,7 @@ from pytorch_transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME) -from utils import get_dataset +from utils import get_dataset, make_logdir SPECIAL_TOKENS = ["", "", "", "", ""] ATTR_TO_SPECIAL_TOKEN = {'bos_token': '', 'eos_token': '', 'pad_token': '', @@ -245,8 +245,9 @@ def inference(engine, batch): pbar.attach(trainer, metric_names=["loss"]) evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics))) - tb_logger = TensorboardLogger(log_dir=None) - log_dir = tb_logger.writer.logdir # to save typing + log_dir = make_logdir(args.model_checkpoint) + tb_logger = TensorboardLogger(log_dir) + tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED) diff --git a/utils.py b/utils.py index 91a5c12..c836531 100644 --- a/utils.py +++ b/utils.py @@ -1,11 +1,14 @@ # Copyright (c) 2019-present, HuggingFace Inc. # All rights reserved. This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+from datetime import datetime import json import logging import os import tarfile import tempfile +import socket + import torch @@ -88,3 +91,12 @@ class AttrDict(dict): def __init__(self, *args, **kwargs): super(AttrDict, self).__init__(*args, **kwargs) self.__dict__ = self + + +def make_logdir(model_name: str): + """Create unique path to save results and checkpoints, e.g. runs/Sep22_19-45-59_gpu-7_gpt2""" + # Code copied from ignite repo + current_time = datetime.now().strftime('%b%d_%H-%M-%S') + logdir = os.path.join( + 'runs', current_time + '_' + socket.gethostname() + '_' + model_name) + return logdir From d4e007f28b338a2a2d9b70b17a63836c22db6b84 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Sun, 22 Sep 2019 19:57:07 -0700 Subject: [PATCH 11/13] comment, warning about infinite loop hack --- interact.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/interact.py b/interact.py index df4bfe6..6aa7d2c 100644 --- a/interact.py +++ b/interact.py @@ -7,6 +7,7 @@ from argparse import ArgumentParser from itertools import chain from pprint import pformat +import warnings import torch import torch.nn.functional as F @@ -75,8 +76,10 @@ def sample_sequence(personality, history, tokenizer, model, args, current_output prev = torch.topk(probs, 1)[1] if args.no_sample else torch.multinomial(probs, 1) if i < args.min_length and prev.item() in special_tokens_ids: while prev.item() in special_tokens_ids: + if probs.max().item() == 1: + warnings.warn("Warning: model generating special token with probability 1.") + break # avoid infinitely looping over special token prev = torch.multinomial(probs, num_samples=1) - if probs.max().item() == 1: break if prev.item() in special_tokens_ids: break From 1a00f96568b7a1c10e18029adb266f33cb5d26e1 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Mon, 23 Sep 2019 09:47:52 -0700 Subject: [PATCH 12/13] cleanup: remove extra newlines --- train.py | 2 -- utils.py | 1 - 2 files changed, 3 deletions(-) diff --git a/train.py b/train.py index 655a97a..87e2e26 100644 --- a/train.py +++ b/train.py @@ -267,7 +267,5 @@ def inference(engine, batch): os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(log_dir, WEIGHTS_NAME)) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close() - - if __name__ == "__main__": train() diff --git a/utils.py b/utils.py index c836531..bdc6297 100644 --- a/utils.py +++ b/utils.py @@ -9,7 +9,6 @@ import tempfile import socket - import torch from pytorch_transformers import cached_path From e92ee7cb69325ca855af1da1cb89609daed58bd8 Mon Sep 17 00:00:00 2001 From: sshleifer Date: Mon, 23 Sep 2019 13:40:26 -0700 Subject: [PATCH 13/13] Pad on the batch level --- train.py | 91 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/train.py b/train.py index 87e2e26..7fd5d3a 100644 --- a/train.py +++ b/train.py @@ -10,6 +10,7 @@ import torch from torch.nn.parallel import DistributedDataParallel +from torch.nn.utils.rnn import pad_sequence from torch.utils.data import DataLoader, TensorDataset from ignite.engine import Engine, Events from ignite.handlers import ModelCheckpoint @@ -72,11 +73,63 @@ def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=Fals return instance, sequence # TODO: second arg is never used, delete it +def pad_and_tensorize(batch_dict, padding): + """ Pad the batch_dict.""" + tensors = [] + for name in MODEL_INPUTS: + if name not in PADDED_INPUTS: + 
tensors.append(torch.tensor(batch_dict[name])) + continue + entry = batch_dict[name] + pad_id = padding if name != "lm_labels" else -1 + padded = pad_sequence([torch.tensor(seq) for x in entry for seq in x], batch_first=True, + padding_value=pad_id) + bs, n_candidates = len(entry), len(entry[0]) + tensors.append(padded.view(bs, n_candidates, -1)) + return tensors + +class ChatDataset(torch.utils.data.Dataset): + + def __init__(self, fields, pad_id): + self.fields = fields + self.pad_id = pad_id + + def __getitem__(self, item) -> dict: + return {f: self.fields[f][item] for f in MODEL_INPUTS} + + def collate_fn(self, examples): + batch_dict = defaultdict(list) + for input_name in MODEL_INPUTS: + for e in examples: + batch_dict[input_name].append(e[input_name]) + tensors = pad_and_tensorize(batch_dict, padding=self.pad_id) + return tensors + + def __len__(self): + return len(self.fields['input_ids']) + + def get_data_loaders(args, tokenizer): """ Prepare the dataset for training and evaluation """ personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) logger.info("Build inputs and labels") + datasets: dict = make_data_lists(args, personachat, tokenizer) + pad_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]) + train_dataset = ChatDataset(datasets['train'], pad_id) + valid_dataset = ChatDataset(datasets['valid'], pad_id) + + logger.info("Build train and validation dataloaders") + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None + valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None + train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed), + collate_fn=train_dataset.collate_fn) + valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False, + collate_fn=valid_dataset.collate_fn) + return train_loader, valid_loader, train_sampler, valid_sampler + + +def make_data_lists(args, personachat, tokenizer): datasets = {"train": defaultdict(list), "valid": defaultdict(list)} for dataset_name, dataset in personachat.items(): num_candidates = len(dataset[0]["utterances"][0]["candidates"]) @@ -86,36 +139,20 @@ def get_data_loaders(args, tokenizer): persona = dialog["personality"].copy() for _ in range(args.personality_permutations): for utterance in dialog["utterances"]: - history = utterance["history"][-(2*args.max_history+1):] + candidate_instances = defaultdict(list) + history = utterance["history"][-(2 * args.max_history + 1):] for j, candidate in enumerate(utterance["candidates"][-num_candidates:]): - lm_labels = bool(j == num_candidates-1) - instance, _ = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels) + lm_labels = bool(j == num_candidates - 1) + instance, _ = build_input_from_segments(persona, history, candidate, + tokenizer, lm_labels) for input_name, input_array in instance.items(): - datasets[dataset_name][input_name].append(input_array) + candidate_instances[input_name].append(input_array) + for k in candidate_instances.keys(): + datasets[dataset_name][k].append(candidate_instances[k]) datasets[dataset_name]["mc_labels"].append(num_candidates - 1) datasets[dataset_name]["n_candidates"] = num_candidates persona = [persona[-1]] + persona[:-1] # permuted personalities - - logger.info("Pad inputs and convert to Tensor") - tensor_datasets = {"train": [], "valid": []} - for dataset_name, dataset in 
datasets.items(): - dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])) - for input_name in MODEL_INPUTS: - tensor = torch.tensor(dataset[input_name]) - if input_name != "mc_labels": - tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) - tensor_datasets[dataset_name].append(tensor) - - logger.info("Build train and validation dataloaders") - train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None - valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None - train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed)) - valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False) - - logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape)) - logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape)) - return train_loader, valid_loader, train_sampler, valid_sampler + return datasets def train(): @@ -155,12 +192,9 @@ def train(): logger.info("Prepare tokenizer, pretrained model and optimizer.") tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer # cant use Autotokenizer because checkpoint could be a Path tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) - - model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) - # Add special tokens if they are not already added add_special_tokens_(model, tokenizer) optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True) @@ -247,7 +281,6 @@ def inference(engine, batch): log_dir = make_logdir(args.model_checkpoint) tb_logger = TensorboardLogger(log_dir) - tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)
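
Note on the batch-level padding introduced in PATCH 13: padding moves out of pad_dataset and into the DataLoader collate function, so each batch is only padded up to its own longest sequence instead of the longest sequence in the whole corpus. The short, self-contained Python sketch below illustrates the reshape to (batch, candidates, sequence length) on toy data; the helper name collate_candidates and the toy id lists are illustrative and not part of the patch.

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_candidates(batch, pad_id=0):
    # batch: list of examples, each a list of n_candidates variable-length token-id lists
    flat = [torch.tensor(seq) for example in batch for seq in example]
    padded = pad_sequence(flat, batch_first=True, padding_value=pad_id)  # (bs * n_candidates, max_len)
    bs, n_candidates = len(batch), len(batch[0])
    return padded.view(bs, n_candidates, -1)

# Toy usage: two examples with two candidate replies each, padded to the batch max of 4.
batch = [
    [[5, 6, 7], [5, 6]],
    [[8, 9, 10, 11], [8, 9, 10]],
]
print(collate_candidates(batch).shape)  # torch.Size([2, 2, 4])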
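
Note on the checkpoint handling touched by PATCH 01 and PATCH 10: renaming the last ignite checkpoint to WEIGHTS_NAME, writing CONFIG_NAME via config.to_json_file and saving the tokenizer into the same run directory is what makes that directory loadable with from_pretrained afterwards (the id round-trip is what test_special_tokens.py verifies). A minimal sketch, assuming a hypothetical run directory of the form produced by make_logdir:

from pytorch_transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

run_dir = "runs/Sep22_19-45-59_gpu-7_openai-gpt"  # hypothetical path; see make_logdir()

model = OpenAIGPTDoubleHeadsModel.from_pretrained(run_dir)  # reads CONFIG_NAME and WEIGHTS_NAME
tokenizer = OpenAIGPTTokenizer.from_pretrained(run_dir)     # restores the saved vocab and added special tokens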
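
Note on the decoding comparison in the README hunk of PATCH 02: the repository's interact.py already ships a top_filtering helper for top-k/top-p (nucleus) sampling, whose body is not shown in these diffs. The sketch below is not that implementation, only a compact illustration of the top-p idea the README refers to.

import torch
import torch.nn.functional as F

def nucleus_filter(logits, top_p=0.9, filter_value=-float('Inf')):
    # Keep the smallest set of tokens whose cumulative probability exceeds top_p; mask the rest.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
    mask = cumulative_probs > top_p
    mask[1:] = mask[:-1].clone()  # shift right so the first token crossing the threshold is kept
    mask[0] = False               # always keep the single most likely token
    filtered = logits.clone()
    filtered[sorted_indices[mask]] = filter_value
    return filtered

# Sample one next-token id from the filtered, re-normalized distribution.
logits = torch.randn(100)
probs = F.softmax(nucleus_filter(logits, top_p=0.9), dim=-1)
next_id = torch.multinomial(probs, num_samples=1)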