From b272f7eb0ad391e1fd21c4c94cff2545928c4d8d Mon Sep 17 00:00:00 2001 From: Neil <1522674529@qq.com> Date: Sun, 9 Feb 2020 18:02:56 +0800 Subject: [PATCH] newest --- README.md | 14 +++++++------- mgan/data/imdb_tensor.py | 4 ++-- mgan/data/vocab_builder.py | 12 ++++++++---- mgan/main.py | 15 ++++++++------- mgan/models/critic.py | 7 +++---- mgan/models/discriminator.py | 7 +++---- mgan/preproc/tokenize.py | 5 +++-- 7 files changed, 34 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 3c5ea85..b3727a0 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # MaskGAN.pytorch -A PyTorch attempt at reimplementing +A PyTorch attempt at reimplementing * MaskGAN: Better Text Generation via Filling in the _______ , William Fedus, Ian Goodfellow, Andrew M. Dai [[paper]](https://openreview.net/pdf?id=ByOExmWAb) - -**This is a work in progress.** +==Solved some bugs in original repository== +https://github.com/jerinphilip/MaskGAN.pytorch # Setting up @@ -28,12 +28,12 @@ python3 -m pip install git+https://github.com/pytorch/fairseq #### IMDB Reviews Dataset ``` -mkdir datasets +mkdir datasets cd datasets IMDB_DATASET='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz' wget $IMDB_DATASET -O aclImdb_v1.tar.gz tar xvzf aclImdb_v1.tar.gz -``` +``` #### Training @@ -48,6 +48,6 @@ Run the training script. ``` python3 -m mgan.main \ - --path datasets/aclImdb/train/ \ - --spm_path datasets/aclImdb/train/imdb.model +--path datasets/aclImdb/ \ +--spm_prefix datasets/aclImdb/train/imdb ``` diff --git a/mgan/data/imdb_tensor.py b/mgan/data/imdb_tensor.py index 1680240..3ebc4ef 100644 --- a/mgan/data/imdb_tensor.py +++ b/mgan/data/imdb_tensor.py @@ -22,7 +22,7 @@ def __init__(self, path, tokenizer, mask_builder, truncate_length, vocab=None): def _construct_vocabulary(self): if self.vocab is None: raw_dataset = IMDbDataset(self.path) - builder = VocabBuilder(raw_dataset, self.tokenizer, self.path) + builder = VocabBuilder(raw_dataset, self.tokenizer, self.path, self.mask_builder) self.vocab = builder.vocab() def __len__(self): @@ -76,7 +76,7 @@ def collate(samples): lengths = torch.LongTensor(lengths) lengths, sort_order = lengths.sort(descending=True) - + def _rearrange(tensor): return tensor.index_select(0, sort_order) diff --git a/mgan/data/vocab_builder.py b/mgan/data/vocab_builder.py index 6152b57..cc7c419 100644 --- a/mgan/data/vocab_builder.py +++ b/mgan/data/vocab_builder.py @@ -1,12 +1,15 @@ import os +from tqdm import tqdm from fairseq.data.dictionary import Dictionary class VocabBuilder: - def __init__(self, dataset, tokenizer, save_path): + def __init__(self, dataset, tokenizer, save_path, mask_builder): + self.save_path = save_path self.dataset = dataset self.tokenizer = tokenizer self.vocab_path = os.path.join(save_path, 'vocab.pt') self._vocab = None + self.mask_builder = mask_builder def vocab(self): if self._vocab is None: @@ -14,18 +17,19 @@ def vocab(self): return self._vocab def build_vocab(self): + print('vocab path:',self.vocab_path) if os.path.exists(self.vocab_path): self._vocab = Dictionary.load(self.vocab_path) else: self.rebuild_vocab() - + def rebuild_vocab(self): self._vocab = Dictionary() self._vocab.add_symbol(self.mask_builder.mask_token) desc = 'build-vocab: {}'.format(self.save_path) pbar = tqdm( - range(len(self.dataset)), - desc=desc, + range(len(self.dataset)), + desc=desc, leave=True ) diff --git a/mgan/main.py b/mgan/main.py index c2269b7..b122325 100644 --- a/mgan/main.py +++ b/mgan/main.py @@ -37,36 +37,37 @@ def main(args): 
truncate_length = 20 batch_size = int(max_tokens/truncate_length) - checkpoint_path = "/home/jerin/mgan-attempts/" + checkpoint_path = "/data/neil_noadmin/jerin/mgan-attempts/" saver = Saver(checkpoint_path) train_path = os.path.join(args.path, 'train') + print('train path:',train_path) dev_path = os.path.join(args.path, 'test') train_dataset = TensorIMDbDataset( - train_path, spm_tokenize, + train_path, spm_tokenize, rmask, truncate_length ) # Constructed vocabulary from train vocab = train_dataset.vocab Task = namedtuple('Task', 'source_dictionary target_dictionary') - task = Task(source_dictionary=vocab, + task = Task(source_dictionary=vocab, target_dictionary=vocab) trainer = MGANTrainer(args, task, saver, visdom, vocab) def loader(dataset): - _loader = DataLoader(dataset, batch_size=batch_size, - collate_fn=TensorIMDbDataset.collate, + _loader = DataLoader(dataset, batch_size=batch_size, + collate_fn=TensorIMDbDataset.collate, shuffle=True, num_workers=8) return _loader #trainer.validate_dataset(loader(train_dataset)) dev_dataset = TensorIMDbDataset( - dev_path, spm_tokenize, + dev_path, spm_tokenize, rmask, truncate_length, - vocab + vocab ) Datasets = namedtuple('Dataset', 'train dev') diff --git a/mgan/models/critic.py b/mgan/models/critic.py index 5763c2b..84ad069 100644 --- a/mgan/models/critic.py +++ b/mgan/models/critic.py @@ -19,8 +19,8 @@ def __init__(self, *args, **kwargs): out_embed_dim = self.additional_fc.out_features if hasattr(self, "additional_fc") else self.hidden_size self.fc_out = nn.Linear(out_embed_dim, 1) - def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None): - x, attn_scores = super().forward(prev_output_tokens, encoder_out_dict, incremental_state) + def forward(self, prev_output_tokens, encoder_out, incremental_state=None): + x, attn_scores = super().forward(prev_output_tokens, encoder_out, incremental_state) return x, attn_scores @@ -58,7 +58,7 @@ def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): ) if args.encoder_embed_dim != args.decoder_embed_dim: raise RuntimeError( - '--share-all-embeddings requires --encoder-embed-dim to ' + '--share-all-embeddings requires --encoder_embed_dim to ' 'match --decoder-embed-dim' ) pretrained_decoder_embed = pretrained_encoder_embed @@ -99,7 +99,6 @@ def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): dropout_in=args.decoder_dropout_in, dropout_out=args.decoder_dropout_out, attention=options.eval_bool(args.decoder_attention), - encoder_embed_dim=args.encoder_embed_dim, encoder_output_units=encoder.output_units, pretrained_embed=pretrained_decoder_embed, share_input_output_embed=args.share_decoder_input_output_embed, diff --git a/mgan/models/discriminator.py b/mgan/models/discriminator.py index dc8a1fd..88d3044 100644 --- a/mgan/models/discriminator.py +++ b/mgan/models/discriminator.py @@ -21,8 +21,8 @@ def __init__(self, *args, **kwargs): out_embed_dim = self.additional_fc.out_features if hasattr(self, "additional_fc") else self.hidden_size self.fc_out = nn.Linear(out_embed_dim, 1) - def forward(self, prev_output_tokens, encoder_out_dict, incremental_state=None): - x, attn_scores = super().forward(prev_output_tokens, encoder_out_dict, incremental_state) + def forward(self, prev_output_tokens, encoder_out, incremental_state=None): + x, attn_scores = super().forward(prev_output_tokens, encoder_out, incremental_state) # Do not apply sigmoid, numerically unstable while training. # Get logits and use BCEWithLogitsLoss() instead. 
# x = torch.sigmoid(x) @@ -64,7 +64,7 @@ def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): ) if args.encoder_embed_dim != args.decoder_embed_dim: raise RuntimeError( - '--share-all-embeddings requires --encoder-embed-dim to ' + '--share-all-embeddings requires --encoder_embed_dim to ' 'match --decoder-embed-dim' ) pretrained_decoder_embed = pretrained_encoder_embed @@ -105,7 +105,6 @@ def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): dropout_in=args.decoder_dropout_in, dropout_out=args.decoder_dropout_out, attention=options.eval_bool(args.decoder_attention), - encoder_embed_dim=args.encoder_embed_dim, encoder_output_units=encoder.output_units, pretrained_embed=pretrained_decoder_embed, share_input_output_embed=args.share_decoder_input_output_embed, diff --git a/mgan/preproc/tokenize.py b/mgan/preproc/tokenize.py index 96fed73..d9e0dcf 100644 --- a/mgan/preproc/tokenize.py +++ b/mgan/preproc/tokenize.py @@ -17,7 +17,7 @@ def __init__(self, model_prefix): for key in ['model', 'vocab']: self.path[key] = '{}.{}'.format(self.prefix, key) - self.sp = spm.SentencePieceProcessor() + self.sp = spm.SentencePieceProcessor() self.sp.Load(self.path['model']) # Build vocabulary. @@ -33,7 +33,8 @@ def build_vocabulary(self): def __call__(self, text): tokens = self.sp.EncodeAsPieces(text) - to_utf = lambda x: x.decode("utf-8") + # to_utf = lambda x: x.decode("utf-8") + to_utf = lambda x: x stokens = list(map(to_utf, tokens)) wanted = lambda s: s in self.vocab
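
A note on the `forward` signature change in `mgan/models/critic.py` and `mgan/models/discriminator.py` above: recent fairseq builds invoke the decoder as `self.decoder(prev_output_tokens, encoder_out=encoder_out, ...)` by keyword, so an override whose second parameter is still named `encoder_out_dict` fails with an unexpected-keyword-argument error. The sketch below is illustrative only, not code from this repository: `ScalarHeadDecoder` is a made-up name, and it assumes a fairseq version that provides `fairseq.models.lstm.LSTMDecoder` with this calling convention.

```python
# Illustrative sketch, not part of the patch. `ScalarHeadDecoder` is a
# hypothetical name; the real Critic/Discriminator additionally replace fc_out
# with an nn.Linear(out_embed_dim, 1) head, which is omitted here.
from fairseq.models.lstm import LSTMDecoder


class ScalarHeadDecoder(LSTMDecoder):
    # The parameter must be named `encoder_out`: recent fairseq calls
    # decoder(prev_output_tokens, encoder_out=..., ...) by keyword, so keeping
    # the old name `encoder_out_dict` raises a TypeError at the call site.
    def forward(self, prev_output_tokens, encoder_out, incremental_state=None):
        x, attn_scores = super().forward(prev_output_tokens, encoder_out, incremental_state)
        return x, attn_scores
```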
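
A note on the tokenizer change in `mgan/preproc/tokenize.py`: the patch replaces the `.decode("utf-8")` step with an identity mapping, which only works when the installed sentencepiece returns `str` pieces rather than `bytes`. The minimal sketch below shows that assumption; the model path follows the `--spm_prefix datasets/aclImdb/train/imdb` convention from the README and otherwise assumes a SentencePiece model has already been trained to that prefix.

```python
# Minimal sketch, assuming a sentencepiece release whose EncodeAsPieces()
# already yields Python str pieces (so no utf-8 decode is needed), and an SPM
# model trained to the prefix datasets/aclImdb/train/imdb as per the README.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("datasets/aclImdb/train/imdb.model")

pieces = sp.EncodeAsPieces("this movie was surprisingly good")
# With older bindings each piece was bytes and needed .decode("utf-8");
# here the identity mapping kept in tokenize.py is enough.
assert all(isinstance(piece, str) for piece in pieces)
print(pieces)
```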