You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
I was trying to reproduce the performance of the baseline model imojie with multiple GPUs, so I commented out the lines containing "ipdb" code in the "copy_seq2seq_bahdanu.py" file.
However, I ran into a problem when I used multiple GPUs to train the model (using only one GPU was fine). The key error messages are shown below:
'''
Traceback (most recent call last):
File "allennlp_script.py", line 151, in
main()
File "allennlp_script.py", line 138, in main
overrides=args.overrides
File "/home/qj/imojie/allennlp/allennlp/commands/train.py", line 168, in train_model_from_file
cache_directory, cache_prefix)
File "/home/qj/imojie/allennlp/allennlp/commands/train.py", line 254, in train_model
metrics = trainer.train()
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 478, in train
train_metrics = self._train_epoch(epoch)
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 320, in _train_epoch
loss = self.batch_loss(batch_group, for_training=True, optimizer=self.optimizer)
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 256, in batch_loss
output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
File "/home/qj/imojie/allennlp/allennlp/training/util.py", line 335, in data_parallel
outputs = parallel_apply(replicas, inputs, moved, used_device_ids)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/_utils.py", line 425, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 389, in forward
output_dict = self.forward_append(source_tokens, source_token_ids, source_to_target, metadata, target_tokens, target_token_ids, optimizer)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 288, in forward_append
output_dict = self.train_append(source_tokens, source_token_ids, source_to_target, metadata, target_tokens, target_token_ids, optimizer=optimizer)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 509, in train_append
state = self._encode({'tokens': append_tokens})
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 633, in _encode
embedded_input = self._source_embedder(source_tokens)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/allennlp/allennlp/modules/text_field_embedders/basic_text_field_embedder.py", line 131, in forward
token_vectors = embedder(*tensors, **forward_params_values)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/allennlp/allennlp/modules/token_embedders/pretrained_transformer_embedder.py", line 28, in forward
return self.transformer_model(token_ids)[0]
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/miniconda3/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 696, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
'''
Could this issue be fixed so that the model can be trained with multiple GPUs?
The text was updated successfully, but these errors were encountered:
Hi, thank you for your excellent work.
I was trying to reproduce the performance of the baseline model imojie with multiple GPUs, so I commented out the lines containing "ipdb" code in the "copy_seq2seq_bahdanu.py" file.
However, I ran into a problem when I used multiple GPUs to train the model (using only one GPU was fine). The key error messages are shown below:
'''
Traceback (most recent call last):
File "allennlp_script.py", line 151, in
main()
File "allennlp_script.py", line 138, in main
overrides=args.overrides
File "/home/qj/imojie/allennlp/allennlp/commands/train.py", line 168, in train_model_from_file
cache_directory, cache_prefix)
File "/home/qj/imojie/allennlp/allennlp/commands/train.py", line 254, in train_model
metrics = trainer.train()
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 478, in train
train_metrics = self._train_epoch(epoch)
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 320, in _train_epoch
loss = self.batch_loss(batch_group, for_training=True, optimizer=self.optimizer)
File "/home/qj/imojie/allennlp/allennlp/training/trainer.py", line 256, in batch_loss
output_dict = training_util.data_parallel(batch_group, self.model, self._cuda_devices)
File "/home/qj/imojie/allennlp/allennlp/training/util.py", line 335, in data_parallel
outputs = parallel_apply(replicas, inputs, moved, used_device_ids)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
output.reraise()
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/_utils.py", line 425, in reraise
raise self.exc_type(msg)
StopIteration: Caught StopIteration in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 389, in forward
output_dict = self.forward_append(source_tokens, source_token_ids, source_to_target, metadata, target_tokens, target_token_ids, optimizer)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 288, in forward_append
output_dict = self.train_append(source_tokens, source_token_ids, source_to_target, metadata, target_tokens, target_token_ids, optimizer=optimizer)
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 509, in train_append
state = self._encode({'tokens': append_tokens})
File "/home/qj/imojie/imojie/models/copy_seq2seq_bahdanu.py", line 633, in _encode
embedded_input = self._source_embedder(source_tokens)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/allennlp/allennlp/modules/text_field_embedders/basic_text_field_embedder.py", line 131, in forward
token_vectors = embedder(*tensors, **forward_params_values)
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/imojie/allennlp/allennlp/modules/token_embedders/pretrained_transformer_embedder.py", line 28, in forward
return self.transformer_model(token_ids)[0]
File "/home/qj/miniconda3/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qj/miniconda3/lib/python3.7/site-packages/pytorch_transformers/modeling_bert.py", line 696, in forward
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
StopIteration
'''
Could this issue be fixed so that the model can be trained with multiple GPUs?
The text was updated successfully, but these errors were encountered: