test_step hangs after one iteration when on multiple GPUs #3730
Can you post which trainer settings you are using?
def training_step(self, batch, batch_idx):
    images, masks = batch["image"], batch["mask"]
    if images.shape[1] != self.hparams.n_channels:
        raise AssertionError(
            f"Network has been defined with {self.hparams.n_channels} input channels, "
            f"but loaded images have {images.shape[1]} channels. Please check that "
            "the images are loaded correctly."
        )
    masks = (
        masks.type(torch.float32)
        if self.hparams.n_classes == 1
        else masks.type(torch.long)
    )
    masks_pred = self(images)  # Forward pass
    loss = self.loss_funciton(masks_pred, masks)
    result = pl.TrainResult(minimize=loss)
    result.log("train_loss", loss, sync_dist=True)
    if batch_idx == 0:
        self.logg_images(images, masks, masks_pred, "TRAIN")
    pred = (torch.sigmoid(masks_pred) > 0.5).float()
    f1 = f1_score(pred, masks, self.hparams.n_classes + 1)
    rec = recall(pred, masks, self.hparams.n_classes + 1)
    pres = precision(pred, masks, self.hparams.n_classes + 1)
    result.log("train_f1", f1, on_epoch=True)
    result.log("train_recall", rec, on_epoch=True)
    result.log("train_precision", pres, on_epoch=True)
    return result

def validation_step(self, batch, batch_idx):
    images, masks = batch["image"], batch["mask"]
    if images.shape[1] != self.hparams.n_channels:
        raise AssertionError(
            f"Network has been defined with {self.hparams.n_channels} input channels, "
            f"but loaded images have {images.shape[1]} channels. Please check that "
            "the images are loaded correctly."
        )
    masks = (
        masks.type(torch.float32)
        if self.hparams.n_classes == 1
        else masks.type(torch.long)
    )
    masks_pred = self(images)  # Forward pass
    loss = self.loss_funciton(masks_pred, masks)
    result = pl.EvalResult(loss, checkpoint_on=loss)
    result.log("val_loss", loss, sync_dist=True)
    if batch_idx == 0:
        self.logg_images(images, masks, masks_pred, "VAL")
    pred = (torch.sigmoid(masks_pred) > 0.5).float()
    f1 = f1_score(pred, masks, self.hparams.n_classes + 1)
    rec = recall(pred, masks, self.hparams.n_classes + 1)
    pres = precision(pred, masks, self.hparams.n_classes + 1)
    result.log("val_f1", f1, on_epoch=True)
    result.log("val_recall", rec, on_epoch=True)
    result.log("val_precision", pres, on_epoch=True)
    return result

def test_step(self, batch, batch_idx):
    images, masks = batch["image"], batch["mask"]
    if images.shape[1] != self.hparams.n_channels:
        raise AssertionError(
            f"Network has been defined with {self.hparams.n_channels} input channels, "
            f"but loaded images have {images.shape[1]} channels. Please check that "
            "the images are loaded correctly."
        )
    masks = (
        masks.type(torch.float32)
        if self.hparams.n_classes == 1
        else masks.type(torch.long)
    )
    masks_pred = self(images)  # Forward pass
    loss = self.loss_funciton(masks_pred, masks)
    result = pl.EvalResult(loss, checkpoint_on=loss)
    result.log("test_loss", loss, on_step=True, on_epoch=True, sync_dist=True)
    self.logg_images(images, masks, masks_pred, "TEST")  # logged every batch, not only batch_idx == 0
    pred = (torch.sigmoid(masks_pred) > 0.5).float()
    f1 = f1_score(pred, masks, self.hparams.n_classes + 1)
    rec = recall(pred, masks, self.hparams.n_classes + 1)
    pres = precision(pred, masks, self.hparams.n_classes + 1)
    result.log("test_f1", f1, on_epoch=True)
    result.log("test_recall", rec, on_epoch=True)
    result.log("test_precision", pres, on_epoch=True)
    return result
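For reference, the TrainResult and EvalResult objects used above were deprecated in later PyTorch Lightning releases in favour of self.log. A minimal sketch of the same test-step logging with that API, assuming the module keeps the same loss_funciton attribute and metric helpers as above:

def test_step(self, batch, batch_idx):
    images, masks = batch["image"], batch["mask"]
    masks_pred = self(images)  # forward pass
    loss = self.loss_funciton(masks_pred, masks)
    # sync_dist=True reduces the logged value across DDP processes
    self.log("test_loss", loss, on_step=True, on_epoch=True, sync_dist=True)
    pred = (torch.sigmoid(masks_pred) > 0.5).float()
    self.log("test_f1", f1_score(pred, masks, self.hparams.n_classes + 1), on_epoch=True)
    return loss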
What arguments do you pass to Trainer(...)?
try:
    trainer = Trainer.from_argparse_args(
        args,
        gpus=-1,
        precision=16,
        distributed_backend="ddp",
        callbacks=[lr_monitor],
        early_stop_callback=early_stopping,
        accumulate_grad_batches=1
        if not os.getenv("ACC_GRAD")
        else int(os.getenv("ACC_GRAD")),
        gradient_clip_val=0.0
        if not os.getenv("GRAD_CLIP")
        else float(os.getenv("GRAD_CLIP")),
        max_epochs=1000 if not os.getenv("EPOCHS") else int(os.getenv("EPOCHS")),
        default_root_dir=os.getcwd()
        if not os.getenv("DIR_ROOT_DIR")
        else os.getenv("DIR_ROOT_DIR"),
    )
    trainer.fit(model)
    trainer.test(model)
except KeyboardInterrupt:
    torch.save(model.state_dict(), "INTERRUPTED.pth")
    logging.info("Saved interrupt")
    try:
        sys.exit(0)
    except SystemExit:
        os._exit(0)
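As a side note, the environment-variable fallbacks above can be written more compactly by passing a default to os.getenv; a small sketch of the same settings, assuming the variables, when set, hold valid numbers:

# Defaults apply when the variable is unset.
acc_grad = int(os.getenv("ACC_GRAD", "1"))
grad_clip = float(os.getenv("GRAD_CLIP", "0.0"))
max_epochs = int(os.getenv("EPOCHS", "1000"))
root_dir = os.getenv("DIR_ROOT_DIR", os.getcwd())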
Is using DDP the issue? I used DDP on the single-GPU environment as well.
Yes, unfortunately it looks like so. Calling …
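For context, a workaround often suggested for this kind of DDP test hang, sketched here under the assumption that the deadlock comes from calling .test() inside the processes that were spawned for DDP training, is to run the test pass in a fresh single-process Trainer after training finishes:

# Sketch (assumption): evaluate outside DDP once trainer.fit(model) has returned.
test_trainer = Trainer(gpus=1)
test_trainer.test(model)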
🐛 Bug
When running the same code on a machine with one GPU, test_step runs as normal and logs what it should.
However, on a node with 4 GPUs, it hangs after one iteration!
Code sample (see the training_step/validation_step/test_step and Trainer snippets above)
Expected behavior
I expect it to finish the testing epoch.
Environment

Environment 1
System: #52~18.04.1-Ubuntu SMP Thu Sep 10 12:50:22 UTC 2020 (Ubuntu 18.04)

Environment 2