
Commit 3e5597f

Fixes to allow DDP
Had to roll back lightning due to this issue: Lightning-AI/pytorch-lightning#18803. The best model checkpoint is now saved using the `exp-id` (this also fixes the DDP issues with saving checkpoints to wandb). Removed `device` from the model call.
1 parent 8fcd577
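
The core DDP fix in ultrafast/model.py is passing `sync_dist` to every `self.log` call so that logged scalars are reduced across ranks when more than one device is in use. A minimal sketch of the pattern, not code from this repo (the class and metric names are illustrative):

import lightning.pytorch as pl
import torch

class LitModel(pl.LightningModule):
    def training_step(self, batch, batch_idx):
        loss = torch.tensor(0.0)  # stand-in for the real loss computation
        # sync_dist=True makes Lightning reduce the logged value across
        # DDP ranks; single-device runs are unaffected when it is False.
        self.log("train/loss", loss, sync_dist=self.trainer.num_devices > 1)
        return loss

Note that `sync_dist=True if self.trainer.num_devices > 1 else False`, as written throughout the diff below, is equivalent to the plain boolean expression above.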

File tree: 3 files changed, +17 −21 lines

pyproject.toml (+1 −1)

@@ -11,7 +11,7 @@ authors = [{ name = "Caleb Ellington", email = "cellingt@andrew.cmu.edu" },
     { name = "Abhinav Adduri"},
     { name = "Monica Dayao"}]
 dependencies = [
-    "lightning==2.4.0",
+    "lightning==2.0.8",
     "torch==2.4.0",
     "pandas==2.2.2",
     "wandb==0.17.8",

ultrafast/model.py (+10 −12)

@@ -169,7 +169,6 @@ def __init__(
         dropout=0,
         lr=1e-4,
         contrastive=False,
-        device='cpu',
         args=None,
     ):
         super().__init__()
@@ -183,7 +182,6 @@ def __init__(
         self.classify = classify
         self.contrastive = contrastive
         self.args = args
-        self.device_ = device

         if args.drug_layers == 1:
             self.drug_projector = nn.Sequential(
@@ -313,7 +311,7 @@ def training_step(self, batch, batch_idx):
             loss = self.contrastive_step(batch)
             self.manual_backward(loss)
             con_opt.step()
-            self.log("train/contrastive_loss", loss)
+            self.log("train/contrastive_loss", loss, sync_dist=True if self.trainer.num_devices > 1 else False)
         else:
             if self.contrastive:
                 opt, _ = self.optimizers()
@@ -323,7 +321,7 @@ def training_step(self, batch, batch_idx):
             loss = self.non_contrastive_step(batch)
             self.manual_backward(loss)
             opt.step()
-            self.log("train/loss", loss)
+            self.log("train/loss", loss, sync_dist=True if self.trainer.num_devices > 1 else False)

         return loss

@@ -332,18 +330,18 @@ def on_train_epoch_end(self):
         if self.contrastive:
             if self.current_epoch % 2 == 0: # supervised learning epoch
                 sch[0].step()
-                self.log("train/lr", sch[0].get_lr()[0])
+                self.log("train/lr", sch[0].get_lr()[0], sync_dist=True if self.trainer.num_devices > 1 else False)
             else: # contrastive learning epoch
                 sch[1].step()
                 self.contrastive_loss_fct.step()
-                self.log("train/triplet_margin", self.contrastive_loss_fct.margin)
-                self.log("train/contrastive_lr", sch[1].get_lr()[0])
+                self.log("train/triplet_margin", self.contrastive_loss_fct.margin, sync_dist=True if self.trainer.num_devices > 1 else False)
+                self.log("train/contrastive_lr", sch[1].get_lr()[0], sync_dist=True if self.trainer.num_devices > 1 else False)
         else:
-            self.log("train/lr", sch.get_lr()[0])
+            self.log("train/lr", sch.get_lr()[0], sync_dist=True if self.trainer.num_devices > 1 else False)
             sch.step()

     def validation_step(self, batch, batch_idx):
-        if self.global_step == 0 and not self.args.no_wandb:
+        if self.global_step == 0 and self.global_rank == 0 and not self.args.no_wandb:
             wandb.define_metric("val/aupr", summary="max")
         drug, protein, label = batch
         similarity = self.forward(drug, protein)
@@ -352,7 +350,7 @@ def validation_step(self, batch, batch_idx):
             similarity = torch.squeeze(F.sigmoid(similarity))

         loss = self.loss_fct(similarity, label)
-        self.log("val/loss", loss)
+        self.log("val/loss", loss, sync_dist=True if self.trainer.num_devices > 1 else False)

         self.val_step_outputs.extend(similarity)
         self.val_step_targets.extend(label)
@@ -365,7 +363,7 @@ def on_validation_epoch_end(self):
                 metric(torch.Tensor(self.val_step_outputs), torch.Tensor(self.val_step_targets).to(torch.int))
             else:
                 metric(torch.Tensor(self.val_step_outputs).cuda(), torch.Tensor(self.val_step_targets).to(torch.float).cuda())
-            self.log(f"val/{name}", metric, on_step=False, on_epoch=True)
+            self.log(f"val/{name}", metric, on_step=False, on_epoch=True, sync_dist=True if self.trainer.num_devices > 1 else False)

         self.val_step_outputs.clear()
         self.val_step_targets.clear()
@@ -388,7 +386,7 @@ def on_test_epoch_end(self):
                 metric(torch.Tensor(self.test_step_outputs), torch.Tensor(self.test_step_targets).to(torch.int))
             else:
                 metric(torch.Tensor(self.test_step_outputs).cuda(), torch.Tensor(self.test_step_targets).to(torch.float).cuda())
-            self.log(f"test/{name}", metric, on_step=False, on_epoch=True)
+            self.log(f"test/{name}", metric, on_step=False, on_epoch=True, sync_dist=True if self.trainer.num_devices > 1 else False)

         self.test_step_outputs.clear()
         self.test_step_targets.clear()
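
Beyond `sync_dist`, the other DDP-safety change above guards a process-wide wandb call so it runs only on rank 0: every rank executes `validation_step` under DDP, but wandb is typically initialized only in the rank-0 process. A minimal sketch of the guard (class name is illustrative):

import wandb
import lightning.pytorch as pl

class LitModel(pl.LightningModule):
    def validation_step(self, batch, batch_idx):
        # global_rank is available on every rank; only rank 0 talks to wandb.
        if self.global_step == 0 and self.global_rank == 0:
            wandb.define_metric("val/aupr", summary="max")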

ultrafast/train.py (+6 −8)

@@ -112,7 +112,6 @@ def train(
     config.update(args_overrides)

     save_dir = f'{config.get("model_save_dir", ".")}/{config.experiment_id}'
-    os.makedirs(save_dir, exist_ok=True)

     # Set CUDA device
     device_no = config.device
@@ -221,16 +220,17 @@ def train(
         contrastive=config.contrastive,
         num_layers_target=config.num_layers_target,
         dropout=config.dropout,
-        device=device,
         args=config
     )

     if not config.no_wandb:
-        wandb_logger = WandbLogger(project=config.wandb_proj, log_model="gradients")
+        wandb_logger = WandbLogger(project=config.wandb_proj, log_model=True)
         wandb_logger.watch(model)
-        wandb_logger.experiment.config.update(OmegaConf.to_container(config, resolve=True, throw_on_missing=True))
+        if hasattr(wandb_logger.experiment.config, 'update'):
+            wandb_logger.experiment.config.update(OmegaConf.to_container(config, resolve=True, throw_on_missing=True))

-    checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor=config.watch_metric, mode="max", filename=config.task, verbose=True)
+    checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor=config.watch_metric, mode="max", filename=config.task,
+                                                       dirpath=save_dir, verbose=True)
     # Train model
     trainer = pl.Trainer(
         accelerator="auto",
@@ -245,11 +245,9 @@ def train(
         datamodule=datamodule,
     )

-    wandb.save(f'{config.task}.ckpt')
-
     # Test model using best weights
     trainer.test(datamodule=datamodule, ckpt_path=checkpoint_callback.best_model_path)


 if __name__ == '__main__':
-    train()
+    train()
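
The checkpoint changes above drop the manual `wandb.save` upload, which under DDP would run on every rank, in favor of `ModelCheckpoint(dirpath=save_dir, ...)` writing under the per-experiment directory while `WandbLogger(log_model=True)` handles the upload from the logger's rank-0 process. A sketch of the resulting wiring (project, metric, and path names are placeholders, not the repo's config values):

import lightning.pytorch as pl
from lightning.pytorch.loggers import WandbLogger

save_dir = "checkpoints/my-exp-id"  # stands in for {model_save_dir}/{experiment_id}
wandb_logger = WandbLogger(project="my-project", log_model=True)
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor="val/aupr",   # config.watch_metric in the diff
    mode="max",
    filename="my-task",   # config.task in the diff
    dirpath=save_dir,     # per-experiment checkpoint directory
    verbose=True,
)
trainer = pl.Trainer(accelerator="auto", logger=wandb_logger,
                     callbacks=[checkpoint_callback])

ModelCheckpoint creates `dirpath` itself, which is presumably why the explicit `os.makedirs(save_dir, exist_ok=True)` was also removed.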
