NINAnor · dependabot · Dec 2, 2024 · Dec 2, 2024 · Dec 2, 2024 · Dec 2, 2024
diff --git a/.gitignore b/.gitignore
@@ -178,6 +178,6 @@ pyrightconfig.json
 nina-python-init.py
 info_proj.txt
 lightning_logs/
-config.yaml
+.hydra/
 notebooks/
 .vscode/
diff --git a/configs/config.yaml b/configs/config.yaml
@@ -0,0 +1,11 @@
+defaults:
+  - _self_
+  - dataset: default
+  - train: default
+  - predict: default
+  - paths: default
+
+hydra:
+  run:
+    dir: .
+  output_subdir: null
diff --git a/configs/train/default.yaml b/configs/train/default.yaml
@@ -1,8 +1,16 @@
+MODEL: 'deeplabv3' # or deeplabv3plus
+
 NUM_WORKERS: 4
 NUM_CLASSES: 7
 NUM_EPOCHS: 1000
 BATCH_SIZE: 16
+LOG_EVERY_N_STEPS: 15
 LR: 0.0001
 
 # Callbacks
 PATIENCE: 7
+
+# Root directory for training outputs. Existing runs are never overwritten:
+# the training script creates a new sub-folder named after the current
+# timestamp for each run.
+LOG_DIR: 'lightning_logs'
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,6 +36,7 @@ torch = "2.3.1"
 torchmetrics = "1.4.0.post0"
 torchvision = "0.18.1"
 tqdm = "4.66.4"
+segmentation-models-pytorch = "^0.3.4"
 
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.5"

diff --git a/src/train.py b/src/train.py
@@ -1,33 +1,34 @@
-#!/usr/env/bin python3
+#!/usr/bin/env python3
 
+from datetime import datetime
+from pathlib import Path
+
+import hydra
 import pytorch_lightning as pl
 import torch
 import torch.nn.functional as F
 import torch.utils
 import torch.utils.data
 import torchmetrics
-import torchvision.models.segmentation as models
-import hydra
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
 from torchmetrics.classification import MulticlassJaccardIndex
 
 from dataset.segmentation_dataset import get_data_loaders
+from utils.check_cuda import check_tensor_cores
+from utils.log_files import log_augmentations, log_train_cfg
+from utils.extras import title
+from utils.models import get_segmentation_model
 from utils.transforms import albumentations_transform, resize_transform
 
 torch.backends.cudnn.benchmark = True
 
 
-def get_deeplabv3_model(num_classes):
-    model = models.deeplabv3_resnet50(weights="COCO_WITH_VOC_LABELS_V1")
-    model.classifier[4] = torch.nn.Conv2d(256, num_classes, kernel_size=(1, 1))
-    return model
-
-
 class SegmentationModel(pl.LightningModule):
-    def __init__(self, num_classes, lr=1e-4):
+    def __init__(self, num_classes, lr=1e-4, model=None):
         super().__init__()
-        self.model = get_deeplabv3_model(num_classes)
+        self.model = model
         self.lr = lr
         self.num_classes = num_classes
 
@@ -57,7 +58,12 @@ def __init__(self, num_classes, lr=1e-4):
         self.val_iou = MulticlassJaccardIndex(num_classes=num_classes, ignore_index=-1)
 
     def forward(self, x):
-        return self.model(x)["out"]
+        # Models that wrap their prediction in a dict expose it under "out";
+        # others return it directly. Branch on the output, not the class name.
+        output = self.model(x)
+        if isinstance(output, dict):
+            return output["out"]
+        return output
 
     def training_step(self, batch, batch_idx):
         images, masks = batch
@@ -99,6 +105,21 @@ def configure_optimizers(self):
 
 @hydra.main(version_base=None, config_path="../configs", config_name="config")
 def main(cfg):
+    print(title)
+
+    if torch.cuda.is_available():
+        check_tensor_cores()
+    else:
+        print("CUDA is not available on this system.")
+
+    # create output directory for logging
+    log_dir = Path(cfg.train.LOG_DIR)
+    today_date = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    output_dir = log_dir / today_date
+
+    log_augmentations(albumentations_transform, resize_transform, output_dir)
+    log_train_cfg(cfg.train, output_dir)
+
     train_loader, val_loader = get_data_loaders(
         cfg.paths.IMG_DIR,
         cfg.paths.MASKS_DIR,
@@ -109,7 +130,11 @@ def main(cfg):
     )
 
     num_classes = cfg.train.NUM_CLASSES
-    model = SegmentationModel(num_classes=num_classes, lr=cfg.train.LR)
+    model = SegmentationModel(
+        num_classes=num_classes,
+        lr=cfg.train.LR,
+        model=get_segmentation_model(cfg.train.MODEL, num_classes),
+    )
 
     checkpoint_callback = ModelCheckpoint(monitor="val_loss")
 
@@ -120,9 +145,12 @@ def main(cfg):
         mode="min",
     )
 
+    tb_logger = TensorBoardLogger(save_dir=log_dir, name=today_date)
     trainer = Trainer(
         max_epochs=cfg.train.NUM_EPOCHS,
+        log_every_n_steps=cfg.train.LOG_EVERY_N_STEPS,
         callbacks=[checkpoint_callback, early_stopping_callback],
+        logger=tb_logger,
     )
     trainer.fit(model, train_loader, val_loader)
 

diff --git a/src/utils/check_cuda.py b/src/utils/check_cuda.py
@@ -0,0 +1,27 @@
+import torch
+
+
+def check_tensor_cores():
+    """Relax float32 matmul precision when the current GPU has Tensor Cores.
+
+    A device is treated as Tensor Core capable when its name contains one of
+    the known model markers, or when its CUDA compute capability is 8.0 or
+    higher, which also covers cards missing from the name list (for example
+    H100 or L4). In that case float32 matrix multiplications are switched to
+    the "high" precision mode; otherwise an informational message is printed.
+
+    Must only be called when CUDA is available, since it queries the
+    current CUDA device.
+    """
+    device_name = torch.cuda.get_device_name()
+    tensor_cores_devices = ("V100", "A100", "A40", "T4", "RTX 20", "RTX 30", "RTX 40")
+    known_by_name = any(marker in device_name for marker in tensor_cores_devices)
+
+    # The name list cannot keep up with new hardware, so fall back to the
+    # compute capability PyTorch reports for devices that are not listed.
+    major, _minor = torch.cuda.get_device_capability()
+
+    if known_by_name or major >= 8:
+        torch.set_float32_matmul_precision("high")
+    else:
+        print(f"Your CUDA device ('{device_name}') does not appear to have Tensor Cores.")
diff --git a/src/utils/extras.py b/src/utils/extras.py
@@ -0,0 +1,7 @@
+title = """
+ ▄▄▄ ▗▞▀▘▐▌    ▄▄▄   ▄▄▄  █ ▄▄▄▄  ▗▞▀▜▌▄▄▄▄  
+▀▄▄  ▝▚▄▖▐▌   █   █ █   █ █ █ █ █ ▝▚▄▟▌█   █ 
+▄▄▄▀     ▐▛▀▚▖▀▄▄▄▀ ▀▄▄▄▀ █ █   █      █▄▄▄▀ 
+         ▐▌ ▐▌            █            █     
+                                       ▀     
+"""
diff --git a/src/utils/log_files.py b/src/utils/log_files.py
@@ -0,0 +1,51 @@
+import os
+from pathlib import Path
+
+import yaml
+from omegaconf import OmegaConf
+
+
+def _write_yaml(data, output_dir, filename):
+    """Dump *data* as block-style YAML to ``output_dir/filename``.
+
+    The output directory (including missing parents) is created on demand.
+    """
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path / filename, "w", encoding="utf-8") as f:
+        yaml.dump(data, f, default_flow_style=False)
+
+
+def _transform_names(pipeline):
+    """Return one ``{"name": <class name>}`` entry per transform in *pipeline*."""
+    return [{"name": t.__class__.__name__} for t in pipeline.transforms]
+
+
+def log_train_cfg(cfg, output_dir):
+    """Save the resolved training configuration as ``train.yaml``.
+
+    Args:
+        cfg: OmegaConf configuration node; interpolations are resolved before
+            it is written.
+        output_dir: Directory that receives the file; created if missing.
+    """
+    cfg_dict = OmegaConf.to_container(cfg, resolve=True)
+    _write_yaml(cfg_dict, output_dir, "train.yaml")
+
+
+def log_augmentations(albumentations_transform, resize_transform, output_dir):
+    """Save the class names of the configured transforms as ``augmentations.yaml``.
+
+    Only the transform class names are recorded, not their parameters.
+
+    Args:
+        albumentations_transform: Pipeline exposing a ``transforms`` iterable.
+        resize_transform: Pipeline exposing a ``transforms`` iterable.
+        output_dir: Directory that receives the file; created if missing.
+    """
+    aug_dict = {
+        "albumentations": {"transforms": _transform_names(albumentations_transform)},
+        "resize": {"transforms": _transform_names(resize_transform)},
+    }
+    _write_yaml(aug_dict, output_dir, "augmentations.yaml")
diff --git a/src/utils/models.py b/src/utils/models.py
@@ -0,0 +1,47 @@
+import segmentation_models_pytorch as smp
+import torch
+import torchvision.models.segmentation as models
+
+
+def get_segmentation_model(model_name, num_classes):
+    """Build the segmentation network selected by *model_name*.
+
+    Args:
+        model_name: Either ``"deeplabv3"`` or ``"deeplabv3plus"``.
+        num_classes: Number of output classes.
+
+    Returns:
+        The constructed model.
+
+    Raises:
+        ValueError: If *model_name* is not one of the supported names.
+    """
+    builders = {
+        "deeplabv3plus": _get_deeplabv3plus_model,
+        "deeplabv3": _get_deeplabv3_model,
+    }
+    builder = builders.get(model_name)
+    if builder is None:
+        raise ValueError(f"Unknown model name: {model_name}")
+    return builder(num_classes)
+
+
+def _get_deeplabv3plus_model(num_classes):
+    """Create a DeepLabV3+ model with an ImageNet-initialised ResNet-101 encoder."""
+    # NOTE(review): activation="softmax2d" presumably makes this model emit
+    # per-pixel class probabilities, while the torchvision DeepLabV3 head ends
+    # in a plain convolution - confirm the training loss suits both outputs.
+    return smp.DeepLabV3Plus(
+        encoder_name="resnet101",
+        encoder_weights="imagenet",
+        classes=num_classes,
+        activation="softmax2d",
+    )
+
+
+def _get_deeplabv3_model(num_classes):
+    """Create a pretrained torchvision DeepLabV3 (ResNet-50) with a resized head."""
+    net = models.deeplabv3_resnet50(weights="COCO_WITH_VOC_LABELS_V1")
+    # Swap the final 1x1 convolution so it predicts `num_classes` channels.
+    net.classifier[4] = torch.nn.Conv2d(256, num_classes, kernel_size=(1, 1))
+    return net