Lightning logger integration #241

Open · wants to merge 23 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -42,6 +42,7 @@ mllogs/
.logs/
lightning_logs/
mlruns/
ray_checkpoints/

# Byte-compiled / optimized / DLL files
__pycache__/
40 changes: 40 additions & 0 deletions docs/how-it-works/loggers/explain_loggers.rst
@@ -302,6 +302,46 @@ integer.
that the chosen logging framework supports multiprocessing**, adapting it accordingly
if not.


PyTorch Lightning Logger
++++++++++++++++++++++++
In addition to the itwinai loggers, itwinai provides a way to integrate itwinai
logging functionalities into PyTorch Lightning workflows. Designed as a drop-in
substitute for a PyTorch Lightning logger, the
:class:`~itwinai.torch.loggers.ItwinaiLogger` wraps any existing itwinai logger
(e.g., the MLFlow or Prov4ML logger) and adapts it to the PyTorch Lightning
interface. Because it implements the same methods as the PyTorch Lightning logger,
the :class:`~itwinai.torch.loggers.ItwinaiLogger` lets users replace the default
logger in their existing code while maintaining compatibility with PyTorch
Lightning's logging operations, including integration with the ``ModelCheckpoint``
callback (sketched in the second example below).

.. admonition:: Example of a PyTorch Lightning Trainer using ItwinaiLogger

    In this example, we instantiate the
    :class:`~itwinai.torch.loggers.ItwinaiLogger` with an itwinai logger of our
    choice and pass it to a PyTorch Lightning ``Trainer``, just like any other
    Lightning logger.

    .. code-block:: python

        import pytorch_lightning as pl
        from my_models import MyModel
        from itwinai.loggers import Prov4MLLogger
        from itwinai.torch.loggers import ItwinaiLogger

        my_model = MyModel()

        # Any itwinai logger can be wrapped; here we use the Prov4ML-based logger
        my_prov_logger = Prov4MLLogger()
        my_lightning_logger = ItwinaiLogger(itwinai_logger=my_prov_logger)

        trainer = pl.Trainer(logger=my_lightning_logger)
        trainer.fit(my_model)

This illustrates the basic structure of instantiating the
:class:`~itwinai.torch.loggers.ItwinaiLogger` and passing it to a PyTorch
Lightning trainer instance.
A more detailed example of using the :class:`~itwinai.torch.loggers.ItwinaiLogger`
in an itwinai PyTorch Lightning training pipeline on the MNIST dataset can be
found in :doc:`PyTorch Lightning <../../use-cases/mnist_doc>`.
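
.. admonition:: Example of ItwinaiLogger with the ModelCheckpoint callback

    The snippet below is a minimal sketch of the ``ModelCheckpoint`` integration
    mentioned above. It assumes that ``MyModel`` logs a ``val_loss`` metric via
    ``self.log("val_loss", ...)`` in its validation step; the experiment name and
    checkpoint settings are illustrative only.

    .. code-block:: python

        import pytorch_lightning as pl
        from pytorch_lightning.callbacks import ModelCheckpoint

        from itwinai.loggers import MLFlowLogger
        from itwinai.torch.loggers import ItwinaiLogger
        from my_models import MyModel

        my_model = MyModel()

        # Wrap an itwinai logger so that Lightning can drive it
        my_lightning_logger = ItwinaiLogger(
            itwinai_logger=MLFlowLogger(experiment_name="lightning-example")
        )

        # Keep only the best checkpoint according to the validation loss
        checkpoint_callback = ModelCheckpoint(monitor="val_loss", save_top_k=1)

        trainer = pl.Trainer(
            logger=my_lightning_logger,
            callbacks=[checkpoint_callback],
        )
        trainer.fit(my_model)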

Further references
-------------------

2 changes: 1 addition & 1 deletion env-files/torch/generic_torch.sh
@@ -163,4 +163,4 @@ else
fi

# Install itwinai: MUST be last line of the script for the user installation script to work!
-pip install --no-cache-dir -e .[torch,dev] || exit 1
+pip install --no-cache-dir -e .[torch,dev] || exit 1
105 changes: 72 additions & 33 deletions src/itwinai/loggers.py
@@ -1,3 +1,13 @@
# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <matteo.bunino@cern.ch> - CERN
# - Anna Lappe <anna.elisa.lappe@cern.ch> - CERN
# -------------------------------------------------------------------------------------

"""
``itwinai`` wrappers for well-known ML loggers.

@@ -140,10 +150,24 @@ def __init__(
        savedir: str = "mllogs",
        log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
        log_on_workers: Union[int, List[int]] = 0,
        experiment_id: Optional[str] = None,
        run_id: Optional[Union[int, str]] = None,
    ) -> None:
        self.savedir = savedir
        self.log_freq = log_freq
        self.log_on_workers = log_on_workers
        self._experiment_id = experiment_id
        self._run_id = run_id

    @property
    def experiment_id(self) -> Optional[str]:
        """Return the ID of the current experiment."""
        return self._experiment_id

    @property
    def run_id(self) -> Optional[Union[int, str]]:
        """Return the ID of the current run within the experiment."""
        return self._run_id

    @property
    def log_freq(self) -> Union[int, Literal["epoch", "batch"]]:
@@ -350,13 +374,18 @@ def create_logger_context(self, rank: Optional[int] = None):
        if not self.should_log():
            return

-        os.makedirs(self.savedir, exist_ok=True)
-        run_dirs = sorted([int(dir) for dir in os.listdir(self.savedir)])
-        if len(run_dirs) == 0:
-            self.run_id = 0
+        # Pick the next free numeric run directory under savedir
+        if os.path.isdir(self.savedir):
+            numeric_dirs = [
+                int(name)
+                for name in os.listdir(self.savedir)
+                if os.path.isdir(os.path.join(self.savedir, name)) and name.isdigit()
+            ]
+            self._experiment_id = max(numeric_dirs) + 1 if numeric_dirs else 0
        else:
-            self.run_id = int(run_dirs[-1]) + 1
-        self.run_path = os.path.join(self.savedir, str(self.run_id))
+            self._experiment_id = 0
+
+        self.run_path = os.path.join(self.savedir, str(self.experiment_id))
        os.makedirs(self.run_path, exist_ok=True)

    def destroy_logger_context(self):
@@ -400,25 +429,40 @@ def log(
            return

        if kind == "artifact":
-            if isinstance(item, str) and os.path.isfile(item):
-                import shutil
-
-                identifier = os.path.join(self.run_path, identifier)
-                if len(os.path.dirname(identifier)) > 0:
-                    os.makedirs(os.path.dirname(identifier), exist_ok=True)
-                print(f"ConsoleLogger: Serializing to {identifier}...")
-                shutil.copyfile(item, identifier)
+            import shutil
+
+            # Artifacts of a run are collected under <run_path>/artifacts/<identifier>
+            artifact_dir = os.path.join(self.run_path, "artifacts", identifier)
+            os.makedirs(artifact_dir, exist_ok=True)
+
+            if os.path.isfile(item):
+                target_path = os.path.join(artifact_dir, identifier)
+                shutil.copyfile(item, target_path)
+            elif os.path.isdir(item):
+                numeric_dirs = [
+                    int(name)
+                    for name in os.listdir(self.savedir)
+                    if os.path.isdir(os.path.join(self.savedir, name))
+                    and name.isdigit()
+                ]
+                child_id = max(numeric_dirs) + 1
+                target_path = os.path.join(
+                    artifact_dir, f"{self.experiment_id}.{child_id}"
+                )
+                shutil.copytree(item, target_path, dirs_exist_ok=True)
            else:
-                identifier = os.path.join(os.path.basename(self.run_path), identifier)
-                print(f"ConsoleLogger: Serializing to {identifier}...")
-                self.serialize(item, identifier)
+                print(
+                    "INFO: The ConsoleLogger expects an artifact to be either a "
+                    "file path or a directory. Received instead an item of type "
+                    f"{type(item)}. The item will be ignored and not logged."
+                )

        elif kind == "torch":
-            identifier = os.path.join(self.run_path, identifier)
-            print(f"ConsoleLogger: Saving to {identifier}...")
            import torch

-            torch.save(item, identifier)
-        else:
+            target_path = os.path.join(self.run_path, identifier)
+            torch.save(item, target_path)
+            print(f"INFO: ConsoleLogger saved to {target_path}...")
+
+        elif kind == "metric":
            print(f"ConsoleLogger: {identifier} = {item}")


@@ -477,14 +521,14 @@ def __init__(
        super().__init__(
            savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers
        )
-        self.experiment_name = experiment_name
        self.tracking_uri = tracking_uri
        self.run_description = run_description
        self.run_name = run_name
+        self.experiment_name = experiment_name

        self.tracking_uri = (
            self.tracking_uri
-            or os.environ.get('MLFLOW_TRACKING_URI')
+            or os.environ.get("MLFLOW_TRACKING_URI")
            or pathlib.Path(os.path.abspath(self.savedir)).as_uri()
        )

@@ -514,9 +558,8 @@ def create_logger_context(self, rank: Optional[int] = None) -> mlflow.ActiveRun:
        self.active_run: mlflow.ActiveRun = mlflow.start_run(
            description=self.run_description, run_name=self.run_name
        )
-        # TODO: for pytorch lightning:
-        # mlflow.pytorch.autolog()
+        self._run_id = self.active_run.info.run_id
+        self._experiment_id = self.active_run.info.experiment_id

        return self.active_run

@@ -566,7 +609,6 @@ def log(
            return

        if kind == "metric":
-            # if isinstance(item, list) and isinstance(identifier, list):
            mlflow.log_metric(key=identifier, value=item, step=step)
        if kind == "artifact":
            if not isinstance(item, str):
@@ -1019,9 +1061,8 @@ def __init__(
            log_freq=log_freq,
            log_on_workers=log_on_workers,
        )
-        self.name = experiment_name
-        self.version = None
        self.prov_user_namespace = prov_user_namespace
+        self.experiment_name = experiment_name
        self.provenance_save_dir = provenance_save_dir
        self.save_after_n_logs = save_after_n_logs
        self.create_graph = create_graph
@@ -1043,7 +1084,7 @@ def create_logger_context(self, rank: Optional[int] = None):

        prov4ml.start_run(
            prov_user_namespace=self.prov_user_namespace,
-            experiment_name=self.name,
+            experiment_name=self.experiment_name,
            provenance_save_dir=self.provenance_save_dir,
            save_after_n_logs=self.save_after_n_logs,
            # This class will control which workers can log
@@ -1139,8 +1180,7 @@ def log(
            prov4ml.log_param(key=identifier, value=item)
        elif kind == "prov_documents":
            prov_docs = prov4ml.log_provenance_documents(
-                create_graph=True,
-                create_svg=True
+                create_graph=True, create_svg=True
            )

# Upload to MLFlow
@@ -1158,7 +1198,6 @@ def __init__(
    ) -> None:
        if isinstance(save_path, str):
            save_path = Path(save_path)
-
        self.save_path: Path = save_path
        self.strategy_name = strategy_name
        self.num_nodes = num_nodes
1 change: 1 addition & 0 deletions src/itwinai/tests/sanity_check.py
@@ -34,6 +34,7 @@
    'itwinai.torch.reproducibility',
    'itwinai.torch.trainer',
    'itwinai.torch.type',
    'itwinai.torch.loggers',
]

tensorflow_modules = [