Lightning logger integration #241

Open · wants to merge 23 commits into base: main

Changes from all commits
1 change: 1 addition & 0 deletions .gitignore
@@ -42,6 +42,7 @@ mllogs/
.logs/
lightning_logs/
mlruns/
ray_checkpoints/

# Byte-compiled / optimized / DLL files
__pycache__/
40 changes: 40 additions & 0 deletions docs/how-it-works/loggers/explain_loggers.rst
@@ -302,6 +302,46 @@ integer.
that the chosen logging framework supports multiprocessing**, adapting it accordingly
if not.


PyTorch Lightning Logger
++++++++++++++++++++++++
In addition to the itwinai loggers, itwinai provides a way to integrate itwinai
logging functionalities into PyTorch Lightning workflows. Designed as a drop-in
substitute for a PyTorch Lightning logger, the
:class:`~itwinai.torch.loggers.ItwinaiLogger` wraps any existing itwinai logger
(e.g., the MLFlow or Prov4ML logger) and adapts it to the PyTorch Lightning
interface. Because it implements the same methods as the PyTorch Lightning logger,
the :class:`~itwinai.torch.loggers.ItwinaiLogger` lets users replace the default
logger in their existing code while maintaining compatibility with PyTorch
Lightning's logging operations, including integration with the ``ModelCheckpoint``
callback (sketched in the second example below).

.. admonition:: Example of a PyTorch Lightning Trainer using ItwinaiLogger

    In this example, we instantiate the
    :class:`~itwinai.torch.loggers.ItwinaiLogger` with an itwinai logger of our
    choice and pass it to a PyTorch Lightning ``Trainer``, just like any other
    Lightning logger.

    .. code-block:: python

        import pytorch_lightning as pl
        from my_models import MyModel
        from itwinai.loggers import Prov4MLLogger
        from itwinai.torch.loggers import ItwinaiLogger

        my_model = MyModel()

        # Any itwinai logger can be wrapped; here we use the Prov4ML-based logger
        my_prov_logger = Prov4MLLogger()
        my_lightning_logger = ItwinaiLogger(itwinai_logger=my_prov_logger)

        trainer = pl.Trainer(logger=my_lightning_logger)
        trainer.fit(my_model)

This illustrates the basic structure of instantiating the
:class:`~itwinai.torch.loggers.ItwinaiLogger` and passing it to a PyTorch
Lightning trainer instance.
A more detailed example of using the :class:`~itwinai.torch.loggers.ItwinaiLogger`
in an itwinai PyTorch Lightning training pipeline on the MNIST dataset can be
found in :doc:`PyTorch Lightning <../../use-cases/mnist_doc>`.
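
.. admonition:: Example of ItwinaiLogger with the ModelCheckpoint callback

    The snippet below is a minimal sketch of the ``ModelCheckpoint`` integration
    mentioned above. It assumes that ``MyModel`` logs a ``val_loss`` metric via
    ``self.log("val_loss", ...)`` in its validation step; the experiment name and
    checkpoint settings are illustrative only.

    .. code-block:: python

        import pytorch_lightning as pl
        from pytorch_lightning.callbacks import ModelCheckpoint

        from itwinai.loggers import MLFlowLogger
        from itwinai.torch.loggers import ItwinaiLogger
        from my_models import MyModel

        my_model = MyModel()

        # Wrap an itwinai logger so that Lightning can drive it
        my_lightning_logger = ItwinaiLogger(
            itwinai_logger=MLFlowLogger(experiment_name="lightning-example")
        )

        # Keep only the best checkpoint according to the validation loss
        checkpoint_callback = ModelCheckpoint(monitor="val_loss", save_top_k=1)

        trainer = pl.Trainer(
            logger=my_lightning_logger,
            callbacks=[checkpoint_callback],
        )
        trainer.fit(my_model)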

Further references
-------------------

2 changes: 1 addition & 1 deletion env-files/torch/generic_torch.sh
@@ -163,4 +163,4 @@ else
fi

# Install itwinai: MUST be last line of the script for the user installation script to work!
-pip install --no-cache-dir -e .[torch,dev] || exit 1
+pip install --no-cache-dir -e .[torch,dev] || exit 1
105 changes: 72 additions & 33 deletions src/itwinai/loggers.py
@@ -1,3 +1,13 @@
# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <matteo.bunino@cern.ch> - CERN
# - Anna Lappe <anna.elisa.lappe@cern.ch> - CERN
# -------------------------------------------------------------------------------------

"""
``itwinai`` wrappers for well-known ML loggers.

@@ -140,10 +150,24 @@ def __init__(
        savedir: str = "mllogs",
        log_freq: Union[int, Literal["epoch", "batch"]] = "epoch",
        log_on_workers: Union[int, List[int]] = 0,
        experiment_id: Optional[str] = None,
        run_id: Optional[Union[int, str]] = None,
    ) -> None:
        self.savedir = savedir
        self.log_freq = log_freq
        self.log_on_workers = log_on_workers
        self._experiment_id = experiment_id
        self._run_id = run_id

    @property
    def experiment_id(self) -> Optional[str]:
        """Return the ID of the current experiment."""
        return self._experiment_id

    @property
    def run_id(self) -> Optional[Union[int, str]]:
        """Return the ID of the current run within the experiment."""
        return self._run_id

    @property
    def log_freq(self) -> Union[int, Literal["epoch", "batch"]]:
@@ -350,13 +374,18 @@ def create_logger_context(self, rank: Optional[int] = None):
        if not self.should_log():
            return

-        os.makedirs(self.savedir, exist_ok=True)
-        run_dirs = sorted([int(dir) for dir in os.listdir(self.savedir)])
-        if len(run_dirs) == 0:
-            self.run_id = 0
+        # Pick the next free numeric run directory under savedir
+        if os.path.isdir(self.savedir):
+            numeric_dirs = [
+                int(name)
+                for name in os.listdir(self.savedir)
+                if os.path.isdir(os.path.join(self.savedir, name)) and name.isdigit()
+            ]
+            self._experiment_id = max(numeric_dirs) + 1 if numeric_dirs else 0
        else:
-            self.run_id = int(run_dirs[-1]) + 1
-        self.run_path = os.path.join(self.savedir, str(self.run_id))
+            self._experiment_id = 0
+
+        self.run_path = os.path.join(self.savedir, str(self.experiment_id))
        os.makedirs(self.run_path, exist_ok=True)

    def destroy_logger_context(self):
@@ -400,25 +429,40 @@ def log(
            return

        if kind == "artifact":
-            if isinstance(item, str) and os.path.isfile(item):
-                import shutil
-
-                identifier = os.path.join(self.run_path, identifier)
-                if len(os.path.dirname(identifier)) > 0:
-                    os.makedirs(os.path.dirname(identifier), exist_ok=True)
-                print(f"ConsoleLogger: Serializing to {identifier}...")
-                shutil.copyfile(item, identifier)
+            import shutil
+
+            # Artifacts of a run are collected under <run_path>/artifacts/<identifier>
+            artifact_dir = os.path.join(self.run_path, "artifacts", identifier)
+            os.makedirs(artifact_dir, exist_ok=True)
+
+            if os.path.isfile(item):
+                target_path = os.path.join(artifact_dir, identifier)
+                shutil.copyfile(item, target_path)
+            elif os.path.isdir(item):
+                numeric_dirs = [
+                    int(name)
+                    for name in os.listdir(self.savedir)
+                    if os.path.isdir(os.path.join(self.savedir, name))
+                    and name.isdigit()
+                ]
+                child_id = max(numeric_dirs) + 1
+                target_path = os.path.join(
+                    artifact_dir, f"{self.experiment_id}.{child_id}"
+                )
+                shutil.copytree(item, target_path, dirs_exist_ok=True)
            else:
-                identifier = os.path.join(os.path.basename(self.run_path), identifier)
-                print(f"ConsoleLogger: Serializing to {identifier}...")
-                self.serialize(item, identifier)
+                print(
+                    "INFO: The ConsoleLogger expects an artifact to be either a "
+                    "file path or a directory. Received instead an item of type "
+                    f"{type(item)}. The item will be ignored and not logged."
+                )

        elif kind == "torch":
-            identifier = os.path.join(self.run_path, identifier)
-            print(f"ConsoleLogger: Saving to {identifier}...")
            import torch

-            torch.save(item, identifier)
-        else:
+            target_path = os.path.join(self.run_path, identifier)
+            torch.save(item, target_path)
+            print(f"INFO: ConsoleLogger saved to {target_path}...")
+
+        elif kind == "metric":
            print(f"ConsoleLogger: {identifier} = {item}")


@@ -477,14 +521,14 @@ def __init__(
        super().__init__(
            savedir=savedir, log_freq=log_freq, log_on_workers=log_on_workers
        )
-        self.experiment_name = experiment_name
        self.tracking_uri = tracking_uri
        self.run_description = run_description
        self.run_name = run_name
+        self.experiment_name = experiment_name

        self.tracking_uri = (
            self.tracking_uri
-            or os.environ.get('MLFLOW_TRACKING_URI')
+            or os.environ.get("MLFLOW_TRACKING_URI")
            or pathlib.Path(os.path.abspath(self.savedir)).as_uri()
        )

@@ -514,9 +558,8 @@ def create_logger_context(self, rank: Optional[int] = None) -> mlflow.ActiveRun:
        self.active_run: mlflow.ActiveRun = mlflow.start_run(
            description=self.run_description, run_name=self.run_name
        )
-        # TODO: for pytorch lightning:
-        # mlflow.pytorch.autolog()
+        self._run_id = self.active_run.info.run_id
+        self._experiment_id = self.active_run.info.experiment_id

        return self.active_run

@@ -566,7 +609,6 @@ def log(
            return

        if kind == "metric":
-            # if isinstance(item, list) and isinstance(identifier, list):
            mlflow.log_metric(key=identifier, value=item, step=step)
        if kind == "artifact":
            if not isinstance(item, str):
@@ -1019,9 +1061,8 @@ def __init__(
            log_freq=log_freq,
            log_on_workers=log_on_workers,
        )
-        self.name = experiment_name
-        self.version = None
        self.prov_user_namespace = prov_user_namespace
+        self.experiment_name = experiment_name
        self.provenance_save_dir = provenance_save_dir
        self.save_after_n_logs = save_after_n_logs
        self.create_graph = create_graph
@@ -1043,7 +1084,7 @@ def create_logger_context(self, rank: Optional[int] = None):

        prov4ml.start_run(
            prov_user_namespace=self.prov_user_namespace,
-            experiment_name=self.name,
+            experiment_name=self.experiment_name,
            provenance_save_dir=self.provenance_save_dir,
            save_after_n_logs=self.save_after_n_logs,
            # This class will control which workers can log
@@ -1139,8 +1180,7 @@ def log(
            prov4ml.log_param(key=identifier, value=item)
        elif kind == "prov_documents":
            prov_docs = prov4ml.log_provenance_documents(
-                create_graph=True,
-                create_svg=True
+                create_graph=True, create_svg=True
            )

# Upload to MLFlow
@@ -1158,7 +1198,6 @@ def __init__(
    ) -> None:
        if isinstance(save_path, str):
            save_path = Path(save_path)
-
        self.save_path: Path = save_path
        self.strategy_name = strategy_name
        self.num_nodes = num_nodes
1 change: 1 addition & 0 deletions src/itwinai/tests/sanity_check.py
@@ -34,6 +34,7 @@
    'itwinai.torch.reproducibility',
    'itwinai.torch.trainer',
    'itwinai.torch.type',
    'itwinai.torch.loggers',
]

tensorflow_modules = [