35 | 35 | import numpy as np |
36 | 36 | import packaging.version |
37 | 37 |
| 38 | +from transformers.utils.import_utils import _is_package_available |
| 39 | + |
38 | 40 |
39 | 41 | if os.getenv("WANDB_MODE") == "offline": |
40 | 42 | print("⚙️ Running in WANDB offline mode") |
@@ -1043,6 +1045,14 @@ def on_predict(self, args, state, control, metrics, **kwargs): |
1043 | 1045 | class TrackioCallback(TrainerCallback): |
1044 | 1046 | """ |
1045 | 1047 | A [`TrainerCallback`] that logs metrics to Trackio. |
| 1048 | +
| 1049 | + It records training metrics, model (and PEFT) configuration, and GPU memory usage. |
| 1050 | + If `nvidia-ml-py` is installed, GPU power consumption is also tracked. |
| 1051 | +
| 1052 | + **Requires**: |
| 1053 | + ```bash |
| 1054 | + pip install trackio |
| 1055 | + ``` |
1046 | 1056 | """ |
1047 | 1057 |
1048 | 1058 | def __init__(self): |
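
For context (not part of the diff), a minimal usage sketch of the callback documented above. Assumptions are labeled in the comments: `TrackioCallback` is the class defined in this file, and the tiny checkpoint and toy dataset are arbitrary stand-ins chosen only to keep the example self-contained.

```python
# Hedged usage sketch: attach TrackioCallback (defined above) to a standard
# Trainer run. The checkpoint and toy dataset are arbitrary placeholders.
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tok = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
ds = Dataset.from_dict({"text": ["hello", "world"], "label": [0, 1]})
ds = ds.map(lambda ex: tok(ex["text"], truncation=True), batched=True)

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny"),
    args=TrainingArguments(output_dir="out", report_to=[]),  # keep other loggers off
    train_dataset=ds,
    callbacks=[TrackioCallback()],  # logs metrics, config, and GPU stats to Trackio
)
trainer.train()
```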
@@ -1119,12 +1129,14 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs): |
1119 | 1129 | device_idx = torch.cuda.current_device() |
1120 | 1130 | total_memory = torch.cuda.get_device_properties(device_idx).total_memory |
1121 | 1131 | memory_allocated = torch.cuda.memory_allocated(device_idx) |
1122 | | - power = torch.cuda.power_draw(device_idx) |
| 1132 | + |
1123 | 1133 | gpu_memory_logs = { |
1124 | 1134 | f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3), # GB |
1125 | 1135 | f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory, # ratio |
1126 | | - f"gpu/{device_idx}/power": power / 1000, # Watts |
1127 | 1136 | } |
| 1137 | + if _is_package_available("pynvml"): |
| 1138 | + power = torch.cuda.power_draw(device_idx) |
| 1139 | + gpu_memory_logs[f"gpu/{device_idx}/power"] = power / 1000 # Watts |
1128 | 1140 | if dist.is_available() and dist.is_initialized(): |
1129 | 1141 | gathered_logs = [None] * dist.get_world_size() |
1130 | 1142 | dist.all_gather_object(gathered_logs, gpu_memory_logs) |
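
A standalone sketch of the guarded pattern this hunk introduces, for reviewers who want to try it outside the callback. `importlib.util.find_spec` stands in for the private `_is_package_available` helper, and `torch.cuda.power_draw` is assumed to require pynvml (nvidia-ml-py) and to return milliwatts, hence the `/ 1000`.

```python
# Hedged sketch of the optional power logging + cross-rank gather above.
# Assumes a CUDA build of torch; pynvml is optional and gated at runtime.
import importlib.util

import torch
import torch.distributed as dist

def gpu_logs() -> dict:
    idx = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(idx)
    allocated = torch.cuda.memory_allocated(idx)
    logs = {
        f"gpu/{idx}/allocated_memory": allocated / (1024**3),       # GB
        f"gpu/{idx}/memory_usage": allocated / props.total_memory,  # ratio
    }
    if importlib.util.find_spec("pynvml") is not None:  # optional dependency
        logs[f"gpu/{idx}/power"] = torch.cuda.power_draw(idx) / 1000  # mW -> W
    return logs

# In multi-GPU runs each rank reports its own device; merging the per-rank
# dicts mirrors what the diff does with dist.all_gather_object:
if dist.is_available() and dist.is_initialized():
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, gpu_logs())
    merged = {k: v for d in gathered for k, v in d.items()}
```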