Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/transformers/integrations/integration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
import numpy as np
import packaging.version

from transformers.utils.import_utils import _is_package_available


if os.getenv("WANDB_MODE") == "offline":
print("⚙️ Running in WANDB offline mode")
Expand Down Expand Up @@ -1043,6 +1045,14 @@ def on_predict(self, args, state, control, metrics, **kwargs):
class TrackioCallback(TrainerCallback):
"""
A [`TrainerCallback`] that logs metrics to Trackio.

It records training metrics, model (and PEFT) configuration, and GPU memory usage.
If `nvidia-ml-py` (which provides the `pynvml` module) is installed, GPU power consumption is also tracked.

**Requires**:
```bash
pip install trackio
```
"""

def __init__(self):
Expand Down Expand Up @@ -1119,12 +1129,14 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs):
device_idx = torch.cuda.current_device()
total_memory = torch.cuda.get_device_properties(device_idx).total_memory
memory_allocated = torch.cuda.memory_allocated(device_idx)
power = torch.cuda.power_draw(device_idx)

gpu_memory_logs = {
f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3), # GB
f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory, # ratio
f"gpu/{device_idx}/power": power / 1000, # Watts
}
if _is_package_available("pynvml"):
power = torch.cuda.power_draw(device_idx)
gpu_memory_logs[f"gpu/{device_idx}/power"] = power / 1000 # Watts
if dist.is_available() and dist.is_initialized():
gathered_logs = [None] * dist.get_world_size()
dist.all_gather_object(gathered_logs, gpu_memory_logs)
Expand Down