diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 5b172b3eb98c..94915a58d124 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -35,6 +35,8 @@ import numpy as np import packaging.version +from transformers.utils.import_utils import _is_package_available + if os.getenv("WANDB_MODE") == "offline": print("⚙️ Running in WANDB offline mode") @@ -1043,6 +1045,14 @@ def on_predict(self, args, state, control, metrics, **kwargs): class TrackioCallback(TrainerCallback): """ A [`TrainerCallback`] that logs metrics to Trackio. + + It records training metrics, model (and PEFT) configuration, and GPU memory usage. + If `nvidia-ml-py` is installed, GPU power consumption is also tracked. + + **Requires**: + ```bash + pip install trackio + ``` """ def __init__(self): @@ -1119,12 +1129,14 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs): device_idx = torch.cuda.current_device() total_memory = torch.cuda.get_device_properties(device_idx).total_memory memory_allocated = torch.cuda.memory_allocated(device_idx) - power = torch.cuda.power_draw(device_idx) + gpu_memory_logs = { f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3), # GB f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory, # ratio - f"gpu/{device_idx}/power": power / 1000, # Watts } + if _is_package_available("pynvml"): + power = torch.cuda.power_draw(device_idx) + gpu_memory_logs[f"gpu/{device_idx}/power"] = power / 1000 # Watts if dist.is_available() and dist.is_initialized(): gathered_logs = [None] * dist.get_world_size() dist.all_gather_object(gathered_logs, gpu_memory_logs)