35 | 35 | import numpy as np |
36 | 36 | import packaging.version |
37 | 37 |
| 38 | +from transformers.utils.import_utils import _is_package_available |
| 39 | + |
38 | 40 |
39 | 41 | if os.getenv("WANDB_MODE") == "offline": |
40 | 42 | print("⚙️ Running in WANDB offline mode") |
@@ -1043,6 +1045,14 @@ def on_predict(self, args, state, control, metrics, **kwargs): |
1043 | 1045 | class TrackioCallback(TrainerCallback): |
1044 | 1046 | """ |
1045 | 1047 | A [`TrainerCallback`] that logs metrics to Trackio. |
| 1048 | +
| 1049 | + It records training metrics, model (and PEFT) configuration, and GPU memory usage. |
| 1050 | + If `nvidia-ml-py` is installed, GPU power consumption is also tracked. |
| 1051 | +
| 1052 | + **Requires**: |
| 1053 | + ```bash |
| 1054 | + pip install trackio |
| 1055 | + ``` |
1046 | 1056 | """ |
1047 | 1057 |
1048 | 1058 | def __init__(self): |
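
For context (not part of the diff), a minimal usage sketch of the callback documented above. Assumptions are labeled in the comments: `TrackioCallback` is the class defined in this file, and the tiny checkpoint and toy dataset are arbitrary stand-ins chosen only to keep the example self-contained.

```python
# Hedged usage sketch: attach TrackioCallback (defined above) to a standard
# Trainer run. The checkpoint and toy dataset are arbitrary placeholders.
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tok = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
ds = Dataset.from_dict({"text": ["hello", "world"], "label": [0, 1]})
ds = ds.map(lambda ex: tok(ex["text"], truncation=True), batched=True)

trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny"),
    args=TrainingArguments(output_dir="out", report_to=[]),  # keep other loggers off
    train_dataset=ds,
    callbacks=[TrackioCallback()],  # logs metrics, config, and GPU stats to Trackio
)
trainer.train()
```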
@@ -1119,12 +1129,14 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs): |
1119 | 1129 | device_idx = torch.cuda.current_device() |
1120 | 1130 | total_memory = torch.cuda.get_device_properties(device_idx).total_memory |
1121 | 1131 | memory_allocated = torch.cuda.memory_allocated(device_idx) |
1122 | | - power = torch.cuda.power_draw(device_idx) |
| 1132 | + |
1123 | 1133 | gpu_memory_logs = { |
1124 | 1134 | f"gpu/{device_idx}/allocated_memory": memory_allocated / (1024**3), # GB |
1125 | 1135 | f"gpu/{device_idx}/memory_usage": memory_allocated / total_memory, # ratio |
1126 | | - f"gpu/{device_idx}/power": power / 1000, # Watts |
1127 | 1136 | } |
| 1137 | + if _is_package_available("pynvml"): |
| 1138 | + power = torch.cuda.power_draw(device_idx) |
| 1139 | + gpu_memory_logs[f"gpu/{device_idx}/power"] = power / 1000 # Watts |
1128 | 1140 | if dist.is_available() and dist.is_initialized(): |
1129 | 1141 | gathered_logs = [None] * dist.get_world_size() |
1130 | 1142 | dist.all_gather_object(gathered_logs, gpu_memory_logs) |
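
A standalone sketch of the guarded pattern this hunk introduces, for reviewers who want to try it outside the callback. `importlib.util.find_spec` stands in for the private `_is_package_available` helper, and `torch.cuda.power_draw` is assumed to require pynvml (nvidia-ml-py) and to return milliwatts, hence the `/ 1000`.

```python
# Hedged sketch of the optional power logging + cross-rank gather above.
# Assumes a CUDA build of torch; pynvml is optional and gated at runtime.
import importlib.util

import torch
import torch.distributed as dist

def gpu_logs() -> dict:
    idx = torch.cuda.current_device()
    props = torch.cuda.get_device_properties(idx)
    allocated = torch.cuda.memory_allocated(idx)
    logs = {
        f"gpu/{idx}/allocated_memory": allocated / (1024**3),       # GB
        f"gpu/{idx}/memory_usage": allocated / props.total_memory,  # ratio
    }
    if importlib.util.find_spec("pynvml") is not None:  # optional dependency
        logs[f"gpu/{idx}/power"] = torch.cuda.power_draw(idx) / 1000  # mW -> W
    return logs

# In multi-GPU runs each rank reports its own device; merging the per-rank
# dicts mirrors what the diff does with dist.all_gather_object:
if dist.is_available() and dist.is_initialized():
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, gpu_logs())
    merged = {k: v for d in gathered for k, v in d.items()}
```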