@@ -1,5 +1,6 @@
 import json
 import math
+from datetime import datetime
 
 from art.utils.deploy_model import (
     LoRADeploymentJob,
@@ -241,7 +242,7 @@ async def _delete_checkpoints(
             )
             steps_to_keep.append(best_step)
         except FileNotFoundError:
-            pass
+            print(f'"{output_dir}/history.jsonl" not found')
         except pl.exceptions.ColumnNotFoundError:
             print(f'No "{benchmark}" metric found in history')
         delete_checkpoints(output_dir, steps_to_keep)
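For context on the two `except` clauses above: the `try` block evidently loads `history.jsonl` with polars and picks the checkpoint with the best value for the benchmark metric. A minimal sketch of what that lookup might look like (the helper name `best_step_from_history` is hypothetical; `output_dir` and `benchmark` are taken from the diff, and higher-is-better is assumed):

```python
import polars as pl

def best_step_from_history(output_dir: str, benchmark: str) -> int:
    """Hypothetical sketch of the lookup guarded by the try/except above."""
    # Raises FileNotFoundError if history.jsonl has not been written yet
    history = pl.read_ndjson(f"{output_dir}/history.jsonl")
    # Raises pl.exceptions.ColumnNotFoundError if the metric was never logged
    return int(
        history.filter(pl.col(benchmark).is_not_null())
        .sort(benchmark, descending=True)  # assumes higher is better
        .get_column("step")[0]
    )
```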
@@ -273,7 +274,7 @@ async def _log(
         os.makedirs(parent_dir, exist_ok=True)
 
-        # Get the file name for the current iteration, or default to 0 for non-trainable models
-        iteration = self.__get_step(model) if isinstance(model, TrainableModel) else 0
+        # Get the file name for the current iteration
+        iteration = self.__get_step(model)
         file_name = f"{iteration:04d}.jsonl"
 
         # Write the logs to the file
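For reference, the zero-padded name the format string produces; after this change every model, trainable or not, takes its log file name from `__get_step`:

```python
iteration = 3
file_name = f"{iteration:04d}.jsonl"
print(file_name)  # "0003.jsonl"
```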
@@ -443,11 +444,20 @@ def _log_metrics(
         step: int | None = None,
     ) -> None:
         metrics = {f"{split}/{metric}": value for metric, value in metrics.items()}
-        step = (
-            step
-            if step is not None
-            else (self.__get_step(model) if isinstance(model, TrainableModel) else 0)
-        )
+        step = step if step is not None else self.__get_step(model)
+
+        with open(
+            f"{get_model_dir(model=model, art_path=self._path)}/history.jsonl", "a"
+        ) as f:
+            f.write(
+                json.dumps(
+                    {
+                        k: v for k, v in metrics.items() if v == v
+                    }  # Filter out NaN values
+                    | {"step": step, "recorded_at": datetime.now().isoformat()}
+                )
+                + "\n"
+            )
 
         # If we have a W&B run, log the data there
         if run := self._get_wandb_run(model):
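The `v == v` comprehension is the standard NaN filter: NaN is the only float that compares unequal to itself, so those keys are dropped before serialization (which matters because `json.dumps` would otherwise emit the non-standard `NaN` token). A minimal sketch of the record that lands in `history.jsonl` (metric names and step value are illustrative):

```python
import json
import math
from datetime import datetime

metrics = {"val/reward": 0.82, "val/loss": float("nan")}
record = {k: v for k, v in metrics.items() if v == v} | {
    "step": 12,
    "recorded_at": datetime.now().isoformat(),
}
# No NaN survives the filter, so the output is valid JSON
assert not any(isinstance(v, float) and math.isnan(v) for v in record.values())
print(json.dumps(record))
# e.g. {"val/reward": 0.82, "step": 12, "recorded_at": "..."}
```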