Skip to content

Commit 538a662

Browse files
authored
Merge pull request #308 from OpenPipe/fix-logs
Remove failing step checks
2 parents 25661ec + 622ebd7 commit 538a662

File tree

1 file changed: +17 −7 lines changed

src/art/local/backend.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
 import json
 import math
+from datetime import datetime
 
 from art.utils.deploy_model import (
     LoRADeploymentJob,
@@ -241,7 +242,7 @@ async def _delete_checkpoints(
             )
             steps_to_keep.append(best_step)
         except FileNotFoundError:
-            pass
+            print(f'"{output_dir}/history.jsonl" not found')
         except pl.exceptions.ColumnNotFoundError:
             print(f'No "{benchmark}" metric found in history')
         delete_checkpoints(output_dir, steps_to_keep)
@@ -273,7 +274,7 @@ async def _log(
         os.makedirs(parent_dir, exist_ok=True)
 
         # Get the file name for the current iteration, or default to 0 for non-trainable models
-        iteration = self.__get_step(model) if isinstance(model, TrainableModel) else 0
+        iteration = self.__get_step(model)
         file_name = f"{iteration:04d}.jsonl"
 
         # Write the logs to the file
@@ -443,11 +444,20 @@ def _log_metrics(
         step: int | None = None,
     ) -> None:
         metrics = {f"{split}/{metric}": value for metric, value in metrics.items()}
-        step = (
-            step
-            if step is not None
-            else (self.__get_step(model) if isinstance(model, TrainableModel) else 0)
-        )
+        step = step if step is not None else self.__get_step(model)
+
+        with open(
+            f"{get_model_dir(model=model, art_path=self._path)}/history.jsonl", "a"
+        ) as f:
+            f.write(
+                json.dumps(
+                    {
+                        k: v for k, v in metrics.items() if v == v
+                    }  # Filter out NaN values
+                    | {"step": step, "recorded_at": datetime.now().isoformat()}
+                )
+                + "\n"
+            )
 
         # If we have a W&B run, log the data there
         if run := self._get_wandb_run(model):

0 commit comments

Comments
 (0)