diff --git a/README.md b/README.md
index c25c07db2..7e7192414 100644
--- a/README.md
+++ b/README.md
@@ -415,11 +415,17 @@ When training a RN50 on YFCC the same hyperparameters as above are used, with th
 
 Note that to use another model, like `ViT-B/32` or `RN50x4` or `RN50x16` or `ViT-B/16`, specify with `--model RN50x4`.
 
-### Launch tensorboard:
+### Logging
+
+For tensorboard logging, run:
 ```bash
 tensorboard --logdir=logs/tensorboard/ --port=7777
 ```
 
+For wandb logging, we recommend looking at the `step` variable instead of `Step`, since the latter was not properly set in earlier versions of this codebase.
+For older runs with models trained before https://github.com/mlfoundations/open_clip/pull/613, the `Step` variable should be ignored.
+For newer runs, after that PR, the two variables are the same.
+
 ## Evaluation / Zero-Shot
 
 We recommend https://github.com/LAION-AI/CLIP_benchmark#how-to-use for systematic evaluation on 40 datasets.
diff --git a/src/training/train.py b/src/training/train.py
index e93d9d370..4877a7850 100644
--- a/src/training/train.py
+++ b/src/training/train.py
@@ -219,14 +219,17 @@ def train_one_epoch(model, data, loss, epoch, optimizer, scaler, scheduler, dist
             }
             log_data.update({name:val.val for name,val in losses_m.items()})
 
-            for name, val in log_data.items():
-                name = "train/" + name
-                if tb_writer is not None:
-                    tb_writer.add_scalar(name, val, step)
-                if args.wandb:
-                    assert wandb is not None, 'Please install wandb.'
-                    wandb.log({name: val, 'step': step})
+            log_data = {"train/" + name: val for name, val in log_data.items()}
+            if tb_writer is not None:
+                for name, val in log_data.items():
+                    tb_writer.add_scalar(name, val, step)
+
+            if args.wandb:
+                assert wandb is not None, 'Please install wandb.'
+                log_data['step'] = step  # for backwards compatibility
+                wandb.log(log_data, step=step)
+
             # resetting batch / data time meters per log window
             batch_time_m.reset()
             data_time_m.reset()
 
@@ -317,10 +320,12 @@ def evaluate(model, data, epoch, args, tb_writer=None):
         + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()])
     )
 
+    log_data = {"val/" + name: val for name, val in metrics.items()}
+
     if args.save_logs:
-        for name, val in metrics.items():
-            if tb_writer is not None:
-                tb_writer.add_scalar(f"val/{name}", val, epoch)
+        if tb_writer is not None:
+            for name, val in log_data.items():
+                tb_writer.add_scalar(name, val, epoch)
 
         with open(os.path.join(args.checkpoint_path, "results.jsonl"), "a+") as f:
             f.write(json.dumps(metrics))
@@ -328,8 +333,14 @@ def evaluate(model, data, epoch, args, tb_writer=None):
 
     if args.wandb:
         assert wandb is not None, 'Please install wandb.'
-        for name, val in metrics.items():
-            wandb.log({f"val/{name}": val, 'epoch': epoch})
+        if 'train' in data:
+            dataloader = data['train'].dataloader
+            num_batches_per_epoch = dataloader.num_batches // args.accum_freq
+            step = num_batches_per_epoch * epoch
+        else:
+            step = None
+        log_data['epoch'] = epoch
+        wandb.log(log_data, step=step)
 
     return metrics
 
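
For reviewers, here is a minimal, self-contained sketch of the step-aligned wandb logging pattern this patch adopts. It is not part of the diff: the project name, loop bounds, and metric values are placeholders, and `num_batches_per_epoch` stands in for the `dataloader.num_batches // args.accum_freq` value that train.py derives from its dataloader.

```python
import wandb

# Offline mode so the sketch runs without a wandb account; drop for real runs.
run = wandb.init(project="open-clip-logging-demo", mode="offline")  # hypothetical project name

num_batches_per_epoch = 100  # placeholder for dataloader.num_batches // args.accum_freq

for epoch in range(2):
    for i in range(num_batches_per_epoch):
        step = num_batches_per_epoch * epoch + i
        log_data = {"train/loss": 1.0 / (step + 1)}  # dummy metric
        log_data["step"] = step  # legacy 'step' key, kept for backwards compatibility
        # Passing step= pins wandb's global Step counter to the training step,
        # so the built-in Step axis and the legacy 'step' key agree.
        wandb.log(log_data, step=step)

    # Eval metrics reuse the training-step counter (here epoch + 1, the number
    # of completed epochs, mirroring how evaluate() is called after an epoch),
    # so val points land on the same x-axis as the train curve.
    eval_step = num_batches_per_epoch * (epoch + 1)
    wandb.log({"val/accuracy": 0.9, "epoch": epoch}, step=eval_step)

run.finish()
```

With this pattern, dashboards built against the legacy `step` key keep working, while newer runs can rely on wandb's native Step axis; after this change the two agree, which is what the README note above documents.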