From 7007bf10db5bbe6502706df936fa920b90c488e3 Mon Sep 17 00:00:00 2001 From: Gabriel Ilharco Date: Mon, 28 Aug 2023 17:39:40 +0000 Subject: [PATCH 1/4] wandb step fix --- src/training/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/train.py b/src/training/train.py index e93d9d370..8aa58a620 100644 --- a/src/training/train.py +++ b/src/training/train.py @@ -225,7 +225,7 @@ def train_one_epoch(model, data, loss, epoch, optimizer, scaler, scheduler, dist tb_writer.add_scalar(name, val, step) if args.wandb: assert wandb is not None, 'Please install wandb.' - wandb.log({name: val, 'step': step}) + wandb.log({name: val}, step=step) # resetting batch / data time meters per log window batch_time_m.reset() From a3e821121f540b55918068dac6d6f5cdc5fb23be Mon Sep 17 00:00:00 2001 From: Gabriel Ilharco Date: Mon, 28 Aug 2023 20:21:06 +0000 Subject: [PATCH 2/4] backwards compat fix --- src/training/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/training/train.py b/src/training/train.py index 8aa58a620..91db2dab2 100644 --- a/src/training/train.py +++ b/src/training/train.py @@ -225,7 +225,7 @@ def train_one_epoch(model, data, loss, epoch, optimizer, scaler, scheduler, dist tb_writer.add_scalar(name, val, step) if args.wandb: assert wandb is not None, 'Please install wandb.' 
- wandb.log({name: val}, step=step) + wandb.log({name: val, 'step': step}, step=step) # resetting batch / data time meters per log window batch_time_m.reset() From d1a0f8d23df77792d2d1a2e630436000f2c65f84 Mon Sep 17 00:00:00 2001 From: Gabriel Ilharco Date: Thu, 28 Sep 2023 18:11:22 +0000 Subject: [PATCH 3/4] update wandb calls --- src/training/train.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/src/training/train.py b/src/training/train.py index 91db2dab2..4877a7850 100644 --- a/src/training/train.py +++ b/src/training/train.py @@ -219,14 +219,17 @@ def train_one_epoch(model, data, loss, epoch, optimizer, scaler, scheduler, dist } log_data.update({name:val.val for name,val in losses_m.items()}) - for name, val in log_data.items(): - name = "train/" + name - if tb_writer is not None: - tb_writer.add_scalar(name, val, step) - if args.wandb: - assert wandb is not None, 'Please install wandb.' - wandb.log({name: val, 'step': step}, step=step) + log_data = {"train/" + name: val for name, val in log_data.items()} + if tb_writer is not None: + for name, val in log_data.items(): + tb_writer.add_scalar(name, val, step) + + if args.wandb: + assert wandb is not None, 'Please install wandb.' 
+ log_data['step'] = step # for backwards compatibility + wandb.log(log_data, step=step) + # resetting batch / data time meters per log window batch_time_m.reset() data_time_m.reset() @@ -317,10 +320,12 @@ def evaluate(model, data, epoch, args, tb_writer=None): + "\t".join([f"{k}: {round(v, 4):.4f}" for k, v in metrics.items()]) ) + log_data = {"val/" + name: val for name, val in metrics.items()} + if args.save_logs: - for name, val in metrics.items(): - if tb_writer is not None: - tb_writer.add_scalar(f"val/{name}", val, epoch) + if tb_writer is not None: + for name, val in log_data.items(): + tb_writer.add_scalar(name, val, epoch) with open(os.path.join(args.checkpoint_path, "results.jsonl"), "a+") as f: f.write(json.dumps(metrics)) @@ -328,8 +333,14 @@ def evaluate(model, data, epoch, args, tb_writer=None): if args.wandb: assert wandb is not None, 'Please install wandb.' - for name, val in metrics.items(): - wandb.log({f"val/{name}": val, 'epoch': epoch}) + if 'train' in data: + dataloader = data['train'].dataloader + num_batches_per_epoch = dataloader.num_batches // args.accum_freq + step = num_batches_per_epoch * epoch + else: + step = None + log_data['epoch'] = epoch + wandb.log(log_data, step=step) return metrics From ea8d66ccd4f0539a537d40d40134634aa0ebcd65 Mon Sep 17 00:00:00 2001 From: Gabriel Ilharco Date: Fri, 29 Sep 2023 19:41:44 +0000 Subject: [PATCH 4/4] update readme --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c25c07db2..7e7192414 100644 --- a/README.md +++ b/README.md @@ -415,11 +415,17 @@ When training a RN50 on YFCC the same hyperparameters as above are used, with th Note that to use another model, like `ViT-B/32` or `RN50x4` or `RN50x16` or `ViT-B/16`, specify with `--model RN50x4`. 
-### Launch tensorboard: +### Logging + +For tensorboard logging, run: ```bash tensorboard --logdir=logs/tensorboard/ --port=7777 ``` +For wandb logging, we recommend looking at the `step` variable instead of `Step`, since the latter was not properly set in earlier versions of this codebase. +For older runs with models trained before https://github.com/mlfoundations/open_clip/pull/613, the `Step` variable should be ignored. +For newer runs, after that PR, the two variables are the same. + ## Evaluation / Zero-Shot We recommend https://github.com/LAION-AI/CLIP_benchmark#how-to-use for systematic evaluation on 40 datasets.