diff --git a/scripts/benchmark/trainer-benchmark.py b/scripts/benchmark/trainer-benchmark.py
new file mode 100755
index 00000000000000..903b4e0dd6d500
--- /dev/null
+++ b/scripts/benchmark/trainer-benchmark.py
@@ -0,0 +1,448 @@
+#!/usr/bin/env python
+
+# HF Trainer benchmarking tool
+#
+# This tool can be used to run and compare multiple dimensions of the HF Trainer's args.
+#
+# It then prints a report once in github format with all the information that needs to be shared
+# with others, and a second time in a console-friendly format, so it's easier to use for tuning things up.
+#
+# The main idea is:
+#
+# ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
+# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
+# --target-metric-key train_samples_per_second
+#
+# The variations can be any command line argument that you want to compare and not just dtype as in
+# the example.
+#
+# --variations allows you to compare variations in multiple dimensions.
+#
+# As the first dimension has 2 options and the second has 3 in our example, this will run the trainer 6
+# times, adding one of:
+#
+# 1. --tf32 0 --fp16 0
+# 2. --tf32 0 --fp16 1
+# 3. --tf32 0 --bf16 1
+# 4. --tf32 1 --fp16 0
+# 5. --tf32 1 --fp16 1
+# 6. --tf32 1 --bf16 1
+#
+# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
+#
+# If you want to rely on defaults, this:
+#    --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
+# is identical to this:
+#    --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
+#
+# The leading empty variation in the 2nd dimension is a valid variation.
+#
+# So here we get the following 6 variations:
+#
+# 1. --tf32 0
+# 2. --tf32 0 --fp16
+# 3. --tf32 0 --bf16
+# 4. --tf32 1
+# 5. --tf32 1 --fp16
+# 6. --tf32 1 --bf16
+#
+# In this particular case we don't know what the default tf32 setting is (as it's normally
+# pytorch-version dependent). That's why it's best to do an explicit setting of each variation:
+# `--tf32 0|--tf32 1`
+#
+# Here is a full example of a training run:
+#
+# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
+# --base-cmd \
+# ' examples/pytorch/translation/run_translation.py --model_name_or_path t5-small \
+# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
+# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
+# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
+# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
+# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
+# --max_train_samples 20000 --dataloader_num_workers 2 ' \
+# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
+# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
+# --base-variation '--tf32 0'
+#
+# and here is a possible output:
+#
+#
+# | Variation       |     Train |   Diff |   Train |
+# |                 |   samples |      % |    loss |
+# |                 |       per |        |         |
+# |                 |    second |        |         |
+# |:----------------|----------:|-------:|--------:|
+# | --tf32 0        |    285.11 |      0 |    2.51 |
+# | --tf32 1        |    342.09 |     20 |    2.51 |
+# | --fp16 --tf32 0 |    423.49 |     49 |    2.51 |
+# | --fp16 --tf32 1 |    423.13 |     48 |    2.51 |
+# | --bf16 --tf32 0 |    416.80 |     46 |    2.52 |
+# | --bf16 --tf32 1 |    415.87 |     46 |    2.52 |
+#
+#
+# So you can quickly compare the different outcomes.
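+#
+# The expansion of the variation dimensions is nothing more than a cartesian product. As a
+# standalone sketch (this just mirrors what main() below does with itertools.product):
+#
+#   import itertools
+#   dims = [["--tf32 0", "--tf32 1"], ["", "--fp16", "--bf16"]]
+#   variations = [" ".join(x).strip() for x in itertools.product(*dims)]
+#   # -> ['--tf32 0', '--tf32 0 --fp16', '--tf32 0 --bf16',
+#   #     '--tf32 1', '--tf32 1 --fp16', '--tf32 1 --bf16']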
+#
+# Typically running each experiment once is enough, but if the environment is unstable you can
+# re-run each experiment multiple times, e.g., 3 times using --repeat-times 3, and the averaged
+# results will be reported.
+#
+# By default it'll use the lowest result as the baseline (100%) and then compare the rest to it, as
+# can be seen from the table above, but you can also specify which combination to use as the
+# baseline, e.g., to switch to another entry use: --base-variation '--tf32 1 --fp16 0'
+#
+# --target-metric-key is there to tell the program which metric to compare - the different metric keys
+# are inside output_dir/all_results.json. E.g., to measure eval performance instead of train use:
+#    --target-metric-key eval_samples_per_second
+# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
+# well (as currently it doesn't).
+#
+
+import argparse
+import datetime
+import io
+import itertools
+import json
+import math
+import os
+import platform
+import re
+import shlex
+import subprocess
+import sys
+from pathlib import Path
+from statistics import fmean
+
+import pandas as pd
+import torch
+from tqdm import tqdm
+
+import transformers
+
+
+nan = float("nan")
+
+
+class Tee:
+    """
+    A helper class to tee print's output into a file.
+    Usage:
+        sys.stdout = Tee(filename)
+    """
+
+    def __init__(self, filename):
+        self.stdout = sys.stdout
+        self.file = open(filename, "a")
+
+    def __getattr__(self, attr):
+        return getattr(self.stdout, attr)
+
+    def write(self, msg):
+        self.stdout.write(msg)
+        # strip tqdm codes
+        self.file.write(re.sub(r"^.*\r", "", msg, flags=re.M))
+
+
+def get_original_command(max_width=80, full_python_path=False):
+    """
+    Return the original command line string that can be replayed nicely, wrapped to `max_width` chars.
+
+    Args:
+        max_width (`int`, `optional`, defaults to 80):
+            The width to wrap for.
+        full_python_path (`bool`, `optional`, defaults to `False`):
+            Whether to replicate the full path or just the last segment (i.e. `python`).
+    """
+
+    cmd = []
+
+    # deal with critical env vars
+    env_keys = ["CUDA_VISIBLE_DEVICES"]
+    for key in env_keys:
+        val = os.environ.get(key, None)
+        if val is not None:
+            cmd.append(f"{key}={val}")
+
+    # python executable (not always needed if the script is executable)
+    python = sys.executable if full_python_path else sys.executable.split("/")[-1]
+    cmd.append(python)
+
+    # now the normal args
+    cmd += list(map(shlex.quote, sys.argv))
+
+    # split up into up to max_width lines with shell multi-line escapes
+    lines = []
+    current_line = ""
+    while len(cmd) > 0:
+        current_line += f"{cmd.pop(0)} "
+        if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
+            lines.append(current_line)
+            current_line = ""
+    return "\\\n".join(lines)
+
+
+def get_base_command(args, output_dir):
+
+    # unwrap multi-line input
+    args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)
+
+    # remove --output_dir if any and set our own
+    args.base_cmd = re.sub(r"--output_dir\s+[^\s]+", "", args.base_cmd)
+    args.base_cmd += f" --output_dir {output_dir}"
+
+    # ensure we have --overwrite_output_dir
+    args.base_cmd = re.sub(r"--overwrite_output_dir\s+", "", args.base_cmd)
+    args.base_cmd += " --overwrite_output_dir"
+
+    return [sys.executable] + shlex.split(args.base_cmd)
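+
+
+# Illustration of the helper above (hypothetical values, not the output of an actual run): for
+#   args.base_cmd = "run_translation.py --model_name_or_path t5-small --output_dir /tmp/old"
+# and output_dir = "output_benchmark", get_base_command() returns roughly
+#   [sys.executable, "run_translation.py", "--model_name_or_path", "t5-small",
+#    "--output_dir", "output_benchmark", "--overwrite_output_dir"]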
+
+
+def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
+
+    # Enable the branch below (change `if 0` to `if 1`) to debug everything but the run itself -
+    # it is fast and shows the progress. This is useful for debugging the output formatting
+    # quickly - we can remove it later once everybody is happy with the output.
+    if 0:
+        import random
+        from time import sleep
+
+        sleep(0)
+        return dict(
+            {k: random.uniform(0, 100) for k in metric_keys},
+            **{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
+        )
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+
+    if verbose:
+        print("STDOUT", result.stdout)
+        print("STDERR", result.stderr)
+
+    # save the streams
+    prefix = variation.replace(" ", "-")
+    with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
+        f.write(result.stdout)
+    with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
+        f.write(result.stderr)
+
+    if result.returncode != 0:
+        if verbose:
+            print("failed")
+        return {target_metric_key: nan}
+
+    with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
+        metrics = json.load(f)
+
+    # filter out just the keys we want
+    return {k: v for k, v in metrics.items() if k in metric_keys}
+
+
+def process_run(
+    id,
+    cmd,
+    variation_key,
+    variation,
+    longest_variation_len,
+    target_metric_key,
+    report_metric_keys,
+    repeat_times,
+    output_dir,
+    verbose,
+):
+    results = []
+    metrics = []
+    preamble = f"{id}: {variation:<{longest_variation_len}}"
+    outcome = f"{preamble}: "
+    metric_keys = set(report_metric_keys + [target_metric_key])
+    for i in tqdm(range(repeat_times), desc=preamble, leave=False):
+        single_run_metrics = process_run_single(
+            id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
+        )
+        result = single_run_metrics[target_metric_key]
+        if not math.isnan(result):
+            metrics.append(single_run_metrics)
+            results.append(result)
+            outcome += "✓"
+        else:
+            outcome += "✘"
+    outcome = f"\33[2K\r{outcome}"
+    if len(metrics) > 0:
+        mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
+        mean_target = round(mean_metrics[target_metric_key], 2)
+        results_str = f"{outcome} {mean_target}"
+        if len(metrics) > 1:
+            results_str += f" {tuple(round(x, 2) for x in results)}"
+        print(results_str)
+        mean_metrics[variation_key] = variation
+        return mean_metrics
+    else:
+        print(outcome)
+        return {variation_key: variation, target_metric_key: nan}
+
+
+def get_versions():
+    properties = torch.cuda.get_device_properties(torch.device("cuda"))
+    return f"""
+Datetime    : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+Software:
+transformers: {transformers.__version__}
+torch       : {torch.__version__}
+cuda        : {torch.version.cuda}
+python      : {platform.python_version()}
+
+Hardware:
+{torch.cuda.device_count()} GPUs      : {properties.name}, {properties.total_memory/2**30:0.2f}GB
+"""
+
+
+def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
+
+    df = pd.DataFrame(results)
+    variation_key = "variation"
+    diff_key = "diff_%"
+
+    sentinel_value = nan
+    if base_variation is not None and len(df[df[variation_key] == base_variation]):
+        # this may still return nan
+        sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
+    if math.isnan(sentinel_value):
+        # as a fallback, use the minimal non-nan value as the sentinel
+        sentinel_value = df.loc[df[target_metric_key].notna()][target_metric_key].min()
+
+    # create diff column if possible
+    if not math.isnan(sentinel_value):
+        df[diff_key] = df.apply(
+            lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
+            if not math.isnan(r[target_metric_key])
+            else 0,
+            axis="columns",
+        )
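+        # Worked example (using the numbers from the sample table in the header): with --tf32 0
+        # (285.11 samples/sec) as the baseline, --tf32 1 at 342.09 samples/sec gives
+        # round(100 * (342.09 - 285.11) / 285.11) == 20, i.e. +20%.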
+
+    # re-order columns
+    cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
+    df = df.reindex(cols, axis="columns")
+
+    # capitalize
+    df = df.rename(str.capitalize, axis="columns")
+
+    # make the cols as narrow as possible
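+    # (github markdown renders "<br>" inside a table cell as a line break; the console variant
+    # below uses a literal "\n" for the same effect)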
"), axis="columns") + df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns") + + report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")] + report += ["```"] + report += ["*** Setup:", get_versions()] + report += ["*** The benchmark command line was:", get_original_command()] + report += ["```"] + report += ["----------8<-----------------8<--------"] + report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")] + + print("\n\n".join(report)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base-cmd", + default=None, + type=str, + required=True, + help="Base cmd", + ) + parser.add_argument( + "--variations", + default=None, + type=str, + nargs="+", + required=True, + help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'", + ) + parser.add_argument( + "--base-variation", + default=None, + type=str, + help="Baseline variation to compare to. if None the minimal target value will be used to compare against", + ) + parser.add_argument( + "--target-metric-key", + default=None, + type=str, + required=True, + help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second", + ) + parser.add_argument( + "--report-metric-keys", + default="", + type=str, + help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples", + ) + parser.add_argument( + "--repeat-times", + default=1, + type=int, + help="How many times to re-run each variation - an average will be reported", + ) + parser.add_argument( + "--output_dir", + default="output_benchmark", + type=str, + help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked", + ) + parser.add_argument( + "--verbose", + default=False, + action="store_true", + help="Whether to show the outputs of each run or just the benchmark progress", + ) + args = parser.parse_args() + + output_dir = args.output_dir + Path(output_dir).mkdir(exist_ok=True) + base_cmd = get_base_command(args, output_dir) + + # split each dimension into its --foo variations + dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations] + # build a cartesian product of dimensions and convert those back into cmd-line arg strings, + # while stripping white space for inputs that were empty + variations = list(map(str.strip, map(" ".join, itertools.product(*dims)))) + longest_variation_len = max(len(x) for x in variations) + + # split wanted keys + report_metric_keys = args.report_metric_keys.split() + + # capture prints into a log file for convenience + report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt" + print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt") + print(f"and this script's output is also piped into {report_fn}") + + sys.stdout = Tee(report_fn) + + print(f"\n*** Running {len(variations)} benchmarks:") + print(f"Base command: {' '.join(base_cmd)}") + + variation_key = "variation" + results = [] + for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)): + cmd = base_cmd + variation.split() + results.append( + process_run( + id + 1, + cmd, + variation_key, + variation, + 
+                longest_variation_len,
+                args.target_metric_key,
+                report_metric_keys,
+                args.repeat_times,
+                output_dir,
+                args.verbose,
+            )
+        )
+
+    process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)
+
+
+if __name__ == "__main__":
+    main()