From 372a3275d29bb79e42b5c69c65e20cf04ee2ae54 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 09:16:10 -0700
Subject: [PATCH 1/5] cleaner autocast logic, benchmark.md with flag instructions

---
 docs/benchmark.md    | 13 ++++++++++++-
 scripts/benchmark.py |  4 +---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/benchmark.md b/docs/benchmark.md
index 4109382..269b75c 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -108,7 +108,18 @@ Launch the `benchmark.py` script to append benchmark results to the existing [be
 python ./scripts/benchmark.py
 ```
 
-Lauch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
+The following flags can be set on the `benchmark.py` script:
+* `--samples` sets the sample sizes for which to run the benchmark, passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
+* `--steps` sets the number of inference steps, passed as an integer value, e.g. `50`. Default is `40`.
+* `--repeats` sets the number of times to repeat each run with a given set of parameter values before reporting the average inference latency. It is passed as an integer value, e.g. `2`. Default is `3`.
+* `--autocast` sets whether to add CUDA autocast runs to the benchmark (`yes` or `no`). Default is `no`.
+
+An example of running the benchmark script with options set:
+```
+python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
+```
+
+Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
 ```
 python ./scripts/benchmark_quality.py
 ```
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 4e94b8d..925b760 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -181,11 +181,9 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            use_autocast = False
             if precision == "half":
                 for autocast in grid["autocast"]:
-                    if autocast == "yes":
-                        use_autocast = True
+                    use_autocast = (autocast == "yes")
                     for backend in grid["backend"]:
                         try:
                             new_log = run_benchmark(
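
The flags documented above map naturally onto a small command-line parser. The sketch below is only an illustration of how `scripts/benchmark.py` might consume them: the flag names and defaults mirror the documentation added in this patch, while the `parse_args` helper and everything else in it are assumptions rather than code taken from the repository.

```python
# Illustrative sketch (not part of the patch series): parsing the documented flags.
# Flag names and defaults follow docs/benchmark.md; the rest is hypothetical.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Stable Diffusion benchmark options")
    # Comma-separated sample sizes, e.g. "1,2,4,8,16"
    parser.add_argument("--samples", type=str, default="1")
    # Number of inference steps per run
    parser.add_argument("--steps", type=int, default=40)
    # How many times each configuration is timed before averaging
    parser.add_argument("--repeats", type=int, default=3)
    # Whether to add CUDA autocast runs to the grid
    parser.add_argument("--autocast", type=str, default="no", choices=["yes", "no"])
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    n_samples = [int(s) for s in args.samples.split(",")]  # "1,2,4" -> [1, 2, 4]
    print(n_samples, args.steps, args.repeats, args.autocast)
```
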
From ba987189ae3cb2ed1a6c49d978c4d289281846c5 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 09:35:47 -0700
Subject: [PATCH 2/5] fix gross mistake in autocast logic

---
 scripts/benchmark.py | 69 ++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 925b760..1ec6d40 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -182,40 +182,41 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
             if precision == "half":
-                for autocast in grid["autocast"]:
-                    use_autocast = (autocast == "yes")
-                    for backend in grid["backend"]:
-                        try:
-                            new_log = run_benchmark(
-                                n_repeats=n_repeats,
-                                n_samples=n_samples,
-                                precision=precision,
-                                use_autocast=use_autocast,
-                                backend=backend,
-                                num_inference_steps=num_inference_steps,
-                            )
-                        except Exception as e:
-                            if "CUDA out of memory" in str(
-                                e
-                            ) or "Failed to allocate memory" in str(e):
-                                print(str(e))
-                                torch.cuda.empty_cache()
-                                new_log = {"latency": -1.00, "memory": -1.00}
-                            else:
-                                raise e
-
-                        latency = new_log["latency"]
-                        memory = new_log["memory"]
-                        new_row = [
-                            device_desc,
-                            precision,
-                            autocast,
-                            backend,
-                            n_samples,
-                            latency,
-                            memory,
-                        ]
-                        writer.writerow(new_row)
+                use_autocast = (autocast == "yes")
+            else:
+                use_autocast = False
+            for backend in grid["backend"]:
+                try:
+                    new_log = run_benchmark(
+                        n_repeats=n_repeats,
+                        n_samples=n_samples,
+                        precision=precision,
+                        use_autocast=use_autocast,
+                        backend=backend,
+                        num_inference_steps=num_inference_steps,
+                    )
+                except Exception as e:
+                    if "CUDA out of memory" in str(
+                        e
+                    ) or "Failed to allocate memory" in str(e):
+                        print(str(e))
+                        torch.cuda.empty_cache()
+                        new_log = {"latency": -1.00, "memory": -1.00}
+                    else:
+                        raise e
+
+                latency = new_log["latency"]
+                memory = new_log["memory"]
+                new_row = [
+                    device_desc,
+                    precision,
+                    autocast,
+                    backend,
+                    n_samples,
+                    latency,
+                    memory,
+                ]
+                writer.writerow(new_row)
 
 
 if __name__ == "__main__":
From 3c6362fa9d85594f470d6875fd6796a8bcf0c606 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:51:40 -0700
Subject: [PATCH 3/5] add benchmark.py config options to .md, update req.txt for fp16 to work without autocast, fix benchmark code for cases when to use autocast

---
 docs/benchmark.md    |  13 ++++++
 requirements.txt     |   2 +
 scripts/benchmark.py | 103 ++++++++++++++++++++-----------------------
 3 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/docs/benchmark.md b/docs/benchmark.md
index 269b75c..7e52d27 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -103,11 +103,22 @@ export ACCESS_TOKEN=
 
 #### Usage
 
+**Run benchmark on GPU (if available)**
 Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) results file:
 ```
 python ./scripts/benchmark.py
 ```
 
+**Run benchmark on CPU**
+
+To run the benchmark on your CPU, clear `CUDA_VISIBLE_DEVICES` before launching the script, like so:
+```
+export CUDA_VISIBLE_DEVICES=
+python ./scripts/benchmark.py
+```
+
+**Benchmark config options**
+
 The following flags can be set on the `benchmark.py` script:
 * `--samples` sets the sample sizes for which to run the benchmark, passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
 * `--steps` sets the number of inference steps, passed as an integer value, e.g. `50`. Default is `40`.
@@ -119,6 +130,8 @@ An example of running the benchmark script with options set:
 python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
 ```
 
+**Compare output of single-precision and half-precision**
+
 Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
 ```
 python ./scripts/benchmark_quality.py
diff --git a/requirements.txt b/requirements.txt
index 9089b38..bf6a343 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
+torchvision==0.13.1
+torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 1ec6d40..480c25c 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,7 +10,6 @@ from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-
 prompt = "a photo of an astronaut riding a horse on mars"
 
 
@@ -30,8 +29,8 @@ def get_inference_pipeline(precision, backend):
             revision="main" if precision == "single" else "fp16",
             use_auth_token=os.environ["ACCESS_TOKEN"],
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
-        )
-        pipe = pipe.to(device)
+        ).to(device)
+
     else:
         pipe = StableDiffusionOnnxPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
@@ -42,23 +41,18 @@ def get_inference_pipeline(precision, backend):
             else "CUDAExecutionProvider",
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
-    # Disable safety
-    disable_safety = True
 
-    if disable_safety:
+    def null_safety(images, **kwargs):
+        return images, False
 
-        def null_safety(images, **kwargs):
-            return images, False
+    pipe.safety_checker = null_safety
 
-        pipe.safety_checker = null_safety
     return pipe
 
 
 def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     torch.cuda.empty_cache()
-    context = (
-        autocast if (device.type == "cuda" and use_autocast) else nullcontext
-    )
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
     with context("cuda"):
         images = pipe(
             prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
@@ -67,9 +61,7 @@ def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     return images
 
 
-def get_inference_time(
-    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
-):
+def get_inference_time(pipe, n_samples, n_repeats, use_autocast, num_inference_steps):
     from torch.utils.benchmark import Timer
 
     timer = Timer(
@@ -121,14 +113,14 @@ def run_benchmark(
     logs = {
         "memory": 0.00
         if device.type == "cpu"
-        else get_inference_memory(
-            pipe, n_samples, use_autocast, num_inference_steps
-        ),
+        else get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps),
         "latency": get_inference_time(
             pipe, n_samples, n_repeats, use_autocast, num_inference_steps
         ),
     }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
+    print(
+        f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}"
+    )
     print(logs, "\n")
     return logs
 
@@ -181,42 +173,45 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            if precision == "half":
-                use_autocast = (autocast == "yes")
+            # restrict enabling autocast to half precision
+            if precision == "single":
+                use_autocast_vals = ("no",)
             else:
-                use_autocast = False
-            for backend in grid["backend"]:
-                try:
-                    new_log = run_benchmark(
-                        n_repeats=n_repeats,
-                        n_samples=n_samples,
-                        precision=precision,
-                        use_autocast=use_autocast,
-                        backend=backend,
-                        num_inference_steps=num_inference_steps,
-                    )
-                except Exception as e:
-                    if "CUDA out of memory" in str(
-                        e
-                    ) or "Failed to allocate memory" in str(e):
-                        print(str(e))
-                        torch.cuda.empty_cache()
-                        new_log = {"latency": -1.00, "memory": -1.00}
-                    else:
-                        raise e
-
-                latency = new_log["latency"]
-                memory = new_log["memory"]
-                new_row = [
-                    device_desc,
-                    precision,
-                    autocast,
-                    backend,
-                    n_samples,
-                    latency,
-                    memory,
-                ]
-                writer.writerow(new_row)
+                use_autocast_vals = grid["autocast"]
+            for use_autocast_val in use_autocast_vals:
+                use_autocast = use_autocast_val == "yes"
+                for backend in grid["backend"]:
+                    try:
+                        new_log = run_benchmark(
+                            n_repeats=n_repeats,
+                            n_samples=n_samples,
+                            precision=precision,
+                            use_autocast=use_autocast,
+                            backend=backend,
+                            num_inference_steps=num_inference_steps,
+                        )
+                    except Exception as e:
+                        if "CUDA out of memory" in str(
+                            e
+                        ) or "Failed to allocate memory" in str(e):
+                            print(str(e))
+                            torch.cuda.empty_cache()
+                            new_log = {"latency": -1.00, "memory": -1.00}
+                        else:
+                            raise e
+
+                    latency = new_log["latency"]
+                    memory = new_log["memory"]
+                    new_row = [
+                        device_desc,
+                        precision,
+                        use_autocast,
+                        backend,
+                        n_samples,
+                        latency,
+                        memory,
+                    ]
+                    writer.writerow(new_row)
 
 
 if __name__ == "__main__":
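
The autocast handling this patch converges on is easier to see outside the diff context: autocast values are only swept for half precision, and the inference call is wrapped in `torch.autocast` only when it is both requested and running on CUDA, falling back to `contextlib.nullcontext` otherwise (the script passes `"cuda"` to whichever callable it picked, which `nullcontext` simply returns on enter). Below is a minimal, self-contained sketch of that pattern; the helper names `autocast_values` and `inference_context` are illustrative and do not come from `scripts/benchmark.py`.

```python
# Standalone sketch of the autocast selection pattern from patch 3 (helper names are illustrative).
from contextlib import nullcontext

import torch
from torch import autocast


def autocast_values(precision, grid_autocast=("no", "yes")):
    # Half precision sweeps whatever the grid asks for; single precision never autocasts.
    return grid_autocast if precision == "half" else ("no",)


def inference_context(use_autocast):
    # Wrap the forward pass in autocast only when requested and CUDA is available;
    # otherwise use a no-op context manager.
    if torch.cuda.is_available() and use_autocast:
        return autocast("cuda")
    return nullcontext()


for precision in ("single", "half"):
    for flag in autocast_values(precision):
        use_autocast = flag == "yes"
        with inference_context(use_autocast):
            pass  # the pipeline call would go here
        print(f"precision={precision} use_autocast={use_autocast}")
```
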
From a5cc87dc0b87f9d8f2983d1c74fbe5bc063c44b3 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:55:04 -0700
Subject: [PATCH 4/5] update diffusers version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bf6a343..77585ac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
-diffusers==0.3.0
+diffusers==0.4.2
 onnxruntime==1.12.1
 scikit-image==0.19.3
 -e .

From bb72fa679281f42092576c6f8e6bd4e46bcc3d03 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:55:37 -0700
Subject: [PATCH 5/5] rm torch torchvision from requirements

---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 77585ac..82b6f5f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
-torchvision==0.13.1
-torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
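
Taken together, the requirements changes in patches 3–5 land on `diffusers==0.4.2` with `torch` left unpinned, the point (per the patch 3 subject) being that half-precision weights should run without an autocast wrapper. A rough sanity check along the lines below is one way to confirm that; it assumes a CUDA device, the `ACCESS_TOKEN` variable described in `docs/benchmark.md`, and access to the model weights. It is a sketch, not part of the series.

```python
# Sketch: load the fp16 weights and run one generation with no autocast context.
# Assumes CUDA is available and ACCESS_TOKEN is exported as described in docs/benchmark.md.
import os

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=os.environ["ACCESS_TOKEN"],
).to("cuda")

# No torch.autocast here on purpose: the half-precision weights are used directly.
image = pipe(
    "a photo of an astronaut riding a horse on mars", num_inference_steps=40
).images[0]
image.save("fp16_no_autocast.png")
```
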