diff --git a/docs/benchmark.md b/docs/benchmark.md
index 4109382..7e52d27 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -103,12 +103,36 @@ export ACCESS_TOKEN=
 
 #### Usage
 
+**Run benchmark on GPU (if available)**
 Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) results file:
 ```
 python ./scripts/benchmark.py
 ```
-Lauch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
+**Run benchmark on CPU**
+
+To run the benchmark on your CPU, set `CUDA_VISIBLE_DEVICES` to an empty value before running the script, like so:
+```
+export CUDA_VISIBLE_DEVICES=
+python ./scripts/benchmark.py
+```
+
+**Benchmark config options**
+
+The following flags can be set on the `benchmark.py` script:
+* `--samples` sets the sample sizes to benchmark, passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
+* `--steps` sets the number of inference steps and is passed as an integer value, e.g. `50`. Default is `40`.
+* `--repeats` sets the number of times to repeat each run with a given set of parameter values before reporting the average inference latency. It is passed as an integer value, e.g. `2`. Default is `3`.
+* `--autocast` sets whether to add CUDA autocast runs to the benchmark (`yes` or `no`). Default is `no`.
+
+An example of running the benchmark script with options set:
+```
+python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
+```
+
+**Compare output of single-precision and half-precision**
+
+Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
 ```
 python ./scripts/benchmark_quality.py
 ```
diff --git a/requirements.txt b/requirements.txt
index 9089b38..82b6f5f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
-diffusers==0.3.0
+diffusers==0.4.2
 onnxruntime==1.12.1
 scikit-image==0.19.3
 -e .
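As a side note on the flag handling documented above: the argument parsing inside `benchmark.py` is not part of this diff, so the following is only a minimal sketch of how such flags are typically wired up with `argparse`. The `parse_args` helper and its exact defaults here are illustrative assumptions, not the script's actual code:
```
# Illustrative sketch only: the real parser in benchmark.py is not shown in this diff.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Stable Diffusion benchmark options")
    # --samples is a comma-separated list of sample sizes, e.g. "1,2,4,8,16".
    parser.add_argument("--samples", type=str, default="1")
    # --steps is the number of inference steps per run.
    parser.add_argument("--steps", type=int, default=40)
    # --repeats is how many times each configuration is repeated before averaging.
    parser.add_argument("--repeats", type=int, default=3)
    # --autocast toggles extra CUDA autocast runs.
    parser.add_argument("--autocast", type=str, default="no", choices=["yes", "no"])
    args = parser.parse_args()
    # Turn the comma-separated sample sizes into a list of ints.
    args.samples = [int(s) for s in args.samples.split(",")]
    return args


if __name__ == "__main__":
    print(parse_args())
```
Running this sketch with `--samples=1,2,4 --steps=50` would yield `samples=[1, 2, 4]` and `steps=50` in the parsed namespace, matching the comma-separated convention described above.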
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 4e94b8d..480c25c 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,7 +10,6 @@ from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-
 prompt = "a photo of an astronaut riding a horse on mars"
 
 
@@ -30,8 +29,8 @@ def get_inference_pipeline(precision, backend):
             revision="main" if precision == "single" else "fp16",
             use_auth_token=os.environ["ACCESS_TOKEN"],
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
-        )
-        pipe = pipe.to(device)
+        ).to(device)
+
     else:
         pipe = StableDiffusionOnnxPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
@@ -42,23 +41,18 @@ def get_inference_pipeline(precision, backend):
             else "CUDAExecutionProvider",
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
-    # Disable safety
-    disable_safety = True
-    if disable_safety:
+    def null_safety(images, **kwargs):
+        return images, False
 
-        def null_safety(images, **kwargs):
-            return images, False
+    pipe.safety_checker = null_safety
 
-        pipe.safety_checker = null_safety
 
     return pipe
 
 
 def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     torch.cuda.empty_cache()
-    context = (
-        autocast if (device.type == "cuda" and use_autocast) else nullcontext
-    )
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
     with context("cuda"):
         images = pipe(
             prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
@@ -67,9 +61,7 @@ def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     return images
 
 
-def get_inference_time(
-    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
-):
+def get_inference_time(pipe, n_samples, n_repeats, use_autocast, num_inference_steps):
     from torch.utils.benchmark import Timer
 
     timer = Timer(
@@ -121,14 +113,14 @@ def run_benchmark(
     logs = {
         "memory": 0.00
        if device.type == "cpu"
-        else get_inference_memory(
-            pipe, n_samples, use_autocast, num_inference_steps
-        ),
+        else get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps),
         "latency": get_inference_time(
             pipe, n_samples, n_repeats, use_autocast, num_inference_steps
         ),
     }
 
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
+    print(
+        f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}"
+    )
     print(logs, "\n")
     return logs
@@ -181,43 +173,45 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            use_autocast = False
-            if precision == "half":
-                for autocast in grid["autocast"]:
-                    if autocast == "yes":
-                        use_autocast = True
-            for backend in grid["backend"]:
-                try:
-                    new_log = run_benchmark(
-                        n_repeats=n_repeats,
-                        n_samples=n_samples,
-                        precision=precision,
-                        use_autocast=use_autocast,
-                        backend=backend,
-                        num_inference_steps=num_inference_steps,
-                    )
-                except Exception as e:
-                    if "CUDA out of memory" in str(
-                        e
-                    ) or "Failed to allocate memory" in str(e):
-                        print(str(e))
-                        torch.cuda.empty_cache()
-                        new_log = {"latency": -1.00, "memory": -1.00}
-                    else:
-                        raise e
-
-                latency = new_log["latency"]
-                memory = new_log["memory"]
-                new_row = [
-                    device_desc,
-                    precision,
-                    autocast,
-                    backend,
-                    n_samples,
-                    latency,
-                    memory,
-                ]
-                writer.writerow(new_row)
+            # restrict enabling autocast to half precision
+            if precision == "single":
+                use_autocast_vals = ("no",)
+            else:
+                use_autocast_vals = grid["autocast"]
+            for use_autocast_val in use_autocast_vals:
+                use_autocast = use_autocast_val == "yes"
+                for backend in grid["backend"]:
+                    try:
+                        new_log = run_benchmark(
+                            n_repeats=n_repeats,
+                            n_samples=n_samples,
+                            precision=precision,
+                            use_autocast=use_autocast,
+                            backend=backend,
+                            num_inference_steps=num_inference_steps,
+                        )
+                    except Exception as e:
+                        if "CUDA out of memory" in str(
+                            e
+                        ) or "Failed to allocate memory" in str(e):
+                            print(str(e))
+                            torch.cuda.empty_cache()
+                            new_log = {"latency": -1.00, "memory": -1.00}
+                        else:
+                            raise e
+
+                    latency = new_log["latency"]
+                    memory = new_log["memory"]
+                    new_row = [
+                        device_desc,
+                        precision,
+                        use_autocast,
+                        backend,
+                        n_samples,
+                        latency,
+                        memory,
+                    ]
+                    writer.writerow(new_row)
 
 
 if __name__ == "__main__":