From 372a3275d29bb79e42b5c69c65e20cf04ee2ae54 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 09:16:10 -0700
Subject: [PATCH 1/5] cleaner autocast logic, benchmark.md with flag instructions

---
 docs/benchmark.md    | 13 ++++++++++++-
 scripts/benchmark.py |  4 +---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/benchmark.md b/docs/benchmark.md
index 4109382..269b75c 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -108,7 +108,18 @@ Launch the `benchmark.py` script to append benchmark results to the existing [be
 python ./scripts/benchmark.py
 ```
 
-Lauch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
+The following flags can be set on the `benchmark.py` script:
+* `--samples` sets the sample sizes for which to run the benchmark, passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
+* `--steps` sets the number of inference steps, passed as an integer value, e.g. `50`. Default is `40`.
+* `--repeats` sets the number of times to repeat each run with a given set of parameter values before reporting the average inference latency. It is passed as an integer value, e.g. `2`. Default is `3`.
+* `--autocast` sets whether to add CUDA autocast runs to the benchmark (`yes` or `no`). Default is `no`.
+
+An example of running the benchmark script with options set:
+```
+python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
+```
+
+Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
 ```
 python ./scripts/benchmark_quality.py
 ```
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 4e94b8d..925b760 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -181,11 +181,9 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            use_autocast = False
             if precision == "half":
                 for autocast in grid["autocast"]:
-                    if autocast == "yes":
-                        use_autocast = True
+                    use_autocast = (autocast == "yes")
                     for backend in grid["backend"]:
                         try:
                             new_log = run_benchmark(
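
The flags documented above map naturally onto a small command-line parser. The sketch below is only an illustration of how `scripts/benchmark.py` might consume them: the flag names and defaults mirror the documentation added in this patch, while the `parse_args` helper and everything else in it are assumptions rather than code taken from the repository.

```python
# Illustrative sketch (not part of the patch series): parsing the documented flags.
# Flag names and defaults follow docs/benchmark.md; the rest is hypothetical.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Stable Diffusion benchmark options")
    # Comma-separated sample sizes, e.g. "1,2,4,8,16"
    parser.add_argument("--samples", type=str, default="1")
    # Number of inference steps per run
    parser.add_argument("--steps", type=int, default=40)
    # How many times each configuration is timed before averaging
    parser.add_argument("--repeats", type=int, default=3)
    # Whether to add CUDA autocast runs to the grid
    parser.add_argument("--autocast", type=str, default="no", choices=["yes", "no"])
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    n_samples = [int(s) for s in args.samples.split(",")]  # "1,2,4" -> [1, 2, 4]
    print(n_samples, args.steps, args.repeats, args.autocast)
```
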
From ba987189ae3cb2ed1a6c49d978c4d289281846c5 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 09:35:47 -0700
Subject: [PATCH 2/5] fix gross mistake in autocast logic

---
 scripts/benchmark.py | 69 ++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 925b760..1ec6d40 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -182,40 +182,41 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
             if precision == "half":
-                for autocast in grid["autocast"]:
-                    use_autocast = (autocast == "yes")
-                    for backend in grid["backend"]:
-                        try:
-                            new_log = run_benchmark(
-                                n_repeats=n_repeats,
-                                n_samples=n_samples,
-                                precision=precision,
-                                use_autocast=use_autocast,
-                                backend=backend,
-                                num_inference_steps=num_inference_steps,
-                            )
-                        except Exception as e:
-                            if "CUDA out of memory" in str(
-                                e
-                            ) or "Failed to allocate memory" in str(e):
-                                print(str(e))
-                                torch.cuda.empty_cache()
-                                new_log = {"latency": -1.00, "memory": -1.00}
-                            else:
-                                raise e
-
-                        latency = new_log["latency"]
-                        memory = new_log["memory"]
-                        new_row = [
-                            device_desc,
-                            precision,
-                            autocast,
-                            backend,
-                            n_samples,
-                            latency,
-                            memory,
-                        ]
-                        writer.writerow(new_row)
+                use_autocast = (autocast == "yes")
+            else:
+                use_autocast = False
+            for backend in grid["backend"]:
+                try:
+                    new_log = run_benchmark(
+                        n_repeats=n_repeats,
+                        n_samples=n_samples,
+                        precision=precision,
+                        use_autocast=use_autocast,
+                        backend=backend,
+                        num_inference_steps=num_inference_steps,
+                    )
+                except Exception as e:
+                    if "CUDA out of memory" in str(
+                        e
+                    ) or "Failed to allocate memory" in str(e):
+                        print(str(e))
+                        torch.cuda.empty_cache()
+                        new_log = {"latency": -1.00, "memory": -1.00}
+                    else:
+                        raise e
+
+                latency = new_log["latency"]
+                memory = new_log["memory"]
+                new_row = [
+                    device_desc,
+                    precision,
+                    autocast,
+                    backend,
+                    n_samples,
+                    latency,
+                    memory,
+                ]
+                writer.writerow(new_row)
 
 
 if __name__ == "__main__":
From 3c6362fa9d85594f470d6875fd6796a8bcf0c606 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:51:40 -0700
Subject: [PATCH 3/5] add benchmark.py config options to .md, update req.txt for fp16 to work without autocast, fix benchmark code for cases when to use autocast

---
 docs/benchmark.md    |  13 ++++++
 requirements.txt     |   2 +
 scripts/benchmark.py | 103 ++++++++++++++++++++-----------------------
 3 files changed, 64 insertions(+), 54 deletions(-)

diff --git a/docs/benchmark.md b/docs/benchmark.md
index 269b75c..7e52d27 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -103,11 +103,22 @@ export ACCESS_TOKEN=
 
 #### Usage
 
+**Run benchmark on GPU (if available)**
 Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) results file:
 ```
 python ./scripts/benchmark.py
 ```
 
+**Run benchmark on CPU**
+
+To run the benchmark on your CPU, clear `CUDA_VISIBLE_DEVICES` before launching the script, like so:
+```
+export CUDA_VISIBLE_DEVICES=
+python ./scripts/benchmark.py
+```
+
+**Benchmark config options**
+
 The following flags can be set on the `benchmark.py` script:
 * `--samples` sets the sample sizes for which to run the benchmark, passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
 * `--steps` sets the number of inference steps, passed as an integer value, e.g. `50`. Default is `40`.
@@ -119,6 +130,8 @@ An example of running the benchmark script with options set:
 python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
 ```
 
+**Compare output of single-precision and half-precision**
+
 Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
 ```
 python ./scripts/benchmark_quality.py
diff --git a/requirements.txt b/requirements.txt
index 9089b38..bf6a343 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
+torchvision==0.13.1
+torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 1ec6d40..480c25c 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,7 +10,6 @@ from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-
 prompt = "a photo of an astronaut riding a horse on mars"
 
 
@@ -30,8 +29,8 @@ def get_inference_pipeline(precision, backend):
             revision="main" if precision == "single" else "fp16",
             use_auth_token=os.environ["ACCESS_TOKEN"],
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
-        )
-        pipe = pipe.to(device)
+        ).to(device)
+
     else:
         pipe = StableDiffusionOnnxPipeline.from_pretrained(
             "CompVis/stable-diffusion-v1-4",
@@ -42,23 +41,18 @@ def get_inference_pipeline(precision, backend):
             else "CUDAExecutionProvider",
             torch_dtype=torch.float32 if precision == "single" else torch.float16,
         )
-    # Disable safety
-    disable_safety = True
 
-    if disable_safety:
+    def null_safety(images, **kwargs):
+        return images, False
 
-        def null_safety(images, **kwargs):
-            return images, False
+    pipe.safety_checker = null_safety
 
-        pipe.safety_checker = null_safety
     return pipe
 
 
 def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     torch.cuda.empty_cache()
-    context = (
-        autocast if (device.type == "cuda" and use_autocast) else nullcontext
-    )
+    context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
     with context("cuda"):
         images = pipe(
             prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
@@ -67,9 +61,7 @@ def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
     return images
 
 
-def get_inference_time(
-    pipe, n_samples, n_repeats, use_autocast, num_inference_steps
-):
+def get_inference_time(pipe, n_samples, n_repeats, use_autocast, num_inference_steps):
     from torch.utils.benchmark import Timer
 
     timer = Timer(
@@ -121,14 +113,14 @@ def run_benchmark(
     logs = {
         "memory": 0.00
         if device.type == "cpu"
-        else get_inference_memory(
-            pipe, n_samples, use_autocast, num_inference_steps
-        ),
+        else get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps),
         "latency": get_inference_time(
             pipe, n_samples, n_repeats, use_autocast, num_inference_steps
         ),
     }
-    print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
+    print(
+        f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}"
+    )
     print(logs, "\n")
     return logs
 
@@ -181,42 +173,45 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
     device_desc = get_device_description()
     for n_samples in grid["n_samples"]:
         for precision in grid["precision"]:
-            if precision == "half":
-                use_autocast = (autocast == "yes")
+            # restrict enabling autocast to half precision
+            if precision == "single":
+                use_autocast_vals = ("no",)
             else:
-                use_autocast = False
-            for backend in grid["backend"]:
-                try:
-                    new_log = run_benchmark(
-                        n_repeats=n_repeats,
-                        n_samples=n_samples,
-                        precision=precision,
-                        use_autocast=use_autocast,
-                        backend=backend,
-                        num_inference_steps=num_inference_steps,
-                    )
-                except Exception as e:
-                    if "CUDA out of memory" in str(
-                        e
-                    ) or "Failed to allocate memory" in str(e):
-                        print(str(e))
-                        torch.cuda.empty_cache()
-                        new_log = {"latency": -1.00, "memory": -1.00}
-                    else:
-                        raise e
-
-                latency = new_log["latency"]
-                memory = new_log["memory"]
-                new_row = [
-                    device_desc,
-                    precision,
-                    autocast,
-                    backend,
-                    n_samples,
-                    latency,
-                    memory,
-                ]
-                writer.writerow(new_row)
+                use_autocast_vals = grid["autocast"]
+            for use_autocast_val in use_autocast_vals:
+                use_autocast = use_autocast_val == "yes"
+                for backend in grid["backend"]:
+                    try:
+                        new_log = run_benchmark(
+                            n_repeats=n_repeats,
+                            n_samples=n_samples,
+                            precision=precision,
+                            use_autocast=use_autocast,
+                            backend=backend,
+                            num_inference_steps=num_inference_steps,
+                        )
+                    except Exception as e:
+                        if "CUDA out of memory" in str(
+                            e
+                        ) or "Failed to allocate memory" in str(e):
+                            print(str(e))
+                            torch.cuda.empty_cache()
+                            new_log = {"latency": -1.00, "memory": -1.00}
+                        else:
+                            raise e
+
+                    latency = new_log["latency"]
+                    memory = new_log["memory"]
+                    new_row = [
+                        device_desc,
+                        precision,
+                        use_autocast,
+                        backend,
+                        n_samples,
+                        latency,
+                        memory,
+                    ]
+                    writer.writerow(new_row)
 
 
 if __name__ == "__main__":
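
The autocast handling this patch converges on is easier to see outside the diff context: autocast values are only swept for half precision, and the inference call is wrapped in `torch.autocast` only when it is both requested and running on CUDA, falling back to `contextlib.nullcontext` otherwise (the script passes `"cuda"` to whichever callable it picked, which `nullcontext` simply returns on enter). Below is a minimal, self-contained sketch of that pattern; the helper names `autocast_values` and `inference_context` are illustrative and do not come from `scripts/benchmark.py`.

```python
# Standalone sketch of the autocast selection pattern from patch 3 (helper names are illustrative).
from contextlib import nullcontext

import torch
from torch import autocast


def autocast_values(precision, grid_autocast=("no", "yes")):
    # Half precision sweeps whatever the grid asks for; single precision never autocasts.
    return grid_autocast if precision == "half" else ("no",)


def inference_context(use_autocast):
    # Wrap the forward pass in autocast only when requested and CUDA is available;
    # otherwise use a no-op context manager.
    if torch.cuda.is_available() and use_autocast:
        return autocast("cuda")
    return nullcontext()


for precision in ("single", "half"):
    for flag in autocast_values(precision):
        use_autocast = flag == "yes"
        with inference_context(use_autocast):
            pass  # the pipeline call would go here
        print(f"precision={precision} use_autocast={use_autocast}")
```
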
From a5cc87dc0b87f9d8f2983d1c74fbe5bc063c44b3 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:55:04 -0700
Subject: [PATCH 4/5] update diffusers version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index bf6a343..77585ac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
-diffusers==0.3.0
+diffusers==0.4.2
 onnxruntime==1.12.1
 scikit-image==0.19.3
 -e .

From bb72fa679281f42092576c6f8e6bd4e46bcc3d03 Mon Sep 17 00:00:00 2001
From: eolecvk
Date: Wed, 12 Oct 2022 11:55:37 -0700
Subject: [PATCH 5/5] rm torch torchvision from requirements

---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 77585ac..82b6f5f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu116
 torch
-torchvision==0.13.1
-torch==1.12.1
 transformers==4.22.1
 ftfy==6.1.1
 Pillow==9.2.0
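
Taken together, the requirements changes in patches 3–5 land on `diffusers==0.4.2` with `torch` left unpinned, the point (per the patch 3 subject) being that half-precision weights should run without an autocast wrapper. A rough sanity check along the lines below is one way to confirm that; it assumes a CUDA device, the `ACCESS_TOKEN` variable described in `docs/benchmark.md`, and access to the model weights. It is a sketch, not part of the series.

```python
# Sketch: load the fp16 weights and run one generation with no autocast context.
# Assumes CUDA is available and ACCESS_TOKEN is exported as described in docs/benchmark.md.
import os

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    revision="fp16",
    torch_dtype=torch.float16,
    use_auth_token=os.environ["ACCESS_TOKEN"],
).to("cuda")

# No torch.autocast here on purpose: the half-precision weights are used directly.
image = pipe(
    "a photo of an astronaut riding a horse on mars", num_inference_steps=40
).images[0]
image.save("fp16_no_autocast.png")
```
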