Fix autocast v fp16 + add benchmark config instructions in benchmark.md #11

Status: Draft. Wants to merge 5 commits into base `main`.
26 changes: 25 additions & 1 deletion docs/benchmark.md
@@ -103,12 +103,36 @@ export ACCESS_TOKEN=<hf_...>

#### Usage

**Run benchmark on GPU (if available)**
Launch the `benchmark.py` script to append benchmark results to the existing [benchmark.csv](../benchmark.csv) file:
```
python ./scripts/benchmark.py
```
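
Each run appends one row per configuration (device description, precision, autocast, backend, sample count, latency, memory), following the `writer.writerow` call in `scripts/benchmark.py`. As an optional quick check, the accumulated results can be inspected like this (a sketch, assuming the commands are run from the repository root where `benchmark.csv` lives):
```
import csv

# Print every accumulated benchmark row; the exact column layout follows
# the writer.writerow call in scripts/benchmark.py.
with open("benchmark.csv", newline="") as f:
    for row in csv.reader(f):
        print(row)
```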

**Run benchmark on CPU**

To run the benchmark on your CPU, clear the `CUDA_VISIBLE_DEVICES` environment variable before launching the script:
```
export CUDA_VISIBLE_DEVICES=
python ./scripts/benchmark.py
```

**Benchmark config options**

The following flags can be set on the `benchmark.py` script:
* `--samples` sets the sample sizes to benchmark and is passed as a comma-separated list of values such as `1,2,4,8,16`. Default is `1`.
* `--steps` sets the number of inference steps and is passed as an integer value, e.g. `50`. Default is `40`.
* `--repeats` sets the number of times to repeat each run with a given set of parameter values before reporting the average inference latency. It is passed as an integer value, e.g. `2`. Default is `3`.
* `--autocast` sets whether to include CUDA autocast runs in the benchmark (`yes` or `no`). Default is `no`.

An example of running the benchmark script with options set:
```
python ./scripts/benchmark.py --samples=1,2,4 --steps=50 --repeats=3 --autocast=no
```
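
The argument parsing itself is not part of this diff; purely as an illustration, a minimal sketch of how flags with these names and defaults could be parsed and expanded into a benchmark grid might look like the following (the parser and the grid values here are assumptions, not the actual code in `scripts/benchmark.py`):
```
import argparse


def parse_args():
    # Flag names and defaults mirror the documentation above; the real
    # parser in scripts/benchmark.py may differ.
    parser = argparse.ArgumentParser(description="Stable Diffusion benchmark (illustrative parser)")
    parser.add_argument("--samples", type=str, default="1",
                        help="comma-separated sample sizes, e.g. 1,2,4,8,16")
    parser.add_argument("--steps", type=int, default=40,
                        help="number of inference steps per run")
    parser.add_argument("--repeats", type=int, default=3,
                        help="repeats per configuration before averaging latency")
    parser.add_argument("--autocast", type=str, default="no", choices=("yes", "no"),
                        help="whether to include CUDA autocast runs")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # Expand the flags into a parameter grid; the keys match those used by
    # run_benchmark_grid in the diff below, the values are example assumptions.
    grid = {
        "n_samples": tuple(int(s) for s in args.samples.split(",")),
        "precision": ("single", "half"),
        "autocast": ("yes", "no") if args.autocast == "yes" else ("no",),
        "backend": ("pytorch", "onnx"),
    }
    print(grid, args.steps, args.repeats)
```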

**Compare output of single-precision and half-precision**

Launch the `benchmark_quality.py` script to compare the output of single-precision and half-precision models:
```
python ./scripts/benchmark_quality.py
```
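
The comparison logic itself is outside this diff; as a rough sketch of one way such a check could work (not necessarily what `benchmark_quality.py` actually does), a structural-similarity score between a single-precision and a half-precision output can be computed with scikit-image, which is already pinned in `requirements.txt`:
```
import numpy as np
from skimage.metrics import structural_similarity


def ssim_between_outputs(img_fp32, img_fp16):
    """Return the SSIM between two generated images (e.g. PIL images).

    Illustrative only: benchmark_quality.py may use different metrics or
    preprocessing.
    """
    a = np.asarray(img_fp32, dtype=np.float32) / 255.0
    b = np.asarray(img_fp16, dtype=np.float32) / 255.0
    # channel_axis=-1 treats the trailing RGB axis as channels (scikit-image >= 0.19).
    return structural_similarity(a, b, channel_axis=-1, data_range=1.0)
```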
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@
transformers==4.22.1
ftfy==6.1.1
Pillow==9.2.0
diffusers==0.3.0
diffusers==0.4.2
onnxruntime==1.12.1
scikit-image==0.19.3
-e .
106 changes: 50 additions & 56 deletions scripts/benchmark.py
Expand Up @@ -10,7 +10,6 @@
from diffusers import StableDiffusionPipeline, StableDiffusionOnnxPipeline

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

prompt = "a photo of an astronaut riding a horse on mars"


@@ -30,8 +29,8 @@ def get_inference_pipeline(precision, backend):
revision="main" if precision == "single" else "fp16",
use_auth_token=os.environ["ACCESS_TOKEN"],
torch_dtype=torch.float32 if precision == "single" else torch.float16,
)
pipe = pipe.to(device)
).to(device)

else:
pipe = StableDiffusionOnnxPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
@@ -42,23 +41,18 @@ def get_inference_pipeline(precision, backend):
else "CUDAExecutionProvider",
torch_dtype=torch.float32 if precision == "single" else torch.float16,
)

# Disable safety
disable_safety = True
if disable_safety:
def null_safety(images, **kwargs):
return images, False

def null_safety(images, **kwargs):
return images, False
pipe.safety_checker = null_safety

pipe.safety_checker = null_safety
return pipe


def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
torch.cuda.empty_cache()
context = (
autocast if (device.type == "cuda" and use_autocast) else nullcontext
)
context = autocast if (device.type == "cuda" and use_autocast) else nullcontext
with context("cuda"):
images = pipe(
prompt=[prompt] * n_samples, num_inference_steps=num_inference_steps
@@ -67,9 +61,7 @@ def do_inference(pipe, n_samples, use_autocast, num_inference_steps):
return images


def get_inference_time(
pipe, n_samples, n_repeats, use_autocast, num_inference_steps
):
def get_inference_time(pipe, n_samples, n_repeats, use_autocast, num_inference_steps):
from torch.utils.benchmark import Timer

timer = Timer(
@@ -121,14 +113,14 @@ def run_benchmark(
logs = {
"memory": 0.00
if device.type == "cpu"
else get_inference_memory(
pipe, n_samples, use_autocast, num_inference_steps
),
else get_inference_memory(pipe, n_samples, use_autocast, num_inference_steps),
"latency": get_inference_time(
pipe, n_samples, n_repeats, use_autocast, num_inference_steps
),
}
print(f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}")
print(
f"n_samples: {n_samples}\tprecision: {precision}\tautocast: {use_autocast}\tbackend: {backend}"
)
print(logs, "\n")
return logs

@@ -181,43 +173,45 @@ def run_benchmark_grid(grid, n_repeats, num_inference_steps):
device_desc = get_device_description()
for n_samples in grid["n_samples"]:
for precision in grid["precision"]:
use_autocast = False
if precision == "half":
for autocast in grid["autocast"]:
if autocast == "yes":
use_autocast = True
for backend in grid["backend"]:
try:
new_log = run_benchmark(
n_repeats=n_repeats,
n_samples=n_samples,
precision=precision,
use_autocast=use_autocast,
backend=backend,
num_inference_steps=num_inference_steps,
)
except Exception as e:
if "CUDA out of memory" in str(
e
) or "Failed to allocate memory" in str(e):
print(str(e))
torch.cuda.empty_cache()
new_log = {"latency": -1.00, "memory": -1.00}
else:
raise e

latency = new_log["latency"]
memory = new_log["memory"]
new_row = [
device_desc,
precision,
autocast,
backend,
n_samples,
latency,
memory,
]
writer.writerow(new_row)
# restrict enabling autocast to half precision
if precision == "single":
use_autocast_vals = ("no",)
else:
use_autocast_vals = grid["autocast"]
for use_autocast_val in use_autocast_vals:
use_autocast = use_autocast_val == "yes"
for backend in grid["backend"]:
try:
new_log = run_benchmark(
n_repeats=n_repeats,
n_samples=n_samples,
precision=precision,
use_autocast=use_autocast,
backend=backend,
num_inference_steps=num_inference_steps,
)
except Exception as e:
if "CUDA out of memory" in str(
e
) or "Failed to allocate memory" in str(e):
print(str(e))
torch.cuda.empty_cache()
new_log = {"latency": -1.00, "memory": -1.00}
else:
raise e

latency = new_log["latency"]
memory = new_log["memory"]
new_row = [
device_desc,
precision,
use_autocast,
backend,
n_samples,
latency,
memory,
]
writer.writerow(new_row)


if __name__ == "__main__":