improve perf on convert_image_dtype and add tests #6795

Merged
6 commits merged on Oct 20, 2022

Conversation

@pmeier pmeier (Collaborator) commented Oct 19, 2022

The improvements come from using inplace operations where possible.
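A minimal sketch of the idea (not the actual torchvision code): chaining an in-place op onto the freshly allocated output of `Tensor.to` avoids the extra temporary that an out-of-place scale would create.

```python
import torch

def uint8_to_float32(image: torch.Tensor) -> torch.Tensor:
    # `Tensor.to` has to allocate a new tensor anyway, but the scaling can
    # then happen in place on that tensor. The out-of-place spelling
    # `image.to(torch.float32) / 255` would allocate a second temporary.
    return image.to(torch.float32).div_(255)

image = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
out = uint8_to_float32(image)
```

The same trick applies to the other branches wherever an op follows an allocation that the conversion had to make anyway.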

benchmark script

import itertools
import pathlib
import pickle

import torch
from torch.utils import benchmark
import functools

from torchvision.prototype.transforms import functional as F

description = "PR"  # "main", "PR"


def make_inputs(*, input_dtype, output_dtype, device, shape=(3, 512, 512)):
    if input_dtype.is_floating_point:
        image = torch.rand(shape, dtype=input_dtype, device=device)
    else:
        image = torch.randint(0, torch.iinfo(input_dtype).max + 1, shape, dtype=input_dtype, device=device)
    return image, output_dtype


sub_labels_and_input_fns = [
    ("float to float", functools.partial(make_inputs, input_dtype=torch.float32, output_dtype=torch.float64)),
    ("float to   int", functools.partial(make_inputs, input_dtype=torch.float32, output_dtype=torch.uint8)),
    ("  int to float", functools.partial(make_inputs, input_dtype=torch.uint8, output_dtype=torch.float32)),
    ("  int to   int (down)", functools.partial(make_inputs, input_dtype=torch.int32, output_dtype=torch.uint8)),
    ("  int to   int (up)", functools.partial(make_inputs, input_dtype=torch.uint8, output_dtype=torch.int32)),
]


timers = [
    benchmark.Timer(
        stmt="convert_image_dtype(*inputs)",
        globals=dict(
            convert_image_dtype=F.convert_image_dtype,
            inputs=inputs_fn(device=device),
        ),
        label="convert_image_dtype perf improvements",
        sub_label=f"{device:4} / {sub_label}",
        description=description,
        num_threads=num_threads,
    )
    for (sub_label, inputs_fn), device in itertools.product(sub_labels_and_input_fns, ["cpu", "cuda"])
    for num_threads in ([1, 2, 4] if device == "cpu" else [1])
]

measurements = [timer.blocked_autorange(min_run_time=5) for timer in timers]


with open(f"{description}.measurements", "wb") as fh:
    pickle.dump(measurements, fh)

measurements = []
for file in pathlib.Path(".").glob("*.measurements"):
    with open(file, "rb") as fh:
        measurements.extend(pickle.load(fh))

comparison = benchmark.Compare(measurements)
comparison.trim_significant_figures()
comparison.print()

[----- convert_image_dtype perf improvements ------]
                                    |  main  |   PR 
1 threads: -----------------------------------------
      cpu  / float to float         |    88  |    82
      cuda / float to float         |    42  |    42
      cpu  / float to   int         |   380  |   360
      cuda / float to   int         |    46  |    50
      cpu  /   int to float         |   136  |   130
      cuda /   int to float         |    47  |    47
      cpu  /   int to   int (down)  |  1050  |  1070
      cuda /   int to   int (down)  |    44  |    44
      cpu  /   int to   int (up)    |   120  |    88
      cuda /   int to   int (up)    |    46  |    46
2 threads: -----------------------------------------
      cpu  / float to float         |    53  |    46
      cpu  / float to   int         |   210  |   199
      cpu  /   int to float         |    82  |    76
      cpu  /   int to   int (down)  |   560  |   546
      cpu  /   int to   int (up)    |    74  |    55
4 threads: -----------------------------------------
      cpu  / float to float         |    31  |    27
      cpu  / float to   int         |   115  |   108
      cpu  /   int to float         |    51  |    45
      cpu  /   int to   int (down)  |   293  |   286
      cpu  /   int to   int (up)    |    47  |    35

Times are in microseconds (us).

The branches that are improved are

  • float to float
  • float to int
  • int to int (up)

Of these, float to int is the most interesting for us, since we regularly convert between torch.uint8 and torch.float32 before we normalize. With this patch, we get the following diff when profiling with @vfdev-5's benchmark scripts:

-      2000    0.047    0.000    0.095    0.000 /home/philip/git/pytorch/torchvision/torchvision/transforms/functional_tensor.py:68(convert_image_dtype)
+      2000    0.005    0.000    0.072    0.000 /home/philip/git/pytorch/torchvision/torchvision/prototype/transforms/functional/_type_conversion.py:46(convert_image_dtype)

cc @vfdev-5 @datumbox @bjuncek

Comment on lines +317 to +323
@pytest.mark.parametrize(
    ("info", "args_kwargs"),
    make_info_args_kwargs_params(
        next(info for info in KERNEL_INFOS if info.kernel is F.convert_image_dtype),
        args_kwargs_fn=lambda info: info.sample_inputs_fn(),
    ),
)
Collaborator Author

This is a rather convoluted way to get the sample inputs for a single kernel. I'll refactor later, since this is low priority right now.

@pmeier pmeier left a comment (Collaborator Author)

Another round of benchmarks after the new commits. The benchmark now drops the CUDA runs and tests only a single thread; on the flip side, each measurement now runs longer to reduce noise.

benchmark script

import pathlib
import pickle

import torch
from torch.utils import benchmark
import functools

from torchvision.prototype.transforms import functional as F

description = "PR"  # "main", "PR"


def make_inputs(*, input_dtype, output_dtype, shape=(3, 512, 512)):
    if input_dtype.is_floating_point:
        image = torch.rand(shape, dtype=input_dtype)
    else:
        image = torch.randint(0, torch.iinfo(input_dtype).max + 1, shape, dtype=input_dtype)
    return image, output_dtype


sub_labels_and_input_fns = [
    ("float to float", functools.partial(make_inputs, input_dtype=torch.float32, output_dtype=torch.float64)),
    ("float to   int", functools.partial(make_inputs, input_dtype=torch.float32, output_dtype=torch.uint8)),
    ("  int to float", functools.partial(make_inputs, input_dtype=torch.uint8, output_dtype=torch.float32)),
    ("  int to   int (down)", functools.partial(make_inputs, input_dtype=torch.int32, output_dtype=torch.uint8)),
    ("  int to   int (up)", functools.partial(make_inputs, input_dtype=torch.uint8, output_dtype=torch.int32)),
]


timers = [
    benchmark.Timer(
        stmt="convert_image_dtype(*inputs)",
        globals=dict(
            convert_image_dtype=F.convert_image_dtype,
            inputs=inputs_fn(),
        ),
        label="convert_image_dtype perf improvements",
        sub_label=sub_label,
        description=description,
        num_threads=1,
    )
    for sub_label, inputs_fn in sub_labels_and_input_fns
]

measurements = [timer.blocked_autorange(min_run_time=15) for timer in timers]


with open(f"{description}.measurements", "wb") as fh:
    pickle.dump(measurements, fh)

measurements = []
for file in pathlib.Path(".").glob("*.measurements"):
    with open(file, "rb") as fh:
        measurements.extend(pickle.load(fh))

comparison = benchmark.Compare(measurements)
comparison.trim_significant_figures()
comparison.print()

[- convert_image_dtype perf improvements --]
                             |  main  |   PR
1 threads: ---------------------------------
      float to float         |    90  |   83
      float to   int         |   380  |  380
        int to float         |   138  |  134
        int to   int (down)  |  1100  |  402
        int to   int (up)    |   127  |   92

Times are in microseconds (us).
  • "float to float", "int to float", "int to int (up)" did not change from the last benchmarks and are still faster
  • "int to int (down)" now uses bit shifts and is 3x faster
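The int-to-int downscale amounts to a right shift by the difference in value bits. A plain-Python sketch of the idea (the helper names here are illustrative, not the torchvision internals):

```python
def num_value_bits(dtype_bits: int, signed: bool) -> int:
    # Signed integer dtypes lose one bit to the sign.
    return dtype_bits - 1 if signed else dtype_bits

def downscale(value: int, input_bits: int, output_bits: int) -> int:
    # Dividing by 2**(input_bits - output_bits) equals shifting right by
    # that amount for non-negative values, but the shift is much cheaper
    # than an integer division.
    return value >> (input_bits - output_bits)

# int32 (31 value bits) -> uint8 (8 value bits): shift right by 23
print(downscale(2**31 - 1, num_value_bits(32, True), num_value_bits(8, False)))  # 255
```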

if output_dtype.is_floating_point:
    return value
else:
    return int(decimal.Decimal(value) * torch.iinfo(output_dtype).max)
Collaborator Author

This gives us arbitrary floating point precision for the intermediate calculations, which is what we want for the reference function. You can see from the xfails I needed to add below that we need this in some cases.
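A small illustration of why exact decimal arithmetic matters here (values chosen for illustration): float64 cannot represent the int64 maximum 2**63 - 1 exactly, so plain float math rounds it up and overflows the output range, while `decimal.Decimal` keeps every digit.

```python
import decimal

max_int64 = 2**63 - 1  # what torch.iinfo(torch.int64).max would give

# Plain float arithmetic: 2**63 - 1 rounds up to 2**63.0, so the result
# lands one past the representable int64 range.
naive = int(1.0 * max_int64)

# Exact decimal arithmetic keeps the value bit for bit.
exact = int(decimal.Decimal(1.0) * max_int64)

print(naive)  # 9223372036854775808 (out of int64 range)
print(exact)  # 9223372036854775807
```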

Comment on lines +2051 to +2062
condition=lambda args_kwargs: (
    args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16}
    and not args_kwargs.kwargs["dtype"].is_floating_point
)
or (
    args_kwargs.args[0].dtype in {torch.float16, torch.bfloat16}
    and args_kwargs.kwargs["dtype"] == torch.int64
)
or (
    args_kwargs.args[0].dtype in {torch.int32, torch.int64}
    and args_kwargs.kwargs["dtype"] == torch.float16
),
Collaborator Author

I'm going to open an issue soon detailing what is happening in these cases and how we could mitigate it.

Comment on lines +110 to +114
# The bitshift kernel is not vectorized
# https://github.com/pytorch/pytorch/blob/703c19008df4700b6a522b0ae5c4b6d5ffc0906f/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp#L315-L322
# This results in the multiplication actually being faster.
# TODO: If the bitshift kernel is optimized in core, replace the computation below with
# `image.to(dtype).bitwise_left_shift_(num_value_bits_output - num_value_bits_input)`
Collaborator Author

Per comment. The same applies to the bitwise_right_shift kernel in the branch above, but that is still much faster than the division we had before.
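For non-negative integers and a power-of-two factor, floor division and a right shift agree bit for bit, which is why the shift-based kernel can produce identical results to the old division. A plain-Python sketch (the 23-bit shift corresponds to going from 31 value bits down to 8):

```python
factor = 2**23  # int32 has 31 value bits, uint8 has 8: 31 - 8 = 23

for value in (0, 1, factor - 1, factor, 2**31 - 1):
    # For non-negative integers, `value // 2**n` and `value >> n` are
    # mathematically identical; the shift just skips the division kernel.
    assert value // factor == value >> 23

print("floor division and right shift agree")
```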

@datumbox datumbox left a comment (Contributor)

LGTM, just one question:

Edit: Lol, Github lost my comment. I was asking if we are confident that the use of bitwise_right_shift produces identical results to the previous implementation.


pmeier commented Oct 20, 2022

I was asking if we are confident that the use of bitwise_right_shift produces identical results to the previous implementation.

Yes, I am. I've added reference tests just to make sure I'm not introducing any changes here. If you look at them, you'll see that they actually use the old idiom of multiplying or dividing by the factors.

Comment on lines +1991 to +1996
if input_max_value > output_max_value:
    factor = (input_max_value + 1) // (output_max_value + 1)
    return value // factor
else:
    factor = (output_max_value + 1) // (input_max_value + 1)
    return value * factor
Collaborator Author

Pointer for my comment above.

@pmeier pmeier merged commit 211563f into pytorch:main Oct 20, 2022
@pmeier pmeier deleted the convert-image-dtype branch October 20, 2022 12:14
facebook-github-bot pushed a commit that referenced this pull request Oct 21, 2022
Summary:
* improve perf on convert_image_dtype and add tests

* add reference tests

* use bitshifts for int to int

* revert bitshifts for int to int upscale

* fix warning ignore

Reviewed By: YosuaMichael

Differential Revision: D40588162

fbshipit-source-id: 4f1c564f94f75ff37979c123a416b043b4c9ec14