[WIP] Playing around with grid persistence. #1779

Merged: 31 commits, Jul 11, 2022

Commits (31)
a054b3e
Refactor TransformPropagator to allow specifying a position and propag…
zasdfgbnm Jun 26, 2022
1d3fd15
Revert scheduling changes. Cleanup only.
csarofeen Jun 26, 2022
44b3183
Start drafting grid persistent kernels.
csarofeen Jun 27, 2022
ecc7a87
Extend mma dimension and layout checking to support strided batched m…
shmsong Jun 27, 2022
d3de227
Fix FusionMaxRootDomainInfoSpanningTreePrintTwice_CUDA (#1781)
zasdfgbnm Jun 27, 2022
86f46aa
Fix div(Val, TensorView) (#1778)
IvanYashchuk Jun 28, 2022
33a824d
Adding sibling path for MaxInfoSpanningTree (#1776)
zasdfgbnm Jun 28, 2022
15488be
Save.
csarofeen Jun 28, 2022
0c82ecf
Disable register reuse across serial broadcast ops (#1787)
shmsong Jun 30, 2022
ebf23a5
Fix isIntegralType error msg (#1789)
Sergei-Lebedev Jun 30, 2022
fe93bf5
Transform propagator skip replay when possible (#1782)
zasdfgbnm Jun 30, 2022
59f3c32
Output allocate patch (#1790)
jjsjann123 Jun 30, 2022
635ebfc
Add SpanningTreePrinter (#1786)
zasdfgbnm Jul 1, 2022
28cbaf9
New compute at interface (#1743)
zasdfgbnm Jul 1, 2022
45f5203
Fix TransformReplay::getMatchedLeafPosWithoutReplay* (#1791)
zasdfgbnm Jul 1, 2022
d0d0908
Some further cleanup for the new computeAt interface (#1793)
zasdfgbnm Jul 1, 2022
c077085
Use TransformPropagatorWithCheck in many tests (#1795)
zasdfgbnm Jul 1, 2022
3f2c263
validateDomain in TransformPropagator (#1796)
zasdfgbnm Jul 1, 2022
38c7f3c
InlinePropagator please don't replay (#1797)
zasdfgbnm Jul 1, 2022
ef04f6c
Coding style cleanups (#1798)
zasdfgbnm Jul 1, 2022
76b3cca
Add parsing support for `_to_copy` to handle AMP casts. (#1756)
kevinstephano Jul 2, 2022
f008140
MMA Rfactor support for cross-warp and cross-CTA split on K dimension…
shmsong Jul 2, 2022
8d384da
Indexing refactor stage 2 : Remove reference tensor in predicate inde…
shmsong Jul 2, 2022
5f375d0
More cleanup on InlinePropagator (#1800)
zasdfgbnm Jul 5, 2022
37c579e
Temporarily disable test requiring large shared memory. (#1802)
shmsong Jul 5, 2022
025c840
Grouping grid allreduces across iterations (#1755)
naoyam Jul 5, 2022
fa4e6a4
Check siblings in getMaxPosAll (#1805)
zasdfgbnm Jul 6, 2022
fd4be12
remove dead indexing code (#1806)
shmsong Jul 6, 2022
3ba6a5f
Broadcast in dim with expand (#1794)
jjsjann123 Jul 6, 2022
282c429
spam nvrtc options (#1783)
jjsjann123 Jul 7, 2022
e594590
Merge branch 'devel' of https://www.github.com/csarofeen/pytorch into…
csarofeen Jul 7, 2022
20 changes: 10 additions & 10 deletions benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp
@@ -298,16 +298,16 @@ NVFUSER_BENCHMARK_DEFINE(
     DataType::Half);

 NVFUSER_BENCHMARK_RUN(NvFuserScheduler_ResNext_BatchNorm_nhwc_fp16)
-    ->Args({128, 64, 112})
-    ->Args({128, 128, 56})
-    ->Args({128, 256, 56})
-    ->Args({128, 128, 56})
-    ->Args({128, 256, 28})
-    ->Args({128, 512, 28})
-    ->Args({128, 512, 14})
-    ->Args({128, 1024, 14})
-    ->Args({128, 1024, 7})
-    ->Args({128, 2048, 7})
+    // ->Args({128, 64, 112})
+    // ->Args({128, 128, 56})
+    // ->Args({128, 256, 56})
+    // ->Args({128, 128, 56})
+    // ->Args({128, 256, 28})
+    // ->Args({128, 512, 28})
+    // ->Args({128, 512, 14})
+    // ->Args({128, 1024, 14})
+    // ->Args({128, 1024, 7})
+    ->Args({8, 4096, 64})
     ->Unit(benchmark::kMicrosecond)
     ->UseManualTime();

Review comment from the PR author (repo owner) on the added `->Args({8, 4096, 64})` line: TODO: Remove before considering to merge.

2 changes: 1 addition & 1 deletion build_variables.bzl
@@ -642,6 +642,7 @@ libtorch_cuda_core_sources = [
     "torch/csrc/autograd/functions/comm.cpp",
     "torch/csrc/jit/codegen/cuda/arith.cpp",
     "torch/csrc/jit/codegen/cuda/compute_at.cpp",
+    "torch/csrc/jit/codegen/cuda/inline_propagator.cpp",
     "torch/csrc/jit/codegen/cuda/compute_at_map.cpp",
     "torch/csrc/jit/codegen/cuda/codegen.cpp",
     "torch/csrc/jit/codegen/cuda/contiguity.cpp",
@@ -657,7 +658,6 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/codegen/cuda/grouped_reduction.cpp",
     "torch/csrc/jit/codegen/cuda/index_compute.cpp",
     "torch/csrc/jit/codegen/cuda/lower_index_compute.cpp",
-    "torch/csrc/jit/codegen/cuda/index_reference_replay.cpp",
     "torch/csrc/jit/codegen/cuda/instrumentation.cpp",
     "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp",
     "torch/csrc/jit/codegen/cuda/ir_builder.cpp",
56 changes: 56 additions & 0 deletions test/test_jit_cuda_fuser.py
@@ -4390,6 +4390,33 @@ def t(x):
         t_jit = torch.jit.script(t)
         self._run_helper(t_jit, t, x)

+
+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_to_copy(self):
+        x = torch.randn(4, 2, device="cuda")
+
+        with nvfuser_singleton_fusion(True):
+            def t(x, dtype : torch.dtype):
+                o = torch.ops.aten._to_copy(x, dtype=dtype)
+                return o
+
+            t.__disable_jit_function_caching__ = True
+
+            t_jit = torch.jit.script(t)
+            for dtype in [torch.float16, torch.bool, torch.float64]:
+                self._run_helper(t_jit, t, x, dtype)
+
+            def t_none(x):
+                with torch.jit.strict_fusion():
+                    o = torch.ops.aten._to_copy(x, dtype=None)
+                return o
+
+            t_jit_none = torch.jit.script(t_none)
+            self._run_helper(t_jit_none, t_none, x)
+
+
     @unittest.skipIf(ALIAS_TEST_DISABLED, "skipping this test since reshape is disabled now")
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
@@ -4751,6 +4778,35 @@ def t(x):
         jit_t = torch.jit.script(t)
         self._run_helper(jit_t, t, x)

+    @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
+                     "Requires fusion optimization pass to be effective")
+    def test_issue_1785(self):
+        class Fusion(torch.nn.Module):
+            def __init__(self):
+                super(Fusion, self).__init__()
+
+            def forward(self, x, a, b):
+                out = torch.mul(x.unsqueeze(-1), a)
+                out = out + b
+                return out
+
+        x = torch.randn(1024, 192, 3, device='cuda')
+        a = torch.randn(3, 128, device='cuda')
+        b = torch.randn(3, 128, device='cuda')
+
+        model = Fusion()
+        jit_model = torch.jit.script(model)
+
+        with torch.jit.fuser('fuser2'):
+            for _ in range(4):
+                out_ref = model(x, a, b)
+                out_jit = jit_model(x, a, b)
+
+            out_ref = model(x, a, b)
+            out_jit = jit_model(x, a, b)
+            self.assertTrue(self._compare("comparing output failed", out_ref, out_jit, 1e-5))
+
     @unittest.skipIf(not RUN_NVFUSER, "requires CUDA")
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING,
                      "Requires fusion optimization pass to be effective")
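
For context on the new tests (this note is not part of the diff): test_to_copy scripts a function that calls torch.ops.aten._to_copy directly and runs it for several target dtypes, while test_issue_1785 checks that the fused kernel reproduces eager-mode broadcasting of an unsqueezed operand against 2-D tensors. A minimal eager-only sketch of the shape math test_issue_1785 relies on, using CPU tensors outside the test harness, would look roughly like this:

```python
import torch

# Eager-mode illustration of the broadcast pattern exercised by test_issue_1785.
# CPU tensors are used here for simplicity; the actual test runs the same math
# through nvFuser on CUDA and compares the fused output against eager mode.
x = torch.randn(1024, 192, 3)
a = torch.randn(3, 128)
b = torch.randn(3, 128)

out = torch.mul(x.unsqueeze(-1), a)  # (1024, 192, 3, 1) broadcast against (3, 128)
out = out + b                        # shape stays (1024, 192, 3, 128)
print(out.shape)                     # torch.Size([1024, 192, 3, 128])
```
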
2 changes: 1 addition & 1 deletion torch/csrc/jit/codegen/cuda/arith.cpp
@@ -733,7 +733,7 @@ TensorView* binaryOp(
 } \
 TensorView* op_name(Val* v1, TensorView* v2) { \
   return binaryOp( \
-      BinaryOpType::op_type, v2, v2, TypePromotion::float_op_config); \
+      BinaryOpType::op_type, v1, v2, TypePromotion::float_op_config); \
 } \
 TensorView* op_name(TensorView* v1, TensorView* v2) { \
   return binaryOp( \
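
The one-character arith.cpp change above fixes the macro-generated op_name(Val*, TensorView*) overload, which previously forwarded v2 for both operands and therefore ignored the scalar left-hand side; this appears to be the "Fix div(Val, TensorView) (#1778)" commit from the list above. A quick eager-mode sketch of the semantics the scalar-on-the-left overload must match (plain PyTorch, no nvFuser involved):

```python
import torch

# Reference semantics for a scalar-on-the-left binary op: 2.0 / x divides the
# scalar by each element. With the pre-fix overload forwarding v2 twice,
# div(Val, TensorView) effectively computed x / x instead, i.e. all ones.
x = torch.tensor([1.0, 2.0, 4.0])
print(2.0 / x)  # tensor([2.0000, 1.0000, 0.5000])  <- expected result
print(x / x)    # tensor([1., 1., 1.])              <- what the bug produced
```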