Fix CUDA driver error: misaligned address for transpose scheduler #1918

zasdfgbnm · 2022-08-17T19:24:58Z

This is the same issue as #1880, but this time it is triggered by vectorization. Applying the same workaround by @shmsong fixes it.

Fusion IR math for the test `FusionScheduleTransposeMissingDim_CUDA` after the fix:

Inputs:
  T0_g[ iS99{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iS100{1}, iS185{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, iS186{128}, iS184{4} ], float
  T1_g[ iS67{( ceilDiv(( 1 * ( ( ceilDiv(i5, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iS68{1}, iS112{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, iS113{128}, iS111{4} ], float
  T2_g[ iS107{( ceilDiv(( ceilDiv(i8, 32) ), 1) )}, iS108{1}, iS150{( ceilDiv(( ceilDiv(32, 4) ), 128) )}, iS151{128}, iS149{4} ], float
Outputs:
  T6_g[ iblockIdx.x35{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS36{1}, iUR155{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x156{128}, iV154{4} ] ca_pos( 2 ) produce_pos( 2), float

%kernel_math {
T7_l[ iblockIdx.x91{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS92{1}, iUR180{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x181{128}, iV179{4} ] ca_pos( 2 )
   = T0_g[ iS99{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iS100{1}, iS185{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, iS186{128}, iS184{4} ];
T9_s[ iblockIdx.x103{( ceilDiv(( ceilDiv(i8, 32) ), 1) )}, iUS104{1}, iS146{( ceilDiv(( ceilDiv(32, 4) ), 128) )}, ithreadIdx.x147{128}, iS145{4} ] ca_pos( 5 )
   = T2_g[ iS107{( ceilDiv(( ceilDiv(i8, 32) ), 1) )}, iS108{1}, iS150{( ceilDiv(( ceilDiv(32, 4) ), 128) )}, iS151{128}, iS149{4} ];
T3_l[ iblockIdx.x83{( ceilDiv(( 1 * ( ( ceilDiv(i8, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iUS84{1}, iS175{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x176{128}, iS174{4} ] ca_pos( 5 ) produce_pos( 5)
   = broadcast( T9_s[ iblockIdx.x103{( ceilDiv(( ceilDiv(i8, 32) ), 1) )}, iUS104{1}, iS146{( ceilDiv(( ceilDiv(32, 4) ), 128) )}, ithreadIdx.x147{128}, iS145{4} ] ca_pos( 5 ) )
T4_l[ iblockIdx.x75{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS76{1}, iS170{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x171{128}, iS169{4} ] ca_pos( 5 ) produce_pos( 5)
   = T7_l[ iblockIdx.x91{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS92{1}, iUR180{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x181{128}, iV179{4} ] ca_pos( 2 )
   - T3_l[ iblockIdx.x83{( ceilDiv(( 1 * ( ( ceilDiv(i8, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iUS84{1}, iS175{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x176{128}, iS174{4} ] ca_pos( 5 ) produce_pos( 5);
T8_s[ iblockIdx.x59{( ceilDiv(( 1 * ( ( ceilDiv(i5, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iUS60{1}, iUR117{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x118{128}, iV116{4} ] ca_pos( 2 )
   = T1_g[ iS67{( ceilDiv(( 1 * ( ( ceilDiv(i5, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iS68{1}, iS112{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, iS113{128}, iS111{4} ];
T5_l[ iblockIdx.x51{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS52{1}, iS165{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x166{128}, iS164{4} ] ca_pos( 5 ) produce_pos( 5)
   = T4_l[ iblockIdx.x75{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS76{1}, iS170{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x171{128}, iS169{4} ] ca_pos( 5 ) produce_pos( 5)
   * T8_s[ iblockIdx.x59{( ceilDiv(( 1 * ( ( ceilDiv(i5, 32) ) * ( ceilDiv(1, 32) ) ) ), 1) )}, iUS60{1}, iUR117{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x118{128}, iV116{4} ] ca_pos( 2 );
T10_l[ iblockIdx.x43{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS44{1}, iS160{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x161{128}, iS159{4} ] ca_pos( 2 ) produce_pos( 5)
   = relu(T5_l[ iblockIdx.x51{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS52{1}, iS165{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x166{128}, iS164{4} ] ca_pos( 5 ) produce_pos( 5));
T6_g[ iblockIdx.x35{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS36{1}, iUR155{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x156{128}, iV154{4} ] ca_pos( 2 ) produce_pos( 2)
   = T10_l[ iblockIdx.x43{( ceilDiv(( i1 * ( ( ceilDiv(i2, 32) ) * ( ceilDiv(i3, 32) ) ) ), 1) )}, iUS44{1}, iS160{( ceilDiv(( ceilDiv(( 32 * 32 ), 4) ), 128) )}, ithreadIdx.x161{128}, iS159{4} ] ca_pos( 2 ) produce_pos( 5);
}

zasdfgbnm · 2022-08-17T19:30:15Z

cc: @csarofeen

csarofeen

LGTM, I really look forward to diving into this issue more.

Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/

Codegen changes include:

- codegen improvement:
  i. improved view support on pointwise and transpose scheduler
  ii. grouped grid welford added for better outer-norm grid persistence in normalization
- misc:
  i. new composite ops added: variance_mean, arange
  ii. fixes misaligned address for transpose scheduler
  iii. refactor on separation of compilation API from execution API to prepare us for async compilation
  iv. double type support on expression evaluator
  v. PYTORCH_NVFUSER_DUMP refactor to save PTX and CUBIN

Commits that are in this PR from the devel branch:

```
89330aa Tensor factories must set the output shape as its input (#1939)
b2fd01e arange support (#1933)
56c00fd Double support on all expression evaluators (#1937)
371f282 Improve trivial reduction merge support (#1931)
1d0c267 Test `rand` in a fusion with zero tensor input (#1932)
0dab160 Fix softmax bwd sizes. (#1890)
ef98f36 Fix a bug (#1936)
63132a0 Propagate permissive mapping information into indexing pass (#1929)
b4ac2c8 Map IterationDomains through view operations. (#1919)
c0a187a do not use deprecated functions (#1935)
88de85e Upstream cherry pick fixes 0811 (#1934)
b247dcf Separate kernel compilation API from kernel execution API (#1914)
b34e3b9 Fix `ir_utils::hasBlockSync` + misc fixes in transpose scheduler (#1924)
14a53e6 Nullary RNGOp (#1892)
3c3c89e Misc fixes/tuning for transpose scheduler (#1912)
20cf109 Grouped grid welford (#1921)
6cf7eb0 Transpose scheduler small dim sizes better support (#1910)
9341ea9 Disabled ViewPersistentShmoo sizes that results in NAN (#1922)
057237f Fix CUDA driver error: misaligned address for transpose scheduler (#1918)
3fb3d80 Add variance_mean function using Welford (#1907)
98febf6 Remove DisableOption::UnrollWithRng (#1913)
ee8ef33 Minor fix for the debug interface of using PTX directly (#1917)
6e8f953 Add PYTORCH_NVFUSER_DUMP options to save PTX and CUBIN (#1916)
5eefa9a dopt is only available since nvrtc 11.7 (#1915)
2ec8fc7 Kill computeAtBetween (#1911)
d0d106a Improve view support on pointwise and transpose scheduler (#1906)
e71e1ec Fix name clash of RNG with shared memory (#1904)
3381793 Fix mutator and sameAs for expanded IterDomain (#1902)
```

RUN_TORCHBENCH: nvfuser
Differential Revision: [D39324552](https://our.internmc.facebook.com/intern/diff/D39324552)
Pull Request resolved: pytorch#84626
Approved by: https://github.com/malfet

zasdfgbnm added 5 commits August 17, 2022 12:15

fix misaligned

f447b22

rename

4fc3bfb

lint

21e6777

typo

1c53b58

mermaid

9852979

zasdfgbnm marked this pull request as draft August 17, 2022 19:28

zasdfgbnm marked this pull request as ready for review August 17, 2022 19:29

zasdfgbnm requested a review from shmsong August 17, 2022 19:30

don't vectorize if not possible

839da0a

csarofeen mentioned this pull request Aug 22, 2022

Silent wrong result on broadcasting with split and merge #1880

Open

csarofeen approved these changes Aug 22, 2022

View reviewed changes

zasdfgbnm merged commit 057237f into devel Aug 22, 2022

zasdfgbnm deleted the misaligned-fix branch August 22, 2022 17:43

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix CUDA driver error: misaligned address for transpose scheduler #1918

Fix CUDA driver error: misaligned address for transpose scheduler #1918

zasdfgbnm commented Aug 17, 2022 •

edited

Loading

zasdfgbnm commented Aug 17, 2022

csarofeen left a comment

Fix CUDA driver error: misaligned address for transpose scheduler #1918

Fix CUDA driver error: misaligned address for transpose scheduler #1918

Conversation

zasdfgbnm commented Aug 17, 2022 • edited Loading

zasdfgbnm commented Aug 17, 2022

csarofeen left a comment

Choose a reason for hiding this comment

zasdfgbnm commented Aug 17, 2022 •

edited

Loading