diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 56111961cb45..64f3698b9a31 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -12,11 +12,18 @@ // from https://gist.github.com/pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 #include <intrin.h> +static int __builtin_clz(unsigned x) { + unsigned long r; + _BitScanReverse(&r, x); + return 31 - static_cast<int>(r); +} + static int __builtin_ctz(unsigned x) { unsigned long r; _BitScanForward(&r, x); return static_cast<int>(r); } + #endif namespace mlir { @@ -601,18 +608,22 @@ Value pext_i32(RewriterBase &rewriter, Location loc, Value a, uint32_t mask) { if (mask == 0xFFFFFFFF) return a; - // We implement a blocked algorithm to avoid generating too many instructions + // Implements the blocked algorithm from + // https://forums.developer.nvidia.com/t/pdep-and-pext-functionality-for-cuda/270973 + uint32_t mskConst = mask; + uint32_t extcnt = 0; Value result = i32_val(0); - int resultPos = 0; - while (mask) { - int start = __builtin_ctz(mask); - int width = __builtin_ctz(~(mask >> start)); - Value shifted = lshr(a, i32_val(start)); - Value widthMask = i32_val(((1u << width) - 1)); - Value blockVal = and_(shifted, widthMask); - result = or_(result, shl(blockVal, i32_val(resultPos))); - resultPos += width; - mask &= ~(((1u << width) - 1) << start); + while (mskConst) { + uint32_t oldmsk = mskConst; + uint32_t bitgrplsb = mskConst & (-mskConst); + mskConst &= bitgrplsb + mskConst; + uint32_t bitgrp = mskConst ^ oldmsk; + uint32_t lsbpos = 31 - __builtin_clz(bitgrplsb); + // like popcount for a number 0..01..1..0 but portable + uint32_t grplen = __builtin_ctz(~(bitgrp >> lsbpos)); + uint32_t shift = lsbpos - extcnt; + extcnt += grplen; + result = or_(result, lshr(and_(i32_val(bitgrp), a), i32_val(shift))); } return result; } diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
index 09df9977828c..084224db124d 100644 --- a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp +++ b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp @@ -447,7 +447,8 @@ class DecomposeScaledBlocked mmaEnc.getInstrShape()[versionMajor == 3 ? 0 : mmaEnc.getInstrShape().size() - 2]; - auto warpSize = getWarpSize(newAEncoding); + auto mod = scaledDotOp->getParentOfType<ModuleOp>(); + int warpSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod); assert(instrShapeM <= warpSize); // Necessary choice to leave all the scales of the tile in that given warp auto threadsPerWarp = diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index c68a49e73006..bcbf00e5c8dd 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -2793,6 +2793,8 @@ def test_reduce_layouts(M, N, src_layout, axis, epilogue_kind, dtype_str, add_ov pytest.skip("Skipping test because it runs out of shared memory") if reduce_op == "sum" and dtype_str == "float16" and M * N > 1024: pytest.skip("Skipping sum reduction on float16 due to accuracy issues") + if is_hip() and isinstance(src_layout, LinearLayout): + pytest.skip("FIXME: LinearLayout not supported on HIP") if isinstance(src_layout, MmaLayout) and src_layout.version == 3: src_layout[2] = 16 if dtype_str == "float16" else 8