[AMDGPU] Propagate alias information in AMDGPULowerKernelArguments. #144714
This patch reimplements https://reviews.llvm.org/D108363 and https://reviews.llvm.org/D108361 to emit !noalias and !alias.scope metadata for noalias kernel arguments.
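To make the intended effect concrete, here is a hypothetical before/after sketch (not taken from the patch; names and metadata numbering are illustrative). The pass creates an alias-scope domain for the function and one scope per noalias argument, then tags accesses derived from a noalias argument with !alias.scope and unrelated accesses with !noalias:

```llvm
; Input: a kernel with one noalias pointer argument.
define amdgpu_kernel void @example(ptr addrspace(1) noalias %out,
                                   ptr addrspace(1) %in) {
  %v = load i32, ptr addrspace(1) %in
  store i32 %v, ptr addrspace(1) %out
  ret void
}

; Roughly what the tagged accesses would carry afterwards:
;   %v = load i32, ptr addrspace(1) %in, !noalias !2      ; cannot alias %out
;   store i32 %v, ptr addrspace(1) %out, !alias.scope !2  ; access through %out
;
; !0 = distinct !{!0, !"example"}            ; scope domain for @example
; !1 = distinct !{!1, !0, !"example: %out"}  ; scope for the noalias arg %out
; !2 = !{!1}
```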
@llvm/pr-subscribers-backend-amdgpu

Author: Leon Clark (PeddleSpam)

Changes: This patch reimplements https://reviews.llvm.org/D108363 and https://reviews.llvm.org/D108361 to emit !noalias and !alias.scope metadata for noalias kernel arguments.

Patch is 276.36 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/144714.diff

12 Files Affected:
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 6b56230a6e1d4..05490e6c81bc8 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -363,6 +363,17 @@ LLVM_ABI void updateProfileCallee(
Function *Callee, int64_t EntryDelta,
const ValueMap<const Value *, WeakTrackingVH> *VMap = nullptr);
+/// Adds `!noalias` and `!alias.scope` metadata for `CB`'s called function's
+/// `noalias` argument based memory accesses.
+void addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR,
+ ClonedCodeInfo &InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic);
+
+/// Adds `!noalias` and `!alias.scope` metadata for `F`'s `noalias` argument
+/// based memory accesses.
+void addAliasScopeMetadata(Function &F);
+
/// Find the 'llvm.experimental.noalias.scope.decl' intrinsics in the specified
/// basic blocks and extract their scope. These are candidates for duplication
/// when cloning.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index dec781d71c54e..edd19e1ef1241 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
@@ -86,6 +87,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
uint64_t ExplicitArgOffset = 0;
+
+ addAliasScopeMetadata(F);
+
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
@@ -124,11 +128,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
!ST.hasUsableDSOffset())
continue;
-
- // FIXME: We can replace this with equivalent alias.scope/noalias
- // metadata, but this appears to be a lot of work.
- if (Arg.hasNoAliasAttr())
- continue;
}
auto *VT = dyn_cast<FixedVectorType>(ArgTy);
@@ -215,8 +214,6 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
}
}
- // TODO: Convert noalias arg to !noalias
-
if (DoShiftOpt) {
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 7df5e9958182c..a56dc39e569c0 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -51,6 +51,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -1114,17 +1115,30 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
/// then add new alias scopes for each noalias argument, tag the mapped noalias
/// parameters with noalias metadata specifying the new scope, and tag all
/// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
- const DataLayout &DL, AAResults *CalleeAAR,
- ClonedCodeInfo &InlinedFunctionInfo) {
- if (!EnableNoAliasConversion)
- return;
-
- const Function *CalledFunc = CB.getCalledFunction();
+static void addAliasScopeMetadataImpl(CallBase *CB, Function *F,
+ ValueToValueMapTy *VMap,
+ const DataLayout &DL,
+ AAResults *CalleeAAR,
+ ClonedCodeInfo *InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic) {
+ assert(CB || F);
+ const Function *CalledFunc = CB ? CB->getCalledFunction() : F;
SmallVector<const Argument *, 4> NoAliasArgs;
+ std::function<bool(const Argument *, Attribute::AttrKind)> paramHasAttr;
+ if (CB) {
+ paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+ return CB->paramHasAttr(Arg->getArgNo(), Attr);
+ };
+
+ } else {
+ paramHasAttr = [&](const Argument *Arg, Attribute::AttrKind Attr) -> bool {
+ return Arg->hasAttribute(Attr);
+ };
+ }
+
for (const Argument &Arg : CalledFunc->args())
- if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
+ if (paramHasAttr(&Arg, Attribute::NoAlias) && !Arg.use_empty())
NoAliasArgs.push_back(&Arg);
if (NoAliasArgs.empty())
@@ -1166,29 +1180,20 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
NewScopes.insert(std::make_pair(A, NewScope));
if (UseNoAliasIntrinsic) {
+ assert(CB);
// Introduce a llvm.experimental.noalias.scope.decl for the noalias
// argument.
MDNode *AScopeList = MDNode::get(CalledFunc->getContext(), NewScope);
auto *NoAliasDecl =
- IRBuilder<>(&CB).CreateNoAliasScopeDeclaration(AScopeList);
+ IRBuilder<>(CB).CreateNoAliasScopeDeclaration(AScopeList);
// Ignore the result for now. The result will be used when the
// llvm.noalias intrinsic is introduced.
(void)NoAliasDecl;
}
}
- // Iterate over all new instructions in the map; for all memory-access
- // instructions, add the alias scope metadata.
- for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
- VMI != VMIE; ++VMI) {
- if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
- if (!VMI->second)
- continue;
-
- Instruction *NI = dyn_cast<Instruction>(VMI->second);
- if (!NI || InlinedFunctionInfo.isSimplified(I, NI))
- continue;
-
+ {
+ auto addAliasMD = [&](const Instruction *I, Instruction *NI) -> void {
bool IsArgMemOnlyCall = false, IsFuncCall = false;
SmallVector<const Value *, 2> PtrArgs;
@@ -1207,7 +1212,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// know that about the inlined clone of this call site, and we don't
// need to add metadata.
if (Call->doesNotAccessMemory())
- continue;
+ return;
IsFuncCall = true;
if (CalleeAAR) {
@@ -1215,7 +1220,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// We'll retain this knowledge without additional metadata.
if (ME.onlyAccessesInaccessibleMem())
- continue;
+ return;
if (ME.onlyAccessesArgPointees())
IsArgMemOnlyCall = true;
@@ -1237,7 +1242,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// However, if this is a call, this we might just alias with none of the
// noalias arguments.
if (PtrArgs.empty() && !IsFuncCall)
- continue;
+ return;
// It is possible that there is only one underlying object, but you
// need to go through several PHIs to see it, and thus could be
@@ -1270,7 +1275,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// completely describe the aliasing properties using alias.scope
// metadata (and, thus, won't add any).
if (const Argument *A = dyn_cast<Argument>(V)) {
- if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
+ if (!paramHasAttr(A, Attribute::NoAlias))
UsesAliasingPtr = true;
} else {
UsesAliasingPtr = true;
@@ -1292,7 +1297,7 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// Nothing we can do if the used underlying object cannot be reliably
// determined.
if (UsesUnknownObject)
- continue;
+ return;
// A function call can always get captured noalias pointers (via other
// parameters, globals, etc.).
@@ -1353,10 +1358,49 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
LLVMContext::MD_alias_scope,
MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
MDNode::get(CalledFunc->getContext(), Scopes)));
+ };
+
+ if (VMap) {
+ assert(InlinedFunctionInfo);
+
+ for (ValueToValueMapTy::iterator VMI = VMap->begin(), VMIE = VMap->end();
+ VMI != VMIE; ++VMI) {
+ const Instruction *I = dyn_cast<Instruction>(VMI->first);
+ if (!I || !VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI || InlinedFunctionInfo->isSimplified(I, NI))
+ continue;
+
+ addAliasMD(I, NI);
+ }
+
+ } else {
+ for (auto It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+ Instruction *I = &(*It);
+ addAliasMD(I, I);
+ }
}
}
}
+void llvm::addAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR,
+ ClonedCodeInfo &InlinedFunctionInfo,
+ bool UseNoAliasIntrinsic) {
+ addAliasScopeMetadataImpl(&CB, /* F */ nullptr, &VMap, DL, CalleeAAR,
+ &InlinedFunctionInfo, UseNoAliasIntrinsic);
+}
+
+void llvm::addAliasScopeMetadata(Function &F) {
+ addAliasScopeMetadataImpl(/* CB */ nullptr, &F, /* VMap */ nullptr,
+ F.getParent()->getDataLayout(),
+ /* CalleeAAR */ nullptr,
+ /* InlinedFunctionInfo */ nullptr,
+ /* UseNoAliasIntrinsic */ false);
+}
+
static bool MayContainThrowingOrExitingCallAfterCB(CallBase *Begin,
ReturnInst *End) {
@@ -2797,7 +2841,9 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
SAMetadataCloner.remap(FirstNewBlock, Caller->end());
// Add noalias metadata if necessary.
- AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo);
+ if (EnableNoAliasConversion)
+ addAliasScopeMetadata(CB, VMap, DL, CalleeAAR, InlinedFunctionInfo,
+ UseNoAliasIntrinsic);
// Clone return attributes on the callsite into the calls within the inlined
// function which feed into its return value.
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 99b7c7737f4ae..a87baca5a5878 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -105,11 +105,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v2, v0
+; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -181,8 +181,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_ffbh_u32_e32 v0, v0
@@ -261,8 +261,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v3, v3
; VI-NEXT: v_ffbh_u32_e32 v2, v2
@@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
@@ -605,15 +605,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
-; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: v_ffbh_u32_e32 v1, v1
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
+; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v3, 24, v2
+; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; VI-NEXT: v_ffbh_u32_e32 v3, v3
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, 32, v3, vcc
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -706,21 +706,21 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_readfirstlane_b32 s2, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s3, v0
-; VI-NEXT: s_lshl_b32 s2, s2, 8
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: s_lshl_b32 s3, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_flbit_i32_b32 s3, s3
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b32 s2, s3, 32
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_readfirstlane_b32 s0, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_readfirstlane_b32 s1, v3
+; VI-NEXT: s_lshl_b32 s0, s0, 8
+; VI-NEXT: s_or_b32 s0, s0, s1
+; VI-NEXT: s_lshl_b32 s1, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s1, s1
+; VI-NEXT: s_cmp_lg_u32 s0, 0
+; VI-NEXT: s_cselect_b32 s0, s1, 32
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -814,37 +814,37 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_add_u32 s2, s2, 1
-; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
-; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: flat_load_ubyte v4, v[6:7]
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v3, v[0:1]
+; VI-NEXT: flat_load_ubyte v4, v[4:5]
+; VI-NEXT: flat_load_ubyte v5, v[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
+; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_ffbh_u32_e32 v2, v2
+; VI-NEXT: v_min_u32_e32 v2, 32, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -965,29 +965,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_mov_b32_e32 v6, s4
-; VI-NEXT: s_add_u32 s4, s2, 3
+; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: s_add_u32 s4, s2, 2
+; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: flat_load_ubyte v10, v[0:1]
; VI-NEXT: flat_load_ubyte v11, v[2:3]
; VI-NEXT: flat_load_ubyte v12, v[4:5]
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_add_u32 s4, s2, 1
-; VI-NEXT: s_addc_u32 s5, s3, 0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_u32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: flat_load_ubyte v0, v[0:1]
+; VI-NEXT: flat_load_ubyte v8, v[0:1]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10
; VI-NEXT: s_waitcnt vmcnt(6)
@@ -1001,19 +1002,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; VI-NEXT: v_ffbh_u32_e32 v4, v4
; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_or_b32_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v5, v8
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_e32 v2, v2, v3
-; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
-; VI-NEXT: v_min_u32_e32 v0, v0, v4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_u32_e32 v0, 64, v0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; ...
[truncated]
@llvm/pr-subscribers-llvm-transforms
✅ With the latest revision this PR passed the C/C++ code formatter.
Missing tests. This should have dedicated IR tests
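For reference, a dedicated IR test would run only this pass through opt and FileCheck the emitted metadata directly, instead of relying on codegen diffs. A minimal sketch, assuming the pass keeps its current pipeline name amdgpu-lower-kernel-arguments (the kernel and CHECK patterns are illustrative):

```llvm
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa \
; RUN:   -passes=amdgpu-lower-kernel-arguments %s | FileCheck %s

; The pass rewrites the argument itself, so match the store's pointer
; loosely and only pin down the new !alias.scope metadata.
; CHECK-LABEL: @noalias_arg(
; CHECK: store i32 0, ptr addrspace(1) %{{.*}}, !alias.scope !{{[0-9]+}}
define amdgpu_kernel void @noalias_arg(ptr addrspace(1) noalias %out) {
  store i32 0, ptr addrspace(1) %out
  ret void
}
```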
SmallVector<const Argument *, 4> NoAliasArgs;

std::function<bool(const Argument *, Attribute::AttrKind)> paramHasAttr;
This doesn't need to be a std::function. I also don't see this adding anything, it would be clearer to just directly do the attribute test in context
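Concretely, the suggestion amounts to something like the following fragment (a sketch of the review feedback, not code from the patch):

```cpp
// Branch on CB at the single point of use instead of building a
// std::function up front; Argument::hasAttribute covers the function-only
// case and CallBase::paramHasAttr covers the call-site case.
for (const Argument &Arg : CalledFunc->args()) {
  bool IsNoAlias = CB ? CB->paramHasAttr(Arg.getArgNo(), Attribute::NoAlias)
                      : Arg.hasAttribute(Attribute::NoAlias);
  if (IsNoAlias && !Arg.use_empty())
    NoAliasArgs.push_back(&Arg);
}
```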
continue;

{
  auto addAliasMD = [&](const Instruction *I, Instruction *NI) -> void {
Make this a real helper function, not a lambda
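That is, the lambda body would move into a file-local function with its captured state passed explicitly; one possible signature (illustrative only — the exact parameter list depends on what the lambda actually captures):

```cpp
// Sketch of the requested refactor: a static helper in InlineFunction.cpp
// replacing the addAliasMD lambda; NewScopes and friends become parameters.
static void addAliasMDToInstruction(
    const Instruction *I, Instruction *NI, const CallBase *CB,
    const Function *CalledFunc, const DataLayout &DL, AAResults *CalleeAAR,
    const DenseMap<const Argument *, MDNode *> &NewScopes,
    ArrayRef<const Argument *> NoAliasArgs);
```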
This patch reimplements D108363 and D108361 to emit !noalias and !alias.scope metadata for noalias kernel arguments.