diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index f078a68231448a..cae9dd93bc5592 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1390,6 +1390,19 @@ void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
   }
   if (!D)
     return;
+
+  // OpenMP declare target variables must be visible to the host so they can
+  // be registered. We require protected visibility unless the variable has
+  // the DT_nohost modifier and does not need to be registered.
+  if (Context.getLangOpts().OpenMP &&
+      Context.getLangOpts().OpenMPIsTargetDevice && isa<VarDecl>(D) &&
+      D->hasAttr<OMPDeclareTargetDeclAttr>() &&
+      D->getAttr<OMPDeclareTargetDeclAttr>()->getDevType() !=
+          OMPDeclareTargetDeclAttr::DT_NoHost) {
+    GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
+    return;
+  }
+
   // Set visibility for definitions, and for declarations if requested globally
   // or set explicitly.
   LinkageInfo LV = D->getLinkageAndVisibility();
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index dc628b7345f59f..f6a614b3e4d54d 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -308,12 +308,13 @@ static bool requiresAMDGPUProtectedVisibility(const Decl *D,
   if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
     return false;
 
-  return D->hasAttr<OpenCLKernelAttr>() ||
-         (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
-         (isa<VarDecl>(D) &&
-          (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
-           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
-           cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
+  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
+         (D->hasAttr<OpenCLKernelAttr>() ||
+          (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
+          (isa<VarDecl>(D) &&
+           (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
+            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
+            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
 }
 
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
diff --git a/clang/test/OpenMP/declare_target_codegen.cpp b/clang/test/OpenMP/declare_target_codegen.cpp
index 71c742198af6bf..225695feae9515 100644
--- a/clang/test/OpenMP/declare_target_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_codegen.cpp
@@ -31,7 +31,7 @@
 // CHECK-DAG: @dy = {{protected | }}global i32 0,
 // CHECK-DAG: @bbb = {{protected | }}global i32 0,
 // CHECK-DAG: weak constant %struct.__tgt_offload_entry { ptr @bbb,
-// CHECK-DAG: @ccc = external global i32,
+// CHECK-DAG: @ccc = external {{protected | }}global i32,
 // CHECK-DAG: @ddd = {{protected | }}global i32 0,
 // CHECK-DAG: @hhh_decl_tgt_ref_ptr = weak global ptr null
 // CHECK-DAG: @ggg_decl_tgt_ref_ptr = weak global ptr null
diff --git a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
index 0acd98129394b8..2b256cd6a4c7f0 100644
--- a/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_constexpr_codegen.cpp
@@ -16,7 +16,7 @@ class A {
 public:
   static constexpr double pi = 3.141592653589793116;
 //.
-// CHECK: @_ZN1A2piE = linkonce_odr constant double 0x400921FB54442D18, comdat, align 8
+// CHECK: @_ZN1A2piE = linkonce_odr protected constant double 0x400921FB54442D18, comdat, align 8
 // CHECK: @_ZL9anotherPi = internal constant double 3.140000e+00, align 8
 // CHECK: @llvm.compiler.used = appending global [2 x ptr] [ptr @"__ZN1A2piE$ref", ptr @"__ZL9anotherPi$ref"], section "llvm.metadata"
 //.
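The new CodeGenModule check is exercised by source like the following (an illustrative sketch, not part of the patch; the variable names are hypothetical, and the begin/end form is equivalent to the to(...) form used in target_visibility.cpp below):

#pragma omp begin declare target
int registered = 0;        // must be registered with the host, so the
                           // device compile forces protected visibility
#pragma omp end declare target

#pragma omp begin declare target device_type(nohost)
const int device_only = 0; // never registered with the host, so a hidden
                           // visibility request is honored
#pragma omp end declare target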
diff --git a/clang/test/OpenMP/target_visibility.cpp b/clang/test/OpenMP/target_visibility.cpp
index 938d164df89bff..2554f653170b94 100644
--- a/clang/test/OpenMP/target_visibility.cpp
+++ b/clang/test/OpenMP/target_visibility.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
-// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
 // expected-no-diagnostics
 
@@ -21,6 +21,14 @@ void B::bar() { A a; a.foo(); }
 void B::sbar() { A::sfoo(); }
 #pragma omp declare target to(B::bar, B::sbar)
 
+[[gnu::visibility("hidden")]] extern const int x = 0;
+#pragma omp declare target to(x) device_type(nohost)
+
+[[gnu::visibility("hidden")]] int y = 0;
+#pragma omp declare target to(y)
+
+// CHECK-DAG: @x = hidden{{.*}} constant i32 0
+// CHECK-DAG: @y = protected{{.*}} i32 0
 // CHECK-DAG: define hidden void @_ZN1B4sbarEv()
 // CHECK-DAG: define linkonce_odr hidden void @_ZN1A4sfooEv()
 // CHECK-DAG: define hidden void @_ZN1B3barEv(
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 486ae01800b600..04ef577e6593e2 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -16,7 +16,7 @@
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
 
-#define LLVM_MAIN_REVISION 476930
+#define LLVM_MAIN_REVISION 476933
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
diff --git a/llvm/lib/CodeGen/MachineFunctionPass.cpp b/llvm/lib/CodeGen/MachineFunctionPass.cpp
index 3a1e1720be9c62..d57a912f418b72 100644
--- a/llvm/lib/CodeGen/MachineFunctionPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -88,6 +88,8 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
     MF.print(OS);
   }
 
+  MFProps.reset(ClearedProperties);
+
   bool RV = runOnMachineFunction(MF);
 
   if (ShouldEmitSizeRemarks) {
@@ -114,7 +116,6 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
   }
 
   MFProps.set(SetProperties);
-  MFProps.reset(ClearedProperties);
 
   // For --print-changed, print if the serialized MF has changed. Modes other
   // than quiet/verbose are unimplemented and treated the same as 'quiet'.
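Moving MFProps.reset(ClearedProperties) ahead of runOnMachineFunction(MF) makes a pass's cleared properties take effect before the pass body runs instead of after it returns, which is what lets the RegisterCoalescer hunk below drop its manual MRI->leaveSSA() workaround. For illustration, a pass that introduces multiple definitions of virtual registers would declare that via the existing getClearedProperties() hook, roughly like this (a sketch; RegisterCoalescer's actual override is not shown in this diff):

MachineFunctionProperties getClearedProperties() const override {
  // Coalescing can create multiple defs of a virtual register, so the
  // function is no longer in SSA form once this pass has run.
  return MachineFunctionProperties().set(
      MachineFunctionProperties::Property::IsSSA);
}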
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 68c5fee85973d5..7e5ce300370c92 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -4167,14 +4167,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
 
-  // FIXME: MachineFunctionProperties cannot express the required pre-property
-  // no-SSA. When running a MIR testcase without any virtual register defs, the
-  // MIR parser assumes SSA. MachineFunctionPass::getClearedProperties is called
-  // after the pass is run, so the properties at this point say it's an SSA
-  // function. Forcibly clear it here so -verify-coalescing doesn't complain
-  // after multiple virtual register defs are introduced.
-  MRI->leaveSSA();
-
   // If there are PHIs tracked by debug-info, they will need updating during
   // coalescing. Build an index of those PHIs to ease updating.
   SlotIndexes *Slots = LIS->getSlotIndexes();
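The MLIR changes below add a subview pattern to memref narrow-type emulation. The core of the rewrite scales the subview's static offset and size from narrow elements to storage elements. A standalone sketch of that arithmetic (hypothetical helper, using the i4-in-i8 numbers from the new test at the end of this patch):

#include <cassert>
#include <cstdint>

int main() {
  int srcBits = 4, dstBits = 8;             // i4 emulated in i8 storage
  int elementsPerByte = dstBits / srcBits;  // 2 narrow elements per byte
  int64_t offset = 32, size = 32;           // memref.subview %arr[32] [32] [1]
  assert(offset % elementsPerByte == 0);    // otherwise the pattern bails out
  int64_t newOffset = offset / elementsPerByte;                      // 16
  int64_t newSize = (size + elementsPerByte - 1) / elementsPerByte;  // ceilDiv -> 16
  assert(newOffset == 16 && newSize == 16); // matches the CHECK lines below
  return 0;
}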
diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
index 2a524ceb9db887..9f58e9055acadb 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Support/MathExtras.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MathExtras.h"
@@ -209,6 +210,76 @@ struct ConvertMemRefLoad final : OpConversionPattern<memref::LoadOp> {
     return success();
   }
 };
+
+//===----------------------------------------------------------------------===//
+// ConvertMemRefSubview
+//===----------------------------------------------------------------------===//
+
+/// Emulating narrow ints on subview has limited support, supporting only
+/// a static offset and size and a stride of 1. Ideally, the subview should
+/// be folded away before running narrow type emulation, and this pattern
+/// would never run. This pattern is mostly used for testing purposes.
+struct ConvertMemRefSubview final : OpConversionPattern<memref::SubViewOp> {
+  using OpConversionPattern::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(memref::SubViewOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    MemRefType newTy =
+        dyn_cast<MemRefType>(getTypeConverter()->convertType(op.getType()));
+    if (!newTy) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(),
+          llvm::formatv("failed to convert memref type: {0}", op.getType()));
+    }
+
+    auto convertedElementType = newTy.getElementType();
+    auto oldElementType = op.getType().getElementType();
+    int srcBits = oldElementType.getIntOrFloatBitWidth();
+    int dstBits = convertedElementType.getIntOrFloatBitWidth();
+    if (dstBits % srcBits != 0) {
+      return rewriter.notifyMatchFailure(
+          op, "only dstBits % srcBits == 0 supported");
+    }
+
+    // Only support offset for 1-D subview.
+    if (op.getType().getRank() != 1) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with rank > 1 is not supported");
+    }
+
+    // Only support stride of 1.
+    if (op.getStaticStride(0) != 1) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with stride != 1 is not supported");
+    }
+
+    int64_t size = op.getStaticSize(0);
+    int64_t offset = op.getStaticOffset(0);
+    // Only support static sizes and offsets.
+    if (size == ShapedType::kDynamic || offset == ShapedType::kDynamic) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(), "subview with dynamic size or offset is not supported");
+    }
+
+    int elementsPerByte = dstBits / srcBits;
+    if (offset % elementsPerByte != 0) {
+      return rewriter.notifyMatchFailure(
+          op->getLoc(),
+          "subview with offset not multiple of elementsPerByte is not "
+          "supported");
+    }
+
+    size = ceilDiv(size, elementsPerByte);
+    offset = offset / elementsPerByte;
+
+    rewriter.replaceOpWithNewOp<memref::SubViewOp>(
+        op, newTy, *adaptor.getODSOperands(0).begin(), offset, size,
+        op.getStaticStrides());
+    return success();
+  }
+};
+
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
@@ -220,9 +291,9 @@ void memref::populateMemRefNarrowTypeEmulationPatterns(
     RewritePatternSet &patterns) {
 
   // Populate `memref.*` conversion patterns.
-  patterns
-      .add<ConvertMemRefAlloc, ConvertMemRefLoad, ConvertMemRefAssumeAlignment>(
-          typeConverter, patterns.getContext());
+  patterns.add<ConvertMemRefAlloc, ConvertMemRefLoad,
+               ConvertMemRefAssumeAlignment, ConvertMemRefSubview>(
+      typeConverter, patterns.getContext());
   memref::populateResolveExtractStridedMetadataPatterns(patterns);
 }
 
@@ -271,9 +342,22 @@ void memref::populateMemRefNarrowTypeEmulationConversions(
           return std::nullopt;
 
         StridedLayoutAttr layoutAttr;
+        // If the offset is 0, we do not need a strided layout as the stride is
+        // 1, so we only use the strided layout if the offset is not 0.
         if (offset != 0) {
-          layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
-                                              ArrayRef<int64_t>{1});
+          if (offset == ShapedType::kDynamic) {
+            layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
+                                                ArrayRef<int64_t>{1});
+          } else {
+            // Check that the offset in bits is a multiple of the loadStoreWidth
+            // and, if so, divide by the loadStoreWidth to get the new offset.
+            if ((offset * width) % loadStoreWidth != 0)
+              return std::nullopt;
+            offset = (offset * width) / loadStoreWidth;
+
+            layoutAttr = StridedLayoutAttr::get(ty.getContext(), offset,
+                                                ArrayRef<int64_t>{1});
+          }
         }
 
         return MemRefType::get(getLinearizedShape(ty, width, loadStoreWidth),
diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
index c0050d8c510d53..6ed97f05aa7cff 100644
--- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
+++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir
@@ -155,3 +155,22 @@ func.func @rank_zero_memref() -> i4 {
 // CHECK32:   %[[LOAD:.+]] = memref.load %[[ALLOC]][] : memref<i32>
 // CHECK32:   %[[TRUNC:.+]] = arith.trunci %[[LOAD]] : i32 to i4
 // CHECK32:   return %[[TRUNC]]
+
+// -----
+
+func.func @memref_strided_i4(%idx : index) -> i4 {
+  %arr = memref.alloc() : memref<128xi4>
+  %subview = memref.subview %arr[32] [32] [1] : memref<128xi4> to memref<32xi4, strided<[1], offset:32>>
+  %1 = memref.load %subview[%idx] : memref<32xi4, strided<[1], offset:32>>
+  return %1 : i4
+}
+
+// CHECK-LABEL: func @memref_strided_i4
+// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<64xi8>
+// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][16] [16] [1] : memref<64xi8> to memref<16xi8, strided<[1], offset: 16>>
+// CHECK: %[[LOAD:.+]] = memref.load %[[SUBVIEW]]
+
+// CHECK32-LABEL: func @memref_strided_i4
+// CHECK32: %[[ALLOC:.+]] = memref.alloc() : memref<16xi32>
+// CHECK32: %[[SUBVIEW:.+]] = memref.subview %[[ALLOC]][4] [4] [1] : memref<16xi32> to memref<4xi32, strided<[1], offset: 4>>
+// CHECK32: %[[LOAD:.+]] = memref.load %[[SUBVIEW]]
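The strided-layout conversion in populateMemRefNarrowTypeEmulationConversions applies the same scaling to the layout's static offset, but in bits: the offset times the source width must divide evenly by the load/store width. A sketch of that check under the same i4/i8 assumptions as above:

#include <cassert>
#include <cstdint>

int main() {
  int width = 4, loadStoreWidth = 8;  // source and storage widths in bits
  int64_t offset = 32;                // from strided<[1], offset: 32>
  assert((offset * width) % loadStoreWidth == 0); // else conversion fails
  offset = (offset * width) / loadStoreWidth;     // 16, as in the CHECK line
  assert(offset == 16);
  return 0;
}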