diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 87ac4541abe39..e781caf494a6a 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5557,8 +5557,9 @@ class OffloadingActionBuilder final { DA.add(*DeviceWrappingAction, *TC, BoundArch, Action::OFK_SYCL); continue; } - if (IsNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) { - // When compiling for Nvidia/CUDA devices and the user requested the + if ((IsNVPTX || IsAMDGCN) && + Args.hasArg(options::OPT_fsycl_embed_ir)) { + // When compiling for Nvidia/AMD devices and the user requested the // IR to be embedded in the application (via option), run the output // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) // through the offload wrapper and link the resulting object to the diff --git a/clang/test/Driver/sycl-embed-ir.cpp b/clang/test/Driver/sycl-embed-ir.cpp new file mode 100644 index 0000000000000..ed65be3b24314 --- /dev/null +++ b/clang/test/Driver/sycl-embed-ir.cpp @@ -0,0 +1,19 @@ +/// Tests for -fsycl-embed-ir + +// UNSUPPORTED: system-windows + +// RUN: %clangxx -fsycl -fsycl-targets=nvidia_gpu_sm_80 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-NV %s + +// CHECK-NV: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, sm_80) +// CHECK-NV: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, sm_80) +// CHECK-NV: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, sm_80) +// CHECK-NV: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {[[WRAP]]}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {{{.*}}}, image + +// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx1010 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-AMD %s + +// CHECK-AMD: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, gfx1010) +// CHECK-AMD: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, gfx1010) +// 
CHECK-AMD: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, gfx1010) +// CHECK-AMD: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {[[WRAP]]}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {{{.*}}}, image diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h index ef10cd47b0394..1820cb040730f 100644 --- a/sycl-fusion/common/include/Kernel.h +++ b/sycl-fusion/common/include/Kernel.h @@ -20,6 +20,30 @@ namespace jit_compiler { using BinaryAddress = const uint8_t *; +/// Possible barrier flags +enum class BarrierFlags : uint32_t { + None = 0, // Do not insert barrier + Local = 1, // Ensure correct ordering of memory operations to local memory + Global = 2, // Ensure correct ordering of memory operations to global memory + LocalAndGlobal = Local | Global +}; + +constexpr BarrierFlags getNoBarrierFlag() { return BarrierFlags::None; } +constexpr BarrierFlags getLocalAndGlobalBarrierFlag() { + return BarrierFlags::LocalAndGlobal; +} +constexpr bool isNoBarrierFlag(BarrierFlags Flag) { + return Flag == BarrierFlags::None; +} +constexpr bool hasLocalBarrierFlag(BarrierFlags Flag) { + return static_cast<uint32_t>(Flag) & + static_cast<uint32_t>(BarrierFlags::Local); +} +constexpr bool hasGlobalBarrierFlag(BarrierFlags Flag) { + return static_cast<uint32_t>(Flag) & + static_cast<uint32_t>(BarrierFlags::Global); +} + /// /// Enumerate possible kinds of parameters. /// 1:1 correspondence with the definition in kernel_desc.hpp in the DPC++ SYCL @@ -35,7 +59,7 @@ enum class ParameterKind : uint32_t { }; /// Different binary formats supported as input to the JIT compiler. -enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX }; +enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX, AMDGCN }; /// Information about a device intermediate representation module (e.g., SPIR-V, /// LLVM IR) from DPC++. 
diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h index 09058d61e9981..8de794256c71c 100644 --- a/sycl-fusion/common/lib/KernelIO.h +++ b/sycl-fusion/common/lib/KernelIO.h @@ -48,6 +48,7 @@ template <> struct ScalarEnumerationTraits { IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM); IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV); IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX); + IO.enumCase(BF, "AMDGCN", jit_compiler::BinaryFormat::AMDGCN); IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID); } }; diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index bf323239679b4..35850132fcd95 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -9,6 +9,9 @@ add_llvm_library(sycl-fusion lib/fusion/ModuleHelper.cpp lib/helper/ConfigHelper.cpp + DEPENDS + intrinsics_gen + LINK_COMPONENTS BitReader Core @@ -50,6 +53,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX) endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() + if (BUILD_SHARED_LIBS) if(NOT MSVC AND NOT APPLE) # Manage symbol visibility through the linker to make sure no LLVM symbols diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index 69465a74d8371..381c7b2a54343 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -19,6 +19,7 @@ #include "Hashing.h" #include "Kernel.h" +#include "Options.h" #include "Parameter.h" namespace llvm { @@ -28,7 +29,7 @@ class LLVMContext; namespace jit_compiler { using CacheKeyT = - std::tuple, ParamIdentList, int, + std::tuple, ParamIdentList, BarrierFlags, std::vector, std::vector, // This field of the cache is optional because, if all of the // ranges 
are equal, we will perform no remapping, so that fused diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h index d4607df2e5660..7f0d287ae7aa1 100644 --- a/sycl-fusion/jit-compiler/include/KernelFusion.h +++ b/sycl-fusion/jit-compiler/include/KernelFusion.h @@ -60,7 +60,8 @@ class KernelFusion { const std::vector &KernelInformation, const std::vector &KernelsToFuse, const std::string &FusedKernelName, - jit_compiler::ParamIdentList &Identities, int BarriersFlags, + jit_compiler::ParamIdentList &Identities, + BarrierFlags BarriersFlags, const std::vector &Internalization, const std::vector &JITConstants); diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 8aa62a664641c..fdd58def9f2f6 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -58,6 +58,13 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) { #else // FUSION_JIT_SUPPORT_PTX return false; #endif // FUSION_JIT_SUPPORT_PTX + } + case BinaryFormat::AMDGCN: { +#ifdef FUSION_JIT_SUPPORT_AMDGCN + return true; +#else // FUSION_JIT_SUPPORT_AMDGCN + return false; +#endif // FUSION_JIT_SUPPORT_AMDGCN } default: return false; @@ -69,7 +76,7 @@ FusionResult KernelFusion::fuseKernels( const std::vector &KernelInformation, const std::vector &KernelsToFuse, const std::string &FusedKernelName, ParamIdentList &Identities, - int BarriersFlags, + BarrierFlags BarriersFlags, const std::vector &Internalization, const std::vector &Constants) { // Initialize the configuration helper to make the options for this invocation diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index a6c5569d87d22..bf769b50e2f30 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -40,7 +40,7 @@ static unsigned 
getFlatAddressSpace(Module &Mod) { // Ideally, we could get this information from the TargetTransformInfo, but // the SPIR-V backend does not yet seem to have an implementation for that. llvm::Triple Tri(Mod.getTargetTriple()); - if (Tri.isNVPTX()) { + if (Tri.isNVPTX() || Tri.isAMDGCN()) { return 0; } if (Tri.isSPIRV() || Tri.isSPIR()) { @@ -53,7 +53,7 @@ static unsigned getFlatAddressSpace(Module &Mod) { std::unique_ptr FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, - int BarriersFlags) { + BarrierFlags BarriersFlags) { // Perform the actual kernel fusion, i.e., generate a kernel function for the // fused kernel from the kernel functions of the input kernels. This is done // by the SYCLKernelFusion LLVM pass, which is run here through a custom LLVM diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h index 0941472dafe9d..642206025224f 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h @@ -25,7 +25,7 @@ class FusionPipeline { /// contain an entry for the fused kernel. 
static std::unique_ptr runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo, - int BarriersFlags); + BarrierFlags BarriersFlags); }; } // namespace fusion } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 69067b3ad5d84..354e68bab5ea2 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -191,6 +191,14 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, KernelBin = *BinaryOrError; break; } + case BinaryFormat::AMDGCN: { + llvm::Expected BinaryOrError = + translateToAMDGCN(Kernel, Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) + return Error; + KernelBin = *BinaryOrError; + break; + } default: { return createStringError( inconvertibleErrorCode(), @@ -287,3 +295,75 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX); #endif // FUSION_JIT_SUPPORT_PTX } + +llvm::Expected +KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo, + llvm::Module &Mod, JITContext &JITCtx) { +#ifndef FUSION_JIT_SUPPORT_AMDGCN + (void)KernelInfo; + (void)Mod; + (void)JITCtx; + return createStringError(inconvertibleErrorCode(), + "AMDGPU translation not supported in this build"); +#else // FUSION_JIT_SUPPORT_AMDGCN + + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUAsmPrinter(); + LLVMInitializeAMDGPUTargetMC(); + + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; + + std::string TargetTriple{"amdgcn-amd-amdhsa"}; + + std::string ErrorMessage; + const auto *Target = + llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage); + + if (!Target) + return createStringError( + inconvertibleErrorCode(), + "Failed to 
load and translate AMDGCN LLVM IR module with error %s", + ErrorMessage.c_str()); + + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + llvm::StringRef TargetCPU{"gfx906"}; + llvm::StringRef TargetFeatures{""}; + if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) { + if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + TargetCPU = + KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + + // FIXME: Check whether we can provide more accurate target information here + auto *TargetMachine = Target->createTargetMachine( + TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, + std::nullopt, llvm::CodeGenOptLevel::Default); + + std::string AMDObj; + { + llvm::legacy::PassManager PM; + llvm::raw_string_ostream OBJStream{AMDObj}; + llvm::buffer_ostream BufferedOBJ{OBJStream}; + + if (TargetMachine->addPassesToEmitFile(PM, BufferedOBJ, nullptr, + llvm::CodeGenFileType::ObjectFile)) { + return createStringError( + inconvertibleErrorCode(), + "Failed to construct pass pipeline to emit output"); + } + + PM.run(Mod); + OBJStream.flush(); + } + + return &JITCtx.emplaceKernelBinary(std::move(AMDObj), BinaryFormat::AMDGCN); +#endif // FUSION_JIT_SUPPORT_AMDGCN +} diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 7e4816df9bf94..a7936438202d0 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -43,6 +43,10 @@ class KernelTranslator { static llvm::Expected translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx); + + static llvm::Expected + translateToAMDGCN(SYCLKernelInfo &KernelInfo, 
llvm::Module &Mod, + JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index 4693083be4faa..1b1f393eb9df9 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -30,6 +30,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX) endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() + # Static library for linking with the jit_compiler add_llvm_library(SYCLKernelFusionPasses SYCLFusionPasses.cpp @@ -68,3 +72,7 @@ target_link_libraries(SYCLKernelFusionPasses if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX) endif() + +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() diff --git a/sycl-fusion/passes/SYCLFusionPasses.cpp b/sycl-fusion/passes/SYCLFusionPasses.cpp index b112cfec1b18e..16a938ec991ce 100644 --- a/sycl-fusion/passes/SYCLFusionPasses.cpp +++ b/sycl-fusion/passes/SYCLFusionPasses.cpp @@ -9,12 +9,15 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" +#include "Kernel.h" + #include "internalization/Internalization.h" #include "kernel-fusion/SYCLKernelFusion.h" #include "kernel-info/SYCLKernelInfo.h" #include "syclcp/SYCLCP.h" using namespace llvm; +using namespace jit_compiler; cl::opt NoBarriers("sycl-kernel-fusion-no-barriers", @@ -28,8 +31,9 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() { [](StringRef Name, ModulePassManager &MPM, ArrayRef) { if (Name == "sycl-kernel-fusion") { - int BarrierFlag = - (NoBarriers) ? -1 : SYCLKernelFusion::DefaultBarriersFlags; + BarrierFlags BarrierFlag = + (NoBarriers) ? 
getNoBarrierFlag() + : SYCLKernelFusion::DefaultBarriersFlags; MPM.addPass(SYCLKernelFusion(BarrierFlag)); return true; } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index ecc97b244c6ff..4a0f2509cff86 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -234,13 +234,12 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder, return {Entry, CallInsertion, Exit}; } -static Expected -createFusionCall(IRBuilderBase &Builder, Function *F, - ArrayRef CallArgs, - const jit_compiler::NDRange &SrcNDRange, - const jit_compiler::NDRange &FusedNDRange, bool IsLast, - int BarriersFlags, jit_compiler::Remapper &Remapper, - bool ShouldRemap, TargetFusionInfo &TargetInfo) { +static Expected createFusionCall( + IRBuilderBase &Builder, Function *F, ArrayRef CallArgs, + const jit_compiler::NDRange &SrcNDRange, + const jit_compiler::NDRange &FusedNDRange, bool IsLast, + jit_compiler::BarrierFlags BarriersFlags, jit_compiler::Remapper &Remapper, + bool ShouldRemap, TargetFusionInfo &TargetInfo) { const auto IPs = addGuard(Builder, TargetInfo, SrcNDRange, FusedNDRange, IsLast); @@ -266,7 +265,7 @@ createFusionCall(IRBuilderBase &Builder, Function *F, Builder.SetInsertPoint(IPs.Exit); // Insert barrier if needed - if (!IsLast && BarriersFlags > 0) { + if (!IsLast && !jit_compiler::isNoBarrierFlag(BarriersFlags)) { TargetInfo.createBarrierCall(Builder, BarriersFlags); } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h index 5f52f0a317d14..3da212ccb668d 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h @@ -35,7 +35,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { constexpr static llvm::StringLiteral NDRangesMDKey{"sycl.kernel.nd-ranges"}; constexpr SYCLKernelFusion() = default; - 
constexpr explicit SYCLKernelFusion(int BarriersFlags) + constexpr explicit SYCLKernelFusion(jit_compiler::BarrierFlags BarriersFlags) : BarriersFlags{BarriersFlags} {} llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); @@ -45,7 +45,8 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { /// /// By default, correct ordering of memory operations to global memory is /// ensured. - constexpr static int DefaultBarriersFlags{3}; + constexpr static jit_compiler::BarrierFlags DefaultBarriersFlags{ + jit_compiler::getLocalAndGlobalBarrierFlag()}; private: // This needs to be in sync with the metadata kind @@ -155,11 +156,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { /// /// Flags to apply to the barrier to be introduced between fused kernels. /// - /// Possible values: - /// - -1: Do not insert barrier - /// - 1: ensure correct ordering of memory operations to local memory - /// - 2: ensure correct ordering of memory operations to global memory - const int BarriersFlags{DefaultBarriersFlags}; + const jit_compiler::BarrierFlags BarriersFlags{DefaultBarriersFlags}; /// /// Merge the content of Other into Attributes, adding, removing or updating diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index b70494c25722a..5dd259f62faa7 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "TargetFusionInfo.h" +#include "Kernel.h" #include "Kernel.h" #include "NDRangesHelper.h" @@ -14,9 +15,12 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/TargetParser/Triple.h" +using namespace jit_compiler; + template static ForwardIt mapArrayLookup(ForwardIt Begin, ForwardIt End, const KeyTy 
&Key) { @@ -62,7 +66,7 @@ class TargetFusionInfoImpl { virtual ArrayRef getUniformKernelAttributes() const { return {}; } virtual void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const = 0; + BarrierFlags BarrierFlags) const = 0; virtual unsigned getPrivateAddressSpace() const = 0; @@ -173,12 +177,10 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { } void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override { - if (BarrierFlags == -1) { + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { return; } - assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && - "Invalid barrier flags"); static const auto FnAttrs = AttributeSet::get( LLVMMod->getContext(), @@ -208,8 +210,9 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { SmallVector Args{ Builder.getInt32(/*Exec Scope : Workgroup = */ 2), Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | - ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; + Builder.getInt32(0x10 | + (hasLocalBarrierFlag(BarrierFlags) ? 0x100 : 0x0) | + ((hasGlobalBarrierFlag(BarrierFlags) ? 
0x200 : 0x0)))}; auto *BarrierCallInst = Builder.CreateCall(F, Args); BarrierCallInst->setAttributes( @@ -412,18 +415,15 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { } }; -// -// NVPTXTargetFusionInfo -// -#ifdef FUSION_JIT_SUPPORT_PTX -class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl { public: using TargetFusionInfoImpl::TargetFusionInfoImpl; - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + void notifyFunctionsDelete(StringRef MDName, + llvm::ArrayRef Funcs) const { SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; SmallVector ValidKernels; - auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + auto *OldAnnotations = LLVMMod->getNamedMetadata(MDName); for (auto *Op : OldAnnotations->operands()) { if (auto *TOp = dyn_cast(Op)) { if (auto *COp = dyn_cast_if_present( @@ -437,23 +437,21 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { } } LLVMMod->eraseNamedMetadata(OldAnnotations); - auto *NewAnnotations = - LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata(MDName); for (auto *Kernel : ValidKernels) { NewAnnotations->addOperand(Kernel); } } - void addKernelFunction(Function *KernelFunc) const override { - auto *NVVMAnnotations = - LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + void addKernelFunction(StringRef MDName, Function *KernelFunc) const { + auto *Annotations = LLVMMod->getOrInsertNamedMetadata(MDName); auto *MDOne = ConstantAsMetadata::get( ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); auto *MDFunc = ConstantAsMetadata::get(KernelFunc); SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); - NVVMAnnotations->addOperand(Tuple); + Annotations->addOperand(Tuple); } ArrayRef 
getKernelMetadataKeys() const override { @@ -469,10 +467,29 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { {"target-cpu", "target-features", "uniform-work-group-size"}}; return Keys; } +}; + +// +// NVPTXTargetFusionInfo +// +#ifdef FUSION_JIT_SUPPORT_PTX +class NVPTXTargetFusionInfo : public NVPTXAMDGCNTargetFusionInfoBase { +public: + using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase; + + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("nvvm.annotations", + Funcs); + } + + void addKernelFunction(Function *KernelFunc) const override { + NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("nvvm.annotations", + KernelFunc); + } void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override { - if (BarrierFlags == -1) { + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { return; } // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX @@ -735,6 +752,87 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { }; #endif // FUSION_JIT_SUPPORT_PTX +// +// AMDGCNTargetFusionInfo +// +#ifdef FUSION_JIT_SUPPORT_AMDGCN +class AMDGCNTargetFusionInfo : public NVPTXAMDGCNTargetFusionInfoBase { +public: + using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase; + + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("amdgcn.annotations", + Funcs); + } + + void addKernelFunction(Function *KernelFunc) const override { + KernelFunc->setCallingConv(CallingConv::AMDGPU_KERNEL); + NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("amdgcn.annotations", + KernelFunc); + } + + void createBarrierCall(IRBuilderBase &Builder, + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { + return; + } + // Following implemention in + // libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl + 
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; + llvm::SyncScope::ID SSID = + LLVMMod->getContext().getOrInsertSyncScopeID("workgroup"); + Builder.CreateFence(AO, SSID); + Builder.CreateIntrinsic(Intrinsic::AMDGCNIntrinsics::amdgcn_s_barrier, {}, + {}); + } + + std::optional<BuiltinKind> getBuiltinKind(Function *) const override { + llvm_unreachable("Not implemented yet"); + return {}; + } + + bool shouldRemap(BuiltinKind K, const NDRange &SrcNDRange, + const NDRange &FusedNDRange) const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + bool isSafeToNotRemapBuiltin(Function *F) const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + unsigned getIndexSpaceBuiltinBitwidth() const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + void setMetadataForGeneratedFunction(Function *F) const override { + llvm_unreachable("Not implemented yet"); + } + + Value *getGlobalIDWithoutOffset(IRBuilderBase &Builder, + const NDRange &FusedNDRange, + uint32_t Idx) const override { + llvm_unreachable("Not implemented yet"); + return nullptr; + } + + Function *createRemapperFunction(const Remapper &R, BuiltinKind K, + StringRef OrigName, StringRef Name, + Module *M, const NDRange &SrcNDRange, + const NDRange &FusedNDRange) const override { + llvm_unreachable("Not implemented yet"); + return nullptr; + } + + // Corresponds to the definitions in the LLVM AMDGCN backend user guide: + // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces + unsigned getPrivateAddressSpace() const override { return 5; } + unsigned getLocalAddressSpace() const override { return 3; } +}; +#endif // FUSION_JIT_SUPPORT_AMDGCN + } // anonymous namespace // @@ -749,6 +847,12 @@ TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) { return; } #endif // FUSION_JIT_SUPPORT_PTX +#ifdef FUSION_JIT_SUPPORT_AMDGCN + if (Tri.isAMDGCN()) { + Impl = std::make_shared<AMDGCNTargetFusionInfo>(Mod); + return; + } +#endif // 
FUSION_JIT_SUPPORT_AMDGCN if (Tri.isSPIRV() || Tri.isSPIR()) { Impl = std::make_shared(Mod); return; @@ -775,7 +879,7 @@ TargetFusionInfo::getKernelMetadataKeys() const { } void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { + BarrierFlags BarrierFlags) const { Impl->createBarrierCall(Builder, BarrierFlags); } diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index f93866df3e5a4..35158e5fbbdee 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -9,6 +9,7 @@ #ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H #define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H +#include "Kernel.h" #include "kernel-fusion/Builtins.h" #include "llvm/IR/Function.h" @@ -57,7 +58,8 @@ class TargetFusionInfo { /// kernel. llvm::ArrayRef getUniformKernelAttributes() const; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const; + void createBarrierCall(IRBuilderBase &Builder, + jit_compiler::BarrierFlags BarrierFlags) const; unsigned getPrivateAddressSpace() const; diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 45b1cccc1d0c5..6c71a80ef6d6f 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -777,24 +777,32 @@ Note: Kernel naming is not fully stable for now. The [experimental kernel fusion extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc) -also supports the CUDA backend. However, as neither CUBIN nor PTX are a suitable -input format for the [kernel fusion JIT compiler](KernelFusionJIT.md), a +also supports the CUDA and HIP backends. However, as the CUBIN, PTX and AMD assembly +are not suitable input formats for the [kernel fusion JIT compiler](KernelFusionJIT.md), a suitable IR has to be added as an additional device binary. 
-Therefore, in case kernel fusion should be performed for the CUDA backend, the +Therefore, in case kernel fusion should be performed for the CUDA or HIP backends, the user needs to specify the additional flag `-fsycl-embed-ir` during compilation, to add LLVM IR as an additional device binary. When the flag `-fsycl-embed-ir` -is specified, the LLVM IR produced by Clang for the CUDA backend device +is specified, the LLVM IR produced by Clang for the CUDA/HIP backend device compilation is added to the fat binary file. To this end, the resulting file-table from `sycl-post-link` is additionally passed to the -`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64`. +`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64` +for the CUDA backend and `llvm_amdgcn` for the HIP backend. This device binary in LLVM IR format can be retrieved by the SYCL runtime and -used by the kernel fusion JIT compiler. The resulting fused kernel is compiled -to PTX assembly by the kernel fusion JIT compiler at runtime. +used by the kernel fusion JIT compiler. For the CUDA backend, the resulting fused +kernel is compiled to PTX assembly by the kernel fusion JIT compiler at runtime. +For the HIP backend, the resulting fused kernel is compiled to an AMDGCN binary +by the kernel fusion JIT compiler at runtime, however this output requires +finalization by `lld`. Rather than adding another dependency to the fusion JIT, +a `Requires finalization` property is added to the binary. The HIP +PI plugin/UR adapter will then use the AMD Compiler Object Manager library +(`comgr`, part of the ROCm package) in order to finalize it into +a loadable format. Note that the device binary in LLVM IR does not replace the device binary in -CUBIN/PTX format, but is embed in addition to it. +target format, but is embedded in addition to it. 
### Integration with SPIR-V format diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index c49b0796700a7..78adbd319e1ea 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -270,7 +270,7 @@ Following [User Guide for NVPTX](https://llvm.org/docs/NVPTXUsage.html#llvm-nvvm ### Support for non SPIR-V targets -Fusion is currently supported for the NVPTX/CUDA backend. +Fusion is currently supported for the NVPTX/CUDA and HIP backend. As this backend cannot ingest a SPIR-V module, additional changes to the compilation flow are necessary. During static compilation the LLVM module for @@ -278,12 +278,9 @@ this backend is stored in addition to the finalized binary. This behavior is controlled by the `-fsycl-embed-ir` flag to avoid binary inflation in case kernel fusion is not used. If users want to use kernel fusion -at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir` +at runtime on the NVPTX/HIP backend, they need to pass the `-fsycl-embed-ir` flag during static compilation. During the fusion process at runtime, the JIT will load the LLVM IR and finalize the fused kernel to the final target. More information is available -[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). - -Support for the AMD GPU/HIP/AMDGCN backend is not yet implemented, but could -follow an approach similar to the NVPTX/CUDA backend. +[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). 
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index ef6b0b544cb78..3845be3600a24 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -951,6 +951,8 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; "@reqd_work_group_size" #define __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_PI_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" + /// This struct is a record of the device binary information. If the Kind field /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec /// field can still be specific and denote e.g. FPGA target. It must match the diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 22bf6e1c578f3..4e88944b5a298 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -11,6 +11,12 @@ set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") set(SYCL_BUILD_PI_HIP_INCLUDE_DIR "" CACHE STRING "Override HIP include dir path (set to \"\" for default behavior)") set(SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR "" CACHE STRING "Override HSA include dir path (set to \"\" for default behavior)") +if(SYCL_ENABLE_KERNEL_FUSION) + set(SYCL_ENABLE_COMGR ON) +else(SYCL_ENABLE_KERNEL_FUSION) + set(SYCL_ENABLE_COMGR OFF) +endif(SYCL_ENABLE_KERNEL_FUSION) + if("${SYCL_BUILD_PI_HIP_INCLUDE_DIR}" STREQUAL "") set(PI_HIP_INCLUDE_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/include") else() @@ -26,7 +32,7 @@ endif() # Set HIP lib dir set(SYCL_BUILD_PI_HIP_LIB_DIR "" CACHE STRING "Override HIP lib dir path (set to \"\" for default behavior)") if("${SYCL_BUILD_PI_HIP_LIB_DIR}" STREQUAL "") - set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/hip/lib") + set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/lib") else() set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_LIB_DIR}") endif() @@ -121,6 +127,18 @@ if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD") ) target_link_libraries(pi_hip 
PUBLIC rocmdrv) + if(SYCL_ENABLE_COMGR) + add_library(amd_comgr SHARED IMPORTED GLOBAL) + set_target_properties( + amd_comgr PROPERTIES + IMPORTED_LOCATION "${PI_HIP_LIB_DIR}/libamd_comgr.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + target_link_libraries(pi_hip PUBLIC amd_comgr) + target_compile_definitions(pi_hip PRIVATE SYCL_ENABLE_KERNEL_FUSION) + endif(SYCL_ENABLE_COMGR) + # Set HIP define to select AMD platform target_compile_definitions(pi_hip PRIVATE __HIP_PLATFORM_AMD__) elseif("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "NVIDIA") diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 20aae8927f0bb..88b328e8daab4 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -54,13 +54,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 3a3aae387eaa8aa072ff310b94df83c9d1a33a0b - # Merge: 614e6d0 b4425bc + # commit cf26de283a1233e6c93feb085acc10c566888b59 + # Merge: 3a3aae38 2fd9dea2 # Author: Kenneth Benzie (Benie) - # Date: Mon Oct 23 14:49:01 2023 +0100 - # Merge pull request #945 from npmiller/fix-priority - # [CUDA] Fix queue creation with native handle - set(UNIFIED_RUNTIME_TAG 3a3aae387eaa8aa072ff310b94df83c9d1a33a0b) + # Date: Wed Oct 25 10:36:48 2023 +0100 + # Merge pull request #940 from Naghasan/victor/kernel-fusion-amd + # [UR][HIP] Enable kernel finalization using comgr + set(UNIFIED_RUNTIME_TAG cf26de283a1233e6c93feb085acc10c566888b59) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d2846dfb71315..50d41e96042cc 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -47,6 +47,7 
@@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 0ff49fe7b64f2..68d105fef0f61 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -979,11 +979,12 @@ struct get_device_info_impl< bool, ext::codeplay::experimental::info::device::supports_fusion> { static bool get(const DeviceImplPtr &Dev) { #if SYCL_EXT_CODEPLAY_KERNEL_FUSION - // Currently fusion is only supported for SPIR-V based backends, i.e. OpenCL - // and LevelZero. + // Currently fusion is only supported for SPIR-V based backends, + // CUDA and HIP. return (Dev->getBackend() == backend::ext_oneapi_level_zero) || (Dev->getBackend() == backend::opencl) || - (Dev->getBackend() == backend::ext_oneapi_cuda); + (Dev->getBackend() == backend::ext_oneapi_cuda) || + (Dev->getBackend() == backend::ext_oneapi_hip); #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION (void)Dev; return false; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index b1e13d7d8237d..587ef45d11ed5 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -47,6 +47,8 @@ ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) { return ::jit_compiler::BinaryFormat::SPIRV; case backend::ext_oneapi_cuda: return ::jit_compiler::BinaryFormat::PTX; + case backend::ext_oneapi_hip: + return ::jit_compiler::BinaryFormat::AMDGCN; default: throw sycl::exception( sycl::make_error_code(sycl::errc::feature_not_supported), @@ -60,16 +62,20 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { 
bool isNvidia = Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda; - if (isNvidia) { + bool isHIP = + Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip; + if (isNvidia || isHIP) { auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); std::vector KernelIds{KernelID}; auto DeviceImages = ProgramManager::getInstance().getRawDeviceImages(KernelIds); auto DeviceImage = std::find_if( - DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) { + DeviceImages.begin(), DeviceImages.end(), + [isNvidia](RTDeviceBinaryImage *DI) { + const std::string &TargetSpec = isNvidia ? std::string("llvm_nvptx64") + : std::string("llvm_amdgcn"); return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && - DI->getRawData().DeviceTargetSpec == - std::string("llvm_nvptx64"); + DI->getRawData().DeviceTargetSpec == TargetSpec; }); if (DeviceImage == DeviceImages.end()) { return {nullptr, nullptr}; @@ -796,11 +802,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, } // Retrieve barrier flags. - int BarrierFlags = + ::jit_compiler::BarrierFlags BarrierFlags = (PropList .has_property()) - ? -1 - : 3; + ? ::jit_compiler::getNoBarrierFlag() + : ::jit_compiler::getLocalAndGlobalBarrierFlag(); static size_t FusedKernelNameIndex = 0; std::stringstream FusedKernelName; @@ -894,6 +900,11 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( BinFormat = PI_DEVICE_BINARY_TYPE_NONE; break; } + case ::jit_compiler::BinaryFormat::AMDGCN: { + TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN; + BinFormat = PI_DEVICE_BINARY_TYPE_NONE; + break; + } case ::jit_compiler::BinaryFormat::SPIRV: { TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) ? 
__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 @@ -929,7 +940,8 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ArgMaskPropSet)); - if (Format == ::jit_compiler::BinaryFormat::PTX) { + if (Format == ::jit_compiler::BinaryFormat::PTX || + Format == ::jit_compiler::BinaryFormat::AMDGCN) { // Add a program metadata property with the reqd_work_group_size attribute. // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. auto ReqdWGS = std::find_if( @@ -951,6 +963,14 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ProgramMetadata)); } } + if (Format == ::jit_compiler::BinaryFormat::AMDGCN) { + PropertyContainer NeedFinalization{ + __SYCL_PI_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}; + PropertySetContainer ProgramMetadata{ + __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(std::move(NeedFinalization)); + Binary.addProperty(std::move(ProgramMetadata)); + } DeviceBinariesCollection Collection; Collection.addDeviceBinary( diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 13ea3faa87ea0..cfb3844867b10 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -35,6 +35,12 @@ PropertyContainer::PropertyContainer(const std::string &Name, void *Data, std::memcpy(Value.get(), Data, Size); } +PropertyContainer::PropertyContainer(const std::string &Name, uint32_t Data) + : PropName{new char[Name.length() + 1]}, Value{}, ValueSize{Data}, + PropType{PI_PROPERTY_TYPE_UINT32} { + std::memcpy(PropName.get(), Name.c_str(), Name.length() + 1); +} + _pi_device_binary_property_struct PropertyContainer::getPIProperty() { return _pi_device_binary_property_struct{PropName.get(), Value.get(), PropType, ValueSize}; diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index dd52f10aeea67..2ef916d4e4d11 100644 --- 
a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -50,6 +50,8 @@ class PropertyContainer { public: PropertyContainer(const std::string &Name, void *Data, size_t Size, uint32_t Type); + // Set a PI_PROPERTY_TYPE_UINT32 property + PropertyContainer(const std::string &Name, uint32_t Data); PropertyContainer(PropertyContainer &&) = default; PropertyContainer &operator=(PropertyContainer &&) = default; diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp index 50933f13cda25..df4c4a0bd3ab1 100644 --- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp +++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp @@ -1,11 +1,15 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -o %t.out -// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "Computation error" --implicit-check-not "Internalized" +// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "Computation error" --implicit-check-not "Internalized" --check-prefix=CHECK %if ext_oneapi_hip %{ --check-prefix=CHECK-HIP %} %else %{ --check-prefix=CHECK-NON-HIP %} // Test pointers being stored are not internalized. // CHECK: Unable to perform all promotions for function {{.*}}. Detailed information: -// CHECK-NEXT: Failed to promote argument 0 of function {{.*}}: It is not safe to promote values being stored to another pointer +// CHECK-NON-HIP-NEXT: Failed to promote argument 0 of function {{.*}}: It is not safe to promote values being stored to another pointer +// COM: The libspirv for HIP adds an instruction prior to the store causing the +// COM: internalization failure. The failure is still related to what we expect, +// COM: it just fails for a slightly different reason.
+// CHECK-HIP-NEXT: Failed to promote argument 0 of function {{.*}}: Do not know how to handle value to promote #include diff --git a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp index 89d75eea4e589..75251e3ca8a31 100644 --- a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp +++ b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp @@ -1,6 +1,7 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "COMPUTATION ERROR" +// UNSUPPORTED: hip // Test caching for JIT fused kernels. Also test for debug messages being // printed when SYCL_RT_WARNING_LEVEL=1. diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp index daa3909b705e8..9b93e832cbbaf 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -1,7 +1,7 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out -// UNSUPPORTED: cuda +// UNSUPPORTED: cuda, hip // Test local internalization of a nested array type. diff --git a/sycl/test-e2e/KernelFusion/lit.local.cfg b/sycl/test-e2e/KernelFusion/lit.local.cfg deleted file mode 100644 index 1e863d5103c30..0000000000000 --- a/sycl/test-e2e/KernelFusion/lit.local.cfg +++ /dev/null @@ -1 +0,0 @@ -config.unsupported_features += ['hip']