diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 87ac4541abe39..e781caf494a6a 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -5557,8 +5557,9 @@ class OffloadingActionBuilder final { DA.add(*DeviceWrappingAction, *TC, BoundArch, Action::OFK_SYCL); continue; } - if (IsNVPTX && Args.hasArg(options::OPT_fsycl_embed_ir)) { - // When compiling for Nvidia/CUDA devices and the user requested the + if ((IsNVPTX || IsAMDGCN) && + Args.hasArg(options::OPT_fsycl_embed_ir)) { + // When compiling for Nvidia/AMD devices and the user requested the // IR to be embedded in the application (via option), run the output // of sycl-post-link (filetable referencing LLVM Bitcode + symbols) // through the offload wrapper and link the resulting object to the diff --git a/clang/test/Driver/sycl-embed-ir.cpp b/clang/test/Driver/sycl-embed-ir.cpp new file mode 100644 index 0000000000000..ed65be3b24314 --- /dev/null +++ b/clang/test/Driver/sycl-embed-ir.cpp @@ -0,0 +1,19 @@ +/// Tests for -fsycl-embed-ir + +// UNSUPPORTED: system-windows + +// RUN: %clangxx -fsycl -fsycl-targets=nvidia_gpu_sm_80 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-NV %s + +// CHECK-NV: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, sm_80) +// CHECK-NV: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, sm_80) +// CHECK-NV: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, sm_80) +// CHECK-NV: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {[[WRAP]]}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {{{.*}}}, image + +// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx1010 -fsycl-embed-ir -ccc-print-phases %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-AMD %s + +// CHECK-AMD: [[IR:[0-9]+]]: compiler, {4}, ir, (device-sycl, gfx1010) +// CHECK-AMD: [[POSTLINK:[0-9]+]]: sycl-post-link, {{{.*}}}, ir, (device-sycl, gfx1010) +// 
CHECK-AMD: [[WRAP:[0-9]+]]: clang-offload-wrapper, {[[POSTLINK]]}, object, (device-sycl, gfx1010) +// CHECK-AMD: offload, "host-sycl (x86_64-unknown-linux-gnu)" {{{.*}}}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {[[WRAP]]}, "device-sycl (amdgcn-amd-amdhsa:gfx1010)" {{{.*}}}, image diff --git a/sycl-fusion/common/include/Kernel.h b/sycl-fusion/common/include/Kernel.h index ef10cd47b0394..1820cb040730f 100644 --- a/sycl-fusion/common/include/Kernel.h +++ b/sycl-fusion/common/include/Kernel.h @@ -20,6 +20,30 @@ namespace jit_compiler { using BinaryAddress = const uint8_t *; +/// Possible barrier flags +enum class BarrierFlags : uint32_t { + None = 0, // Do not insert barrier + Local = 1, // Ensure correct ordering of memory operations to local memory + Global = 2, // Ensure correct ordering of memory operations to global memory + LocalAndGlobal = Local | Global +}; + +constexpr BarrierFlags getNoBarrierFlag() { return BarrierFlags::None; } +constexpr BarrierFlags getLocalAndGlobalBarrierFlag() { + return BarrierFlags::LocalAndGlobal; +} +constexpr bool isNoBarrierFlag(BarrierFlags Flag) { + return Flag == BarrierFlags::None; +} +constexpr bool hasLocalBarrierFlag(BarrierFlags Flag) { + return static_cast<uint32_t>(Flag) & + static_cast<uint32_t>(BarrierFlags::Local); +} +constexpr bool hasGlobalBarrierFlag(BarrierFlags Flag) { + return static_cast<uint32_t>(Flag) & + static_cast<uint32_t>(BarrierFlags::Global); +} + /// /// Enumerate possible kinds of parameters. /// 1:1 correspondence with the definition in kernel_desc.hpp in the DPC++ SYCL @@ -35,7 +59,7 @@ enum class ParameterKind : uint32_t { }; /// Different binary formats supported as input to the JIT compiler. -enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX }; +enum class BinaryFormat : uint32_t { INVALID, LLVM, SPIRV, PTX, AMDGCN }; /// Information about a device intermediate representation module (e.g., SPIR-V, /// LLVM IR) from DPC++. 
diff --git a/sycl-fusion/common/lib/KernelIO.h b/sycl-fusion/common/lib/KernelIO.h index 09058d61e9981..8de794256c71c 100644 --- a/sycl-fusion/common/lib/KernelIO.h +++ b/sycl-fusion/common/lib/KernelIO.h @@ -48,6 +48,7 @@ template <> struct ScalarEnumerationTraits { IO.enumCase(BF, "LLVM", jit_compiler::BinaryFormat::LLVM); IO.enumCase(BF, "SPIRV", jit_compiler::BinaryFormat::SPIRV); IO.enumCase(BF, "PTX", jit_compiler::BinaryFormat::PTX); + IO.enumCase(BF, "AMDGCN", jit_compiler::BinaryFormat::AMDGCN); IO.enumCase(BF, "INVALID", jit_compiler::BinaryFormat::INVALID); } }; diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index bf323239679b4..35850132fcd95 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -9,6 +9,9 @@ add_llvm_library(sycl-fusion lib/fusion/ModuleHelper.cpp lib/helper/ConfigHelper.cpp + DEPENDS + intrinsics_gen + LINK_COMPONENTS BitReader Core @@ -50,6 +53,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_PTX) endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(sycl-fusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() + if (BUILD_SHARED_LIBS) if(NOT MSVC AND NOT APPLE) # Manage symbol visibility through the linker to make sure no LLVM symbols diff --git a/sycl-fusion/jit-compiler/include/JITContext.h b/sycl-fusion/jit-compiler/include/JITContext.h index 69465a74d8371..381c7b2a54343 100644 --- a/sycl-fusion/jit-compiler/include/JITContext.h +++ b/sycl-fusion/jit-compiler/include/JITContext.h @@ -19,6 +19,7 @@ #include "Hashing.h" #include "Kernel.h" +#include "Options.h" #include "Parameter.h" namespace llvm { @@ -28,7 +29,7 @@ class LLVMContext; namespace jit_compiler { using CacheKeyT = - std::tuple, ParamIdentList, int, + std::tuple, ParamIdentList, BarrierFlags, std::vector, std::vector, // This field of the cache is optional because, if all of the // ranges 
are equal, we will perform no remapping, so that fused diff --git a/sycl-fusion/jit-compiler/include/KernelFusion.h b/sycl-fusion/jit-compiler/include/KernelFusion.h index d4607df2e5660..7f0d287ae7aa1 100644 --- a/sycl-fusion/jit-compiler/include/KernelFusion.h +++ b/sycl-fusion/jit-compiler/include/KernelFusion.h @@ -60,7 +60,8 @@ class KernelFusion { const std::vector &KernelInformation, const std::vector &KernelsToFuse, const std::string &FusedKernelName, - jit_compiler::ParamIdentList &Identities, int BarriersFlags, + jit_compiler::ParamIdentList &Identities, + BarrierFlags BarriersFlags, const std::vector &Internalization, const std::vector &JITConstants); diff --git a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp index 8aa62a664641c..fdd58def9f2f6 100644 --- a/sycl-fusion/jit-compiler/lib/KernelFusion.cpp +++ b/sycl-fusion/jit-compiler/lib/KernelFusion.cpp @@ -58,6 +58,13 @@ static bool isTargetFormatSupported(BinaryFormat TargetFormat) { #else // FUSION_JIT_SUPPORT_PTX return false; #endif // FUSION_JIT_SUPPORT_PTX + } + case BinaryFormat::AMDGCN: { +#ifdef FUSION_JIT_SUPPORT_AMDGCN + return true; +#else // FUSION_JIT_SUPPORT_AMDGCN + return false; +#endif // FUSION_JIT_SUPPORT_AMDGCN } default: return false; @@ -69,7 +76,7 @@ FusionResult KernelFusion::fuseKernels( const std::vector &KernelInformation, const std::vector &KernelsToFuse, const std::string &FusedKernelName, ParamIdentList &Identities, - int BarriersFlags, + BarrierFlags BarriersFlags, const std::vector &Internalization, const std::vector &Constants) { // Initialize the configuration helper to make the options for this invocation diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index a6c5569d87d22..bf769b50e2f30 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -40,7 +40,7 @@ static unsigned 
getFlatAddressSpace(Module &Mod) { // Ideally, we could get this information from the TargetTransformInfo, but // the SPIR-V backend does not yet seem to have an implementation for that. llvm::Triple Tri(Mod.getTargetTriple()); - if (Tri.isNVPTX()) { + if (Tri.isNVPTX() || Tri.isAMDGCN()) { return 0; } if (Tri.isSPIRV() || Tri.isSPIR()) { @@ -53,7 +53,7 @@ static unsigned getFlatAddressSpace(Module &Mod) { std::unique_ptr FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, - int BarriersFlags) { + BarrierFlags BarriersFlags) { // Perform the actual kernel fusion, i.e., generate a kernel function for the // fused kernel from the kernel functions of the input kernels. This is done // by the SYCLKernelFusion LLVM pass, which is run here through a custom LLVM diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h index 0941472dafe9d..642206025224f 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.h @@ -25,7 +25,7 @@ class FusionPipeline { /// contain an entry for the fused kernel. 
static std::unique_ptr runFusionPasses(llvm::Module &Mod, SYCLModuleInfo &InputInfo, - int BarriersFlags); + BarrierFlags BarriersFlags); }; } // namespace fusion } // namespace jit_compiler diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp index 69067b3ad5d84..354e68bab5ea2 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.cpp @@ -191,6 +191,14 @@ llvm::Error KernelTranslator::translateKernel(SYCLKernelInfo &Kernel, KernelBin = *BinaryOrError; break; } + case BinaryFormat::AMDGCN: { + llvm::Expected BinaryOrError = + translateToAMDGCN(Kernel, Mod, JITCtx); + if (auto Error = BinaryOrError.takeError()) + return Error; + KernelBin = *BinaryOrError; + break; + } default: { return createStringError( inconvertibleErrorCode(), @@ -287,3 +295,75 @@ KernelTranslator::translateToPTX(SYCLKernelInfo &KernelInfo, llvm::Module &Mod, return &JITCtx.emplaceKernelBinary(std::move(PTXASM), BinaryFormat::PTX); #endif // FUSION_JIT_SUPPORT_PTX } + +llvm::Expected +KernelTranslator::translateToAMDGCN(SYCLKernelInfo &KernelInfo, + llvm::Module &Mod, JITContext &JITCtx) { +#ifndef FUSION_JIT_SUPPORT_AMDGCN + (void)KernelInfo; + (void)Mod; + (void)JITCtx; + return createStringError(inconvertibleErrorCode(), + "AMDGPU translation not supported in this build"); +#else // FUSION_JIT_SUPPORT_AMDGCN + + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUAsmPrinter(); + LLVMInitializeAMDGPUTargetMC(); + + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; + + std::string TargetTriple{"amdgcn-amd-amdhsa"}; + + std::string ErrorMessage; + const auto *Target = + llvm::TargetRegistry::lookupTarget(TargetTriple, ErrorMessage); + + if (!Target) + return createStringError( + inconvertibleErrorCode(), + "Failed to 
load and translate AMDGCN LLVM IR module with error %s", + ErrorMessage.c_str()); + + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + llvm::StringRef TargetCPU{"gfx906"}; + llvm::StringRef TargetFeatures{""}; + if (auto *KernelFunc = Mod.getFunction(KernelInfo.Name)) { + if (KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + TargetCPU = + KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + if (KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + TargetFeatures = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + + // FIXME: Check whether we can provide more accurate target information here + auto *TargetMachine = Target->createTargetMachine( + TargetTriple, TargetCPU, TargetFeatures, {}, llvm::Reloc::PIC_, + std::nullopt, llvm::CodeGenOptLevel::Default); + + std::string AMDObj; + { + llvm::legacy::PassManager PM; + llvm::raw_string_ostream OBJStream{AMDObj}; + llvm::buffer_ostream BufferedOBJ{OBJStream}; + + if (TargetMachine->addPassesToEmitFile(PM, BufferedOBJ, nullptr, + llvm::CodeGenFileType::ObjectFile)) { + return createStringError( + inconvertibleErrorCode(), + "Failed to construct pass pipeline to emit output"); + } + + PM.run(Mod); + OBJStream.flush(); + } + + return &JITCtx.emplaceKernelBinary(std::move(AMDObj), BinaryFormat::AMDGCN); +#endif // FUSION_JIT_SUPPORT_AMDGCN +} diff --git a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h index 7e4816df9bf94..a7936438202d0 100644 --- a/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h +++ b/sycl-fusion/jit-compiler/lib/translation/KernelTranslation.h @@ -43,6 +43,10 @@ class KernelTranslator { static llvm::Expected translateToPTX(SYCLKernelInfo &Kernel, llvm::Module &Mod, JITContext &JITCtx); + + static llvm::Expected + translateToAMDGCN(SYCLKernelInfo &KernelInfo, 
llvm::Module &Mod, + JITContext &JITCtx); }; } // namespace translation } // namespace jit_compiler diff --git a/sycl-fusion/passes/CMakeLists.txt b/sycl-fusion/passes/CMakeLists.txt index 4693083be4faa..1b1f393eb9df9 100644 --- a/sycl-fusion/passes/CMakeLists.txt +++ b/sycl-fusion/passes/CMakeLists.txt @@ -30,6 +30,10 @@ if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_PTX) endif() +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusion PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() + # Static library for linking with the jit_compiler add_llvm_library(SYCLKernelFusionPasses SYCLFusionPasses.cpp @@ -68,3 +72,7 @@ target_link_libraries(SYCLKernelFusionPasses if("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_PTX) endif() + +if("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + target_compile_definitions(SYCLKernelFusionPasses PRIVATE FUSION_JIT_SUPPORT_AMDGCN) +endif() diff --git a/sycl-fusion/passes/SYCLFusionPasses.cpp b/sycl-fusion/passes/SYCLFusionPasses.cpp index b112cfec1b18e..16a938ec991ce 100644 --- a/sycl-fusion/passes/SYCLFusionPasses.cpp +++ b/sycl-fusion/passes/SYCLFusionPasses.cpp @@ -9,12 +9,15 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" +#include "Kernel.h" + #include "internalization/Internalization.h" #include "kernel-fusion/SYCLKernelFusion.h" #include "kernel-info/SYCLKernelInfo.h" #include "syclcp/SYCLCP.h" using namespace llvm; +using namespace jit_compiler; cl::opt NoBarriers("sycl-kernel-fusion-no-barriers", @@ -28,8 +31,9 @@ llvm::PassPluginLibraryInfo getSYCLKernelFusionPluginInfo() { [](StringRef Name, ModulePassManager &MPM, ArrayRef) { if (Name == "sycl-kernel-fusion") { - int BarrierFlag = - (NoBarriers) ? -1 : SYCLKernelFusion::DefaultBarriersFlags; + BarrierFlags BarrierFlag = + (NoBarriers) ? 
getNoBarrierFlag() + : SYCLKernelFusion::DefaultBarriersFlags; MPM.addPass(SYCLKernelFusion(BarrierFlag)); return true; } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp index ecc97b244c6ff..4a0f2509cff86 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.cpp @@ -234,13 +234,12 @@ static FusionInsertPoints addGuard(IRBuilderBase &Builder, return {Entry, CallInsertion, Exit}; } -static Expected -createFusionCall(IRBuilderBase &Builder, Function *F, - ArrayRef CallArgs, - const jit_compiler::NDRange &SrcNDRange, - const jit_compiler::NDRange &FusedNDRange, bool IsLast, - int BarriersFlags, jit_compiler::Remapper &Remapper, - bool ShouldRemap, TargetFusionInfo &TargetInfo) { +static Expected createFusionCall( + IRBuilderBase &Builder, Function *F, ArrayRef CallArgs, + const jit_compiler::NDRange &SrcNDRange, + const jit_compiler::NDRange &FusedNDRange, bool IsLast, + jit_compiler::BarrierFlags BarriersFlags, jit_compiler::Remapper &Remapper, + bool ShouldRemap, TargetFusionInfo &TargetInfo) { const auto IPs = addGuard(Builder, TargetInfo, SrcNDRange, FusedNDRange, IsLast); @@ -266,7 +265,7 @@ createFusionCall(IRBuilderBase &Builder, Function *F, Builder.SetInsertPoint(IPs.Exit); // Insert barrier if needed - if (!IsLast && BarriersFlags > 0) { + if (!IsLast && !jit_compiler::isNoBarrierFlag(BarriersFlags)) { TargetInfo.createBarrierCall(Builder, BarriersFlags); } diff --git a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h index 5f52f0a317d14..3da212ccb668d 100644 --- a/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h +++ b/sycl-fusion/passes/kernel-fusion/SYCLKernelFusion.h @@ -35,7 +35,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { constexpr static llvm::StringLiteral NDRangesMDKey{"sycl.kernel.nd-ranges"}; constexpr SYCLKernelFusion() = default; - 
constexpr explicit SYCLKernelFusion(int BarriersFlags) + constexpr explicit SYCLKernelFusion(jit_compiler::BarrierFlags BarriersFlags) : BarriersFlags{BarriersFlags} {} llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); @@ -45,7 +45,8 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { /// /// By default, correct ordering of memory operations to global memory is /// ensured. - constexpr static int DefaultBarriersFlags{3}; + constexpr static jit_compiler::BarrierFlags DefaultBarriersFlags{ + jit_compiler::getLocalAndGlobalBarrierFlag()}; private: // This needs to be in sync with the metadata kind @@ -155,11 +156,7 @@ class SYCLKernelFusion : public llvm::PassInfoMixin { /// /// Flags to apply to the barrier to be introduced between fused kernels. /// - /// Possible values: - /// - -1: Do not insert barrier - /// - 1: ensure correct ordering of memory operations to local memory - /// - 2: ensure correct ordering of memory operations to global memory - const int BarriersFlags{DefaultBarriersFlags}; + const jit_compiler::BarrierFlags BarriersFlags{DefaultBarriersFlags}; /// /// Merge the content of Other into Attributes, adding, removing or updating diff --git a/sycl-fusion/passes/target/TargetFusionInfo.cpp b/sycl-fusion/passes/target/TargetFusionInfo.cpp index b70494c25722a..5dd259f62faa7 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.cpp +++ b/sycl-fusion/passes/target/TargetFusionInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "TargetFusionInfo.h" +#include "Kernel.h" #include "Kernel.h" #include "NDRangesHelper.h" @@ -14,9 +15,12 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/TargetParser/Triple.h" +using namespace jit_compiler; + template static ForwardIt mapArrayLookup(ForwardIt Begin, ForwardIt End, const KeyTy 
&Key) { @@ -62,7 +66,7 @@ class TargetFusionInfoImpl { virtual ArrayRef getUniformKernelAttributes() const { return {}; } virtual void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const = 0; + BarrierFlags BarrierFlags) const = 0; virtual unsigned getPrivateAddressSpace() const = 0; @@ -173,12 +177,10 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { } void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override { - if (BarrierFlags == -1) { + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { return; } - assert((BarrierFlags == 1 || BarrierFlags == 2 || BarrierFlags == 3) && - "Invalid barrier flags"); static const auto FnAttrs = AttributeSet::get( LLVMMod->getContext(), @@ -208,8 +210,9 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { SmallVector Args{ Builder.getInt32(/*Exec Scope : Workgroup = */ 2), Builder.getInt32(/*Exec Scope : Workgroup = */ 2), - Builder.getInt32(0x10 | (BarrierFlags % 2 == 1 ? 0x100 : 0x0) | - ((BarrierFlags >> 1 == 1 ? 0x200 : 0x0)))}; + Builder.getInt32(0x10 | + (hasLocalBarrierFlag(BarrierFlags) ? 0x100 : 0x0) | + ((hasGlobalBarrierFlag(BarrierFlags) ? 
0x200 : 0x0)))}; auto *BarrierCallInst = Builder.CreateCall(F, Args); BarrierCallInst->setAttributes( @@ -412,18 +415,15 @@ class SPIRVTargetFusionInfo : public TargetFusionInfoImpl { } }; -// -// NVPTXTargetFusionInfo -// -#ifdef FUSION_JIT_SUPPORT_PTX -class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { +class NVPTXAMDGCNTargetFusionInfoBase : public TargetFusionInfoImpl { public: using TargetFusionInfoImpl::TargetFusionInfoImpl; - void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + void notifyFunctionsDelete(StringRef MDName, + llvm::ArrayRef Funcs) const { SmallPtrSet DeletedFuncs{Funcs.begin(), Funcs.end()}; SmallVector ValidKernels; - auto *OldAnnotations = LLVMMod->getNamedMetadata("nvvm.annotations"); + auto *OldAnnotations = LLVMMod->getNamedMetadata(MDName); for (auto *Op : OldAnnotations->operands()) { if (auto *TOp = dyn_cast(Op)) { if (auto *COp = dyn_cast_if_present( @@ -437,23 +437,21 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { } } LLVMMod->eraseNamedMetadata(OldAnnotations); - auto *NewAnnotations = - LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + auto *NewAnnotations = LLVMMod->getOrInsertNamedMetadata(MDName); for (auto *Kernel : ValidKernels) { NewAnnotations->addOperand(Kernel); } } - void addKernelFunction(Function *KernelFunc) const override { - auto *NVVMAnnotations = - LLVMMod->getOrInsertNamedMetadata("nvvm.annotations"); + void addKernelFunction(StringRef MDName, Function *KernelFunc) const { + auto *Annotations = LLVMMod->getOrInsertNamedMetadata(MDName); auto *MDOne = ConstantAsMetadata::get( ConstantInt::get(Type::getInt32Ty(LLVMMod->getContext()), 1)); auto *MDKernelString = MDString::get(LLVMMod->getContext(), "kernel"); auto *MDFunc = ConstantAsMetadata::get(KernelFunc); SmallVector KernelMD({MDFunc, MDKernelString, MDOne}); auto *Tuple = MDTuple::get(LLVMMod->getContext(), KernelMD); - NVVMAnnotations->addOperand(Tuple); + Annotations->addOperand(Tuple); } ArrayRef 
getKernelMetadataKeys() const override { @@ -469,10 +467,29 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { {"target-cpu", "target-features", "uniform-work-group-size"}}; return Keys; } +}; + +// +// NVPTXTargetFusionInfo +// +#ifdef FUSION_JIT_SUPPORT_PTX +class NVPTXTargetFusionInfo : public NVPTXAMDGCNTargetFusionInfoBase { +public: + using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase; + + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("nvvm.annotations", + Funcs); + } + + void addKernelFunction(Function *KernelFunc) const override { + NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("nvvm.annotations", + KernelFunc); + } void createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const override { - if (BarrierFlags == -1) { + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { return; } // Emit a call to llvm.nvvm.barrier0. From the user manual of the NVPTX @@ -735,6 +752,87 @@ class NVPTXTargetFusionInfo : public TargetFusionInfoImpl { }; #endif // FUSION_JIT_SUPPORT_PTX +// +// AMDGCNTargetFusionInfo +// +#ifdef FUSION_JIT_SUPPORT_AMDGCN +class AMDGCNTargetFusionInfo : public NVPTXAMDGCNTargetFusionInfoBase { +public: + using NVPTXAMDGCNTargetFusionInfoBase::NVPTXAMDGCNTargetFusionInfoBase; + + void notifyFunctionsDelete(llvm::ArrayRef Funcs) const override { + NVPTXAMDGCNTargetFusionInfoBase::notifyFunctionsDelete("amdgcn.annotations", + Funcs); + } + + void addKernelFunction(Function *KernelFunc) const override { + KernelFunc->setCallingConv(CallingConv::AMDGPU_KERNEL); + NVPTXAMDGCNTargetFusionInfoBase::addKernelFunction("amdgcn.annotations", + KernelFunc); + } + + void createBarrierCall(IRBuilderBase &Builder, + BarrierFlags BarrierFlags) const override { + if (isNoBarrierFlag(BarrierFlags)) { + return; + } + // Following implemention in + // libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl + 
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent; + llvm::SyncScope::ID SSID = + LLVMMod->getContext().getOrInsertSyncScopeID("workgroup"); + Builder.CreateFence(AO, SSID); + Builder.CreateIntrinsic(Intrinsic::AMDGCNIntrinsics::amdgcn_s_barrier, {}, + {}); + } + + std::optional<BuiltinKind> getBuiltinKind(Function *) const override { + llvm_unreachable("Not implemented yet"); + return {}; + } + + bool shouldRemap(BuiltinKind K, const NDRange &SrcNDRange, + const NDRange &FusedNDRange) const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + bool isSafeToNotRemapBuiltin(Function *F) const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + unsigned getIndexSpaceBuiltinBitwidth() const override { + llvm_unreachable("Not implemented yet"); + return false; + } + + void setMetadataForGeneratedFunction(Function *F) const override { + llvm_unreachable("Not implemented yet"); + } + + Value *getGlobalIDWithoutOffset(IRBuilderBase &Builder, + const NDRange &FusedNDRange, + uint32_t Idx) const override { + llvm_unreachable("Not implemented yet"); + return nullptr; + } + + Function *createRemapperFunction(const Remapper &R, BuiltinKind K, + StringRef OrigName, StringRef Name, + Module *M, const NDRange &SrcNDRange, + const NDRange &FusedNDRange) const override { + llvm_unreachable("Not implemented yet"); + return nullptr; + } + + // Corresponds to the definitions in the LLVM AMDGCN backend user guide: + // https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces + unsigned getPrivateAddressSpace() const override { return 5; } + unsigned getLocalAddressSpace() const override { return 3; } +}; +#endif // FUSION_JIT_SUPPORT_AMDGCN + } // anonymous namespace // @@ -749,6 +847,12 @@ TargetFusionInfo::TargetFusionInfo(llvm::Module *Mod) { return; } #endif // FUSION_JIT_SUPPORT_PTX +#ifdef FUSION_JIT_SUPPORT_AMDGCN + if (Tri.isAMDGCN()) { + Impl = std::make_shared<AMDGCNTargetFusionInfo>(Mod); + return; + } +#endif // 
FUSION_JIT_SUPPORT_AMDGCN if (Tri.isSPIRV() || Tri.isSPIR()) { Impl = std::make_shared(Mod); return; @@ -775,7 +879,7 @@ TargetFusionInfo::getKernelMetadataKeys() const { } void TargetFusionInfo::createBarrierCall(IRBuilderBase &Builder, - int BarrierFlags) const { + BarrierFlags BarrierFlags) const { Impl->createBarrierCall(Builder, BarrierFlags); } diff --git a/sycl-fusion/passes/target/TargetFusionInfo.h b/sycl-fusion/passes/target/TargetFusionInfo.h index f93866df3e5a4..35158e5fbbdee 100644 --- a/sycl-fusion/passes/target/TargetFusionInfo.h +++ b/sycl-fusion/passes/target/TargetFusionInfo.h @@ -9,6 +9,7 @@ #ifndef SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H #define SYCL_FUSION_PASSES_TARGET_TARGETFUSIONINFO_H +#include "Kernel.h" #include "kernel-fusion/Builtins.h" #include "llvm/IR/Function.h" @@ -57,7 +58,8 @@ class TargetFusionInfo { /// kernel. llvm::ArrayRef getUniformKernelAttributes() const; - void createBarrierCall(IRBuilderBase &Builder, int BarrierFlags) const; + void createBarrierCall(IRBuilderBase &Builder, + jit_compiler::BarrierFlags BarrierFlags) const; unsigned getPrivateAddressSpace() const; diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index 45b1cccc1d0c5..6c71a80ef6d6f 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -777,24 +777,32 @@ Note: Kernel naming is not fully stable for now. The [experimental kernel fusion extension](../extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc) -also supports the CUDA backend. However, as neither CUBIN nor PTX are a suitable -input format for the [kernel fusion JIT compiler](KernelFusionJIT.md), a +also supports the CUDA and HIP backends. However, as the CUBIN, PTX and AMD assembly +are not suitable input formats for the [kernel fusion JIT compiler](KernelFusionJIT.md), a suitable IR has to be added as an additional device binary. 
-Therefore, in case kernel fusion should be performed for the CUDA backend, the +Therefore, in case kernel fusion should be performed for the CUDA or HIP backends, the user needs to specify the additional flag `-fsycl-embed-ir` during compilation, to add LLVM IR as an additional device binary. When the flag `-fsycl-embed-ir` -is specified, the LLVM IR produced by Clang for the CUDA backend device +is specified, the LLVM IR produced by Clang for the CUDA/HIP backend device compilation is added to the fat binary file. To this end, the resulting file-table from `sycl-post-link` is additionally passed to the -`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64`. +`clang-offload-wrapper`, creating a wrapper object with target `llvm_nvptx64` +for the CUDA backend and `llvm_amdgcn` for the HIP backend. This device binary in LLVM IR format can be retrieved by the SYCL runtime and -used by the kernel fusion JIT compiler. The resulting fused kernel is compiled -to PTX assembly by the kernel fusion JIT compiler at runtime. +used by the kernel fusion JIT compiler. For the CUDA backend, the resulting fused +kernel is compiled to PTX assembly by the kernel fusion JIT compiler at runtime. +For the HIP backend, the resulting fused kernel is compiled to an AMDGCN binary +by the kernel fusion JIT compiler at runtime, however this output requires +finalization by `lld`. Rather than adding another dependency to the fusion JIT, +a `Requires finalization` property is added to the binary. The HIP +PI plugin/UR adapter will then use the AMD Compiler Object Manager library +(`comgr`, part of the ROCm package) in order to finalize it into +a loadable format. Note that the device binary in LLVM IR does not replace the device binary in -CUBIN/PTX format, but is embed in addition to it. +target format, but is embedded in addition to it. 
### Integration with SPIR-V format diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index c49b0796700a7..78adbd319e1ea 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -270,7 +270,7 @@ Following [User Guide for NVPTX](https://llvm.org/docs/NVPTXUsage.html#llvm-nvvm ### Support for non SPIR-V targets -Fusion is currently supported for the NVPTX/CUDA backend. +Fusion is currently supported for the NVPTX/CUDA and HIP backend. As this backend cannot ingest a SPIR-V module, additional changes to the compilation flow are necessary. During static compilation the LLVM module for @@ -278,12 +278,9 @@ this backend is stored in addition to the finalized binary. This behavior is controlled by the `-fsycl-embed-ir` flag to avoid binary inflation in case kernel fusion is not used. If users want to use kernel fusion -at runtime on the NVPTX/CUDA backend, they need to pass the `-fsycl-embed-ir` +at runtime on the NVPTX/HIP backend, they need to pass the `-fsycl-embed-ir` flag during static compilation. During the fusion process at runtime, the JIT will load the LLVM IR and finalize the fused kernel to the final target. More information is available -[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). - -Support for the AMD GPU/HIP/AMDGCN backend is not yet implemented, but could -follow an approach similar to the NVPTX/CUDA backend. +[here](./CompilerAndRuntimeDesign.md#kernel-fusion-support). 
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index ef6b0b544cb78..3845be3600a24 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -951,6 +951,8 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; "@reqd_work_group_size" #define __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_PI_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" + /// This struct is a record of the device binary information. If the Kind field /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec /// field can still be specific and denote e.g. FPGA target. It must match the diff --git a/sycl/plugins/hip/CMakeLists.txt b/sycl/plugins/hip/CMakeLists.txt index 22bf6e1c578f3..4e88944b5a298 100644 --- a/sycl/plugins/hip/CMakeLists.txt +++ b/sycl/plugins/hip/CMakeLists.txt @@ -11,6 +11,12 @@ set(SYCL_BUILD_PI_HIP_ROCM_DIR "/opt/rocm" CACHE STRING "ROCm installation dir") set(SYCL_BUILD_PI_HIP_INCLUDE_DIR "" CACHE STRING "Override HIP include dir path (set to \"\" for default behavior)") set(SYCL_BUILD_PI_HIP_HSA_INCLUDE_DIR "" CACHE STRING "Override HSA include dir path (set to \"\" for default behavior)") +if(SYCL_ENABLE_KERNEL_FUSION) + set(SYCL_ENABLE_COMGR ON) +else(SYCL_ENABLE_KERNEL_FUSION) + set(SYCL_ENABLE_COMGR OFF) +endif(SYCL_ENABLE_KERNEL_FUSION) + if("${SYCL_BUILD_PI_HIP_INCLUDE_DIR}" STREQUAL "") set(PI_HIP_INCLUDE_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/include") else() @@ -26,7 +32,7 @@ endif() # Set HIP lib dir set(SYCL_BUILD_PI_HIP_LIB_DIR "" CACHE STRING "Override HIP lib dir path (set to \"\" for default behavior)") if("${SYCL_BUILD_PI_HIP_LIB_DIR}" STREQUAL "") - set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/hip/lib") + set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_ROCM_DIR}/lib") else() set(PI_HIP_LIB_DIR "${SYCL_BUILD_PI_HIP_LIB_DIR}") endif() @@ -121,6 +127,18 @@ if("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "AMD") ) target_link_libraries(pi_hip 
PUBLIC rocmdrv) + if(SYCL_ENABLE_COMGR) + add_library(amd_comgr SHARED IMPORTED GLOBAL) + set_target_properties( + amd_comgr PROPERTIES + IMPORTED_LOCATION "${PI_HIP_LIB_DIR}/libamd_comgr.so" + INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" + ) + target_link_libraries(pi_hip PUBLIC amd_comgr) + target_compile_definitions(pi_hip PRIVATE SYCL_ENABLE_KERNEL_FUSION) + endif(SYCL_ENABLE_COMGR) + # Set HIP define to select AMD platform target_compile_definitions(pi_hip PRIVATE __HIP_PLATFORM_AMD__) elseif("${SYCL_BUILD_PI_HIP_PLATFORM}" STREQUAL "NVIDIA") diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 20aae8927f0bb..88b328e8daab4 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -54,13 +54,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 3a3aae387eaa8aa072ff310b94df83c9d1a33a0b - # Merge: 614e6d0 b4425bc + # commit cf26de283a1233e6c93feb085acc10c566888b59 + # Merge: 3a3aae38 2fd9dea2 # Author: Kenneth Benzie (Benie) - # Date: Mon Oct 23 14:49:01 2023 +0100 - # Merge pull request #945 from npmiller/fix-priority - # [CUDA] Fix queue creation with native handle - set(UNIFIED_RUNTIME_TAG 3a3aae387eaa8aa072ff310b94df83c9d1a33a0b) + # Date: Wed Oct 25 10:36:48 2023 +0100 + # Merge pull request #940 from Naghasan/victor/kernel-fusion-amd + # [UR][HIP] Enable kernel finalization using comgr + set(UNIFIED_RUNTIME_TAG cf26de283a1233e6c93feb085acc10c566888b59) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d2846dfb71315..50d41e96042cc 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -47,6 +47,7 
@@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 0ff49fe7b64f2..68d105fef0f61 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -979,11 +979,12 @@ struct get_device_info_impl< bool, ext::codeplay::experimental::info::device::supports_fusion> { static bool get(const DeviceImplPtr &Dev) { #if SYCL_EXT_CODEPLAY_KERNEL_FUSION - // Currently fusion is only supported for SPIR-V based backends, i.e. OpenCL - // and LevelZero. + // Currently fusion is only supported for SPIR-V based backends, + // CUDA and HIP. return (Dev->getBackend() == backend::ext_oneapi_level_zero) || (Dev->getBackend() == backend::opencl) || - (Dev->getBackend() == backend::ext_oneapi_cuda); + (Dev->getBackend() == backend::ext_oneapi_cuda) || + (Dev->getBackend() == backend::ext_oneapi_hip); #else // SYCL_EXT_CODEPLAY_KERNEL_FUSION (void)Dev; return false; diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index b1e13d7d8237d..587ef45d11ed5 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -47,6 +47,8 @@ ::jit_compiler::BinaryFormat getTargetFormat(QueueImplPtr &Queue) { return ::jit_compiler::BinaryFormat::SPIRV; case backend::ext_oneapi_cuda: return ::jit_compiler::BinaryFormat::PTX; + case backend::ext_oneapi_hip: + return ::jit_compiler::BinaryFormat::AMDGCN; default: throw sycl::exception( sycl::make_error_code(sycl::errc::feature_not_supported), @@ -60,16 +62,20 @@ retrieveKernelBinary(QueueImplPtr &Queue, CGExecKernel *KernelCG) { 
bool isNvidia = Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_cuda; - if (isNvidia) { + bool isHIP = + Queue->getDeviceImplPtr()->getBackend() == backend::ext_oneapi_hip; + if (isNvidia || isHIP) { auto KernelID = ProgramManager::getInstance().getSYCLKernelID(KernelName); std::vector KernelIds{KernelID}; auto DeviceImages = ProgramManager::getInstance().getRawDeviceImages(KernelIds); auto DeviceImage = std::find_if( - DeviceImages.begin(), DeviceImages.end(), [](RTDeviceBinaryImage *DI) { + DeviceImages.begin(), DeviceImages.end(), + [isNvidia](RTDeviceBinaryImage *DI) { + const std::string &TargetSpec = isNvidia ? std::string("llvm_nvptx64") + : std::string("llvm_amdgcn"); return DI->getFormat() == PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE && - DI->getRawData().DeviceTargetSpec == - std::string("llvm_nvptx64"); + DI->getRawData().DeviceTargetSpec == TargetSpec; }); if (DeviceImage == DeviceImages.end()) { return {nullptr, nullptr}; @@ -796,11 +802,11 @@ jit_compiler::fuseKernels(QueueImplPtr Queue, } // Retrieve barrier flags. - int BarrierFlags = + ::jit_compiler::BarrierFlags BarrierFlags = (PropList .has_property()) - ? -1 - : 3; + ? ::jit_compiler::getNoBarrierFlag() + : ::jit_compiler::getLocalAndGlobalBarrierFlag(); static size_t FusedKernelNameIndex = 0; std::stringstream FusedKernelName; @@ -894,6 +900,11 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( BinFormat = PI_DEVICE_BINARY_TYPE_NONE; break; } + case ::jit_compiler::BinaryFormat::AMDGCN: { + TargetSpec = __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN; + BinFormat = PI_DEVICE_BINARY_TYPE_NONE; + break; + } case ::jit_compiler::BinaryFormat::SPIRV: { TargetSpec = (FusedKernelInfo.BinaryInfo.AddressBits == 64) ? 
__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64 @@ -929,7 +940,8 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ArgMaskPropSet)); - if (Format == ::jit_compiler::BinaryFormat::PTX) { + if (Format == ::jit_compiler::BinaryFormat::PTX || + Format == ::jit_compiler::BinaryFormat::AMDGCN) { // Add a program metadata property with the reqd_work_group_size attribute. // See CUDA PI (pi_cuda.cpp) _pi_program::set_metadata for reference. auto ReqdWGS = std::find_if( @@ -951,6 +963,14 @@ pi_device_binaries jit_compiler::createPIDeviceBinary( Binary.addProperty(std::move(ProgramMetadata)); } } + if (Format == ::jit_compiler::BinaryFormat::AMDGCN) { + PropertyContainer NeedFinalization{ + __SYCL_PI_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}; + PropertySetContainer ProgramMetadata{ + __SYCL_PI_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(std::move(NeedFinalization)); + Binary.addProperty(std::move(ProgramMetadata)); + } DeviceBinariesCollection Collection; Collection.addDeviceBinary( diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp index 13ea3faa87ea0..cfb3844867b10 100644 --- a/sycl/source/detail/jit_device_binaries.cpp +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -35,6 +35,12 @@ PropertyContainer::PropertyContainer(const std::string &Name, void *Data, std::memcpy(Value.get(), Data, Size); } +PropertyContainer::PropertyContainer(const std::string &Name, uint32_t Data) + : PropName{new char[Name.length() + 1]}, Value{}, ValueSize{Data}, + PropType{PI_PROPERTY_TYPE_UINT32} { + std::memcpy(PropName.get(), Name.c_str(), Name.length() + 1); +} + _pi_device_binary_property_struct PropertyContainer::getPIProperty() { return _pi_device_binary_property_struct{PropName.get(), Value.get(), PropType, ValueSize}; diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp index dd52f10aeea67..2ef916d4e4d11 100644 --- 
a/sycl/source/detail/jit_device_binaries.hpp +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -50,6 +50,8 @@ class PropertyContainer { public: PropertyContainer(const std::string &Name, void *Data, size_t Size, uint32_t Type); + // Set a PI_PROPERTY_TYPE_UINT32 property + PropertyContainer(const std::string &Name, uint32_t Data); PropertyContainer(PropertyContainer &&) = default; PropertyContainer &operator=(PropertyContainer &&) = default; diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp index 50933f13cda25..df4c4a0bd3ab1 100644 --- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp +++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp @@ -1,11 +1,15 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -o %t.out -// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "Computation error" --implicit-check-not "Internalized" +// RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "Computation error" --implicit-check-not "Internalized" --check-prefix=CHECK %if ext_oneapi_hip %{ --check-prefix=CHECK-HIP %} %else %{ --check-prefix=CHECK-NON-HIP %} // Test pointers being stored are not internalized. // CHECK: Unable to perform all promotions for function {{.*}}. Detailed information: -// CHECK-NEXT: Failed to promote argument 0 of function {{.*}}: It is not safe to promote values being stored to another pointer +// CHECK-NON-HIP-NEXT: Failed to promote argument 0 of function {{.*}}: It is not safe to promote values being stored to another pointer +// COM: The libspirv for HIP adds an instruction prior to the store causing the +// COM: internalization failure. The failure is still related to what we expect, +// COM: it just fails for a slightly different reason.
+// CHECK-HIP-NEXT: Failed to promote argument 0 of function {{.*}}: Do not know how to handle value to promote #include diff --git a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp index 89d75eea4e589..75251e3ca8a31 100644 --- a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp +++ b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp @@ -1,6 +1,7 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "COMPUTATION ERROR" +// UNSUPPORTED: hip // Test caching for JIT fused kernels. Also test for debug messages being // printed when SYCL_RT_WARNING_LEVEL=1. diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp index daa3909b705e8..9b93e832cbbaf 100644 --- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp @@ -1,7 +1,7 @@ // REQUIRES: fusion // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out -// UNSUPPORTED: cuda +// UNSUPPORTED: cuda, hip // Test local internalization of a nested array type. diff --git a/sycl/test-e2e/KernelFusion/lit.local.cfg b/sycl/test-e2e/KernelFusion/lit.local.cfg deleted file mode 100644 index 1e863d5103c30..0000000000000 --- a/sycl/test-e2e/KernelFusion/lit.local.cfg +++ /dev/null @@ -1 +0,0 @@ -config.unsupported_features += ['hip']