From 3d50c06eab2311261e35259292ea2787f2a2ad87 Mon Sep 17 00:00:00 2001
From: malixian <1240609881@qq.com>
Date: Fri, 21 May 2021 07:54:43 +0000
Subject: [PATCH 01/18] enable amd gpu

---
 buildbot/configure.py | 10 +
 clang/lib/CodeGen/CGBuiltin.cpp | 3 +-
 clang/lib/CodeGen/CGCall.cpp | 2 +-
 clang/lib/Driver/Driver.cpp | 38 +-
 clang/lib/Driver/ToolChain.cpp | 2 +-
 clang/lib/Driver/ToolChains/HIP.cpp | 84 +-
 clang/lib/Driver/ToolChains/HIP.h | 20 +-
 clang/lib/Driver/ToolChains/SYCL.cpp | 4 +-
 clang/lib/Frontend/InitPreprocessor.cpp | 4 +
 sycl/CMakeLists.txt | 15 +
 sycl/include/CL/__spirv/spirv_vars.hpp | 2 +-
 sycl/include/CL/sycl/backend_types.hpp | 6 +-
 sycl/include/CL/sycl/detail/pi.h | 1 +
 sycl/include/CL/sycl/detail/pi.hpp | 2 +
 .../CL/sycl/detail/rocm_definitions.hpp | 24 +
 sycl/plugins/CMakeLists.txt | 4 +
 sycl/plugins/rocm/CMakeLists.txt | 55 +
 sycl/plugins/rocm/pi_rocm.cpp | 4794 +++++++++++++++++
 sycl/plugins/rocm/pi_rocm.hpp | 717 +++
 sycl/source/CMakeLists.txt | 5 +
 sycl/source/detail/config.hpp | 7 +-
 sycl/source/detail/pi.cpp | 11 +
 sycl/tools/CMakeLists.txt | 5 +
 23 files changed, 5794 insertions(+), 21 deletions(-)
 create mode 100644 sycl/include/CL/sycl/detail/rocm_definitions.hpp
 create mode 100644 sycl/plugins/rocm/CMakeLists.txt
 create mode 100644 sycl/plugins/rocm/pi_rocm.cpp
 create mode 100644 sycl/plugins/rocm/pi_rocm.hpp

diff --git a/buildbot/configure.py b/buildbot/configure.py
index 28f36c815bcb5..b6b92cc23c78b 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -25,6 +25,7 @@ def do_configure(args):
     llvm_enable_projects = 'clang;' + llvm_external_projects
     libclc_targets_to_build = ''
     sycl_build_pi_cuda = 'OFF'
+    sycl_build_pi_rocm = 'OFF'
     sycl_werror = 'ON'
     llvm_enable_assertions = 'ON'
     llvm_enable_doxygen = 'OFF'
@@ -43,6 +44,13 @@ def do_configure(args):
         libclc_targets_to_build = 'nvptx64--;nvptx64--nvidiacl'
         sycl_build_pi_cuda = 'ON'
 
+    if args.rocm:
+        llvm_targets_to_build += ';AMDGPU'
+        # TODO
+        llvm_enable_projects += 
';libclc;lld' + libclc_targets_to_build = 'amdgcn--;amdgcn--amdhsa' + sycl_build_pi_rocm = 'ON' + if args.no_werror: sycl_werror = 'OFF' @@ -74,6 +82,7 @@ def do_configure(args): "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects), "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DSYCL_BUILD_PI_CUDA={}".format(sycl_build_pi_cuda), + "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_rocm), "-DLLVM_BUILD_TOOLS=ON", "-DSYCL_ENABLE_WERROR={}".format(sycl_werror), "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), @@ -141,6 +150,7 @@ def main(): parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") + parser.add_argument("--rocm", action='store_true', help="swith from OpenCL to ROCM") parser.add_argument("--arm", action='store_true', help="build ARM support rather than x86") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 06309c7bf2221..25ee7268c6f75 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4945,8 +4945,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIprintf: if (getTarget().getTriple().isNVPTX()) return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue); - if (getTarget().getTriple().getArch() == Triple::amdgcn && - getLangOpts().HIP) + if (getTarget().getTriple().getArch() == Triple::amdgcn) return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue); break; case Builtin::BI__builtin_canonicalize: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 70a93b051fa13..be2d52dccbbb4 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -760,7 +760,7 @@ 
CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType,
   unsigned CC = ClangCallConvToLLVMCallConv(info.getCC());
   // This is required so SYCL kernels are successfully processed by tools from CUDA. Kernels
   // with a `spir_kernel` calling convention are ignored otherwise.
-  if (CC == llvm::CallingConv::SPIR_KERNEL && CGM.getTriple().isNVPTX() &&
+  if (CC == llvm::CallingConv::SPIR_KERNEL && (CGM.getTriple().isNVPTX() || CGM.getTriple().isAMDGCN()) &&
       getContext().getLangOpts().SYCLIsDevice) {
     CC = llvm::CallingConv::C;
   }
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 8d7e782f3c13e..e3ab6debb335e 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -667,6 +667,11 @@ static bool isValidSYCLTriple(llvm::Triple T) {
   // NVPTX is valid for SYCL.
   if (T.isNVPTX())
     return true;
+
+  // AMDGCN is valid for SYCL
+  if (T.isAMDGCN())
+    return true;
+
   // Check for invalid SYCL device triple values.
   // Non-SPIR arch.
   if (!T.isSPIR())
@@ -3839,6 +3844,20 @@ class OffloadingActionBuilder final {
     return BA;
   }
 
+  Action *finalizeAMDGCNDependences(Action *Input, const llvm::Triple &TT) {
+    auto *BA = C.getDriver().ConstructPhaseAction(
+        C, Args, phases::Backend, Input, AssociatedOffloadKind);
+
+    auto *AA = C.getDriver().ConstructPhaseAction(
+        C, Args, phases::Assemble, BA, AssociatedOffloadKind);
+
+    ActionList AL = {AA};
+    Action *action = C.MakeAction(AL, types::TY_Image);
+    ActionList HIPActions = {action};
+    Action *HIPFatBinary = C.MakeAction(HIPActions, types::TY_HIP_FATBIN);
+    return HIPFatBinary;
+  }
+
 public:
   SYCLActionBuilder(Compilation &C, DerivedArgList &Args,
                     const Driver::InputList &Inputs)
@@ -4234,6 +4253,7 @@ class OffloadingActionBuilder final {
       ActionList LinkObjects;
       auto TT = SYCLTripleList[I];
       auto isNVPTX = (*TC)->getTriple().isNVPTX();
+      auto isAMDGCN = (*TC)->getTriple().isAMDGCN();
       bool isSpirvAOT = TT.getSubArch() == llvm::Triple::SPIRSubArch_fpga ||
                         TT.getSubArch() == llvm::Triple::SPIRSubArch_gen 
|| TT.getSubArch() == llvm::Triple::SPIRSubArch_x86_64; @@ -4331,7 +4351,7 @@ class OffloadingActionBuilder final { // When spv online link is supported by all backends, the fallback // device libraries are only needed when current toolchain is using // AOT compilation. - if (!isNVPTX) { + if (!isNVPTX && !isAMDGCN) { SYCLDeviceLibLinked = addSYCLDeviceLibs( *TC, FullLinkObjects, true, C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment()); @@ -4345,7 +4365,7 @@ class OffloadingActionBuilder final { FullDeviceLinkAction = DeviceLinkAction; // setup some flags upfront - if (isNVPTX && DeviceCodeSplit) { + if ((isNVPTX || isAMDGCN) && DeviceCodeSplit) { // TODO Temporary limitation, need to support code splitting for PTX const Driver &D = C.getDriver(); const std::string &OptName = @@ -4357,14 +4377,14 @@ class OffloadingActionBuilder final { } // reflects whether current target is ahead-of-time and can't support // runtime setting of specialization constants - bool isAOT = isNVPTX || isSpirvAOT; + bool isAOT = isNVPTX || isAMDGCN || isSpirvAOT; // TODO support device code split for NVPTX target ActionList WrapperInputs; // post link is not optional - even if not splitting, always need to // process specialization constants types::ID PostLinkOutType = - isNVPTX ? types::TY_LLVM_BC : types::TY_Tempfiletable; + isNVPTX || isAMDGCN ? 
types::TY_LLVM_BC : types::TY_Tempfiletable; auto *PostLinkAction = C.MakeAction( FullDeviceLinkAction, PostLinkOutType); PostLinkAction->setRTSetsSpecConstants(!isAOT); @@ -4373,6 +4393,10 @@ class OffloadingActionBuilder final { Action *FinAction = finalizeNVPTXDependences(PostLinkAction, (*TC)->getTriple()); WrapperInputs.push_back(FinAction); + } else if(isAMDGCN) { + Action *FinAction = + finalizeAMDGCNDependences(PostLinkAction, (*TC)->getTriple()); + WrapperInputs.push_back(FinAction); } else { // For SPIRV-based targets - translate to SPIRV then optionally // compile ahead-of-time to native architecture @@ -7170,7 +7194,7 @@ const ToolChain &Driver::getOffloadingDeviceToolChain(const ArgList &Args, break; case Action::OFK_HIP: TC = std::make_unique( - *this, Target, HostTC, Args); + *this, Target, HostTC, Args, TargetDeviceOffloadKind); break; case Action::OFK_OpenMP: // omp + nvptx @@ -7189,6 +7213,10 @@ const ToolChain &Driver::getOffloadingDeviceToolChain(const ArgList &Args, TC = std::make_unique( *this, Target, HostTC, Args, TargetDeviceOffloadKind); break; + case llvm::Triple::amdgcn: + TC = std::make_unique( + *this, Target, HostTC, Args, TargetDeviceOffloadKind); + break; default: break; } diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 01f20b36b24c1..96ad6f51f2970 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1157,7 +1157,7 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs( // matches the current toolchain triple. If it is not present // at all, target and host share a toolchain. 
if (A->getOption().matches(options::OPT_m_Group)) { - if (SameTripleAsHost) + if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn) DAL->append(A); else Modified = true; diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 126eb65e3c4db..2e0545fe10c7c 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -112,8 +112,10 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, // for backward compatibility. For code object version 4 and greater, the // offload kind in bundle ID is 'hipv4'. std::string OffloadKind = "hip"; + // bundle ID equals 'hip' is always right. if (getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) - OffloadKind = OffloadKind + "v4"; + //OffloadKind = OffloadKind + "v4"; + OffloadKind = OffloadKind; for (const auto &II : Inputs) { const auto* A = II.getAction(); BundlerTargetArg = BundlerTargetArg + "," + OffloadKind + @@ -225,8 +227,8 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, } HIPToolChain::HIPToolChain(const Driver &D, const llvm::Triple &Triple, - const ToolChain &HostTC, const ArgList &Args) - : ROCMToolChain(D, Triple, Args), HostTC(HostTC) { + const ToolChain &HostTC, const ArgList &Args, const Action::OffloadKind OK) + : ROCMToolChain(D, Triple, Args), HostTC(HostTC), OK(OK) { // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. 
getProgramPaths().push_back(getDriver().Dir); @@ -238,8 +240,14 @@ void HIPToolChain::addClangTargetOptions( Action::OffloadKind DeviceOffloadingKind) const { HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); - assert(DeviceOffloadingKind == Action::OFK_HIP && + assert((DeviceOffloadingKind == Action::OFK_HIP || + DeviceOffloadingKind == Action::OFK_SYCL) && "Only HIP offloading kinds are supported for GPUs."); + + StringRef GpuArch = getGPUArch(DriverArgs); + if(GpuArch.empty()) { + GpuArch = "gfx906"; + } CC1Args.push_back("-fcuda-is-device"); @@ -269,6 +277,54 @@ void HIPToolChain::addClangTargetOptions( CC1Args.push_back("-fapply-global-visibility-to-externs"); } + if (DeviceOffloadingKind == Action::OFK_SYCL) { + toolchains::SYCLToolChain::AddSYCLIncludeArgs(getDriver(), DriverArgs, + CC1Args); + } + + auto NoLibSpirv = DriverArgs.hasArg(options::OPT_fno_sycl_libspirv, + options::OPT_fsycl_device_only); + if (DeviceOffloadingKind == Action::OFK_SYCL && !NoLibSpirv) { + std::string LibSpirvFile; + + if (DriverArgs.hasArg(clang::driver::options::OPT_fsycl_libspirv_path_EQ)) { + auto ProvidedPath = + DriverArgs.getLastArgValue(clang::driver::options::OPT_fsycl_libspirv_path_EQ).str(); + if (llvm::sys::fs::exists(ProvidedPath)) + LibSpirvFile = ProvidedPath; + } else { + SmallVector LibraryPaths; + + // Expected path w/out install. + SmallString<256> WithoutInstallPath(getDriver().ResourceDir); + llvm::sys::path::append(WithoutInstallPath, Twine("../../clc")); + LibraryPaths.emplace_back(WithoutInstallPath.c_str()); + + // Expected path w/ install. 
+ SmallString<256> WithInstallPath(getDriver().ResourceDir); + llvm::sys::path::append(WithInstallPath, Twine("../../../share/clc")); + LibraryPaths.emplace_back(WithInstallPath.c_str()); + + std::string LibSpirvTargetName = "libspirv-amdgcn--amdhsa.bc"; + for (StringRef LibraryPath : LibraryPaths) { + SmallString<128> LibSpirvTargetFile(LibraryPath); + llvm::sys::path::append(LibSpirvTargetFile, LibSpirvTargetName); + if (llvm::sys::fs::exists(LibSpirvTargetFile)) { + LibSpirvFile = std::string(LibSpirvTargetFile.str()); + break; + } + } + } + + if (LibSpirvFile.empty()) { + getDriver().Diag(diag::err_drv_no_sycl_libspirv); + return; + } + + CC1Args.push_back("-mlink-builtin-bitcode"); + CC1Args.push_back(DriverArgs.MakeArgString(LibSpirvFile)); + } + llvm::for_each(getHIPDeviceLibs(DriverArgs), [&](StringRef BCFile) { CC1Args.push_back("-mlink-builtin-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(BCFile)); @@ -300,11 +356,31 @@ HIPToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, return DAL; } +/* +Tool *HIPToolChain::buildLinker() const { + assert(getTriple().getArch() == llvm::Triple::amdgcn); + return new tools::AMDGCN::Linker(*this); +} +*/ + Tool *HIPToolChain::buildLinker() const { assert(getTriple().getArch() == llvm::Triple::amdgcn); + if (OK == Action::OFK_SYCL) + return new tools::AMDGCN::SYCLLinker(*this); return new tools::AMDGCN::Linker(*this); } +Tool *HIPToolChain::SelectTool(const JobAction &JA) const { + if (OK == Action::OFK_SYCL) { + if (JA.getKind() == Action::LinkJobClass && + JA.getType() == types::TY_LLVM_BC) { + return static_cast(ToolChain::SelectTool(JA)) + ->GetSYCLToolChainLinker(); + } + } + return ToolChain::SelectTool(JA); +} + void HIPToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { HostTC.addClangWarningOptions(CC1Args); } diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h index a9e1ed9a4656f..1c3a832db4074 100644 --- a/clang/lib/Driver/ToolChains/HIP.h +++ 
b/clang/lib/Driver/ToolChains/HIP.h @@ -51,6 +51,19 @@ class LLVM_LIBRARY_VISIBILITY Linker : public Tool { const JobAction &JA) const; }; +class LLVM_LIBRARY_VISIBILITY SYCLLinker : public Linker { +public: + SYCLLinker(const ToolChain &TC) : Linker(TC) {} + + Tool* GetSYCLToolChainLinker() const { + if (!SYCLToolChainLinker) + SYCLToolChainLinker.reset(new SYCL::Linker(getToolChain())); + return SYCLToolChainLinker.get(); + } +private: + mutable std::unique_ptr SYCLToolChainLinker; +}; + } // end namespace AMDGCN } // end namespace tools @@ -59,7 +72,7 @@ namespace toolchains { class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain { public: HIPToolChain(const Driver &D, const llvm::Triple &Triple, - const ToolChain &HostTC, const llvm::opt::ArgList &Args); + const ToolChain &HostTC, const llvm::opt::ArgList &Args, const Action::OffloadKind OK); const llvm::Triple *getAuxTriple() const override { return &HostTC.getTriple(); @@ -95,9 +108,14 @@ class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain { unsigned GetDefaultDwarfVersion() const override { return 4; } const ToolChain &HostTC; + Tool *SelectTool(const JobAction &JA) const override; protected: Tool *buildLinker() const override; + +private: + const Action::OffloadKind OK; + }; } // end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 8bc8bf94c501f..9bac46ae87720 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -274,7 +274,7 @@ void SYCL::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *LinkingOutput) const { assert((getToolChain().getTriple().isSPIR() || - getToolChain().getTriple().isNVPTX()) && + getToolChain().getTriple().isNVPTX() || getToolChain().getTriple().isAMDGCN()) && "Unsupported target"); std::string SubArchName = @@ -285,7 +285,7 @@ void SYCL::Linker::ConstructJob(Compilation &C, const JobAction &JA, // For CUDA, we want to 
link all BC files before resuming the normal // compilation path - if (getToolChain().getTriple().isNVPTX()) { + if (getToolChain().getTriple().isNVPTX() || getToolChain().getTriple().isAMDGCN()) { InputInfoList NvptxInputs; for (const auto &II : Inputs) { if (!II.isFilename()) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 61620a994fdda..80bea70d4337c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1178,6 +1178,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (TI.getTriple().isNVPTX()) { Builder.defineMacro("__SYCL_NVPTX__", "1"); } + + if (TI.getTriple().isAMDGCN()) { + Builder.defineMacro("__SYCL_AMDGCN__", "1"); + } } if (LangOpts.SYCLUnnamedLambda) Builder.defineMacro("__SYCL_UNNAMED_LAMBDA__", "1"); diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 6924ddb8c22a1..ea624aff6cb7c 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -91,6 +91,9 @@ install(DIRECTORY ${OpenCL_INCLUDE_DIR}/CL option(SYCL_BUILD_PI_CUDA "Enables the CUDA backend for the Plugin Interface" OFF) +option(SYCL_BUILD_PI_ROCM + "Enables the ROCM backend for the Plugin Interface" OFF) + # Configure SYCL version macro set(sycl_inc_dir ${CMAKE_CURRENT_SOURCE_DIR}/include) set(sycl_src_dir ${CMAKE_CURRENT_SOURCE_DIR}/source) @@ -268,6 +271,18 @@ if(SYCL_BUILD_PI_CUDA) list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libspirv-builtins pi_cuda) endif() +if(SYCL_BUILD_PI_ROCM) + # Ensure that libclc is enabled. + list(FIND LLVM_ENABLE_PROJECTS libclc LIBCLC_FOUND) + if( LIBCLC_FOUND EQUAL -1 ) + message(FATAL_ERROR + "ROCM support requires adding \"libclc\" to the CMake argument \"LLVM_ENABLE_PROJECTS\"") + endif() + + add_dependencies(sycl-toolchain libspirv-builtins pi_rocm) + list(APPEND SYCL_TOOLCHAIN_DEPLOY_COMPONENTS libspirv-builtins pi_rocm) +endif() + # Use it as fake dependency in order to force another command(s) to execute. 
add_custom_command(OUTPUT __force_it COMMAND "${CMAKE_COMMAND}" -E echo diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp index bbc6f75ddf87b..228c70181c991 100644 --- a/sycl/include/CL/__spirv/spirv_vars.hpp +++ b/sycl/include/CL/__spirv/spirv_vars.hpp @@ -15,7 +15,7 @@ #define __SPIRV_VAR_QUALIFIERS extern "C" const -#if defined(__SYCL_NVPTX__) +#if defined(__SYCL_NVPTX__) || defined(__SYCL_AMDGCN__) SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x(); SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y(); diff --git a/sycl/include/CL/sycl/backend_types.hpp b/sycl/include/CL/sycl/backend_types.hpp index 148bccf6e77a6..77009d0cca07a 100644 --- a/sycl/include/CL/sycl/backend_types.hpp +++ b/sycl/include/CL/sycl/backend_types.hpp @@ -23,7 +23,8 @@ enum class backend : char { opencl = 1, level_zero = 2, cuda = 3, - all = 4 + rocm = 4, + all = 5 }; template struct interop; @@ -51,6 +52,9 @@ inline std::ostream &operator<<(std::ostream &Out, backend be) { case backend::cuda: Out << "cuda"; break; + case backend::rocm: + Out << "rocm"; + break; case backend::all: Out << "all"; } diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index e308f5e8f63e2..b1371d246b4f9 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -667,6 +667,7 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; #define __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA "spir64_fpga" /// PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device #define __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64 "nvptx64" +#define __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN "amdgcn" /// Device binary image property set names recognized by the SYCL runtime. 
/// Name must be consistent with
diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp
index e06ae106e65e7..ef641d3b34aa6 100644
--- a/sycl/include/CL/sycl/detail/pi.hpp
+++ b/sycl/include/CL/sycl/detail/pi.hpp
@@ -61,10 +61,12 @@ bool trace(TraceLevel level);
 #define __SYCL_OPENCL_PLUGIN_NAME "pi_opencl.dll"
 #define __SYCL_LEVEL_ZERO_PLUGIN_NAME "pi_level_zero.dll"
 #define __SYCL_CUDA_PLUGIN_NAME "pi_cuda.dll"
+#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.dll"
 #else
 #define __SYCL_OPENCL_PLUGIN_NAME "libpi_opencl.so"
 #define __SYCL_LEVEL_ZERO_PLUGIN_NAME "libpi_level_zero.so"
 #define __SYCL_CUDA_PLUGIN_NAME "libpi_cuda.so"
+#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.so"
 #endif
 
 // Report error and no return (keeps compiler happy about no return statements).
diff --git a/sycl/include/CL/sycl/detail/rocm_definitions.hpp b/sycl/include/CL/sycl/detail/rocm_definitions.hpp
new file mode 100644
index 0000000000000..288929ef735f5
--- /dev/null
+++ b/sycl/include/CL/sycl/detail/rocm_definitions.hpp
@@ -0,0 +1,24 @@
+//==------------ rocm_definitions.hpp - SYCL ROCM backend -----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+// HIP/ROCM backend specific options
+// TODO: Use values that won't overlap with others
+
+// Mem Object info: Retrieve the raw HIP pointer from a cl_mem
+#define __SYCL_PI_HIP_RAW_POINTER (0xFF01)
+// Context creation: Use a primary HIP context instead of a custom one by
+// providing a property value of PI_TRUE for the following
+// property ID. 
+#define __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY (0xFF02) + +// PI Command Queue using Default stream +#define __SYCL_PI_HIP_USE_DEFAULT_STREAM (0xFF03) +// PI Command queue will sync with default stream +#define __SYCL_PI_HIP_SYNC_WITH_DEFAULT (0xFF04) diff --git a/sycl/plugins/CMakeLists.txt b/sycl/plugins/CMakeLists.txt index 8df33d512e3d8..b7a7853730bd4 100644 --- a/sycl/plugins/CMakeLists.txt +++ b/sycl/plugins/CMakeLists.txt @@ -8,5 +8,9 @@ if(SYCL_BUILD_PI_CUDA) add_subdirectory(cuda) endif() +if(SYCL_BUILD_PI_ROCM) + add_subdirectory(rocm) +endif() + add_subdirectory(opencl) add_subdirectory(level_zero) diff --git a/sycl/plugins/rocm/CMakeLists.txt b/sycl/plugins/rocm/CMakeLists.txt new file mode 100644 index 0000000000000..edf593113aaab --- /dev/null +++ b/sycl/plugins/rocm/CMakeLists.txt @@ -0,0 +1,55 @@ +message(STATUS "Including the PI API ROCM backend.") + + # cannot rely on cmake support for ROCM; it assumes runtime API is being used. + # we only require the ROCM driver API to be used + # rocm_rocm_LIBRARY variable defines the path to libhsa-runtime64.so, the ROCM Driver API library. + +#find_package(ROCM 4.0 REQUIRED) + +# Make imported library global to use it within the project. 
+add_library(rocmdrv SHARED IMPORTED GLOBAL) + + +set(ROCM_ROCM_LIBRARY "/opt/rocm/hip/lib/libamdhip64.so") +set(ROCM_INCLUDE_DIRS "/opt/rocm/hip/include") +set(hsa_inc_dir "/opt/rocm/hsa/include") + + +add_definitions(-D__HIP_PLATFORM_HCC__) + +set_target_properties( + rocmdrv PROPERTIES + IMPORTED_LOCATION ${ROCM_ROCM_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${ROCM_INCLUDE_DIRS} +) + +add_library(pi_rocm SHARED + "${sycl_inc_dir}/CL/sycl/detail/pi.h" + "${sycl_inc_dir}/CL/sycl/detail/pi.hpp" + "pi_rocm.hpp" + "pi_rocm.cpp" +) + + +add_dependencies(sycl-toolchain pi_rocm) + +set_target_properties(pi_rocm PROPERTIES LINKER_LANGUAGE CXX) + + +target_include_directories(pi_rocm + PRIVATE + ${sycl_inc_dir} + ${sycl_plugin_dir} + ${ROCM_INCLUDE_DIRS} + ${hsa_inc_dir} +) + + +target_link_libraries(pi_rocm PUBLIC OpenCL-Headers rocmdrv) + +add_common_options(pi_rocm) + +install(TARGETS pi_rocm + LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT pi_rocm + RUNTIME DESTINATION "bin" COMPONENT pi_rocm +) diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp new file mode 100644 index 0000000000000..bf2e2e399849c --- /dev/null +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -0,0 +1,4794 @@ +//==---------- pi_rocm.cpp - HIP Plugin -----------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \file pi_rocm.cpp +/// Implementation of HIP Plugin. 
+/// +/// \ingroup sycl_pi_rocm + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + + +namespace { +std::string getCudaVersionString() { + int driver_version = 0; + if (hipDriverGetVersion(&driver_version) != hipSuccess) { + return ""; + } + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "HIP " << driver_version / 1000 << "." + << driver_version % 1000 / 10; + return stream.str(); +} + +pi_result map_error(hipError_t result) { + switch (result) { + case hipSuccess: + return PI_SUCCESS; + //case HIP_ERROR_NOT_PERMITTED: + // return PI_INVALID_OPERATION; + case hipErrorInvalidContext: + return PI_INVALID_CONTEXT; + case hipErrorInvalidDevice: + return PI_INVALID_DEVICE; + case hipErrorInvalidValue: + return PI_INVALID_VALUE; + case hipErrorOutOfMemory: + return PI_OUT_OF_HOST_MEMORY; + case hipErrorLaunchOutOfResources: + return PI_OUT_OF_RESOURCES; + default: + return PI_ERROR_UNKNOWN; + } +} + +inline void assign_result(pi_result *ptr, pi_result value) noexcept { + if (ptr) { + *ptr = value; + } +} + +// Iterates over the event wait list, returns correct pi_result error codes. +// Invokes the callback for the latest event of each queue in the wait list. +// The callback must take a single pi_event argument and return a pi_result. 
+template +pi_result forLatestEvents(const pi_event *event_wait_list, + std::size_t num_events_in_wait_list, Func &&f) { + + if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + return PI_INVALID_EVENT_WAIT_LIST; + } + + // Fast path if we only have a single event + if (num_events_in_wait_list == 1) { + return f(event_wait_list[0]); + } + + std::vector events{event_wait_list, + event_wait_list + num_events_in_wait_list}; + std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { + // Tiered sort creating sublists of streams (smallest value first) in which + // the corresponding events are sorted into a sequence of newest first. + return e0->get_queue()->stream_ < e1->get_queue()->stream_ || + (e0->get_queue()->stream_ == e1->get_queue()->stream_ && + e0->get_event_id() > e1->get_event_id()); + }); + + bool first = true; + hipStream_t lastSeenStream = 0; + for (pi_event event : events) { + if (!event || (!first && event->get_queue()->stream_ == lastSeenStream)) { + continue; + } + + first = false; + lastSeenStream = event->get_queue()->stream_; + + auto result = f(event); + if (result != PI_SUCCESS) { + return result; + } + } + + return PI_SUCCESS; +} + +/// Converts HIP error into PI error codes, and outputs error information +/// to stderr. +/// If PI_HIP_ABORT env variable is defined, it aborts directly instead of +/// throwing the error. This is intended for debugging purposes. +/// \return PI_SUCCESS if \param result was hipSuccess. +/// \throw pi_error exception (integer) if input was not success. 
+/// +pi_result check_error(hipError_t result, const char *function, int line, + const char *file) { + if (result == hipSuccess) { + return PI_SUCCESS; + } + + const char *errorString = nullptr; + const char *errorName = nullptr; + errorName = hipGetErrorName(result); + errorString = hipGetErrorString(result); + std::cerr << "\nPI HIP ERROR:" + << "\n\tValue: " << result + << "\n\tName: " << errorName + << "\n\tDescription: " << errorString + << "\n\tFunction: " << function + << "\n\tSource Location: " << file << ":" << line << "\n" + << std::endl; + + if (std::getenv("PI_HIP_ABORT") != nullptr) { + std::abort(); + } + + throw map_error(result); +} + +/// \cond NODOXY +#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) + +/// RAII type to guarantee recovering original HIP context +/// Scoped context is used across all PI HIP plugin implementation +/// to activate the PI Context on the current thread, matching the +/// HIP driver semantics where the context used for the HIP Driver +/// API is the one active on the thread. +/// The implementation tries to avoid replacing the hipCtx_t if it cans +class ScopedContext { + pi_context placedContext_; + hipCtx_t original_; + bool needToRecover_; + +public: + ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} { + + if (!placedContext_) { + throw PI_INVALID_CONTEXT; + } + + hipCtx_t desired = placedContext_->get(); + PI_CHECK_ERROR(hipCtxGetCurrent(&original_)); + if (original_ != desired) { + // Sets the desired context as the active one for the thread + PI_CHECK_ERROR(hipCtxSetCurrent(desired)); + if (original_ == nullptr) { + // No context is installed on the current thread + // This is the most common case. We can activate the context in the + // thread and leave it there until all the PI context referring to the + // same underlying HIP context are destroyed. This emulates + // the behaviour of the HIP runtime api, and avoids costly context + // switches. 
No action is required on this side of the if. + } else { + needToRecover_ = true; + } + } + } + + ~ScopedContext() { + if (needToRecover_) { + PI_CHECK_ERROR(hipCtxSetCurrent(original_)); + } + } +}; + +/// \cond NODOXY +template +pi_result getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, size_t value_size, + Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return PI_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return PI_SUCCESS; +} + +template +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + *static_cast(param_value) = value; + }; + + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + sizeof(T), assignment); +} + +template +pi_result getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + T *value) { + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + array_length * sizeof(T), memcpy); +} + +template <> +pi_result getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, + const char *value) { + return getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +int getAttribute(pi_device device, hipDeviceAttribute_t attribute) { + int value; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&value, attribute, device->get()) == hipSuccess); + return value; +} +/// \endcond + +// Determine local work sizes that result in uniform work groups. +// The default threadsPerBlock only require handling the first work_dim +// dimension. 
+/* +void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], pi_kernel kernel) { + assert(threadsPerBlock != nullptr); + assert(global_work_size != nullptr); + assert(kernel != nullptr); + int recommendedBlockSize, minGrid; + + PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( + &minGrid, &recommendedBlockSize, kernel->get(), + 0, 0)); + + (void)minGrid; // Not used, avoid warnings + + threadsPerBlock[0] = + std::min(static_cast(maxThreadsPerBlock[0]), + std::min(static_cast(global_work_size[0]), + static_cast(recommendedBlockSize))); + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + while (0u != (global_work_size[0] % threadsPerBlock[0])) { + --threadsPerBlock[0]; + } +} +*/ + +void simpleGuessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], pi_kernel kernel) { + assert(threadsPerBlock != nullptr); + assert(global_work_size != nullptr); + assert(kernel != nullptr); + //int recommendedBlockSize, minGrid; + + //PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( + // &minGrid, &recommendedBlockSize, kernel->get(), + // 0, 0)); + + //(void)minGrid; // Not used, avoid warnings + + threadsPerBlock[0] = + std::min(static_cast(maxThreadsPerBlock[0]), + static_cast(global_work_size[0])); + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + while (0u != (global_work_size[0] % threadsPerBlock[0])) { + --threadsPerBlock[0]; + } +} + +} // anonymous namespace + +/// ------ Error handling, matching OpenCL plugin semantics. +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { +namespace pi { + +// Report error and no return (keeps compiler from printing warnings). +// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. 
+// +[[noreturn]] void die(const char *Message) { + std::cerr << "pi_die: " << Message << std::endl; + std::terminate(); +} + +// Reports error messages +void hipPrint(const char *Message) { + std::cerr << "pi_print: " << Message << std::endl; +} + +void assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +} // namespace pi +} // namespace detail +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) + +//-------------- +// PI object implementation + +extern "C" { + +// Required in a number of functions, so forward declare here +pi_result rocm_piEnqueueEventsWait(pi_queue command_queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event); +pi_result rocm_piEventRelease(pi_event event); +pi_result rocm_piEventRetain(pi_event event); + +} // extern "C" + +/// \endcond + +_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue) + : commandType_{type}, refCount_{1}, isCompleted_{false}, isRecorded_{false}, + isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr}, + queue_{queue}, context_{context} { + + assert(type != PI_COMMAND_TYPE_USER); + + bool profilingEnabled = queue_->properties_ & PI_QUEUE_PROFILING_ENABLE; + + PI_CHECK_ERROR(hipEventCreateWithFlags( + &evEnd_, profilingEnabled ? hipEventDefault : hipEventDisableTiming)); + + if (profilingEnabled) { + PI_CHECK_ERROR(hipEventCreateWithFlags(&evQueued_, hipEventDefault)); + PI_CHECK_ERROR(hipEventCreateWithFlags(&evStart_, hipEventDefault)); + } + + if (queue_ != nullptr) { + rocm_piQueueRetain(queue_); + } + rocm_piContextRetain(context_); + +} + +_pi_event::~_pi_event() { + if (queue_ != nullptr) { + rocm_piQueueRelease(queue_); + } + rocm_piContextRelease(context_); +} + +pi_result _pi_event::start() { + assert(!is_started()); + pi_result result; + + try { + if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
+ result = PI_CHECK_ERROR(hipEventRecord(evQueued_, 0)); + result = PI_CHECK_ERROR(hipEventRecord(evStart_, queue_->get())); + } + } catch (pi_result error) { + result = error; + } + + isStarted_ = true; + return result; +} + +pi_uint64 _pi_event::get_queued_time() const { + float miliSeconds = 0.0f; + assert(is_started()); + + PI_CHECK_ERROR( + hipEventElapsedTime(&miliSeconds,evStart_, evEnd_)); + return static_cast(miliSeconds * 1.0e6); +} + +pi_uint64 _pi_event::get_start_time() const { + float miliSeconds = 0.0f; + assert(is_started()); + + PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, context_->evBase_, evStart_)); + return static_cast(miliSeconds * 1.0e6); +} + +pi_uint64 _pi_event::get_end_time() const { + float miliSeconds = 0.0f; + assert(is_started() && is_recorded()); + + PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, context_->evBase_, evEnd_)); + return static_cast(miliSeconds * 1.0e6); +} + +pi_result _pi_event::record() { + + if (is_recorded() || !is_started()) { + return PI_INVALID_EVENT; + } + + pi_result result = PI_INVALID_OPERATION; + + if (!queue_) { + return PI_INVALID_QUEUE; + } + + hipStream_t hipStream = queue_->get(); + + try { + eventId_ = queue_->get_next_event_id(); + if (eventId_ == 0) { + cl::sycl::detail::pi::die( + "Unrecoverable program state reached in event identifier overflow"); + } + result = PI_CHECK_ERROR(hipEventRecord(evEnd_, hipStream)); + } catch (pi_result error) { + result = error; + } + + if (result == PI_SUCCESS) { + isRecorded_ = true; + } + + return result; +} + +pi_result _pi_event::wait() { + pi_result retErr; + try { + retErr = PI_CHECK_ERROR(hipEventSynchronize(evEnd_)); + isCompleted_ = true; + } catch (pi_result error) { + retErr = error; + } + + return retErr; +} + +pi_result _pi_event::release() { + assert(queue_ != nullptr); + PI_CHECK_ERROR(hipEventDestroy(evEnd_)); + + if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) { + PI_CHECK_ERROR(hipEventDestroy(evQueued_)); + 
PI_CHECK_ERROR(hipEventDestroy(evStart_)); + } + + return PI_SUCCESS; +} + +// makes all future work submitted to queue wait for all work captured in event. +pi_result enqueueEventWait(pi_queue queue, pi_event event) { + // for native events, the hipStreamWaitEvent call is used. + // This makes all future work submitted to stream wait for all + // work captured in event. + return PI_CHECK_ERROR(hipStreamWaitEvent(queue->get(), event->get(), 0)); +} + +_pi_program::_pi_program(pi_context ctxt) + : module_{nullptr}, binary_{}, + binarySizeInBytes_{0}, refCount_{1}, context_{ctxt} { + rocm_piContextRetain(context_); +} + +_pi_program::~_pi_program() { rocm_piContextRelease(context_); } + +pi_result _pi_program::set_binary(const char *source, size_t length) { + assert((binary_ == nullptr && binarySizeInBytes_ == 0) && + "Re-setting program binary data which has already been set"); + binary_ = source; + binarySizeInBytes_ = length; + return PI_SUCCESS; +} + +pi_result _pi_program::build_program(const char *build_options) { + + this->buildOptions_ = build_options; + + constexpr const unsigned int numberOfOptions = 4u; + + hipJitOption options[numberOfOptions]; + void *optionVals[numberOfOptions]; + + // Pass a buffer for info messages + options[0] = hipJitOptionInfoLogBuffer; + optionVals[0] = (void *)infoLog_; + // Pass the size of the info buffer + options[1] = hipJitOptionInfoLogBufferSizeBytes; + optionVals[1] = (void *)(long)MAX_LOG_SIZE; + // Pass a buffer for error message + options[2] = hipJitOptionErrorLogBuffer; + optionVals[2] = (void *)errorLog_; + // Pass the size of the error buffer + options[3] = hipJitOptionErrorLogBufferSizeBytes; + optionVals[3] = (void *)(long)MAX_LOG_SIZE; + + auto result = PI_CHECK_ERROR( + hipModuleLoadDataEx(&module_, static_cast(binary_), + numberOfOptions, options, optionVals)); + + + const auto success = (result == PI_SUCCESS); + + buildStatus_ = + success ? 
PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; + + // If no exception, result is correct + return success ? PI_SUCCESS : PI_BUILD_PROGRAM_FAILURE; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// HIP driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to PI and use hipModuleGetFunction to check for a kernel. +std::string getKernelNames(pi_program program) { + std::string source(program->binary_, + program->binary_ + program->binarySizeInBytes_); + std::regex entries_pattern(".entry\\s+([^\\([:s:]]*)"); + std::string names(""); + std::smatch match; + bool first_match = true; + while (std::regex_search(source, match, entries_pattern)) { + assert(match.size() == 2); + names += first_match ? "" : ";"; + names += match[1]; // Second element is the group. + source = match.suffix().str(); + first_match = false; + } + return names; +} + +/// RAII object that calls the reference count release function on the held PI +/// object on destruction. +/// +/// The `dismiss` function stops the release from happening on destruction. 
+template class ReleaseGuard { +private: + T Captive; + + static pi_result callRelease(pi_device Captive) { + return rocm_piDeviceRelease(Captive); + } + + static pi_result callRelease(pi_context Captive) { + return rocm_piContextRelease(Captive); + } + + static pi_result callRelease(pi_mem Captive) { + return rocm_piMemRelease(Captive); + } + + static pi_result callRelease(pi_program Captive) { + return rocm_piProgramRelease(Captive); + } + + static pi_result callRelease(pi_kernel Captive) { + return rocm_piKernelRelease(Captive); + } + + static pi_result callRelease(pi_queue Captive) { + return rocm_piQueueRelease(Captive); + } + + static pi_result callRelease(pi_event Captive) { + return rocm_piEventRelease(Captive); + } + +public: + ReleaseGuard() = delete; + /// Obj can be `nullptr`. + explicit ReleaseGuard(T Obj) : Captive(Obj) {} + ReleaseGuard(ReleaseGuard &&Other) noexcept : Captive(Other.Captive) { + Other.Captive = nullptr; + } + + ReleaseGuard(const ReleaseGuard &) = delete; + + /// Calls the related PI object release function if the object held is not + /// `nullptr` or if `dismiss` has not been called. + ~ReleaseGuard() { + if (Captive != nullptr) { + pi_result ret = callRelease(Captive); + if (ret != PI_SUCCESS) { + // A reported HIP error is either an implementation or an asynchronous + // HIP error for which it is unclear if the function that reported it + // succeeded or not. Either way, the state of the program is compromised + // and likely unrecoverable. + cl::sycl::detail::pi::die( + "Unrecoverable program state reached in rocm_piMemRelease"); + } + } + } + + ReleaseGuard &operator=(const ReleaseGuard &) = delete; + + ReleaseGuard &operator=(ReleaseGuard &&Other) { + Captive = Other.Captive; + Other.Captive = nullptr; + return *this; + } + + /// End the guard and do not release the reference count of the held + /// PI object. 
+ void dismiss() { Captive = nullptr; } +}; + +//-- PI API implementation +extern "C" { + +/// Obtains the HIP platform. +/// There is only one HIP platform, and contains all devices on the system. +/// Triggers the HIP Driver initialization (hipInit) the first time, so this +/// must be the first PI API called. +/// +pi_result rocm_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, + pi_uint32 *num_platforms) { + + try { + static std::once_flag initFlag; + static pi_uint32 numPlatforms = 1; + static _pi_platform platformId; + + if (num_entries == 0 and platforms != nullptr) { + return PI_INVALID_VALUE; + } + if (platforms == nullptr and num_platforms == nullptr) { + return PI_INVALID_VALUE; + } + + pi_result err = PI_SUCCESS; + + std::call_once( + initFlag, + [](pi_result &err) { + if (hipInit(0) != hipSuccess) { + numPlatforms = 0; + return; + } + int numDevices = 0; + err = PI_CHECK_ERROR(hipGetDeviceCount(&numDevices)); + if (numDevices == 0) { + numPlatforms = 0; + return; + } + try { + platformId.devices_.reserve(numDevices); + for (int i = 0; i < numDevices; ++i) { + hipDevice_t device; + err = PI_CHECK_ERROR(hipDeviceGet(&device, i)); + platformId.devices_.emplace_back( + new _pi_device{device, &platformId}); + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + platformId.devices_.clear(); + err = PI_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + platformId.devices_.clear(); + throw; + } + }, + err); + + if (num_platforms != nullptr) { + *num_platforms = numPlatforms; + } + + if (platforms != nullptr) { + *platforms = &platformId; + } + + return err; + } catch (pi_result err) { + return err; + } catch (...) 
{ + return PI_OUT_OF_RESOURCES; + } +} + +pi_result rocm_piPlatformGetInfo(pi_platform platform, + pi_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + assert(platform != nullptr); + + switch (param_name) { + case PI_PLATFORM_INFO_NAME: + return getInfo(param_value_size, param_value, param_value_size_ret, + "AMD ROCM BACKEND"); + case PI_PLATFORM_INFO_VENDOR: + return getInfo(param_value_size, param_value, param_value_size_ret, + "AMD Corporation"); + case PI_PLATFORM_INFO_PROFILE: + return getInfo(param_value_size, param_value, param_value_size_ret, + "FULL PROFILE"); + case PI_PLATFORM_INFO_VERSION: { + auto version = getCudaVersionString(); + return getInfo(param_value_size, param_value, param_value_size_ret, + version.c_str()); + } + case PI_PLATFORM_INFO_EXTENSIONS: { + return getInfo(param_value_size, param_value, param_value_size_ret, ""); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + cl::sycl::detail::pi::die("Platform info request not implemented"); + return {}; +} + +/// \param devices List of devices available on the system +/// \param num_devices Number of elements in the list of devices +/// Requesting a non-GPU device triggers an error, all PI HIP devices +/// are GPUs. +/// +pi_result rocm_piDevicesGet(pi_platform platform, pi_device_type device_type, + pi_uint32 num_entries, pi_device *devices, + pi_uint32 *num_devices) { + + pi_result err = PI_SUCCESS; + const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT; + const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU; + const bool returnDevices = askingForDefault || askingForGPU; + + size_t numDevices = returnDevices ? 
platform->devices_.size() : 0; + + try { + if (num_devices) { + *num_devices = numDevices; + } + + if (returnDevices && devices) { + for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) { + devices[i] = platform->devices_[i].get(); + } + } + + return err; + } catch (pi_result err) { + return err; + } catch (...) { + return PI_OUT_OF_RESOURCES; + } +} + +/// \return PI_SUCCESS if the function is exehipted successfully +/// HIP devices are always root devices so retain always returns success. +pi_result rocm_piDeviceRetain(pi_device device) { return PI_SUCCESS; } + +pi_result rocm_piContextGetInfo(pi_context context, pi_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + switch (param_name) { + case PI_CONTEXT_INFO_NUM_DEVICES: + return getInfo(param_value_size, param_value, param_value_size_ret, 1); + case PI_CONTEXT_INFO_DEVICES: + return getInfo(param_value_size, param_value, param_value_size_ret, + context->get_device()); + case PI_CONTEXT_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + context->get_reference_count()); + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + + return PI_OUT_OF_RESOURCES; +} + +pi_result rocm_piContextRetain(pi_context context) { + assert(context != nullptr); + assert(context->get_reference_count() > 0); + + context->increment_reference_count(); + return PI_SUCCESS; +} + +pi_result rocm_piextContextSetExtendedDeleter( + pi_context context, pi_context_extended_deleter function, void *user_data) { + context->set_extended_deleter(function, user_data); + return PI_SUCCESS; +} + +/// Not applicable to HIP, devices cannot be partitioned. 
///
/// Not implemented: returns a default-constructed (zero) pi_result, as HIP
/// devices cannot be partitioned.
pi_result rocm_piDevicePartition(
    pi_device device,
    const cl_device_partition_property *properties, // TODO: untie from OpenCL
    pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices) {
  return {};
}

/// \return If available, the first binary whose target is AMDGCN.
///
pi_result rocm_piextDeviceSelectBinary(pi_device device,
                                       pi_device_binary *binaries,
                                       pi_uint32 num_binaries,
                                       pi_uint32 *selected_binary) {
  if (!binaries) {
    cl::sycl::detail::pi::die("No list of device images provided");
  }
  if (num_binaries < 1) {
    cl::sycl::detail::pi::die("No binary images in the list");
  }

  // Look for an image for the AMDGCN target, and return the first one that is
  // found
  for (pi_uint32 i = 0; i < num_binaries; i++) {
    if (strcmp(binaries[i]->DeviceTargetSpec,
               __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN) == 0) {
      *selected_binary = i;
      return PI_SUCCESS;
    }
  }

  // No image can be loaded for the given device
  return PI_INVALID_BINARY;
}

/// Not implemented yet: dies unconditionally.
pi_result rocm_piextGetDeviceFunctionPointer(pi_device device,
                                             pi_program program,
                                             const char *function_name,
                                             pi_uint64 *function_pointer_ret) {
  cl::sycl::detail::pi::die(
      "rocm_piextGetDeviceFunctionPointer not implemented");
  return {};
}

/// \return PI_SUCCESS always since HIP devices are always root devices.
+/// +pi_result rocm_piDeviceRelease(pi_device device) { return PI_SUCCESS; } + +pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + static constexpr pi_uint32 max_work_item_dimensions = 3u; + + assert(device != nullptr); + + switch (param_name) { + case PI_DEVICE_INFO_TYPE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_DEVICE_TYPE_GPU); + } + case PI_DEVICE_INFO_VENDOR_ID: { + return getInfo(param_value_size, param_value, param_value_size_ret, 4318u); + } + case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int compute_units = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&compute_units, + hipDeviceAttributeMultiprocessorCount, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(compute_units >= 0); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint32(compute_units)); + } + case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return getInfo(param_value_size, param_value, param_value_size_ret, + max_work_item_dimensions); + } + case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + size_t return_sizes[max_work_item_dimensions]; + + int max_x = 0, max_y = 0, max_z = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(max_x >= 0); + + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(max_y >= 0); + + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(max_z >= 0); + + return_sizes[0] = size_t(max_x); + return_sizes[1] = size_t(max_y); + return_sizes[2] = size_t(max_z); + return getInfoArray(max_work_item_dimensions, param_value_size, param_value, + 
param_value_size_ret, return_sizes); + } + case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int max_work_group_size = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_work_group_size, + hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); + + cl::sycl::detail::pi::assertion(max_work_group_size >= 0); + + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_work_group_size)); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return getInfo(param_value_size, param_value, param_value_size_ret, 0u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return getInfo(param_value_size, param_value, 
param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + } + case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return getInfo(param_value_size, param_value, param_value_size_ret, 0u); + } + case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&max_threads, + hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); + int warpSize = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(maxWarps)); + } + case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int major = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&major, + hipDeviceAttributeComputeCapabilityMajor, + device->get()) == hipSuccess); + bool ifp = (major >= 7); + return getInfo(param_value_size, param_value, param_value_size_ret, ifp); + } + case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int warpSize = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + size_t sizes[1] = {static_cast(warpSize)}; + return getInfoArray(1, param_value_size, param_value, + param_value_size_ret, sizes); + } + case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int clock_freq = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(clock_freq >= 0); + return 
getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint32(clock_freq) / 1000u); + } + case PI_DEVICE_INFO_ADDRESS_BITS: { + auto bits = pi_uint32{std::numeric_limits::digits}; + return getInfo(param_value_size, param_value, param_value_size_ret, bits); + } + case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_HIPSTOM. + + size_t global = 0; + cl::sycl::detail::pi::assertion(hipDeviceTotalMem(&global, device->get()) == + hipSuccess); + + auto quarter_global = static_cast(global / 4u); + + auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), + 32u * 1024u * 1024u); + + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64{max_alloc}); + } + case PI_DEVICE_INFO_IMAGE_SUPPORT: { + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_TRUE); + } + case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to HIP as it doesn't have images, but instead + // surfaces and textures. No clear call in the HIP API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return getInfo(param_value_size, param_value, param_value_size_ret, 128u); + } + case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to HIP as it doesn't have images, but instead + // surfaces and textures. No clear call in the HIP API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return getInfo(param_value_size, param_value, param_value_size_ret, 128u); + } + + case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int tex_height = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_height, + hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_height >= 0); + int surf_height = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_height, + hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + + } + case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_width, + hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_width >= 0); + int surf_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_width, + hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + } + case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. 
+ int tex_height = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_height, + hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_height >= 0); + int surf_height = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_height, + hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + } + case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_width, + hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_width >= 0); + int surf_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_width, + hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + } + case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
+ int tex_depth = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_depth, + hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_depth >= 0); + int surf_depth = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_depth, + hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_depth >= 0); + + int min = std::min(tex_depth, surf_depth); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + } + case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&tex_width, + hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(tex_width >= 0); + int surf_width = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&surf_width, + hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return getInfo(param_value_size, param_value, param_value_size_ret, min); + } + case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(0)); + } + case PI_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for rocm, as samplers don't exist. + // Closest thing is textures, which is 128. + return getInfo(param_value_size, param_value, param_value_size_ret, 128u); + } + case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // https://docs.nvidia.com/rocm/rocm-c-programming-guide/#function-parameters + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. 
+ return getInfo(param_value_size, param_value, param_value_size_ret, + size_t{4000u}); + } + case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int mem_base_addr_align = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&mem_base_addr_align, + hipDeviceAttributeTextureAlignment, + device->get()) == hipSuccess); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + mem_base_addr_align *= 8; + return getInfo(param_value_size, param_value, param_value_size_ret, + mem_base_addr_align); + } + case PI_DEVICE_INFO_HALF_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + return getInfo(param_value_size, param_value, param_value_size_ret, 0u); + } + case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | + PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | + PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return getInfo(param_value_size, param_value, param_value_size_ret, config); + } + case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | + PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; + return getInfo(param_value_size, param_value, param_value_size_ret, config); + } + case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + // TODO: is this config consistent across all NVIDIA GPUs? + return getInfo(param_value_size, param_value, param_value_size_ret, + CL_READ_WRITE_CACHE); + } + case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is dohipmented for all existing GPUs in the HIP programming + // guidelines, section "H.3.2. Global Memory". 
+ return getInfo(param_value_size, param_value, param_value_size_ret, 128u); + } + case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int cache_size = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(cache_size >= 0); + // The L2 cache is global to the GPU. + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(cache_size)); + } + case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. + cl::sycl::detail::pi::assertion(hipDeviceTotalMem(&bytes, device->get()) == + hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64{bytes}); + } + case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int constant_memory = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&constant_memory, + hipDeviceAttributeTotalConstantMemory, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(constant_memory >= 0); + + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(constant_memory)); + } + case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from HIP driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX + // 1060 3GB + return getInfo(param_value_size, param_value, param_value_size_ret, 9u); + } + case PI_DEVICE_INFO_LOCAL_MEM_TYPE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case PI_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to HIP's "shared memory". + // HIP has its own definition of "local memory", which maps to OpenCL's + // "private memory". 
+ int local_mem_size = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&local_mem_size, + hipDeviceAttributeMaxSharedMemoryPerBlock, + device->get()) == hipSuccess); + cl::sycl::detail::pi::assertion(local_mem_size >= 0); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(local_mem_size)); + } + case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ecc_enabled = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled, + device->get()) == hipSuccess); + + cl::sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); + auto result = static_cast(ecc_enabled); + return getInfo(param_value_size, param_value, param_value_size_ret, result); + } + case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int is_integrated = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated, + device->get()) == hipSuccess); + + cl::sycl::detail::pi::assertion((is_integrated == 0) | + (is_integrated == 1)); + auto result = static_cast(is_integrated); + return getInfo(param_value_size, param_value, param_value_size_ret, result); + } + case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX + // 1060 3GB + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t{1000u}); + } + case PI_DEVICE_INFO_ENDIAN_LITTLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, true); + } + case PI_DEVICE_INFO_AVAILABLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, true); + } + case PI_DEVICE_INFO_COMPILER_AVAILABLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, true); + } + case PI_DEVICE_INFO_LINKER_AVAILABLE: { + return getInfo(param_value_size, param_value, param_value_size_ret, true); + } + case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto capability = CL_EXEC_KERNEL; + return 
getInfo(param_value_size, param_value, param_value_size_ret, + capability); + } + case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + auto capability = + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return getInfo(param_value_size, param_value, param_value_size_ret, + capability); + } + case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + // The mandated minimum capability: + auto capability = CL_QUEUE_PROFILING_ENABLE; + return getInfo(param_value_size, param_value, param_value_size_ret, + capability); + } + case PI_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. + return getInfo(param_value_size, param_value, param_value_size_ret, ""); + } + case PI_DEVICE_INFO_PLATFORM: { + return getInfo(param_value_size, param_value, param_value_size_ret, + device->get_platform()); + } + case PI_DEVICE_INFO_NAME: { + static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; + char name[MAX_DEVICE_NAME_LENGTH]; + cl::sycl::detail::pi::assertion( + hipDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, device->get()) == + hipSuccess); + return getInfoArray(strlen(name) + 1, param_value_size, param_value, + param_value_size_ret, name); + } + case PI_DEVICE_INFO_VENDOR: { + return getInfo(param_value_size, param_value, param_value_size_ret, + "NVIDIA Corporation"); + } + case PI_DEVICE_INFO_DRIVER_VERSION: { + auto version = getCudaVersionString(); + return getInfo(param_value_size, param_value, param_value_size_ret, + version.c_str()); + } + case PI_DEVICE_INFO_PROFILE: { + return getInfo(param_value_size, param_value, param_value_size_ret, "HIP"); + } + case PI_DEVICE_INFO_REFERENCE_COUNT: { + return getInfo(param_value_size, param_value, param_value_size_ret, + device->get_reference_count()); + } + case PI_DEVICE_INFO_VERSION: { + return getInfo(param_value_size, param_value, param_value_size_ret, + "PI 0.0"); + } + case PI_DEVICE_INFO_OPENCL_C_VERSION: { + 
return getInfo(param_value_size, param_value, param_value_size_ret, ""); + } + case PI_DEVICE_INFO_EXTENSIONS: { + return getInfo(param_value_size, param_value, param_value_size_ret, ""); + } + case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t{1024u}); + } + case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return getInfo(param_value_size, param_value, param_value_size_ret, true); + } + case PI_DEVICE_INFO_PARENT_DEVICE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + nullptr); + } + case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return getInfo(param_value_size, param_value, param_value_size_ret, 0u); + } + case PI_DEVICE_INFO_PARTITION_PROPERTIES: { + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(0u)); + } + case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return getInfo(param_value_size, param_value, param_value_size_ret, 0u); + } + case PI_DEVICE_INFO_PARTITION_TYPE: { + // TODO: uncouple from OpenCL + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(0u)); + } + + // Intel USM extensions + + case PI_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + pi_bitfield value = {}; + //if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | + PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; + } + //} + return getInfo(param_value_size, param_value, param_value_size_ret, value); + } + case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | + PI_USM_CONCURRENT_ACCESS | + PI_USM_CONCURRENT_ATOMIC_ACCESS; + return getInfo(param_value_size, param_value, param_value_size_ret, value); + } + case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." 
+ // + // query if/how the device can access managed memory associated to it + pi_bitfield value = {}; + if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + // the device can allocate managed memory on this system + value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { + // the device can coherently access managed memory concurrently with the + // CPU + value |= PI_USM_CONCURRENT_ACCESS; + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; + } + } + return getInfo(param_value_size, param_value, param_value_size_ret, value); + } + case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+ // + // query if/how the device can access managed memory associated to other + // devices + pi_bitfield value = {}; + if (getAttribute(device, hipDeviceAttributeManagedMemory)) { + // the device can allocate managed memory on this system + value |= PI_USM_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeConcurrentManagedAccess)) { + // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + // attribute can coherently access managed memory concurrently with the + // CPU + value |= PI_USM_CONCURRENT_ACCESS; + } + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + if (value & PI_USM_ACCESS) + value |= PI_USM_ATOMIC_ACCESS; + if (value & PI_USM_CONCURRENT_ACCESS) + value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; + } + return getInfo(param_value_size, param_value, param_value_size_ret, value); + } + case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The shared system memory access capabilities apply to any allocations + // made by a system allocator, such as malloc or new." 
+ //
+ // query if/how the device can access pageable host memory allocated by the
+ // system allocator
+ pi_bitfield value = {};
+ if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) {
+ // the device supports coherently accessing pageable memory without
+ // calling hipHostRegister on it
+ /*
+ if (getAttribute(device,
+ HIP_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
+ // the link between the device and the host supports native atomic
+ // operations
+ value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
+ PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
+ }
+ else */
+ {
+ // the link between the device and the host does not support native
+ // atomic operations
+ value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
+ }
+ }
+ return getInfo(param_value_size, param_value, param_value_size_ret, value);
+ }
+
+ // TODO: Investigate if this information is available on HIP.
+ case PI_DEVICE_INFO_PCI_ADDRESS:
+ case PI_DEVICE_INFO_GPU_EU_COUNT:
+ case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
+ case PI_DEVICE_INFO_GPU_SLICES:
+ case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
+ case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
+ case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH:
+ return PI_INVALID_VALUE;
+
+ default:
+ __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
+ }
+ cl::sycl::detail::pi::die("Device info request not implemented");
+ return {};
+}
+
+/// Gets the native HIP handle of a PI device object
+///
+/// \param[in] device The PI device to get the native HIP object of.
+/// \param[out] nativeHandle Set to the native handle of the PI device object.
+///
+/// \return PI_SUCCESS
+pi_result rocm_piextDeviceGetNativeHandle(pi_device device,
+ pi_native_handle *nativeHandle) {
+ *nativeHandle = static_cast(device->get());
+ return PI_SUCCESS;
+}
+
+/// Created a PI device object from a HIP device handle.
+/// TODO: Implement this.
+/// NOTE: The created PI object takes ownership of the native handle.
+/// +/// \param[in] nativeHandle The native handle to create PI device object from. +/// \param[in] platform is the PI platform of the device. +/// \param[out] device Set to the PI device object created from native handle. +/// +/// \return TBD +pi_result rocm_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_platform platform, + pi_device *device) { + cl::sycl::detail::pi::die( + "Creation of PI device from native handle not implemented"); + return {}; +} + +/* Context APIs */ + +/// Create a PI HIP context. +/// +/// By default creates a scoped context and keeps the last active HIP context +/// on top of the HIP context stack. +/// With the __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY key/id and a value of +/// PI_TRUE creates a primary HIP context and activates it on the HIP context +/// stack. +/// +/// \param[in] properties 0 terminated array of key/id-value combinations. Can +/// be nullptr. Only accepts property key/id +/// __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY with a pi_bool value. +/// \param[in] num_devices Number of devices to create the context for. +/// \param[in] devices Devices to create the context for. +/// \param[in] pfn_notify Callback, currently unused. +/// \param[in] user_data User data for callback. +/// \param[out] retcontext Set to created context on success. +/// +/// \return PI_SUCCESS on success, otherwise an error return code. +pi_result rocm_piContextCreate(const pi_context_properties *properties, + pi_uint32 num_devices, const pi_device *devices, + void (*pfn_notify)(const char *errinfo, + const void *private_info, + size_t cb, void *user_data), + void *user_data, pi_context *retcontext) { + + assert(devices != nullptr); + // TODO: How to implement context callback? + assert(pfn_notify == nullptr); + assert(user_data == nullptr); + assert(num_devices == 1); + // Need input context + assert(retcontext != nullptr); + pi_result errcode_ret = PI_SUCCESS; + + // Parse properties. 
+ bool property_rocm_primary = false;
+ while (properties && (0 != *properties)) {
+ // Consume property ID.
+ pi_context_properties id = *properties;
+ ++properties;
+ // Consume property value.
+ pi_context_properties value = *properties;
+ ++properties;
+ switch (id) {
+ case __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY:
+ assert(value == PI_FALSE || value == PI_TRUE);
+ property_rocm_primary = static_cast(value);
+ break;
+ default:
+ // Unknown property.
+ assert(!"Unknown piContextCreate property in property list");
+ return PI_INVALID_VALUE;
+ }
+ }
+
+ std::unique_ptr<_pi_context> piContextPtr{nullptr};
+ try {
+ hipCtx_t current = nullptr;
+
+ if (property_rocm_primary) {
+ // Use the HIP primary context and assume that we want to use it
+ // immediately as we want to forge context switches.
+ hipCtx_t Ctxt;
+ errcode_ret =
+ PI_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
+ piContextPtr = std::unique_ptr<_pi_context>(
+ new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
+ errcode_ret = PI_CHECK_ERROR(hipCtxPushCurrent(Ctxt));
+ } else {
+ // Create a scoped context.
+ hipCtx_t newContext;
+ PI_CHECK_ERROR(hipCtxGetCurrent(&current));
+ errcode_ret = PI_CHECK_ERROR(
+ hipCtxCreate(&newContext, hipDeviceMapHost, devices[0]->get()));
+ piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
+ _pi_context::kind::user_defined, newContext, *devices});
+ }
+
+ // Use default stream to record base event counter
+ PI_CHECK_ERROR(hipEventCreateWithFlags(&piContextPtr->evBase_, hipEventDefault));
+ PI_CHECK_ERROR(hipEventRecord(piContextPtr->evBase_, 0));
+
+ // For non-primary scoped contexts keep the last active on top of the stack
+ // as `hipCtxCreate` replaces it implicitly otherwise.
+ // Primary contexts are kept on top of the stack, so the previous context
+ // is not queried and therefore not recovered.
+ if (current != nullptr) { + PI_CHECK_ERROR(hipCtxSetCurrent(current)); + } + + *retcontext = piContextPtr.release(); + } catch (pi_result err) { + errcode_ret = err; + } catch (...) { + errcode_ret = PI_OUT_OF_RESOURCES; + } + return errcode_ret; +} + +pi_result rocm_piContextRelease(pi_context ctxt) { + + assert(ctxt != nullptr); + + if (ctxt->decrement_reference_count() > 0) { + return PI_SUCCESS; + } + ctxt->invoke_extended_deleters(); + + std::unique_ptr<_pi_context> context{ctxt}; + + PI_CHECK_ERROR(hipEventDestroy(context->evBase_)); + + + if (!ctxt->is_primary()) { + hipCtx_t hipCtxt = ctxt->get(); + hipCtx_t current = nullptr; + PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); + if (hipCtxt != current) { + PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt)); + } + // hipErrorNotSupported this API + //PI_CHECK_ERROR(hipCtxSynchronize()); + PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); + if (hipCtxt == current) { + PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); + } + return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt)); + } else { + // Primary context is not destroyed, but released + hipDevice_t hipDev = ctxt->get_device()->get(); + hipCtx_t current; + PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); + return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev)); + } + + hipCtx_t hipCtxt = ctxt->get(); + return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt)); +} + +/// Gets the native HIP handle of a PI context object +/// +/// \param[in] context The PI context to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the PI context object. +/// +/// \return PI_SUCCESS +pi_result rocm_piextContextGetNativeHandle(pi_context context, + pi_native_handle *nativeHandle) { + *nativeHandle = reinterpret_cast(context->get()); + return PI_SUCCESS; +} + +/// Created a PI context object from a HIP context handle. +/// TODO: Implement this. +/// NOTE: The created PI object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create PI context object from. 
+/// \param[out] context Set to the PI context object created from native handle. +/// +/// \return TBD +pi_result rocm_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_uint32 num_devices, + const pi_device *devices, + bool ownNativeHandle, + pi_context *context) { + cl::sycl::detail::pi::die( + "Creation of PI context from native handle not implemented"); + return {}; +} + +/// Creates a PI Memory object using a HIP memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using cuHostRegister +/// +pi_result rocm_piMemBufferCreate(pi_context context, pi_mem_flags flags, + size_t size, void *host_ptr, pi_mem *ret_mem, + const pi_mem_properties *properties) { + // Need input memory object + assert(ret_mem != nullptr); + assert(properties == nullptr && "no mem properties goes to rocm RT yet"); + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. 
+ const bool enableUseHostPtr = false;
+ const bool performInitialCopy =
+ (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
+ ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr);
+ pi_result retErr = PI_SUCCESS;
+ pi_mem retMemObj = nullptr;
+
+ try {
+ ScopedContext active(context);
+ hipDevPtr ptr;
+ _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
+ _pi_mem::mem_::buffer_mem_::alloc_mode::classic;
+
+ if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
+ retErr = PI_CHECK_ERROR(
+ hipHostRegister(host_ptr, size, hipHostRegisterMapped));
+ retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
+ allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr;
+ } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
+ retErr = PI_CHECK_ERROR(hipHostMalloc(&host_ptr, size));
+ retErr = PI_CHECK_ERROR(hipHostGetDevicePointer(&ptr, host_ptr, 0));
+ allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
+ } else {
+ retErr = PI_CHECK_ERROR(hipMalloc(&ptr, size));
+ if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
+ allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in;
+ }
+ }
+
+ if (retErr == PI_SUCCESS) {
+ pi_mem parentBuffer = nullptr;
+
+ auto piMemObj = std::unique_ptr<_pi_mem>(
+ new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size});
+ if (piMemObj != nullptr) {
+ retMemObj = piMemObj.release();
+ if (performInitialCopy) {
+ // Operates on the default stream of the current HIP context.
+ retErr = PI_CHECK_ERROR(hipMemcpyHtoD(ptr, host_ptr, size));
+ // Synchronize with default stream implicitly used by hipMemcpyHtoD
+ // to make buffer data available on device before any other PI call
+ // uses it.
+ if (retErr == PI_SUCCESS) {
+ hipStream_t defaultStream = 0;
+ retErr = PI_CHECK_ERROR(hipStreamSynchronize(defaultStream));
+ }
+ }
+ } else {
+ retErr = PI_OUT_OF_HOST_MEMORY;
+ }
+ }
+ } catch (pi_result err) {
+ retErr = err;
+ } catch (...)
{ + retErr = PI_OUT_OF_RESOURCES; + } + + *ret_mem = retMemObj; + + return retErr; +} + +/// Decreases the reference count of the Mem object. +/// If this is zero, calls the relevant HIP Free function +/// \return PI_SUCCESS unless deallocation error +/// +pi_result rocm_piMemRelease(pi_mem memObj) { + assert((memObj != nullptr) && "PI_INVALID_MEM_OBJECTS"); + + pi_result ret = PI_SUCCESS; + + try { + + // Do nothing if there are other references + if (memObj->decrement_reference_count() > 0) { + return PI_SUCCESS; + } + + // make sure memObj is released in case PI_CHECK_ERROR throws + std::unique_ptr<_pi_mem> uniqueMemObj(memObj); + + if (memObj->is_sub_buffer()) { + return PI_SUCCESS; + } + + ScopedContext active(uniqueMemObj->get_context()); + + if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { + switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { + case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: + case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: + ret = PI_CHECK_ERROR(hipFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + break; + case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: + ret = PI_CHECK_ERROR( + hipHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + break; + case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: + ret = PI_CHECK_ERROR( + hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + }; + } + + else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { + ret = PI_CHECK_ERROR( + hipDestroySurfaceObject(uniqueMemObj->mem_.surface_mem_.get_surface())); + auto array = uniqueMemObj->mem_.surface_mem_.get_array(); + ret = PI_CHECK_ERROR( + hipFreeArray(&array)); + } + + + } catch (pi_result err) { + ret = err; + } catch (...) { + ret = PI_OUT_OF_RESOURCES; + } + + if (ret != PI_SUCCESS) { + // A reported HIP error is either an implementation or an asynchronous HIP + // error for which it is unclear if the function that reported it succeeded + // or not. 
Either way, the state of the program is compromised and likely
+ // unrecoverable.
+ cl::sycl::detail::pi::die(
+ "Unrecoverable program state reached in rocm_piMemRelease");
+ }
+
+ return PI_SUCCESS;
+}
+
+/// Implements a buffer partition in the HIP backend.
+/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented
+/// as an offset over an existing HIP allocation.
+///
+pi_result rocm_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
+ pi_buffer_create_type buffer_create_type,
+ void *buffer_create_info, pi_mem *memObj) {
+ assert((parent_buffer != nullptr) && "PI_INVALID_MEM_OBJECT");
+ assert(parent_buffer->is_buffer() && "PI_INVALID_MEM_OBJECTS");
+ assert(!parent_buffer->is_sub_buffer() && "PI_INVALID_MEM_OBJECT");
+
+ // Default value for flags means PI_MEM_FLAGS_ACCESS_RW.
+ if (flags == 0) {
+ flags = PI_MEM_FLAGS_ACCESS_RW;
+ }
+
+ assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_INVALID_VALUE");
+ assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) &&
+ "PI_INVALID_VALUE");
+ assert((buffer_create_info != nullptr) && "PI_INVALID_VALUE");
+ assert(memObj != nullptr);
+
+ const auto bufferRegion =
+ *reinterpret_cast(buffer_create_info);
+ assert((bufferRegion.size != 0u) && "PI_INVALID_BUFFER_SIZE");
+
+ assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
+ "Overflow");
+ assert(((bufferRegion.origin + bufferRegion.size) <=
+ parent_buffer->mem_.buffer_mem_.get_size()) &&
+ "PI_INVALID_BUFFER_SIZE");
+ // Retained indirectly due to retaining parent buffer below.
+ pi_context context = parent_buffer->context_; + _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = + _pi_mem::mem_::buffer_mem_::alloc_mode::classic; + + assert(parent_buffer->mem_.buffer_mem_.ptr_ != + _pi_mem::mem_::buffer_mem_::native_type{0}); + _pi_mem::mem_::buffer_mem_::native_type ptr = + (uint8_t*)(parent_buffer->mem_.buffer_mem_.ptr_ )+ bufferRegion.origin; + + void *hostPtr = nullptr; + if (parent_buffer->mem_.buffer_mem_.hostPtr_) { + hostPtr = static_cast(parent_buffer->mem_.buffer_mem_.hostPtr_) + + bufferRegion.origin; + } + + ReleaseGuard releaseGuard(parent_buffer); + + std::unique_ptr<_pi_mem> retMemObj{nullptr}; + try { + ScopedContext active(context); + + retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ + context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; + } catch (pi_result err) { + *memObj = nullptr; + return err; + } catch (...) { + *memObj = nullptr; + return PI_OUT_OF_HOST_MEMORY; + } + + releaseGuard.dismiss(); + *memObj = retMemObj.release(); + return PI_SUCCESS; +} + +pi_result rocm_piMemGetInfo(pi_mem memObj, cl_mem_info queriedInfo, + size_t expectedQuerySize, void *queryOutput, + size_t *writtenQuerySize) { + + cl::sycl::detail::pi::die("rocm_piMemGetInfo not implemented"); +} + +/// Gets the native HIP handle of a PI mem object +/// +/// \param[in] mem The PI mem to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the PI mem object. +/// +/// \return PI_SUCCESS +/* +pi_result rocm_piextMemGetNativeHandle(pi_mem mem, + pi_native_handle *nativeHandle) { + *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); + return PI_SUCCESS; +} +*/ + +/// Created a PI mem object from a HIP mem handle. +/// TODO: Implement this. +/// NOTE: The created PI object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create PI mem object from. +/// \param[out] mem Set to the PI mem object created from native handle. 
+/// +/// \return TBD +pi_result rocm_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_mem *mem) { + cl::sycl::detail::pi::die( + "Creation of PI mem from native handle not implemented"); + return {}; +} + +/// Creates a `pi_queue` object on the HIP backend. +/// Valid properties +/// * __SYCL_PI_HIP_USE_DEFAULT_STREAM -> hipStreamDefault +/// * __SYCL_PI_HIP_SYNC_WITH_DEFAULT -> hipStreamNonBlocking +/// \return Pi queue object mapping to a HIPStream +/// +pi_result rocm_piQueueCreate(pi_context context, pi_device device, + pi_queue_properties properties, pi_queue *queue) { + try { + pi_result err = PI_SUCCESS; + + std::unique_ptr<_pi_queue> queueImpl{nullptr}; + + if (context->get_device() != device) { + *queue = nullptr; + return PI_INVALID_DEVICE; + } + + ScopedContext active(context); + + hipStream_t hipStream; + + /* + unsigned int flags = 0; + + if (properties == __SYCL_PI_HIP_USE_DEFAULT_STREAM) { + flags = hipStreamDefault; + } else if (properties == __SYCL_PI_HIP_SYNC_WITH_DEFAULT) { + flags = 0; + } else { + flags = hipStreamNonBlocking; + } + */ + + err = PI_CHECK_ERROR(hipStreamCreate(&hipStream)); + if (err != PI_SUCCESS) { + return err; + } + + queueImpl = std::unique_ptr<_pi_queue>( + new _pi_queue{hipStream, context, device, properties}); + + *queue = queueImpl.release(); + + return PI_SUCCESS; + } catch (pi_result err) { + + return err; + + } catch (...) 
{ + + return PI_OUT_OF_RESOURCES; + } +} + +pi_result rocm_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + assert(command_queue != nullptr); + + switch (param_name) { + case PI_QUEUE_INFO_CONTEXT: + return getInfo(param_value_size, param_value, param_value_size_ret, + command_queue->context_); + case PI_QUEUE_INFO_DEVICE: + return getInfo(param_value_size, param_value, param_value_size_ret, + command_queue->device_); + case PI_QUEUE_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + command_queue->get_reference_count()); + case PI_QUEUE_INFO_PROPERTIES: + return getInfo(param_value_size, param_value, param_value_size_ret, + command_queue->properties_); + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + cl::sycl::detail::pi::die("Queue info request not implemented"); + return {}; +} + +pi_result rocm_piQueueRetain(pi_queue command_queue) { + assert(command_queue != nullptr); + assert(command_queue->get_reference_count() > 0); + + command_queue->increment_reference_count(); + return PI_SUCCESS; +} + +pi_result rocm_piQueueRelease(pi_queue command_queue) { + assert(command_queue != nullptr); + + if (command_queue->decrement_reference_count() > 0) { + return PI_SUCCESS; + } + + try { + std::unique_ptr<_pi_queue> queueImpl(command_queue); + + ScopedContext active(command_queue->get_context()); + + auto stream = queueImpl->stream_; + PI_CHECK_ERROR(hipStreamSynchronize(stream)); + PI_CHECK_ERROR(hipStreamDestroy(stream)); + + return PI_SUCCESS; + } catch (pi_result err) { + return err; + } catch (...) 
{
+ return PI_OUT_OF_RESOURCES;
+ }
+}
+
+pi_result rocm_piQueueFinish(pi_queue command_queue) {
+
+ // set default result to a negative result (avoid false-positive tests)
+ pi_result result = PI_OUT_OF_HOST_MEMORY;
+
+ try {
+
+ assert(command_queue !=
+ nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code
+ ScopedContext active(command_queue->get_context());
+ result = PI_CHECK_ERROR(hipStreamSynchronize(command_queue->stream_));
+
+ } catch (pi_result err) {
+
+ result = err;
+
+ } catch (...) {
+
+ result = PI_OUT_OF_RESOURCES;
+ }
+
+ return result;
+}
+
+/// Gets the native HIP handle of a PI queue object
+///
+/// \param[in] queue The PI queue to get the native HIP object of.
+/// \param[out] nativeHandle Set to the native handle of the PI queue object.
+///
+/// \return PI_SUCCESS
+pi_result rocm_piextQueueGetNativeHandle(pi_queue queue,
+ pi_native_handle *nativeHandle) {
+ *nativeHandle = reinterpret_cast(queue->get());
+ return PI_SUCCESS;
+}
+
+/// Created a PI queue object from a HIP queue handle.
+/// TODO: Implement this.
+/// NOTE: The created PI object takes ownership of the native handle.
+///
+/// \param[in] nativeHandle The native handle to create PI queue object from.
+/// \param[in] context is the PI context of the queue.
+/// \param[out] queue Set to the PI queue object created from native handle.
+/// +/// \return TBD +pi_result rocm_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_context context, + pi_queue *queue) { + cl::sycl::detail::pi::die( + "Creation of PI queue from native handle not implemented"); + return {}; +} + +pi_result rocm_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_write, size_t offset, + size_t size, void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + + assert(buffer != nullptr); + assert(command_queue != nullptr); + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + retErr = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue)); + retImplEv->start(); + } + + retErr = + PI_CHECK_ERROR(hipMemcpyHtoDAsync((uint8_t*)devPtr + offset, ptr, size, hipStream)); + + if (event) { + retErr = retImplEv->record(); + } + + if (blocking_write) { + retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (event) { + *event = retImplEv.release(); + } + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_read, size_t offset, + size_t size, void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + + assert(buffer != nullptr); + assert(command_queue != nullptr); + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + 
retErr = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue)); + retImplEv->start(); + } + + retErr = + PI_CHECK_ERROR(hipMemcpyDtoHAsync(ptr, (uint8_t*)devPtr + offset, size, hipStream)); + + if (event) { + retErr = retImplEv->record(); + } + + if (blocking_read) { + retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (event) { + *event = retImplEv.release(); + } + + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { + + try { + assert(num_events != 0); + assert(event_list); + if (num_events == 0) { + return PI_INVALID_VALUE; + } + + if (!event_list) { + return PI_INVALID_EVENT; + } + + auto context = event_list[0]->get_context(); + ScopedContext active(context); + + auto waitFunc = [context](pi_event event) -> pi_result { + if (!event) { + return PI_INVALID_EVENT; + } + + if (event->get_context() != context) { + return PI_INVALID_CONTEXT; + } + + return event->wait(); + }; + return forLatestEvents(event_list, num_events, waitFunc); + } catch (pi_result err) { + return err; + } catch (...) 
{ + return PI_OUT_OF_RESOURCES; + } +} + +pi_result rocm_piKernelCreate(pi_program program, const char *kernel_name, + pi_kernel *kernel) { + assert(kernel != nullptr); + assert(program != nullptr); + + pi_result retErr = PI_SUCCESS; + std::unique_ptr<_pi_kernel> retKernel{nullptr}; + + try { + ScopedContext active(program->get_context()); + + hipFunction_t hipFunc; + retErr = PI_CHECK_ERROR( + hipModuleGetFunction(&hipFunc, program->get(), kernel_name)); + + std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; + hipFunction_t hipFuncWithOffsetParam; + hipError_t offsetRes = hipModuleGetFunction( + &hipFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == hipErrorNotFound) { + hipFuncWithOffsetParam = nullptr; + } else { + retErr = PI_CHECK_ERROR(offsetRes); + } + + retKernel = std::unique_ptr<_pi_kernel>( + new _pi_kernel{hipFunc, hipFuncWithOffsetParam, kernel_name, program, + program->get_context()}); + } catch (pi_result err) { + retErr = err; + } catch (...) 
{ + retErr = PI_OUT_OF_HOST_MEMORY; + } + + *kernel = retKernel.release(); + return retErr; +} + +pi_result rocm_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, + size_t arg_size, const void *arg_value) { + + assert(kernel != nullptr); + pi_result retErr = PI_SUCCESS; + try { + if (arg_value) { + kernel->set_kernel_arg(arg_index, arg_size, arg_value); + } else { + kernel->set_kernel_local_arg(arg_index, arg_size); + } + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, + const pi_mem *arg_value) { + + assert(kernel != nullptr); + assert(arg_value != nullptr); + + pi_result retErr = PI_SUCCESS; + try { + pi_mem arg_mem = *arg_value; + + if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { + auto array = arg_mem->mem_.surface_mem_.get_array(); + if (array.Format != HIP_AD_FORMAT_UNSIGNED_INT32 && + array.Format != HIP_AD_FORMAT_SIGNED_INT32 && + array.Format != HIP_AD_FORMAT_HALF && + array.Format != HIP_AD_FORMAT_FLOAT) { + cl::sycl::detail::pi::die( + "PI HIP kernels only support images with channel types int32, " + "uint32, float, and half."); + } + hipSurfaceObject_t hipSurf = arg_mem->mem_.surface_mem_.get_surface(); + kernel->set_kernel_arg(arg_index, sizeof(hipSurf), (void *)&hipSurf); + } else + + { + hipDevPtr hipPtr = arg_mem->mem_.buffer_mem_.get(); + kernel->set_kernel_arg(arg_index, sizeof(hipDevPtr), (void *)&hipPtr); + } + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, + const pi_sampler *arg_value) { + + assert(kernel != nullptr); + assert(arg_value != nullptr); + + pi_result retErr = PI_SUCCESS; + try { + pi_uint32 samplerProps = (*arg_value)->props_; + kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEnqueueKernelLaunch( + pi_queue 
command_queue, pi_kernel kernel, pi_uint32 work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + // Preconditions + assert(command_queue != nullptr); + assert(command_queue->get_context() == kernel->get_context()); + assert(kernel != nullptr); + assert(global_work_offset != nullptr); + assert(work_dim > 0); + assert(work_dim < 4); + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + int threadsPerBlock[3] = {32, 1, 1}; + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + bool providedLocalWorkGroupSize = (local_work_size != nullptr); + + { + pi_result retError = rocm_piDeviceGetInfo( + command_queue->device_, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); + assert(retError == PI_SUCCESS); + (void)retError; + + retError = rocm_piDeviceGetInfo( + command_queue->device_, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); + assert(retError == PI_SUCCESS); + // The maxWorkGroupsSize = 1024 for AMD GPU + // The maxThreadsPerBlock = {1024, 1024, 1024} + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + if (local_work_size[dim] > maxThreadsPerBlock[dim]) + return PI_INVALID_WORK_ITEM_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than the + // global work sizes and not 0. 
+ if (0u == local_work_size[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + if (0u != (global_work_size[dim] % local_work_size[dim])) + return PI_INVALID_WORK_GROUP_SIZE; + threadsPerBlock[dim] = static_cast(local_work_size[dim]); + return PI_SUCCESS; + }; + + for (size_t dim = 0; dim < work_dim; dim++) { + auto err = isValid(dim); + if (err != PI_SUCCESS) + return err; + } + } else { + simpleGuessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, + kernel); + } + } + + /* + if (maxWorkGroupSize < + size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + return PI_INVALID_WORK_GROUP_SIZE; + } + */ + int blocksPerGrid[3] = {1, 1, 1}; + + + for (size_t i = 0; i < work_dim; i++) { + blocksPerGrid[i] = + static_cast(global_work_size[i] + threadsPerBlock[i] - 1) / + threadsPerBlock[i]; + } + + + pi_result retError = PI_SUCCESS; + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + hipStream_t hipStream = command_queue->get(); + hipFunction_t hipFunc = kernel->get(); + + retError = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + // Set the implicit global offset parameter if kernel has offset variant + if (kernel->get_with_offset_parameter()) { + std::uint32_t rocm_implicit_offset[3] = {0, 0, 0}; + if (global_work_offset) { + for (size_t i = 0; i < work_dim; i++) { + rocm_implicit_offset[i] = + static_cast(global_work_offset[i]); + if (global_work_offset[i] != 0) { + hipFunc = kernel->get_with_offset_parameter(); + } + } + } + kernel->set_implicit_offset_arg(sizeof(rocm_implicit_offset), + rocm_implicit_offset); + } + + auto argIndices = kernel->get_arg_indices(); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue)); + retImplEv->start(); + } + + /* + retError = PI_CHECK_ERROR(hipModuleLaunchKernel( + hipFunc, blocksPerGrid[0], blocksPerGrid[1], 
blocksPerGrid[2], + threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], + kernel->get_local_size(), hipStream, argIndices.data(), nullptr)); + */ + + retError = PI_CHECK_ERROR(hipModuleLaunchKernel( + hipFunc, blocksPerGrid[0], 1, 1, + threadsPerBlock[0], 1, 1, + kernel->get_local_size(), hipStream, argIndices.data(), nullptr)); + + kernel->clear_local_size(); + if (event) { + retError = retImplEv->record(); + } + + if (event) { + *event = retImplEv.release(); + } + } catch (pi_result err) { + retError = err; + } + return retError; +} + +/// \TODO Not implemented +pi_result rocm_piEnqueueNativeKernel( + pi_queue queue, void (*user_func)(void *), void *args, size_t cb_args, + pi_uint32 num_mem_objects, const pi_mem *mem_list, + const void **args_mem_loc, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + cl::sycl::detail::pi::die("Not implemented in HIP backend"); + return {}; +} + +/// \TODO Not implemented + + +pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, + const pi_image_format *image_format, + const pi_image_desc *image_desc, void *host_ptr, + pi_mem *ret_mem) { + + // Need input memory object + assert(ret_mem != nullptr); + const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || + ((flags & PI_MEM_FLAGS_HOST_PTR_USE)); + pi_result retErr = PI_SUCCESS; + + // We only support RBGA channel order + // TODO: check SYCL CTS and spec. May also have to support BGRA + if (image_format->image_channel_order != + pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) { + cl::sycl::detail::pi::die( + "rocm_piMemImageCreate only supports RGBA channel order"); + } + + // We have to use cuArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives + // a minimum value of 1, so we need to convert the answer. 
+ HIP_ARRAY3D_DESCRIPTOR array_desc; + array_desc.NumChannels = 4; // Only support 4 channel image + array_desc.Flags = 0; // No flags required + array_desc.Width = image_desc->image_width; + if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { + array_desc.Height = 0; + array_desc.Depth = 0; + } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { + array_desc.Height = image_desc->image_height; + array_desc.Depth = 0; + } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { + array_desc.Height = image_desc->image_height; + array_desc.Depth = image_desc->image_depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t pixel_type_size_bytes; + + switch (image_format->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + array_desc.Format = HIP_AD_FORMAT_HALF; + pixel_type_size_bytes = 2; + break; + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + array_desc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + array_desc.Format = HIP_AD_FORMAT_SIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case PI_IMAGE_CHANNEL_TYPE_FLOAT: + array_desc.Format = HIP_AD_FORMAT_FLOAT; + pixel_type_size_bytes = 4; + break; + default: + cl::sycl::detail::pi::die( + "rocm_piMemImageCreate given unsupported image_channel_data_type"); + } + + // When a 
dimension isn't used image_desc has the size set to 1 + size_t pixel_size_bytes = + pixel_type_size_bytes * 4; // 4 is the only number of channels we support + size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * + image_desc->image_height * image_desc->image_depth; + + ScopedContext active(context); + hipArray *image_array; + retErr = PI_CHECK_ERROR(hipArray3DCreate(&image_array, &array_desc)); + + try { + if (performInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { + retErr = PI_CHECK_ERROR( + hipMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes)); + } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { + hip_Memcpy2D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + cpy_desc.srcHost = host_ptr; + cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; + cpy_desc.Height = image_desc->image_height; + retErr = PI_CHECK_ERROR(hipMemcpyParam2D(&cpy_desc)); + } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { + HIP_MEMCPY3D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + cpy_desc.srcHost = host_ptr; + cpy_desc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; + cpy_desc.Height = image_desc->image_height; + cpy_desc.Depth = image_desc->image_depth; + retErr = PI_CHECK_ERROR(hipDrvMemcpy3D(&cpy_desc)); + } + } + + // HIP_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/rocm/rocm-driver-api/group__HIP__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // 
https://docs.nvidia.com/rocm/rocm-driver-api/group__HIP__SURFOBJECT.html + // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and + // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array + // handle. + // HIP_RESOURCE_DESC::flags must be set to zero + + hipResourceDesc image_res_desc; + image_res_desc.res.array.array = image_array; + image_res_desc.resType = hipResourceTypeArray; + + hipSurfaceObject_t surface; + retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc)); + + auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{context, *image_array, surface, image_desc->image_type, host_ptr}); + + + if (piMemObj == nullptr) { + return PI_OUT_OF_HOST_MEMORY; + } + + *ret_mem = piMemObj.release(); + } catch (pi_result err) { + PI_CHECK_ERROR(hipFreeArray(image_array)); + return err; + } catch (...) { + PI_CHECK_ERROR(hipFreeArray(image_array)); + return PI_ERROR_UNKNOWN; + } + return retErr; +} + + +/// \TODO Not implemented +pi_result rocm_piMemImageGetInfo(pi_mem image, pi_image_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + cl::sycl::detail::pi::die("rocm_piMemImageGetInfo not implemented"); + return {}; +} + +pi_result rocm_piMemRetain(pi_mem mem) { + assert(mem != nullptr); + assert(mem->get_reference_count() > 0); + mem->increment_reference_count(); + return PI_SUCCESS; +} + +/// Not used as HIP backend only creates programs from binary. +/// See \ref rocm_piclProgramCreateWithBinary. +/// +pi_result rocm_piclProgramCreateWithSource(pi_context context, pi_uint32 count, + const char **strings, + const size_t *lengths, + pi_program *program) { + cl::sycl::detail::pi::hipPrint( + "rocm_piclProgramCreateWithSource not implemented"); + return PI_INVALID_OPERATION; +} + +/// Loads the images from a PI program into a HIPmodule that can be +/// used later on to extract functions (kernels). +/// See \ref _pi_program for implementation details. 
+/// +pi_result rocm_piProgramBuild(pi_program program, pi_uint32 num_devices, + const pi_device *device_list, const char *options, + void (*pfn_notify)(pi_program program, + void *user_data), + void *user_data) { + + assert(program != nullptr); + assert(num_devices == 1 || num_devices == 0); + assert(device_list != nullptr || num_devices == 0); + assert(pfn_notify == nullptr); + assert(user_data == nullptr); + pi_result retError = PI_SUCCESS; + + try { + ScopedContext active(program->get_context()); + + program->build_program(options); + + } catch (pi_result err) { + retError = err; + } + return retError; +} + +/// \TODO Not implemented +pi_result rocm_piProgramCreate(pi_context context, const void *il, + size_t length, pi_program *res_program) { + cl::sycl::detail::pi::die("rocm_piProgramCreate not implemented"); + return {}; +} + +/// Loads images from a list of PTX or HIPBIN binaries. +/// Note: No calls to HIP driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +pi_result rocm_piProgramCreateWithBinary( + pi_context context, pi_uint32 num_devices, const pi_device *device_list, + const size_t *lengths, const unsigned char **binaries, + pi_int32 *binary_status, pi_program *program) { + assert(context != nullptr); + assert(binaries != nullptr); + assert(program != nullptr); + assert(device_list != nullptr); + assert(num_devices == 1 && "HIP contexts are for a single device"); + assert((context->get_device()->get() == device_list[0]->get()) && + "Mismatch between devices context and passed context when creating " + "program from binary"); + + pi_result retError = PI_SUCCESS; + + std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; + + const bool has_length = (lengths != nullptr); + size_t length = has_length + ? 
lengths[0] + : strlen(reinterpret_cast(binaries[0])) + 1; + + assert(length != 0); + + retProgram->set_binary(reinterpret_cast(binaries[0]), length); + + *program = retProgram.release(); + + return retError; +} + +pi_result rocm_piProgramGetInfo(pi_program program, pi_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + assert(program != nullptr); + + switch (param_name) { + case PI_PROGRAM_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + program->get_reference_count()); + case PI_PROGRAM_INFO_CONTEXT: + return getInfo(param_value_size, param_value, param_value_size_ret, + program->context_); + case PI_PROGRAM_INFO_NUM_DEVICES: + return getInfo(param_value_size, param_value, param_value_size_ret, 1u); + case PI_PROGRAM_INFO_DEVICES: + return getInfoArray(1, param_value_size, param_value, param_value_size_ret, + &program->context_->deviceId_); + case PI_PROGRAM_INFO_SOURCE: + return getInfo(param_value_size, param_value, param_value_size_ret, + program->binary_); + case PI_PROGRAM_INFO_BINARY_SIZES: + return getInfoArray(1, param_value_size, param_value, param_value_size_ret, + &program->binarySizeInBytes_); + case PI_PROGRAM_INFO_BINARIES: + return getInfoArray(1, param_value_size, param_value, param_value_size_ret, + &program->binary_); + case PI_PROGRAM_INFO_KERNEL_NAMES: { + return getInfo(param_value_size, param_value, param_value_size_ret, + getKernelNames(program).c_str()); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + cl::sycl::detail::pi::die("Program info request not implemented"); + return {}; +} + +/// Creates a new PI program object that is the outcome of linking all input +/// programs. 
+/// \TODO Implement linker options, requires mapping of OpenCL to HIP +/// + +pi_result rocm_piProgramLink(pi_context context, pi_uint32 num_devices, + const pi_device *device_list, const char *options, + pi_uint32 num_input_programs, + const pi_program *input_programs, + void (*pfn_notify)(pi_program program, + void *user_data), + void *user_data, pi_program *ret_program) { + /* + assert(ret_program != nullptr); + assert(num_devices == 1 || num_devices == 0); + assert(device_list != nullptr || num_devices == 0); + assert(pfn_notify == nullptr); + assert(user_data == nullptr); + pi_result retError = PI_SUCCESS; + + try { + ScopedContext active(context); + + HIPlinkState state; + std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; + + retError = PI_CHECK_ERROR(hipLinkCreate(0, nullptr, nullptr, &state)); + try { + for (size_t i = 0; i < num_input_programs; ++i) { + pi_program program = input_programs[i]; + retError = PI_CHECK_ERROR(hipLinkAddData( + state, HIP_JIT_INPUT_PTX, const_cast(program->binary_), + program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); + } + void *hipbin = nullptr; + size_t hipbinSize = 0; + retError = PI_CHECK_ERROR(hipLinkComplete(state, &hipbin, &hipbinSize)); + + retError = + retProgram->set_binary(static_cast(hipbin), hipbinSize); + + if (retError != PI_SUCCESS) { + return retError; + } + + retError = retProgram->build_program(options); + + if (retError != PI_SUCCESS) { + return retError; + } + } catch (...) { + // Upon error attempt cleanup + PI_CHECK_ERROR(hipLinkDestroy(state)); + throw; + } + + retError = PI_CHECK_ERROR(hipLinkDestroy(state)); + *ret_program = retProgram.release(); + + } catch (pi_result err) { + retError = err; + } + */ + return PI_SUCCESS; +} + +/// Creates a new program that is the outcome of the compilation of the headers +/// and the program. 
+/// \TODO Implement asynchronous compilation +/// +pi_result rocm_piProgramCompile( + pi_program program, pi_uint32 num_devices, const pi_device *device_list, + const char *options, pi_uint32 num_input_headers, + const pi_program *input_headers, const char **header_include_names, + void (*pfn_notify)(pi_program program, void *user_data), void *user_data) { + assert(program != nullptr); + assert(num_devices == 1 || num_devices == 0); + assert(device_list != nullptr || num_devices == 0); + assert(pfn_notify == nullptr); + assert(user_data == nullptr); + assert(num_input_headers == 0); + pi_result retError = PI_SUCCESS; + + try { + ScopedContext active(program->get_context()); + + program->build_program(options); + + } catch (pi_result err) { + retError = err; + } + return retError; +} + +pi_result rocm_piProgramGetBuildInfo(pi_program program, pi_device device, + cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + assert(program != nullptr); + + switch (param_name) { + case PI_PROGRAM_BUILD_INFO_STATUS: { + return getInfo(param_value_size, param_value, param_value_size_ret, + program->buildStatus_); + } + case PI_PROGRAM_BUILD_INFO_OPTIONS: + return getInfo(param_value_size, param_value, param_value_size_ret, + program->buildOptions_.c_str()); + case PI_PROGRAM_BUILD_INFO_LOG: + return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value, + param_value_size_ret, program->infoLog_); + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + cl::sycl::detail::pi::die("Program Build info request not implemented"); + return {}; +} + +pi_result rocm_piProgramRetain(pi_program program) { + assert(program != nullptr); + assert(program->get_reference_count() > 0); + program->increment_reference_count(); + return PI_SUCCESS; +} + +/// Decreases the reference count of a pi_program object. +/// When the reference count reaches 0, it unloads the module from +/// the context. 
+pi_result rocm_piProgramRelease(pi_program program) { + assert(program != nullptr); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + assert(program->get_reference_count() != 0 && + "Reference count overflow detected in rocm_piProgramRelease."); + + // decrement ref count. If it is 0, delete the program. + if (program->decrement_reference_count() == 0) { + + std::unique_ptr<_pi_program> program_ptr{program}; + + pi_result result = PI_INVALID_PROGRAM; + + try { + ScopedContext active(program->get_context()); + auto hipModule = program->get(); + result = PI_CHECK_ERROR(hipModuleUnload(hipModule)); + } catch (...) { + result = PI_OUT_OF_RESOURCES; + } + + return result; + } + + return PI_SUCCESS; +} + +/// Gets the native HIP handle of a PI program object +/// +/// \param[in] program The PI program to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the PI program object. +/// +/// \return TBD +pi_result rocm_piextProgramGetNativeHandle(pi_program program, + pi_native_handle *nativeHandle) { + *nativeHandle = reinterpret_cast(program->get()); + return PI_SUCCESS; +} + +/// Created a PI program object from a HIP program handle. +/// TODO: Implement this. +/// NOTE: The created PI object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create PI program object from. +/// \param[in] context The PI context of the program. +/// \param[out] program Set to the PI program object created from native handle. 
+/// +/// \return TBD +pi_result rocm_piextProgramCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_context context, + pi_program *program) { + cl::sycl::detail::pi::die( + "Creation of PI program from native handle not implemented"); + return {}; +} + +pi_result rocm_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + if (kernel != nullptr) { + + switch (param_name) { + case PI_KERNEL_INFO_FUNCTION_NAME: + return getInfo(param_value_size, param_value, param_value_size_ret, + kernel->get_name()); + case PI_KERNEL_INFO_NUM_ARGS: + return getInfo(param_value_size, param_value, param_value_size_ret, + kernel->get_num_args()); + case PI_KERNEL_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + kernel->get_reference_count()); + case PI_KERNEL_INFO_CONTEXT: { + return getInfo(param_value_size, param_value, param_value_size_ret, + kernel->get_context()); + } + case PI_KERNEL_INFO_PROGRAM: { + return getInfo(param_value_size, param_value, param_value_size_ret, + kernel->get_program()); + } + case PI_KERNEL_INFO_ATTRIBUTES: { + return getInfo(param_value_size, param_value, param_value_size_ret, ""); + } + default: { + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + } + } + + return PI_INVALID_KERNEL; +} + +pi_result rocm_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, + pi_kernel_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + // here we want to query about a kernel's rocm blocks! 
+ + if (kernel != nullptr) { + + switch (param_name) { + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + cl::sycl::detail::pi::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + size_t(max_threads)); + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + // Returns the work-group size specified in the kernel source or IL. + // If the work-group size is not specified in the kernel source or IL, + // (0, 0, 0) is returned. + // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html + + // TODO: can we extract the work group size from the PTX? + size_t group_size[3] = {0, 0, 0}; + return getInfoArray(3, param_value_size, param_value, + param_value_size_ret, group_size); + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == HIP SHARED + int bytes = 0; + cl::sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == HIP LOCAL + int bytes = 0; + cl::sycl::detail::pi::assertion( + hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + kernel->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_uint64(bytes)); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + } + + return 
PI_INVALID_KERNEL; +} + +pi_result rocm_piKernelGetSubGroupInfo( + pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, + size_t input_value_size, const void *input_value, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) { + if (kernel != nullptr) { + switch (param_name) { + case PI_KERNEL_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int warpSize = 0; + cl::sycl::detail::pi::assertion( + hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, + device->get()) == hipSuccess); + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(warpSize)); + } + case PI_KERNEL_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + cl::sycl::detail::pi::assertion( + hipFuncGetAttribute(&max_threads, + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); + int warpSize = 0; + rocm_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, + 0, nullptr, sizeof(uint32_t), &warpSize, + nullptr); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(maxWarps)); + } + case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return getInfo(param_value_size, param_value, param_value_size_ret, 0); + } + case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + return getInfo(param_value_size, param_value, param_value_size_ret, 0); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + } + return PI_INVALID_KERNEL; +} + 
+pi_result rocm_piKernelRetain(pi_kernel kernel) { + assert(kernel != nullptr); + assert(kernel->get_reference_count() > 0u); + + kernel->increment_reference_count(); + return PI_SUCCESS; +} + +pi_result rocm_piKernelRelease(pi_kernel kernel) { + assert(kernel != nullptr); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + assert(kernel->get_reference_count() != 0 && + "Reference count overflow detected in rocm_piKernelRelease."); + + // decrement ref count. If it is 0, delete the program. + if (kernel->decrement_reference_count() == 0) { + // no internal rocm resources to clean up. Just delete it. + delete kernel; + return PI_SUCCESS; + } + + return PI_SUCCESS; +} + +// A NOP for the HIP backend +pi_result rocm_piKernelSetExecInfo(pi_kernel kernel, + pi_kernel_exec_info param_name, + size_t param_value_size, + const void *param_value) { + return PI_SUCCESS; +} + +pi_result rocm_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, + size_t arg_size, + const void *arg_value) { + kernel->set_kernel_arg(arg_index, arg_size, arg_value); + return PI_SUCCESS; +} + +// +// Events +// +pi_result rocm_piEventCreate(pi_context context, pi_event *event) { + cl::sycl::detail::pi::die("PI Event Create not implemented in HIP backend"); +} + +pi_result rocm_piEventGetInfo(pi_event event, pi_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + assert(event != nullptr); + + switch (param_name) { + case PI_EVENT_INFO_COMMAND_QUEUE: + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_queue()); + case PI_EVENT_INFO_COMMAND_TYPE: + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_command_type()); + case PI_EVENT_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_reference_count()); + case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { + return 
getInfo(param_value_size, param_value, param_value_size_ret, + static_cast(event->get_execution_status())); + } + case PI_EVENT_INFO_CONTEXT: + return getInfo(param_value_size, param_value, param_value_size_ret, + event->get_context()); + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + + return PI_INVALID_EVENT; +} + +/// Obtain profiling information from PI HIP events +/// \TODO Untie from OpenCL, timings from HIP are only elapsed time. +pi_result rocm_piEventGetProfilingInfo(pi_event event, + pi_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + + assert(event != nullptr); + + pi_queue queue = event->get_queue(); + if (queue == nullptr || !(queue->properties_ & PI_QUEUE_PROFILING_ENABLE)) { + return PI_PROFILING_INFO_NOT_AVAILABLE; + } + + switch (param_name) { + case PI_PROFILING_INFO_COMMAND_QUEUED: + case PI_PROFILING_INFO_COMMAND_SUBMIT: + return getInfo(param_value_size, param_value, + param_value_size_ret, event->get_queued_time()); + case PI_PROFILING_INFO_COMMAND_START: + return getInfo(param_value_size, param_value, + param_value_size_ret, event->get_start_time()); + case PI_PROFILING_INFO_COMMAND_END: + return getInfo(param_value_size, param_value, + param_value_size_ret, event->get_end_time()); + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + cl::sycl::detail::pi::die("Event Profiling info request not implemented"); + return {}; +} + +pi_result rocm_piEventSetCallback(pi_event event, + pi_int32 command_exec_callback_type, + pfn_notify notify, void *user_data) { + + cl::sycl::detail::pi::die("Event Callback not implemented in HIP backend"); + return PI_SUCCESS; +} + +pi_result rocm_piEventSetStatus(pi_event event, pi_int32 execution_status) { + + cl::sycl::detail::pi::die("Event Set Status not implemented in HIP backend"); + return PI_INVALID_VALUE; +} + +pi_result rocm_piEventRetain(pi_event event) { + assert(event != nullptr); + + const auto refCount = 
event->increment_reference_count(); + + cl::sycl::detail::pi::assertion( + refCount != 0, + "Reference count overflow detected in rocm_piEventRetain."); + + return PI_SUCCESS; +} + +pi_result rocm_piEventRelease(pi_event event) { + assert(event != nullptr); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + cl::sycl::detail::pi::assertion( + event->get_reference_count() != 0, + "Reference count overflow detected in rocm_piEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (event->decrement_reference_count() == 0) { + std::unique_ptr<_pi_event> event_ptr{event}; + pi_result result = PI_INVALID_EVENT; + try { + ScopedContext active(event->get_context()); + result = event->release(); + } catch (...) { + result = PI_OUT_OF_RESOURCES; + } + return result; + } + + return PI_SUCCESS; +} + +/// Enqueues a wait on the given CUstream for all events. +/// See \ref enqueueEventWait +/// +pi_result rocm_piEnqueueEventsWait(pi_queue command_queue, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + if (!command_queue) { + return PI_INVALID_QUEUE; + } + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + auto result = + forLatestEvents(event_wait_list, num_events_in_wait_list, + [command_queue](pi_event event) -> pi_result { + return enqueueEventWait(command_queue, event); + }); + + if (result != PI_SUCCESS) { + return result; + } + } + + if (event) { + *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue); + (*event)->start(); + (*event)->record(); + } + + return PI_SUCCESS; + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } +} + +/// Gets the native HIP handle of a PI event object +/// +/// \param[in] event The PI event to get the native HIP object of. +/// \param[out] nativeHandle Set to the native handle of the PI event object. 
+/// +/// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event. +pi_result rocm_piextEventGetNativeHandle(pi_event event, + pi_native_handle *nativeHandle) { + *nativeHandle = reinterpret_cast(event->get()); + return PI_SUCCESS; +} + +/// Created a PI event object from a HIP event handle. +/// TODO: Implement this. +/// NOTE: The created PI object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create PI event object from. +/// \param[out] event Set to the PI event object created from native handle. +/// +/// \return TBD +pi_result rocm_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, + pi_event *event) { + cl::sycl::detail::pi::die( + "Creation of PI event from native handle not implemented"); + return {}; +} + +/// Creates a PI sampler object +/// +/// \param[in] context The context the sampler is created for. +/// \param[in] sampler_properties The properties for the sampler. +/// \param[out] result_sampler Set to the resulting sampler object. +/// +/// \return PI_SUCCESS on success. PI_INVALID_VALUE if given an invalid property +/// or if there is multiple of properties from the same category. 
+pi_result rocm_piSamplerCreate(pi_context context, + const pi_sampler_properties *sampler_properties, + pi_sampler *result_sampler) { + std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; + + bool propSeen[3] = {false, false, false}; + for (size_t i = 0; sampler_properties[i] != 0; i += 2) { + switch (sampler_properties[i]) { + case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: + if (propSeen[0]) { + return PI_INVALID_VALUE; + } + propSeen[0] = true; + retImplSampl->props_ |= sampler_properties[i + 1]; + break; + case PI_SAMPLER_PROPERTIES_FILTER_MODE: + if (propSeen[1]) { + return PI_INVALID_VALUE; + } + propSeen[1] = true; + retImplSampl->props_ |= + (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; + break; + case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: + if (propSeen[2]) { + return PI_INVALID_VALUE; + } + propSeen[2] = true; + retImplSampl->props_ |= + (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; + break; + default: + return PI_INVALID_VALUE; + } + } + + if (!propSeen[0]) { + retImplSampl->props_ |= CL_TRUE; + } + // Default filter mode to CL_FILTER_NEAREST + if (!propSeen[2]) { + retImplSampl->props_ |= (CL_ADDRESS_CLAMP % CL_ADDRESS_NONE) << 2; + } + + *result_sampler = retImplSampl.release(); + return PI_SUCCESS; +} + +/// Gets information from a PI sampler object +/// +/// \param[in] sampler The sampler to get the information from. +/// \param[in] param_name The name of the information to get. +/// \param[in] param_value_size The size of the param_value. +/// \param[out] param_value Set to information value. +/// \param[out] param_value_size_ret Set to the size of the information value. +/// +/// \return PI_SUCCESS on success. 
+pi_result rocm_piSamplerGetInfo(pi_sampler sampler, cl_sampler_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + assert(sampler != nullptr); + + switch (param_name) { + case PI_SAMPLER_INFO_REFERENCE_COUNT: + return getInfo(param_value_size, param_value, param_value_size_ret, + sampler->get_reference_count()); + case PI_SAMPLER_INFO_CONTEXT: + return getInfo(param_value_size, param_value, param_value_size_ret, + sampler->context_); + case PI_SAMPLER_INFO_NORMALIZED_COORDS: { + pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); + return getInfo(param_value_size, param_value, param_value_size_ret, + norm_coords_prop); + } + case PI_SAMPLER_INFO_FILTER_MODE: { + pi_sampler_filter_mode filter_prop = static_cast( + ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); + return getInfo(param_value_size, param_value, param_value_size_ret, + filter_prop); + } + case PI_SAMPLER_INFO_ADDRESSING_MODE: { + pi_sampler_addressing_mode addressing_prop = + static_cast( + (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); + return getInfo(param_value_size, param_value, param_value_size_ret, + addressing_prop); + } + default: + __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); + } + return {}; +} + +/// Retains a PI sampler object, incrementing its reference count. +/// +/// \param[in] sampler The sampler to increment the reference count of. +/// +/// \return PI_SUCCESS. +pi_result rocm_piSamplerRetain(pi_sampler sampler) { + assert(sampler != nullptr); + sampler->increment_reference_count(); + return PI_SUCCESS; +} + +/// Releases a PI sampler object, decrementing its reference count. If the +/// reference count reaches zero, the sampler object is destroyed. +/// +/// \param[in] sampler The sampler to decrement the reference count of. +/// +/// \return PI_SUCCESS. 
+pi_result rocm_piSamplerRelease(pi_sampler sampler) { + assert(sampler != nullptr); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + cl::sycl::detail::pi::assertion( + sampler->get_reference_count() != 0, + "Reference count overflow detected in rocm_piSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (sampler->decrement_reference_count() == 0) { + delete sampler; + } + + return PI_SUCCESS; +} + +/// General 3D memory copy operation. +/// This function requires the corresponding HIP context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// must be a pointer to a hipDevPtr +static pi_result commonEnqueueMemBufferCopyRect( + hipStream_t hip_stream, pi_buff_rect_region region, const void *src_ptr, + const hipMemoryType src_type, pi_buff_rect_offset src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, + const hipMemoryType dst_type, pi_buff_rect_offset dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + assert(region != nullptr); + assert(src_offset != nullptr); + assert(dst_offset != nullptr); + + assert(src_type == hipMemoryTypeDevice || src_type == hipMemoryTypeHost); + assert(dst_type == hipMemoryTypeDevice || dst_type == hipMemoryTypeHost); + + src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch) + : dst_slice_pitch; + + HIP_MEMCPY3D params = {0}; + + params.WidthInBytes = region->width_bytes; + params.Height = region->height_scalar; + params.Depth = region->depth_scalar; + + params.srcMemoryType = src_type; + params.srcDevice = src_type == hipMemoryTypeDevice + ? 
*static_cast(src_ptr) + : 0; + params.srcHost = src_type == hipMemoryTypeHost ? src_ptr : nullptr; + params.srcXInBytes = src_offset->x_bytes; + params.srcY = src_offset->y_scalar; + params.srcZ = src_offset->z_scalar; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = dst_type; + params.dstDevice = dst_type == hipMemoryTypeDevice + ? *static_cast(dst_ptr) + : 0; + params.dstHost = dst_type == hipMemoryTypeHost ? dst_ptr : nullptr; + params.dstXInBytes = dst_offset->x_bytes; + params.dstY = dst_offset->y_scalar; + params.dstZ = dst_offset->z_scalar; + params.dstPitch = dst_row_pitch; + params.dstHeight = dst_slice_pitch / dst_row_pitch; + + return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream)); + + return PI_SUCCESS; +} + +pi_result rocm_piEnqueueMemBufferReadRect( + pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + assert(buffer != nullptr); + assert(command_queue != nullptr); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + retErr = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, &devPtr, hipMemoryTypeDevice, buffer_offset, + buffer_row_pitch, buffer_slice_pitch, ptr, hipMemoryTypeHost, + host_offset, 
host_row_pitch, host_slice_pitch); + + if (event) { + retErr = retImplEv->record(); + } + + if (blocking_read) { + retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (event) { + *event = retImplEv.release(); + } + + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEnqueueMemBufferWriteRect( + pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + assert(buffer != nullptr); + assert(command_queue != nullptr); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + hipDevPtr devPtr = buffer->mem_.buffer_mem_.get(); + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + retErr = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, ptr, hipMemoryTypeHost, host_offset, host_row_pitch, + host_slice_pitch, &devPtr, hipMemoryTypeDevice, buffer_offset, + buffer_row_pitch, buffer_slice_pitch); + + if (event) { + retErr = retImplEv->record(); + } + + if (blocking_write) { + retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + + if (event) { + *event = retImplEv.release(); + } + + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, + pi_mem dst_buffer, size_t src_offset, + size_t dst_offset, size_t size, + pi_uint32 num_events_in_wait_list, + 
const pi_event *event_wait_list, + pi_event *event) { + if (!command_queue) { + return PI_INVALID_QUEUE; + } + + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + pi_result result; + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue)); + result = retImplEv->start(); + } + + auto stream = command_queue->get(); + auto src = (uint8_t*)(src_buffer->mem_.buffer_mem_.get()) + src_offset; + auto dst = (uint8_t*)(dst_buffer->mem_.buffer_mem_.get()) + dst_offset; + + result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); + + if (event) { + result = retImplEv->record(); + *event = retImplEv.release(); + } + + return result; + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } +} + +pi_result rocm_piEnqueueMemBufferCopyRect( + pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { + + assert(src_buffer != nullptr); + assert(dst_buffer != nullptr); + assert(command_queue != nullptr); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + hipDevPtr srcPtr = src_buffer->mem_.buffer_mem_.get(); + hipDevPtr dstPtr = dst_buffer->mem_.buffer_mem_.get(); + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + retErr = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + 
PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + hipStream, region, &srcPtr, hipMemoryTypeDevice, src_origin, + src_row_pitch, src_slice_pitch, &dstPtr, hipMemoryTypeDevice, + dst_origin, dst_row_pitch, dst_slice_pitch); + + if (event) { + retImplEv->record(); + *event = retImplEv.release(); + } + + } catch (pi_result err) { + retErr = err; + } + return retErr; +} + +pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, + const void *pattern, size_t pattern_size, + size_t offset, size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + assert(command_queue != nullptr); + + auto args_are_multiples_of_pattern_size = + (offset % pattern_size == 0) || (size % pattern_size == 0); + + auto pattern_is_valid = (pattern != nullptr); + + auto pattern_size_is_valid = + ((pattern_size & (pattern_size - 1)) == 0) && // is power of two + (pattern_size > 0) && (pattern_size <= 128); // falls within valid range + + assert(args_are_multiples_of_pattern_size && pattern_is_valid && + pattern_size_is_valid); + (void)args_are_multiples_of_pattern_size; + (void)pattern_is_valid; + (void)pattern_size_is_valid; + + std::unique_ptr<_pi_event> retImplEv{nullptr}; + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + pi_result result; + + if (event) { + retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue)); + result = retImplEv->start(); + } + + auto dstDevice = (uint8_t*)(buffer->mem_.buffer_mem_.get()) + offset; + auto stream = command_queue->get(); + auto N = size / pattern_size; + + // pattern size in bytes + switch (pattern_size) { + case 1: { + auto value = *static_cast(pattern); + result = PI_CHECK_ERROR(hipMemsetD8Async(dstDevice, 
value, N, stream)); + break; + } + case 2: { + auto value = *static_cast(pattern); + result = PI_CHECK_ERROR(hipMemsetD16Async(dstDevice, value, N, stream)); + break; + } + case 4: { + auto value = *static_cast(pattern); + result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); + break; + } + /* + default: { + // HIP has no memset functions that allow setting values more than 4 + // bytes. PI API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one hipMemsetD2D32Async call is made for every 4 bytes + // in the pattern. + + auto number_of_steps = pattern_size / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call hipMemset for each + // 4-byte chunk of the pattern. + for (auto step = 0u; step < number_of_steps; ++step) { + // take 4 bytes of the pattern + auto value = *(static_cast(pattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); + + // set all of the pattern chunks + result = PI_CHECK_ERROR( + hipMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream)); + } + + break; + } + */ + } + + if (event) { + result = retImplEv->record(); + *event = retImplEv.release(); + } + + return result; + } catch (pi_result err) { + return err; + } catch (...) 
{ + return PI_ERROR_UNKNOWN; + } +} + + +static size_t imageElementByteSize(enum hipArray_Format array_format) { + switch (array_format) { + case HIP_AD_FORMAT_UNSIGNED_INT8: + case HIP_AD_FORMAT_SIGNED_INT8: + return 1; + case HIP_AD_FORMAT_UNSIGNED_INT16: + case HIP_AD_FORMAT_SIGNED_INT16: + case HIP_AD_FORMAT_HALF: + return 2; + case HIP_AD_FORMAT_UNSIGNED_INT32: + case HIP_AD_FORMAT_SIGNED_INT32: + case HIP_AD_FORMAT_FLOAT: + return 4; + default: + return 0; + } + cl::sycl::detail::pi::die("Invalid iamge format."); + return 0; +} + + +/// General ND memory copy operation for images (where N > 1). +/// This function requires the corresponding HIP context to be at the top of +/// the context stack +/// If the source and/or destination is an array, src_ptr and/or dst_ptr +/// must be a pointer to a hipArray + + +static pi_result commonEnqueueMemImageNDCopy( + hipStream_t hip_stream, pi_mem_type img_type, const size_t *region, + const void *src_ptr, const hipMemoryType src_type, + const size_t *src_offset, void *dst_ptr, const hipMemoryType dst_type, + const size_t *dst_offset) { + assert(region != nullptr); + + assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost); + assert(dst_type == hipMemoryTypeArray || dst_type == hipMemoryTypeHost); + + if (img_type == PI_MEM_TYPE_IMAGE2D) { + hip_Memcpy2D cpyDesc; + memset(&cpyDesc, 0, sizeof(cpyDesc)); + cpyDesc.srcMemoryType = src_type; + if (src_type == hipMemoryTypeArray) { + cpyDesc.srcArray = const_cast(static_cast(src_ptr)); + cpyDesc.srcXInBytes = src_offset[0]; + cpyDesc.srcY = src_offset[1]; + } else { + cpyDesc.srcHost = src_ptr; + } + cpyDesc.dstMemoryType = dst_type; + if (dst_type == hipMemoryTypeArray) { + cpyDesc.dstArray = const_cast(static_cast(dst_ptr)); + cpyDesc.dstXInBytes = dst_offset[0]; + cpyDesc.dstY = dst_offset[1]; + } else { + cpyDesc.dstHost = dst_ptr; + } + cpyDesc.WidthInBytes = region[0]; + cpyDesc.Height = region[1]; + return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, 
hip_stream)); + } + + if (img_type == PI_MEM_TYPE_IMAGE3D) { + + HIP_MEMCPY3D cpyDesc; + memset(&cpyDesc, 0, sizeof(cpyDesc)); + cpyDesc.srcMemoryType = src_type; + if (src_type == hipMemoryTypeArray) { + cpyDesc.srcArray = const_cast(static_cast(src_ptr)); + cpyDesc.srcXInBytes = src_offset[0]; + cpyDesc.srcY = src_offset[1]; + cpyDesc.srcZ = src_offset[2]; + } else { + cpyDesc.srcHost = src_ptr; + } + cpyDesc.dstMemoryType = dst_type; + if (dst_type == hipMemoryTypeArray) { + cpyDesc.dstArray = static_cast(dst_ptr); + cpyDesc.dstXInBytes = dst_offset[0]; + cpyDesc.dstY = dst_offset[1]; + cpyDesc.dstZ = dst_offset[2]; + } else { + cpyDesc.dstHost = dst_ptr; + } + cpyDesc.WidthInBytes = region[0]; + cpyDesc.Height = region[1]; + cpyDesc.Depth = region[2]; + return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream)); + return PI_ERROR_UNKNOWN; + } + + return PI_INVALID_VALUE; +} + +pi_result rocm_piEnqueueMemImageRead( + pi_queue command_queue, pi_mem image, pi_bool blocking_read, + const size_t *origin, const size_t *region, size_t row_pitch, + size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + assert(command_queue != nullptr); + assert(image != nullptr); + assert(image->mem_type_ == _pi_mem::mem_type::surface); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + hipArray array = image->mem_.surface_mem_.get_array(); + + int elementByteSize = imageElementByteSize(array.Format); + + size_t byteOffsetX = origin[0] * elementByteSize * array.NumChannels; + size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; + + pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + /* + if (imgType == PI_MEM_TYPE_IMAGE1D) { + retErr = PI_CHECK_ERROR( + 
hipMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, hipStream)); + } else */ { + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; + + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, &array, hipMemoryTypeArray, + srcOffset, ptr, hipMemoryTypeHost, nullptr); + + if (retErr != PI_SUCCESS) { + return retErr; + } + } + + if (event) { + auto new_event = + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, command_queue); + new_event->record(); + *event = new_event; + } + + if (blocking_read) { + retErr = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + return PI_SUCCESS; + return retErr; +} + +pi_result +rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, + pi_bool blocking_write, const size_t *origin, + const size_t *region, size_t input_row_pitch, + size_t input_slice_pitch, const void *ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { + + + assert(command_queue != nullptr); + assert(image != nullptr); + assert(image->mem_type_ == _pi_mem::mem_type::surface); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + hipArray array = image->mem_.surface_mem_.get_array(); + + int elementByteSize = imageElementByteSize(array.Format); + + size_t byteOffsetX = origin[0] * elementByteSize * array.NumChannels; + size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; + + pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + /* if (imgType == PI_MEM_TYPE_IMAGE1D) { + retErr = PI_CHECK_ERROR( + hipMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, hipStream)); + } else */ 
{ + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; + + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, ptr, hipMemoryTypeHost, nullptr, + &array, hipMemoryTypeArray, dstOffset); + + if (retErr != PI_SUCCESS) { + return retErr; + } + } + + if (event) { + auto new_event = + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, command_queue); + new_event->record(); + *event = new_event; + } + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + + return PI_SUCCESS; + + return retErr; +} + +pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, + pi_mem dst_image, const size_t *src_origin, + const size_t *dst_origin, + const size_t *region, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + + + assert(src_image->mem_type_ == _pi_mem::mem_type::surface); + assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); + assert(src_image->mem_.surface_mem_.get_image_type() == + dst_image->mem_.surface_mem_.get_image_type()); + + pi_result retErr = PI_SUCCESS; + hipStream_t hipStream = command_queue->get(); + + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + hipArray srcArray = src_image->mem_.surface_mem_.get_array(); + hipArray dstArray = dst_image->mem_.surface_mem_.get_array(); + + assert(srcArray.Format == dstArray.Format); + assert(srcArray.NumChannels == dstArray.NumChannels); + + int elementByteSize = imageElementByteSize(srcArray.Format); + + + size_t dstByteOffsetX = + dst_origin[0] * elementByteSize * srcArray.NumChannels; + size_t srcByteOffsetX = + src_origin[0] * elementByteSize * dstArray.NumChannels; + size_t bytesToCopy = elementByteSize * srcArray.NumChannels * region[0]; + + pi_mem_type imgType = 
src_image->mem_.surface_mem_.get_image_type(); + /* + if (imgType == PI_MEM_TYPE_IMAGE1D) { + retErr = PI_CHECK_ERROR(hipMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, + srcByteOffsetX, bytesToCopy)); + } else + */{ + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; + size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; + + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, &srcArray, hipMemoryTypeArray, + srcOffset, &dstArray, hipMemoryTypeArray, dstOffset); + + if (retErr != PI_SUCCESS) { + return retErr; + } + } + + if (event) { + auto new_event = + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, command_queue); + new_event->record(); + *event = new_event; + } + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } + + return PI_SUCCESS; + return retErr; +} + +/// \TODO Not implemented in HIP, requires untie from OpenCL +pi_result rocm_piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, + const void *fill_color, + const size_t *origin, const size_t *region, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + cl::sycl::detail::pi::die("rocm_piEnqueueMemImageFill not implemented"); + return {}; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the pi_mem object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
+/// \TODO Untie types from OpenCL +/// +pi_result rocm_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, + pi_bool blocking_map, + pi_map_flags map_flags, size_t offset, + size_t size, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event, void **ret_map) { + assert(ret_map != nullptr); + assert(command_queue != nullptr); + assert(buffer != nullptr); + assert(buffer->mem_type_ == _pi_mem::mem_type::buffer); + + pi_result ret_err = PI_INVALID_OPERATION; + const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ == + _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + // Currently no support for overlapping regions + if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { + return ret_err; + } + + // Allocate a pointer in the host to store the mapped information + auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags); + *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); + if (hostPtr) { + ret_err = PI_SUCCESS; + } + + if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + ret_err = rocm_piEnqueueMemBufferRead( + command_queue, buffer, blocking_map, offset, size, hostPtr, + num_events_in_wait_list, event_wait_list, event); + } else { + ScopedContext active(command_queue->get_context()); + + if (is_pinned) { + ret_err = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + if (event) { + try { + *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, + command_queue); + (*event)->start(); + (*event)->record(); + } catch (pi_result error) { + ret_err = error; + } + } + } + + return ret_err; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given memobj. +/// If memobj uses pinned host memory, this will not do a write. 
+/// +pi_result rocm_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, + void *mapped_ptr, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, + pi_event *event) { + pi_result ret_err = PI_SUCCESS; + + assert(command_queue != nullptr); + assert(mapped_ptr != nullptr); + assert(memobj != nullptr); + assert(memobj->mem_type_ == _pi_mem::mem_type::buffer); + assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr); + assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr); + + const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ == + _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + if (!is_pinned && + ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) || + (memobj->mem_.buffer_mem_.get_map_flags() & + PI_MAP_WRITE_INVALIDATE_REGION))) { + // Pinned host memory is only on host so it doesn't need to be written to. + ret_err = rocm_piEnqueueMemBufferWrite( + command_queue, memobj, true, + memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), + memobj->mem_.buffer_mem_.get_size(), mapped_ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + ScopedContext active(command_queue->get_context()); + + if (is_pinned) { + ret_err = rocm_piEnqueueEventsWait(command_queue, num_events_in_wait_list, + event_wait_list, nullptr); + } + + if (event) { + try { + *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, + command_queue); + (*event)->start(); + (*event)->record(); + } catch (pi_result error) { + ret_err = error; + } + } + } + + memobj->mem_.buffer_mem_.unmap(mapped_ptr); + return ret_err; +} + +/// USM: Implements USM Host allocations using HIP Pinned Memory +/// +pi_result rocm_piextUSMHostAlloc(void **result_ptr, pi_context context, + pi_usm_mem_properties *properties, size_t size, + pi_uint32 alignment) { + assert(result_ptr != nullptr); + assert(context != nullptr); + assert(properties == nullptr); + pi_result result = PI_SUCCESS; + try { + ScopedContext active(context); + result = 
PI_CHECK_ERROR(hipHostMalloc(result_ptr, size)); + } catch (pi_result error) { + result = error; + } + + assert(alignment == 0 || + (result == PI_SUCCESS && + reinterpret_cast(*result_ptr) % alignment == 0)); + return result; +} + +/// USM: Implements USM device allocations using a normal HIP device pointer +/// +pi_result rocm_piextUSMDeviceAlloc(void **result_ptr, pi_context context, + pi_device device, + pi_usm_mem_properties *properties, + size_t size, pi_uint32 alignment) { + assert(result_ptr != nullptr); + assert(context != nullptr); + assert(device != nullptr); + assert(properties == nullptr); + pi_result result = PI_SUCCESS; + try { + ScopedContext active(context); + result = PI_CHECK_ERROR(hipMalloc((hipDevPtr *)result_ptr, size)); + } catch (pi_result error) { + result = error; + } + + assert(alignment == 0 || + (result == PI_SUCCESS && + reinterpret_cast(*result_ptr) % alignment == 0)); + return result; +} + +/// USM: Implements USM Shared allocations using HIP Managed Memory +/// +pi_result rocm_piextUSMSharedAlloc(void **result_ptr, pi_context context, + pi_device device, + pi_usm_mem_properties *properties, + size_t size, pi_uint32 alignment) { + assert(result_ptr != nullptr); + assert(context != nullptr); + assert(device != nullptr); + assert(properties == nullptr); + pi_result result = PI_SUCCESS; + try { + ScopedContext active(context); + result = PI_CHECK_ERROR(hipMallocManaged((hipDevPtr *)result_ptr, size, + hipMemAttachGlobal)); + } catch (pi_result error) { + result = error; + } + + assert(alignment == 0 || + (result == PI_SUCCESS && + reinterpret_cast(*result_ptr) % alignment == 0)); + return result; +} + +/// USM: Frees the given USM pointer associated with the context. 
+/// +pi_result rocm_piextUSMFree(pi_context context, void *ptr) { + + assert(context != nullptr); + pi_result result = PI_SUCCESS; + try { + ScopedContext active(context); + unsigned int type; + hipPointerAttribute_t hipPointerAttributeType; + result = PI_CHECK_ERROR(hipPointerGetAttributes( + &hipPointerAttributeType, (hipDevPtr)ptr)); + type = hipPointerAttributeType.memoryType; + assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost); + if (type == hipMemoryTypeDevice) { + result = PI_CHECK_ERROR(hipFree((hipDevPtr)ptr)); + } + if (type == hipMemoryTypeHost) { + result = PI_CHECK_ERROR(hipFreeHost(ptr)); + } + } catch (pi_result error) { + result = error; + } + return result; +} + +pi_result rocm_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, + size_t count, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event) { + + assert(queue != nullptr); + assert(ptr != nullptr); + hipStream_t hipStream = queue->get(); + pi_result result = PI_SUCCESS; + std::unique_ptr<_pi_event> event_ptr{nullptr}; + + try { + ScopedContext active(queue->get_context()); + result = rocm_piEnqueueEventsWait(queue, num_events_in_waitlist, + events_waitlist, nullptr); + if (event) { + event_ptr = std::unique_ptr<_pi_event>( + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue)); + event_ptr->start(); + } + result = PI_CHECK_ERROR(hipMemsetD8Async( + (hipDevPtr)ptr, (unsigned char)value & 0xFF, count, hipStream)); + if (event) { + result = event_ptr->record(); + *event = event_ptr.release(); + } + } catch (pi_result err) { + result = err; + } + + return result; +} + +pi_result rocm_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, + void *dst_ptr, const void *src_ptr, + size_t size, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event) { + + assert(queue != nullptr); + assert(dst_ptr != nullptr); + assert(src_ptr != nullptr); + hipStream_t hipStream = queue->get(); + pi_result 
result = PI_SUCCESS; + std::unique_ptr<_pi_event> event_ptr{nullptr}; + + try { + ScopedContext active(queue->get_context()); + result = rocm_piEnqueueEventsWait(queue, num_events_in_waitlist, + events_waitlist, nullptr); + if (event) { + event_ptr = std::unique_ptr<_pi_event>( + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue)); + event_ptr->start(); + } + result = PI_CHECK_ERROR(hipMemcpyAsync( + dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream)); + if (event) { + result = event_ptr->record(); + } + if (blocking) { + result = PI_CHECK_ERROR(hipStreamSynchronize(hipStream)); + } + if (event) { + *event = event_ptr.release(); + } + } catch (pi_result err) { + result = err; + } + + return result; +} + +pi_result rocm_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, + size_t size, + pi_usm_migration_flags flags, + pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, + pi_event *event) { + + assert(queue != nullptr); + assert(ptr != nullptr); + hipStream_t hipStream = queue->get(); + pi_result result = PI_SUCCESS; + std::unique_ptr<_pi_event> event_ptr{nullptr}; + + // TODO implement handling the flags once the expected behaviour + // of piextUSMEnqueuePrefetch is detailed in the USM extension + assert(flags == 0u); + + try { + ScopedContext active(queue->get_context()); + result = rocm_piEnqueueEventsWait(queue, num_events_in_waitlist, + events_waitlist, nullptr); + if (event) { + event_ptr = std::unique_ptr<_pi_event>( + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue)); + event_ptr->start(); + } + result = PI_CHECK_ERROR(hipMemPrefetchAsync( + ptr, size, queue->get_context()->get_device()->get(), + hipStream)); + if (event) { + result = event_ptr->record(); + *event = event_ptr.release(); + } + } catch (pi_result err) { + result = err; + } + + return result; +} + +/// USM: memadvise API to govern behavior of automatic migration mechanisms +pi_result rocm_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, 
+ size_t length, pi_mem_advice advice, + pi_event *event) { + + assert(queue != nullptr); + assert(ptr != nullptr); + // TODO implement a mapping to hipMemAdvise once the expected behaviour + // of piextUSMEnqueueMemAdvise is detailed in the USM extension + return rocm_piEnqueueEventsWait(queue, 0, nullptr, event); + + return PI_SUCCESS; +} + +/// API to query information about USM allocated pointers +/// Valid Queries: +/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value +/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if +/// the queried pointer fell inside an allocation. +/// Result must fit in void * +/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's +/// allocation is in bytes. Result is a size_t. +/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against +/// +/// \param context is the pi_context +/// \param ptr is the pointer to query +/// \param param_name is the type of query to perform +/// \param param_value_size is the size of the result in bytes +/// \param param_value is the result +/// \param param_value_ret is how many bytes were written +pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, + pi_mem_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + + assert(context != nullptr); + assert(ptr != nullptr); + pi_result result = PI_SUCCESS; + hipPointerAttribute_t hipPointerAttributeType; + + try { + ScopedContext active(context); + switch (param_name) { + case PI_MEM_ALLOC_TYPE: { + unsigned int value; + // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue + // TODO hipPointerGetAttribute与CUDA传参不同 + + hipError_t ret = hipPointerGetAttributes( + &hipPointerAttributeType, ptr); + if (ret == hipErrorInvalidValue) { + // pointer not known to the HIP subsystem + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_MEM_TYPE_UNKNOWN); + } + result = check_error(ret, __func__, __LINE__ - 5, 
__FILE__); + value = hipPointerAttributeType.isManaged; + if (value) { + // pointer to managed memory + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_MEM_TYPE_SHARED); + } + result = PI_CHECK_ERROR(hipPointerGetAttributes( + &hipPointerAttributeType, ptr)); + value = hipPointerAttributeType.memoryType; + assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost); + if (value == hipMemoryTypeDevice) { + // pointer to device memory + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_MEM_TYPE_DEVICE); + } + if (value == hipMemoryTypeHost) { + // pointer to host memory + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_MEM_TYPE_HOST); + } + // should never get here + __builtin_unreachable(); + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_MEM_TYPE_UNKNOWN); + } + case PI_MEM_ALLOC_BASE_PTR: { + return PI_INVALID_VALUE; + } + case PI_MEM_ALLOC_SIZE: { + return PI_INVALID_VALUE; + } + + case PI_MEM_ALLOC_DEVICE: { + unsigned int value; + result = PI_CHECK_ERROR(hipPointerGetAttributes( + &hipPointerAttributeType, ptr)); + auto devicePointer = static_cast(hipPointerAttributeType.devicePointer); + value = *devicePointer; + pi_platform platform; + result = rocm_piPlatformsGet(0, &platform, nullptr); + pi_device device = platform->devices_[value].get(); + return getInfo(param_value_size, param_value, param_value_size_ret, + device); + } + } + } catch (pi_result error) { + result = error; + } + + return result; +} + +// This API is called by Sycl RT to notify the end of the plugin lifetime. +// TODO: add a global variable lifetime management code here (see +// pi_level_zero.cpp for reference) Currently this is just a NOOP. 
+pi_result rocm_piTearDown(void *PluginParameter) { return PI_SUCCESS; } + +const char SupportedVersion[] = _PI_H_VERSION_STRING; + +pi_result piPluginInit(pi_plugin *PluginInit) { + int CompareVersions = strcmp(PluginInit->PiVersion, SupportedVersion); + if (CompareVersions < 0) { + // PI interface supports lower version of PI. + // TODO: Take appropriate actions. + return PI_INVALID_OPERATION; + } + + // PI interface supports higher version or the same version. + strncpy(PluginInit->PluginVersion, SupportedVersion, 4); + + // Set whole function table to zero to make it easier to detect if + // functions are not set up below. + std::memset(&(PluginInit->PiFunctionTable), 0, + sizeof(PluginInit->PiFunctionTable)); + +// Forward calls to HIP RT. +#define _PI_CL(pi_api, rocm_api) \ + (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&rocm_api); + + // Platform + _PI_CL(piPlatformsGet, rocm_piPlatformsGet) + _PI_CL(piPlatformGetInfo, rocm_piPlatformGetInfo) + // Device + _PI_CL(piDevicesGet, rocm_piDevicesGet) + _PI_CL(piDeviceGetInfo, rocm_piDeviceGetInfo) + _PI_CL(piDevicePartition, rocm_piDevicePartition) + _PI_CL(piDeviceRetain, rocm_piDeviceRetain) + _PI_CL(piDeviceRelease, rocm_piDeviceRelease) + _PI_CL(piextDeviceSelectBinary, rocm_piextDeviceSelectBinary) + _PI_CL(piextGetDeviceFunctionPointer, rocm_piextGetDeviceFunctionPointer) + _PI_CL(piextDeviceGetNativeHandle, rocm_piextDeviceGetNativeHandle) + _PI_CL(piextDeviceCreateWithNativeHandle, + rocm_piextDeviceCreateWithNativeHandle) + // Context + _PI_CL(piextContextSetExtendedDeleter, rocm_piextContextSetExtendedDeleter) + _PI_CL(piContextCreate, rocm_piContextCreate) + _PI_CL(piContextGetInfo, rocm_piContextGetInfo) + _PI_CL(piContextRetain, rocm_piContextRetain) + _PI_CL(piContextRelease, rocm_piContextRelease) + _PI_CL(piextContextGetNativeHandle, rocm_piextContextGetNativeHandle) + _PI_CL(piextContextCreateWithNativeHandle, + rocm_piextContextCreateWithNativeHandle) + // Queue + 
_PI_CL(piQueueCreate, rocm_piQueueCreate) + _PI_CL(piQueueGetInfo, rocm_piQueueGetInfo) + _PI_CL(piQueueFinish, rocm_piQueueFinish) + _PI_CL(piQueueRetain, rocm_piQueueRetain) + _PI_CL(piQueueRelease, rocm_piQueueRelease) + _PI_CL(piextQueueGetNativeHandle, rocm_piextQueueGetNativeHandle) + _PI_CL(piextQueueCreateWithNativeHandle, + rocm_piextQueueCreateWithNativeHandle) + // Memory + _PI_CL(piMemBufferCreate, rocm_piMemBufferCreate) + _PI_CL(piMemImageCreate, rocm_piMemImageCreate) + _PI_CL(piMemGetInfo, rocm_piMemGetInfo) + _PI_CL(piMemImageGetInfo, rocm_piMemImageGetInfo) + _PI_CL(piMemRetain, rocm_piMemRetain) + _PI_CL(piMemRelease, rocm_piMemRelease) + _PI_CL(piMemBufferPartition, rocm_piMemBufferPartition) + //_PI_CL(piextMemGetNativeHandle, rocm_piextMemGetNativeHandle) + _PI_CL(piextMemCreateWithNativeHandle, rocm_piextMemCreateWithNativeHandle) + // Program + _PI_CL(piProgramCreate, rocm_piProgramCreate) + _PI_CL(piclProgramCreateWithSource, rocm_piclProgramCreateWithSource) + _PI_CL(piProgramCreateWithBinary, rocm_piProgramCreateWithBinary) + _PI_CL(piProgramGetInfo, rocm_piProgramGetInfo) + _PI_CL(piProgramCompile, rocm_piProgramCompile) + _PI_CL(piProgramBuild, rocm_piProgramBuild) + _PI_CL(piProgramLink, rocm_piProgramLink) + _PI_CL(piProgramGetBuildInfo, rocm_piProgramGetBuildInfo) + _PI_CL(piProgramRetain, rocm_piProgramRetain) + _PI_CL(piProgramRelease, rocm_piProgramRelease) + _PI_CL(piextProgramGetNativeHandle, rocm_piextProgramGetNativeHandle) + _PI_CL(piextProgramCreateWithNativeHandle, + rocm_piextProgramCreateWithNativeHandle) + // Kernel + _PI_CL(piKernelCreate, rocm_piKernelCreate) + _PI_CL(piKernelSetArg, rocm_piKernelSetArg) + _PI_CL(piKernelGetInfo, rocm_piKernelGetInfo) + _PI_CL(piKernelGetGroupInfo, rocm_piKernelGetGroupInfo) + _PI_CL(piKernelGetSubGroupInfo, rocm_piKernelGetSubGroupInfo) + _PI_CL(piKernelRetain, rocm_piKernelRetain) + _PI_CL(piKernelRelease, rocm_piKernelRelease) + _PI_CL(piKernelSetExecInfo, rocm_piKernelSetExecInfo) 
+ _PI_CL(piextKernelSetArgPointer, rocm_piextKernelSetArgPointer) + // Event + _PI_CL(piEventCreate, rocm_piEventCreate) + _PI_CL(piEventGetInfo, rocm_piEventGetInfo) + _PI_CL(piEventGetProfilingInfo, rocm_piEventGetProfilingInfo) + _PI_CL(piEventsWait, rocm_piEventsWait) + _PI_CL(piEventSetCallback, rocm_piEventSetCallback) + _PI_CL(piEventSetStatus, rocm_piEventSetStatus) + _PI_CL(piEventRetain, rocm_piEventRetain) + _PI_CL(piEventRelease, rocm_piEventRelease) + _PI_CL(piextEventGetNativeHandle, rocm_piextEventGetNativeHandle) + _PI_CL(piextEventCreateWithNativeHandle, + rocm_piextEventCreateWithNativeHandle) + // Sampler + _PI_CL(piSamplerCreate, rocm_piSamplerCreate) + _PI_CL(piSamplerGetInfo, rocm_piSamplerGetInfo) + _PI_CL(piSamplerRetain, rocm_piSamplerRetain) + _PI_CL(piSamplerRelease, rocm_piSamplerRelease) + // Queue commands + _PI_CL(piEnqueueKernelLaunch, rocm_piEnqueueKernelLaunch) + _PI_CL(piEnqueueNativeKernel, rocm_piEnqueueNativeKernel) + _PI_CL(piEnqueueEventsWait, rocm_piEnqueueEventsWait) + _PI_CL(piEnqueueMemBufferRead, rocm_piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferReadRect, rocm_piEnqueueMemBufferReadRect) + _PI_CL(piEnqueueMemBufferWrite, rocm_piEnqueueMemBufferWrite) + _PI_CL(piEnqueueMemBufferWriteRect, rocm_piEnqueueMemBufferWriteRect) + _PI_CL(piEnqueueMemBufferCopy, rocm_piEnqueueMemBufferCopy) + _PI_CL(piEnqueueMemBufferCopyRect, rocm_piEnqueueMemBufferCopyRect) + _PI_CL(piEnqueueMemBufferFill, rocm_piEnqueueMemBufferFill) + _PI_CL(piEnqueueMemImageRead, rocm_piEnqueueMemImageRead) + _PI_CL(piEnqueueMemImageWrite, rocm_piEnqueueMemImageWrite) + _PI_CL(piEnqueueMemImageCopy, rocm_piEnqueueMemImageCopy) + _PI_CL(piEnqueueMemImageFill, rocm_piEnqueueMemImageFill) + _PI_CL(piEnqueueMemBufferMap, rocm_piEnqueueMemBufferMap) + _PI_CL(piEnqueueMemUnmap, rocm_piEnqueueMemUnmap) + // USM + _PI_CL(piextUSMHostAlloc, rocm_piextUSMHostAlloc) + _PI_CL(piextUSMDeviceAlloc, rocm_piextUSMDeviceAlloc) + _PI_CL(piextUSMSharedAlloc, 
rocm_piextUSMSharedAlloc) + _PI_CL(piextUSMFree, rocm_piextUSMFree) + _PI_CL(piextUSMEnqueueMemset, rocm_piextUSMEnqueueMemset) + _PI_CL(piextUSMEnqueueMemcpy, rocm_piextUSMEnqueueMemcpy) + _PI_CL(piextUSMEnqueuePrefetch, rocm_piextUSMEnqueuePrefetch) + _PI_CL(piextUSMEnqueueMemAdvise, rocm_piextUSMEnqueueMemAdvise) + _PI_CL(piextUSMGetMemAllocInfo, rocm_piextUSMGetMemAllocInfo) + + _PI_CL(piextKernelSetArgMemObj, rocm_piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgSampler, rocm_piextKernelSetArgSampler) + _PI_CL(piTearDown, rocm_piTearDown) + +#undef _PI_CL + + return PI_SUCCESS; +} + +} // extern "C" diff --git a/sycl/plugins/rocm/pi_rocm.hpp b/sycl/plugins/rocm/pi_rocm.hpp new file mode 100644 index 0000000000000..e748ad12f6b6f --- /dev/null +++ b/sycl/plugins/rocm/pi_rocm.hpp @@ -0,0 +1,717 @@ +//===-- pi_rocm.hpp - ROCM Plugin -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/// \defgroup sycl_pi_rocm ROCM Plugin +/// \ingroup sycl_pi + +/// \file pi_rocm.hpp +/// Declarations for ROCM Plugin. It is the interface between the +/// device-agnostic SYCL runtime layer and underlying ROCM runtime. 
+/// +/// \ingroup sycl_pi_rocm + +#ifndef PI_ROCM_HPP +#define PI_ROCM_HPP + +#include "CL/sycl/detail/pi.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void* hipDevPtr; + +extern "C" { + +/// \cond INGORE_BLOCK_IN_DOXYGEN +pi_result rocm_piContextRetain(pi_context ); +pi_result rocm_piContextRelease(pi_context ); +pi_result rocm_piDeviceRelease(pi_device ); +pi_result rocm_piDeviceRetain(pi_device ); +pi_result rocm_piProgramRetain(pi_program ); +pi_result rocm_piProgramRelease(pi_program ); +pi_result rocm_piQueueRelease(pi_queue); +pi_result rocm_piQueueRetain(pi_queue); +pi_result rocm_piMemRetain(pi_mem); +pi_result rocm_piMemRelease(pi_mem); +pi_result rocm_piKernelRetain(pi_kernel); +pi_result rocm_piKernelRelease(pi_kernel); +/// \endcond +} + +/// A PI platform stores all known PI devices, +/// in the ROCM plugin this is just a vector of +/// available devices since initialization is done +/// when devices are used. +/// +struct _pi_platform { + std::vector> devices_; +}; + +/// PI device mapping to a hipDevice_t. +/// Includes an observer pointer to the platform, +/// and implements the reference counting semantics since +/// ROCM objects are not refcounted. +/// +class _pi_device { + using native_type = hipDevice_t; + + native_type cuDevice_; + std::atomic_uint32_t refCount_; + pi_platform platform_; + +public: + _pi_device(native_type cuDevice, pi_platform platform) + : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {} + + native_type get() const noexcept { return cuDevice_; }; + + pi_uint32 get_reference_count() const noexcept { return refCount_; } + + pi_platform get_platform() const noexcept { return platform_; }; +}; + +/// PI context mapping to a ROCM context object. 
+/// +/// There is no direct mapping between a ROCM context and a PI context, +/// main differences described below: +/// +/// ROCM context vs PI context +/// +/// One of the main differences between the PI API and the ROCM driver API is +/// that the second modifies the state of the threads by assigning +/// `hipCtx_t` objects to threads. `hipCtx_t` objects store data associated +/// with a given device and control access to said device from the user side. +/// PI API context are objects that are passed to functions, and not bound +/// to threads. +/// The _pi_context object doesn't implement this behavior, only holds the +/// ROCM context data. The RAII object \ref ScopedContext implements the active +/// context behavior. +/// +/// Primary vs User-defined context +/// +/// ROCM has two different types of context, the Primary context, +/// which is usable by all threads on a given process for a given device, and +/// the aforementioned custom contexts. +/// ROCM documentation, and performance analysis, indicates it is recommended +/// to use Primary context whenever possible. +/// Primary context is used as well by the ROCM Runtime API. +/// For PI applications to interop with ROCM Runtime API, they have to use +/// the primary context - and make that active in the thread. +/// The `_pi_context` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `user-defined` context, so that +/// the PI object interface is always the same. +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the PI Context can store a number of callback functions that will be +/// called upon destruction of the PI Context. +/// See proposal for details. 
+/// +struct _pi_context { + + struct deleter_data { + pi_context_extended_deleter function; + void *user_data; + + void operator()() { function(user_data); } + }; + + using native_type = hipCtx_t; + + enum class kind { primary, user_defined } kind_; + native_type hipContext_; + _pi_device *deviceId_; + std::atomic_uint32_t refCount_; + + hipEvent_t evBase_; // ROCM event used as base counter + + _pi_context(kind k, hipCtx_t ctxt, _pi_device *devId) + : kind_{k}, hipContext_{ctxt}, deviceId_{devId}, refCount_{1}, + evBase_(nullptr) { + rocm_piDeviceRetain(deviceId_); + }; + + ~_pi_context() { rocm_piDeviceRelease(deviceId_); } + + void invoke_extended_deleters() { + std::lock_guard guard(mutex_); + for (auto &deleter : extended_deleters_) { + deleter(); + } + } + + void set_extended_deleter(pi_context_extended_deleter function, + void *user_data) { + std::lock_guard guard(mutex_); + extended_deleters_.emplace_back(deleter_data{function, user_data}); + } + + pi_device get_device() const noexcept { return deviceId_; } + + native_type get() const noexcept { return hipContext_; } + + bool is_primary() const noexcept { return kind_ == kind::primary; } + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } + +private: + std::mutex mutex_; + std::vector extended_deleters_; +}; + +/// PI Mem mapping to ROCM memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the ROCM backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. 
+struct _pi_mem { + + // TODO: Move as much shared data up as possible + using pi_context = _pi_context *; + + // Context where the memory object is accessibles + pi_context context_; + + /// Reference counting of the handler + std::atomic_uint32_t refCount_; + enum class mem_type { buffer, surface } mem_type_; + + /// A PI Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In ROCM their API handlers are different. Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// This union allows implementation to use either from the same handler. + union mem_ { + // Handler for plain, pointer-based ROCM allocations + struct buffer_mem_ { + using native_type = hipDeviceptr_t; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + pi_mem parent_; + // ROCM handler for the pointer + native_type ptr_; + + /// Pointer associated with this device on the host + void *hostPtr_; + /// Size of the allocation in bytes + size_t size_; + /// Offset of the active mapped region. 
+ size_t mapOffset_; + /// Pointer to the active mapped region, if any + void *mapPtr_; + /// Original flags for the mapped region + pi_map_flags mapFlags_; + + /** alloc_mode + * classic: Just a normal buffer allocated on the device via rocm malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class alloc_mode { + classic, + use_host_ptr, + copy_in, + alloc_host_ptr + } allocMode_; + + native_type get() const noexcept { return ptr_; } + + size_t get_size() const noexcept { return size_; } + + void *get_map_ptr() const noexcept { return mapPtr_; } + + size_t get_map_offset(void *ptr) const noexcept { return mapOffset_; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the ROCM allocation. + /// + void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept { + assert(mapPtr_ == nullptr); + mapOffset_ = offset; + mapFlags_ = flags; + if (hostPtr_) { + mapPtr_ = static_cast(hostPtr_) + offset; + } else { + // TODO: Allocate only what is needed based on the offset + mapPtr_ = static_cast(malloc(this->get_size())); + } + return mapPtr_; + } + + /// Detach the allocation from the host memory. + void unmap(void *ptr) noexcept { + assert(mapPtr_ != nullptr); + + if (mapPtr_ != hostPtr_) { + free(mapPtr_); + } + mapPtr_ = nullptr; + mapOffset_ = 0; + } + + pi_map_flags get_map_flags() const noexcept { + assert(mapPtr_ != nullptr); + return mapFlags_; + } + } buffer_mem_; + + // Handler data for surface object (i.e. 
Images) + struct surface_mem_ { + hipArray array_; + hipSurfaceObject_t surfObj_; + pi_mem_type imageType_; + + hipArray get_array() const noexcept { return array_; } + + hipSurfaceObject_t get_surface() const noexcept { return surfObj_; } + + pi_mem_type get_image_type() const noexcept { return imageType_; } + } surface_mem_; + } mem_; + + /// Constructs the PI MEM handler for a non-typed allocation ("buffer") + _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, + hipDeviceptr_t ptr, void *host_ptr, size_t size) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { + mem_.buffer_mem_.ptr_ = ptr; + mem_.buffer_mem_.parent_ = parent; + mem_.buffer_mem_.hostPtr_ = host_ptr; + mem_.buffer_mem_.size_ = size; + mem_.buffer_mem_.mapOffset_ = 0; + mem_.buffer_mem_.mapPtr_ = nullptr; + mem_.buffer_mem_.mapFlags_ = PI_MAP_WRITE; + mem_.buffer_mem_.allocMode_ = mode; + if (is_sub_buffer()) { + rocm_piMemRetain(mem_.buffer_mem_.parent_); + } else { + rocm_piContextRetain(context_); + } + }; + + + /// Constructs the PI allocation for an Image object + _pi_mem(pi_context ctxt, hipArray array, hipSurfaceObject_t surf, + pi_mem_type image_type, void *host_ptr) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { + mem_.surface_mem_.array_ = array; + mem_.surface_mem_.imageType_ = image_type; + mem_.surface_mem_.surfObj_ = surf; + rocm_piContextRetain(context_); + } + + + ~_pi_mem() { + if (mem_type_ == mem_type::buffer) { + if (is_sub_buffer()) { + rocm_piMemRelease(mem_.buffer_mem_.parent_); + return; + } + } + rocm_piContextRelease(context_); + } + + // TODO: Move as many shared funcs up as possible + bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + + bool is_sub_buffer() const noexcept { + return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + } + + bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + + pi_context get_context() const noexcept { return context_; } + + 
pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } +}; + +/// PI queue mapping on to hipStream_t objects. +/// +struct _pi_queue { + using native_type = hipStream_t; + + native_type stream_; + _pi_context *context_; + _pi_device *device_; + pi_queue_properties properties_; + std::atomic_uint32_t refCount_; + std::atomic_uint32_t eventCount_; + + _pi_queue(hipStream_t stream, _pi_context *context, _pi_device *device, + pi_queue_properties properties) + : stream_{stream}, context_{context}, device_{device}, + properties_{properties}, refCount_{1}, eventCount_{0} { + rocm_piContextRetain(context_); + rocm_piDeviceRetain(device_); + } + + ~_pi_queue() { + rocm_piContextRelease(context_); + rocm_piDeviceRelease(device_); + } + + native_type get() const noexcept { return stream_; }; + + _pi_context *get_context() const { return context_; }; + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } + + pi_uint32 get_next_event_id() noexcept { return ++eventCount_; } +}; + +typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, + void *userData); +/// PI Event mapping to hipEvent_t +/// +class _pi_event { +public: + using native_type = hipEvent_t; + + pi_result record(); + + pi_result wait(); + + pi_result start(); + + native_type get() const noexcept { return evEnd_; }; + + pi_queue get_queue() const noexcept { return queue_; } + + pi_command_type get_command_type() const noexcept { return commandType_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } + + bool is_recorded() const noexcept { return isRecorded_; } + + bool is_started() const noexcept { return isStarted_; } + + bool is_completed() const 
noexcept { return isCompleted_; }; + + pi_int32 get_execution_status() const noexcept { + + if (!is_recorded()) { + return PI_EVENT_SUBMITTED; + } + + if (!is_completed()) { + return PI_EVENT_RUNNING; + } + return PI_EVENT_COMPLETE; + } + + pi_context get_context() const noexcept { return context_; }; + + pi_uint32 increment_reference_count() { return ++refCount_; } + + pi_uint32 decrement_reference_count() { return --refCount_; } + + pi_uint32 get_event_id() const noexcept { return eventId_; } + + // Returns the counter time when the associated command(s) were enqueued + // + pi_uint64 get_queued_time() const; + + // Returns the counter time when the associated command(s) started execution + // + pi_uint64 get_start_time() const; + + // Returns the counter time when the associated command(s) completed + // + pi_uint64 get_end_time() const; + + // construct a native ROCM. This maps closely to the underlying ROCM event. + static pi_event make_native(pi_command_type type, pi_queue queue) { + return new _pi_event(type, queue->get_context(), queue); + } + + pi_result release(); + + ~_pi_event(); + +private: + // This constructor is private to force programmers to use the make_native / + // make_user static members in order to create a pi_event for ROCM. + _pi_event(pi_command_type type, pi_context context, pi_queue queue); + + pi_command_type commandType_; // The type of command associated with event. + + std::atomic_uint32_t refCount_; // Event reference count. + + bool isCompleted_; // Signifies whether the operations have completed + // + + bool isRecorded_; // Signifies wether a native ROCM event has been recorded + // yet. + bool isStarted_; // Signifies wether the operation associated with the + // PI event has started or not + // + + pi_uint32 eventId_; // Queue identifier of the event. + + native_type evEnd_; // ROCM event handle. If this _pi_event represents a user + // event, this will be nullptr. 
+ + native_type evStart_; // ROCM event handle associated with the start + + native_type evQueued_; // ROCM event handle associated with the time + // the command was enqueued + + pi_queue queue_; // pi_queue associated with the event. If this is a user + // event, this will be nullptr. + + pi_context context_; // pi_context associated with the event. If this is a + // native event, this will be the same context associated + // with the queue_ member. +}; + +/// Implementation of PI Program on ROCM Module object +/// +struct _pi_program { + using native_type = hipModule_t; + native_type module_; + const char *binary_; + size_t binarySizeInBytes_; + std::atomic_uint32_t refCount_; + _pi_context *context_; + + constexpr static size_t MAX_LOG_SIZE = 8192u; + + char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; + std::string buildOptions_; + pi_program_build_status buildStatus_ = PI_PROGRAM_BUILD_STATUS_NONE; + + _pi_program(pi_context ctxt); + ~_pi_program(); + + pi_result set_binary(const char *binary, size_t binarySizeInBytes); + + pi_result build_program(const char* build_options); + + pi_context get_context() const { return context_; }; + + native_type get() const noexcept { return module_; }; + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } +}; + +/// Implementation of a PI Kernel for ROCM +/// +/// PI Kernels are used to set kernel arguments, +/// creating a state on the Kernel object for a given +/// invocation. This is not the case of HIPFunction objects, +/// which are simply passed together with the arguments on the invocation. +/// The PI Kernel implementation for ROCM stores the list of arguments, +/// argument sizes and offsets to emulate the interface of PI Kernel, +/// saving the arguments for the later dispatch. 
+/// Note that in PI API, the Local memory is specified as a size per +/// individual argument, but in ROCM only the total usage of shared +/// memory is required since it is not passed as a parameter. +/// A compiler pass converts the PI API local memory model into the +/// ROCM shared model. This object simply calculates the total of +/// shared memory, and the initial offsets of each parameter. +/// +struct _pi_kernel { + using native_type = hipFunction_t; + + native_type function_; + native_type functionWithOffsetParam_; + std::string name_; + pi_context context_; + pi_program program_; + std::atomic_uint32_t refCount_; + + /// Structure that holds the arguments to the kernel. + /// Note earch argument size is known, since it comes + /// from the kernel signature. + /// This is not something can be queried from the ROCM API + /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) + /// and a storage. + /// + struct arguments { + static constexpr size_t MAX_PARAM_BYTES = 4000u; + using args_t = std::array; + using args_size_t = std::vector; + using args_index_t = std::vector; + args_t storage_; + args_size_t paramSizes_; + args_index_t indices_; + args_size_t offsetPerIndex_; + + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + + /// Adds an argument to the kernel. + /// If the argument existed before, it is replaced. + /// Otherwise, it is added. + /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. 
+ void add_arg(size_t index, size_t size, const void *arg, + size_t localSize = 0) { + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); + // Ensure enough space for the new argument + paramSizes_.resize(index + 1); + offsetPerIndex_.resize(index + 1); + } + paramSizes_[index] = size; + // calculate the insertion point on the array + size_t insertPos = std::accumulate(std::begin(paramSizes_), + std::begin(paramSizes_) + index, 0); + // Update the stored value for the argument + std::memcpy(&storage_[insertPos], arg, size); + indices_[index] = &storage_[insertPos]; + offsetPerIndex_[index] = localSize; + } + + void add_local_arg(size_t index, size_t size) { + size_t localOffset = this->get_local_size(); + add_arg(index, sizeof(size_t), (const void *)&(localOffset), size); + } + + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + + void clear_local_size() { + std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + } + + args_index_t get_indices() const noexcept { return indices_; } + + pi_uint32 get_local_size() const { + return std::accumulate(std::begin(offsetPerIndex_), + std::end(offsetPerIndex_), 0); + } + } args_; + + _pi_kernel(hipFunction_t func, hipFunction_t funcWithOffsetParam, const char *name, + pi_program program, pi_context ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + rocm_piProgramRetain(program_); + rocm_piContextRetain(context_); + } + + _pi_kernel(hipFunction_t func, const char *name, pi_program program, + pi_context ctxt) + : _pi_kernel{func, nullptr, name, program, ctxt} {} + + ~_pi_kernel() + { + rocm_piProgramRelease(program_); + rocm_piContextRelease(context_); + } + + pi_program get_program() const noexcept { return 
program_; } + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } + + native_type get() const noexcept { return function_; }; + + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != nullptr; + } + + pi_context get_context() const noexcept { return context_; }; + + const char *get_name() const noexcept { return name_.c_str(); } + + /// Returns the number of arguments, excluding the implicit global offset. + /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the ROCM Driver API + pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } + + void set_kernel_arg(int index, size_t size, const void *arg) { + args_.add_arg(index, size, arg); + } + + void set_kernel_local_arg(int index, size_t size) { + args_.add_local_arg(index, size); + } + + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + args_.set_implicit_offset(size, implicitOffset); + } + + arguments::args_index_t get_arg_indices() const { + return args_.get_indices(); + } + + pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); } + + void clear_local_size() { args_.clear_local_size(); } +}; + +/// Implementation of samplers for ROCM +/// +/// Sampler property layout: +/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct _pi_sampler { + std::atomic_uint32_t refCount_; + pi_uint32 props_; + pi_context context_; + + _pi_sampler(pi_context context) + : refCount_(1), props_(0), context_(context) {} + + pi_uint32 increment_reference_count() noexcept { return ++refCount_; } + + pi_uint32 decrement_reference_count() noexcept { return --refCount_; } + + pi_uint32 get_reference_count() const noexcept { return refCount_; } +}; + +// ------------------------------------------------------------- +// Helper types and functions +// + +#endif // PI_ROCM_HPP diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7da8d346e8d3b..42330e806f62d 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -80,12 +80,17 @@ function(add_sycl_rt_library LIB_NAME) ${CMAKE_THREAD_LIBS_INIT} PUBLIC $<$:pi_cuda> + $<$:pi_rocm> ) target_compile_definitions(${LIB_OBJ_NAME} PUBLIC $<$:USE_PI_CUDA>) + target_compile_definitions(${LIB_OBJ_NAME} + PUBLIC + $<$:USE_PI_ROCM>) + add_common_options(${LIB_NAME} ${LIB_OBJ_NAME}) set_target_properties(${LIB_NAME} PROPERTIES diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 8f54271e260f6..71f7b61dd82d8 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -122,11 +122,12 @@ template <> class SYCLConfig { return BackendPtr; const char *ValStr = BaseT::getRawValue(); - const std::array, 4> SyclBeMap = { + const std::array, 5> SyclBeMap = { {{"PI_OPENCL", backend::opencl}, {"PI_LEVEL_ZERO", backend::level_zero}, {"PI_LEVEL0", backend::level_zero}, // for backward compatibility - {"PI_CUDA", backend::cuda}}}; + {"PI_CUDA", backend::cuda}, + {"PI_ROCM", backend::rocm}}}; if (ValStr) { auto It = std::find_if( std::begin(SyclBeMap), std::end(SyclBeMap), @@ -135,7 +136,7 @@ template <> class SYCLConfig { }); if (It == SyclBeMap.end()) pi::die("Invalid backend. 
" - "Valid values are PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA"); + "Valid values are PI_OPENCL/PI_LEVEL_ZERO/PI_CUDA/PI_ROCM"); static backend Backend = It->second; BackendPtr = &Backend; } diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index f5a393d2c0d82..502b7300ae1cf 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -222,11 +222,13 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_LEVEL_ZERO_PLUGIN_NAME, backend::level_zero); PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); + PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); } else { std::vector Filters = FilterList->get(); bool OpenCLFound = false; bool LevelZeroFound = false; bool CudaFound = false; + bool RocmFound = false; for (const device_filter &Filter : Filters) { backend Backend = Filter.Backend; if (!OpenCLFound && @@ -244,6 +246,10 @@ bool findPlugins(vector_class> &PluginNames) { PluginNames.emplace_back(__SYCL_CUDA_PLUGIN_NAME, backend::cuda); CudaFound = true; } + if (!RocmFound && (Backend == backend::rocm || Backend == backend::all)) { + PluginNames.emplace_back(__SYCL_ROCM_PLUGIN_NAME, backend::rocm); + RocmFound = true; + } } } return true; @@ -348,6 +354,11 @@ static void initializePlugins(vector_class *Plugins) { // Use the CUDA plugin as the GlobalPlugin GlobalPlugin = std::make_shared(PluginInformation, backend::cuda, Library); + } else if (InteropBE == backend::rocm && + PluginNames[I].first.find("rocm") != std::string::npos) { + // Use the ROCM plugin as the GlobalPlugin + GlobalPlugin = + std::make_shared(PluginInformation, backend::rocm, Library); } else if (InteropBE == backend::level_zero && PluginNames[I].first.find("level_zero") != std::string::npos) { // Use the LEVEL_ZERO plugin as the GlobalPlugin diff --git a/sycl/tools/CMakeLists.txt b/sycl/tools/CMakeLists.txt index eb724c70a5c58..ea6032ad86318 100644 --- a/sycl/tools/CMakeLists.txt +++ b/sycl/tools/CMakeLists.txt @@ -21,8 
+21,13 @@ target_link_libraries(get_device_count_by_type OpenCL-ICD ${LEVEL_ZERO_LIBRARY} $<$:cudadrv> + $<$:rocmdrv> ) target_compile_definitions(get_device_count_by_type PRIVATE $<$:USE_PI_CUDA> ) +target_compile_definitions(get_device_count_by_type + PRIVATE + $<$:USE_PI_ROCM> +) From c6338abe93871487c12cfa1bfc22135fcb1bcb49 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 26 May 2021 03:03:41 +0000 Subject: [PATCH 02/18] enable amd gpu --- buildbot/configure.py | 10 +- clang/lib/CodeGen/CGBuiltin.cpp | 3 +- clang/lib/CodeGen/CGCall.cpp | 2 +- clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- clang/lib/Driver/ToolChains/HIP.cpp | 7 +- sycl/doc/GetStartedGuide.md | 78 ++++-- sycl/include/CL/sycl/backend_types.hpp | 2 +- ...cm_definitions.hpp => hip_definitions.hpp} | 6 +- sycl/plugins/rocm/pi_rocm.cpp | 242 +++--------------- sycl/source/CMakeLists.txt | 4 - 10 files changed, 109 insertions(+), 247 deletions(-) rename sycl/include/CL/sycl/detail/{rocm_definitions.hpp => hip_definitions.hpp} (82%) diff --git a/buildbot/configure.py b/buildbot/configure.py index b6b92cc23c78b..46310723d342f 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -25,7 +25,7 @@ def do_configure(args): llvm_enable_projects = 'clang;' + llvm_external_projects libclc_targets_to_build = '' sycl_build_pi_cuda = 'OFF' - sycl_build_pi_cuda = 'OFF' + sycl_build_pi_hip = 'OFF' sycl_werror = 'ON' llvm_enable_assertions = 'ON' llvm_enable_doxygen = 'OFF' @@ -44,12 +44,12 @@ def do_configure(args): libclc_targets_to_build = 'nvptx64--;nvptx64--nvidiacl' sycl_build_pi_cuda = 'ON' - if args.rocm: + if args.hip: llvm_targets_to_build += ';AMDGPU' # TODO llvm_enable_projects += ';libclc;lld' libclc_targets_to_build = 'amdgcn--;amdgcn--amdhsa' - sycl_build_pi_rocm = 'ON' + sycl_build_pi_hip = 'ON' if args.no_werror: sycl_werror = 'OFF' @@ -82,7 +82,7 @@ def do_configure(args): "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects), 
"-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DSYCL_BUILD_PI_CUDA={}".format(sycl_build_pi_cuda), - "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_rocm), + "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_hip), "-DLLVM_BUILD_TOOLS=ON", "-DSYCL_ENABLE_WERROR={}".format(sycl_werror), "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), @@ -150,7 +150,7 @@ def main(): parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") - parser.add_argument("--rocm", action='store_true', help="swith from OpenCL to ROCM") + parser.add_argument("--hip", action='store_true', help="swith from OpenCL to ROCM") parser.add_argument("--arm", action='store_true', help="build ARM support rather than x86") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 25ee7268c6f75..06309c7bf2221 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4945,7 +4945,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BIprintf: if (getTarget().getTriple().isNVPTX()) return EmitNVPTXDevicePrintfCallExpr(E, ReturnValue); - if (getTarget().getTriple().getArch() == Triple::amdgcn) + if (getTarget().getTriple().getArch() == Triple::amdgcn && + getLangOpts().HIP) return EmitAMDGPUDevicePrintfCallExpr(E, ReturnValue); break; case Builtin::BI__builtin_canonicalize: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index be2d52dccbbb4..1604ce1d8f89b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -760,7 +760,7 @@ CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType, unsigned CC = 
ClangCallConvToLLVMCallConv(info.getCC()); // This is required so SYCL kernels are successfully processed by tools from CUDA. Kernels // with a `spir_kernel` calling convention are ignored otherwise. - if (CC == llvm::CallingConv::SPIR_KERNEL && (CGM.getTriple().isNVPTX() || CGM.getTriple().isNVPTX()) && + if (CC == llvm::CallingConv::SPIR_KERNEL && (CGM.getTriple().isNVPTX() || CGM.getTriple().isAMDGCN()) && getContext().getLangOpts().SYCLIsDevice) { CC = llvm::CallingConv::C; } diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 751b8df577647..33ddd5bf9b463 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -166,7 +166,7 @@ void AMDGCN::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC = static_cast(TC); - std::string GPUArch = Args.getLastArgValue(options::OPT_march_EQ).str(); + std::string Args.getLastArgValue(options::OPT_march_EQ).str(); if (GPUArch.empty()) { if (!checkSystemForAMDGPU(Args, AMDGPUOpenMPTC, GPUArch)) return; diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 2e0545fe10c7c..d24e298d27ae2 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -245,9 +245,6 @@ void HIPToolChain::addClangTargetOptions( "Only HIP offloading kinds are supported for GPUs."); StringRef GpuArch = getGPUArch(DriverArgs); - if(GpuArch.empty()) { - GpuArch = "gfx906"; - } CC1Args.push_back("-fcuda-is-device"); @@ -463,7 +460,9 @@ HIPToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { return {}; } StringRef GpuArch = getGPUArch(DriverArgs); - assert(!GpuArch.empty() && "Must have an explicit GPU arch."); + if(GpuArch.empty()) { + GpuArch = "gfx906"; + } (void)GpuArch; auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind); 
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 1b415661a814e..d9d835799d27f 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -5,23 +5,34 @@ and a wide range of compute accelerators such as GPU and FPGA. ## Table of contents -* [Prerequisites](#prerequisites) - * [Create DPC++ workspace](#create-dpc-workspace) -* [Build DPC++ toolchain](#build-dpc-toolchain) - * [Build DPC++ toolchain with libc++ library](#build-dpc-toolchain-with-libc-library) - * [Build DPC++ toolchain with support for NVIDIA CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda) - * [Build Doxygen documentation](#build-doxygen-documentation) -* [Use DPC++ toolchain](#use-dpc-toolchain) - * [Install low level runtime](#install-low-level-runtime) - * [Obtain prerequisites for ahead of time (AOT) compilation](#obtain-prerequisites-for-ahead-of-time-aot-compilation) - * [Test DPC++ toolchain](#test-dpc-toolchain) - * [Run simple DPC++ application](#run-simple-dpc-application) - * [Code the program for a specific GPU](#code-the-program-for-a-specific-gpu) - * [Using the DPC++ toolchain on CUDA platforms](#using-the-dpc-toolchain-on-cuda-platforms) -* [C++ standard](#c-standard) -* [Known Issues and Limitations](#known-issues-and-limitations) -* [CUDA back-end limitations](#cuda-back-end-limitations) -* [Find More](#find-more) +- [Getting Started with oneAPI DPC++](#getting-started-with-oneapi-dpc) + - [Table of contents](#table-of-contents) + - [Prerequisites](#prerequisites) + - [Create DPC++ workspace](#create-dpc-workspace) + - [Build DPC++ toolchain](#build-dpc-toolchain) + - [Build DPC++ toolchain with libc++ library](#build-dpc-toolchain-with-libc-library) + - [Build DPC++ toolchain with support for NVIDIA CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda) + - [Build DPC++ toolchain with support for AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm) + - [Build Doxygen documentation](#build-doxygen-documentation) + - 
[Deployment](#deployment) + - [Use DPC++ toolchain](#use-dpc-toolchain) + - [Install low level runtime](#install-low-level-runtime) + - [Obtain prerequisites for ahead of time (AOT) compilation](#obtain-prerequisites-for-ahead-of-time-aot-compilation) + - [GPU](#gpu) + - [CPU](#cpu) + - [Accelerator](#accelerator) + - [Test DPC++ toolchain](#test-dpc-toolchain) + - [Run in-tree LIT tests](#run-in-tree-lit-tests) + - [Run DPC++ E2E test suite](#run-dpc-e2e-test-suite) + - [Run Khronos\* SYCL\* conformance test suite (optional)](#run-khronos-sycl-conformance-test-suite-optional) + - [Run simple DPC++ application](#run-simple-dpc-application) + - [Code the program for a specific GPU](#code-the-program-for-a-specific-gpu) + - [Using the DPC++ toolchain on CUDA platforms](#using-the-dpc-toolchain-on-cuda-platforms) + - [C++ standard](#c-standard) + - [Known Issues and Limitations](#known-issues-and-limitations) + - [CUDA back-end limitations](#cuda-back-end-limitations) + - [ROCm back-end limitations](#rocm-back-end-limitations) + - [Find More](#find-more) ## Prerequisites @@ -98,6 +109,7 @@ flags can be found by launching the script with `--help`): * `--system-ocl` -> Don't download OpenCL headers and library via CMake but use the system ones * `--no-werror` -> Don't treat warnings as errors when compiling llvm * `--cuda` -> use the cuda backend (see [Nvidia CUDA](#build-dpc-toolchain-with-support-for-nvidia-cuda)) +* `--rocm` -> use the rocm backend (see [AMD ROCm](#build-dpc-toolchain-with-support-for-amd-rocm)) * `--shared-libs` -> Build shared libraries * `-t` -> Build type (debug or release) * `-o` -> Path to build directory @@ -148,6 +160,22 @@ a Titan RTX GPU (SM 71), but it should work on any GPU compatible with SM 50 or above. The default SM for the NVIDIA CUDA backend is 5.0. Users can specify lower values, but some features may not be supported. +### Build DPC++ toolchain with support for AMD ROCm +There is experimental support for DPC++ for ROCm devices. 
+ +To enable support for ROCm devices, follow the instructions for the Linux +DPC++ toolchain, but add the `--rocm` flag to `configure.py` + +Enabling this flag requires an installation of +ROCm 4.1.0 on the system, refer to +[AMD ROCm Installation Guide for Linux](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html). + +Currently, the only combination tested is Ubuntu 18.04 with ROCm 4.1.0 using a Vega20 gfx906. + +[LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMD GPU compilation chain. The AMDGPU backend generates a standard ELF [ELF] relocatable code object that can be linked by lld to produce a standard ELF shared code object which can be loaded and executed on an AMDGPU target. SO if you want to support AMD ROCm, you should also build the lld project. +[LLD Build Guide](https://lld.llvm.org/) + + ### Build Doxygen documentation Building Doxygen documentation is similar to building the product itself. First, @@ -489,6 +517,13 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \ simple-sycl-app.cpp -o simple-sycl-app-cuda.exe ``` +When building for ROCm, use the ROCm target triple as follows: + +```bash +clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \ + simple-sycl-app.cpp -o simple-sycl-app-cuda.exe +``` + To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices, specify the target architecture: @@ -519,7 +554,8 @@ execution, so SYCL runtime will use `default_selector` logic to select one of accelerators available in the system or SYCL host device. In this case, the behavior of the `default_selector` can be altered using the `SYCL_BE` environment variable, setting `PI_CUDA` forces -the usage of the CUDA backend (if available), `PI_OPENCL` will +the usage of the CUDA backend (if available), `PI_ROCm` forces +the usage of the ROCm backend (if available), `PI_OPENCL` will force the usage of the OpenCL backend. 
```bash @@ -663,6 +699,12 @@ which contains all the symbols required. * The NVIDIA OpenCL headers conflict with the OpenCL headers required for this project and may cause compilation issues on some platforms +### ROCm back-end limitations + +* Backend is only supported on Linux +* The only combination tested is Ubuntu 18.04 with ROCm 4.1 using a Vega20 gfx906 +* Judging from the current [test](https://github.com/zjin-lcf/oneAPI-DirectProgramming) results, there is still a lot of room for improvement in ROCm back-end support. The current problems include three aspects. The first one is at compile time: the `barrier` and `atomic` keywords are not supported. The second is at runtime: when calling `hipMemcpyDtoHAsync` ROCm API, the program will cause an exception if the input data size is too large. The third is calculation accuracy: the ROCm backend has obvious errors in the calculation results of some float type operators + ## Find More * DPC++ specification: diff --git a/sycl/include/CL/sycl/backend_types.hpp b/sycl/include/CL/sycl/backend_types.hpp index 77009d0cca07a..e3a32fcf21dc0 100644 --- a/sycl/include/CL/sycl/backend_types.hpp +++ b/sycl/include/CL/sycl/backend_types.hpp @@ -23,8 +23,8 @@ enum class backend : char { opencl = 1, level_zero = 2, cuda = 3, + all = 5, rocm = 4, - all = 5 }; template struct interop; diff --git a/sycl/include/CL/sycl/detail/rocm_definitions.hpp b/sycl/include/CL/sycl/detail/hip_definitions.hpp similarity index 82% rename from sycl/include/CL/sycl/detail/rocm_definitions.hpp rename to sycl/include/CL/sycl/detail/hip_definitions.hpp index 288929ef735f5..1f3e86c18f50e 100644 --- a/sycl/include/CL/sycl/detail/rocm_definitions.hpp +++ b/sycl/include/CL/sycl/detail/hip_definitions.hpp @@ -8,12 +8,12 @@ #pragma once -// CUDA backend specific options +// HIP backend specific options // TODO: Use values that won't overlap with others -// Mem Object info: Retrieve the raw CUDA pointer from a cl_mem +// Mem Object info: Retrieve the raw HIP 
pointer from a cl_mem #define __SYCL_PI_HIP_RAW_POINTER (0xFF01) -// Context creation: Use a primary CUDA context instead of a custom one by +// Context creation: Use a primary HIP context instead of a custom one by // providing a property value of PI_TRUE for the following // property ID. #define __SYCL_PI_CONTEXT_PROPERTIES_HIP_PRIMARY (0xFF02) diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index bf2e2e399849c..8d6e586341099 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -11,7 +11,7 @@ /// /// \ingroup sycl_pi_rocm -#include +#include #include #include #include @@ -43,8 +43,6 @@ pi_result map_error(hipError_t result) { switch (result) { case hipSuccess: return PI_SUCCESS; - //case HIP_ERROR_NOT_PERMITTED: - // return PI_INVALID_OPERATION; case hipErrorInvalidContext: return PI_INVALID_CONTEXT; case hipErrorInvalidDevice: @@ -247,36 +245,6 @@ int getAttribute(pi_device device, hipDeviceAttribute_t attribute) { } /// \endcond -// Determine local work sizes that result in uniform work groups. -// The default threadsPerBlock only require handling the first work_dim -// dimension. -/* -void guessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int recommendedBlockSize, minGrid; - - PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( - &minGrid, &recommendedBlockSize, kernel->get(), - 0, 0)); - - (void)minGrid; // Not used, avoid warnings - - threadsPerBlock[0] = - std::min(static_cast(maxThreadsPerBlock[0]), - std::min(static_cast(global_work_size[0]), - static_cast(recommendedBlockSize))); - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. 
- while (0u != (global_work_size[0] % threadsPerBlock[0])) { - --threadsPerBlock[0]; - } -} -*/ - void simpleGuessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, const size_t maxThreadsPerBlock[3], pi_kernel kernel) { assert(threadsPerBlock != nullptr); @@ -1509,22 +1477,9 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // system allocator pi_bitfield value = {}; if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) { - // the device suppports coherently accessing pageable memory without - // calling hipMemHostRegister/rocmHostRegister on it - /* - if (getAttribute(device, - HIP_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { - // the link between the device and the host supports native atomic - // operations - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - else */ - { // the link between the device and the host does not support native // atomic operations value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } } return getInfo(param_value_size, param_value, param_value_size_ret, value); } @@ -1970,13 +1925,7 @@ pi_result rocm_piMemGetInfo(pi_mem memObj, cl_mem_info queriedInfo, /// \param[out] nativeHandle Set to the native handle of the PI mem object. /// /// \return PI_SUCCESS -/* -pi_result rocm_piextMemGetNativeHandle(pi_mem mem, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); - return PI_SUCCESS; -} -*/ + /// Created a PI mem object from a HIP mem handle. /// TODO: Implement this. 
@@ -2014,18 +1963,6 @@ pi_result rocm_piQueueCreate(pi_context context, pi_device device, ScopedContext active(context); hipStream_t hipStream; - - /* - unsigned int flags = 0; - - if (properties == __SYCL_PI_HIP_USE_DEFAULT_STREAM) { - flags = hipStreamDefault; - } else if (properties == __SYCL_PI_HIP_SYNC_WITH_DEFAULT) { - flags = 0; - } else { - flags = hipStreamNonBlocking; - } - */ err = PI_CHECK_ERROR(hipStreamCreate(&hipStream)); if (err != PI_SUCCESS) { @@ -2454,12 +2391,6 @@ pi_result rocm_piEnqueueKernelLaunch( } } - /* - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_INVALID_WORK_GROUP_SIZE; - } - */ int blocksPerGrid[3] = {1, 1, 1}; @@ -2505,13 +2436,6 @@ pi_result rocm_piEnqueueKernelLaunch( retImplEv->start(); } - /* - retError = PI_CHECK_ERROR(hipModuleLaunchKernel( - hipFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], - kernel->get_local_size(), hipStream, argIndices.data(), nullptr)); - */ - retError = PI_CHECK_ERROR(hipModuleLaunchKernel( hipFunc, blocksPerGrid[0], 1, 1, threadsPerBlock[0], 1, 1, @@ -2835,72 +2759,6 @@ pi_result rocm_piProgramGetInfo(pi_program program, pi_program_info param_name, return {}; } -/// Creates a new PI program object that is the outcome of linking all input -/// programs. 
-/// \TODO Implement linker options, requires mapping of OpenCL to HIP -/// - -pi_result rocm_piProgramLink(pi_context context, pi_uint32 num_devices, - const pi_device *device_list, const char *options, - pi_uint32 num_input_programs, - const pi_program *input_programs, - void (*pfn_notify)(pi_program program, - void *user_data), - void *user_data, pi_program *ret_program) { - /* - assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(context); - - HIPlinkState state; - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retError = PI_CHECK_ERROR(hipLinkCreate(0, nullptr, nullptr, &state)); - try { - for (size_t i = 0; i < num_input_programs; ++i) { - pi_program program = input_programs[i]; - retError = PI_CHECK_ERROR(hipLinkAddData( - state, HIP_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); - } - void *hipbin = nullptr; - size_t hipbinSize = 0; - retError = PI_CHECK_ERROR(hipLinkComplete(state, &hipbin, &hipbinSize)); - - retError = - retProgram->set_binary(static_cast(hipbin), hipbinSize); - - if (retError != PI_SUCCESS) { - return retError; - } - - retError = retProgram->build_program(options); - - if (retError != PI_SUCCESS) { - return retError; - } - } catch (...) { - // Upon error attempt cleanup - PI_CHECK_ERROR(hipLinkDestroy(state)); - throw; - } - - retError = PI_CHECK_ERROR(hipLinkDestroy(state)); - *ret_program = retProgram.release(); - - } catch (pi_result err) { - retError = err; - } - */ - return PI_SUCCESS; -} - /// Creates a new program that is the outcome of the compilation of the headers /// and the program. 
/// \TODO Implement asynchronous compilation @@ -3851,34 +3709,12 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); break; } - /* + default: { - // HIP has no memset functions that allow setting values more than 4 - // bytes. PI API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 4 byte values, and set the buffer using multiple strided calls. - // This means that one hipMemsetD2D32Async call is made for every 4 bytes - // in the pattern. - - auto number_of_steps = pattern_size / sizeof(uint32_t); - - // we walk up the pattern in 4-byte steps, and call hipMemset for each - // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { - // take 4 bytes of the pattern - auto value = *(static_cast(pattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); - - // set all of the pattern chunks - result = PI_CHECK_ERROR( - hipMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream)); - } - + resutl = PI_INVALID_VALUE; break; } - */ + } if (event) { @@ -4018,22 +3854,18 @@ pi_result rocm_piEnqueueMemImageRead( size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); - /* - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - hipMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, hipStream)); - } else */ { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; + + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, &array, hipMemoryTypeArray, - srcOffset, 
ptr, hipMemoryTypeHost, nullptr); + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, &array, hipMemoryTypeArray, + srcOffset, ptr, hipMemoryTypeHost, nullptr); - if (retErr != PI_SUCCESS) { - return retErr; - } + if (retErr != PI_SUCCESS) { + return retErr; } + if (event) { auto new_event = @@ -4086,21 +3918,18 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); - /* if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - hipMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, hipStream)); - } else */ { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; + + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, ptr, hipMemoryTypeHost, nullptr, - &array, hipMemoryTypeArray, dstOffset); + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, ptr, hipMemoryTypeHost, nullptr, + &array, hipMemoryTypeArray, dstOffset); - if (retErr != PI_SUCCESS) { - return retErr; - } + if (retErr != PI_SUCCESS) { + return retErr; } + if (event) { auto new_event = @@ -4160,24 +3989,19 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t bytesToCopy = elementByteSize * srcArray.NumChannels * region[0]; pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); - /* - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR(hipMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); - } else - */{ - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, 
dst_origin[1], dst_origin[2]}; + + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; + size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; + size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, &srcArray, hipMemoryTypeArray, - srcOffset, &dstArray, hipMemoryTypeArray, dstOffset); + retErr = commonEnqueueMemImageNDCopy( + hipStream, imgType, adjustedRegion, &srcArray, hipMemoryTypeArray, + srcOffset, &dstArray, hipMemoryTypeArray, dstOffset); - if (retErr != PI_SUCCESS) { - return retErr; - } + if (retErr != PI_SUCCESS) { + return retErr; } + if (event) { auto new_event = diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 42330e806f62d..2074e547942b8 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -80,16 +80,12 @@ function(add_sycl_rt_library LIB_NAME) ${CMAKE_THREAD_LIBS_INIT} PUBLIC $<$:pi_cuda> - $<$:pi_rocm> ) target_compile_definitions(${LIB_OBJ_NAME} PUBLIC $<$:USE_PI_CUDA>) - target_compile_definitions(${LIB_OBJ_NAME} - PUBLIC - $<$:USE_PI_ROCM>) add_common_options(${LIB_NAME} ${LIB_OBJ_NAME}) From 06cc4ced0f8baaca936b850aa348a15a5eccc065 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 26 May 2021 05:25:24 +0000 Subject: [PATCH 03/18] enable AMD GPU --- buildbot/configure.py | 12 ++++++------ clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp | 2 +- sycl/plugins/rocm/pi_rocm.cpp | 3 +-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/buildbot/configure.py b/buildbot/configure.py index 46310723d342f..640f65345587f 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -25,7 +25,7 @@ def do_configure(args): llvm_enable_projects = 'clang;' + llvm_external_projects libclc_targets_to_build = '' sycl_build_pi_cuda = 'OFF' - sycl_build_pi_hip = 'OFF' + sycl_build_pi_rocm = 'OFF' sycl_werror = 'ON' llvm_enable_assertions = 'ON' llvm_enable_doxygen = 
'OFF' @@ -44,12 +44,12 @@ def do_configure(args): libclc_targets_to_build = 'nvptx64--;nvptx64--nvidiacl' sycl_build_pi_cuda = 'ON' - if args.hip: + if args.rocm: llvm_targets_to_build += ';AMDGPU' # TODO - llvm_enable_projects += ';libclc;lld' + llvm_enable_projects += ';libclc' libclc_targets_to_build = 'amdgcn--;amdgcn--amdhsa' - sycl_build_pi_hip = 'ON' + sycl_build_pi_rocm = 'ON' if args.no_werror: sycl_werror = 'OFF' @@ -82,7 +82,7 @@ def do_configure(args): "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects), "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DSYCL_BUILD_PI_CUDA={}".format(sycl_build_pi_cuda), - "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_hip), + "-DSYCL_BUILD_PI_ROCM={}".format(sycl_build_pi_rocm), "-DLLVM_BUILD_TOOLS=ON", "-DSYCL_ENABLE_WERROR={}".format(sycl_werror), "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), @@ -150,7 +150,7 @@ def main(): parser.add_argument("-t", "--build-type", metavar="BUILD_TYPE", default="Release", help="build type: Debug, Release") parser.add_argument("--cuda", action='store_true', help="switch from OpenCL to CUDA") - parser.add_argument("--hip", action='store_true', help="swith from OpenCL to ROCM") + parser.add_argument("--rocm", action='store_true', help="swith from OpenCL to ROCM") parser.add_argument("--arm", action='store_true', help="build ARM support rather than x86") parser.add_argument("--no-assertions", action='store_true', help="build without assertions") parser.add_argument("--docs", action='store_true', help="build Doxygen documentation") diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp index 33ddd5bf9b463..751b8df577647 100644 --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -166,7 +166,7 @@ void AMDGCN::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, const toolchains::AMDGPUOpenMPToolChain &AMDGPUOpenMPTC = static_cast(TC); - std::string 
Args.getLastArgValue(options::OPT_march_EQ).str(); + std::string GPUArch = Args.getLastArgValue(options::OPT_march_EQ).str(); if (GPUArch.empty()) { if (!checkSystemForAMDGPU(Args, AMDGPUOpenMPTC, GPUArch)) return; diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index 8d6e586341099..e98802a7071ea 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -3711,7 +3711,7 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } default: { - resutl = PI_INVALID_VALUE; + result = PI_INVALID_VALUE; break; } @@ -4544,7 +4544,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piProgramGetInfo, rocm_piProgramGetInfo) _PI_CL(piProgramCompile, rocm_piProgramCompile) _PI_CL(piProgramBuild, rocm_piProgramBuild) - _PI_CL(piProgramLink, rocm_piProgramLink) _PI_CL(piProgramGetBuildInfo, rocm_piProgramGetBuildInfo) _PI_CL(piProgramRetain, rocm_piProgramRetain) _PI_CL(piProgramRelease, rocm_piProgramRelease) From 4edaf9176775ff95fcc22cd800f45c82ee6dd56b Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 26 May 2021 05:34:34 +0000 Subject: [PATCH 04/18] enable AMD GPU --- clang/lib/Driver/ToolChains/HIP.cpp | 4 +--- sycl/doc/GetStartedGuide.md | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index d24e298d27ae2..b31d69fc1ab4b 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -460,9 +460,7 @@ HIPToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { return {}; } StringRef GpuArch = getGPUArch(DriverArgs); - if(GpuArch.empty()) { - GpuArch = "gfx906"; - } + assert(!GpuArch.empty() && "Must have an explicit GPU arch."); (void)GpuArch; auto Kind = llvm::AMDGPU::parseArchAMDGCN(GpuArch); const StringRef CanonArch = llvm::AMDGPU::getArchNameAMDGCN(Kind); diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 
d9d835799d27f..d51e610fc92ed 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -517,11 +517,11 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \ simple-sycl-app.cpp -o simple-sycl-app-cuda.exe ``` -When building for ROCm, use the ROCm target triple as follows: +When building for ROCm, please note that the option `mcpu` must be specified,use the ROCm target triple as follows, : ```bash clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \ - simple-sycl-app.cpp -o simple-sycl-app-cuda.exe + -mcpu=gfx906 simple-sycl-app.cpp -o simple-sycl-app-cuda.exe ``` To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices, From 578d95e7a87df5b64ae2449037800a48daa6eaac Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 26 May 2021 08:44:55 +0000 Subject: [PATCH 05/18] update Comment and some codes --- buildbot/configure.py | 3 ++- clang/lib/Driver/ToolChains/HIP.cpp | 7 ------- sycl/doc/GetStartedGuide.md | 6 +++--- sycl/include/CL/sycl/backend_types.hpp | 4 ++-- .../CL/sycl/detail/hip_definitions.hpp | 2 +- sycl/plugins/rocm/pi_rocm.cpp | 20 +++++-------------- sycl/source/CMakeLists.txt | 1 - sycl/tools/CMakeLists.txt | 3 --- 8 files changed, 13 insertions(+), 33 deletions(-) diff --git a/buildbot/configure.py b/buildbot/configure.py index 640f65345587f..60ab19ed7f8aa 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -46,7 +46,8 @@ def do_configure(args): if args.rocm: llvm_targets_to_build += ';AMDGPU' - # TODO + # TODO libclc should be added once, + # TODO when we build DPC++ with both CUDA and ROCM support llvm_enable_projects += ';libclc' libclc_targets_to_build = 'amdgcn--;amdgcn--amdhsa' sycl_build_pi_rocm = 'ON' diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index b31d69fc1ab4b..0060f1c7be105 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -353,13 +353,6 @@ 
HIPToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, return DAL; } -/* -Tool *HIPToolChain::buildLinker() const { - assert(getTriple().getArch() == llvm::Triple::amdgcn); - return new tools::AMDGCN::Linker(*this); -} -*/ - Tool *HIPToolChain::buildLinker() const { assert(getTriple().getArch() == llvm::Triple::amdgcn); if (OK == Action::OFK_SYCL) diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index d51e610fc92ed..f3e2de7565aec 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -172,7 +172,7 @@ ROCm 4.1.0 on the system, refer to Currently, the only combination tested is Ubuntu 18.04 with ROCm 4.1.0 using a Vega20 gfx906. -[LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMD GPU compilation chain. The AMDGPU backend generates a standard ELF [ELF] relocatable code object that can be linked by lld to produce a standard ELF shared code object which can be loaded and executed on an AMDGPU target. SO if you want to support AMD ROCm, you should also build the lld project. +[LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMD GPU compilation chain. The AMDGPU backend generates a standard ELF [ELF] relocatable code object that can be linked by lld to produce a standard ELF shared code object which can be loaded and executed on an AMDGPU target. So if you want to support AMD ROCm, you should also build the lld project. 
[LLD Build Guide](https://lld.llvm.org/) @@ -517,7 +517,7 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \ simple-sycl-app.cpp -o simple-sycl-app-cuda.exe ``` -When building for ROCm, please note that the option `mcpu` must be specified,use the ROCm target triple as follows, : +When building for ROCm, please note that the option `mcpu` must be specified, use the ROCm target triple as follows: ```bash clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \ @@ -554,7 +554,7 @@ execution, so SYCL runtime will use `default_selector` logic to select one of accelerators available in the system or SYCL host device. In this case, the behavior of the `default_selector` can be altered using the `SYCL_BE` environment variable, setting `PI_CUDA` forces -the usage of the CUDA backend (if available), `PI_ROCm` forces +the usage of the CUDA backend (if available), `PI_ROCM` forces the usage of the ROCm backend (if available), `PI_OPENCL` will force the usage of the OpenCL backend. diff --git a/sycl/include/CL/sycl/backend_types.hpp b/sycl/include/CL/sycl/backend_types.hpp index e3a32fcf21dc0..4e1c09abf70a2 100644 --- a/sycl/include/CL/sycl/backend_types.hpp +++ b/sycl/include/CL/sycl/backend_types.hpp @@ -23,8 +23,8 @@ enum class backend : char { opencl = 1, level_zero = 2, cuda = 3, - all = 5, - rocm = 4, + all = 4, + rocm = 5, }; template struct interop; diff --git a/sycl/include/CL/sycl/detail/hip_definitions.hpp b/sycl/include/CL/sycl/detail/hip_definitions.hpp index 1f3e86c18f50e..555609139446f 100644 --- a/sycl/include/CL/sycl/detail/hip_definitions.hpp +++ b/sycl/include/CL/sycl/detail/hip_definitions.hpp @@ -1,4 +1,4 @@ -//==------------ cuda_definitions.hpp - SYCL CUDA backend ------------------==// +//==------------ hip_definitions.hpp - SYCL ROCM backend ------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index e98802a7071ea..c2a4bee9fcda1 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -27,7 +27,7 @@ namespace { -std::string getCudaVersionString() { +std::string getHipVersionString() { int driver_version = 0; if (hipDriverGetVersion(&driver_version) != hipSuccess) { return ""; @@ -696,7 +696,7 @@ pi_result rocm_piPlatformGetInfo(pi_platform platform, return getInfo(param_value_size, param_value, param_value_size_ret, "FULL PROFILE"); case PI_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); + auto version = getHipVersionString(); return getInfo(param_value_size, param_value, param_value_size_ret, version.c_str()); } @@ -789,7 +789,7 @@ pi_result rocm_piDevicePartition( pi_device device, const cl_device_partition_property *properties, // TODO: untie from OpenCL pi_uint32 num_devices, pi_device *out_devices, pi_uint32 *out_num_devices) { - return {}; + return PI_INVALID_OPERATION; } /// \return If available, the first binary that is PTX @@ -968,7 +968,6 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, ifp); } case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) int warpSize = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, @@ -1151,7 +1150,6 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, 128u); } case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // https://docs.nvidia.com/rocm/rocm-c-programming-guide/#function-parameters // __global__ function parameters are passed to the device via constant // memory and are limited to 4 KB. 
return getInfo(param_value_size, param_value, param_value_size_ret, @@ -1169,24 +1167,20 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, mem_base_addr_align); } case PI_DEVICE_INFO_HALF_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? return getInfo(param_value_size, param_value, param_value_size_ret, 0u); } case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; return getInfo(param_value_size, param_value, param_value_size_ret, config); } case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; return getInfo(param_value_size, param_value, param_value_size_ret, config); } case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - // TODO: is this config consistent across all NVIDIA GPUs? 
return getInfo(param_value_size, param_value, param_value_size_ret, CL_READ_WRITE_CACHE); } @@ -1324,10 +1318,10 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, } case PI_DEVICE_INFO_VENDOR: { return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); + "AMD Corporation"); } case PI_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); + auto version = getHipVersionString(); return getInfo(param_value_size, param_value, param_value_size_ret, version.c_str()); } @@ -2589,9 +2583,7 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, } // HIP_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/rocm/rocm-driver-api/group__HIP__TEXOBJECT.html // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/rocm/rocm-driver-api/group__HIP__SURFOBJECT.html // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array // handle. 
@@ -4406,8 +4398,6 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, case PI_MEM_ALLOC_TYPE: { unsigned int value; // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue - // TODO hipPointerGetAttribute与CUDA传参不同 - hipError_t ret = hipPointerGetAttributes( &hipPointerAttributeType, ptr); if (ret == hipErrorInvalidValue) { diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 2074e547942b8..7da8d346e8d3b 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -86,7 +86,6 @@ function(add_sycl_rt_library LIB_NAME) PUBLIC $<$:USE_PI_CUDA>) - add_common_options(${LIB_NAME} ${LIB_OBJ_NAME}) set_target_properties(${LIB_NAME} PROPERTIES diff --git a/sycl/tools/CMakeLists.txt b/sycl/tools/CMakeLists.txt index ea6032ad86318..a21ae9c8f9de9 100644 --- a/sycl/tools/CMakeLists.txt +++ b/sycl/tools/CMakeLists.txt @@ -26,8 +26,5 @@ target_link_libraries(get_device_count_by_type target_compile_definitions(get_device_count_by_type PRIVATE $<$:USE_PI_CUDA> -) -target_compile_definitions(get_device_count_by_type - PRIVATE $<$:USE_PI_ROCM> ) From f4bc26f0437d49cd53c8d9f07424de891b85c522 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 26 May 2021 10:06:31 +0000 Subject: [PATCH 06/18] update GetStartedGuide --- sycl/doc/GetStartedGuide.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index f3e2de7565aec..11bea8a659a3e 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -4,9 +4,6 @@ The DPC++ Compiler compiles C++ and SYCL\* source files with code for both CPU and a wide range of compute accelerators such as GPU and FPGA. 
## Table of contents - -- [Getting Started with oneAPI DPC++](#getting-started-with-oneapi-dpc) - - [Table of contents](#table-of-contents) - [Prerequisites](#prerequisites) - [Create DPC++ workspace](#create-dpc-workspace) - [Build DPC++ toolchain](#build-dpc-toolchain) @@ -172,7 +169,10 @@ ROCm 4.1.0 on the system, refer to Currently, the only combination tested is Ubuntu 18.04 with ROCm 4.1.0 using a Vega20 gfx906. -[LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMD GPU compilation chain. The AMDGPU backend generates a standard ELF [ELF] relocatable code object that can be linked by lld to produce a standard ELF shared code object which can be loaded and executed on an AMDGPU target. So if you want to support AMD ROCm, you should also build the lld project. +[LLD](https://llvm.org/docs/AMDGPUUsage.html) is necessary for the AMD GPU compilation chain. +The AMDGPU backend generates a standard ELF [ELF] relocatable code object that can be linked by lld to +produce a standard ELF shared code object which can be loaded and executed on an AMDGPU target. +So if you want to support AMD ROCm, you should also build the lld project. [LLD Build Guide](https://lld.llvm.org/) @@ -701,9 +701,13 @@ which contains all the symbols required. ### ROCm back-end limitations -* Backend is only supported on Linux -* The only combination tested is Ubuntu 18.04 with ROCm 4.1 using a Vega20 gfx906 -* Judging from the current [test](https://github.com/zjin-lcf/oneAPI-DirectProgramming) results, there is still a lot of room for improvement in ROCm back-end support. The current problems include three aspects. The first one is at compile time: the `barrier` and `atomic` keywords are not supported. The second is at runtime: when calling `hipMemcpyDtoHAsync` ROCm API, the program will cause an exception if the input data size is too large. 
The third is calculation accuracy: the ROCm backend has obvious errors in the calculation results of some float type operators +* For supported Operating Systems, please refer to the [Supported Operating Systems](https://github.com/RadeonOpenCompute/ROCm#supported-operating-systems) +* The only combination tested is Ubuntu 18.04 with ROCm 4.1 using a Vega20 gfx906. +* Judging from the current [test](https://github.com/zjin-lcf/oneAPI-DirectProgramming) results, + there is still a lot of room for improvement in ROCm back-end support. The current problems include three aspects. + The first one is at compile time: the `barrier` and `atomic` keywords are not supported. + The second is at runtime: when calling `hipMemcpyDtoHAsync` ROCm API, the program will cause an exception if the input data size is too large. + The third is calculation accuracy: the ROCm backend has obvious errors in the calculation results of some float type operators ## Find More From ddd7b904503cb0831fcb0d2df7eaee937e04a223 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Thu, 27 May 2021 06:47:11 +0000 Subject: [PATCH 07/18] update code and doc --- clang/lib/CodeGen/CGCall.cpp | 3 +- clang/lib/Driver/Driver.cpp | 19 +- clang/lib/Driver/ToolChains/HIP.cpp | 16 +- clang/lib/Driver/ToolChains/HIP.h | 7 +- clang/lib/Driver/ToolChains/SYCL.cpp | 6 +- clang/lib/Frontend/InitPreprocessor.cpp | 4 +- sycl/include/CL/__spirv/spirv_vars.hpp | 2 +- sycl/include/CL/sycl/detail/pi.hpp | 4 +- sycl/plugins/rocm/pi_rocm.cpp | 408 +++++++++++------------- sycl/plugins/rocm/pi_rocm.hpp | 29 +- 10 files changed, 234 insertions(+), 264 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 016a3433bef67..566e082f60494 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -760,7 +760,8 @@ CodeGenTypes::arrangeLLVMFunctionInfo(CanQualType resultType, unsigned CC = ClangCallConvToLLVMCallConv(info.getCC()); // This is required so SYCL 
kernels are successfully processed by tools from CUDA. Kernels // with a `spir_kernel` calling convention are ignored otherwise. - if (CC == llvm::CallingConv::SPIR_KERNEL && (CGM.getTriple().isNVPTX() || CGM.getTriple().isAMDGCN()) && + if (CC == llvm::CallingConv::SPIR_KERNEL && + (CGM.getTriple().isNVPTX() || CGM.getTriple().isAMDGCN()) && getContext().getLangOpts().SYCLIsDevice) { CC = llvm::CallingConv::C; } diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 82be1849b33ae..41a635be3a80d 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -668,7 +668,7 @@ static bool isValidSYCLTriple(llvm::Triple T) { if (T.isNVPTX()) return true; - // AMDGCN is valid for SYCL + // AMDGCN is valid for SYCL if (T.isAMDGCN()) return true; @@ -3839,14 +3839,15 @@ class OffloadingActionBuilder final { Action *finalizeAMDGCNDependences(Action *Input, const llvm::Triple &TT) { auto *BA = C.getDriver().ConstructPhaseAction( C, Args, phases::Backend, Input, AssociatedOffloadKind); - - auto *AA = C.getDriver().ConstructPhaseAction( - C, Args, phases::Assemble, BA, AssociatedOffloadKind); - + + auto *AA = C.getDriver().ConstructPhaseAction(C, Args, phases::Assemble, + BA, AssociatedOffloadKind); + ActionList AL = {AA}; Action *action = C.MakeAction(AL, types::TY_Image); ActionList HIPActions = {action}; - Action *HIPFatBinary = C.MakeAction(HIPActions, types::TY_HIP_FATBIN); + Action *HIPFatBinary = + C.MakeAction(HIPActions, types::TY_HIP_FATBIN); return HIPFatBinary; } @@ -4385,7 +4386,7 @@ class OffloadingActionBuilder final { Action *FinAction = finalizeNVPTXDependences(PostLinkAction, (*TC)->getTriple()); WrapperInputs.push_back(FinAction); - } else if(isAMDGCN) { + } else if (isAMDGCN) { Action *FinAction = finalizeAMDGCNDependences(PostLinkAction, (*TC)->getTriple()); WrapperInputs.push_back(FinAction); @@ -7221,7 +7222,7 @@ const ToolChain &Driver::getOffloadingDeviceToolChain(const ArgList &Args, break; case Action::OFK_HIP: 
TC = std::make_unique( - *this, Target, HostTC, Args, TargetDeviceOffloadKind); + *this, Target, HostTC, Args, TargetDeviceOffloadKind); break; case Action::OFK_OpenMP: // omp + nvptx @@ -7242,7 +7243,7 @@ const ToolChain &Driver::getOffloadingDeviceToolChain(const ArgList &Args, break; case llvm::Triple::amdgcn: TC = std::make_unique( - *this, Target, HostTC, Args, TargetDeviceOffloadKind); + *this, Target, HostTC, Args, TargetDeviceOffloadKind); break; default: break; diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 18538554c76ad..f509e42f05ee5 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -114,7 +114,7 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, std::string OffloadKind = "hip"; // bundle ID equals 'hip' is always right. if (getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) - //OffloadKind = OffloadKind + "v4"; + // OffloadKind = OffloadKind + "v4"; OffloadKind = OffloadKind; for (const auto &II : Inputs) { const auto* A = II.getAction(); @@ -227,7 +227,8 @@ void AMDGCN::Linker::ConstructJob(Compilation &C, const JobAction &JA, } HIPToolChain::HIPToolChain(const Driver &D, const llvm::Triple &Triple, - const ToolChain &HostTC, const ArgList &Args, const Action::OffloadKind OK) + const ToolChain &HostTC, const ArgList &Args, + const Action::OffloadKind OK) : ROCMToolChain(D, Triple, Args), HostTC(HostTC), OK(OK) { // Lookup binaries into the driver directory, this is used to // discover the clang-offload-bundler executable. 
@@ -241,9 +242,9 @@ void HIPToolChain::addClangTargetOptions( HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind); assert((DeviceOffloadingKind == Action::OFK_HIP || - DeviceOffloadingKind == Action::OFK_SYCL) && + DeviceOffloadingKind == Action::OFK_SYCL) && "Only HIP offloading kinds are supported for GPUs."); - + StringRef GpuArch = getGPUArch(DriverArgs); CC1Args.push_back("-fcuda-is-device"); @@ -279,14 +280,17 @@ void HIPToolChain::addClangTargetOptions( CC1Args); } - auto NoLibSpirv = DriverArgs.hasArg(options::OPT_fno_sycl_libspirv, + auto NoLibSpirv = DriverArgs.hasArg(options::OPT_fno_sycl_libspirv, options::OPT_fsycl_device_only); if (DeviceOffloadingKind == Action::OFK_SYCL && !NoLibSpirv) { std::string LibSpirvFile; if (DriverArgs.hasArg(clang::driver::options::OPT_fsycl_libspirv_path_EQ)) { auto ProvidedPath = - DriverArgs.getLastArgValue(clang::driver::options::OPT_fsycl_libspirv_path_EQ).str(); + DriverArgs + .getLastArgValue( + clang::driver::options::OPT_fsycl_libspirv_path_EQ) + .str(); if (llvm::sys::fs::exists(ProvidedPath)) LibSpirvFile = ProvidedPath; } else { diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h index 1c3a832db4074..9b47513dd9789 100644 --- a/clang/lib/Driver/ToolChains/HIP.h +++ b/clang/lib/Driver/ToolChains/HIP.h @@ -55,11 +55,12 @@ class LLVM_LIBRARY_VISIBILITY SYCLLinker : public Linker { public: SYCLLinker(const ToolChain &TC) : Linker(TC) {} - Tool* GetSYCLToolChainLinker() const { + Tool *GetSYCLToolChainLinker() const { if (!SYCLToolChainLinker) SYCLToolChainLinker.reset(new SYCL::Linker(getToolChain())); return SYCLToolChainLinker.get(); } + private: mutable std::unique_ptr SYCLToolChainLinker; }; @@ -72,7 +73,8 @@ namespace toolchains { class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain { public: HIPToolChain(const Driver &D, const llvm::Triple &Triple, - const ToolChain &HostTC, const llvm::opt::ArgList &Args, const Action::OffloadKind OK); + 
const ToolChain &HostTC, const llvm::opt::ArgList &Args, + const Action::OffloadKind OK); const llvm::Triple *getAuxTriple() const override { return &HostTC.getTriple(); @@ -115,7 +117,6 @@ class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain { private: const Action::OffloadKind OK; - }; } // end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 9bac46ae87720..5bb7728a4d317 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -274,7 +274,8 @@ void SYCL::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *LinkingOutput) const { assert((getToolChain().getTriple().isSPIR() || - getToolChain().getTriple().isNVPTX() || getToolChain().getTriple().isAMDGCN()) && + getToolChain().getTriple().isNVPTX() || + getToolChain().getTriple().isAMDGCN()) && "Unsupported target"); std::string SubArchName = @@ -285,7 +286,8 @@ void SYCL::Linker::ConstructJob(Compilation &C, const JobAction &JA, // For CUDA, we want to link all BC files before resuming the normal // compilation path - if (getToolChain().getTriple().isNVPTX() || getToolChain().getTriple().isAMDGCN()) { + if (getToolChain().getTriple().isNVPTX() || + getToolChain().getTriple().isAMDGCN()) { InputInfoList NvptxInputs; for (const auto &II : Inputs) { if (!II.isFilename()) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ec9406f0dabbf..e1bf31dd77068 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1182,9 +1182,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (TI.getTriple().isNVPTX()) { Builder.defineMacro("__SYCL_NVPTX__", "1"); } - + if (TI.getTriple().isAMDGCN()) { - Builder.defineMacro("__SYCL_AMDGCN__", "1"); + Builder.defineMacro("__SYCL_AMDGCN__", "1"); } } if (LangOpts.SYCLUnnamedLambda) diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp 
b/sycl/include/CL/__spirv/spirv_vars.hpp index 228c70181c991..1abf991e393d3 100644 --- a/sycl/include/CL/__spirv/spirv_vars.hpp +++ b/sycl/include/CL/__spirv/spirv_vars.hpp @@ -15,7 +15,7 @@ #define __SPIRV_VAR_QUALIFIERS extern "C" const -#if defined(__SYCL_NVPTX__) || defined(__SYCL_AMDGCN__) +#if defined(__SYCL_NVPTX__) || defined(__SYCL_AMDGCN__) SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x(); SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y(); diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index ef641d3b34aa6..13b8765c0455a 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -61,12 +61,12 @@ bool trace(TraceLevel level); #define __SYCL_OPENCL_PLUGIN_NAME "pi_opencl.dll" #define __SYCL_LEVEL_ZERO_PLUGIN_NAME "pi_level_zero.dll" #define __SYCL_CUDA_PLUGIN_NAME "pi_cuda.dll" -#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.dll" +#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.dll" #else #define __SYCL_OPENCL_PLUGIN_NAME "libpi_opencl.so" #define __SYCL_LEVEL_ZERO_PLUGIN_NAME "libpi_level_zero.so" #define __SYCL_CUDA_PLUGIN_NAME "libpi_cuda.so" -#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.so" +#define __SYCL_ROCM_PLUGIN_NAME "libpi_rocm.so" #endif // Report error and no return (keeps compiler happy about no return statements). 
diff --git a/sycl/plugins/rocm/pi_rocm.cpp b/sycl/plugins/rocm/pi_rocm.cpp index c2a4bee9fcda1..687bd2ed935d8 100644 --- a/sycl/plugins/rocm/pi_rocm.cpp +++ b/sycl/plugins/rocm/pi_rocm.cpp @@ -11,8 +11,8 @@ /// /// \ingroup sycl_pi_rocm -#include #include +#include #include #include @@ -24,8 +24,6 @@ #include #include - - namespace { std::string getHipVersionString() { int driver_version = 0; @@ -245,22 +243,23 @@ int getAttribute(pi_device device, hipDeviceAttribute_t attribute) { } /// \endcond -void simpleGuessLocalWorkSize(int *threadsPerBlock, const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel) { +void simpleGuessLocalWorkSize(int *threadsPerBlock, + const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], + pi_kernel kernel) { assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); - //int recommendedBlockSize, minGrid; + // int recommendedBlockSize, minGrid; - //PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( + // PI_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize( // &minGrid, &recommendedBlockSize, kernel->get(), // 0, 0)); //(void)minGrid; // Not used, avoid warnings - threadsPerBlock[0] = - std::min(static_cast(maxThreadsPerBlock[0]), - static_cast(global_work_size[0])); + threadsPerBlock[0] = std::min(static_cast(maxThreadsPerBlock[0]), + static_cast(global_work_size[0])); // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. 
@@ -339,7 +338,6 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue) rocm_piQueueRetain(queue_); } rocm_piContextRetain(context_); - } _pi_event::~_pi_event() { @@ -371,8 +369,7 @@ pi_uint64 _pi_event::get_queued_time() const { float miliSeconds = 0.0f; assert(is_started()); - PI_CHECK_ERROR( - hipEventElapsedTime(&miliSeconds,evStart_, evEnd_)); + PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, evStart_, evEnd_)); return static_cast(miliSeconds * 1.0e6); } @@ -380,7 +377,8 @@ pi_uint64 _pi_event::get_start_time() const { float miliSeconds = 0.0f; assert(is_started()); - PI_CHECK_ERROR(hipEventElapsedTime(&miliSeconds, context_->evBase_, evStart_)); + PI_CHECK_ERROR( + hipEventElapsedTime(&miliSeconds, context_->evBase_, evStart_)); return static_cast(miliSeconds * 1.0e6); } @@ -475,7 +473,7 @@ pi_result _pi_program::set_binary(const char *source, size_t length) { pi_result _pi_program::build_program(const char *build_options) { this->buildOptions_ = build_options; - + constexpr const unsigned int numberOfOptions = 4u; hipJitOption options[numberOfOptions]; @@ -496,14 +494,13 @@ pi_result _pi_program::build_program(const char *build_options) { auto result = PI_CHECK_ERROR( hipModuleLoadDataEx(&module_, static_cast(binary_), - numberOfOptions, options, optionVals)); - + numberOfOptions, options, optionVals)); const auto success = (result == PI_SUCCESS); - + buildStatus_ = success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; - + // If no exception, result is correct return success ? 
PI_SUCCESS : PI_BUILD_PROGRAM_FAILURE; } @@ -852,8 +849,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int compute_units = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&compute_units, - hipDeviceAttributeMultiprocessorCount, - device->get()) == hipSuccess); + hipDeviceAttributeMultiprocessorCount, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(compute_units >= 0); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint32(compute_units)); @@ -868,17 +865,17 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int max_x = 0, max_y = 0, max_z = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&max_x, hipDeviceAttributeMaxBlockDimX, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(max_x >= 0); cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&max_y, hipDeviceAttributeMaxBlockDimY, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(max_y >= 0); cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&max_z, hipDeviceAttributeMaxBlockDimZ, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(max_z >= 0); return_sizes[0] = size_t(max_x); @@ -891,8 +888,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int max_work_group_size = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&max_work_group_size, - hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); + hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(max_work_group_size >= 0); @@ -946,12 +943,12 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int max_threads = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&max_threads, - hipDeviceAttributeMaxThreadsPerBlock, - device->get()) == hipSuccess); + 
hipDeviceAttributeMaxThreadsPerBlock, + device->get()) == hipSuccess); int warpSize = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); + device->get()) == hipSuccess); int maxWarps = (max_threads + warpSize - 1) / warpSize; return getInfo(param_value_size, param_value, param_value_size_ret, static_cast(maxWarps)); @@ -961,9 +958,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // TODO: Revisit for previous generation GPUs int major = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&major, - hipDeviceAttributeComputeCapabilityMajor, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, + device->get()) == hipSuccess); bool ifp = (major >= 7); return getInfo(param_value_size, param_value, param_value_size_ret, ifp); } @@ -971,7 +967,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int warpSize = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); + device->get()) == hipSuccess); size_t sizes[1] = {static_cast(warpSize)}; return getInfoArray(1, param_value_size, param_value, param_value_size_ret, sizes); @@ -980,7 +976,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int clock_freq = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&clock_freq, hipDeviceAttributeClockRate, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(clock_freq >= 0); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint32(clock_freq) / 1000u); @@ -1024,40 +1020,36 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // but some searching found as of SM 2.x 128 are supported. 
return getInfo(param_value_size, param_value, param_value_size_ret, 128u); } - + case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. int tex_height = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_height, - hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_height >= 0); int surf_height = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture2DHeight, - device->get()) == hipSuccess); + hipDeviceAttributeMaxTexture2DHeight, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_height >= 0); int min = std::min(tex_height, surf_height); return getInfo(param_value_size, param_value, param_value_size_ret, min); - } case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. int tex_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, - hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_width >= 0); int surf_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, - hipDeviceAttributeMaxTexture2DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture2DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_width >= 0); int min = std::min(tex_width, surf_width); @@ -1068,15 +1060,14 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // Take the smaller of maximum surface and maximum texture height. 
int tex_height = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_height, - hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_height, hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_height >= 0); int surf_height = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&surf_height, - hipDeviceAttributeMaxTexture3DHeight, - device->get()) == hipSuccess); + hipDeviceAttributeMaxTexture3DHeight, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_height >= 0); int min = std::min(tex_height, surf_height); @@ -1087,15 +1078,13 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // Take the smaller of maximum surface and maximum texture width. int tex_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, - hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_width >= 0); int surf_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, - hipDeviceAttributeMaxTexture3DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture3DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_width >= 0); int min = std::min(tex_width, surf_width); @@ -1106,15 +1095,13 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // Take the smaller of maximum surface and maximum texture depth. 
int tex_depth = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_depth, - hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_depth, hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_depth >= 0); int surf_depth = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_depth, - hipDeviceAttributeMaxTexture3DDepth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&surf_depth, hipDeviceAttributeMaxTexture3DDepth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_depth >= 0); int min = std::min(tex_depth, surf_depth); @@ -1125,15 +1112,13 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // Take the smaller of maximum surface and maximum texture width. int tex_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&tex_width, - hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&tex_width, hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(tex_width >= 0); int surf_width = 0; cl::sycl::detail::pi::assertion( - hipDeviceGetAttribute(&surf_width, - hipDeviceAttributeMaxTexture1DWidth, - device->get()) == hipSuccess); + hipDeviceGetAttribute(&surf_width, hipDeviceAttributeMaxTexture1DWidth, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(surf_width >= 0); int min = std::min(tex_width, surf_width); @@ -1159,8 +1144,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int mem_base_addr_align = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&mem_base_addr_align, - hipDeviceAttributeTextureAlignment, - device->get()) == hipSuccess); + hipDeviceAttributeTextureAlignment, + device->get()) == hipSuccess); // Multiply by 8 as clGetDeviceInfo returns this value in bits mem_base_addr_align *= 8; return getInfo(param_value_size, 
param_value, param_value_size_ret, @@ -1193,7 +1178,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int cache_size = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&cache_size, hipDeviceAttributeL2CacheSize, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(cache_size >= 0); // The L2 cache is global to the GPU. return getInfo(param_value_size, param_value, param_value_size_ret, @@ -1211,8 +1196,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int constant_memory = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&constant_memory, - hipDeviceAttributeTotalConstantMemory, - device->get()) == hipSuccess); + hipDeviceAttributeTotalConstantMemory, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(constant_memory >= 0); return getInfo(param_value_size, param_value, param_value_size_ret, @@ -1235,8 +1220,8 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int local_mem_size = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&local_mem_size, - hipDeviceAttributeMaxSharedMemoryPerBlock, - device->get()) == hipSuccess); + hipDeviceAttributeMaxSharedMemoryPerBlock, + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion(local_mem_size >= 0); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(local_mem_size)); @@ -1245,7 +1230,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int ecc_enabled = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&ecc_enabled, hipDeviceAttributeEccEnabled, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); auto result = static_cast(ecc_enabled); @@ -1255,7 +1240,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, int is_integrated = 0; cl::sycl::detail::pi::assertion( 
hipDeviceGetAttribute(&is_integrated, hipDeviceAttributeIntegrated, - device->get()) == hipSuccess); + device->get()) == hipSuccess); cl::sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1)); @@ -1379,20 +1364,19 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host pi_bitfield value = {}; - //if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } + // if (getAttribute(device, HIP_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | PI_USM_CONCURRENT_ACCESS | + PI_USM_CONCURRENT_ATOMIC_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; + } //} return getInfo(param_value_size, param_value, param_value_size_ret, value); } @@ -1422,8 +1406,7 @@ pi_result 
rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // the device can coherently access managed memory concurrently with the // CPU value |= PI_USM_CONCURRENT_ACCESS; - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= - 6) { + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; @@ -1451,8 +1434,7 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // CPU value |= PI_USM_CONCURRENT_ACCESS; } - if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= - 6) { + if (getAttribute(device, hipDeviceAttributeComputeCapabilityMajor) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system if (value & PI_USM_ACCESS) @@ -1471,9 +1453,9 @@ pi_result rocm_piDeviceGetInfo(pi_device device, pi_device_info param_name, // system allocator pi_bitfield value = {}; if (getAttribute(device, hipDeviceAttributePageableMemoryAccess)) { - // the link between the device and the host does not support native - // atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; + // the link between the device and the host does not support native + // atomic operations + value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; } return getInfo(param_value_size, param_value, param_value_size_ret, value); } @@ -1605,7 +1587,8 @@ pi_result rocm_piContextCreate(const pi_context_properties *properties, } // Use default stream to record base event counter - PI_CHECK_ERROR(hipEventCreateWithFlags(&piContextPtr->evBase_, hipEventDefault)); + PI_CHECK_ERROR( + hipEventCreateWithFlags(&piContextPtr->evBase_, hipEventDefault)); PI_CHECK_ERROR(hipEventRecord(piContextPtr->evBase_, 0)); // For non-primary scoped contexts keep the last active on top of the stack @@ -1637,8 +1620,7 @@ 
pi_result rocm_piContextRelease(pi_context ctxt) { std::unique_ptr<_pi_context> context{ctxt}; PI_CHECK_ERROR(hipEventDestroy(context->evBase_)); - - + if (!ctxt->is_primary()) { hipCtx_t hipCtxt = ctxt->get(); hipCtx_t current = nullptr; @@ -1647,7 +1629,7 @@ pi_result rocm_piContextRelease(pi_context ctxt) { PI_CHECK_ERROR(hipCtxPushCurrent(hipCtxt)); } // hipErrorNotSupported this API - //PI_CHECK_ERROR(hipCtxSynchronize()); + // PI_CHECK_ERROR(hipCtxSynchronize()); PI_CHECK_ERROR(hipCtxGetCurrent(¤t)); if (hipCtxt == current) { PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); @@ -1660,7 +1642,7 @@ pi_result rocm_piContextRelease(pi_context ctxt) { PI_CHECK_ERROR(hipCtxPopCurrent(¤t)); return PI_CHECK_ERROR(hipDevicePrimaryCtxRelease(hipDev)); } - + hipCtx_t hipCtxt = ctxt->get(); return PI_CHECK_ERROR(hipCtxDestroy(hipCtxt)); } @@ -1809,16 +1791,14 @@ pi_result rocm_piMemRelease(pi_mem memObj) { ret = PI_CHECK_ERROR( hipFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); }; - } - + } + else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { - ret = PI_CHECK_ERROR( - hipDestroySurfaceObject(uniqueMemObj->mem_.surface_mem_.get_surface())); + ret = PI_CHECK_ERROR(hipDestroySurfaceObject( + uniqueMemObj->mem_.surface_mem_.get_surface())); auto array = uniqueMemObj->mem_.surface_mem_.get_array(); - ret = PI_CHECK_ERROR( - hipFreeArray(&array)); + ret = PI_CHECK_ERROR(hipFreeArray(&array)); } - } catch (pi_result err) { ret = err; @@ -1877,7 +1857,7 @@ pi_result rocm_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, assert(parent_buffer->mem_.buffer_mem_.ptr_ != _pi_mem::mem_::buffer_mem_::native_type{0}); _pi_mem::mem_::buffer_mem_::native_type ptr = - (uint8_t*)(parent_buffer->mem_.buffer_mem_.ptr_ )+ bufferRegion.origin; + (uint8_t *)(parent_buffer->mem_.buffer_mem_.ptr_) + bufferRegion.origin; void *hostPtr = nullptr; if (parent_buffer->mem_.buffer_mem_.hostPtr_) { @@ -1920,7 +1900,6 @@ pi_result rocm_piMemGetInfo(pi_mem memObj, cl_mem_info queriedInfo, /// 
/// \return PI_SUCCESS - /// Created a PI mem object from a HIP mem handle. /// TODO: Implement this. /// NOTE: The created PI object takes ownership of the native handle. @@ -2091,7 +2070,7 @@ pi_result rocm_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle, pi_result rocm_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, - size_t size, void *ptr, + size_t size, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { @@ -2115,8 +2094,8 @@ pi_result rocm_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, retImplEv->start(); } - retErr = - PI_CHECK_ERROR(hipMemcpyHtoDAsync((uint8_t*)devPtr + offset, ptr, size, hipStream)); + retErr = PI_CHECK_ERROR( + hipMemcpyHtoDAsync((uint8_t *)devPtr + offset, ptr, size, hipStream)); if (event) { retErr = retImplEv->record(); @@ -2161,8 +2140,8 @@ pi_result rocm_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, retImplEv->start(); } - retErr = - PI_CHECK_ERROR(hipMemcpyDtoHAsync(ptr, (uint8_t*)devPtr + offset, size, hipStream)); + retErr = PI_CHECK_ERROR( + hipMemcpyDtoHAsync(ptr, (uint8_t *)devPtr + offset, size, hipStream)); if (event) { retErr = retImplEv->record(); @@ -2283,7 +2262,7 @@ pi_result rocm_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, pi_result retErr = PI_SUCCESS; try { pi_mem arg_mem = *arg_value; - + if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { auto array = arg_mem->mem_.surface_mem_.get_array(); if (array.Format != HIP_AD_FORMAT_UNSIGNED_INT32 && @@ -2296,9 +2275,9 @@ pi_result rocm_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, } hipSurfaceObject_t hipSurf = arg_mem->mem_.surface_mem_.get_surface(); kernel->set_kernel_arg(arg_index, sizeof(hipSurf), (void *)&hipSurf); - } else - - { + } else + + { hipDevPtr hipPtr = arg_mem->mem_.buffer_mem_.get(); kernel->set_kernel_arg(arg_index, sizeof(hipDevPtr), (void *)&hipPtr); } @@ -2380,20 
+2359,18 @@ pi_result rocm_piEnqueueKernelLaunch( return err; } } else { - simpleGuessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, - kernel); + simpleGuessLocalWorkSize(threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel); } } - + int blocksPerGrid[3] = {1, 1, 1}; - for (size_t i = 0; i < work_dim; i++) { blocksPerGrid[i] = static_cast(global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; } - pi_result retError = PI_SUCCESS; std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -2429,12 +2406,11 @@ pi_result rocm_piEnqueueKernelLaunch( PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue)); retImplEv->start(); } - + retError = PI_CHECK_ERROR(hipModuleLaunchKernel( - hipFunc, blocksPerGrid[0], 1, 1, - threadsPerBlock[0], 1, 1, + hipFunc, blocksPerGrid[0], 1, 1, threadsPerBlock[0], 1, 1, kernel->get_local_size(), hipStream, argIndices.data(), nullptr)); - + kernel->clear_local_size(); if (event) { retError = retImplEv->record(); @@ -2461,12 +2437,11 @@ pi_result rocm_piEnqueueNativeKernel( /// \TODO Not implemented - pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, const pi_image_format *image_format, const pi_image_desc *image_desc, void *host_ptr, pi_mem *ret_mem) { - + // Need input memory object assert(ret_mem != nullptr); const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || @@ -2596,8 +2571,8 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, hipSurfaceObject_t surface; retErr = PI_CHECK_ERROR(hipCreateSurfaceObject(&surface, &image_res_desc)); - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{context, *image_array, surface, image_desc->image_type, host_ptr}); - + auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ + context, *image_array, surface, image_desc->image_type, host_ptr}); if (piMemObj == nullptr) { return PI_OUT_OF_HOST_MEMORY; @@ -2614,7 +2589,6 @@ pi_result rocm_piMemImageCreate(pi_context context, pi_mem_flags flags, return retErr; } - 
/// \TODO Not implemented pi_result rocm_piMemImageGetInfo(pi_mem image, pi_image_info param_name, size_t param_value_size, void *param_value, @@ -2922,8 +2896,8 @@ pi_result rocm_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int max_threads = 0; cl::sycl::detail::pi::assertion( hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); return getInfo(param_value_size, param_value, param_value_size_ret, size_t(max_threads)); } @@ -2943,7 +2917,7 @@ pi_result rocm_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; cl::sycl::detail::pi::assertion( hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == hipSuccess); + kernel->get()) == hipSuccess); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2952,7 +2926,7 @@ pi_result rocm_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int warpSize = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); + device->get()) == hipSuccess); return getInfo(param_value_size, param_value, param_value_size_ret, static_cast(warpSize)); } @@ -2961,7 +2935,7 @@ pi_result rocm_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, int bytes = 0; cl::sycl::detail::pi::assertion( hipFuncGetAttribute(&bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == hipSuccess); + kernel->get()) == hipSuccess); return getInfo(param_value_size, param_value, param_value_size_ret, pi_uint64(bytes)); } @@ -2984,7 +2958,7 @@ pi_result rocm_piKernelGetSubGroupInfo( int warpSize = 0; cl::sycl::detail::pi::assertion( hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, - device->get()) == hipSuccess); + device->get()) == hipSuccess); return getInfo(param_value_size, param_value, param_value_size_ret, static_cast(warpSize)); } @@ 
-2993,8 +2967,8 @@ pi_result rocm_piKernelGetSubGroupInfo( int max_threads = 0; cl::sycl::detail::pi::assertion( hipFuncGetAttribute(&max_threads, - HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == hipSuccess); + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + kernel->get()) == hipSuccess); int warpSize = 0; rocm_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, 0, nullptr, sizeof(uint32_t), &warpSize, @@ -3396,7 +3370,7 @@ static pi_result commonEnqueueMemBufferCopyRect( size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, const hipMemoryType dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch) { - + assert(region != nullptr); assert(src_offset != nullptr); assert(dst_offset != nullptr); @@ -3429,19 +3403,18 @@ static pi_result commonEnqueueMemBufferCopyRect( params.srcHeight = src_slice_pitch / src_row_pitch; params.dstMemoryType = dst_type; - params.dstDevice = dst_type == hipMemoryTypeDevice - ? *static_cast(dst_ptr) - : 0; + params.dstDevice = + dst_type == hipMemoryTypeDevice ? *static_cast(dst_ptr) : 0; params.dstHost = dst_type == hipMemoryTypeHost ? 
dst_ptr : nullptr; params.dstXInBytes = dst_offset->x_bytes; params.dstY = dst_offset->y_scalar; params.dstZ = dst_offset->z_scalar; params.dstPitch = dst_row_pitch; params.dstHeight = dst_slice_pitch / dst_row_pitch; - + return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(¶ms, hip_stream)); - - return PI_SUCCESS; + + return PI_SUCCESS; } pi_result rocm_piEnqueueMemBufferReadRect( @@ -3575,8 +3548,8 @@ pi_result rocm_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, } auto stream = command_queue->get(); - auto src = (uint8_t*)(src_buffer->mem_.buffer_mem_.get()) + src_offset; - auto dst = (uint8_t*)(dst_buffer->mem_.buffer_mem_.get()) + dst_offset; + auto src = (uint8_t *)(src_buffer->mem_.buffer_mem_.get()) + src_offset; + auto dst = (uint8_t *)(dst_buffer->mem_.buffer_mem_.get()) + dst_offset; result = PI_CHECK_ERROR(hipMemcpyDtoDAsync(dst, src, size, stream)); @@ -3680,7 +3653,7 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, result = retImplEv->start(); } - auto dstDevice = (uint8_t*)(buffer->mem_.buffer_mem_.get()) + offset; + auto dstDevice = (uint8_t *)(buffer->mem_.buffer_mem_.get()) + offset; auto stream = command_queue->get(); auto N = size / pattern_size; @@ -3701,12 +3674,11 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, result = PI_CHECK_ERROR(hipMemsetD32Async(dstDevice, value, N, stream)); break; } - + default: { result = PI_INVALID_VALUE; break; } - } if (event) { @@ -3722,7 +3694,6 @@ pi_result rocm_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } } - static size_t imageElementByteSize(enum hipArray_Format array_format) { switch (array_format) { case HIP_AD_FORMAT_UNSIGNED_INT8: @@ -3743,19 +3714,16 @@ static size_t imageElementByteSize(enum hipArray_Format array_format) { return 0; } - /// General ND memory copy operation for images (where N > 1). 
/// This function requires the corresponding HIP context to be at the top of /// the context stack /// If the source and/or destination is an array, src_ptr and/or dst_ptr /// must be a pointer to a hipArray - static pi_result commonEnqueueMemImageNDCopy( hipStream_t hip_stream, pi_mem_type img_type, const size_t *region, - const void *src_ptr, const hipMemoryType src_type, - const size_t *src_offset, void *dst_ptr, const hipMemoryType dst_type, - const size_t *dst_offset) { + const void *src_ptr, const hipMemoryType src_type, const size_t *src_offset, + void *dst_ptr, const hipMemoryType dst_type, const size_t *dst_offset) { assert(region != nullptr); assert(src_type == hipMemoryTypeArray || src_type == hipMemoryTypeHost); @@ -3766,7 +3734,8 @@ static pi_result commonEnqueueMemImageNDCopy( memset(&cpyDesc, 0, sizeof(cpyDesc)); cpyDesc.srcMemoryType = src_type; if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = const_cast(static_cast(src_ptr)); + cpyDesc.srcArray = + const_cast(static_cast(src_ptr)); cpyDesc.srcXInBytes = src_offset[0]; cpyDesc.srcY = src_offset[1]; } else { @@ -3774,7 +3743,8 @@ static pi_result commonEnqueueMemImageNDCopy( } cpyDesc.dstMemoryType = dst_type; if (dst_type == hipMemoryTypeArray) { - cpyDesc.dstArray = const_cast(static_cast(dst_ptr)); + cpyDesc.dstArray = + const_cast(static_cast(dst_ptr)); cpyDesc.dstXInBytes = dst_offset[0]; cpyDesc.dstY = dst_offset[1]; } else { @@ -3784,14 +3754,15 @@ static pi_result commonEnqueueMemImageNDCopy( cpyDesc.Height = region[1]; return PI_CHECK_ERROR(hipMemcpyParam2DAsync(&cpyDesc, hip_stream)); } - + if (img_type == PI_MEM_TYPE_IMAGE3D) { - + HIP_MEMCPY3D cpyDesc; memset(&cpyDesc, 0, sizeof(cpyDesc)); cpyDesc.srcMemoryType = src_type; if (src_type == hipMemoryTypeArray) { - cpyDesc.srcArray = const_cast(static_cast(src_ptr)); + cpyDesc.srcArray = + const_cast(static_cast(src_ptr)); cpyDesc.srcXInBytes = src_offset[0]; cpyDesc.srcY = src_offset[1]; cpyDesc.srcZ = src_offset[2]; @@ -3811,9 
+3782,9 @@ static pi_result commonEnqueueMemImageNDCopy( cpyDesc.Height = region[1]; cpyDesc.Depth = region[2]; return PI_CHECK_ERROR(hipDrvMemcpy3DAsync(&cpyDesc, hip_stream)); - return PI_ERROR_UNKNOWN; + return PI_ERROR_UNKNOWN; } - + return PI_INVALID_VALUE; } @@ -3822,7 +3793,7 @@ pi_result rocm_piEnqueueMemImageRead( const size_t *origin, const size_t *region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - + assert(command_queue != nullptr); assert(image != nullptr); assert(image->mem_type_ == _pi_mem::mem_type::surface); @@ -3846,18 +3817,17 @@ pi_result rocm_piEnqueueMemImageRead( size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); - + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, &array, hipMemoryTypeArray, - srcOffset, ptr, hipMemoryTypeHost, nullptr); + retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, + &array, hipMemoryTypeArray, srcOffset, + ptr, hipMemoryTypeHost, nullptr); if (retErr != PI_SUCCESS) { return retErr; } - if (event) { auto new_event = @@ -3875,7 +3845,7 @@ pi_result rocm_piEnqueueMemImageRead( return PI_ERROR_UNKNOWN; } return PI_SUCCESS; - return retErr; + return retErr; } pi_result @@ -3885,8 +3855,7 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - - + assert(command_queue != nullptr); assert(image != nullptr); assert(image->mem_type_ == _pi_mem::mem_type::surface); @@ -3910,18 +3879,17 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t bytesToCopy = elementByteSize * array.NumChannels * region[0]; pi_mem_type 
imgType = image->mem_.surface_mem_.get_image_type(); - + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - retErr = commonEnqueueMemImageNDCopy( - hipStream, imgType, adjustedRegion, ptr, hipMemoryTypeHost, nullptr, - &array, hipMemoryTypeArray, dstOffset); + retErr = commonEnqueueMemImageNDCopy(hipStream, imgType, adjustedRegion, + ptr, hipMemoryTypeHost, nullptr, + &array, hipMemoryTypeArray, dstOffset); if (retErr != PI_SUCCESS) { return retErr; } - if (event) { auto new_event = @@ -3936,8 +3904,8 @@ rocm_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, } return PI_SUCCESS; - - return retErr; + + return retErr; } pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, @@ -3947,8 +3915,7 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { - - + assert(src_image->mem_type_ == _pi_mem::mem_type::surface); assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); assert(src_image->mem_.surface_mem_.get_image_type() == @@ -3973,7 +3940,6 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, int elementByteSize = imageElementByteSize(srcArray.Format); - size_t dstByteOffsetX = dst_origin[0] * elementByteSize * srcArray.NumChannels; size_t srcByteOffsetX = @@ -3981,7 +3947,7 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, size_t bytesToCopy = elementByteSize * srcArray.NumChannels * region[0]; pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); - + size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; @@ -3993,7 +3959,6 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, if (retErr != 
PI_SUCCESS) { return retErr; } - if (event) { auto new_event = @@ -4006,9 +3971,9 @@ pi_result rocm_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, } catch (...) { return PI_ERROR_UNKNOWN; } - + return PI_SUCCESS; - return retErr; + return retErr; } /// \TODO Not implemented in HIP, requires untie from OpenCL @@ -4198,8 +4163,8 @@ pi_result rocm_piextUSMSharedAlloc(void **result_ptr, pi_context context, pi_result result = PI_SUCCESS; try { ScopedContext active(context); - result = PI_CHECK_ERROR(hipMallocManaged((hipDevPtr *)result_ptr, size, - hipMemAttachGlobal)); + result = PI_CHECK_ERROR( + hipMallocManaged((hipDevPtr *)result_ptr, size, hipMemAttachGlobal)); } catch (pi_result error) { result = error; } @@ -4213,15 +4178,15 @@ pi_result rocm_piextUSMSharedAlloc(void **result_ptr, pi_context context, /// USM: Frees the given USM pointer associated with the context. /// pi_result rocm_piextUSMFree(pi_context context, void *ptr) { - + assert(context != nullptr); pi_result result = PI_SUCCESS; try { ScopedContext active(context); unsigned int type; hipPointerAttribute_t hipPointerAttributeType; - result = PI_CHECK_ERROR(hipPointerGetAttributes( - &hipPointerAttributeType, (hipDevPtr)ptr)); + result = PI_CHECK_ERROR( + hipPointerGetAttributes(&hipPointerAttributeType, (hipDevPtr)ptr)); type = hipPointerAttributeType.memoryType; assert(type == hipMemoryTypeDevice or type == hipMemoryTypeHost); if (type == hipMemoryTypeDevice) { @@ -4241,7 +4206,7 @@ pi_result rocm_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - + assert(queue != nullptr); assert(ptr != nullptr); hipStream_t hipStream = queue->get(); @@ -4266,7 +4231,7 @@ pi_result rocm_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, } catch (pi_result err) { result = err; } - + return result; } @@ -4276,7 +4241,7 @@ pi_result rocm_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool 
blocking, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - + assert(queue != nullptr); assert(dst_ptr != nullptr); assert(src_ptr != nullptr); @@ -4293,8 +4258,8 @@ pi_result rocm_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue)); event_ptr->start(); } - result = PI_CHECK_ERROR(hipMemcpyAsync( - dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream)); + result = PI_CHECK_ERROR( + hipMemcpyAsync(dst_ptr, src_ptr, size, hipMemcpyDefault, hipStream)); if (event) { result = event_ptr->record(); } @@ -4307,7 +4272,7 @@ pi_result rocm_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, } catch (pi_result err) { result = err; } - + return result; } @@ -4317,7 +4282,7 @@ pi_result rocm_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - + assert(queue != nullptr); assert(ptr != nullptr); hipStream_t hipStream = queue->get(); @@ -4338,8 +4303,7 @@ pi_result rocm_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, event_ptr->start(); } result = PI_CHECK_ERROR(hipMemPrefetchAsync( - ptr, size, queue->get_context()->get_device()->get(), - hipStream)); + ptr, size, queue->get_context()->get_device()->get(), hipStream)); if (event) { result = event_ptr->record(); *event = event_ptr.release(); @@ -4347,7 +4311,7 @@ pi_result rocm_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, } catch (pi_result err) { result = err; } - + return result; } @@ -4355,14 +4319,14 @@ pi_result rocm_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_result rocm_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, size_t length, pi_mem_advice advice, pi_event *event) { - + assert(queue != nullptr); assert(ptr != nullptr); // TODO implement a mapping to hipMemAdvise once the expected behaviour // of piextUSMEnqueueMemAdvise is detailed in the USM extension return 
rocm_piEnqueueEventsWait(queue, 0, nullptr, event); - return PI_SUCCESS; + return PI_SUCCESS; } /// API to query information about USM allocated pointers @@ -4386,7 +4350,7 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { - + assert(context != nullptr); assert(ptr != nullptr); pi_result result = PI_SUCCESS; @@ -4398,8 +4362,7 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, case PI_MEM_ALLOC_TYPE: { unsigned int value; // do not throw if hipPointerGetAttribute returns hipErrorInvalidValue - hipError_t ret = hipPointerGetAttributes( - &hipPointerAttributeType, ptr); + hipError_t ret = hipPointerGetAttributes(&hipPointerAttributeType, ptr); if (ret == hipErrorInvalidValue) { // pointer not known to the HIP subsystem return getInfo(param_value_size, param_value, param_value_size_ret, @@ -4412,8 +4375,8 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, return getInfo(param_value_size, param_value, param_value_size_ret, PI_MEM_TYPE_SHARED); } - result = PI_CHECK_ERROR(hipPointerGetAttributes( - &hipPointerAttributeType, ptr)); + result = PI_CHECK_ERROR( + hipPointerGetAttributes(&hipPointerAttributeType, ptr)); value = hipPointerAttributeType.memoryType; assert(value == hipMemoryTypeDevice or value == hipMemoryTypeHost); if (value == hipMemoryTypeDevice) { @@ -4437,24 +4400,25 @@ pi_result rocm_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, case PI_MEM_ALLOC_SIZE: { return PI_INVALID_VALUE; } - + case PI_MEM_ALLOC_DEVICE: { unsigned int value; - result = PI_CHECK_ERROR(hipPointerGetAttributes( - &hipPointerAttributeType, ptr)); - auto devicePointer = static_cast(hipPointerAttributeType.devicePointer); + result = PI_CHECK_ERROR( + hipPointerGetAttributes(&hipPointerAttributeType, ptr)); + auto devicePointer = + static_cast(hipPointerAttributeType.devicePointer); value = *devicePointer; 
pi_platform platform; result = rocm_piPlatformsGet(0, &platform, nullptr); pi_device device = platform->devices_[value].get(); return getInfo(param_value_size, param_value, param_value_size_ret, device); - } - } + } + } } catch (pi_result error) { result = error; } - + return result; } diff --git a/sycl/plugins/rocm/pi_rocm.hpp b/sycl/plugins/rocm/pi_rocm.hpp index e748ad12f6b6f..931b54c90bbc2 100644 --- a/sycl/plugins/rocm/pi_rocm.hpp +++ b/sycl/plugins/rocm/pi_rocm.hpp @@ -23,26 +23,26 @@ #include #include #include +#include #include #include +#include #include #include #include #include -#include -#include -typedef void* hipDevPtr; +typedef void *hipDevPtr; extern "C" { /// \cond INGORE_BLOCK_IN_DOXYGEN -pi_result rocm_piContextRetain(pi_context ); -pi_result rocm_piContextRelease(pi_context ); -pi_result rocm_piDeviceRelease(pi_device ); -pi_result rocm_piDeviceRetain(pi_device ); -pi_result rocm_piProgramRetain(pi_program ); -pi_result rocm_piProgramRelease(pi_program ); +pi_result rocm_piContextRetain(pi_context); +pi_result rocm_piContextRelease(pi_context); +pi_result rocm_piDeviceRelease(pi_device); +pi_result rocm_piDeviceRetain(pi_device); +pi_result rocm_piProgramRetain(pi_program); +pi_result rocm_piProgramRelease(pi_program); pi_result rocm_piQueueRelease(pi_queue); pi_result rocm_piQueueRetain(pi_queue); pi_result rocm_piMemRetain(pi_mem); @@ -310,7 +310,6 @@ struct _pi_mem { } }; - /// Constructs the PI allocation for an Image object _pi_mem(pi_context ctxt, hipArray array, hipSurfaceObject_t surf, pi_mem_type image_type, void *host_ptr) @@ -320,7 +319,6 @@ struct _pi_mem { mem_.surface_mem_.surfObj_ = surf; rocm_piContextRetain(context_); } - ~_pi_mem() { if (mem_type_ == mem_type::buffer) { @@ -514,7 +512,7 @@ struct _pi_program { pi_result set_binary(const char *binary, size_t binarySizeInBytes); - pi_result build_program(const char* build_options); + pi_result build_program(const char *build_options); pi_context get_context() const { return 
context_; }; @@ -623,8 +621,8 @@ struct _pi_kernel { } } args_; - _pi_kernel(hipFunction_t func, hipFunction_t funcWithOffsetParam, const char *name, - pi_program program, pi_context ctxt) + _pi_kernel(hipFunction_t func, hipFunction_t funcWithOffsetParam, + const char *name, pi_program program, pi_context ctxt) : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, name_{name}, context_{ctxt}, program_{program}, refCount_{1} { rocm_piProgramRetain(program_); @@ -635,8 +633,7 @@ struct _pi_kernel { pi_context ctxt) : _pi_kernel{func, nullptr, name, program, ctxt} {} - ~_pi_kernel() - { + ~_pi_kernel() { rocm_piProgramRelease(program_); rocm_piContextRelease(context_); } From 2fdf931b30405d637665f412fa63c1cbce56b3af Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Mon, 31 May 2021 06:14:21 +0000 Subject: [PATCH 08/18] modify code and comment --- clang/lib/Driver/Driver.cpp | 4 ++-- clang/lib/Driver/ToolChains/HIP.cpp | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 41a635be3a80d..6f2c9928b5b16 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3844,8 +3844,8 @@ class OffloadingActionBuilder final { BA, AssociatedOffloadKind); ActionList AL = {AA}; - Action *action = C.MakeAction(AL, types::TY_Image); - ActionList HIPActions = {action}; + Action *LinkAction = C.MakeAction(AL, types::TY_Image); + ActionList HIPActions = {LinkAction}; Action *HIPFatBinary = C.MakeAction(HIPActions, types::TY_HIP_FATBIN); return HIPFatBinary; diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index f509e42f05ee5..1761c0e067e25 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -112,10 +112,8 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, // for backward compatibility. 
For code object version 4 and greater, the // offload kind in bundle ID is 'hipv4'. std::string OffloadKind = "hip"; - // bundle ID equals 'hip' is always right. - if (getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) - // OffloadKind = OffloadKind + "v4"; - OffloadKind = OffloadKind; + if (haveAMDGPUCodeObjectVersionArgument(C.getDriver(), Args) && getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) + OffloadKind = OffloadKind + "v4"; for (const auto &II : Inputs) { const auto* A = II.getAction(); BundlerTargetArg = BundlerTargetArg + "," + OffloadKind + @@ -243,7 +241,7 @@ void HIPToolChain::addClangTargetOptions( assert((DeviceOffloadingKind == Action::OFK_HIP || DeviceOffloadingKind == Action::OFK_SYCL) && - "Only HIP offloading kinds are supported for GPUs."); + "Only HIP and SYCL offloading kinds are supported for GPUs."); StringRef GpuArch = getGPUArch(DriverArgs); From 61d3f98882efb82a8d64eabcd005b89972e5a8bd Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Tue, 1 Jun 2021 01:48:21 +0000 Subject: [PATCH 09/18] add necessary comment and format code --- clang/lib/Driver/ToolChain.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 062d46e822036..9f7753c2bffbc 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1169,6 +1169,8 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs( // matches the current toolchain triple. If it is not present // at all, target and host share a toolchain. if (A->getOption().matches(options::OPT_m_Group)) { + // AMD GPU is a special case, as -mcpu is required for the device + // compilation. 
if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn) DAL->append(A); else From 69f9d98e950c9cc658274e56af8b8a55f05f5b67 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Wed, 2 Jun 2021 01:34:50 +0000 Subject: [PATCH 10/18] clang format code --- clang/lib/Driver/ToolChains/HIP.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 1761c0e067e25..349a988f9c63f 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -112,7 +112,8 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, // for backward compatibility. For code object version 4 and greater, the // offload kind in bundle ID is 'hipv4'. std::string OffloadKind = "hip"; - if (haveAMDGPUCodeObjectVersionArgument(C.getDriver(), Args) && getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) + if (haveAMDGPUCodeObjectVersionArgument(C.getDriver(), Args) && + getAMDGPUCodeObjectVersion(C.getDriver(), Args) >= 4) OffloadKind = OffloadKind + "v4"; for (const auto &II : Inputs) { const auto* A = II.getAction(); From 10ee765d868451a2f6a1e6b62ea459809e921412 Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Mon, 21 Jun 2021 01:32:34 +0000 Subject: [PATCH 11/18] add amdgcn LIT --- clang/test/Driver/sycl-offload-amdgcn.cpp | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 clang/test/Driver/sycl-offload-amdgcn.cpp diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp new file mode 100644 index 0000000000000..fe3fa0acc1a8a --- /dev/null +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -0,0 +1,36 @@ +/// Tests specific to `-fsycl-targets=amdgcn-amd-amdhsa-sycldevice` +// REQUIRES: clang-driver + +// UNSUPPORTED: system-windows + +/// Check action graph. 
+// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \ +// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-ACTIONS %s +// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-feature" "+ptx42"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_50"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS-NOT: "-mllvm -sycl-opt" +// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}} + +/// Check phases w/out specifying a compute capability. +// RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl -fsycl-use-footer \ +// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s +// CHK-PHASES-NO-CC: 0: input, "vector_add.cpp", c++, (host-sycl) +// CHK-PHASES-NO-CC: preprocessor, {0}, c++-cpp-output, (host-sycl) +// CHK-PHASES-NO-CC:- 2: input, "vector_add.cpp", c++, (device-sycl) +// CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (device-sycl) +// CHK-PHASES-NO-CC: compiler, {3}, ir, (device-sycl) +// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {4}, c++-cpp-output +// CHK-PHASES-NO-CC: compiler, {5}, ir, (host-sycl) +// CHK-PHASES-NO-CC: backend, {6}, assembler, (host-sycl) +// CHK-PHASES-NO-CC: assembler, {7}, object, (host-sycl) +// CHK-PHASES-NO-CC: linker, {8}, image, (host-sycl) +// CHK-PHASES-NO-CC: linker, {4}, ir, (device-sycl) +// CHK-PHASES-NO-CC: 
sycl-post-link, {10}, ir, (device-sycl) +// CHK-PHASES-NO-CC: backend, {11}, assembler, (device-sycl) +// CHK-PHASES-NO-CC: assembler, {12}, object, (device-sycl) +// CHK-PHASES-NO-CC: linker, {13}, image, (device-sycl) +// CHK-PHASES-NO-CC:linker, {14}, hip-fatbin, (device-sycl) +// CHK-PHASES-NO-CC: clang-offload-wrapper, {15}, object, (device-sycl) +// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {16}, image From 50d6fe107b8868050b711e7da3f73050527d2d3b Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Tue, 22 Jun 2021 02:22:55 +0000 Subject: [PATCH 12/18] fix amdgcn LIT --- clang/test/Driver/sycl-offload-amdgcn.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp index fe3fa0acc1a8a..c2ca0f86b9800 100644 --- a/clang/test/Driver/sycl-offload-amdgcn.cpp +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -8,7 +8,7 @@ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS %s -// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libdevice{{.*}}.10.bc"{{.*}} "-target-feature" "+ptx42"{{.*}} "-target-sdk-version=[[CUDA_VERSION:[0-9.]+]]"{{.*}} "-target-cpu" "sm_50"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} 
"-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}oclc_isa_version{{.*}}.906.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} // CHK-ACTIONS-NOT: "-mllvm -sycl-opt" // CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}} @@ -16,9 +16,9 @@ // RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl -fsycl-use-footer \ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s -// CHK-PHASES-NO-CC: 0: input, "vector_add.cpp", c++, (host-sycl) +// CHK-PHASES-NO-CC: 0: input, "{{.*}}", c++, (host-sycl) // CHK-PHASES-NO-CC: preprocessor, {0}, c++-cpp-output, (host-sycl) -// CHK-PHASES-NO-CC:- 2: input, "vector_add.cpp", c++, (device-sycl) +// CHK-PHASES-NO-CC:- 2: input, "{{.*}}", c++, (device-sycl) // CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (device-sycl) // CHK-PHASES-NO-CC: compiler, {3}, ir, (device-sycl) // CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {4}, c++-cpp-output From d08946448d90ebf2787a1d2402fcf0ea80b2440a Mon Sep 17 00:00:00 2001 From: malixian <1240609881@qq.com> Date: Thu, 24 Jun 2021 02:53:12 +0000 Subject: [PATCH 13/18] fix the driver LIT --- clang/test/Driver/sycl-offload-amdgcn.cpp | 39 ++++++++++++----------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp index c2ca0f86b9800..3f009bfcf07fa 100644 --- a/clang/test/Driver/sycl-offload-amdgcn.cpp +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -8,7 +8,8 @@ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS %s -// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" 
"x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-mlink-builtin-bitcode" "{{.*}}oclc_isa_version{{.*}}.906.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+\ +}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} // CHK-ACTIONS-NOT: "-mllvm -sycl-opt" // CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}} @@ -17,20 +18,22 @@ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s // CHK-PHASES-NO-CC: 0: input, "{{.*}}", c++, (host-sycl) -// CHK-PHASES-NO-CC: preprocessor, {0}, c++-cpp-output, (host-sycl) -// CHK-PHASES-NO-CC:- 2: input, "{{.*}}", c++, (device-sycl) -// CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (device-sycl) -// CHK-PHASES-NO-CC: compiler, {3}, ir, (device-sycl) -// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {1}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {4}, c++-cpp-output -// CHK-PHASES-NO-CC: compiler, {5}, ir, (host-sycl) -// CHK-PHASES-NO-CC: backend, {6}, assembler, (host-sycl) -// CHK-PHASES-NO-CC: assembler, {7}, object, (host-sycl) -// CHK-PHASES-NO-CC: linker, {8}, image, (host-sycl) -// CHK-PHASES-NO-CC: linker, {4}, ir, (device-sycl) -// CHK-PHASES-NO-CC: sycl-post-link, {10}, ir, (device-sycl) -// CHK-PHASES-NO-CC: backend, {11}, assembler, (device-sycl) -// CHK-PHASES-NO-CC: assembler, {12}, object, (device-sycl) -// CHK-PHASES-NO-CC: linker, {13}, image, 
(device-sycl) -// CHK-PHASES-NO-CC:linker, {14}, hip-fatbin, (device-sycl) -// CHK-PHASES-NO-CC: clang-offload-wrapper, {15}, object, (device-sycl) -// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {9}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {16}, image +// CHK-PHASES-NO-CC: 1: preprocessor, {0}, c++-cpp-output, (host-sycl) +// CHK-PHASES-NO-CC: 2: append-footer, {1}, c++, (host-sycl) +// CHK-PHASES-NO-CC: 3: preprocessor, {2}, c++-cpp-output, (host-sycl) +// CHK-PHASES-NO-CC: 4: input, "{{.*}}", c++, (device-sycl) +// CHK-PHASES-NO-CC: 5: preprocessor, {4}, c++-cpp-output, (device-sycl) +// CHK-PHASES-NO-CC: compiler, {5}, ir, (device-sycl) +// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {3}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {6}, c++-cpp-output +// CHK-PHASES-NO-CC: compiler, {7}, ir, (host-sycl) +// CHK-PHASES-NO-CC: backend, {8}, assembler, (host-sycl) +// CHK-PHASES-NO-CC: assembler, {9}, object, (host-sycl) +// CHK-PHASES-NO-CC: linker, {10}, image, (host-sycl) +// CHK-PHASES-NO-CC: linker, {6}, ir, (device-sycl) +// CHK-PHASES-NO-CC: sycl-post-link, {12}, ir, (device-sycl) +// CHK-PHASES-NO-CC: backend, {13}, assembler, (device-sycl) +// CHK-PHASES-NO-CC: assembler, {14}, object, (device-sycl) +// CHK-PHASES-NO-CC: linker, {15}, image, (device-sycl) +// CHK-PHASES-NO-CC: linker, {16}, hip-fatbin, (device-sycl) +// CHK-PHASES-NO-CC: clang-offload-wrapper, {17}, object, (device-sycl) +// CHK-PHASES-NO-CC: offload, "host-sycl (x86_64-unknown-linux-gnu)" {11}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {18}, image From 7e38d8879bdc89d710f2a5a91a0f761db07f5a0e Mon Sep 17 00:00:00 2001 From: Artem Gindinson Date: Thu, 24 Jun 2021 08:43:58 +0300 Subject: [PATCH 14/18] Fix conflict resolution for clang/lib/Frontend/InitPreprocessor.cpp --- clang/lib/Frontend/InitPreprocessor.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp 
b/clang/lib/Frontend/InitPreprocessor.cpp index 61543e5520257..a760ad6ad08e5 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1179,13 +1179,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__SYCL_DEVICE_ONLY__", "1"); Builder.defineMacro("SYCL_EXTERNAL", "__attribute__((sycl_device))"); - if (TI.getTriple().isNVPTX()) { - Builder.defineMacro("__SYCL_NVPTX__", "1"); - } + const llvm::Triple &DeviceTriple = TI.getTriple(); + if (DeviceTriple.isNVPTX()) + Builder.defineMacro("__SYCL_NVPTX__", "1"); - if (TI.getTriple().isAMDGCN()) { + if (DeviceTriple.isAMDGCN()) Builder.defineMacro("__SYCL_AMDGCN__", "1"); - } const llvm::Triple::SubArchType DeviceSubArch = DeviceTriple.getSubArch(); if (DeviceTriple.isSPIR() && DeviceSubArch != llvm::Triple::SPIRSubArch_fpga) From cdf4d73e3d5c999e3fee83d6695cf32db5a830b1 Mon Sep 17 00:00:00 2001 From: Artem Gindinson Date: Thu, 24 Jun 2021 09:21:33 +0300 Subject: [PATCH 15/18] Update clang/test/Driver/sycl-offload-amdgcn.cpp --- clang/test/Driver/sycl-offload-amdgcn.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp index 3f009bfcf07fa..ca99da58a6304 100644 --- a/clang/test/Driver/sycl-offload-amdgcn.cpp +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -8,8 +8,8 @@ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS %s -// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+\ -}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS: "-cc1" 
"-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}\ +include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} // CHK-ACTIONS-NOT: "-mllvm -sycl-opt" // CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}} From d3e27756d7142a4ff01c5617a3e30e6cc87989f7 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 24 Jun 2021 09:31:06 +0300 Subject: [PATCH 16/18] Update clang/lib/Driver/ToolChains/HIP.h --- clang/lib/Driver/ToolChains/HIP.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/lib/Driver/ToolChains/HIP.h b/clang/lib/Driver/ToolChains/HIP.h index 6cc83e27fccd0..9057a4bc5802c 100644 --- a/clang/lib/Driver/ToolChains/HIP.h +++ b/clang/lib/Driver/ToolChains/HIP.h @@ -110,8 +110,6 @@ class LLVM_LIBRARY_VISIBILITY HIPToolChain final : public ROCMToolChain { unsigned GetDefaultDwarfVersion() const override { return 4; } const ToolChain &HostTC; - - void checkTargetID(const llvm::opt::ArgList &DriverArgs) const override; Tool *SelectTool(const JobAction &JA) const override; From e3ee3c914291c09c3f06d2b0524bdc92beff5f7f Mon Sep 17 00:00:00 2001 From: Artem Gindinson Date: Thu, 24 Jun 2021 10:57:59 +0300 Subject: [PATCH 17/18] Apply suggestions: fix LIT, remove __SYCL_AMDGCN__ --- clang/lib/Frontend/InitPreprocessor.cpp | 3 --- clang/test/Driver/sycl-offload-amdgcn.cpp | 3 +-- sycl/include/CL/__spirv/spirv_vars.hpp | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index a760ad6ad08e5..662e71d734d45 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1182,9 +1182,6 @@ static void InitializePredefinedMacros(const TargetInfo &TI, 
const llvm::Triple &DeviceTriple = TI.getTriple(); if (DeviceTriple.isNVPTX()) Builder.defineMacro("__SYCL_NVPTX__", "1"); - - if (DeviceTriple.isAMDGCN()) - Builder.defineMacro("__SYCL_AMDGCN__", "1"); const llvm::Triple::SubArchType DeviceSubArch = DeviceTriple.getSubArch(); if (DeviceTriple.isSPIR() && DeviceSubArch != llvm::Triple::SPIRSubArch_fpga) diff --git a/clang/test/Driver/sycl-offload-amdgcn.cpp b/clang/test/Driver/sycl-offload-amdgcn.cpp index ca99da58a6304..627932fc6f026 100644 --- a/clang/test/Driver/sycl-offload-amdgcn.cpp +++ b/clang/test/Driver/sycl-offload-amdgcn.cpp @@ -8,8 +8,7 @@ // RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-ACTIONS %s -// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}\ -include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} +// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}} // CHK-ACTIONS-NOT: "-mllvm -sycl-opt" // CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}} diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp index 1abf991e393d3..2983b452630c6 100644 --- a/sycl/include/CL/__spirv/spirv_vars.hpp +++ b/sycl/include/CL/__spirv/spirv_vars.hpp @@ -15,7 +15,7 @@ #define __SPIRV_VAR_QUALIFIERS extern "C" const -#if defined(__SYCL_NVPTX__) 
|| defined(__SYCL_AMDGCN__) +#if defined(__SYCL_NVPTX__) || defined(__AMDGCN__) SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x(); SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y(); From d831065256c2ff9e6453e22c0d87b57a364d4b49 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Thu, 24 Jun 2021 11:06:08 +0300 Subject: [PATCH 18/18] Update spirv_vars.hpp Fixed comments. --- sycl/include/CL/__spirv/spirv_vars.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp index 2983b452630c6..0e793c78863c9 100644 --- a/sycl/include/CL/__spirv/spirv_vars.hpp +++ b/sycl/include/CL/__spirv/spirv_vars.hpp @@ -51,7 +51,7 @@ SYCL_EXTERNAL uint32_t __spirv_NumSubgroups(); SYCL_EXTERNAL uint32_t __spirv_SubgroupId(); SYCL_EXTERNAL uint32_t __spirv_SubgroupLocalInvocationId(); -#else // __SYCL_NVPTX__ +#else // __SYCL_NVPTX__ || defined(__AMDGCN__) typedef size_t size_t_vec __attribute__((ext_vector_type(3))); __SPIRV_VAR_QUALIFIERS size_t_vec __spirv_BuiltInGlobalSize; @@ -154,7 +154,7 @@ SYCL_EXTERNAL inline uint32_t __spirv_SubgroupLocalInvocationId() { return __spirv_BuiltInSubgroupLocalInvocationId; } -#endif // __SYCL_NVPTX__ +#endif // __SYCL_NVPTX__ || defined(__AMDGCN__) #undef __SPIRV_VAR_QUALIFIERS