intel · steffenlarsen · Sep 2, 2022 · Jul 1, 2022 · Jul 4, 2022 · Jul 5, 2022
@@ -956,6 +956,12 @@ def cuda_include_ptx_EQ : Joined<["--"], "cuda-include-ptx=">, Flags<[NoXarchOpt
   HelpText<"Include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
 def no_cuda_include_ptx_EQ : Joined<["--"], "no-cuda-include-ptx=">, Flags<[NoXarchOption]>,
   HelpText<"Do not include PTX for the following GPU architecture (e.g. sm_35) or 'all'. May be specified more than once.">;
+def fno_bundle_offload_arch : Flag<["-"], "fno-bundle-offload-arch">,
+  HelpText<"Specify that the offload bundler should not identify a bundle with "
+            "a specific arch. For example, the bundle for `nvptx64-nvidia-cuda-sm_80` "
+            "uses the bundle tag `nvptx64-nvidia-cuda` when this flag is used. "
+            "This allows .o files to contain .bc bundles that are not specific "
+            "to a particular arch version.">;
 def offload_arch_EQ : Joined<["--"], "offload-arch=">, Flags<[NoXarchOption]>,
   HelpText<"CUDA offloading device architecture (e.g. sm_35), or HIP offloading target ID in the form of a "
            "device architecture followed by target ID features delimited by a colon. Each target ID feature "

@@ -4473,6 +4473,9 @@ class OffloadingActionBuilder final {
     /// List of static archives to extract FPGA dependency info from
     ActionList FPGAArchiveInputs;
 
+    /// SYCL installation detector, used to find the SYCL device library paths.
+    SYCLInstallationDetector SYCLInstallation;
+
     /// List of GPU architectures to use in this compilation with NVPTX/AMDGCN
     /// targets.
     SmallVector<std::pair<llvm::Triple, const char *>, 8> GpuArchList;
@@ -4513,7 +4516,8 @@ class OffloadingActionBuilder final {
     SYCLActionBuilder(Compilation &C, DerivedArgList &Args,
                       const Driver::InputList &Inputs,
                       OffloadingActionBuilder &OAB)
-        : DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL, OAB) {}
+        : DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL, OAB),
+          SYCLInstallation(C.getDriver()) {}
 
     void withBoundArchForToolChain(const ToolChain *TC,
                                    llvm::function_ref<void(const char *)> Op) {
@@ -4892,10 +4896,8 @@ class OffloadingActionBuilder final {
         }
       }
 
-      const toolchains::SYCLToolChain *SYCLTC =
-          static_cast<const toolchains::SYCLToolChain *>(TC);
       SmallVector<SmallString<128>, 4> LibLocCandidates;
-      SYCLTC->SYCLInstallation.getSYCLDeviceLibPath(LibLocCandidates);
+      SYCLInstallation.getSYCLDeviceLibPath(LibLocCandidates);
       StringRef LibSuffix = isMSVCEnv ? ".obj" : ".o";
       using SYCLDeviceLibsList = SmallVector<DeviceLibOptInfo, 5>;
 
@@ -4948,20 +4950,100 @@ class OffloadingActionBuilder final {
               auto *SYCLDeviceLibsUnbundleAction =
                   C.MakeAction<OffloadUnbundlingJobAction>(
                       SYCLDeviceLibsInputAction);
-              addDeviceDepences(SYCLDeviceLibsUnbundleAction);
-              DeviceLinkObjects.push_back(SYCLDeviceLibsUnbundleAction);
+
+              // BoundArch is left empty ("") here because the NVPTX bundles in
+              // the devicelib .o files do not contain any arch information.
+              SYCLDeviceLibsUnbundleAction->registerDependentActionInfo(
+                  TC, /*BoundArch=*/"", Action::OFK_SYCL);
+              OffloadAction::DeviceDependences Dep;
+              Dep.add(*SYCLDeviceLibsUnbundleAction, *TC, /*BoundArch=*/"",
+                      Action::OFK_SYCL);
+              auto *SYCLDeviceLibsDependenciesAction =
+                  C.MakeAction<OffloadAction>(
+                      Dep, SYCLDeviceLibsUnbundleAction->getType());
+
+              DeviceLinkObjects.push_back(SYCLDeviceLibsDependenciesAction);
               if (!LibLocSelected)
                 LibLocSelected = !LibLocSelected;
             }
           }
         }
       };
+
       addInputs(sycl_device_wrapper_libs);
-      if (isSpirvAOT)
+      if (isSpirvAOT || TC->getTriple().isNVPTX())
         addInputs(sycl_device_fallback_libs);
       if (Args.hasFlag(options::OPT_fsycl_instrument_device_code,
                        options::OPT_fno_sycl_instrument_device_code, true))
         addInputs(sycl_device_annotation_libs);
+
+      // For NVPTX backend we need to also link libclc and CUDA libdevice
+      // at the same stage that we link all of the unbundled SYCL libdevice
+      // objects together.
+      if (TC->getTriple().isNVPTX() && NumOfDeviceLibLinked) {
+        std::string LibSpirvFile;
+        if (Args.hasArg(options::OPT_fsycl_libspirv_path_EQ)) {
+          auto ProvidedPath =
+              Args.getLastArgValue(options::OPT_fsycl_libspirv_path_EQ).str();
+          if (llvm::sys::fs::exists(ProvidedPath))
+            LibSpirvFile = ProvidedPath;
+        } else {
+          SmallVector<StringRef, 8> LibraryPaths;
+
+          // Expected path without install.
+          SmallString<256> WithoutInstallPath(C.getDriver().ResourceDir);
+          llvm::sys::path::append(WithoutInstallPath, Twine("../../clc"));
+          LibraryPaths.emplace_back(WithoutInstallPath.c_str());
+
+          // Expected path with install.
+          SmallString<256> WithInstallPath(C.getDriver().ResourceDir);
+          llvm::sys::path::append(WithInstallPath, Twine("../../../share/clc"));
+          LibraryPaths.emplace_back(WithInstallPath.c_str());
+
+          // Select remangled libclc: l32 if the aux triple is Windows, else l64.
+          std::string LibSpirvTargetName =
+              (TC->getAuxTriple()->isOSWindows())
+                  ? "remangled-l32-signed_char.libspirv-nvptx64--nvidiacl."
+                    "bc"
+                  : "remangled-l64-signed_char.libspirv-nvptx64--nvidiacl."
+                    "bc";
+
+          for (StringRef LibraryPath : LibraryPaths) {
+            SmallString<128> LibSpirvTargetFile(LibraryPath);
+            llvm::sys::path::append(LibSpirvTargetFile, LibSpirvTargetName);
+            if (llvm::sys::fs::exists(LibSpirvTargetFile) ||
+                Args.hasArg(options::OPT__HASH_HASH_HASH)) {
+              LibSpirvFile = std::string(LibSpirvTargetFile.str());
+              break;
+            }
+          }
+        }
+
+        if (!LibSpirvFile.empty()) {
+          Arg *LibClcInputArg = MakeInputArg(Args, C.getDriver().getOpts(),
+                                             Args.MakeArgString(LibSpirvFile));
+          auto *SYCLLibClcInputAction =
+              C.MakeAction<InputAction>(*LibClcInputArg, types::TY_LLVM_BC);
+          DeviceLinkObjects.push_back(SYCLLibClcInputAction);
+        }
+
+        const toolchains::CudaToolChain *CudaTC =
+            static_cast<const toolchains::CudaToolChain *>(TC);
+        for (auto LinkInputEnum : enumerate(DeviceLinkerInputs)) {
+          const char *BoundArch =
+              SYCLTargetInfoList[LinkInputEnum.index()].BoundArch;
+          std::string LibDeviceFile =
+              CudaTC->CudaInstallation.getLibDeviceFile(BoundArch);
+          if (!LibDeviceFile.empty()) {
+            Arg *CudaDeviceLibInputArg =
+                MakeInputArg(Args, C.getDriver().getOpts(),
+                             Args.MakeArgString(LibDeviceFile));
+            auto *SYCLDeviceLibInputAction = C.MakeAction<InputAction>(
+                *CudaDeviceLibInputArg, types::TY_LLVM_BC);
+            DeviceLinkObjects.push_back(SYCLDeviceLibInputAction);
+          }
+        }
+      }
       return NumOfDeviceLibLinked != 0;
     }
 
@@ -5111,11 +5193,12 @@ class OffloadingActionBuilder final {
         // When spv online link is supported by all backends, the fallback
         // device libraries are only needed when current toolchain is using
         // AOT compilation.
-        if (isSPIR) {
+        if (isSPIR || isNVPTX) {
           bool UseJitLink =
+              isSPIR &&
               Args.hasFlag(options::OPT_fsycl_device_lib_jit_link,
                            options::OPT_fno_sycl_device_lib_jit_link, false);
-          bool UseAOTLink = isSpirvAOT || !UseJitLink;
+          bool UseAOTLink = isSPIR && (isSpirvAOT || !UseJitLink);
           SYCLDeviceLibLinked = addSYCLDeviceLibs(
               TC, FullLinkObjects, UseAOTLink,
               C.getDefaultToolChain().getTriple().isWindowsMSVCEnvironment());

@@ -8730,7 +8730,8 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA,
     Triples += CurTC->getTriple().normalize();
     if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_OpenMP ||
          CurKind == Action::OFK_Cuda || CurKind == Action::OFK_SYCL) &&
-        !StringRef(CurDep->getOffloadingArch()).empty()) {
+        !StringRef(CurDep->getOffloadingArch()).empty() &&
+        !TCArgs.hasArg(options::OPT_fno_bundle_offload_arch)) {
       Triples += '-';
       Triples += CurDep->getOffloadingArch();
     }
@@ -8910,7 +8911,8 @@ void OffloadBundler::ConstructJobMultipleOutputs(
          Dep.DependentOffloadKind == Action::OFK_OpenMP ||
          Dep.DependentOffloadKind == Action::OFK_Cuda ||
          Dep.DependentOffloadKind == Action::OFK_SYCL) &&
-        !Dep.DependentBoundArch.empty()) {
+        !Dep.DependentBoundArch.empty() &&
+        !TCArgs.hasArg(options::OPT_fno_bundle_offload_arch)) {
       Triples += '-';
       Triples += Dep.DependentBoundArch;
     }

@@ -183,7 +183,9 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
   bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
   void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
                            const llvm::opt::ArgList &Args) const override;
-  bool IsMathErrnoDefault() const override { return false; }
+
+  // math-errno is the default for SYCL, but not for other offload kinds (OFK).
+  bool IsMathErrnoDefault() const override { return OK == Action::OFK_SYCL; }
 
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;

@@ -170,6 +170,13 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
         LibPostfix = ".obj";
       std::string FileName = this->getToolChain().getInputFilename(II);
       StringRef InputFilename = llvm::sys::path::filename(FileName);
+      if (this->getToolChain().getTriple().isNVPTX()) {
+        // Linking SYCL Device libs requires libclc as well as libdevice
+        if ((InputFilename.find("nvidiacl") != InputFilename.npos ||
+             InputFilename.find("libdevice") != InputFilename.npos))
+          return true;
+        LibPostfix = ".cubin";
+      }
       StringRef LibSyclPrefix("libsycl-");
       if (!InputFilename.startswith(LibSyclPrefix) ||
           !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2))
@@ -620,7 +627,7 @@ void SYCL::x86_64::BackendCompiler::ConstructJob(
 
 SYCLToolChain::SYCLToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ToolChain &HostTC, const ArgList &Args)
-    : ToolChain(D, Triple, Args), HostTC(HostTC), SYCLInstallation(D) {
+    : ToolChain(D, Triple, Args), HostTC(HostTC) {
   // Lookup binaries into the driver directory, this is used to
   // discover the clang-offload-bundler executable.
   getProgramPaths().push_back(getDriver().Dir);

@@ -172,9 +172,7 @@ class LLVM_LIBRARY_VISIBILITY SYCLToolChain : public ToolChain {
       const llvm::opt::ArgList &Args,
       llvm::opt::ArgStringList &CC1Args) const override;
 
-
   const ToolChain &HostTC;
-  const SYCLInstallationDetector SYCLInstallation;
 
 protected:
   Tool *buildBackendCompiler() const override;

@@ -1,4 +1,4 @@
-// RUN: %clangxx -ccc-print-phases -target x86_64-unknown-linux-gnu  -fsycl -fsycl-targets=nvptx64-nvidia-cuda  -Xsycl-target-backend  --cuda-gpu-arch=sm_80 --cuda-gpu-arch=sm_80 -c %s 2>&1 | FileCheck %s --check-prefix=DEFAULT-PHASES
+// RUN: %clangxx -ccc-print-phases --sysroot=%S/Inputs/SYCL -target x86_64-unknown-linux-gnu  -fsycl -fsycl-targets=nvptx64-nvidia-cuda  -Xsycl-target-backend  --cuda-gpu-arch=sm_80 --cuda-gpu-arch=sm_80 -c %s 2>&1 | FileCheck %s --check-prefix=DEFAULT-PHASES
 
 // Test the correct placement of the offloading actions for compiling CUDA sources (*.cu) in SYCL.
 
@@ -19,7 +19,7 @@
 // DEFAULT-PHASES:|- 14: assembler, {13}, object, (host-cuda-sycl)
 // DEFAULT-PHASES:15: clang-offload-bundler, {3, 14}, object, (host-cuda-sycl)
 
-// RUN: %clangxx -ccc-print-phases -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=nvptx64-nvidia-cuda  -Xsycl-target-backend  --cuda-gpu-arch=sm_80 --cuda-gpu-arch=sm_80 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT-PHASES2
+// RUN: %clangxx -ccc-print-phases --sysroot=%S/Inputs/SYCL --cuda-path=%S/Inputs/CUDA_111/usr/local/cuda -fsycl-libspirv-path=%S/Inputs/SYCL/lib/nvidiacl -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=nvptx64-nvidia-cuda  -Xsycl-target-backend  --cuda-gpu-arch=sm_80 --cuda-gpu-arch=sm_80 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT-PHASES2
 
 // DEFAULT-PHASES2:                     +- 0: input, "{{.*}}", cuda, (host-cuda)
 // DEFAULT-PHASES2:                  +- 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
@@ -37,14 +37,71 @@
 // DEFAULT-PHASES2:      +- 13: assembler, {12}, object, (host-cuda-sycl)
 // DEFAULT-PHASES2:   +- 14: offload, "host-cuda-sycl (x86_64-unknown-linux-gnu)" {13}, object
 // DEFAULT-PHASES2:+- 15: linker, {14}, image, (host-cuda-sycl)
-// DEFAULT-PHASES2:|           +- 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_80)" {5}, ir
-// DEFAULT-PHASES2:|        +- 17: linker, {16}, ir, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     +- 18: sycl-post-link, {17}, ir, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     |  +- 19: file-table-tform, {18}, ir, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     |  |  +- 20: backend, {19}, assembler, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     |  |  |- 21: assembler, {20}, object, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     |  |- 22: linker, {20, 21}, cuda-fatbin, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|     |- 23: foreach, {19, 22}, cuda-fatbin, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|  +- 24: file-table-tform, {18, 23}, tempfiletable, (device-sycl, sm_80)
-// DEFAULT-PHASES2:|- 25: clang-offload-wrapper, {24}, object, (device-sycl, sm_80)
-// DEFAULT-PHASES2:26: offload, "host-cuda-sycl (x86_64-unknown-linux-gnu)" {15}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {25}, image
+// DEFAULT-PHASES2:|              +- 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_80)" {5}, ir
+// DEFAULT-PHASES2:|           +- 17: linker, {16}, ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|           |     +- 18: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 19: clang-offload-unbundler, {18}, object
+// DEFAULT-PHASES2:|           |- 20: offload, " (nvptx64-nvidia-cuda)" {19}, object
+// DEFAULT-PHASES2:|           |     +- 21: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 22: clang-offload-unbundler, {21}, object
+// DEFAULT-PHASES2:|           |- 23: offload, " (nvptx64-nvidia-cuda)" {22}, object
+// DEFAULT-PHASES2:|           |     +- 24: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 25: clang-offload-unbundler, {24}, object
+// DEFAULT-PHASES2:|           |- 26: offload, " (nvptx64-nvidia-cuda)" {25}, object
+// DEFAULT-PHASES2:|           |     +- 27: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 28: clang-offload-unbundler, {27}, object
+// DEFAULT-PHASES2:|           |- 29: offload, " (nvptx64-nvidia-cuda)" {28}, object
+// DEFAULT-PHASES2:|           |     +- 30: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 31: clang-offload-unbundler, {30}, object
+// DEFAULT-PHASES2:|           |- 32: offload, " (nvptx64-nvidia-cuda)" {31}, object
+// DEFAULT-PHASES2:|           |     +- 33: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 34: clang-offload-unbundler, {33}, object
+// DEFAULT-PHASES2:|           |- 35: offload, " (nvptx64-nvidia-cuda)" {34}, object
+// DEFAULT-PHASES2:|           |     +- 36: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 37: clang-offload-unbundler, {36}, object
+// DEFAULT-PHASES2:|           |- 38: offload, " (nvptx64-nvidia-cuda)" {37}, object
+// DEFAULT-PHASES2:|           |     +- 39: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 40: clang-offload-unbundler, {39}, object
+// DEFAULT-PHASES2:|           |- 41: offload, " (nvptx64-nvidia-cuda)" {40}, object
+// DEFAULT-PHASES2:|           |     +- 42: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 43: clang-offload-unbundler, {42}, object
+// DEFAULT-PHASES2:|           |- 44: offload, " (nvptx64-nvidia-cuda)" {43}, object
+// DEFAULT-PHASES2:|           |     +- 45: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 46: clang-offload-unbundler, {45}, object
+// DEFAULT-PHASES2:|           |- 47: offload, " (nvptx64-nvidia-cuda)" {46}, object
+// DEFAULT-PHASES2:|           |     +- 48: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 49: clang-offload-unbundler, {48}, object
+// DEFAULT-PHASES2:|           |- 50: offload, " (nvptx64-nvidia-cuda)" {49}, object
+// DEFAULT-PHASES2:|           |     +- 51: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 52: clang-offload-unbundler, {51}, object
+// DEFAULT-PHASES2:|           |- 53: offload, " (nvptx64-nvidia-cuda)" {52}, object
+// DEFAULT-PHASES2:|           |     +- 54: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 55: clang-offload-unbundler, {54}, object
+// DEFAULT-PHASES2:|           |- 56: offload, " (nvptx64-nvidia-cuda)" {55}, object
+// DEFAULT-PHASES2:|           |     +- 57: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 58: clang-offload-unbundler, {57}, object
+// DEFAULT-PHASES2:|           |- 59: offload, " (nvptx64-nvidia-cuda)" {58}, object
+// DEFAULT-PHASES2:|           |     +- 60: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 61: clang-offload-unbundler, {60}, object
+// DEFAULT-PHASES2:|           |- 62: offload, " (nvptx64-nvidia-cuda)" {61}, object
+// DEFAULT-PHASES2:|           |     +- 63: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 64: clang-offload-unbundler, {63}, object
+// DEFAULT-PHASES2:|           |- 65: offload, " (nvptx64-nvidia-cuda)" {64}, object
+// DEFAULT-PHASES2:|           |     +- 66: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 67: clang-offload-unbundler, {66}, object
+// DEFAULT-PHASES2:|           |- 68: offload, " (nvptx64-nvidia-cuda)" {67}, object
+// DEFAULT-PHASES2:|           |     +- 69: input, "{{.*}}", object
+// DEFAULT-PHASES2:|           |  +- 70: clang-offload-unbundler, {69}, object
+// DEFAULT-PHASES2:|           |- 71: offload, " (nvptx64-nvidia-cuda)" {70}, object
+// DEFAULT-PHASES2:|           |- 72: input, "{{.*}}nvidiacl{{.*}}", ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|           |- 73: input, "{{.*}}libdevice{{.*}}", ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|        +- 74: linker, {17, 20, 23, 26, 29, 32, 35, 38, 41, 44, 47, 50, 53, 56, 59, 62, 65, 68, 71, 72, 73}, ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     +- 75: sycl-post-link, {74}, ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     |  +- 76: file-table-tform, {75}, ir, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     |  |  +- 77: backend, {76}, assembler, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     |  |  |- 78: assembler, {77}, object, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     |  |- 79: linker, {77, 78}, cuda-fatbin, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|     |- 80: foreach, {76, 79}, cuda-fatbin, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|  +- 81: file-table-tform, {75, 80}, tempfiletable, (device-sycl, sm_80)
+// DEFAULT-PHASES2:|- 82: clang-offload-wrapper, {81}, object, (device-sycl, sm_80)
+// DEFAULT-PHASES2:83: offload, "host-cuda-sycl (x86_64-unknown-linux-gnu)" {15}, "device-sycl (nvptx64-nvidia-cuda:sm_80)" {82}, image