diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 81c15adb8248e..07542feb9d814 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -75,6 +75,9 @@ class TargetOptions { /// address space. bool NVPTXUseShortPointers = false; + /// \brief If enabled, use precise square root + bool NVVMCudaPrecSqrt = false; + /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d6c3ca7d9c4ea..8d9b18cb74c29 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -980,6 +980,11 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr", TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", + TargetOpts<"NVVMCudaPrecSqrt">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c55f0457bc504..4d1209c69506d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -758,13 +758,15 @@ void CodeGenModule::Release() { llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput)); } - if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) { + if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) { // Indicate whether __nvvm_reflect should be configured to flush denormal // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", CodeGenOpts.FP32DenormalMode.Output != llvm::DenormalMode::IEEE); + getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt", + getTarget().getTargetOpts().NVVMCudaPrecSqrt); } if (LangOpts.EHAsynch) diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu index b5abc29dea14b..f19132b0b4643 100644 --- a/clang/test/CodeGenCUDA/flush-denormals.cu +++ b/clang/test/CodeGenCUDA/flush-denormals.cu @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" // NOFTZ-NOT: "denormal-fp-math-f32" -// PTXFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} +// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} -// PTXNOFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} +// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0} diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu b/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu new file mode 100644 index 0000000000000..9054f1ab785ec --- /dev/null +++ b/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-sqrt %s -o -| FileCheck --check-prefix=CHECK-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s + +#include "Inputs/cuda.h" + +// Check that the -fcuda-prec-sqrt flag correctly sets the nvvm-reflect module flags. + +extern "C" __device__ void foo() {} + +// CHECK-ON: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1} +// CHECK-OFF: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0} diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index e4b5ace6b6340..0c4fe7924a0e7 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -343,19 +343,22 @@ Reflection Parameters The libdevice library currently uses the following reflection parameters to control code generation: -==================== ====================================================== -Flag Description -==================== ====================================================== -``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero -==================== ====================================================== +=========================== ====================================================== +Flag Description +=========================== ====================================================== +``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero +``__CUDA_PREC_SQRT=[0,1]`` Use precise square root +=========================== ====================================================== -The value of this flag is determined by the "nvvm-reflect-ftz" module flag. -The following sets the ftz flag to 1. +The value of these flags are determined by the "nvvm-reflect-ftz" and +"nvvm-reflect-prec-sqrt" module flags respectively. +The following sets the ftz flag to 1, and the precise sqrt flag to 1. .. code-block:: llvm - !llvm.module.flag = !{!0} + !llvm.module.flag = !{!0, !1} !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1} + !1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1} (``i32 4`` indicates that the value set here overrides the value in another module we link with. See the `LangRef ` diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 339f51d210874..6ffc49a59a551 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -170,6 +170,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { ReflectVal = Flag->getSExtValue(); } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; + } else if (ReflectArg == "__CUDA_PREC_SQRT") { + // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module + // flag. + if (auto *Flag = mdconst::extract_or_null( + F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) + ReflectVal = Flag->getSExtValue(); } Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll index 57ab33798709a..696fd6d983549 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll @@ -3,6 +3,7 @@ declare i32 @__nvvm_reflect(i8*) @str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" +@str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"__CUDA_PREC_SQRT\00" define i32 @foo() { %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) @@ -10,5 +11,12 @@ define i32 @foo() { ret i32 %call } -!llvm.module.flags = !{!0} +define i32 @foo_sqrt() { + %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([17 x i8], [17 x i8] addrspace(1)* @str.1, i32 0, i32 0) to i8*)) + ; CHECK: ret i32 42 + ret i32 %call +} + +!llvm.module.flags = !{!0, !1} !0 = !{i32 4, !"nvvm-reflect-ftz", i32 42} +!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 42} diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 2b1c682644db3..b112a6e6ac500 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -819,6 +819,12 @@ which contains all the symbols required. GPU (SM 71), but it should work on any GPU compatible with SM 50 or above * The NVIDIA OpenCL headers conflict with the OpenCL headers required for this project and may cause compilation issues on some platforms +* `sycl::sqrt` is not correctly rounded by default as the SYCL specification + allows lower precision, when porting from CUDA it may be helpful to use + `-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is + significantly slower but matches the default precision used by `nvcc`, and + this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that + it defaults to `false`. ### HIP back-end limitations