Restore functionality of FastMath.sincos. (#1627)

In JuliaLang/julia#24031, Base decided that FastMath.sincos should fall back to Julia's native implementation, because it is faster than the intrinsics (on the CPU, at least). That does not hold for CUDA GPUs, so make it call sin_fast/cos_fast again.
JuliaGPU · Jan 5, 2023 · a51ef2f · a51ef2f
1 parent b300158
commit a51ef2f
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 0 deletions.
diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
@@ -40,6 +40,9 @@ end
     ccall("extern __nv_sincosf", llvmcall, Cvoid, (Cfloat, Ptr{Cfloat}, Ptr{Cfloat}), x, s, c)
     return (s[], c[])
 end
+# Base (JuliaLang/julia#24031) has sincos_fast fall back to the native Julia implementation,
+# presuming it is faster; on CUDA GPUs it is not, so return (sin_fast(x), cos_fast(x)).
+@device_override FastMath.sincos_fast(x::Union{Float64,Float32}) = (FastMath.sin_fast(x), FastMath.cos_fast(x))
 
 @device_override function Base.sincospi(x::Float64)
     s = Ref{Cdouble}()

diff --git a/test/device/intrinsics/math.jl b/test/device/intrinsics/math.jl
@@ -136,4 +136,16 @@ using SpecialFunctions
             @test Array(a)[3] == r
         end
     end
+
+    @testset "@fastmath sincos" begin
+        # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos (regression check)
+        function kernel(a, b, c)
+            @inbounds b[], c[] = @fastmath sincos(a[])
+            return
+        end
+        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+        @test contains(asm, "sin.approx.f32")  # @test (not @assert) so the testset records it
+        @test contains(asm, "cos.approx.f32")
+        @test !contains(asm, "__nv")  # no symbols from libdevice
+    end
 end