diff --git a/src/device/intrinsics/math.jl b/src/device/intrinsics/math.jl
index b746d8f02b..f0782a9036 100644
--- a/src/device/intrinsics/math.jl
+++ b/src/device/intrinsics/math.jl
@@ -40,6 +40,9 @@ end
     ccall("extern __nv_sincosf", llvmcall, Cvoid, (Cfloat, Ptr{Cfloat}, Ptr{Cfloat}), x, s, c)
     return (s[], c[])
 end
+# Base has sincos_fast fall back to the native sincos, which is presumed to be faster;
+# that is not the case compared to CUDA's intrinsics, so compose sin_fast and cos_fast.
+@device_override FastMath.sincos_fast(x::Union{Float64,Float32}) = (FastMath.sin_fast(x), FastMath.cos_fast(x))
 
 @device_override function Base.sincospi(x::Float64)
     s = Ref{Cdouble}()
diff --git a/test/device/intrinsics/math.jl b/test/device/intrinsics/math.jl
index 16273aa823..502bab0447 100644
--- a/test/device/intrinsics/math.jl
+++ b/test/device/intrinsics/math.jl
@@ -136,4 +136,16 @@ using SpecialFunctions
             @test Array(a)[3] == r
         end
     end
+
+    @testset "@fastmath sincos" begin
+        # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
+        function kernel(a, b, c)
+            @inbounds b[], c[] = @fastmath sincos(a[])
+            return
+        end
+        asm = sprint(io->CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}}))
+        @test contains(asm, "sin.approx.f32")
+        @test contains(asm, "cos.approx.f32")
+        @test !contains(asm, "__nv")     # from libdevice
+    end
 end