Skip to content

Commit

Permalink
Restore functionality of FastMath.sincos. (#1627)
Browse files Browse the repository at this point in the history
Base decided in JuliaLang/julia#24031 that FastMath.sincos should
fall back to the native implementation in Julia, because it is
faster than the intrinsics (for the CPU at least). That does not
hold for CUDA GPUs, so have it again call sin_fast/cos_fast.
  • Loading branch information
maleadt committed Jan 5, 2023
1 parent b300158 commit a51ef2f
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ end
ccall("extern __nv_sincosf", llvmcall, Cvoid, (Cfloat, Ptr{Cfloat}, Ptr{Cfloat}), x, s, c)
return (s[], c[])
end
# Base routes `sincos_fast` to its native implementation on the assumption that it
# beats the intrinsics. That assumption does not hold against CUDA's intrinsics, so
# compute the pair from the fast sine and fast cosine instead.
@device_override function FastMath.sincos_fast(x::Union{Float64,Float32})
    s = FastMath.sin_fast(x)
    c = FastMath.cos_fast(x)
    return (s, c)
end

@device_override function Base.sincospi(x::Float64)
s = Ref{Cdouble}()
Expand Down
12 changes: 12 additions & 0 deletions test/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -136,4 +136,16 @@ using SpecialFunctions
@test Array(a)[3] == r
end
end

@testset "@fastmath sincos" begin
    # JuliaGPU/CUDA.jl#1606: FastMath.sincos fell back to regular sin/cos
    function kernel(a, b, c)
        @inbounds b[], c[] = @fastmath sincos(a[])
        return
    end

    # Check the generated PTX text for the expected instructions.
    asm = sprint() do io
        CUDA.code_ptx(io, kernel, NTuple{3,CuDeviceArray{Float32,1,AS.Global}})
    end

    # Use `@test` rather than `@assert`: each check is then recorded by the
    # enclosing testset and a failure does not prevent the remaining checks from
    # running, whereas `@assert` throws and may be disabled at some optimization
    # levels.
    @test contains(asm, "sin.approx.f32")
    @test contains(asm, "cos.approx.f32")
    @test !contains(asm, "__nv")  # from libdevice
end
end

0 comments on commit a51ef2f

Please sign in to comment.