This repository has been archived by the owner on May 27, 2021. It is now read-only.

Commit ff3b34e

Add toggle for contextualization, and disable for tests relying on names.
maleadt committed Jan 20, 2020
1 parent 39c4b7a commit ff3b34e
Showing 5 changed files with 24 additions and 23 deletions.
2 changes: 2 additions & 0 deletions src/compiler/common.jl
@@ -7,6 +7,8 @@ Base.@kwdef struct CompilerJob
cap::VersionNumber
kernel::Bool

contextualize::Bool = true

# optional properties
minthreads::Union{Nothing,CuDim} = nothing
maxthreads::Union{Nothing,CuDim} = nothing
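Since CompilerJob is a Base.@kwdef struct, the new field becomes a keyword argument that defaults to true, so existing construction sites keep working unchanged. A minimal standalone sketch of that pattern (the struct and field set below are illustrative, not the real CompilerJob):

# Illustrative only: mirrors the @kwdef pattern used by CompilerJob above.
Base.@kwdef struct ToyJob
    kernel::Bool
    contextualize::Bool = true  # new toggle, enabled unless requested otherwise
end

ToyJob(kernel=true)                       # contextualize == true by default
ToyJob(kernel=true, contextualize=false)  # opt out of contextualization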
2 changes: 1 addition & 1 deletion src/compiler/driver.jl
@@ -62,7 +62,7 @@ function codegen(target::Symbol, job::CompilerJob;
@timeit_debug to "validation" check_method(job)

@timeit_debug to "Julia front-end" begin
f = contextualize(job.f)
f = job.contextualize ? contextualize(job.f) : job.f

# get the method instance
world = typemax(UInt)
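For background, contextualize runs the kernel through a Cassette context so that calls inside it can be overdubbed with GPU-friendly methods; the new field simply lets a job skip that step. A rough sketch of the idea, not CUDAnative's actual definition (the context name Ctx is hypothetical):

using Cassette

Cassette.@context Ctx  # hypothetical context; CUDAnative defines its own

# Every call made inside `f` is executed through the context, so
# overdubbed methods can take effect.
contextualized(f) = (args...) -> Cassette.overdub(Ctx(), f, args...)

Because the wrapper is an anonymous closure, the emitted LLVM/PTX symbols no longer carry the original function name, which is why the name-sensitive tests below pass contextualize=false.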
3 changes: 2 additions & 1 deletion src/execution.jl
@@ -9,7 +9,7 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nearest_warpsize
# the code it generates, or the execution
function split_kwargs(kwargs)
macro_kws = [:dynamic]
compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name]
compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name, :contextualize]
call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream]
macro_kwargs = []
compiler_kwargs = []
@@ -351,6 +351,7 @@ The following keyword arguments are supported:
- `maxregs`: the maximum number of registers to be allocated to a single thread (only
supported on LLVM 4.0+)
- `name`: override the name that the kernel will have in the generated code
- `contextualize`: whether to contextualize functions using Cassette (default: true)
The output of this function is automatically cached, i.e. you can simply call `cufunction`
in a hot path without degrading performance. New code will be generated automatically, when
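A minimal usage sketch of the new keyword, mirroring the call sites updated in the tests below (assumes a working CUDA setup; kernel is just a placeholder):

using CUDAnative

kernel() = return

# The keyword is forwarded to the compiler from @cuda ...
@cuda threads=1 contextualize=false kernel()

# ... from cufunction ...
k = cufunction(kernel; contextualize=false)

# ... and from the reflection entry points.
CUDAnative.code_llvm(devnull, kernel, Tuple{}; contextualize=false)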
32 changes: 15 additions & 17 deletions test/codegen.jl
@@ -8,7 +8,8 @@
valid_kernel() = return
invalid_kernel() = 1

ir = sprint(io->CUDAnative.code_llvm(io, valid_kernel, Tuple{}; optimize=false, dump_module=true))
ir = sprint(io->CUDAnative.code_llvm(io, valid_kernel, Tuple{}; dump_module=true,
contextualize=false, optimize=false))

# module should contain our function + a generic call wrapper
@test occursin("define void @julia_valid_kernel", ir)
@@ -21,11 +22,6 @@
@test_throws CUDAnative.KernelError CUDAnative.code_llvm(devnull, invalid_kernel, Tuple{}; kernel=true) == nothing
end

@testset "unbound typevars" begin
invalid_kernel() where {unbound} = return
@test_throws CUDAnative.KernelError CUDAnative.code_llvm(devnull, invalid_kernel, Tuple{})
end

@testset "exceptions" begin
foobar() = throw(DivideError())
ir = sprint(io->CUDAnative.code_llvm(io, foobar, Tuple{}))
@@ -52,7 +48,7 @@ end
@noinline child(i) = sink(i)
parent(i) = child(i)

ir = sprint(io->CUDAnative.code_llvm(io, parent, Tuple{Int}))
ir = sprint(io->CUDAnative.code_llvm(io, parent, Tuple{Int}; contextualize=false))
@test occursin(r"call .+ @julia_child_", ir)
end

@@ -76,10 +72,10 @@ end
x::Int
end

ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Aggregate}))
ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Aggregate}; contextualize=false))
@test occursin(r"@julia_kernel_\d+\(({ i64 }|\[1 x i64\]) addrspace\(\d+\)?\*", ir)

ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Aggregate}; kernel=true))
ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Aggregate}; contextualize=false, kernel=true))
@test occursin(r"@ptxcall_kernel_\d+\(({ i64 }|\[1 x i64\])\)", ir)
end

@@ -135,7 +131,7 @@ end
closure = ()->return

function test_name(f, name; kwargs...)
code = sprint(io->CUDAnative.code_llvm(io, f, Tuple{}; kwargs...))
code = sprint(io->CUDAnative.code_llvm(io, f, Tuple{}; contextualize=false, kwargs...))
@test occursin(name, code)
end

@@ -221,7 +217,7 @@ end
return
end

asm = sprint(io->CUDAnative.code_ptx(io, parent, Tuple{Int64}))
asm = sprint(io->CUDAnative.code_ptx(io, parent, Tuple{Int64}; contextualize=false))
@test occursin(r"call.uni\s+julia_child_"m, asm)
end

@@ -232,7 +228,7 @@ end
return
end

asm = sprint(io->CUDAnative.code_ptx(io, entry, Tuple{Int64}; kernel=true))
asm = sprint(io->CUDAnative.code_ptx(io, entry, Tuple{Int64}; contextualize=false, kernel=true))
@test occursin(r"\.visible \.entry ptxcall_entry_", asm)
@test !occursin(r"\.visible \.func julia_nonentry_", asm)
@test occursin(r"\.func julia_nonentry_", asm)
@@ -279,15 +275,15 @@ end
return
end

asm = sprint(io->CUDAnative.code_ptx(io, parent1, Tuple{Int}))
asm = sprint(io->CUDAnative.code_ptx(io, parent1, Tuple{Int}; contextualize=false))
@test occursin(r".func julia_child_", asm)

function parent2(i)
child(i+1)
return
end

asm = sprint(io->CUDAnative.code_ptx(io, parent2, Tuple{Int}))
asm = sprint(io->CUDAnative.code_ptx(io, parent2, Tuple{Int}; contextualize=false))
@test occursin(r".func julia_child_", asm)
end

@@ -357,7 +353,7 @@ end
closure = ()->nothing

function test_name(f, name; kwargs...)
code = sprint(io->CUDAnative.code_ptx(io, f, Tuple{}; kwargs...))
code = sprint(io->CUDAnative.code_ptx(io, f, Tuple{}; contextualize=false, kwargs...))
@test occursin(name, code)
end

@@ -429,7 +425,7 @@ end
return
end

ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}}))
ir = sprint(io->CUDAnative.code_llvm(io, kernel, Tuple{Float32,Ptr{Float32}}; contextualize=false))
@test occursin("jl_box_float32", ir)
CUDAnative.code_ptx(devnull, kernel, Tuple{Float32,Ptr{Float32}})
end
@@ -444,18 +440,20 @@ end

# some validation happens in the emit_function hook, which is called by code_llvm

# NOTE: contextualization changes order of frames
@testset "recursion" begin
@eval recurse_outer(i) = i > 0 ? i : recurse_inner(i)
@eval @noinline recurse_inner(i) = i < 0 ? i : recurse_outer(i)

@test_throws_message(CUDAnative.KernelError, CUDAnative.code_llvm(devnull, recurse_outer, Tuple{Int})) do msg
@test_throws_message(CUDAnative.KernelError, CUDAnative.code_llvm(devnull, recurse_outer, Tuple{Int}; contextualize=false)) do msg
occursin("recursion is currently not supported", msg) &&
occursin("[1] recurse_outer", msg) &&
occursin("[2] recurse_inner", msg) &&
occursin("[3] recurse_outer", msg)
end
end

# FIXME: contextualization removes all frames here -- changed inlining behavior?
@testset "base intrinsics" begin
foobar(i) = sin(i)

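The pattern in this file: any test that greps the generated code for the original Julia function name opts out of contextualization, since the compiled entry point is otherwise the Cassette wrapper. A hypothetical illustration of the failure mode these edits avoid (not part of the test suite):

ir = sprint(io->CUDAnative.code_llvm(io, valid_kernel, Tuple{}; dump_module=true))
occursin("julia_valid_kernel", ir)  # may be false when contextualization is enabled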
8 changes: 4 additions & 4 deletions test/device/execution.jl
@@ -70,9 +70,9 @@ end
@test_throws ErrorException @device_code_lowered nothing

# make sure kernel name aliases are preserved in the generated code
@test occursin("ptxcall_dummy", sprint(io->(@device_code_llvm io=io @cuda dummy())))
@test occursin("ptxcall_dummy", sprint(io->(@device_code_ptx io=io @cuda dummy())))
@test occursin("ptxcall_dummy", sprint(io->(@device_code_sass io=io @cuda dummy())))
@test occursin("ptxcall_dummy", sprint(io->(@device_code_llvm io=io @cuda contextualize=false dummy())))
@test occursin("ptxcall_dummy", sprint(io->(@device_code_ptx io=io @cuda contextualize=false dummy())))
@test occursin("ptxcall_dummy", sprint(io->(@device_code_sass io=io @cuda contextualize=false dummy())))

# make sure invalid kernels can be partially reflected upon
let
@@ -96,7 +96,7 @@

# set name of kernel
@test occursin("ptxcall_mykernel", sprint(io->(@device_code_llvm io=io begin
k = cufunction(dummy, name="mykernel")
k = cufunction(dummy; name="mykernel", contextualize=false)
k()
end)))
end