Compare with cuBLAS during benchmarking
thomasfaingnaert committed Nov 7, 2023
1 parent 41466e4 commit 11187de
Showing 2 changed files with 86 additions and 27 deletions.
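The gist of the change: for every benchmark configuration that specifies a baseline, the script now also times the corresponding cuBLAS call and reports the ratio of the best baseline time to the best GemmKernels time. A rough, self-contained sketch of that measurement pattern, assuming a CUDA-capable device, hypothetical sizes, and mul! as a stand-in for a GemmKernels kernel (this is not the repository's code):

    using CUDA, LinearAlgebra

    NUM_SAMPLES = 10
    M = N = K = 2048                                   # hypothetical problem size

    a = CuArray(rand(Float16, M, K))
    b = CuArray(rand(Float16, K, N))
    c = CUDA.zeros(Float32, M, N)

    mul!(c, a, b)                                      # warm-up, so compilation is not timed

    # Candidate: stand-in for the GemmKernels kernel under test.
    candidate_times = [(CUDA.@elapsed mul!(c, a, b)) for _ in 1:NUM_SAMPLES]

    # Baseline: the cuBLAS mixed-precision GEMM used by the new wmma_baseline below.
    CUDA.CUBLAS.cublasSetMathMode(CUDA.CUBLAS.handle(), CUDA.CUBLAS.CUBLAS_TENSOR_OP_MATH)
    baseline_times = [(CUDA.@elapsed CUDA.CUBLAS.gemmEx!('N', 'N', Float32(1), a, b, Float32(0), c)) for _ in 1:NUM_SAMPLES]

    # Same reporting convention as the benchmark script: minimum baseline time
    # over minimum candidate time, in percent.
    ratio = round(100 * minimum(baseline_times) / minimum(candidate_times); sigdigits=3)
    @info "baseline is $(ratio)% of the candidate's best time"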
benchmarks/runbenchmarks.jl: 56 changes (45 additions, 11 deletions)
@@ -105,6 +105,7 @@ end
include("../configs/configs.jl")

results = Dict()
baseline_results = Dict()
details = Dict()

for cf in get_configs()
@@ -115,30 +116,49 @@ for cf in get_configs()
run_gemm(cf, a, b, c, d)

# benchmark
profile_results = CUDA.@profiled begin
profile_results = CUDA.@profile begin
for sample in 1:NUM_SAMPLES
run_gemm(cf, a, b, c, d)
end
end

# XXX: This works for now, since every GEMM is one kernel, but later on we may want to benchmark
# operations consisting of multiple kernel launches...
# XXX: Will this always work with mangling?
matmul_results = filter(row -> contains(row.name, String(Symbol(cf.kernel))), profile_results.device)

@assert size(matmul_results, 1) == NUM_SAMPLES
profile_results = profile_results.device

# get info
details[cf.name] = Dict(
"registers" => matmul_results[1, "registers"],
"dynamic_shared_mem" => matmul_results[1, "shared_mem"].dynamic,
"static_shared_mem" => matmul_results[1, "shared_mem"].static,
"local_mem" => matmul_results[1, "local_mem"].thread
"registers" => profile_results[1, "registers"],
"dynamic_shared_mem" => profile_results[1, "shared_mem"].dynamic,
"static_shared_mem" => profile_results[1, "shared_mem"].static,
"local_mem" => profile_results[1, "local_mem"].thread
)

times = 1e9 .* (matmul_results[!, "stop"] - matmul_results[!, "start"])
times = 1e9 .* (profile_results[!, "stop"] - profile_results[!, "start"])
@assert length(times) == NUM_SAMPLES

@info "\tGemmKernels: $(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"

if !isnothing(cf.baseline)
# benchmark baseline
baseline_profile_results = CUDA.@profile begin
for sample in 1:NUM_SAMPLES
run_baseline(cf, a, b, c, d)
end
end

baseline_profile_results = baseline_profile_results.device
@assert size(baseline_profile_results, 1) % NUM_SAMPLES == 0

baseline_times = 1e9 .* sum.(Iterators.partition(baseline_profile_results[!, "stop"] - baseline_profile_results[!, "start"], size(baseline_profile_results, 1) ÷ NUM_SAMPLES))
@assert length(baseline_times) == NUM_SAMPLES

baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"
@info "\tBaseline: $(prettytime(baseline_times)) $(prettyflops(baseline_times, cf.config.matmul_shape)) (GemmKernels: $(baseline_ratio)%)"

baseline_results[cf.name] = Dict("times" => baseline_times)
end

@info "\t$(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))"
results[cf.name] = Dict("times" => times)
end

@@ -303,6 +323,20 @@ if previous_results !== nothing
end
end

# Print results compared to baseline.
println(io, "# Comparison with baseline")

println(io, "| test | GemmKernels | Baseline | % |")
println(io, "|------|-------------|----------|---|")

for k in keys(baseline_results)
times = results[k]["times"]
baseline_times = baseline_results[k]["times"]
baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))"

println(io, "| $(markdown_escaped_code(k)) | $(prettytime(times)) | $(prettytime(baseline_times)) | $(baseline_ratio) |")
end

body = String(take!(io))
println(body)

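One detail from the hunks above: the device table produced by CUDA.@profile has one row per kernel launch, and a single cuBLAS baseline call may launch more than one kernel, so the baseline durations are grouped into NUM_SAMPLES equally sized chunks and summed per chunk (the GemmKernels path can assert exactly one kernel per sample). A small illustration of that reduction, using made-up timestamps rather than real profiler output:

    # Three samples, two kernels launched per sample (synthetic numbers).
    NUM_SAMPLES = 3
    starts = [0.0, 0.5, 1.0, 1.5, 2.0, 2.5]
    stops  = [0.25, 0.75, 1.25, 1.75, 2.25, 2.75]

    durations = 1e9 .* (stops .- starts)              # seconds -> nanoseconds
    @assert length(durations) % NUM_SAMPLES == 0      # same sanity check as the script

    kernels_per_sample = length(durations) ÷ NUM_SAMPLES
    sample_times = sum.(Iterators.partition(durations, kernels_per_sample))
    @assert length(sample_times) == NUM_SAMPLES       # each sample sums to 5.0e8 ns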
configs/configs.jl: 57 changes (41 additions, 16 deletions)
@@ -21,6 +21,7 @@ struct Configuration
epilogue # The epilogue to use.
verify # Verify function to use.
kernel # The kernel function to use.
baseline # Baseline implementation to compare performance against
end

function get_custom_mul!(element_update)
@@ -82,6 +83,12 @@ function run_gemm(cf::Configuration, a, b, c, d)
kernel = cf.kernel)
end

# Run the baseline.
function run_baseline(cf::Configuration, a, b, c, d)
@assert !isnothing(cf.baseline)
cf.baseline(a, b, c, d, cf.alpha, cf.beta, cf.transpose_a, cf.transpose_b)
end

# Verify results.
function verify(cf::Configuration, c_h, d)
cf.verify(c_h, d)
@@ -101,18 +108,28 @@ function verify_dual(c_h, d)
isapprox(c_dual, d_dual)
end

function fpu_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_DEFAULT_MATH)
CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
end

function wmma_baseline(a, b, c, d, alpha, beta, transpose_a, transpose_b)
CUDA.CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH)
CUDA.CUBLAS.gemmEx!(!transpose_a ? 'N' : 'T', !transpose_b ? 'N' : 'T', alpha, a, b, beta, c)
end

function get_configs()
rv = []

# FPU Op
for (A_type, B_type, CD_type) in [
(Float16, Float16, Float32),
(Float32, Float32, Float32),
(Float32, Float32, Float64),
(Float64, Float64, Float64),
(Int16, Int16, Int16),
(Int32, Int32, Int32),
(Int64, Int64, Int64)],
for (A_type, B_type, CD_type, baseline_func) in [
(Float16, Float16, Float32, fpu_baseline),
(Float32, Float32, Float32, fpu_baseline),
(Float32, Float32, Float64, nothing),
(Float64, Float64, Float64, fpu_baseline),
(Int16, Int16, Int16, nothing),
(Int32, Int32, Int32, nothing),
(Int64, Int64, Int64, nothing)],
transpose_a = [false, true],
transpose_b = [false, true],
(OP_M, OP_N, OP_K, OP_MB, OP_NB, OP_KB) in [(8, 16, 2, 4, 8, 1)],
@@ -151,7 +168,8 @@ function get_configs()
mul!,
Epilogue.Default(),
verify_default,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
baseline_func))
end

# FPU Op shapes
@@ -209,7 +227,8 @@ function get_configs()
mul!,
Epilogue.Default(),
verify_default,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
fpu_baseline))
end

# Tropical GEMM
@@ -254,7 +273,8 @@ function get_configs()
get_custom_mul!((a, b, c) -> max(a + b, c)),
Epilogue.Default(),
verify_default,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
nothing))
end

# WMMA GEMM
@@ -298,7 +318,8 @@ function get_configs()
mul!,
Epilogue.Default(),
verify_default,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
wmma_baseline))
end

# WMMA GEMM + bias
@@ -344,7 +365,8 @@ function get_configs()
mul!,
Epilogue.Bias(pointer(bias)),
(c_h, d) -> verify_bias(c_h, d, bias),
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
nothing))
end

# WMMA Diagonal GEMM
@@ -394,7 +416,8 @@ function get_configs()
(C, A, B, alpha, beta) -> mul!(C, Diagonal(A[1:M,1]), B, true, true),
Epilogue.Default(),
verify_default,
Kernel.matmul_singlestage))
Kernel.matmul_singlestage,
nothing))
end

# WMMA Complex GEMM
@@ -453,7 +476,8 @@ function get_configs()
mul!,
Epilogue.Default(),
verify_default,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
nothing))
end

# WMMA Dual GEMM
@@ -511,7 +535,8 @@ function get_configs()
(C, A, B, alpha, beta) -> mul!(dual_conv(C), dual_conv(Complex{Float32}.(A)), dual_conv(Complex{Float32}.(B)), true, true),
Epilogue.Default(),
verify_dual,
Kernel.matmul_pipelined))
Kernel.matmul_pipelined,
nothing))
end

rv
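For reference, the two baselines added above differ only in the cuBLAS math mode: fpu_baseline keeps CUBLAS_DEFAULT_MATH (the regular FPU path), while wmma_baseline selects CUBLAS_TENSOR_OP_MATH so that gemmEx! may use Tensor Cores, matching what the FPU and WMMA kernels are respectively compared against. A minimal sketch of that toggle, assuming a CUDA-capable device and hypothetical sizes:

    using CUDA

    a = CuArray(rand(Float16, 256, 256))
    b = CuArray(rand(Float16, 256, 256))
    c = CUDA.zeros(Float32, 256, 256)

    for mode in (CUDA.CUBLAS.CUBLAS_DEFAULT_MATH, CUDA.CUBLAS.CUBLAS_TENSOR_OP_MATH)
        CUDA.CUBLAS.cublasSetMathMode(CUDA.CUBLAS.handle(), mode)
        t = CUDA.@elapsed CUDA.CUBLAS.gemmEx!('N', 'N', Float32(1), a, b, Float32(0), c)
        @info "math mode $mode: $(round(1e3 * t; sigdigits=3)) ms"
    end

    # Reset to the default mode so later cuBLAS calls are unaffected.
    CUDA.CUBLAS.cublasSetMathMode(CUDA.CUBLAS.handle(), CUDA.CUBLAS.CUBLAS_DEFAULT_MATH)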
