From d06271d4dd01214fc63b2463ebeea77d1240bccd Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 15 Nov 2023 17:18:44 +0100 Subject: [PATCH 1/3] Test more WMMA configurations --- configs/configs.jl | 19 +++++++++++++++++-- src/config.jl | 33 +++++++++++++++++++++++++++++++++ test/matmul.jl | 15 ++++++++++++--- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/configs/configs.jl b/configs/configs.jl index c88b2d8d..ca2f9a62 100644 --- a/configs/configs.jl +++ b/configs/configs.jl @@ -241,7 +241,7 @@ macro get_wmma_config() mul!, Epilogue.Default(), verify_default, - Kernel.matmul_pipelined, + kernel, wmma_baseline) end end) end @@ -520,7 +520,22 @@ function get_configs() [2, 2, 1], [1, 1, 2], [2, 2, 2]], [[2048, 2048, 2048]]), - zero_c in [false] + zero_c in [false], + kernel in [Kernel.matmul_pipelined] + + push!(rv, @get_wmma_config) + end + + # WMMA GEMM parameters + for (M, N, K) in [(256, 256, 256)], + (AB_type, CD_type) in [(Float16, Float32)], + transpose_a in [false, true], + transpose_b in [false, true], + (BLOCK_M, BLOCK_N, BLOCK_K) in filter(x -> prod(x[1:2]) <= 128*128, collect(Iterators.product([64, 128, 256], [64, 128, 256], [16, 32, 64]))[:]), + (WARPS_M, WARPS_N) in filter(x -> prod(x) >= 4, collect(Iterators.product([1, 2, 4], [1, 2, 4]))[:]), + zero_c in [false, true], + (OP_M, OP_N, OP_K) in [(16, 16, 16)], + kernel in [Kernel.matmul_singlestage, Kernel.matmul_pipelined] push!(rv, @get_wmma_config) end diff --git a/src/config.jl b/src/config.jl index 29c26f0c..d9a4966d 100644 --- a/src/config.jl +++ b/src/config.jl @@ -35,6 +35,39 @@ is_b_col_major end +function Base.show(io::IO, config::Config) + println(io, "matmul_shape: $(config.matmul_shape)") + println(io, "block_shape: $(config.block_shape)") + println(io, "warps_per_block: $(config.warps_per_block)") + + println(io, "mem_a_warp: $(config.mem_a_warp)") + println(io, "mem_a_thread: $(config.mem_a_thread)") + + println(io, "mem_b_warp: $(config.mem_b_warp)") + println(io, "mem_b_thread: $(config.mem_b_thread)") + + println(io, "mem_cd_warp: $(config.mem_cd_warp)") + println(io, "mem_cd_thread: $(config.mem_cd_thread)") + + println(io, "compute_warp: $(config.compute_warp)") + println(io, "compute_op_shape: $(config.compute_op_shape)") + + println(io, "global_a_layout: $(config.global_a_layout)") + println(io, "global_b_layout: $(config.global_b_layout)") + println(io, "global_c_layout: $(config.global_c_layout)") + println(io, "global_d_layout: $(config.global_d_layout)") + + println(io, "shared_a_layout: $(config.shared_a_layout)") + println(io, "shared_b_layout: $(config.shared_b_layout)") + println(io, "shared_c_layout: $(config.shared_c_layout)") + println(io, "shared_d_layout: $(config.shared_d_layout)") + + println(io, "operator: $(config.operator)") + + println(io, "is_a_col_major: $(config.is_a_col_major)") + println(io, "is_b_col_major: $(config.is_b_col_major)") +end + struct ConfigError <: Exception message::String end diff --git a/test/matmul.jl b/test/matmul.jl index fa625021..19f08de6 100644 --- a/test/matmul.jl +++ b/test/matmul.jl @@ -7,8 +7,17 @@ include("../configs/configs.jl") @testset "Matrix multiplication" begin @testcase "$( cf.name )" for cf in get_configs() - c_h, a, b, c, d = generate_inputs(cf) - run_gemm(cf, a, b, c, d) - @test verify(cf, c_h, d) + try + c_h, a, b, c, d = generate_inputs(cf) + run_gemm(cf, a, b, c, d) + @test verify(cf, c_h, d) + catch err + # Count tests with config errors as "broken". + if isa(err, GemmKernels.ConfigError) + @test true skip=true + else + rethrow() + end + end end end From 4b435ec2c90b0fdd86686300ba0706b4fdab9c7c Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Fri, 17 Nov 2023 11:41:12 +0100 Subject: [PATCH 2/3] Skip benchmarks with invalid config --- benchmarks/runbenchmarks.jl | 79 +++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/benchmarks/runbenchmarks.jl b/benchmarks/runbenchmarks.jl index 5f402eec..4f84fd22 100644 --- a/benchmarks/runbenchmarks.jl +++ b/benchmarks/runbenchmarks.jl @@ -112,54 +112,63 @@ for cf in get_configs() @info "Running benchmark $( cf.name )..." c_h, a, b, c, d = generate_inputs(cf) - # warmup - run_gemm(cf, a, b, c, d) + try + # warmup + run_gemm(cf, a, b, c, d) - # benchmark - profile_results = CUDA.@profile begin - for sample in 1:NUM_SAMPLES - run_gemm(cf, a, b, c, d) + # benchmark + profile_results = CUDA.@profile begin + for sample in 1:NUM_SAMPLES + run_gemm(cf, a, b, c, d) + end end - end - # XXX: This works for now, since every GEMM is one kernel, but later on we may want to benchmark - # operations consisting of multiple kernel launches... - profile_results = profile_results.device + # XXX: This works for now, since every GEMM is one kernel, but later on we may want to benchmark + # operations consisting of multiple kernel launches... + profile_results = profile_results.device - # get info - details[cf.name] = Dict( - "registers" => profile_results[1, "registers"], - "dynamic_shared_mem" => profile_results[1, "shared_mem"].dynamic, - "static_shared_mem" => profile_results[1, "shared_mem"].static, - "local_mem" => profile_results[1, "local_mem"].thread - ) + # get info + details[cf.name] = Dict( + "registers" => profile_results[1, "registers"], + "dynamic_shared_mem" => profile_results[1, "shared_mem"].dynamic, + "static_shared_mem" => profile_results[1, "shared_mem"].static, + "local_mem" => profile_results[1, "local_mem"].thread + ) - times = 1e9 .* (profile_results[!, "stop"] - profile_results[!, "start"]) - @assert length(times) == NUM_SAMPLES + times = 1e9 .* (profile_results[!, "stop"] - profile_results[!, "start"]) + @assert length(times) == NUM_SAMPLES - @info "\tGemmKernels: $(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))" + @info "\tGemmKernels: $(prettytime(times)) $(prettyflops(times, cf.config.matmul_shape))" - if !isnothing(cf.baseline) - # benchmark baseline - baseline_profile_results = CUDA.@profile begin - for sample in 1:NUM_SAMPLES - run_baseline(cf, a, b, c, d) + if !isnothing(cf.baseline) + # benchmark baseline + baseline_profile_results = CUDA.@profile begin + for sample in 1:NUM_SAMPLES + run_baseline(cf, a, b, c, d) + end end - end - baseline_profile_results = baseline_profile_results.device - @assert size(baseline_profile_results, 1) % NUM_SAMPLES == 0 + baseline_profile_results = baseline_profile_results.device + @assert size(baseline_profile_results, 1) % NUM_SAMPLES == 0 - baseline_times = 1e9 .* sum.(Iterators.partition(baseline_profile_results[!, "stop"] - baseline_profile_results[!, "start"], size(baseline_profile_results, 1) ÷ NUM_SAMPLES)) - @assert length(baseline_times) == NUM_SAMPLES + baseline_times = 1e9 .* sum.(Iterators.partition(baseline_profile_results[!, "stop"] - baseline_profile_results[!, "start"], size(baseline_profile_results, 1) ÷ NUM_SAMPLES)) + @assert length(baseline_times) == NUM_SAMPLES - baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))" - @info "\tBaseline: $(prettytime(baseline_times)) $(prettyflops(baseline_times, cf.config.matmul_shape)) (GemmKernels: $(baseline_ratio)%)" + baseline_ratio = "$(round(100 * minimum(baseline_times) / minimum(times); sigdigits=3))" + @info "\tBaseline: $(prettytime(baseline_times)) $(prettyflops(baseline_times, cf.config.matmul_shape)) (GemmKernels: $(baseline_ratio)%)" - baseline_results[cf.name] = Dict("times" => baseline_times) - end + baseline_results[cf.name] = Dict("times" => baseline_times) + end - results[cf.name] = Dict("times" => times) + results[cf.name] = Dict("times" => times) + catch err + if isa(err, GemmKernels.ConfigError) + # Skip this benchmark. + @warn "Skipping benchmark $(cf.name): Invalid configuration: $(err)." + else + rethrow() + end + end end function save_results(results_file, details_file, results, details) From bffb68e0301a7f20d5703431ef5fbac2b4ef3242 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Tue, 2 Jan 2024 10:38:13 +0100 Subject: [PATCH 3/3] Retrigger CI