Add runic as a formatter
vchuravy committed Aug 9, 2024
1 parent ab87f45 commit faf168b
Showing 34 changed files with 819 additions and 763 deletions.
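Most of these changes are mechanical restyling: Runic adds spaces around the = in keyword arguments and around binary operators, rewrites for i = 1:N as for i in 1:N, and normalizes exponent literals such as 1e-9 to 1.0e-9. A small self-contained Julia sketch of the before/after style (hypothetical code for illustration, not taken from any file in this commit):

# Before Runic:
#   scaled(xs; factor=2) = [x*factor for x = xs]
# After Runic:
scaled(xs; factor = 2) = [x * factor for x in xs]
scaled(1:3)  # returns [2, 4, 6]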
25 changes: 25 additions & 0 deletions .github/workflows/runic.yml
@@ -0,0 +1,25 @@
name: Runic formatting
on:
push:
branches:
- 'master'
- 'release-'
tags:
- '*'
pull_request:
jobs:
runic:
name: Runic
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: "nightly" # Only nightly have the -m flag currently
- uses: julia-actions/cache@v2
- name: Install Runic
run: |
julia --color=yes --project=@runic -e 'using Pkg; Pkg.add(url = "https://github.com/fredrikekre/Runic.jl")'
- name: Run Runic
run: |
git ls-files -z -- '*.jl' | xargs -0 julia --project=@runic -m Runic --check --diff
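The same check can be run locally before pushing. A sketch reusing the two commands from the workflow above (this assumes a Julia nightly, since, per the workflow comment, only nightly currently has the -m entry-point flag):

# One-time setup: install Runic into the shared @runic environment
julia --color=yes --project=@runic -e 'using Pkg; Pkg.add(url = "https://github.com/fredrikekre/Runic.jl")'
# Check every tracked Julia file and print a diff for any formatting violations
git ls-files -z -- '*.jl' | xargs -0 julia --project=@runic -m Runic --check --diff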
14 changes: 8 additions & 6 deletions benchmark/benchmarks.jl
@@ -36,11 +36,12 @@ let static = BenchmarkGroup()
for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
dtype[N] = @benchmarkable begin
kernel = saxpy_kernel!($BACKEND, 1024)
kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
end setup=(
kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
end setup = (
X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
Z = KernelAbstractions.zeros($BACKEND, $T, $N))
Z = KernelAbstractions.zeros($BACKEND, $T, $N)
)
end
static["$T"] = dtype
end
@@ -53,11 +54,12 @@ let default = BenchmarkGroup()
for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
dtype[N] = @benchmarkable begin
kernel = saxpy_kernel!($BACKEND)
kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
end setup=(
kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
end setup = (
X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
Z = KernelAbstractions.zeros($BACKEND, $T, $N))
Z = KernelAbstractions.zeros($BACKEND, $T, $N)
)
end
default["$T"] = dtype
end
24 changes: 12 additions & 12 deletions docs/make.jl
@@ -7,17 +7,17 @@ function main()
ci = get(ENV, "CI", "") == "true"

makedocs(;
modules=[KernelAbstractions],
authors="JuliaGPU and contributors",
repo="https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
sitename="KernelAbstractions.jl",
format=Documenter.HTML(;
prettyurls=ci,
canonical="https://juliagpu.github.io/KernelAbstractions.jl",
assets=String[],
modules = [KernelAbstractions],
authors = "JuliaGPU and contributors",
repo = "https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
sitename = "KernelAbstractions.jl",
format = Documenter.HTML(;
prettyurls = ci,
canonical = "https://juliagpu.github.io/KernelAbstractions.jl",
assets = String[],
),
warnonly=[:missing_docs],
pages=[
warnonly = [:missing_docs],
pages = [
"Home" => "index.md",
"Quickstart" => "quickstart.md",
"Writing kernels" => "kernels.md",
@@ -34,13 +34,13 @@ function main()
"Extras" => [
"extras/unrolling.md",
], # Extras
"Notes for implementations" => "implementations.md"
"Notes for implementations" => "implementations.md",
], # pages
)

if ci
deploydocs(;
repo="github.com/JuliaGPU/KernelAbstractions.jl",
repo = "github.com/JuliaGPU/KernelAbstractions.jl",
push_preview = true,
)
end
18 changes: 9 additions & 9 deletions examples/histogram.jl
@@ -30,28 +30,28 @@ end
# possible to get a value of 312, then we will have 2 separate shmem blocks,
# one from 1->256, and another from 256->512
@uniform max_element = 1
for min_element = 1:gs:N
for min_element in 1:gs:N

# Setting shared_histogram to 0
@inbounds shared_histogram[lid] = 0
@synchronize()

max_element = min_element + gs
if max_element > N
max_element = N+1
max_element = N + 1
end

# Defining bin on shared memory and writing to it if possible
bin = input[tid]
if bin >= min_element && bin < max_element
bin -= min_element-1
bin -= min_element - 1
@atomic shared_histogram[bin] += 1
end

@synchronize()

if ((lid+min_element-1) <= N)
@atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
if ((lid + min_element - 1) <= N)
@atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
end

end
@@ -62,7 +62,7 @@ function histogram!(histogram_output, input)
backend = get_backend(histogram_output)
# Need static block size
kernel! = histogram_kernel!(backend, (256,))
kernel!(histogram_output, input, ndrange=size(input))
kernel!(histogram_output, input, ndrange = size(input))
end

function move(backend, input)
@@ -75,9 +75,9 @@ end
if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
@test_skip false
else
rand_input = [rand(1:128) for i = 1:1000]
linear_input = [i for i = 1:1024]
all_two = [2 for i = 1:512]
rand_input = [rand(1:128) for i in 1:1000]
linear_input = [i for i in 1:1024]
all_two = [2 for i in 1:512]

histogram_rand_baseline = create_histogram(rand_input)
histogram_linear_baseline = create_histogram(linear_input)
12 changes: 6 additions & 6 deletions examples/matmul.jl
@@ -7,11 +7,11 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #

# creating a temporary sum variable for matrix multiplication
tmp_sum = zero(eltype(output))
for k = 1:size(a)[2]
tmp_sum += a[i,k] * b[k, j]
for k in 1:size(a)[2]
tmp_sum += a[i, k] * b[k, j]
end

output[i,j] = tmp_sum
output[i, j] = tmp_sum
end

# Creating a wrapper kernel for launching with error checks
@@ -22,14 +22,14 @@ function matmul!(output, a, b)
end
backend = KernelAbstractions.get_backend(a)
kernel! = matmul_kernel!(backend)
kernel!(output, a, b, ndrange=size(output))
kernel!(output, a, b, ndrange = size(output))
end

a = rand!(allocate(backend, Float32, 256, 123))
b = rand!(allocate(backend, Float32, 123, 45))
output = KernelAbstractions.zeros(backend, Float32, 256, 45)

matmul!(output, a,b)
matmul!(output, a, b)
KernelAbstractions.synchronize(backend)

@test isapprox(output, a*b)
@test isapprox(output, a * b)
2 changes: 1 addition & 1 deletion examples/memcopy.jl
@@ -12,7 +12,7 @@ function mycopy!(A, B)
@assert get_backend(B) == backend

kernel = copy_kernel!(backend)
kernel(A, B, ndrange=length(A))
kernel(A, B, ndrange = length(A))
end

A = KernelAbstractions.zeros(backend, Float64, 128, 128)
2 changes: 1 addition & 1 deletion examples/memcopy_static.jl
@@ -12,7 +12,7 @@ function mycopy_static!(A, B)
@assert get_backend(B) == backend

kernel = copy_kernel!(backend, 32, size(A)) # if size(A) varies this will cause recompilation
kernel(A, B, ndrange=size(A))
kernel(A, B, ndrange = size(A))
end

A = KernelAbstractions.zeros(backend, Float64, 128, 128)
10 changes: 6 additions & 4 deletions examples/mpi.jl
@@ -43,8 +43,8 @@ function main(backend)
comm = MPI.COMM_WORLD
MPI.Barrier(comm)

dst_rank = mod(MPI.Comm_rank(comm)+1, MPI.Comm_size(comm))
src_rank = mod(MPI.Comm_rank(comm)-1, MPI.Comm_size(comm))
dst_rank = mod(MPI.Comm_rank(comm) + 1, MPI.Comm_size(comm))
src_rank = mod(MPI.Comm_rank(comm) - 1, MPI.Comm_size(comm))

T = Int64
M = 10
@@ -59,8 +59,10 @@ function main(backend)

KernelAbstractions.synchronize(backend)

recv_task, send_task = exchange!(h_send_buf, d_recv_buf, h_recv_buf,
src_rank, dst_rank, comm)
recv_task, send_task = exchange!(
h_send_buf, d_recv_buf, h_recv_buf,
src_rank, dst_rank, comm,
)

cooperative_wait(recv_task)
cooperative_wait(send_task)
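The exchange! call above also shows how Runic wraps a call that no longer fits on one line: the argument list is split across lines, the trailing comma after the last argument is kept, and the closing parenthesis moves to its own line. A short self-contained Julia sketch of that layout (combine is a hypothetical function, not part of the diff):

function combine(a, b, c)
    return a + b + c
end

result = combine(
    10, 20,
    30,
)
# result == 60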
5 changes: 2 additions & 3 deletions examples/naive_transpose.jl
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
@assert get_backend(b) == backend
groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
kernel! = naive_transpose_kernel!(backend, groupsize)
kernel!(a, b, ndrange=size(a))
kernel!(a, b, ndrange = size(a))
end

# resolution of grid will be res*res
@@ -27,7 +27,6 @@ res = 1024
b = rand!(allocate(backend, Float32, res, res))
a = KernelAbstractions.zeros(backend, Float32, res, res)

naive_transpose!(a,b)
naive_transpose!(a, b)
KernelAbstractions.synchronize(backend)
@test a == transpose(b)

14 changes: 8 additions & 6 deletions examples/numa_aware.jl
@@ -18,8 +18,10 @@ end
Estimate the memory bandwidth (GB/s) by performing a time measurement of a
SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s).
"""
function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
init = :parallel)
function measure_membw(
backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
init = :parallel,
)
bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY
flops = 2 * N # num flops in SAXY

@@ -37,10 +39,10 @@ function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtyp
kernel = saxpy_kernel($backend, $workgroup_size, $(size(Y)))
kernel($a, $X, $Y, ndrange = $(size(Y)))
KernelAbstractions.synchronize($backend)
end evals=2 samples=10
end evals = 2 samples = 10

mem_rate = bytes * 1e-9 / t # GB/s
flop_rate = flops * 1e-9 / t # GFLOP/s
mem_rate = bytes * 1.0e-9 / t # GB/s
flop_rate = flops * 1.0e-9 / t # GFLOP/s

if verbose
println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits = 2))
@@ -51,7 +53,7 @@ end

# Static should be much better (on a system with multiple NUMA domains)
measure_membw(CPU());
measure_membw(CPU(; static=true));
measure_membw(CPU(; static = true));

# The following has significantly worse performance (even on systems with a single memory domain)!
# measure_membw(CPU(); init=:serial);
(Diffs for the remaining changed files are not shown.)
