diff --git a/Project.toml b/Project.toml index cc31133..5b27dff 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Polyester" uuid = "f517fe37-dbe3-4b94-8317-1923a5111588" authors = ["Chris Elrod and contributors"] -version = "0.6.13" +version = "0.6.14" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" @@ -21,7 +21,7 @@ BitTwiddlingConvenienceFunctions = "0.1" CPUSummary = "0.1.2 - 0.1.8, 0.1.11" IfElse = "0.1" ManualMemory = "0.1.3" -PolyesterWeave = "0.1" +PolyesterWeave = "0.1.7" Requires = "1" Static = "0.7" StrideArraysCore = "0.3.11" diff --git a/README.md b/README.md index 120355a..1b3852d 100644 --- a/README.md +++ b/README.md @@ -413,6 +413,8 @@ Note that `@batch` defaults to using up to one thread per physical core, instead is because [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl) currently only uses up to 1 thread per physical core, and switching the number of threads incurs some overhead. See the docstring on `@batch` (i.e., `?@batch` in a Julia REPL) for some more discussion. +## Local per-thread storage + You also can define local storage for each thread, providing a vector containing each of the local storages at the end. ```julia @@ -446,4 +448,165 @@ julia> let end Float16[83.0, 90.0, 27.0, 65.0] -``` \ No newline at end of file +``` + +## Disabling Polyester threads + +When running many repetitions of a Polyester-multithreaded function (e.g. in an embarrassingly parallel problem that repeatedly executes a small already Polyester-multithreaded function), it can be beneficial to disable Polyester (the inner multithreaded loop) and multithread only at the outer level (e.g. with `Base.Threads`). This can be done with the `disable_polyester_threads` context manager. In the expandable section below you can see examples with benchmarks. + +It is best to call `disable_polyester_threads` only once, before any `@threads` loops start, to avoid overhead. E.g. 
best to do it as: +```julia +disable_polyester_threads() do + @threads for i in 1:n + f() + end +end +``` +instead of doing it in the following unnecessarily slow manner: +```julia +@threads for i in 1:n # DO NOT DO THIS + disable_polyester_threads() do # IT HAS UNNECESSARY OVERHEAD + f() + end +end +``` + + +
+Benchmarks of nested multi-threading with Polyester + +```julia +# Big inner problem, repeated only a few times + +y = rand(10000000,4); +x = rand(size(y)...); + +@btime inner($x,$y,1) # 73.319 ms (0 allocations: 0 bytes) +@btime inner_polyester($x,$y,1) # 8.936 ms (0 allocations: 0 bytes) +@btime inner_thread($x,$y,1) # 11.206 ms (49 allocations: 4.56 KiB) + +@btime sequential_sequential($x,$y) # 274.926 ms (0 allocations: 0 bytes) +@btime sequential_polyester($x,$y) # 36.963 ms (0 allocations: 0 bytes) +@btime sequential_thread($x,$y) # 49.373 ms (196 allocations: 18.25 KiB) + +@btime threads_of_polyester($x,$y) # 78.828 ms (58 allocations: 4.84 KiB) +# the following is a purposefully suboptimal way to disable threads +@btime threads_of_polyester_inner_disable($x,$y) # 70.182 ms (47 allocations: 4.50 KiB) +# the following is a good way to disable threads (the disable call happening once in the outer scope) +@btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 71.141 ms (47 allocations: 4.50 KiB) +@btime threads_of_sequential($x,$y) # 70.857 ms (46 allocations: 4.47 KiB) +@btime threads_of_thread($x,$y) # 45.116 ms (219 allocations: 22.00 KiB) + +# Small inner problem, repeated many times + +y = rand(1000,1000); +x = rand(size(y)...); + +@btime inner($x,$y,1) # 7.028 μs (0 allocations: 0 bytes) +@btime inner_polyester($x,$y,1) # 1.917 μs (0 allocations: 0 bytes) +@btime inner_thread($x,$y,1) # 7.544 μs (45 allocations: 4.44 KiB) + +@btime sequential_sequential($x,$y) # 6.790 ms (0 allocations: 0 bytes) +@btime sequential_polyester($x,$y) # 2.070 ms (0 allocations: 0 bytes) +@btime sequential_thread($x,$y) # 9.296 ms (49002 allocations: 4.46 MiB) + +@btime threads_of_polyester($x,$y) # 2.090 ms (42 allocations: 4.34 KiB) +# the following is a purposefully suboptimal way to disable threads +@btime threads_of_polyester_inner_disable($x,$y) # 1.065 ms (42 allocations: 4.34 KiB) +# the following is a good way to disable threads (the 
disable call happening once in the outer scope) +@btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 997.918 μs (49 allocations: 4.56 KiB) +@btime threads_of_sequential($x,$y) # 1.057 ms (48 allocations: 4.53 KiB) +@btime threads_of_thread($x,$y) # 4.105 ms (42059 allocations: 4.25 MiB) + +# The tested functions +# All of these would be better implemented by just using @tturbo, +# but these suboptimal implementations serve as a good test case for +# Polyester-vs-Base thread scheduling. + +function inner(x,y,j) + for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function inner_polyester(x,y,j) + @batch for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function inner_thread(x,y,j) + @threads for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function sequential_sequential(x,y) + for j ∈ axes(x,2) + inner(x,y,j) + end +end + +function sequential_polyester(x,y) + for j ∈ axes(x,2) + inner_polyester(x,y,j) + end +end + +function sequential_thread(x,y) + for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_polyester(x,y) + @threads for j ∈ axes(x,2) + inner_polyester(x,y,j) + end +end + +function threads_of_polyester_inner_disable(x,y) + # XXX This is a bad way to disable Polyester threads as + # it causes unnecessary overhead for each @threads thread. + # See the benchmarks above for a better way. 
+ @threads for j ∈ axes(x,2) + Polyester.disable_polyester_threads() do + inner_polyester(x,y,j) + end + end +end + +function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_sequential(x,y) + @threads for j ∈ axes(x,2) + inner(x,y,j) + end +end +``` +Benchmarks executed on: +``` +Julia Version 1.9.0-DEV.998 +Commit e1739aa42a1 (2022-07-18 10:27 UTC) +Platform Info: + OS: Linux (x86_64-linux-gnu) + CPU: 16 × AMD Ryzen 7 1700 Eight-Core Processor + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-14.0.5 (ORCJIT, znver1) + Threads: 8 on 16 virtual cores +Environment: + JULIA_EDITOR = code + JULIA_NUM_THREADS = 8 +``` +
\ No newline at end of file diff --git a/src/Polyester.jl b/src/Polyester.jl index 97fe14e..014b8f5 100644 --- a/src/Polyester.jl +++ b/src/Polyester.jl @@ -8,10 +8,11 @@ using ManualMemory: Reference using Static using Requires using PolyesterWeave: - request_threads, free_threads!, mask, UnsignedIteratorEarlyStop, assume + request_threads, free_threads!, mask, UnsignedIteratorEarlyStop, assume, + disable_polyester_threads using CPUSummary: num_threads, num_cores -export batch, @batch, num_threads +export batch, @batch, num_threads, disable_polyester_threads include("batch.jl") diff --git a/test/runtests.jl b/test/runtests.jl index 8b0cdf2..6c8eb62 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,7 @@ println( "Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...", ) using Polyester, Aqua, ForwardDiff +using Base.Threads: @threads using Test function bsin!(y, x, r = eachindex(y, x)) @@ -395,10 +396,10 @@ end # issue 78 (lack of support for keyword arguments using only variable names without `=`) f(a; b=10.0, c=100.0) = a + b + c - + buf = [0, 0] b = 0.0 - + Threads.nthreads() == 1 && println("the issue arises only on multithreading runs") @batch for i in 1:2 @@ -408,6 +409,97 @@ end @test buf == [1, 2] end +@testset "disable_polyester_threads" begin + function inner(x,y,j) + for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function inner_polyester(x,y,j) + @batch for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function inner_thread(x,y,j) + @threads for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function sequential_sequential(x,y) + for j ∈ axes(x,2) + inner(x,y,j) + end + end + + function sequential_polyester(x,y) + for j ∈ axes(x,2) + inner_polyester(x,y,j) + end + end + + function sequential_thread(x,y) + for j ∈ axes(x,2) + inner_thread(x,y,j) + end + end + + function threads_of_polyester(x,y) + @threads for j ∈ axes(x,2) + inner_polyester(x,y,j) + end + end + + function 
threads_of_polyester_inner_disable(x,y) + @threads for j ∈ axes(x,2) + Polyester.disable_polyester_threads() do + inner_polyester(x,y,j) + end + end + end + + function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end + end + + function threads_of_sequential(x,y) + @threads for j ∈ axes(x,2) + inner(x,y,j) + end + end + + y = rand(10,10); # (size of inner problem, size of outer problem) + x = rand(size(y)...); + # The serial `inner` result is the reference for every threaded variant. + # Use `@test` (not `@assert`) so the testset records each comparison. + inner(x,y,1) + good_y = copy(y) + inner_polyester(x,y,1) + @test good_y == y + inner_thread(x,y,1) + @test good_y == y + sequential_sequential(x,y) + good_y = copy(y) + sequential_polyester(x,y) + @test good_y == y + sequential_thread(x,y) + @test good_y == y + threads_of_polyester(x,y) + @test good_y == y + threads_of_polyester_inner_disable(x,y) + @test good_y == y + disable_polyester_threads() do; threads_of_polyester(x,y) end + @test good_y == y + threads_of_sequential(x,y) + @test good_y == y + threads_of_thread(x,y) + @test good_y == y +end + if VERSION ≥ v"1.6" println("Package tests complete. Running `Aqua` checks.") Aqua.test_all(Polyester)