From 8d5588911cfb27ee95ad51c360a6ea8dbf332cef Mon Sep 17 00:00:00 2001 From: Stefan Krastanov Date: Wed, 20 Jul 2022 11:44:17 -0400 Subject: [PATCH 1/4] disable_polyester_threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Benchmarks with and without it below: ``` using Base.Threads using BenchmarkTools using Revise using Polyester \## function inner(x,y,j) for i ∈ axes(x,1) y[i,j] = sin(x[i,j]) end end function inner_polyester(x,y,j) @batch for i ∈ axes(x,1) y[i,j] = sin(x[i,j]) end end function inner_thread(x,y,j) @threads for i ∈ axes(x,1) y[i,j] = sin(x[i,j]) end end function sequential_sequential(x,y) for j ∈ axes(x,2) inner(x,y,j) end end function sequential_polyester(x,y) for j ∈ axes(x,2) inner_polyester(x,y,j) end end function sequential_thread(x,y) for j ∈ axes(x,2) inner_thread(x,y,j) end end function threads_of_polyester(x,y) @threads for j ∈ axes(x,2) inner_polyester(x,y,j) end end function threads_of_polyester_inner_disable(x,y) @threads for j ∈ axes(x,2) Polyester.disable_polyester_threads() do inner_polyester(x,y,j) end end end function threads_of_thread(x,y) @threads for j ∈ axes(x,2) inner_thread(x,y,j) end end function threads_of_thread(x,y) @threads for j ∈ axes(x,2) inner_thread(x,y,j) end end function threads_of_sequential(x,y) @threads for j ∈ axes(x,2) inner(x,y,j) end end # Big inner problem, repeated only a few times y = rand(10000000,4); x = rand(size(y)...); @btime inner($x,$y,1) # 73.319 ms (0 allocations: 0 bytes) @btime inner_polyester($x,$y,1) # 8.936 ms (0 allocations: 0 bytes) @btime inner_thread($x,$y,1) # 11.206 ms (49 allocations: 4.56 KiB) @btime sequential_sequential($x,$y) # 274.926 ms (0 allocations: 0 bytes) @btime sequential_polyester($x,$y) # 36.963 ms (0 allocations: 0 bytes) @btime sequential_thread($x,$y) # 49.373 ms (196 allocations: 18.25 KiB) @btime threads_of_polyester($x,$y) # 78.828 ms (58 allocations: 4.84 KiB) @btime threads_of_polyester_inner_disable($x,$y) # 
70.182 ms (47 allocations: 4.50 KiB) @btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 71.141 ms (47 allocations: 4.50 KiB) @btime threads_of_sequential($x,$y) # 70.857 ms (46 allocations: 4.47 KiB) @btime threads_of_thread($x,$y) # 45.116 ms (219 allocations: 22.00 KiB) # Small inner problem, repated many times y = rand(1000,1000); x = rand(size(y)...); @btime inner($x,$y,1) # 7.028 μs (0 allocations: 0 bytes) @btime inner_polyester($x,$y,1) # 1.917 μs (0 allocations: 0 bytes) @btime inner_thread($x,$y,1) # 7.544 μs (45 allocations: 4.44 KiB) @btime sequential_sequential($x,$y) # 6.790 ms (0 allocations: 0 bytes) @btime sequential_polyester($x,$y) # 2.070 ms (0 allocations: 0 bytes) @btime sequential_thread($x,$y) # 9.296 ms (49002 allocations: 4.46 MiB) @btime threads_of_polyester($x,$y) # 2.090 ms (42 allocations: 4.34 KiB) @btime threads_of_polyester_inner_disable($x,$y) # 1.065 ms (42 allocations: 4.34 KiB) @btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 997.918 μs (49 allocations: 4.56 KiB) @btime threads_of_sequential($x,$y) # 1.057 ms (48 allocations: 4.53 KiB) @btime threads_of_thread($x,$y) # 4.105 ms (42059 allocations: 4.25 MiB) ``` --- Project.toml | 2 +- README.md | 122 ++++++++++++++++++++++++++++++++++++++++++++++- src/Polyester.jl | 3 +- src/utility.jl | 13 +++++ test/runtests.jl | 94 +++++++++++++++++++++++++++++++++++- 5 files changed, 229 insertions(+), 5 deletions(-) create mode 100644 src/utility.jl diff --git a/Project.toml b/Project.toml index cc31133..2ccc027 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Polyester" uuid = "f517fe37-dbe3-4b94-8317-1923a5111588" authors = ["Chris Elrod and contributors"] -version = "0.6.13" +version = "0.6.14" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/README.md b/README.md index 120355a..804dbe9 100644 --- a/README.md +++ b/README.md @@ -413,6 +413,8 @@ Note that `@batch` defaults to 
using up to one thread per physical core, instead is because [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl) currently only uses up to 1 thread per physical core, and switching the number of threads incurs some overhead. See the docstring on `@batch` (i.e., `?@batch` in a Julia REPL) for some more discussion. +## Local per-thread storage + You also can define local storage for each thread, providing a vector containing each of the local storages at the end. ```julia @@ -446,4 +448,122 @@ julia> let end Float16[83.0, 90.0, 27.0, 65.0] -``` \ No newline at end of file +``` + +## Disabling Polyester threads + +When running many repetitions of a Polyester-multithreaded function (e.g. in an embarrassingly parallel problem that repeatedly executes a small already Polyester-multithreaded function), it can be beneficial to disable Polyester (the inner multithreaded loop) and multithread only at the outer level (e.g. with `Base.Threads`). This can be done with the `disable_polyester_threads` context manager. In the expandable section below you can see example benchmarks. + +
+Benchmarks of nested multi-threading with Polyester + +```julia +# Big inner problem, repeated only a few times + +y = rand(10000000,4); +x = rand(size(y)...); + +@btime inner($x,$y,1) # 73.319 ms (0 allocations: 0 bytes) +@btime inner_polyester($x,$y,1) # 8.936 ms (0 allocations: 0 bytes) +@btime inner_thread($x,$y,1) # 11.206 ms (49 allocations: 4.56 KiB) + +@btime sequential_sequential($x,$y) # 274.926 ms (0 allocations: 0 bytes) +@btime sequential_polyester($x,$y) # 36.963 ms (0 allocations: 0 bytes) +@btime sequential_thread($x,$y) # 49.373 ms (196 allocations: 18.25 KiB) + +@btime threads_of_polyester($x,$y) # 78.828 ms (58 allocations: 4.84 KiB) +@btime threads_of_polyester_inner_disable($x,$y) # 70.182 ms (47 allocations: 4.50 KiB) +@btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 71.141 ms (47 allocations: 4.50 KiB) +@btime threads_of_sequential($x,$y) # 70.857 ms (46 allocations: 4.47 KiB) +@btime threads_of_thread($x,$y) # 45.116 ms (219 allocations: 22.00 KiB) + +# Small inner problem, repeated many times + +y = rand(1000,1000); +x = rand(size(y)...); + +@btime inner($x,$y,1) # 7.028 μs (0 allocations: 0 bytes) +@btime inner_polyester($x,$y,1) # 1.917 μs (0 allocations: 0 bytes) +@btime inner_thread($x,$y,1) # 7.544 μs (45 allocations: 4.44 KiB) + +@btime sequential_sequential($x,$y) # 6.790 ms (0 allocations: 0 bytes) +@btime sequential_polyester($x,$y) # 2.070 ms (0 allocations: 0 bytes) +@btime sequential_thread($x,$y) # 9.296 ms (49002 allocations: 4.46 MiB) + +@btime threads_of_polyester($x,$y) # 2.090 ms (42 allocations: 4.34 KiB) +@btime threads_of_polyester_inner_disable($x,$y) # 1.065 ms (42 allocations: 4.34 KiB) +@btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 997.918 μs (49 allocations: 4.56 KiB) +@btime threads_of_sequential($x,$y) # 1.057 ms (48 allocations: 4.53 KiB) +@btime threads_of_thread($x,$y) # 4.105 ms (42059 allocations: 4.25 MiB) + +# The tested functions + 
+function inner(x,y,j) + for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function inner_polyester(x,y,j) + @batch for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function inner_thread(x,y,j) + @threads for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end +end + +function sequential_sequential(x,y) + for j ∈ axes(x,2) + inner(x,y,j) + end +end + +function sequential_polyester(x,y) + for j ∈ axes(x,2) + inner_polyester(x,y,j) + end +end + +function sequential_thread(x,y) + for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_polyester(x,y) + @threads for j ∈ axes(x,2) + inner_polyester(x,y,j) + end +end + +function threads_of_polyester_inner_disable(x,y) + @threads for j ∈ axes(x,2) + Polyester.disable_polyester_threads() do + inner_polyester(x,y,j) + end + end +end + +function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end +end + +function threads_of_sequential(x,y) + @threads for j ∈ axes(x,2) + inner(x,y,j) + end +end +``` +
\ No newline at end of file diff --git a/src/Polyester.jl b/src/Polyester.jl index 97fe14e..01302a0 100644 --- a/src/Polyester.jl +++ b/src/Polyester.jl @@ -11,11 +11,12 @@ using PolyesterWeave: request_threads, free_threads!, mask, UnsignedIteratorEarlyStop, assume using CPUSummary: num_threads, num_cores -export batch, @batch, num_threads +export batch, @batch, num_threads, disable_polyester_threads include("batch.jl") include("closure.jl") +include("utility.jl") # y = rand(1) # x = rand(1) diff --git a/src/utility.jl b/src/utility.jl new file mode 100644 index 0000000..180a2bd --- /dev/null +++ b/src/utility.jl @@ -0,0 +1,13 @@ +""" + disable_polyester_threads(f::F) + +A context manager function that disables Polyester threads without affecting the scheduling +of `Base.Treads.@threads`. Particularly useful for cases when Polyester has been used to +multithread an inner small problem that is now to be used in an outer embarassingly parallel +problem (in such cases it is best to multithread only at the outermost level). 
+""" +function disable_polyester_threads(f::F) where {F} + t, r = request_threads(num_threads()) + f() + foreach(free_threads!, r) +end diff --git a/test/runtests.jl b/test/runtests.jl index 8b0cdf2..6c8eb62 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,6 +2,7 @@ println( "Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...", ) using Polyester, Aqua, ForwardDiff +using Base.Threads: @threads using Test function bsin!(y, x, r = eachindex(y, x)) @@ -395,10 +396,10 @@ end # issue 78 (lack of support for keyword arguments using only variable names without `=`) f(a; b=10.0, c=100.0) = a + b + c - + buf = [0, 0] b = 0.0 - + Threads.nthreads() == 1 && println("the issue arises only on multithreading runs") @batch for i in 1:2 @@ -408,6 +409,95 @@ end @test buf == [1, 2] end +@testset "disable_polyester_threads" begin + function inner(x,y,j) + for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function inner_polyester(x,y,j) + @batch for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function inner_thread(x,y,j) + @threads for i ∈ axes(x,1) + y[i,j] = sin(x[i,j]) + end + end + + function sequential_sequential(x,y) + for j ∈ axes(x,2) + inner(x,y,j) + end + end + + function sequential_polyester(x,y) + for j ∈ axes(x,2) + inner_polyester(x,y,j) + end + end + + function sequential_thread(x,y) + for j ∈ axes(x,2) + inner_thread(x,y,j) + end + end + + function threads_of_polyester(x,y) + @threads for j ∈ axes(x,2) + inner_polyester(x,y,j) + end + end + + function threads_of_polyester_inner_disable(x,y) + @threads for j ∈ axes(x,2) + Polyester.disable_polyester_threads() do + inner_polyester(x,y,j) + end + end + end + + function threads_of_thread(x,y) + @threads for j ∈ axes(x,2) + inner_thread(x,y,j) + end + end + + function threads_of_sequential(x,y) + @threads for j ∈ axes(x,2) + inner(x,y,j) + end + end + + y = rand(10,10); # (size of inner problem, size of outer problem) + x = rand(size(y)...); + 
inner(x,y,1) + good_y = copy(y) + inner_polyester(x,y,1) + @assert good_y == y + inner_thread(x,y,1) + @assert good_y == y + sequential_sequential(x,y) + good_y = copy(y) + sequential_polyester(x,y) + @assert good_y == y + sequential_thread(x,y) + @assert good_y == y + threads_of_polyester(x,y) + @assert good_y == y + threads_of_polyester_inner_disable(x,y) + @assert good_y == y + disable_polyester_threads() do; threads_of_polyester(x,y) end + @assert good_y == y + threads_of_sequential(x,y) + @assert good_y == y + threads_of_thread(x,y) + @assert good_y == y +end + if VERSION ≥ v"1.6" println("Package tests complete. Running `Aqua` checks.") Aqua.test_all(Polyester) From fe7ef9edcf1773432e2c423d35dd237e0447bad9 Mon Sep 17 00:00:00 2001 From: Stefan Krastanov Date: Wed, 20 Jul 2022 15:00:16 -0400 Subject: [PATCH 2/4] use try block in disable_polyester_threads --- src/utility.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/utility.jl b/src/utility.jl index 180a2bd..f53527c 100644 --- a/src/utility.jl +++ b/src/utility.jl @@ -8,6 +8,9 @@ problem (in such cases it is best to multithread only at the outermost level). 
""" function disable_polyester_threads(f::F) where {F} t, r = request_threads(num_threads()) - f() - foreach(free_threads!, r) + try + f() + finally + foreach(free_threads!, r) + end end From 6535fe6ae79b147618503557e49824315cf58c4b Mon Sep 17 00:00:00 2001 From: Stefan Krastanov Date: Wed, 20 Jul 2022 15:16:36 -0400 Subject: [PATCH 3/4] move disable_polyester_threads to PolyesterWeave --- Project.toml | 2 +- README.md | 42 +++++++++++++++++++++++++++++++++++++++++- src/Polyester.jl | 4 ++-- src/utility.jl | 16 ---------------- 4 files changed, 44 insertions(+), 20 deletions(-) delete mode 100644 src/utility.jl diff --git a/Project.toml b/Project.toml index 2ccc027..5b27dff 100644 --- a/Project.toml +++ b/Project.toml @@ -21,7 +21,7 @@ BitTwiddlingConvenienceFunctions = "0.1" CPUSummary = "0.1.2 - 0.1.8, 0.1.11" IfElse = "0.1" ManualMemory = "0.1.3" -PolyesterWeave = "0.1" +PolyesterWeave = "0.1.7" Requires = "1" Static = "0.7" StrideArraysCore = "0.3.11" diff --git a/README.md b/README.md index 804dbe9..90b9089 100644 --- a/README.md +++ b/README.md @@ -452,7 +452,25 @@ Float16[83.0, 90.0, 27.0, 65.0] ## Disabling Polyester threads -When running many repetitions of a Polyester-multithreaded function (e.g. in an embarrassingly parallel problem that repeatedly executes a small already Polyester-multithreaded function), it can be beneficial to disable Polyester (the inner multithreaded loop) and multithread only at the outer level (e.g. with `Base.Threads`). This can be done with the `disable_polyester_threads` context manager. In the expandable section below you can see example benchmarks. +When running many repetitions of a Polyester-multithreaded function (e.g. in an embarrassingly parallel problem that repeatedly executes a small already Polyester-multithreaded function), it can be beneficial to disable Polyester (the inner multithreaded loop) and multithread only at the outer level (e.g. with `Base.Threads`). 
This can be done with the `disable_polyester_threads` context manager. In the expandable section below you can see examples with benchmarks. + +It is best to call `disable_polyester_threads` only once, before any `@threads` uses happen, to avoid overhead. E.g. best to do it as: +```julia +disable_polyester_threads() do + @threads for i in 1:n + f() + end +end +``` +instead of doing it in the following unnecessarily slow manner: +```julia +@threads for i in 1:n # DO NOT DO THIS + disable_polyester_threads() do # IT HAS UNNECESSARY OVERHEAD + f() + end +end +``` +
Benchmarks of nested multi-threading with Polyester @@ -472,7 +490,9 @@ x = rand(size(y)...); @btime sequential_thread($x,$y) # 49.373 ms (196 allocations: 18.25 KiB) @btime threads_of_polyester($x,$y) # 78.828 ms (58 allocations: 4.84 KiB) +# the following is a purposefully suboptimal way to disable threads @btime threads_of_polyester_inner_disable($x,$y) # 70.182 ms (47 allocations: 4.50 KiB) +# the following is a good way to disable threads (the disable call happening once in the outer scope) @btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 71.141 ms (47 allocations: 4.50 KiB) @btime threads_of_sequential($x,$y) # 70.857 ms (46 allocations: 4.47 KiB) @btime threads_of_thread($x,$y) # 45.116 ms (219 allocations: 22.00 KiB) @@ -491,7 +511,9 @@ x = rand(size(y)...); @btime sequential_thread($x,$y) # 9.296 ms (49002 allocations: 4.46 MiB) @btime threads_of_polyester($x,$y) # 2.090 ms (42 allocations: 4.34 KiB) +# the following is a purposefully suboptimal way to disable threads @btime threads_of_polyester_inner_disable($x,$y) # 1.065 ms (42 allocations: 4.34 KiB) +# the following is a good way to disable threads (the disable call happening once in the outer scope) @btime Polyester.disable_polyester_threads() do; threads_of_polyester($x,$y) end; # 997.918 μs (49 allocations: 4.56 KiB) @btime threads_of_sequential($x,$y) # 1.057 ms (48 allocations: 4.53 KiB) @btime threads_of_thread($x,$y) # 4.105 ms (42059 allocations: 4.25 MiB) @@ -541,6 +563,9 @@ function threads_of_polyester(x,y) end function threads_of_polyester_inner_disable(x,y) + # XXX This is a bad way to disable Polyester threads as + # it causes unnecessary overhead for each @threads thread. + # See the benchmarks above for a better way. 
@threads for j ∈ axes(x,2) Polyester.disable_polyester_threads() do inner_polyester(x,y,j) @@ -566,4 +591,19 @@ function threads_of_sequential(x,y) end end ``` +Benchmarks executed on: +``` +Julia Version 1.9.0-DEV.998 +Commit e1739aa42a1 (2022-07-18 10:27 UTC) +Platform Info: + OS: Linux (x86_64-linux-gnu) + CPU: 16 × AMD Ryzen 7 1700 Eight-Core Processor + WORD_SIZE: 64 + LIBM: libopenlibm + LLVM: libLLVM-14.0.5 (ORCJIT, znver1) + Threads: 8 on 16 virtual cores +Environment: + JULIA_EDITOR = code + JULIA_NUM_THREADS = 8 +```
\ No newline at end of file diff --git a/src/Polyester.jl b/src/Polyester.jl index 01302a0..014b8f5 100644 --- a/src/Polyester.jl +++ b/src/Polyester.jl @@ -8,7 +8,8 @@ using ManualMemory: Reference using Static using Requires using PolyesterWeave: - request_threads, free_threads!, mask, UnsignedIteratorEarlyStop, assume + request_threads, free_threads!, mask, UnsignedIteratorEarlyStop, assume, + disable_polyester_threads using CPUSummary: num_threads, num_cores export batch, @batch, num_threads, disable_polyester_threads @@ -16,7 +17,6 @@ export batch, @batch, num_threads, disable_polyester_threads include("batch.jl") include("closure.jl") -include("utility.jl") # y = rand(1) # x = rand(1) diff --git a/src/utility.jl b/src/utility.jl deleted file mode 100644 index f53527c..0000000 --- a/src/utility.jl +++ /dev/null @@ -1,16 +0,0 @@ -""" - disable_polyester_threads(f::F) - -A context manager function that disables Polyester threads without affecting the scheduling -of `Base.Treads.@threads`. Particularly useful for cases when Polyester has been used to -multithread an inner small problem that is now to be used in an outer embarassingly parallel -problem (in such cases it is best to multithread only at the outermost level). -""" -function disable_polyester_threads(f::F) where {F} - t, r = request_threads(num_threads()) - try - f() - finally - foreach(free_threads!, r) - end -end From 36aacb832b6e2e905ac823275ecd69fe1ca0c396 Mon Sep 17 00:00:00 2001 From: Stefan Krastanov Date: Wed, 20 Jul 2022 15:20:11 -0400 Subject: [PATCH 4/4] Mention @tturbo in the examples. 
--- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 90b9089..1b3852d 100644 --- a/README.md +++ b/README.md @@ -519,6 +519,9 @@ x = rand(size(y)...); @btime threads_of_thread($x,$y) # 4.105 ms (42059 allocations: 4.25 MiB) # The tested functions +# All of these would be better implemented by just using @tturbo, +# but these suboptimal implementations serve as a good test case for +# Polyester-vs-Base thread scheduling. function inner(x,y,j) for i ∈ axes(x,1)