From 23b59d30fd4917a05afc668487a476fb0344e0a0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 09:29:45 -0800 Subject: [PATCH 01/65] Keeping only the suspected faulty test (by commenting out rest (!)) --- test/runtests.jl | 230 +++++++++++++++++++++++------------------------ 1 file changed, 115 insertions(+), 115 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 26604bf25..ae31c943f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -37,73 +37,73 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "Stepping stone" begin - pt = pigeons(target = toy_mvn_target(100)); - p = stepping_stone_pair(pt) - truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) - @test abs(p[1] - truth) < 1 - @test abs(p[2] - truth) < 1 -end - -@testset "Round trips" begin - n_chains = 4 - n_rounds = 5 +# @testset "Stepping stone" begin +# pt = pigeons(target = toy_mvn_target(100)); +# p = stepping_stone_pair(pt) +# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) +# @test abs(p[1] - truth) < 1 +# @test abs(p[2] - truth) < 1 +# end + +# @testset "Round trips" begin +# n_chains = 4 +# n_rounds = 5 - pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); +# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); - len = 2^(n_rounds) - truth = 0.0 - for i in 0:(n_chains-1) - truth += floor(max(len - i, 0) / n_chains / 2) - end - - @test truth == Pigeons.n_round_trips(pt) -end - -@testset "Moments" begin - pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); - for var_name in Pigeons.continuous_variables(pt) - m = mean(pt, var_name) - for i in eachindex(m) - @test abs(m[i] - 0.0) < 0.001 - end - v = var(pt, var_name) - for i in eachindex(v) - @test abs(v[i] - 0.1) < 0.001 - end - end -end - -@testset "Parallelism Invariance" begin - n_mpis = 
Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # Turing: - pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 4, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Turing, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 2)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 4, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - end -end +# len = 2^(n_rounds) +# truth = 0.0 +# for i in 0:(n_chains-1) +# truth += floor(max(len - i, 0) / n_chains / 2) +# end + +# @test truth == Pigeons.n_round_trips(pt) +# end + +# @testset "Moments" begin +# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); +# for var_name in Pigeons.continuous_variables(pt) +# m = mean(pt, var_name) +# for i in eachindex(m) +# @test abs(m[i] - 0.0) < 0.001 +# end +# v = var(pt, var_name) +# for i in eachindex(v) +# @test abs(v[i] - 0.1) < 0.001 +# end +# end +# end + +# @testset "Parallelism Invariance" begin +# n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] +# # Turing: +# pigeons( +# target = TuringLogPotential(flip_model_unidentifiable()), +# n_rounds = 4, +# checked_round = 3, +# multithreaded = true, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# dependencies = [Turing, LinearAlgebra, "turing.jl"], +# n_local_mpi_processes = n_mpis, +# n_threads = 2)) +# # Blang: +# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf +# Pigeons.setup_blang("blangDemos") +# pigeons(; +# target = Pigeons.blang_ising(), +# n_rounds = 4, +# checked_round = 3, +# recorder_builders = recorder_builders, +# multithreaded = true, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 2)) +# end +# end @testset "Longer MPI" begin n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -121,50 +121,50 @@ end n_threads = 1)) end -@testset "Entanglement" begin - mpi_test(1, "entanglement_test.jl") - mpi_test(2, "entanglement_test.jl") +# @testset "Entanglement" begin +# mpi_test(1, "entanglement_test.jl") +# mpi_test(2, "entanglement_test.jl") + +# mpi_test(1, "reduce_test.jl") +# mpi_test(2, "reduce_test.jl") +# mpi_test(3, "reduce_test.jl") +# end + +# @testset "PermutedDistributedArray" begin +# mpi_test(1, "permuted_test.jl", options = ["-s"]) +# mpi_test(1, "permuted_test.jl") +# mpi_test(2, "permuted_test.jl") +# end + +# @testset "LoadBalance" begin +# for i in 1:20 +# for j in i:30 +# test_load_balance(i, j) +# end +# end +# end + +# @testset "LogSum" begin +# m = Pigeons.LogSum() + +# fit!(m, 2.1) +# fit!(m, 4) +# v1 = value(m) +# @assert v1 ≈ log(exp(2.1) + exp(4)) - mpi_test(1, "reduce_test.jl") - mpi_test(2, "reduce_test.jl") - mpi_test(3, "reduce_test.jl") -end -@testset "PermutedDistributedArray" begin - mpi_test(1, "permuted_test.jl", options = ["-s"]) - mpi_test(1, "permuted_test.jl") - mpi_test(2, "permuted_test.jl") -end +# fit!(m, 2.1) +# fit!(m, 4) +# m2 = Pigeons.LogSum() +# fit!(m2, 50.1) +# combined = merge(m, m2) +# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) -@testset "LoadBalance" begin - for i in 1:20 - for j in i:30 - test_load_balance(i, j) - end - end -end - -@testset "LogSum" begin - m = Pigeons.LogSum() - - fit!(m, 2.1) - fit!(m, 4) - v1 = value(m) - @assert v1 ≈ log(exp(2.1) + exp(4)) - - - fit!(m, 2.1) - fit!(m, 4) - m2 = Pigeons.LogSum() - fit!(m2, 50.1) - combined = merge(m, m2) - @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - - fit!(m, 2.1) - fit!(m, 4) - empty!(m) - @assert value(m) == -Pigeons.inf(0.0) -end +# fit!(m, 2.1) +# fit!(m, 4) +# empty!(m) +# @assert value(m) == -Pigeons.inf(0.0) +# end function test_split_slice() # test disjoint random streams @@ -183,14 +183,14 @@ end 
test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -@testset "split_test" begin - test_split_slice() -end +# @testset "split_test" begin +# test_split_slice() +# end -@testset "Serialize" begin - mpi_test(1, "serialization_test.jl") -end +# @testset "Serialize" begin +# mpi_test(1, "serialization_test.jl") +# end -@testset "SliceSampler" begin - test_slice_sampler() -end \ No newline at end of file +# @testset "SliceSampler" begin +# test_slice_sampler() +# end \ No newline at end of file From 40001d0b6aeb495e938b80927cc1c61bc77310a6 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 09:57:39 -0800 Subject: [PATCH 02/65] Removing Turing from test to isolate only one error at the time --- test/runtests.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ae31c943f..9906c351f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,9 +8,9 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] - Pkg.add(i) -end +# for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] +# Pkg.add(i) +# end using Test using Distributions @@ -18,13 +18,13 @@ using Random using Statistics using OnlineStats using LinearAlgebra -using Turing +# using Turing using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice -include("slice_sampler_test.jl") -include("turing.jl") +#include("slice_sampler_test.jl") +# include("turing.jl") function test_load_balance(n_processes, n_tasks) for p in 1:n_processes From ff5aac9048b98b18119445a3e0071dc432cfe66b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:16:43 -0800 Subject: [PATCH 03/65] Check behaviour on different threadlevels --- src/mpi_utils/Entangler.jl | 6 ++++-- 1 file 
changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index fd93d4050..842f165e1 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -73,7 +73,7 @@ mutable struct Entangler println("Entangler initialized 1 process (without MPI); $(Threads.nthreads())") end else - Init(threadlevel = :funneled) + init_mpi() comm = Comm_dup(parent_communicator) transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) my_process_index = Comm_rank(comm) + 1 @@ -101,10 +101,12 @@ mpi_active() = if silence_mpi[] false else - Init(threadlevel = :funneled) + init_mpi() Comm_size(COMM_WORLD) > 1 end +init_mpi() = Init() #threadlevel = :funneled) + """ $SIGNATURES From d00ff5347fd9fd80884a38b84561e615aa521ba5 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:34:10 -0800 Subject: [PATCH 04/65] Back to same threadlevel as not changing crashing behaviour --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 842f165e1..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init() #threadlevel = :funneled) +init_mpi() = Init(threadlevel = :funneled) """ $SIGNATURES From 69bb270c62040bdbd9281d86dd8c1f79ca641252 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:39:07 -0800 Subject: [PATCH 05/65] More conservative tag ub? 
--- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..ba026a706 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -75,7 +75,7 @@ mutable struct Entangler else init_mpi() comm = Comm_dup(parent_communicator) - transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) + transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices / 2) my_process_index = Comm_rank(comm) + 1 n_processes = Comm_size(comm) if verbose && my_process_index == 1 From d94252f915775119bd2317f91d19a807aee138cc Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:56:43 -0800 Subject: [PATCH 06/65] Revert "More conservative tag ub?" This reverts commit 69bb270c62040bdbd9281d86dd8c1f79ca641252. --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index ba026a706..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -75,7 +75,7 @@ mutable struct Entangler else init_mpi() comm = Comm_dup(parent_communicator) - transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices / 2) + transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) my_process_index = Comm_rank(comm) + 1 n_processes = Comm_size(comm) if verbose && my_process_index == 1 From bcf016d5589b3c34c3e50b4fc48a2dd9614a6cd0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 15:15:54 -0800 Subject: [PATCH 07/65] Trying on OpenMPI instead of mpich --- test/runtests.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 9906c351f..f069ce82d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,9 @@ Rationale for this hack: # Pkg.add(i) # end +using MPIPreferences +MPIPreferences.use_jll_binary("OpenMPI_jll") + using Test using Distributions using Random 
From 379df4b28aba0e983d2f7240dc9ed0c50db820f4 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 16:18:49 -0800 Subject: [PATCH 08/65] Going back to mpich, trying threadlevel = :multiple --- src/mpi_utils/Entangler.jl | 2 +- test/runtests.jl | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..574eb414c 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init(threadlevel = :funneled) +init_mpi() = Init(threadlevel = :multiple) """ $SIGNATURES diff --git a/test/runtests.jl b/test/runtests.jl index f069ce82d..bfab24ba5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,8 +12,6 @@ Rationale for this hack: # Pkg.add(i) # end -using MPIPreferences -MPIPreferences.use_jll_binary("OpenMPI_jll") using Test using Distributions From 2cd3571e61b1fb41f77c2c26535c0cc38d59e4e0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 21:20:22 -0800 Subject: [PATCH 09/65] Back to funneled as :multiple still crashes --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 574eb414c..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init(threadlevel = :multiple) +init_mpi() = Init(threadlevel = :funneled) """ $SIGNATURES From c8a02615f7aec89ee336c0c87c8f30d4f0984505 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Tue, 7 Mar 2023 08:48:52 -0800 Subject: [PATCH 10/65] Further simplification --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index bfab24ba5..ba59261ec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -110,7 +110,7 @@ end n_mpis = Sys.iswindows() 
? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [] pigeons( - target = toy_mvn_target(1), + target = Pigeons.TestSwapper(0.5), n_rounds = 12, checked_round = 12, n_chains = 200, From 6217650345d28f96b631575efb35fecda33a710b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 09:59:39 -0800 Subject: [PATCH 11/65] Candidate fix --- src/Pigeons.jl | 2 +- src/mpi_utils/Entangler.jl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Pigeons.jl b/src/Pigeons.jl index ce87c42c9..9d6cc8734 100644 --- a/src/Pigeons.jl +++ b/src/Pigeons.jl @@ -8,7 +8,7 @@ import MPI: Comm, Allreduce, Comm_rank, Comm_dup, Request, Waitall, RequestSet, mpiexec, Allreduce, Allgather, Comm_split, isend, recv, - bcast, tag_ub + bcast, tag_ub, free using Base: Forward diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..7e3888937 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -174,7 +174,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: - Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + dummy_request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + free(dummy_request) # <-- critical - see https://github.com/pmodels/mpich/issues/6432#issue-1612064302 end end From 1f16936bed4832d0769bac0d93b3840ed9ae5737 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 10:20:32 -0800 Subject: [PATCH 12/65] Reintroducing other tests --- test/runtests.jl | 242 +++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ba59261ec..14887026a 100644 --- a/test/runtests.jl +++ 
b/test/runtests.jl @@ -8,9 +8,9 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -# for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] -# Pkg.add(i) -# end +for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] + Pkg.add(i) +end using Test @@ -19,13 +19,13 @@ using Random using Statistics using OnlineStats using LinearAlgebra -# using Turing +using Turing using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice -#include("slice_sampler_test.jl") -# include("turing.jl") +include("slice_sampler_test.jl") +include("turing.jl") function test_load_balance(n_processes, n_tasks) for p in 1:n_processes @@ -38,73 +38,73 @@ function test_load_balance(n_processes, n_tasks) end end -# @testset "Stepping stone" begin -# pt = pigeons(target = toy_mvn_target(100)); -# p = stepping_stone_pair(pt) -# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) -# @test abs(p[1] - truth) < 1 -# @test abs(p[2] - truth) < 1 -# end - -# @testset "Round trips" begin -# n_chains = 4 -# n_rounds = 5 +@testset "Stepping stone" begin + pt = pigeons(target = toy_mvn_target(100)); + p = stepping_stone_pair(pt) + truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) + @test abs(p[1] - truth) < 1 + @test abs(p[2] - truth) < 1 +end + +@testset "Round trips" begin + n_chains = 4 + n_rounds = 5 -# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); + pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); -# len = 2^(n_rounds) -# truth = 0.0 -# for i in 0:(n_chains-1) -# truth += floor(max(len - i, 0) / n_chains / 2) -# end - -# @test truth == Pigeons.n_round_trips(pt) -# end - -# @testset "Moments" begin -# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], 
n_rounds = 20); -# for var_name in Pigeons.continuous_variables(pt) -# m = mean(pt, var_name) -# for i in eachindex(m) -# @test abs(m[i] - 0.0) < 0.001 -# end -# v = var(pt, var_name) -# for i in eachindex(v) -# @test abs(v[i] - 0.1) < 0.001 -# end -# end -# end - -# @testset "Parallelism Invariance" begin -# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] -# # Turing: -# pigeons( -# target = TuringLogPotential(flip_model_unidentifiable()), -# n_rounds = 4, -# checked_round = 3, -# multithreaded = true, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# dependencies = [Turing, LinearAlgebra, "turing.jl"], -# n_local_mpi_processes = n_mpis, -# n_threads = 2)) -# # Blang: -# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf -# Pigeons.setup_blang("blangDemos") -# pigeons(; -# target = Pigeons.blang_ising(), -# n_rounds = 4, -# checked_round = 3, -# recorder_builders = recorder_builders, -# multithreaded = true, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 2)) -# end -# end + len = 2^(n_rounds) + truth = 0.0 + for i in 0:(n_chains-1) + truth += floor(max(len - i, 0) / n_chains / 2) + end + + @test truth == Pigeons.n_round_trips(pt) +end + +@testset "Moments" begin + pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); + for var_name in Pigeons.continuous_variables(pt) + m = mean(pt, var_name) + for i in eachindex(m) + @test abs(m[i] - 0.0) < 0.001 + end + v = var(pt, var_name) + for i in eachindex(v) + @test abs(v[i] - 0.1) < 0.001 + end + end +end + +@testset "Parallelism Invariance" begin + n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf + recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 4, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Turing, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = n_mpis, + n_threads = 2)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 4, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2)) + end +end @testset "Longer MPI" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -122,50 +122,50 @@ end n_threads = 1)) end -# @testset "Entanglement" begin -# mpi_test(1, "entanglement_test.jl") -# mpi_test(2, "entanglement_test.jl") - -# mpi_test(1, "reduce_test.jl") -# mpi_test(2, "reduce_test.jl") -# mpi_test(3, "reduce_test.jl") -# end - -# @testset "PermutedDistributedArray" begin -# mpi_test(1, "permuted_test.jl", options = ["-s"]) -# mpi_test(1, "permuted_test.jl") -# mpi_test(2, "permuted_test.jl") -# end - -# @testset "LoadBalance" begin -# for i in 1:20 -# for j in i:30 -# test_load_balance(i, j) -# end -# end -# end - -# @testset "LogSum" begin -# m = Pigeons.LogSum() - -# fit!(m, 2.1) -# fit!(m, 4) -# v1 = value(m) -# @assert v1 ≈ log(exp(2.1) + exp(4)) +@testset "Entanglement" begin + mpi_test(1, "entanglement_test.jl") + mpi_test(2, "entanglement_test.jl") + mpi_test(1, "reduce_test.jl") + mpi_test(2, "reduce_test.jl") + mpi_test(3, "reduce_test.jl") +end -# 
fit!(m, 2.1) -# fit!(m, 4) -# m2 = Pigeons.LogSum() -# fit!(m2, 50.1) -# combined = merge(m, m2) -# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) +@testset "PermutedDistributedArray" begin + mpi_test(1, "permuted_test.jl", options = ["-s"]) + mpi_test(1, "permuted_test.jl") + mpi_test(2, "permuted_test.jl") +end -# fit!(m, 2.1) -# fit!(m, 4) -# empty!(m) -# @assert value(m) == -Pigeons.inf(0.0) -# end +@testset "LoadBalance" begin + for i in 1:20 + for j in i:30 + test_load_balance(i, j) + end + end +end + +@testset "LogSum" begin + m = Pigeons.LogSum() + + fit!(m, 2.1) + fit!(m, 4) + v1 = value(m) + @assert v1 ≈ log(exp(2.1) + exp(4)) + + + fit!(m, 2.1) + fit!(m, 4) + m2 = Pigeons.LogSum() + fit!(m2, 50.1) + combined = merge(m, m2) + @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + + fit!(m, 2.1) + fit!(m, 4) + empty!(m) + @assert value(m) == -Pigeons.inf(0.0) +end function test_split_slice() # test disjoint random streams @@ -184,14 +184,14 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -# @testset "split_test" begin -# test_split_slice() -# end +@testset "split_test" begin + test_split_slice() +end -# @testset "Serialize" begin -# mpi_test(1, "serialization_test.jl") -# end +@testset "Serialize" begin + mpi_test(1, "serialization_test.jl") +end -# @testset "SliceSampler" begin -# test_slice_sampler() -# end \ No newline at end of file +@testset "SliceSampler" begin + test_slice_sampler() +end \ No newline at end of file From 0ca198f9037eeb7cfc66f867ff0fe0958948fc49 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 11:06:25 -0800 Subject: [PATCH 13/65] purge tests from Turing, use only DynamicPPL --- test/runtests.jl | 9 +++------ test/slice_sampler_test.jl | 6 ------ test/turing.jl | 15 ++++++--------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 14887026a..20873ccc1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl 
@@ -8,10 +8,7 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] - Pkg.add(i) -end - +Pkg.add(["Test", "LinearAlgebra", "DynamicPPL", "ArgMacros", "Plots"]) using Test using Distributions @@ -19,7 +16,7 @@ using Random using Statistics using OnlineStats using LinearAlgebra -using Turing +using DynamicPPL using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice @@ -87,7 +84,7 @@ end recorder_builders = recorder_builders, checkpoint = true, on = ChildProcess( - dependencies = [Turing, LinearAlgebra, "turing.jl"], + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, n_threads = 2)) # Blang: diff --git a/test/slice_sampler_test.jl b/test/slice_sampler_test.jl index bfac6dbeb..82c4d4acb 100644 --- a/test/slice_sampler_test.jl +++ b/test/slice_sampler_test.jl @@ -1,9 +1,3 @@ -using Pigeons -using Distributions -using Random -using Turing -using SplittableRandoms - import Pigeons: SliceSampler, slice_sample! include("turing.jl") diff --git a/test/turing.jl b/test/turing.jl index 2bfa1fca6..d15d80143 100644 --- a/test/turing.jl +++ b/test/turing.jl @@ -1,28 +1,25 @@ # Unconditioned coinflip model with `N` observations. -@model function coinflip(; N::Int) +@model function coinflip(y) p ~ Beta(1, 12) - y ~ filldist(Bernoulli(p), N) + y .~ Bernoulli(p) return y end; -coinflip(y::AbstractVector{<:Real}) = coinflip(; N=length(y)) | (; y) # *Unidentifiable* unconditioned coinflip model with `N` observations. 
-@model function coinflip_unidentifiable(; N::Int) +@model function coinflip_unidentifiable(y) p1 ~ Uniform(0, 1) p2 ~ Uniform(0, 1) - y ~ filldist(Bernoulli(p1*p2), N) + y .~ Bernoulli(p1*p2) return y end; -coinflip_unidentifiable(y::AbstractVector{<:Real}) = coinflip_unidentifiable(; N=length(y)) | (; y) -@model function coinflip_modified(; N::Int) +@model function coinflip_modified(y) p ~ Uniform(0.3, 0.7) # δ ~ Bernoulli(0.5) δ ~ DiscreteUniform(0, 2) - y ~ filldist(Bernoulli(p + 0.1*δ), N) + y .~ Bernoulli(p + 0.1*δ) return y end; -coinflip_modified(y::AbstractVector{<:Real}) = coinflip_modified(; N=length(y)) | (; y) function flip_model() From ee276fc9bf58b5b20bdb901f722d2f6883524b9b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 12:20:54 -0800 Subject: [PATCH 14/65] Another free --- src/mpi_utils/Entangler.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 7e3888937..224437abb 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -254,7 +254,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: dest_global_index = current_global - spacing dest_process = find_process(e.load, dest_global_index) dest_rank = dest_process - 1 - isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + dummy_request = isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + free(dummy_request) current_local += spacing did_send = true elseif current_global + spacing ≤ e.load.n_global_indices From 491c26bfe0b41d89fb87fcadd6703b6cae795acd Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 14:58:34 -0800 Subject: [PATCH 15/65] Trying to run tests with more rounds --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 
20873ccc1..311ad231f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -78,7 +78,7 @@ end # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 4, + n_rounds = 13, checked_round = 3, multithreaded = true, recorder_builders = recorder_builders, @@ -92,7 +92,7 @@ end Pigeons.setup_blang("blangDemos") pigeons(; target = Pigeons.blang_ising(), - n_rounds = 4, + n_rounds = 13, checked_round = 3, recorder_builders = recorder_builders, multithreaded = true, @@ -108,7 +108,7 @@ end recorder_builders = [] pigeons( target = Pigeons.TestSwapper(0.5), - n_rounds = 12, + n_rounds = 14, checked_round = 12, n_chains = 200, multithreaded = false, From fab79345cd7d9b356228dc66e329fa2e6a2d8972 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:41:39 -0800 Subject: [PATCH 16/65] system MPI test --- .github/workflows/CI.yml | 62 ++++++++++++++++++++++++++++++++++++++-- test/runtests.jl | 7 +++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 1c631be32..af99d7d9a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,17 +1,20 @@ name: CI + on: push: branches: - main tags: ['*'] pull_request: + concurrency: # Skip intermediate builds: always. # Cancel intermediate builds: only if it is a pull request build. 
group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: - test: + test-default: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -27,7 +30,7 @@ jobs: arch: - x64 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-java@v3 with: distribution: 'temurin' @@ -43,13 +46,66 @@ jobs: - uses: codecov/codecov-action@v2 with: files: lcov.info + + + # adapted from MPI.jl + test-system-MPI-apt: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + os: + - ubuntu-latest + mpi: + - libmpich-dev + - libopenmpi-dev + steps: + - uses: actions/checkout@v3 + + - name: Install MPI via apt + run: | + sudo apt-get update + sudo apt-get install $MPI + env: + MPI: ${{ matrix.mpi }} + + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: x64 + + - uses: julia-actions/cache@v1 + + - name: add MPIPreferences + shell: julia --color=yes --project=. {0} + run: | + using Pkg + Pkg.develop(path="lib/MPIPreferences") + + - name: use system MPI + shell: julia --color=yes --project=. 
{0} + run: | + using MPIPreferences + MPIPreferences.use_system_binary() + + - uses: julia-actions/julia-runtest@v1 + + docs: name: Documentation runs-on: ubuntu-latest permissions: contents: write steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-java@v3 with: distribution: 'temurin' diff --git a/test/runtests.jl b/test/runtests.jl index 311ad231f..aee31e4f9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ using OnlineStats using LinearAlgebra using DynamicPPL using SplittableRandoms +using MPIPreferences import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice @@ -72,6 +73,12 @@ end end end +@testset "MPI" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end +end + @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] From 2ed0aae88ad6a9a84e3b7c8b2917cfce57ce41cb Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:43:57 -0800 Subject: [PATCH 17/65] fix typos --- .github/workflows/CI.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index af99d7d9a..ad00fb1e8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,10 +91,10 @@ jobs: Pkg.develop(path="lib/MPIPreferences") - name: use system MPI - shell: julia --color=yes --project=. {0} - run: | - using MPIPreferences - MPIPreferences.use_system_binary() + shell: julia --color=yes --project=. 
{0} + run: | + using MPIPreferences + MPIPreferences.use_system_binary() - uses: julia-actions/julia-runtest@v1 From c271261cfeb0853d943fc1fe3ad04f4d26f5f58c Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:47:07 -0800 Subject: [PATCH 18/65] remove step that does not make sense outside MPI.jl --- .github/workflows/CI.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ad00fb1e8..25e285d96 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -84,19 +84,13 @@ jobs: - uses: julia-actions/cache@v1 - - name: add MPIPreferences - shell: julia --color=yes --project=. {0} - run: | - using Pkg - Pkg.develop(path="lib/MPIPreferences") - - name: use system MPI shell: julia --color=yes --project=. {0} run: | using MPIPreferences MPIPreferences.use_system_binary() - - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-runtest@latest docs: From e3f569ae3702ddc6bf1d09eec10df604495d3cb2 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:52:59 -0800 Subject: [PATCH 19/65] setup MPIPreferences in the test env --- .github/workflows/CI.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 25e285d96..13fdc294d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -85,8 +85,10 @@ jobs: - uses: julia-actions/cache@v1 - name: use system MPI - shell: julia --color=yes --project=. 
{0} + shell: julia --color=yes --project=test {0} run: | + using Pkg + Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() From 77824eac6dfc0aed77e479ceab94b1401ade53f7 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:04:08 -0800 Subject: [PATCH 20/65] possible fix to Pkg missing --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 13fdc294d..110263bb8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,6 +91,7 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@latest From c69b76a335f7aacbd29ccc5eccc257ff1c8290b2 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:10:44 -0800 Subject: [PATCH 21/65] possible fix to Pkg missing --- .github/workflows/CI.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 110263bb8..e73835d36 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,10 +91,11 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Project.toml") rm("test/Manifest.toml") - - uses: julia-actions/julia-runtest@latest - + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 docs: name: Documentation From 523f9c120de94551621ec8bafc0c9cae7f2543fb Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:23:35 -0800 Subject: [PATCH 22/65] add missing env --- .github/workflows/CI.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index e73835d36..5860d9b1a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -62,6 +62,8 @@ jobs: mpi: - libmpich-dev - libopenmpi-dev + env: + 
JULIA_MPI_TEST_BINARY: system steps: - uses: actions/checkout@v3 @@ -91,10 +93,10 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + # need to remove these to avoid getting a 'Package "Pkg" missing' error rm("test/Project.toml") rm("test/Manifest.toml") - - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 docs: From 949b62880b9460f20178cfc1b3d0cbc348a4c909 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:33:50 -0800 Subject: [PATCH 23/65] another try --- .github/workflows/CI.yml | 5 +---- test/runtests.jl | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5860d9b1a..8bbee8e2a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -90,12 +90,9 @@ jobs: shell: julia --color=yes --project=test {0} run: | using Pkg - Pkg.add("MPIPreferences") + Pkg.add(["Pkg","MPIPreferences"]) using MPIPreferences MPIPreferences.use_system_binary() - # need to remove these to avoid getting a 'Package "Pkg" missing' error - rm("test/Project.toml") - rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 diff --git a/test/runtests.jl b/test/runtests.jl index aee31e4f9..d1981f8d7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -36,6 +36,12 @@ function test_load_balance(n_processes, n_tasks) end end +@testset "System MPI" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end +end + @testset "Stepping stone" begin pt = pigeons(target = toy_mvn_target(100)); p = stepping_stone_pair(pt) @@ -73,12 +79,6 @@ end end end -@testset "MPI" begin - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end -end - @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] From 895a45fed65c5a968e46faa6df8c7f3371a7c060 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 18:33:42 -0800 Subject: [PATCH 24/65] test now has its own Project.toml --- .github/workflows/CI.yml | 2 +- Project.toml | 6 ------ src/utils/misc.jl | 14 -------------- test/Project.toml | 14 ++++++++++++++ test/misc.jl | 13 +++++++++++++ test/runtests.jl | 29 ++++++++++++----------------- test/single_cell_example.jl | 2 -- 7 files changed, 40 insertions(+), 40 deletions(-) create mode 100644 test/Project.toml create mode 100644 test/misc.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8bbee8e2a..b61b43228 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -90,7 +90,7 @@ jobs: shell: julia --color=yes --project=test {0} run: | using Pkg - Pkg.add(["Pkg","MPIPreferences"]) + Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() diff --git a/Project.toml b/Project.toml index 662a88ceb..615e602cc 100644 --- a/Project.toml +++ b/Project.toml @@ -52,9 +52,3 @@ SpecialFunctions = "2" SplittableRandoms = "0.1" StatsBase = "0.33" julia = "1.6" - -[extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["Test"] diff --git a/src/utils/misc.jl b/src/utils/misc.jl index 91a19e68c..d3535b2d5 100644 --- a/src/utils/misc.jl +++ b/src/utils/misc.jl @@ -97,20 +97,6 @@ is attempted). """ macro abstract() quote error("Attempted to call an abstract function.") end end -function mpi_test(n_processes::Int, test_file::String; options = []) - project_folder = dirname(Base.current_project()) - # handle 2 different "modes" that tests can be ran (for julia 1.0,1.1 vs. 
>1.1) - resolved_test_file = - if isfile("$project_folder/$test_file") - "$project_folder/$test_file" - else - "$project_folder/test/$test_file" - end - mpiexec() do exe - run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) - end -end - """ @weighted(w, x) diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 000000000..f71ba28fa --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,14 @@ +[deps] +ArgMacros = "dbc42088-9de8-42a0-8ec8-2cd114e1ea3e" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" +OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +SplittableRandoms = "8efc31e9-3fb0-4277-b18c-5a3d5d07abad" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/misc.jl b/test/misc.jl new file mode 100644 index 000000000..4100a32ba --- /dev/null +++ b/test/misc.jl @@ -0,0 +1,13 @@ +function mpi_test(n_processes::Int, test_file::String; options = []) + project_folder = dirname(Base.current_project()) + # handle 2 different "modes" that tests can be ran (for julia 1.0,1.1 vs. 
>1.1) + resolved_test_file = + if isfile("$project_folder/$test_file") + "$project_folder/$test_file" + else + "$project_folder/test/$test_file" + end + mpiexec() do exe + run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d1981f8d7..17d571edb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,27 +1,22 @@ using Pigeons -using Pkg -#= -Rationale for this hack: -- putting those in the [extras] section will lead to ChildProcess not - having access to it -- the other method, a second toml file, seems more promising but - proved challenging to get to work on CI -=# -Pkg.add(["Test", "LinearAlgebra", "DynamicPPL", "ArgMacros", "Plots"]) - -using Test +using ArgMacros using Distributions -using Random -using Statistics -using OnlineStats -using LinearAlgebra using DynamicPPL -using SplittableRandoms +using LinearAlgebra +using MPI using MPIPreferences -import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, +using OnlineStats +using Random +using Serialization +using SplittableRandoms +using Statistics +using Test + +import Pigeons: my_global_indices, LoadBalance, my_load, find_process, split_slice +include("misc.jl") include("slice_sampler_test.jl") include("turing.jl") diff --git a/test/single_cell_example.jl b/test/single_cell_example.jl index 062a24e70..e83a0e954 100644 --- a/test/single_cell_example.jl +++ b/test/single_cell_example.jl @@ -1,5 +1,3 @@ -using Pigeons - if !isdir("nowellpack") && !islink(blang_repo) # Download and compile the Blang model used in https://www.biorxiv.org/content/10.1101/2020.05.06.058180 println("cloning and building nowellpack") From 13c9cd2c91398e1eff54c80ca24076fcd22352d9 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:00:32 -0800 Subject: [PATCH 25/65] force Pkg.instantiate in ChildProcess --- src/submission/ChildProcess.jl | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 8a8718cbb..8ccf7a8d2 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -124,6 +124,8 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ + using Pkg + Pkg.instantiate() $dependency_declarations $silence_code From 1772d82510a2fe30ad4b0db24d5aeec0a9483bc7 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:40:05 -0800 Subject: [PATCH 26/65] dont use --project in ChildProcess --- src/submission/ChildProcess.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 8ccf7a8d2..6251a4a45 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -74,7 +74,6 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil julia_bin = Base.julia_cmd() script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin - --project --threads=$n_threads $script_path` end @@ -124,8 +123,6 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." 
below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - using Pkg - Pkg.instantiate() $dependency_declarations $silence_code From bdf71ce6aab21eee1d967475d73d8531c1b0fcab Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:59:41 -0800 Subject: [PATCH 27/65] ChildProcess inherits the exact same active project --- src/submission/ChildProcess.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 6251a4a45..ea76a2f26 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -73,7 +73,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) - return `$julia_bin + return `$julia_bin + --project=$(dirname(Base.active_project())) --threads=$n_threads $script_path` end From 2bcd82541510571dd34f99a024be098518b7931e Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 20:36:43 -0800 Subject: [PATCH 28/65] new approach --- .github/workflows/CI.yml | 7 +++++++ src/submission/ChildProcess.jl | 2 ++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b61b43228..fb73805ad 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -41,6 +41,13 @@ jobs: arch: ${{ matrix.arch }} - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 + + - name: instantiate the test environment + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v2 diff --git a/src/submission/ChildProcess.jl 
b/src/submission/ChildProcess.jl index ea76a2f26..2579c1176 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -72,6 +72,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() + active_proj = dirname(Base.active_project()) + run(`$julia_bin --project=$active_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$(dirname(Base.active_project())) From eadbfcdb43289afd0dd1b5abc1983517365a45dd Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 06:34:26 -0800 Subject: [PATCH 29/65] Changing approach for Isend/isend: explicit Waitall for all requests A crash seemed to occur in CI during garbage collection julia:3364 terminated with signal 11 at PC=7f997aaec971 SP=7f9936ccb970. Backtrace: /opt/hostedtoolcache/julia/1.8.5/x64/bin/../lib/julia/libjulia-internal.so.1(ijl_gc_safepoint+0x11)[0x7f997aaec971] This looks like a crash during garbage collection code (this would be consistent with the fact that it only shows up with allocation-heavy test, i.e. Turing). So even though the finalizer checks first before calling free, maybe there is something flaky there. 
https://github.com/Julia-Tempering/Pigeons.jl/pull/30#issuecomment-1461317764 --- src/mpi_utils/Entangler.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 224437abb..a883b0dfe 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -157,6 +157,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic # indicators of whether each local index is to be received over MPI e.current_received_bits .= true at_least_one_mpi = false + + requests = RequestSet() # send (or copy if local) for local_index in 1:myload @@ -174,14 +176,13 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: - dummy_request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) - free(dummy_request) # <-- critical - see https://github.com/pmodels/mpich/issues/6432#issue-1612064302 + request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + push!(requests, request) end end # receive if at_least_one_mpi - requests = RequestSet() my_globals = my_global_indices(e.load) for local_index in 1:myload if e.current_received_bits[local_index] @@ -241,6 +242,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: # outer loop is over the levels of a binary tree over the global indices iteration = 1 + requests = RequestSet() + while n_remaining_to_reduce > 1 transmit_index = next_transmit_index!(e) current_local = my_first_remaining_local @@ -254,8 +257,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: dest_global_index = current_global - spacing dest_process = find_process(e.load, dest_global_index) dest_rank = dest_process - 1 - dummy_request = 
isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) - free(dummy_request) + request = isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + push!(requests, request) current_local += spacing did_send = true elseif current_global + spacing ≤ e.load.n_global_indices @@ -278,6 +281,7 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: if did_send my_first_remaining_local += spacing + Waitall(requests) end n_global_indices_remaining_before = ceil(Int, n_global_indices_remaining_before/2) spacing = spacing * 2 From 57d0921b22026d90c961933158b51cee6f027003 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 06:48:25 -0800 Subject: [PATCH 30/65] Is this a Turing multi-threading issue? --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 17d571edb..db4e0b206 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -88,7 +88,7 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 1)) # Blang: if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf Pigeons.setup_blang("blangDemos") From 63aad9766d55bc8789aa9e9363a00b791d685744 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 07:15:55 -0800 Subject: [PATCH 31/65] Testing single-thread for all Multi-threading issue seems not specific to Turing as crash occurred for Blang in last commit. 
--- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index db4e0b206..5971d0e4a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,7 +101,7 @@ end checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 1)) end end From 7a8de7e0f38a42c8a566ede0a65b047c5538c585 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 07:41:31 -0800 Subject: [PATCH 32/65] Adding a temporary "dry run" to see if problem cause by some compilation --- test/runtests.jl | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 5971d0e4a..8a3bf86d8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,6 +77,36 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + + # temp + # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 13, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = 1, + n_threads = 1)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 13, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = 1, + n_threads = 1)) + end + + # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), From 5812127c3b94caae6dad9035e58e282192201e4d Mon Sep 17 00:00:00 2001 From: Alexandre 
Bouchard Date: Thu, 9 Mar 2023 08:05:12 -0800 Subject: [PATCH 33/65] Commenting out Turing and Blang, to see behaviour on toy_mvn and swap(0.5) --- test/runtests.jl | 97 +++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 8a3bf86d8..79d3b78fa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -78,10 +78,9 @@ end n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # temp - # Turing: + pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), + target = toy_mvn_target(1), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -89,27 +88,12 @@ end checkpoint = true, on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = 1, + n_local_mpi_processes = n_mpis, n_threads = 1)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = 1, - n_threads = 1)) - end - # Turing: pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), + target = Pigeons.TestSwapper(0.5), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -119,20 +103,65 @@ end dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, n_threads = 1)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - 
recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 1)) - end + + + + + # # temp + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 13, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = 1, + # n_threads = 1)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 13, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = 1, + # n_threads = 1)) + # end + + + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 13, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = n_mpis, + # n_threads = 1)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 13, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = n_mpis, + # n_threads = 1)) + # end end @testset "Longer MPI" begin From 270333f4070c30c46744cf9017a841c61542d91f Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:08:58 -0800 Subject: [PATCH 34/65] 
openmpi_jll test --- .github/workflows/CI.yml | 42 ++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 8 +++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index fb73805ad..ccf71748e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -53,6 +53,48 @@ jobs: - uses: codecov/codecov-action@v2 with: files: lcov.info + + # test OpenMPI_jll + test-openmpi-jll: + strategy: + matrix: + version: + - '1.8' + - 'nightly' + os: + - ubuntu-latest + - macos-latest + - windows-latest + arch: + - x64 + + fail-fast: false + env: + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI + + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: julia-actions/setup-julia@latest + with: + arch: ${{ matrix.julia_arch }} + version: ${{ matrix.julia_version }} + - uses: julia-actions/cache@v1 + + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_system_binary() + + - uses: julia-actions/julia-runtest@latest + # adapted from MPI.jl diff --git a/test/runtests.jl b/test/runtests.jl index 79d3b78fa..52026e480 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,10 +31,16 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "System MPI" begin +@testset "MPI" begin if haskey(ENV,"JULIA_MPI_TEST_BINARY") @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary end + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end + if haskey(ENV,"JULIA_MPI_TEST_ABI") + @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi + end end @testset "Stepping stone" begin From 2b49008f382f56d582f84478791da62b6dab1b6a Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:12:08 -0800 Subject: [PATCH 35/65] openmpi_jll test --- .github/workflows/CI.yml | 11 
+++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ccf71748e..caf161ff3 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -55,16 +55,14 @@ jobs: files: lcov.info # test OpenMPI_jll - test-openmpi-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} + runs-on: ${{ matrix.os }} strategy: matrix: version: - '1.8' - - 'nightly' os: - ubuntu-latest - - macos-latest - - windows-latest arch: - x64 @@ -72,9 +70,6 @@ jobs: env: JULIA_MPI_TEST_BINARY: OpenMPI_jll JULIA_MPI_TEST_ABI: OpenMPI - - runs-on: ${{ matrix.os }} - steps: - name: Checkout uses: actions/checkout@v3 @@ -92,7 +87,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() - + - uses: julia-actions/julia-runtest@latest From 9c6497f6b4584baacd6d3415117159c471beddc0 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:14:03 -0800 Subject: [PATCH 36/65] openmpi_jll test --- .github/workflows/CI.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index caf161ff3..afabef73a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -55,6 +55,7 @@ jobs: files: lcov.info # test OpenMPI_jll + test-OpenMPI-jll: name: Julia OpenMPI_jll - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -86,7 +87,7 @@ jobs: using Pkg Pkg.instantiate() using MPIPreferences - MPIPreferences.use_system_binary() + MPIPreferences.use_jll_binary("OpenMPI_jll") - uses: julia-actions/julia-runtest@latest From d68dc6c394c168cdb7a17ce4f7486d64bc32ddcf Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:15:41 -0800 Subject: [PATCH 37/65] openmpi_jll test --- .github/workflows/CI.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index afabef73a..0ab775a57 100644 --- a/.github/workflows/CI.yml +++ 
b/.github/workflows/CI.yml @@ -77,8 +77,8 @@ jobs: - uses: julia-actions/setup-julia@latest with: - arch: ${{ matrix.julia_arch }} - version: ${{ matrix.julia_version }} + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} - uses: julia-actions/cache@v1 - name: use OpenMPI_jll From 38de7cfb2d34f0458715b417f7131788c918ac43 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 09:53:10 -0800 Subject: [PATCH 38/65] more messages from Child processes --- .gitignore | 3 +- src/submission/ChildProcess.jl | 42 +++++++++++--- test/runtests.jl | 103 ++++++--------------------------- 3 files changed, 54 insertions(+), 94 deletions(-) diff --git a/.gitignore b/.gitignore index 8a9900510..fb28d4e81 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ .vscode/settings.json build .interfaces.md - +*.log +*.err machines.txt results .includes_bu.jl diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 2579c1176..5d9982eff 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -126,18 +126,42 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." 
below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - $dependency_declarations - $silence_code - - Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") - pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") - - pt = PT(pt_arguments, exec_folder = raw"$exec_folder") - pigeons(pt) + prefix=string(getpid()) + println("hello from PID " * prefix) + open(prefix * ".log", "a") do out + open(prefix * ".err", "a") do err + redirect_stdout(out) do + redirect_stderr(err) do + $dependency_declarations + $silence_code + println("using Pigeons located @ " * pathof(Pigeons)) + end + end + end + end + # need to do this in order to be able to use declarations, since they happened inside a function + open(prefix * ".log", "a") do out + open(prefix * ".err", "a") do err + redirect_stdout(out) do + redirect_stderr(err) do + print("deserializing...") + Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") + pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") + println("done!") + print("running PT...") + pt = PT(pt_arguments, exec_folder = raw"$exec_folder") + println("done!") + print("running pigeons(pt)...") + pigeons(pt) + println("done") + end + end + end + end """ end -add_dependency(dependency::Module) = "using $dependency" +add_dependency(dependency::Module) = "@eval using $dependency" function add_dependency(dependency::String) abs_path = abspath(dependency) return """include(raw"$abs_path")""" diff --git a/test/runtests.jl b/test/runtests.jl index 52026e480..83e88c587 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,16 +31,10 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "MPI" begin +@testset "System MPI" begin if haskey(ENV,"JULIA_MPI_TEST_BINARY") @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary end - if 
haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end - if haskey(ENV,"JULIA_MPI_TEST_ABI") - @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi - end end @testset "Stepping stone" begin @@ -83,10 +77,9 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - - + # Turing: pigeons( - target = toy_mvn_target(1), + target = TuringLogPotential(flip_model_unidentifiable()), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -95,79 +88,21 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 1)) - - - pigeons( - target = Pigeons.TestSwapper(0.5), - n_rounds = 13, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 1)) - - - - - # # temp - # # Turing: - # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), - # n_rounds = 13, - # checked_round = 3, - # multithreaded = true, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - # n_local_mpi_processes = 1, - # n_threads = 1)) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 13, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = 1, - # n_threads = 1)) - # 
end - - - # # Turing: - # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), - # n_rounds = 13, - # checked_round = 3, - # multithreaded = true, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - # n_local_mpi_processes = n_mpis, - # n_threads = 1)) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 13, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 1)) - # end + n_threads = 2)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 13, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2)) + end end @testset "Longer MPI" begin @@ -258,4 +193,4 @@ end @testset "SliceSampler" begin test_slice_sampler() -end \ No newline at end of file +end From 8537307e929e578790b72782d7bfd035613632d5 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:22:56 -0800 Subject: [PATCH 39/65] extra arg to OpenMPI --- .github/workflows/CI.yml | 73 ++++++++++++++--------------- src/submission/ChildProcess.jl | 24 +++++++--- test/runtests.jl | 85 ++++++++++++++++++---------------- 3 files changed, 98 insertions(+), 84 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 0ab775a57..017f76bcf 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -54,42 +54,42 @@ jobs: with: files: lcov.info 
- # test OpenMPI_jll - test-OpenMPI-jll: - name: Julia OpenMPI_jll - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - version: - - '1.8' - os: - - ubuntu-latest - arch: - - x64 - - fail-fast: false - env: - JULIA_MPI_TEST_BINARY: OpenMPI_jll - JULIA_MPI_TEST_ABI: OpenMPI - steps: - - name: Checkout - uses: actions/checkout@v3 - - - uses: julia-actions/setup-julia@latest - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 - - - name: use OpenMPI_jll - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_jll_binary("OpenMPI_jll") - - - uses: julia-actions/julia-runtest@latest + # # test OpenMPI_jll + # test-OpenMPI-jll: + # name: Julia OpenMPI_jll - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # arch: + # - x64 + + # fail-fast: false + # env: + # JULIA_MPI_TEST_BINARY: OpenMPI_jll + # JULIA_MPI_TEST_ABI: OpenMPI + # steps: + # - name: Checkout + # uses: actions/checkout@v3 + + # - uses: julia-actions/setup-julia@latest + # with: + # version: ${{ matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 + + # - name: use OpenMPI_jll + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_jll_binary("OpenMPI_jll") + + # - uses: julia-actions/julia-runtest@latest @@ -105,7 +105,6 @@ jobs: os: - ubuntu-latest mpi: - - libmpich-dev - libopenmpi-dev env: JULIA_MPI_TEST_BINARY: system diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 5d9982eff..eb7c8a3b5 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,21 +62,29 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe -n 
$(new_process.n_local_mpi_processes)` + println("Checking MPI version:") + run(`$exe -V`) + mpi_args = extra_mpi_args() + mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` - run(cmd, wait = new_process.wait) + logfile = "Pigeons.log" + println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") + run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) end end return Result{PT}(exec_folder) end +function extra_mpi_args() + MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 -v` : `` +end + function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - active_proj = dirname(Base.active_project()) - run(`$julia_bin --project=$active_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache + julia_bin = "julia"#Base.julia_cmd() + cur_proj = dirname(Base.current_project()) script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin - --project=$(dirname(Base.active_project())) + --project=$cur_proj --threads=$n_threads $script_path` end @@ -126,6 +134,8 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." 
below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ + println("wd = " * pwd()) + println("active_proj = " * dirname(Base.active_project()) ) prefix=string(getpid()) println("hello from PID " * prefix) open(prefix * ".log", "a") do out @@ -153,7 +163,7 @@ function launch_code( println("done!") print("running pigeons(pt)...") pigeons(pt) - println("done") + println("done!") end end end diff --git a/test/runtests.jl b/test/runtests.jl index 83e88c587..b359afe46 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,48 +31,48 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "System MPI" begin - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end -end - -@testset "Stepping stone" begin - pt = pigeons(target = toy_mvn_target(100)); - p = stepping_stone_pair(pt) - truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) - @test abs(p[1] - truth) < 1 - @test abs(p[2] - truth) < 1 -end - -@testset "Round trips" begin - n_chains = 4 - n_rounds = 5 +# @testset "System MPI" begin +# if haskey(ENV,"JULIA_MPI_TEST_BINARY") +# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary +# end +# end + +# @testset "Stepping stone" begin +# pt = pigeons(target = toy_mvn_target(100)); +# p = stepping_stone_pair(pt) +# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) +# @test abs(p[1] - truth) < 1 +# @test abs(p[2] - truth) < 1 +# end + +# @testset "Round trips" begin +# n_chains = 4 +# n_rounds = 5 - pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); +# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); - len = 2^(n_rounds) - truth = 0.0 - for i in 0:(n_chains-1) - truth += floor(max(len - i, 0) / n_chains 
/ 2) - end - - @test truth == Pigeons.n_round_trips(pt) -end - -@testset "Moments" begin - pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); - for var_name in Pigeons.continuous_variables(pt) - m = mean(pt, var_name) - for i in eachindex(m) - @test abs(m[i] - 0.0) < 0.001 - end - v = var(pt, var_name) - for i in eachindex(v) - @test abs(v[i] - 0.1) < 0.001 - end - end -end +# len = 2^(n_rounds) +# truth = 0.0 +# for i in 0:(n_chains-1) +# truth += floor(max(len - i, 0) / n_chains / 2) +# end + +# @test truth == Pigeons.n_round_trips(pt) +# end + +# @testset "Moments" begin +# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); +# for var_name in Pigeons.continuous_variables(pt) +# m = mean(pt, var_name) +# for i in eachindex(m) +# @test abs(m[i] - 0.0) < 0.001 +# end +# v = var(pt, var_name) +# for i in eachindex(v) +# @test abs(v[i] - 0.1) < 0.001 +# end +# end +# end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -194,3 +194,8 @@ end @testset "SliceSampler" begin test_slice_sampler() end + +# clean-up logs +ls = readdir() +foreach(rm, filter(endswith(".log"), ls)) +foreach(rm, filter(endswith(".err"), ls)) From c2b732bb6d16a6b69d6203dc3193c89fd2716626 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:29:56 -0800 Subject: [PATCH 40/65] rm mpi version query --- .github/workflows/CI.yml | 140 ++++++++++++++++----------------- src/submission/ChildProcess.jl | 2 - 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 017f76bcf..10a28cb49 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -54,91 +54,91 @@ jobs: with: files: lcov.info - # # test OpenMPI_jll - # test-OpenMPI-jll: - # name: Julia OpenMPI_jll - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # arch: - # - x64 - - # fail-fast: false - # env: - # JULIA_MPI_TEST_BINARY: OpenMPI_jll - # JULIA_MPI_TEST_ABI: OpenMPI - # steps: - # - name: Checkout - # uses: actions/checkout@v3 - - # - uses: julia-actions/setup-julia@latest - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - - # - name: use OpenMPI_jll - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_jll_binary("OpenMPI_jll") - - # - uses: julia-actions/julia-runtest@latest - - - - # adapted from MPI.jl - test-system-MPI-apt: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # test OpenMPI_jll + test-OpenMPI-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: - fail-fast: false matrix: version: - '1.8' os: - ubuntu-latest - mpi: - - 
libopenmpi-dev + arch: + - x64 + + fail-fast: false env: - JULIA_MPI_TEST_BINARY: system + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI steps: - - uses: actions/checkout@v3 + - name: Checkout + uses: actions/checkout@v3 - - name: Install MPI via apt - run: | - sudo apt-get update - sudo apt-get install $MPI - env: - MPI: ${{ matrix.mpi }} + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_jll_binary("OpenMPI_jll") - - uses: julia-actions/setup-julia@v1 - with: - version: ${{ matrix.version }} - arch: x64 + - uses: julia-actions/julia-runtest@latest - - uses: julia-actions/cache@v1 - - name: use system MPI - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_system_binary() + + # # adapted from MPI.jl + # test-system-MPI-apt: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # mpi: + # - libopenmpi-dev + # env: + # JULIA_MPI_TEST_BINARY: system + # steps: + # - uses: actions/checkout@v3 + + # - name: Install MPI via apt + # run: | + # sudo apt-get update + # sudo apt-get install $MPI + # env: + # MPI: ${{ matrix.mpi }} + + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: x64 + + # - uses: julia-actions/cache@v1 + + # - name: use system MPI + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # 
Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_system_binary() - - uses: julia-actions/julia-runtest@v1 + # - uses: julia-actions/julia-runtest@v1 docs: name: Documentation diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index eb7c8a3b5..0b0a1cbe8 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,8 +62,6 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - println("Checking MPI version:") - run(`$exe -V`) mpi_args = extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` From 0e82a84d6684ced6cd25bc15695b7d3f5d455707 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:40:26 -0800 Subject: [PATCH 41/65] use old julia_cmd + rm manifest as in MPI.jl CI --- .github/workflows/CI.yml | 1 + src/submission/ChildProcess.jl | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 10a28cb49..a7b3301aa 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -88,6 +88,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_jll_binary("OpenMPI_jll") + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@latest diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 0b0a1cbe8..cbd8bbc09 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -67,7 +67,13 @@ function pigeons(pt_arguments, new_process::ChildProcess) cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") - run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + try + run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + catch + open(logfile, "r") do f + println(read(f, 
String)) + end + end end end return Result{PT}(exec_folder) @@ -78,8 +84,8 @@ function extra_mpi_args() end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = "julia"#Base.julia_cmd() - cur_proj = dirname(Base.current_project()) + julia_bin = Base.julia_cmd() + cur_proj = dirname(Base.current_project()) script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj From 82a4210ca2dbc1035dfab90a469195ae03ed9ead Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:52:29 -0800 Subject: [PATCH 42/65] add --oversubscribe for OpenMPI + rethrow exception --- src/submission/ChildProcess.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index cbd8bbc09..01b68422d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -69,10 +69,11 @@ function pigeons(pt_arguments, new_process::ChildProcess) println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") try run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) - catch + catch e open(logfile, "r") do f println(read(f, String)) end + rethrow(e) end end end @@ -80,7 +81,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 -v` : `` + MPIPreferences.abi == "OpenMPI" ? 
`--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) From 700c71271548974ee0602a4e948a6cce594dc476 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 15:24:46 -0800 Subject: [PATCH 43/65] simplify logging --- src/submission/ChildProcess.jl | 54 ++++++++++++---------------------- 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 01b68422d..521f512fe 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -68,8 +68,9 @@ function pigeons(pt_arguments, new_process::ChildProcess) logfile = "Pigeons.log" println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") try - run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + run(pipeline(cmd; stdout = logfile, stderr = logfile), wait = new_process.wait) catch e + println("pipeline terminated with non-zero status. Dumping stdout+stderr:\n\n") open(logfile, "r") do f println(read(f, String)) end @@ -139,44 +140,25 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." 
below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - println("wd = " * pwd()) - println("active_proj = " * dirname(Base.active_project()) ) - prefix=string(getpid()) - println("hello from PID " * prefix) - open(prefix * ".log", "a") do out - open(prefix * ".err", "a") do err - redirect_stdout(out) do - redirect_stderr(err) do - $dependency_declarations - $silence_code - println("using Pigeons located @ " * pathof(Pigeons)) - end - end - end - end - # need to do this in order to be able to use declarations, since they happened inside a function - open(prefix * ".log", "a") do out - open(prefix * ".err", "a") do err - redirect_stdout(out) do - redirect_stderr(err) do - print("deserializing...") - Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") - pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") - println("done!") - print("running PT...") - pt = PT(pt_arguments, exec_folder = raw"$exec_folder") - println("done!") - print("running pigeons(pt)...") - pigeons(pt) - println("done!") - end - end - end - end + pid=string(getpid()) + println("hello from PID " * pid) + + println(pid * ": wd = " * pwd()) + println(pid * ": active_proj = " * dirname(Base.active_project()) ) + + $dependency_declarations + println(pid * ": using Pigeons located @ " * dirname(pathof(Pigeons))) + + $silence_code + + Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") + pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") + pt = PT(pt_arguments, exec_folder = raw"$exec_folder") + pigeons(pt) """ end -add_dependency(dependency::Module) = "@eval using $dependency" +add_dependency(dependency::Module) = "using $dependency" function add_dependency(dependency::String) abs_path = abspath(dependency) return """include(raw"$abs_path")""" From ebcb3a6c46423884b7bd3d280ec2b786fc431921 Mon 
Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 15:38:59 -0800 Subject: [PATCH 44/65] force instantiate + precompile --- src/submission/ChildProcess.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 521f512fe..94ba2ff3d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -88,6 +88,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() cur_proj = dirname(Base.current_project()) + @info "forcing instantiate + precompile on project $cur_proj" + run(`$julia_bin --project=$cur_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj From ad69b34e627d898eed0ec99a29409eaa291760f2 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 06:05:50 -0800 Subject: [PATCH 45/65] Add mpi_args to mpi_test --- test/misc.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/misc.jl b/test/misc.jl index 4100a32ba..7654799af 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,6 +8,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) + mpi_args = extra_mpi_args() + run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end end \ No newline at end of file From 6bde8fe5fe9ad2fb51759f5513b22199973f874e Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 06:19:57 -0800 Subject: [PATCH 46/65] Temporary: trying to speed up 
some key tests --- .github/workflows/CI.yml | 127 ++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a7b3301aa..577f0f2c2 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,45 +14,46 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - test-default: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - version: - - '1.8' - - 'nightly' - os: - - ubuntu-latest - - macos-latest - - windows-latest - arch: - - x64 - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' - - uses: julia-actions/setup-julia@v1 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 - - uses: julia-actions/julia-buildpkg@v1 - - - name: instantiate the test environment - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - - - uses: julia-actions/julia-runtest@v1 - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v2 - with: - files: lcov.info + + # test-default: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # - 'nightly' + # os: + # - ubuntu-latest + # - macos-latest + # - windows-latest + # arch: + # - x64 + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 + # - uses: julia-actions/julia-buildpkg@v1 + + # - name: instantiate the test 
environment + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + + # - uses: julia-actions/julia-runtest@v1 + # - uses: julia-actions/julia-processcoverage@v1 + # - uses: codecov/codecov-action@v2 + # with: + # files: lcov.info # test OpenMPI_jll test-OpenMPI-jll: @@ -141,27 +142,27 @@ jobs: # - uses: julia-actions/julia-runtest@v1 - docs: - name: Documentation - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-docdeploy@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using Pigeons - DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) - doctest(Pigeons)' + # docs: + # name: Documentation + # runs-on: ubuntu-latest + # permissions: + # contents: write + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + # - uses: julia-actions/setup-julia@v1 + # with: + # version: '1' + # - uses: julia-actions/julia-buildpkg@v1 + # - uses: julia-actions/julia-docdeploy@v1 + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # - run: | + # julia --project=docs -e ' + # using Documenter: DocMeta, doctest + # using Pigeons + # DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) + # doctest(Pigeons)' From 7b5e7d41fa09d8b78e524b6a23df34f92baad904 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 07:00:18 -0800 Subject: [PATCH 47/65] Fix --- src/submission/ChildProcess.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 94ba2ff3d..b4a894aa3 100644 --- 
a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,7 +62,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = extra_mpi_args() + mpi_args = Pigeons.extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" From fc34b85bcfaa10b5a6299ef11ec415db65cc2e4f Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 07:51:01 -0800 Subject: [PATCH 48/65] Fix the fix + reintroducing the system-MPI tests --- .github/workflows/CI.yml | 90 +++++++++++++++++----------------- src/submission/ChildProcess.jl | 2 +- test/misc.jl | 2 +- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 577f0f2c2..7df61a6c4 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -95,52 +95,52 @@ jobs: - # # adapted from MPI.jl - # test-system-MPI-apt: - # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # fail-fast: false - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # mpi: - # - libopenmpi-dev - # env: - # JULIA_MPI_TEST_BINARY: system - # steps: - # - uses: actions/checkout@v3 - - # - name: Install MPI via apt - # run: | - # sudo apt-get update - # sudo apt-get install $MPI - # env: - # MPI: ${{ matrix.mpi }} - - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - - # - uses: julia-actions/setup-julia@v1 - # with: - # version: ${{ matrix.version }} - # arch: x64 - - # - uses: julia-actions/cache@v1 - - # - name: use system MPI - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_system_binary() + # adapted from MPI.jl + test-system-MPI-apt: + name: Julia ${{ 
matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + os: + - ubuntu-latest + mpi: + - libopenmpi-dev + env: + JULIA_MPI_TEST_BINARY: system + steps: + - uses: actions/checkout@v3 + + - name: Install MPI via apt + run: | + sudo apt-get update + sudo apt-get install $MPI + env: + MPI: ${{ matrix.mpi }} + + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: x64 + + - uses: julia-actions/cache@v1 + + - name: use system MPI + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_system_binary() - # - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-runtest@v1 # docs: # name: Documentation diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index b4a894aa3..94ba2ff3d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,7 +62,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = Pigeons.extra_mpi_args() + mpi_args = extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" diff --git a/test/misc.jl b/test/misc.jl index 7654799af..ec24109d5 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,7 +8,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - mpi_args = extra_mpi_args() + mpi_args = Pigeons.extra_mpi_args() run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end end \ No newline at end of file From 5ba839e36ba7900f3ca2f130c325619d3df3f186 Mon Sep 17 00:00:00 2001 From: Alexandre 
Bouchard Date: Fri, 10 Mar 2023 08:07:52 -0800 Subject: [PATCH 49/65] Add back libmpich-dev to resume investigation on ghostbug --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7df61a6c4..3c79cfa50 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -108,6 +108,7 @@ jobs: - ubuntu-latest mpi: - libopenmpi-dev + - libmpich-dev env: JULIA_MPI_TEST_BINARY: system steps: From 2b37c0f83e02296a1cb1768b3e8682aef4cc4717 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:02:42 -0800 Subject: [PATCH 50/65] Trying to simplify CI setup needed to reproduce ghostbug --- .github/workflows/CI.yml | 65 +++++++------- test/runtests.jl | 178 +++++++++++++++++++++------------------ 2 files changed, 127 insertions(+), 116 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3c79cfa50..b40ed6a01 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -56,42 +56,42 @@ jobs: # files: lcov.info # test OpenMPI_jll - test-OpenMPI-jll: - name: Julia OpenMPI_jll - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - version: - - '1.8' - os: - - ubuntu-latest - arch: - - x64 + # test-OpenMPI-jll: + # name: Julia OpenMPI_jll - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # arch: + # - x64 - fail-fast: false - env: - JULIA_MPI_TEST_BINARY: OpenMPI_jll - JULIA_MPI_TEST_ABI: OpenMPI - steps: - - name: Checkout - uses: actions/checkout@v3 + # fail-fast: false + # env: + # JULIA_MPI_TEST_BINARY: OpenMPI_jll + # JULIA_MPI_TEST_ABI: OpenMPI + # steps: + # - name: Checkout + # uses: actions/checkout@v3 - - uses: julia-actions/setup-julia@latest - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 + # - uses: julia-actions/setup-julia@latest + # with: + # version: ${{ 
matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 - - name: use OpenMPI_jll - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_jll_binary("OpenMPI_jll") - rm("test/Manifest.toml") + # - name: use OpenMPI_jll + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_jll_binary("OpenMPI_jll") + # rm("test/Manifest.toml") - - uses: julia-actions/julia-runtest@latest + # - uses: julia-actions/julia-runtest@latest @@ -107,7 +107,6 @@ jobs: os: - ubuntu-latest mpi: - - libopenmpi-dev - libmpich-dev env: JULIA_MPI_TEST_BINARY: system diff --git a/test/runtests.jl b/test/runtests.jl index b359afe46..0dad559cb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,94 +77,106 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # Turing: - pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 13, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 2)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - end -end -@testset "Longer MPI" begin - n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [] + # test swapper pigeons( target = Pigeons.TestSwapper(0.5), - n_rounds = 14, - checked_round = 12, - n_chains = 200, - multithreaded = false, + n_rounds = 10, + checked_round = 3, recorder_builders = recorder_builders, checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 1)) + n_threads = 2)) + + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 10, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = n_mpis, + # n_threads = 2)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 10, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = n_mpis, + # n_threads = 2)) + # end end -@testset "Entanglement" begin - mpi_test(1, "entanglement_test.jl") - mpi_test(2, "entanglement_test.jl") +# @testset "Longer MPI" begin +# n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [] +# pigeons( +# target = Pigeons.TestSwapper(0.5), +# n_rounds = 14, +# checked_round = 12, +# n_chains = 200, +# multithreaded = false, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 1)) +# end - mpi_test(1, "reduce_test.jl") - mpi_test(2, "reduce_test.jl") - mpi_test(3, "reduce_test.jl") -end +# @testset "Entanglement" begin +# mpi_test(1, "entanglement_test.jl") +# mpi_test(2, "entanglement_test.jl") -@testset "PermutedDistributedArray" begin - mpi_test(1, "permuted_test.jl", options = ["-s"]) - mpi_test(1, "permuted_test.jl") - mpi_test(2, "permuted_test.jl") -end +# mpi_test(1, "reduce_test.jl") +# mpi_test(2, "reduce_test.jl") +# mpi_test(3, "reduce_test.jl") +# end -@testset "LoadBalance" begin - for i in 1:20 - for j in i:30 - test_load_balance(i, j) - end - end -end +# @testset "PermutedDistributedArray" begin +# mpi_test(1, "permuted_test.jl", options = ["-s"]) +# mpi_test(1, "permuted_test.jl") +# mpi_test(2, "permuted_test.jl") +# end + +# @testset "LoadBalance" begin +# for i in 1:20 +# for j in i:30 +# test_load_balance(i, j) +# end +# end +# end -@testset "LogSum" begin - m = Pigeons.LogSum() +# @testset "LogSum" begin +# m = Pigeons.LogSum() - fit!(m, 2.1) - fit!(m, 4) - v1 = value(m) - @assert v1 ≈ log(exp(2.1) + exp(4)) - - - fit!(m, 2.1) - fit!(m, 4) - m2 = Pigeons.LogSum() - fit!(m2, 50.1) - combined = merge(m, m2) - @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - - fit!(m, 2.1) - fit!(m, 4) - empty!(m) - @assert value(m) == -Pigeons.inf(0.0) -end +# fit!(m, 2.1) +# fit!(m, 4) +# v1 = value(m) +# @assert v1 ≈ log(exp(2.1) + exp(4)) + + +# fit!(m, 2.1) +# fit!(m, 4) +# m2 = Pigeons.LogSum() +# fit!(m2, 50.1) +# combined = merge(m, m2) +# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + +# fit!(m, 2.1) +# fit!(m, 4) +# empty!(m) +# 
@assert value(m) == -Pigeons.inf(0.0) +# end function test_split_slice() # test disjoint random streams @@ -183,17 +195,17 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -@testset "split_test" begin - test_split_slice() -end +# @testset "split_test" begin +# test_split_slice() +# end -@testset "Serialize" begin - mpi_test(1, "serialization_test.jl") -end +# @testset "Serialize" begin +# mpi_test(1, "serialization_test.jl") +# end -@testset "SliceSampler" begin - test_slice_sampler() -end +# @testset "SliceSampler" begin +# test_slice_sampler() +# end # clean-up logs ls = readdir() From 08479015c02e32611ae9960567a944fcd2e1a912 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:03:57 -0800 Subject: [PATCH 51/65] Fix last commit --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 0dad559cb..91ae3ea66 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -80,7 +80,7 @@ end # test swapper pigeons( - target = Pigeons.TestSwapper(0.5), + target = toy_mvn_target(1), n_rounds = 10, checked_round = 3, recorder_builders = recorder_builders, From f75a280352ede950ef89f753f62718db9a311032 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:16:58 -0800 Subject: [PATCH 52/65] toy_mvn not enough to manifest ghostbug, trying Turing --- test/runtests.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 91ae3ea66..9812eb08b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -79,28 +79,28 @@ end recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] # test swapper - pigeons( - target = toy_mvn_target(1), - n_rounds = 10, - checked_round = 3, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - 
- # # Turing: # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), + # target = toy_mvn_target(1), # n_rounds = 10, # checked_round = 3, - # multithreaded = true, # recorder_builders = recorder_builders, # checkpoint = true, # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], # n_local_mpi_processes = n_mpis, - # n_threads = 2)) + # n_threads = 2)) + + # # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 10, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = n_mpis, + n_threads = 2)) # # Blang: # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf # Pigeons.setup_blang("blangDemos") From 93033fc3530dbfd7dc6141e0c88ede97c0fb9dad Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 12:55:38 -0800 Subject: [PATCH 53/65] test mpich+openmpi using brew --- .github/workflows/CI.yml | 68 ++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b40ed6a01..6e16c0d8a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -92,11 +92,10 @@ jobs: # rm("test/Manifest.toml") # - uses: julia-actions/julia-runtest@latest - - - + + # test system MPI using Brew in macOS # adapted from MPI.jl - test-system-MPI-apt: + test-system-MPI-brew: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -105,18 +104,18 @@ jobs: version: - '1.8' os: - - ubuntu-latest + - macos-latest mpi: - - libmpich-dev + - mpich + - openmpi env: JULIA_MPI_TEST_BINARY: system + ZES_ENABLE_SYSMAN: 1 # https://github.com/open-mpi/ompi/issues/10142 steps: 
- uses: actions/checkout@v3 - - name: Install MPI via apt - run: | - sudo apt-get update - sudo apt-get install $MPI + - name: Install MPI via homebrew + run: brew install $MPI env: MPI: ${{ matrix.mpi }} @@ -139,9 +138,58 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 + # # adapted from MPI.jl + # test-system-MPI-apt: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # mpi: + # - libmpich-dev + # env: + # JULIA_MPI_TEST_BINARY: system + # steps: + # - uses: actions/checkout@v3 + + # - name: Install MPI via apt + # run: | + # sudo apt-get update + # sudo apt-get install $MPI + # env: + # MPI: ${{ matrix.mpi }} + + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: x64 + + # - uses: julia-actions/cache@v1 + + # - name: use system MPI + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_system_binary() + # rm("test/Manifest.toml") + + # - uses: julia-actions/julia-runtest@v1 + # docs: # name: Documentation # runs-on: ubuntu-latest From d46d429151ff60d70a852faefed7326eecbd11a4 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 13:09:17 -0800 Subject: [PATCH 54/65] fix wrong abi detection for mpich --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6e16c0d8a..3eb9ee88a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -138,6 +138,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() + run(`sed -i.bu 
's/unknown/MPICH/' test/LocalPreferences.toml`) # fix wrong abi detection for mpich rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 From 7a110746587fde6e8b66adc4ae5a0ad0b46d5a75 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 14:11:55 -0800 Subject: [PATCH 55/65] move xtra args to MPI struct + remove prints + failsafe for empty current project --- src/submission/ChildProcess.jl | 40 ++++++++++------------------------ src/submission/MPI.jl | 7 +++++- test/misc.jl | 6 ++++- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 94ba2ff3d..34c47ed50 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,34 +62,24 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = extra_mpi_args() - mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` - logfile = "Pigeons.log" - println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") - try - run(pipeline(cmd; stdout = logfile, stderr = logfile), wait = new_process.wait) - catch e - println("pipeline terminated with non-zero status. Dumping stdout+stderr:\n\n") - open(logfile, "r") do f - println(read(f, String)) - end - rethrow(e) - end + run(cmd, wait = new_process.wait) end end return Result{PT}(exec_folder) end -function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? 
`--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` -end - function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - cur_proj = dirname(Base.current_project()) - @info "forcing instantiate + precompile on project $cur_proj" - run(`$julia_bin --project=$cur_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache + julia_bin = Base.julia_cmd() + cur_proj = Base.current_project() + if !isnothing(cur_proj) + # instantiate the project to make sure dependencies exist + # also, precompile to issues with coordinating access to compilecache + dir = dirname(cur_proj) + @info "forcing instantiate + precompile on project $dir" + run(`$julia_bin --project=$dir -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) + end script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj @@ -142,15 +132,7 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - pid=string(getpid()) - println("hello from PID " * pid) - - println(pid * ": wd = " * pwd()) - println(pid * ": active_proj = " * dirname(Base.active_project()) ) - $dependency_declarations - println(pid * ": using Pigeons located @ " * dirname(pathof(Pigeons))) - $silence_code Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") diff --git a/src/submission/MPI.jl b/src/submission/MPI.jl index 8280fe749..eb74020e4 100644 --- a/src/submission/MPI.jl +++ b/src/submission/MPI.jl @@ -41,6 +41,11 @@ $FIELDS process. 
""" dependencies::Vector{Module} = [] + + """ + Extra arguments passed to mpiexec. + """ + mpiexec_args::String = "" end """ @@ -96,7 +101,7 @@ function mpi_submission_script(exec_folder, mpi_submission::MPI, julia_cmd) #PBS -e $info_folder/stderr.txt cd \$PBS_O_WORKDIR $(modules_string(mpi_settings)) - mpiexec --merge-stderr-to-stdout --output-filename $exec_folder $julia_cmd_str + mpiexec $(mpi_submission.mpiexec_args) --merge-stderr-to-stdout --output-filename $exec_folder $julia_cmd_str """ script_path = "$exec_folder/.submission_script.sh" write(script_path, code) diff --git a/test/misc.jl b/test/misc.jl index ec24109d5..6846d796d 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,7 +8,11 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - mpi_args = Pigeons.extra_mpi_args() + mpi_args = extra_mpi_args() run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end +end + +function extra_mpi_args() + MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` end \ No newline at end of file From d89911a8c2c449f319cd0f23c2e3b59789ecdcda Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 14:36:48 -0800 Subject: [PATCH 56/65] mpiexec args for childprocess --- src/submission/ChildProcess.jl | 7 ++++++- test/misc.jl | 2 +- test/runtests.jl | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 34c47ed50..7ee8f1f3e 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -41,6 +41,11 @@ $FIELDS When wait is false, the process' I/O streams are directed to devnull. """ wait = true + + """ + Extra arguments passed to mpiexec. 
+ """ + mpiexec_args::String = "" end """ @@ -62,7 +67,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end diff --git a/test/misc.jl b/test/misc.jl index 6846d796d..2cc89ebe2 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -14,5 +14,5 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` + MPIPreferences.abi == "OpenMPI" ? "--oversubscribe" : "" end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 9812eb08b..5599ad4d9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -100,7 +100,8 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 2, + mpiexec_args = extra_mpi_args())) # # Blang: # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf # Pigeons.setup_blang("blangDemos") From aa233e8a28a5e80253413635499ef29625622aca Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 19:40:45 -0800 Subject: [PATCH 57/65] re-introduce all CI tests + fix bug in building mpi cmd --- .github/workflows/CI.yml | 197 ++++++++++++++++----------------- src/submission/ChildProcess.jl | 6 +- test/runtests.jl | 5 - 3 files changed, 102 insertions(+), 106 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3eb9ee88a..d72b641cd 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,84 +14,80 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: + + # default test + 
test-MPICH-jll: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + - 'nightly' + os: + - ubuntu-latest + - macos-latest + - windows-latest + arch: + - x64 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v2 + with: + files: lcov.info - # test-default: - # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # fail-fast: false - # matrix: - # version: - # - '1.8' - # - 'nightly' - # os: - # - ubuntu-latest - # - macos-latest - # - windows-latest - # arch: - # - x64 - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - # - uses: julia-actions/setup-julia@v1 - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - # - uses: julia-actions/julia-buildpkg@v1 - - # - name: instantiate the test environment - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # - uses: julia-actions/julia-runtest@v1 - # - uses: julia-actions/julia-processcoverage@v1 - # - uses: codecov/codecov-action@v2 - # with: - # files: lcov.info - - # test OpenMPI_jll - # test-OpenMPI-jll: - # name: Julia OpenMPI_jll - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # arch: - # - x64 + # test OpenMPI by requesting it with 
MPIPreferences + # adapted from MPI.jl + test-OpenMPI-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + version: + - '1.8' + os: + - ubuntu-latest + arch: + - x64 - # fail-fast: false - # env: - # JULIA_MPI_TEST_BINARY: OpenMPI_jll - # JULIA_MPI_TEST_ABI: OpenMPI - # steps: - # - name: Checkout - # uses: actions/checkout@v3 - - # - uses: julia-actions/setup-julia@latest - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - - # - name: use OpenMPI_jll - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_jll_binary("OpenMPI_jll") - # rm("test/Manifest.toml") - - # - uses: julia-actions/julia-runtest@latest + fail-fast: false + env: + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_jll_binary("OpenMPI_jll") + rm("test/Manifest.toml") + + - uses: julia-actions/julia-runtest@latest # test system MPI using Brew in macOS # adapted from MPI.jl @@ -143,7 +139,10 @@ jobs: - uses: julia-actions/julia-runtest@v1 + # # test system MPI using apt in ubuntu # # adapted from MPI.jl + # # TODO: commented out because apt has older versions of MPICH and OMPI that + # # segfault with multithreading. 
Re-introduce them when apt pkgs are upgraded # test-system-MPI-apt: # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} # runs-on: ${{ matrix.os }} @@ -191,27 +190,27 @@ jobs: # - uses: julia-actions/julia-runtest@v1 - # docs: - # name: Documentation - # runs-on: ubuntu-latest - # permissions: - # contents: write - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - # - uses: julia-actions/setup-julia@v1 - # with: - # version: '1' - # - uses: julia-actions/julia-buildpkg@v1 - # - uses: julia-actions/julia-docdeploy@v1 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # - run: | - # julia --project=docs -e ' - # using Documenter: DocMeta, doctest - # using Pigeons - # DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) - # doctest(Pigeons)' + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + - uses: julia-actions/setup-julia@v1 + with: + version: '1' + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - run: | + julia --project=docs -e ' + using Documenter: DocMeta, doctest + using Pigeons + DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) + doctest(Pigeons)' diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 7ee8f1f3e..2afda0c67 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -67,8 +67,10 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` - cmd = `$mpi_cmd $julia_cmd` + args = new_process.mpiexec_args + 
mpi_cmd = length(args)>0 ? `$exe $args` : `$exe` # need this because `$("")` == `''` != `` + mpi_cmd = `$mpi_cmd -n $(new_process.n_local_mpi_processes)` + cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end end diff --git a/test/runtests.jl b/test/runtests.jl index 5599ad4d9..7a4823274 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -207,8 +207,3 @@ test_split_slice_helper(range) = [rand(r) for r in split_slice(range, Splittabl # @testset "SliceSampler" begin # test_slice_sampler() # end - -# clean-up logs -ls = readdir() -foreach(rm, filter(endswith(".log"), ls)) -foreach(rm, filter(endswith(".err"), ls)) From 42993fdc7d1e5cf23a89ddd9ab4d72af9ee4cf47 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 20:08:55 -0800 Subject: [PATCH 58/65] add support for using without Project.toml --- src/Pigeons.jl | 2 +- src/submission/ChildProcess.jl | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/Pigeons.jl b/src/Pigeons.jl index 9d6cc8734..b8b59cf24 100644 --- a/src/Pigeons.jl +++ b/src/Pigeons.jl @@ -8,7 +8,7 @@ import MPI: Comm, Allreduce, Comm_rank, Comm_dup, Request, Waitall, RequestSet, mpiexec, Allreduce, Allgather, Comm_split, isend, recv, - bcast, tag_ub, free + bcast, tag_ub using Base: Forward diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 2afda0c67..d96d6222e 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -78,20 +78,18 @@ function pigeons(pt_arguments, new_process::ChildProcess) end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - cur_proj = Base.current_project() - if !isnothing(cur_proj) + script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) + jl_cmd = Base.julia_cmd() + project_file = Base.current_project() + if !isnothing(project_file) # instantiate the project to make sure dependencies 
exist - # also, precompile to issues with coordinating access to compilecache - dir = dirname(cur_proj) - @info "forcing instantiate + precompile on project $dir" - run(`$julia_bin --project=$dir -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) + # also, precompile to avoid issues with coordinating access to compile cache + project_dir = dirname(project_file) + jl_cmd = `$jl_cmd --project=$project_dir` + println("Instantiating and pre-compiling project on $project_dir") + run(`$jl_cmd -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) end - script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) - return `$julia_bin - --project=$cur_proj - --threads=$n_threads - $script_path` + return `$jl_cmd --threads=$n_threads $script_path` end function launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) From 42bdd753f198a1a559ce37fcb326b7ad26c36f7d Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 20:15:10 -0800 Subject: [PATCH 59/65] add comment explaining why we wait on Isend --- src/mpi_utils/Entangler.jl | 4 +++- src/submission/ChildProcess.jl | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index a883b0dfe..52c211425 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -158,7 +158,7 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic e.current_received_bits .= true at_least_one_mpi = false - requests = RequestSet() + requests = RequestSet() # non-blocking requests that will be waited on # send (or copy if local) for local_index in 1:myload @@ -176,6 +176,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: + # note: we wait for the Isend request to avoid the application + # terminating in the last iteration without completing its 
request. request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) push!(requests, request) end diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index d96d6222e..6b7aad2be 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -86,7 +86,6 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil # also, precompile to avoid issues with coordinating access to compile cache project_dir = dirname(project_file) jl_cmd = `$jl_cmd --project=$project_dir` - println("Instantiating and pre-compiling project on $project_dir") run(`$jl_cmd -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) end return `$jl_cmd --threads=$n_threads $script_path` From c5cb70a36a7f5f35975b42a621f321aa7b95d7c9 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 10:54:07 -0800 Subject: [PATCH 60/65] mpiexec_args is a Cmd now --- src/submission/ChildProcess.jl | 16 ++++++++-------- src/submission/MPI.jl | 2 +- test/misc.jl | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 6b7aad2be..c3cf5ba3c 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -19,7 +19,7 @@ $FIELDS (if of type `String`) needed by the child process. """ - dependencies::Vector{Any} = [] + dependencies::Vector = [] # eventually, detect & save which # modules should be loaded? E.g. could use # https://stackoverflow.com/questions/25575406/list-of-loaded-imported-packages-in-julia @@ -34,20 +34,22 @@ $FIELDS third-party target distribution which somehow does not support multi-threading. """ - n_local_mpi_processes = 1 + n_local_mpi_processes::Int = 1 """ If wait is false, the process runs asynchronously. When wait is false, the process' I/O streams are directed to devnull. """ - wait = true + wait::Bool = true """ Extra arguments passed to mpiexec. 
""" - mpiexec_args::String = "" + mpiexec_args::Cmd = `` end + + """ $SIGNATURES @@ -67,9 +69,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - args = new_process.mpiexec_args - mpi_cmd = length(args)>0 ? `$exe $args` : `$exe` # need this because `$("")` == `''` != `` - mpi_cmd = `$mpi_cmd -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end @@ -82,7 +82,7 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil jl_cmd = Base.julia_cmd() project_file = Base.current_project() if !isnothing(project_file) - # instantiate the project to make sure dependencies exist + # forcing instantiate the project to make sure dependencies exist # also, precompile to avoid issues with coordinating access to compile cache project_dir = dirname(project_file) jl_cmd = `$jl_cmd --project=$project_dir` diff --git a/src/submission/MPI.jl b/src/submission/MPI.jl index eb74020e4..511a7d564 100644 --- a/src/submission/MPI.jl +++ b/src/submission/MPI.jl @@ -45,7 +45,7 @@ $FIELDS """ Extra arguments passed to mpiexec. """ - mpiexec_args::String = "" + mpiexec_args::Cmd = `` end """ diff --git a/test/misc.jl b/test/misc.jl index 2cc89ebe2..17e3432bb 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -14,5 +14,5 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? "--oversubscribe" : "" + MPIPreferences.abi == "OpenMPI" ? 
`--oversubscribe` : `` end \ No newline at end of file From 9e31b49affe8e8b099dfc365c86edcb0be6de5bc Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 11:25:09 -0800 Subject: [PATCH 61/65] re-instate all tests --- test/runtests.jl | 269 ++++++++++++++++++++++++----------------------- test/turing.jl | 2 + 2 files changed, 140 insertions(+), 131 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7a4823274..379f3f6a1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,65 +31,69 @@ function test_load_balance(n_processes, n_tasks) end end -# @testset "System MPI" begin -# if haskey(ENV,"JULIA_MPI_TEST_BINARY") -# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary -# end -# end - -# @testset "Stepping stone" begin -# pt = pigeons(target = toy_mvn_target(100)); -# p = stepping_stone_pair(pt) -# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) -# @test abs(p[1] - truth) < 1 -# @test abs(p[2] - truth) < 1 -# end - -# @testset "Round trips" begin -# n_chains = 4 -# n_rounds = 5 +@testset "MPI backend" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end + if haskey(ENV,"JULIA_MPI_TEST_ABI") + @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi + end +end + +@testset "Stepping stone" begin + pt = pigeons(target = toy_mvn_target(100)); + p = stepping_stone_pair(pt) + truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) + @test abs(p[1] - truth) < 1 + @test abs(p[2] - truth) < 1 +end + +@testset "Round trips" begin + n_chains = 4 + n_rounds = 5 -# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); + pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); -# len = 2^(n_rounds) -# truth = 0.0 -# for i in 0:(n_chains-1) -# truth += floor(max(len - i, 0) / n_chains / 2) -# end - -# @test truth == Pigeons.n_round_trips(pt) 
-# end - -# @testset "Moments" begin -# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); -# for var_name in Pigeons.continuous_variables(pt) -# m = mean(pt, var_name) -# for i in eachindex(m) -# @test abs(m[i] - 0.0) < 0.001 -# end -# v = var(pt, var_name) -# for i in eachindex(v) -# @test abs(v[i] - 0.1) < 0.001 -# end -# end -# end + len = 2^(n_rounds) + truth = 0.0 + for i in 0:(n_chains-1) + truth += floor(max(len - i, 0) / n_chains / 2) + end + + @test truth == Pigeons.n_round_trips(pt) +end + +@testset "Moments" begin + pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); + for var_name in Pigeons.continuous_variables(pt) + m = mean(pt, var_name) + for i in eachindex(m) + @test abs(m[i] - 0.0) < 0.001 + end + v = var(pt, var_name) + for i in eachindex(v) + @test abs(v[i] - 0.1) < 0.001 + end + end +end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] # test swapper - # pigeons( - # target = toy_mvn_target(1), - # n_rounds = 10, - # checked_round = 3, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 2)) - - # # Turing: + pigeons( + target = toy_mvn_target(1), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + + # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), n_rounds = 10, @@ -102,82 +106,85 @@ end n_local_mpi_processes = n_mpis, n_threads = 2, mpiexec_args = extra_mpi_args())) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after 
c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 10, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 2)) - # end + + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + end end -# @testset "Longer MPI" begin -# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [] -# pigeons( -# target = Pigeons.TestSwapper(0.5), -# n_rounds = 14, -# checked_round = 12, -# n_chains = 200, -# multithreaded = false, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 1)) -# end - -# @testset "Entanglement" begin -# mpi_test(1, "entanglement_test.jl") -# mpi_test(2, "entanglement_test.jl") - -# mpi_test(1, "reduce_test.jl") -# mpi_test(2, "reduce_test.jl") -# mpi_test(3, "reduce_test.jl") -# end - -# @testset "PermutedDistributedArray" begin -# mpi_test(1, "permuted_test.jl", options = ["-s"]) -# mpi_test(1, "permuted_test.jl") -# mpi_test(2, "permuted_test.jl") -# end - -# @testset "LoadBalance" begin -# for i in 1:20 -# for j in i:30 -# test_load_balance(i, j) -# end -# end -# end - -# @testset "LogSum" begin -# m = Pigeons.LogSum() - -# fit!(m, 2.1) -# fit!(m, 4) -# v1 = value(m) -# @assert v1 ≈ log(exp(2.1) + exp(4)) +@testset "Longer MPI" begin + n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf + recorder_builders = [] + pigeons( + target = Pigeons.TestSwapper(0.5), + n_rounds = 14, + checked_round = 12, + n_chains = 200, + multithreaded = false, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) +end +@testset "Entanglement" begin + mpi_test(1, "entanglement_test.jl") + mpi_test(2, "entanglement_test.jl") -# fit!(m, 2.1) -# fit!(m, 4) -# m2 = Pigeons.LogSum() -# fit!(m2, 50.1) -# combined = merge(m, m2) -# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + mpi_test(1, "reduce_test.jl") + mpi_test(2, "reduce_test.jl") + mpi_test(3, "reduce_test.jl") +end -# fit!(m, 2.1) -# fit!(m, 4) -# empty!(m) -# @assert value(m) == -Pigeons.inf(0.0) -# end +@testset "PermutedDistributedArray" begin + mpi_test(1, "permuted_test.jl", options = ["-s"]) + mpi_test(1, "permuted_test.jl") + mpi_test(2, "permuted_test.jl") +end + +@testset "LoadBalance" begin + for i in 1:20 + for j in i:30 + test_load_balance(i, j) + end + end +end + +@testset "LogSum" begin + m = Pigeons.LogSum() + + fit!(m, 2.1) + fit!(m, 4) + v1 = value(m) + @assert v1 ≈ log(exp(2.1) + exp(4)) + + + fit!(m, 2.1) + fit!(m, 4) + m2 = Pigeons.LogSum() + fit!(m2, 50.1) + combined = merge(m, m2) + @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + + fit!(m, 2.1) + fit!(m, 4) + empty!(m) + @assert value(m) == -Pigeons.inf(0.0) +end function test_split_slice() # test disjoint random streams @@ -196,14 +203,14 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -# @testset "split_test" begin -# test_split_slice() -# end +@testset "split_test" begin + test_split_slice() +end -# @testset "Serialize" begin -# mpi_test(1, "serialization_test.jl") -# end +@testset "Serialize" begin + mpi_test(1, "serialization_test.jl") +end -# @testset "SliceSampler" begin 
-# test_slice_sampler() -# end +@testset "SliceSampler" begin + test_slice_sampler() +end diff --git a/test/turing.jl b/test/turing.jl index d15d80143..f894bd40a 100644 --- a/test/turing.jl +++ b/test/turing.jl @@ -1,3 +1,5 @@ +# note: the models here don't use `filldist` in order to avoid importing +# Turing, which was crashing as of 2023-03-06 # Unconditioned coinflip model with `N` observations. @model function coinflip(y) p ~ Beta(1, 12) From 74a4355d256cc69a8876043605b8348a6f4f434e Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Sat, 11 Mar 2023 14:49:09 -0800 Subject: [PATCH 62/65] Test hypothesis that GC+multithread is issue; determine all MPIs affects --- test/gc_test.jl | 38 +++++ test/misc.jl | 2 +- test/runtests.jl | 383 ++++++++++++++++++++++++----------------------- 3 files changed, 233 insertions(+), 190 deletions(-) create mode 100644 test/gc_test.jl diff --git a/test/gc_test.jl b/test/gc_test.jl new file mode 100644 index 000000000..31622dec4 --- /dev/null +++ b/test/gc_test.jl @@ -0,0 +1,38 @@ +# based on test_threads.jl in MPI.jl, added forced GC + +using Test +using MPI + +@info "nthreads = $(Threads.nthreads())" + +MPI.Init(threadlevel=:multiple) + + +comm = MPI.COMM_WORLD +size = MPI.Comm_size(comm) +rank = MPI.Comm_rank(comm) + +const N = 10 + +dst = mod(rank+1, size) +src = mod(rank-1, size) + +send_arr = collect(1.0:N) +recv_arr = zeros(N) + +reqs = Array{MPI.Request}(undef, 2N) + +Threads.@threads for i = 1:N + reqs[N+i] = MPI.Irecv!(@view(recv_arr[i:i]), comm; source=src, tag=i) + reqs[i] = MPI.Isend(@view(send_arr[i:i]), comm; dest=dst, tag=i) + if i == 1 + GC.gc() + end + +end + +MPI.Waitall(reqs) + +@test recv_arr == send_arr + +MPI.Finalize() diff --git a/test/misc.jl b/test/misc.jl index 17e3432bb..c2351fcdd 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -9,7 +9,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end mpiexec() do exe mpi_args = extra_mpi_args() - run(`$exe $mpi_args -n $n_processes 
$(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) + run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) -t 2 --project=$project_folder $resolved_test_file $options`) end end diff --git a/test/runtests.jl b/test/runtests.jl index 379f3f6a1..63fc6facb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,197 +20,202 @@ include("misc.jl") include("slice_sampler_test.jl") include("turing.jl") -function test_load_balance(n_processes, n_tasks) - for p in 1:n_processes - lb = LoadBalance(p, n_processes, n_tasks) - globals = my_global_indices(lb) - @assert length(globals) == my_load(lb) - for g in globals - @assert find_process(lb, g) == p - end - end +@testset "GC+multithreading" begin + @info MPIPreferences.abi + mpi_test(2, "gc_test.jl") end -@testset "MPI backend" begin - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end - if haskey(ENV,"JULIA_MPI_TEST_ABI") - @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi - end -end - -@testset "Stepping stone" begin - pt = pigeons(target = toy_mvn_target(100)); - p = stepping_stone_pair(pt) - truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) - @test abs(p[1] - truth) < 1 - @test abs(p[2] - truth) < 1 -end - -@testset "Round trips" begin - n_chains = 4 - n_rounds = 5 +# function test_load_balance(n_processes, n_tasks) +# for p in 1:n_processes +# lb = LoadBalance(p, n_processes, n_tasks) +# globals = my_global_indices(lb) +# @assert length(globals) == my_load(lb) +# for g in globals +# @assert find_process(lb, g) == p +# end +# end +# end + +# @testset "MPI backend" begin +# if haskey(ENV,"JULIA_MPI_TEST_BINARY") +# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary +# end +# if haskey(ENV,"JULIA_MPI_TEST_ABI") +# @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi +# end +# end + +# @testset "Stepping stone" begin +# pt = pigeons(target = toy_mvn_target(100)); +# p = stepping_stone_pair(pt) +# truth = 
Pigeons.analytic_lognormalization(toy_mvn_target(100)) +# @test abs(p[1] - truth) < 1 +# @test abs(p[2] - truth) < 1 +# end + +# @testset "Round trips" begin +# n_chains = 4 +# n_rounds = 5 - pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); +# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); - len = 2^(n_rounds) - truth = 0.0 - for i in 0:(n_chains-1) - truth += floor(max(len - i, 0) / n_chains / 2) - end - - @test truth == Pigeons.n_round_trips(pt) -end - -@testset "Moments" begin - pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); - for var_name in Pigeons.continuous_variables(pt) - m = mean(pt, var_name) - for i in eachindex(m) - @test abs(m[i] - 0.0) < 0.001 - end - v = var(pt, var_name) - for i in eachindex(v) - @test abs(v[i] - 0.1) < 0.001 - end - end -end - -@testset "Parallelism Invariance" begin - n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - - # test swapper - pigeons( - target = toy_mvn_target(1), - n_rounds = 10, - checked_round = 3, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2, - mpiexec_args = extra_mpi_args())) - - # Turing: - pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 10, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 2, - mpiexec_args = extra_mpi_args())) - - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 10, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2, - mpiexec_args = extra_mpi_args())) - end -end - -@testset "Longer MPI" begin - n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [] - pigeons( - target = Pigeons.TestSwapper(0.5), - n_rounds = 14, - checked_round = 12, - n_chains = 200, - multithreaded = false, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2, - mpiexec_args = extra_mpi_args())) -end - -@testset "Entanglement" begin - mpi_test(1, "entanglement_test.jl") - mpi_test(2, "entanglement_test.jl") - - mpi_test(1, "reduce_test.jl") - mpi_test(2, "reduce_test.jl") - mpi_test(3, "reduce_test.jl") -end - -@testset "PermutedDistributedArray" begin - mpi_test(1, "permuted_test.jl", options = ["-s"]) - mpi_test(1, "permuted_test.jl") - mpi_test(2, "permuted_test.jl") -end - -@testset "LoadBalance" begin - for i in 1:20 - for j in i:30 - test_load_balance(i, j) - end - end -end - -@testset "LogSum" begin - m = Pigeons.LogSum() +# len = 2^(n_rounds) +# truth = 0.0 +# for i in 0:(n_chains-1) +# truth += floor(max(len - i, 0) / n_chains / 2) +# end + +# @test truth == Pigeons.n_round_trips(pt) +# end + +# @testset "Moments" begin +# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); +# for var_name in Pigeons.continuous_variables(pt) +# m = mean(pt, var_name) +# for i in eachindex(m) +# @test abs(m[i] - 0.0) < 0.001 +# end +# v = var(pt, var_name) +# for i in eachindex(v) +# @test abs(v[i] - 0.1) < 0.001 +# end +# end +# end + +# @testset "Parallelism Invariance" begin +# n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + +# # test swapper +# pigeons( +# target = toy_mvn_target(1), +# n_rounds = 10, +# checked_round = 3, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 2, +# mpiexec_args = extra_mpi_args())) + +# # Turing: +# pigeons( +# target = TuringLogPotential(flip_model_unidentifiable()), +# n_rounds = 10, +# checked_round = 3, +# multithreaded = true, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], +# n_local_mpi_processes = n_mpis, +# n_threads = 2, +# mpiexec_args = extra_mpi_args())) + +# # Blang: +# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf +# Pigeons.setup_blang("blangDemos") +# pigeons(; +# target = Pigeons.blang_ising(), +# n_rounds = 10, +# checked_round = 3, +# recorder_builders = recorder_builders, +# multithreaded = true, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 2, +# mpiexec_args = extra_mpi_args())) +# end +# end + +# @testset "Longer MPI" begin +# n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [] +# pigeons( +# target = Pigeons.TestSwapper(0.5), +# n_rounds = 14, +# checked_round = 12, +# n_chains = 200, +# multithreaded = false, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 2, +# mpiexec_args = extra_mpi_args())) +# end + +# @testset "Entanglement" begin +# mpi_test(1, "entanglement_test.jl") +# mpi_test(2, "entanglement_test.jl") + +# mpi_test(1, "reduce_test.jl") +# mpi_test(2, "reduce_test.jl") +# mpi_test(3, "reduce_test.jl") +# end + +# @testset "PermutedDistributedArray" begin +# mpi_test(1, "permuted_test.jl", options = ["-s"]) +# mpi_test(1, "permuted_test.jl") +# mpi_test(2, "permuted_test.jl") +# end + +# @testset "LoadBalance" begin +# for i in 1:20 +# for j in i:30 +# test_load_balance(i, j) +# end +# end +# end + +# @testset "LogSum" begin +# m = Pigeons.LogSum() - fit!(m, 2.1) - fit!(m, 4) - v1 = value(m) - @assert v1 ≈ log(exp(2.1) + exp(4)) - - - fit!(m, 2.1) - fit!(m, 4) - m2 = Pigeons.LogSum() - fit!(m2, 50.1) - combined = merge(m, m2) - @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - - fit!(m, 2.1) - fit!(m, 4) - empty!(m) - @assert value(m) == -Pigeons.inf(0.0) -end - -function test_split_slice() - # test disjoint random streams - set = Set{Float64}() - push!(set, test_split_slice_helper(1:10)...) - push!(set, test_split_slice_helper(11:20)...) - @test length(set) == 20 - - # test overlapping - set = Set{Float64}() - push!(set, test_split_slice_helper(1:15)...) - push!(set, test_split_slice_helper(10:20)...) 
- @test length(set) == 20 - return true -end - -test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] - -@testset "split_test" begin - test_split_slice() -end - -@testset "Serialize" begin - mpi_test(1, "serialization_test.jl") -end - -@testset "SliceSampler" begin - test_slice_sampler() -end +# fit!(m, 2.1) +# fit!(m, 4) +# v1 = value(m) +# @assert v1 ≈ log(exp(2.1) + exp(4)) + + +# fit!(m, 2.1) +# fit!(m, 4) +# m2 = Pigeons.LogSum() +# fit!(m2, 50.1) +# combined = merge(m, m2) +# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + +# fit!(m, 2.1) +# fit!(m, 4) +# empty!(m) +# @assert value(m) == -Pigeons.inf(0.0) +# end + +# function test_split_slice() +# # test disjoint random streams +# set = Set{Float64}() +# push!(set, test_split_slice_helper(1:10)...) +# push!(set, test_split_slice_helper(11:20)...) +# @test length(set) == 20 + +# # test overlapping +# set = Set{Float64}() +# push!(set, test_split_slice_helper(1:15)...) +# push!(set, test_split_slice_helper(10:20)...) 
+# @test length(set) == 20 +# return true +# end + +# test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] + +# @testset "split_test" begin +# test_split_slice() +# end + +# @testset "Serialize" begin +# mpi_test(1, "serialization_test.jl") +# end + +# @testset "SliceSampler" begin +# test_slice_sampler() +# end From 7454ae9363747d62d2915aadb93db9de8b8be5e1 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 15:39:59 -0800 Subject: [PATCH 63/65] force instantiate in mpi_test + add MicrosoftMPI test --- .github/workflows/CI.yml | 49 +++++++++++++++++++++++++++++++++++++--- test/misc.jl | 4 +++- test/runtests.jl | 20 ++++++++-------- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index d72b641cd..88247c20d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -15,7 +15,7 @@ concurrency: jobs: - # default test + # test using MPICH as MPI backend (default in MPI.jl) test-MPICH-jll: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} @@ -45,7 +45,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v2 + - uses: codecov/codecov-action@v3 with: files: lcov.info @@ -53,7 +53,7 @@ jobs: # test OpenMPI by requesting it with MPIPreferences # adapted from MPI.jl test-OpenMPI-jll: - name: Julia OpenMPI_jll - ${{ github.event_name }} + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - OpenMPI_jll - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: matrix: @@ -89,6 +89,47 @@ jobs: - uses: julia-actions/julia-runtest@latest + + # test MicrosoftMPI by requesting it with MPIPreferences + # adapted from MPI.jl + test-MicrosoftMPI-jll: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - MicrosoftMPI_jll - ${{ github.event_name }} 
+ runs-on: ${{ matrix.os }} + strategy: + matrix: + version: + - '1.8' + os: + - windows-latest + arch: + - x64 + + fail-fast: false + env: + JULIA_MPI_TEST_BINARY: MicrosoftMPI_jll + JULIA_MPI_TEST_ABI: MicrosoftMPI + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + + - name: use MicrosoftMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_jll_binary("MicrosoftMPI_jll") + rm("test/Manifest.toml") + + - uses: julia-actions/julia-runtest@latest + + # test system MPI using Brew in macOS # adapted from MPI.jl test-system-MPI-brew: @@ -139,6 +180,7 @@ jobs: - uses: julia-actions/julia-runtest@v1 + # # test system MPI using apt in ubuntu # # adapted from MPI.jl # # TODO: commented out because apt has older versions of MPICH and OMPI that @@ -190,6 +232,7 @@ jobs: # - uses: julia-actions/julia-runtest@v1 + docs: name: Documentation runs-on: ubuntu-latest diff --git a/test/misc.jl b/test/misc.jl index c2351fcdd..5e2a65dda 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -1,5 +1,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) + jl_cmd = Base.julia_cmd() project_folder = dirname(Base.current_project()) + run(`$jl_cmd --project=$(project_folder) -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # handle 2 different "modes" that tests can be ran (for julia 1.0,1.1 vs. 
>1.1) resolved_test_file = if isfile("$project_folder/$test_file") @@ -9,7 +11,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end mpiexec() do exe mpi_args = extra_mpi_args() - run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) -t 2 --project=$project_folder $resolved_test_file $options`) + run(`$exe $mpi_args -n $n_processes $jl_cmd -t 2 --project=$project_folder $resolved_test_file $options`) end end diff --git a/test/runtests.jl b/test/runtests.jl index 63fc6facb..21ce8b054 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,8 +20,17 @@ include("misc.jl") include("slice_sampler_test.jl") include("turing.jl") +@testset "MPI backend" begin + @info "MPI: using $(MPIPreferences.abi) ($(MPIPreferences.binary))" + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end + if haskey(ENV,"JULIA_MPI_TEST_ABI") + @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi + end +end + @testset "GC+multithreading" begin - @info MPIPreferences.abi mpi_test(2, "gc_test.jl") end @@ -36,15 +45,6 @@ end # end # end -# @testset "MPI backend" begin -# if haskey(ENV,"JULIA_MPI_TEST_BINARY") -# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary -# end -# if haskey(ENV,"JULIA_MPI_TEST_ABI") -# @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi -# end -# end - # @testset "Stepping stone" begin # pt = pigeons(target = toy_mvn_target(100)); # p = stepping_stone_pair(pt) From 18ef6559f41d80c95d8e4430f8e326dd6e498bc8 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 16:03:49 -0800 Subject: [PATCH 64/65] adding back all test --- test/runtests.jl | 364 +++++++++++++++++++++++------------------------ 1 file changed, 182 insertions(+), 182 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 21ce8b054..a4e91bb12 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,6 +20,17 @@ include("misc.jl") include("slice_sampler_test.jl") include("turing.jl") 
+function test_load_balance(n_processes, n_tasks) + for p in 1:n_processes + lb = LoadBalance(p, n_processes, n_tasks) + globals = my_global_indices(lb) + @assert length(globals) == my_load(lb) + for g in globals + @assert find_process(lb, g) == p + end + end +end + @testset "MPI backend" begin @info "MPI: using $(MPIPreferences.abi) ($(MPIPreferences.binary))" if haskey(ENV,"JULIA_MPI_TEST_BINARY") @@ -34,188 +45,177 @@ end mpi_test(2, "gc_test.jl") end -# function test_load_balance(n_processes, n_tasks) -# for p in 1:n_processes -# lb = LoadBalance(p, n_processes, n_tasks) -# globals = my_global_indices(lb) -# @assert length(globals) == my_load(lb) -# for g in globals -# @assert find_process(lb, g) == p -# end -# end -# end - -# @testset "Stepping stone" begin -# pt = pigeons(target = toy_mvn_target(100)); -# p = stepping_stone_pair(pt) -# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) -# @test abs(p[1] - truth) < 1 -# @test abs(p[2] - truth) < 1 -# end - -# @testset "Round trips" begin -# n_chains = 4 -# n_rounds = 5 +@testset "Stepping stone" begin + pt = pigeons(target = toy_mvn_target(100)); + p = stepping_stone_pair(pt) + truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) + @test abs(p[1] - truth) < 1 + @test abs(p[2] - truth) < 1 +end + +@testset "Round trips" begin + n_chains = 4 + n_rounds = 5 -# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); + pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); -# len = 2^(n_rounds) -# truth = 0.0 -# for i in 0:(n_chains-1) -# truth += floor(max(len - i, 0) / n_chains / 2) -# end - -# @test truth == Pigeons.n_round_trips(pt) -# end - -# @testset "Moments" begin -# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); -# for var_name in Pigeons.continuous_variables(pt) -# m = mean(pt, var_name) -# for i in eachindex(m) -# 
@test abs(m[i] - 0.0) < 0.001 -# end -# v = var(pt, var_name) -# for i in eachindex(v) -# @test abs(v[i] - 0.1) < 0.001 -# end -# end -# end - -# @testset "Parallelism Invariance" begin -# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - -# # test swapper -# pigeons( -# target = toy_mvn_target(1), -# n_rounds = 10, -# checked_round = 3, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 2, -# mpiexec_args = extra_mpi_args())) - -# # Turing: -# pigeons( -# target = TuringLogPotential(flip_model_unidentifiable()), -# n_rounds = 10, -# checked_round = 3, -# multithreaded = true, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], -# n_local_mpi_processes = n_mpis, -# n_threads = 2, -# mpiexec_args = extra_mpi_args())) - -# # Blang: -# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf -# Pigeons.setup_blang("blangDemos") -# pigeons(; -# target = Pigeons.blang_ising(), -# n_rounds = 10, -# checked_round = 3, -# recorder_builders = recorder_builders, -# multithreaded = true, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 2, -# mpiexec_args = extra_mpi_args())) -# end -# end - -# @testset "Longer MPI" begin -# n_mpis = Sys.iswindows() ? 
1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [] -# pigeons( -# target = Pigeons.TestSwapper(0.5), -# n_rounds = 14, -# checked_round = 12, -# n_chains = 200, -# multithreaded = false, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 2, -# mpiexec_args = extra_mpi_args())) -# end - -# @testset "Entanglement" begin -# mpi_test(1, "entanglement_test.jl") -# mpi_test(2, "entanglement_test.jl") - -# mpi_test(1, "reduce_test.jl") -# mpi_test(2, "reduce_test.jl") -# mpi_test(3, "reduce_test.jl") -# end - -# @testset "PermutedDistributedArray" begin -# mpi_test(1, "permuted_test.jl", options = ["-s"]) -# mpi_test(1, "permuted_test.jl") -# mpi_test(2, "permuted_test.jl") -# end - -# @testset "LoadBalance" begin -# for i in 1:20 -# for j in i:30 -# test_load_balance(i, j) -# end -# end -# end - -# @testset "LogSum" begin -# m = Pigeons.LogSum() + len = 2^(n_rounds) + truth = 0.0 + for i in 0:(n_chains-1) + truth += floor(max(len - i, 0) / n_chains / 2) + end + + @test truth == Pigeons.n_round_trips(pt) +end + +@testset "Moments" begin + pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); + for var_name in Pigeons.continuous_variables(pt) + m = mean(pt, var_name) + for i in eachindex(m) + @test abs(m[i] - 0.0) < 0.001 + end + v = var(pt, var_name) + for i in eachindex(v) + @test abs(v[i] - 0.1) < 0.001 + end + end +end + +@testset "Parallelism Invariance" begin + n_mpis = 4 + recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + + # test swapper + pigeons( + target = toy_mvn_target(1), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + + # Turing: + pigeons( + target = 
TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 10, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + end +end + +@testset "Longer MPI" begin + n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf + recorder_builders = [] + pigeons( + target = Pigeons.TestSwapper(0.5), + n_rounds = 14, + checked_round = 12, + n_chains = 200, + multithreaded = false, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) +end + +@testset "Entanglement" begin + mpi_test(1, "entanglement_test.jl") + mpi_test(2, "entanglement_test.jl") + + mpi_test(1, "reduce_test.jl") + mpi_test(2, "reduce_test.jl") + mpi_test(3, "reduce_test.jl") +end + +@testset "PermutedDistributedArray" begin + mpi_test(1, "permuted_test.jl", options = ["-s"]) + mpi_test(1, "permuted_test.jl") + mpi_test(2, "permuted_test.jl") +end + +@testset "LoadBalance" begin + for i in 1:20 + for j in i:30 + test_load_balance(i, j) + end + end +end + +@testset "LogSum" begin + m = Pigeons.LogSum() -# fit!(m, 2.1) -# fit!(m, 4) -# v1 = value(m) -# @assert v1 ≈ log(exp(2.1) + exp(4)) - - -# fit!(m, 2.1) -# fit!(m, 4) -# m2 = Pigeons.LogSum() -# fit!(m2, 50.1) -# 
combined = merge(m, m2) -# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - -# fit!(m, 2.1) -# fit!(m, 4) -# empty!(m) -# @assert value(m) == -Pigeons.inf(0.0) -# end - -# function test_split_slice() -# # test disjoint random streams -# set = Set{Float64}() -# push!(set, test_split_slice_helper(1:10)...) -# push!(set, test_split_slice_helper(11:20)...) -# @test length(set) == 20 - -# # test overlapping -# set = Set{Float64}() -# push!(set, test_split_slice_helper(1:15)...) -# push!(set, test_split_slice_helper(10:20)...) -# @test length(set) == 20 -# return true -# end - -# test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] - -# @testset "split_test" begin -# test_split_slice() -# end - -# @testset "Serialize" begin -# mpi_test(1, "serialization_test.jl") -# end - -# @testset "SliceSampler" begin -# test_slice_sampler() -# end + fit!(m, 2.1) + fit!(m, 4) + v1 = value(m) + @assert v1 ≈ log(exp(2.1) + exp(4)) + + + fit!(m, 2.1) + fit!(m, 4) + m2 = Pigeons.LogSum() + fit!(m2, 50.1) + combined = merge(m, m2) + @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + + fit!(m, 2.1) + fit!(m, 4) + empty!(m) + @assert value(m) == -Pigeons.inf(0.0) +end + +function test_split_slice() + # test disjoint random streams + set = Set{Float64}() + push!(set, test_split_slice_helper(1:10)...) + push!(set, test_split_slice_helper(11:20)...) + @test length(set) == 20 + + # test overlapping + set = Set{Float64}() + push!(set, test_split_slice_helper(1:15)...) + push!(set, test_split_slice_helper(10:20)...) 
+ @test length(set) == 20 + return true +end + +test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] + +@testset "split_test" begin + test_split_slice() +end + +@testset "Serialize" begin + mpi_test(1, "serialization_test.jl") +end + +@testset "SliceSampler" begin + test_slice_sampler() +end From b9c34d97eb51beff7cba3993abe25e128319527e Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Sun, 12 Mar 2023 20:23:15 -0700 Subject: [PATCH 65/65] Change mpi_active impl to fix open mpi bug Before this, we had crashes such as in 18ef6559f41d80c95d8e4430f8e326dd6e498bc8 Here is what was happening in these crashes: 1. parent process is using a system library 2. parent process' test first calls local pigeons 3. that in turn called mpi_active() which internally used MPI.Init() to see if Comm_size > 1 4. that had the side effect of changing ENV 5. then, when calling pigeons MPI tests, these ENV variables were passed to ChildProcess, causing problems The new approach avoids calling MPI.Init() frivolously --- src/includes.jl | 2 +- src/mpi_utils/Entangler.jl | 18 ----------------- .../{one_per_host.jl => misc_mpi_utils.jl} | 20 +++++++++++++++++++ src/submission/ChildProcess.jl | 16 +++++++-------- src/submission/MPI.jl | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) rename src/mpi_utils/{one_per_host.jl => misc_mpi_utils.jl} (59%) diff --git a/src/includes.jl b/src/includes.jl index c5343d7fe..0745c8c28 100644 --- a/src/includes.jl +++ b/src/includes.jl @@ -44,7 +44,7 @@ include("targets/TuringLogPotential.jl") include("paths/InterpolatingPath.jl") include("targets/target.jl") include("pt/checks.jl") -include("mpi_utils/one_per_host.jl") +include("mpi_utils/misc_mpi_utils.jl") include("mpi_utils/LoadBalance.jl") include("mpi_utils/Entangler.jl") include("mpi_utils/PermutedDistributedArray.jl") diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 52c211425..986e75cc6 100644 --- a/src/mpi_utils/Entangler.jl
+++ b/src/mpi_utils/Entangler.jl @@ -89,24 +89,6 @@ mutable struct Entangler end end -# use this to force mpi_active() to return false -const silence_mpi = Ref(false) - -""" -$SIGNATURES - -Detect if more than one MPI processes can be found. -""" -mpi_active() = - if silence_mpi[] - false - else - init_mpi() - Comm_size(COMM_WORLD) > 1 - end - -init_mpi() = Init(threadlevel = :funneled) - """ $SIGNATURES diff --git a/src/mpi_utils/one_per_host.jl b/src/mpi_utils/misc_mpi_utils.jl similarity index 59% rename from src/mpi_utils/one_per_host.jl rename to src/mpi_utils/misc_mpi_utils.jl index 31da18434..e5df400ca 100644 --- a/src/mpi_utils/one_per_host.jl +++ b/src/mpi_utils/misc_mpi_utils.jl @@ -1,3 +1,23 @@ +""" +$SIGNATURES + +A flag is set by launch scripts (see ChildProcess.jl) to indicate +if this process is a child MPI process under an mpiexec. +Otherwise, that flag is false by default. + +This function retrieves the value of that flag. +""" +mpi_active() = mpi_active_ref[] + +const mpi_active_ref = Ref(false) + +#= +Rationale for :funneled / threading model: + - all the swap logic is single threaded + - multithreading occurs in exploration only +=# +init_mpi() = Init(threadlevel = :funneled) + """ For benchmarking purpose: subset the communicator so that at most one MPI process runs in each machine. 
diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index c3cf5ba3c..c21fabe66 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -63,7 +63,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) exec_folder, new_process.dependencies, new_process.n_threads, - new_process.n_local_mpi_processes == 1 + new_process.n_local_mpi_processes > 1 ) if new_process.n_local_mpi_processes == 1 run(julia_cmd, wait = new_process.wait) @@ -77,8 +77,8 @@ function pigeons(pt_arguments, new_process::ChildProcess) return Result{PT}(exec_folder) end -function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) +function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, on_mpi::Bool) + script_path = launch_script(pt_arguments, exec_folder, dependencies, on_mpi) jl_cmd = Base.julia_cmd() project_file = Base.current_project() if !isnothing(project_file) @@ -91,7 +91,7 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil return `$jl_cmd --threads=$n_threads $script_path` end -function launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) +function launch_script(pt_arguments, exec_folder, dependencies, on_mpi) path_to_serialized_pt_arguments = "$exec_folder/.pt_argument.jls" path_to_serialized_immutables = "$exec_folder/immutables.jls" @@ -108,7 +108,7 @@ function launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) path_to_serialized_pt_arguments, path_to_serialized_immutables, dependencies, - silence_mpi) + on_mpi) script_path = "$exec_folder/.launch_script.jl" write(script_path, code) return script_path @@ -119,7 +119,7 @@ function launch_code( path_to_serialized_pt_arguments::AbstractString, path_to_serialized_immutables::AbstractString, dependencies, - silence_mpi) + on_mpi) modules = copy(dependencies) push!(modules, 
Serialization) push!(modules, Pigeons) @@ -130,14 +130,14 @@ function launch_code( # when running check_against_serial(), the # child process still detects it is under MPI, so # we need to force it to ignore that - silence_code = silence_mpi ? "Pigeons.silence_mpi[] = true" : "" + mpi_flag = on_mpi ? "Pigeons.mpi_active_ref[] = true" : "" # Might be better with quote? # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ $dependency_declarations - $silence_code + $mpi_flag Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") diff --git a/src/submission/MPI.jl b/src/submission/MPI.jl index 511a7d564..bbce9824e 100644 --- a/src/submission/MPI.jl +++ b/src/submission/MPI.jl @@ -63,7 +63,7 @@ function pigeons(pt_arguments, mpi_submission::MPI) exec_folder, mpi_submission.dependencies, mpi_submission.n_threads, - false + true # set mpi_active_ref flag to true ) # generate qsub script