From 23b59d30fd4917a05afc668487a476fb0344e0a0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 09:29:45 -0800 Subject: [PATCH 01/63] Keeping only the suspected faulty test (by commenting out rest (!)) --- test/runtests.jl | 230 +++++++++++++++++++++++------------------------ 1 file changed, 115 insertions(+), 115 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 26604bf25..ae31c943f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -37,73 +37,73 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "Stepping stone" begin - pt = pigeons(target = toy_mvn_target(100)); - p = stepping_stone_pair(pt) - truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) - @test abs(p[1] - truth) < 1 - @test abs(p[2] - truth) < 1 -end - -@testset "Round trips" begin - n_chains = 4 - n_rounds = 5 +# @testset "Stepping stone" begin +# pt = pigeons(target = toy_mvn_target(100)); +# p = stepping_stone_pair(pt) +# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) +# @test abs(p[1] - truth) < 1 +# @test abs(p[2] - truth) < 1 +# end + +# @testset "Round trips" begin +# n_chains = 4 +# n_rounds = 5 - pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); +# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); - len = 2^(n_rounds) - truth = 0.0 - for i in 0:(n_chains-1) - truth += floor(max(len - i, 0) / n_chains / 2) - end - - @test truth == Pigeons.n_round_trips(pt) -end - -@testset "Moments" begin - pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); - for var_name in Pigeons.continuous_variables(pt) - m = mean(pt, var_name) - for i in eachindex(m) - @test abs(m[i] - 0.0) < 0.001 - end - v = var(pt, var_name) - for i in eachindex(v) - @test abs(v[i] - 0.1) < 0.001 - end - end -end - -@testset "Parallelism Invariance" begin - n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # Turing: - pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 4, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Turing, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 2)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 4, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - end -end +# len = 2^(n_rounds) +# truth = 0.0 +# for i in 0:(n_chains-1) +# truth += floor(max(len - i, 0) / n_chains / 2) +# end + +# @test truth == Pigeons.n_round_trips(pt) +# end + +# @testset "Moments" begin +# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); +# for var_name in Pigeons.continuous_variables(pt) +# m = mean(pt, var_name) +# for i in eachindex(m) +# @test abs(m[i] - 0.0) < 0.001 +# end +# v = var(pt, var_name) +# for i in eachindex(v) +# @test abs(v[i] - 0.1) < 0.001 +# end +# end +# end + +# @testset "Parallelism Invariance" begin +# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] +# # Turing: +# pigeons( +# target = TuringLogPotential(flip_model_unidentifiable()), +# n_rounds = 4, +# checked_round = 3, +# multithreaded = true, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# dependencies = [Turing, LinearAlgebra, "turing.jl"], +# n_local_mpi_processes = n_mpis, +# n_threads = 2)) +# # Blang: +# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf +# Pigeons.setup_blang("blangDemos") +# pigeons(; +# target = Pigeons.blang_ising(), +# n_rounds = 4, +# checked_round = 3, +# recorder_builders = recorder_builders, +# multithreaded = true, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 2)) +# end +# end @testset "Longer MPI" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -121,50 +121,50 @@ end n_threads = 1)) end -@testset "Entanglement" begin - mpi_test(1, "entanglement_test.jl") - mpi_test(2, "entanglement_test.jl") +# @testset "Entanglement" begin +# mpi_test(1, "entanglement_test.jl") +# mpi_test(2, "entanglement_test.jl") + +# mpi_test(1, "reduce_test.jl") +# mpi_test(2, "reduce_test.jl") +# mpi_test(3, "reduce_test.jl") +# end + +# @testset "PermutedDistributedArray" begin +# mpi_test(1, "permuted_test.jl", options = ["-s"]) +# mpi_test(1, "permuted_test.jl") +# mpi_test(2, "permuted_test.jl") +# end + +# @testset "LoadBalance" begin +# for i in 1:20 +# for j in i:30 +# test_load_balance(i, j) +# end +# end +# end + +# @testset "LogSum" begin +# m = Pigeons.LogSum() + +# fit!(m, 2.1) +# fit!(m, 4) +# v1 = value(m) +# @assert v1 ≈ log(exp(2.1) + exp(4)) - mpi_test(1, "reduce_test.jl") - mpi_test(2, "reduce_test.jl") - mpi_test(3, "reduce_test.jl") -end -@testset "PermutedDistributedArray" begin - mpi_test(1, "permuted_test.jl", options = ["-s"]) - mpi_test(1, "permuted_test.jl") - mpi_test(2, "permuted_test.jl") -end +# fit!(m, 2.1) +# fit!(m, 4) +# m2 = Pigeons.LogSum() +# fit!(m2, 50.1) +# combined = merge(m, m2) +# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) -@testset "LoadBalance" begin - for i in 1:20 - for j in i:30 - test_load_balance(i, j) - end - end -end - -@testset "LogSum" begin - m = Pigeons.LogSum() - - fit!(m, 2.1) - fit!(m, 4) - v1 = value(m) - @assert v1 ≈ log(exp(2.1) + exp(4)) - - - fit!(m, 2.1) - fit!(m, 4) - m2 = Pigeons.LogSum() - fit!(m2, 50.1) - combined = merge(m, m2) - @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - - fit!(m, 2.1) - fit!(m, 4) - empty!(m) - @assert value(m) == -Pigeons.inf(0.0) -end +# fit!(m, 2.1) +# fit!(m, 4) +# empty!(m) +# @assert value(m) == -Pigeons.inf(0.0) +# end function test_split_slice() # test disjoint random streams @@ -183,14 +183,14 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -@testset "split_test" begin - test_split_slice() -end +# @testset "split_test" begin +# test_split_slice() +# end -@testset "Serialize" begin - mpi_test(1, "serialization_test.jl") -end +# @testset "Serialize" begin +# mpi_test(1, "serialization_test.jl") +# end -@testset "SliceSampler" begin - test_slice_sampler() -end \ No newline at end of file +# @testset "SliceSampler" begin +# test_slice_sampler() +# end \ No newline at end of file From 40001d0b6aeb495e938b80927cc1c61bc77310a6 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 09:57:39 -0800 Subject: [PATCH 02/63] Removing Turing from test to isolate only one error at the time --- test/runtests.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ae31c943f..9906c351f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,9 +8,9 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] - Pkg.add(i) -end +# for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] +# Pkg.add(i) +# end using Test using Distributions @@ -18,13 +18,13 @@ using Random using Statistics using OnlineStats using LinearAlgebra -using Turing +# using Turing using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice -include("slice_sampler_test.jl") -include("turing.jl") +#include("slice_sampler_test.jl") +# include("turing.jl") function test_load_balance(n_processes, n_tasks) for p in 1:n_processes From ff5aac9048b98b18119445a3e0071dc432cfe66b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:16:43 -0800 Subject: [PATCH 03/63] Check behaviour on different threadlevels --- src/mpi_utils/Entangler.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index fd93d4050..842f165e1 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -73,7 +73,7 @@ mutable struct Entangler println("Entangler initialized 1 process (without MPI); $(Threads.nthreads())") end else - Init(threadlevel = :funneled) + init_mpi() comm = Comm_dup(parent_communicator) transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) my_process_index = Comm_rank(comm) + 1 @@ -101,10 +101,12 @@ mpi_active() = if silence_mpi[] false else - Init(threadlevel = :funneled) + init_mpi() Comm_size(COMM_WORLD) > 1 end +init_mpi() = Init() #threadlevel = :funneled) + """ $SIGNATURES From d00ff5347fd9fd80884a38b84561e615aa521ba5 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:34:10 -0800 Subject: [PATCH 04/63] Back to same threadlevel as not changing crashing behaviour --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 842f165e1..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init() #threadlevel = :funneled) +init_mpi() = Init(threadlevel = :funneled) """ $SIGNATURES From 69bb270c62040bdbd9281d86dd8c1f79ca641252 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:39:07 -0800 Subject: [PATCH 05/63] More conservative tag ub? --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..ba026a706 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -75,7 +75,7 @@ mutable struct Entangler else init_mpi() comm = Comm_dup(parent_communicator) - transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) + transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices / 2) my_process_index = Comm_rank(comm) + 1 n_processes = Comm_size(comm) if verbose && my_process_index == 1 From d94252f915775119bd2317f91d19a807aee138cc Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 10:56:43 -0800 Subject: [PATCH 06/63] Revert "More conservative tag ub?" This reverts commit 69bb270c62040bdbd9281d86dd8c1f79ca641252. --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index ba026a706..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -75,7 +75,7 @@ mutable struct Entangler else init_mpi() comm = Comm_dup(parent_communicator) - transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices / 2) + transmit_counter_bound = ceil(Int, tag_ub() / n_global_indices - 2) my_process_index = Comm_rank(comm) + 1 n_processes = Comm_size(comm) if verbose && my_process_index == 1 From bcf016d5589b3c34c3e50b4fc48a2dd9614a6cd0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 15:15:54 -0800 Subject: [PATCH 07/63] Trying on OpenMPI instead of mpich --- test/runtests.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 9906c351f..f069ce82d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,6 +12,9 @@ Rationale for this hack: # Pkg.add(i) # end +using MPIPreferences +MPIPreferences.use_jll_binary("OpenMPI_jll") + using Test using Distributions using Random From 379df4b28aba0e983d2f7240dc9ed0c50db820f4 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 16:18:49 -0800 Subject: [PATCH 08/63] Going back to mpich, trying threadlevel = :multiple --- src/mpi_utils/Entangler.jl | 2 +- test/runtests.jl | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..574eb414c 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init(threadlevel = :funneled) +init_mpi() = Init(threadlevel = :multiple) """ $SIGNATURES diff --git a/test/runtests.jl b/test/runtests.jl index f069ce82d..bfab24ba5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,8 +12,6 @@ Rationale for this hack: # Pkg.add(i) # end -using MPIPreferences -MPIPreferences.use_jll_binary("OpenMPI_jll") using Test using Distributions From 2cd3571e61b1fb41f77c2c26535c0cc38d59e4e0 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Mon, 6 Mar 2023 21:20:22 -0800 Subject: [PATCH 09/63] Back to funneled as :multiple still crashes --- src/mpi_utils/Entangler.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 574eb414c..e34c8dafa 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -105,7 +105,7 @@ mpi_active() = Comm_size(COMM_WORLD) > 1 end -init_mpi() = Init(threadlevel = :multiple) +init_mpi() = Init(threadlevel = :funneled) """ $SIGNATURES From c8a02615f7aec89ee336c0c87c8f30d4f0984505 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Tue, 7 Mar 2023 08:48:52 -0800 Subject: [PATCH 10/63] Further simplification --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index bfab24ba5..ba59261ec 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -110,7 +110,7 @@ end n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [] pigeons( - target = toy_mvn_target(1), + target = Pigeons.TestSwapper(0.5), n_rounds = 12, checked_round = 12, n_chains = 200, From 6217650345d28f96b631575efb35fecda33a710b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 09:59:39 -0800 Subject: [PATCH 11/63] Candidate fix --- src/Pigeons.jl | 2 +- src/mpi_utils/Entangler.jl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Pigeons.jl b/src/Pigeons.jl index ce87c42c9..9d6cc8734 100644 --- a/src/Pigeons.jl +++ b/src/Pigeons.jl @@ -8,7 +8,7 @@ import MPI: Comm, Allreduce, Comm_rank, Comm_dup, Request, Waitall, RequestSet, mpiexec, Allreduce, Allgather, Comm_split, isend, recv, - bcast, tag_ub + bcast, tag_ub, free using Base: Forward diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index e34c8dafa..7e3888937 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -174,7 +174,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: - Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + dummy_request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + free(dummy_request) # <-- critical - see https://github.com/pmodels/mpich/issues/6432#issue-1612064302 end end From 1f16936bed4832d0769bac0d93b3840ed9ae5737 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 10:20:32 -0800 Subject: [PATCH 12/63] Reintroducing other tests --- test/runtests.jl | 242 +++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 121 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index ba59261ec..14887026a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,9 +8,9 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -# for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] -# Pkg.add(i) -# end +for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] + Pkg.add(i) +end using Test @@ -19,13 +19,13 @@ using Random using Statistics using OnlineStats using LinearAlgebra -# using Turing +using Turing using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice -#include("slice_sampler_test.jl") -# include("turing.jl") +include("slice_sampler_test.jl") +include("turing.jl") function test_load_balance(n_processes, n_tasks) for p in 1:n_processes @@ -38,73 +38,73 @@ function test_load_balance(n_processes, n_tasks) end end -# @testset "Stepping stone" begin -# pt = pigeons(target = toy_mvn_target(100)); -# p = stepping_stone_pair(pt) -# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) -# @test abs(p[1] - truth) < 1 -# @test abs(p[2] - truth) < 1 -# end - -# @testset "Round trips" begin -# n_chains = 4 -# n_rounds = 5 +@testset "Stepping stone" begin + pt = pigeons(target = toy_mvn_target(100)); + p = stepping_stone_pair(pt) + truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) + @test abs(p[1] - truth) < 1 + @test abs(p[2] - truth) < 1 +end + +@testset "Round trips" begin + n_chains = 4 + n_rounds = 5 -# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); + pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); -# len = 2^(n_rounds) -# truth = 0.0 -# for i in 0:(n_chains-1) -# truth += floor(max(len - i, 0) / n_chains / 2) -# end - -# @test truth == Pigeons.n_round_trips(pt) -# end - -# @testset "Moments" begin -# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); -# for var_name in Pigeons.continuous_variables(pt) -# m = mean(pt, var_name) -# for i in eachindex(m) -# @test abs(m[i] - 0.0) < 0.001 -# end -# v = var(pt, var_name) -# for i in eachindex(v) -# @test abs(v[i] - 0.1) < 0.001 -# end -# end -# end - -# @testset "Parallelism Invariance" begin -# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] -# # Turing: -# pigeons( -# target = TuringLogPotential(flip_model_unidentifiable()), -# n_rounds = 4, -# checked_round = 3, -# multithreaded = true, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# dependencies = [Turing, LinearAlgebra, "turing.jl"], -# n_local_mpi_processes = n_mpis, -# n_threads = 2)) -# # Blang: -# if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf -# Pigeons.setup_blang("blangDemos") -# pigeons(; -# target = Pigeons.blang_ising(), -# n_rounds = 4, -# checked_round = 3, -# recorder_builders = recorder_builders, -# multithreaded = true, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 2)) -# end -# end + len = 2^(n_rounds) + truth = 0.0 + for i in 0:(n_chains-1) + truth += floor(max(len - i, 0) / n_chains / 2) + end + + @test truth == Pigeons.n_round_trips(pt) +end + +@testset "Moments" begin + pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); + for var_name in Pigeons.continuous_variables(pt) + m = mean(pt, var_name) + for i in eachindex(m) + @test abs(m[i] - 0.0) < 0.001 + end + v = var(pt, var_name) + for i in eachindex(v) + @test abs(v[i] - 0.1) < 0.001 + end + end +end + +@testset "Parallelism Invariance" begin + n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf + recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 4, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Turing, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = n_mpis, + n_threads = 2)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 4, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2)) + end +end @testset "Longer MPI" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -122,50 +122,50 @@ end n_threads = 1)) end -# @testset "Entanglement" begin -# mpi_test(1, "entanglement_test.jl") -# mpi_test(2, "entanglement_test.jl") - -# mpi_test(1, "reduce_test.jl") -# mpi_test(2, "reduce_test.jl") -# mpi_test(3, "reduce_test.jl") -# end - -# @testset "PermutedDistributedArray" begin -# mpi_test(1, "permuted_test.jl", options = ["-s"]) -# mpi_test(1, "permuted_test.jl") -# mpi_test(2, "permuted_test.jl") -# end - -# @testset "LoadBalance" begin -# for i in 1:20 -# for j in i:30 -# test_load_balance(i, j) -# end -# end -# end - -# @testset "LogSum" begin -# m = Pigeons.LogSum() - -# fit!(m, 2.1) -# fit!(m, 4) -# v1 = value(m) -# @assert v1 ≈ log(exp(2.1) + exp(4)) +@testset "Entanglement" begin + mpi_test(1, "entanglement_test.jl") + mpi_test(2, "entanglement_test.jl") + mpi_test(1, "reduce_test.jl") + mpi_test(2, "reduce_test.jl") + mpi_test(3, "reduce_test.jl") +end -# fit!(m, 2.1) -# fit!(m, 4) -# m2 = Pigeons.LogSum() -# fit!(m2, 50.1) -# combined = merge(m, m2) -# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) +@testset "PermutedDistributedArray" begin + mpi_test(1, "permuted_test.jl", options = ["-s"]) + mpi_test(1, "permuted_test.jl") + mpi_test(2, "permuted_test.jl") +end -# fit!(m, 2.1) -# fit!(m, 4) -# empty!(m) -# @assert value(m) == -Pigeons.inf(0.0) -# end +@testset "LoadBalance" begin + for i in 1:20 + for j in i:30 + test_load_balance(i, j) + end + end +end + +@testset "LogSum" begin + m = Pigeons.LogSum() + + fit!(m, 2.1) + fit!(m, 4) + v1 = value(m) + @assert v1 ≈ log(exp(2.1) + exp(4)) + + + fit!(m, 2.1) + fit!(m, 4) + m2 = Pigeons.LogSum() + fit!(m2, 50.1) + combined = merge(m, m2) + @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + + fit!(m, 2.1) + fit!(m, 4) + empty!(m) + @assert value(m) == -Pigeons.inf(0.0) +end function test_split_slice() # test disjoint random streams @@ -184,14 +184,14 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -# @testset "split_test" begin -# test_split_slice() -# end +@testset "split_test" begin + test_split_slice() +end -# @testset "Serialize" begin -# mpi_test(1, "serialization_test.jl") -# end +@testset "Serialize" begin + mpi_test(1, "serialization_test.jl") +end -# @testset "SliceSampler" begin -# test_slice_sampler() -# end \ No newline at end of file +@testset "SliceSampler" begin + test_slice_sampler() +end \ No newline at end of file From 0ca198f9037eeb7cfc66f867ff0fe0958948fc49 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 11:06:25 -0800 Subject: [PATCH 13/63] purge tests from Turing, use only DynamicPPL --- test/runtests.jl | 9 +++------ test/slice_sampler_test.jl | 6 ------ test/turing.jl | 15 ++++++--------- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 14887026a..20873ccc1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -8,10 +8,7 @@ Rationale for this hack: - the other method, a second toml file, seems more promising but proved challenging to get to work on CI =# -for i in ["Test", "LinearAlgebra", "Turing", "ArgMacros", "Plots"] - Pkg.add(i) -end - +Pkg.add(["Test", "LinearAlgebra", "DynamicPPL", "ArgMacros", "Plots"]) using Test using Distributions @@ -19,7 +16,7 @@ using Random using Statistics using OnlineStats using LinearAlgebra -using Turing +using DynamicPPL using SplittableRandoms import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice @@ -87,7 +84,7 @@ end recorder_builders = recorder_builders, checkpoint = true, on = ChildProcess( - dependencies = [Turing, LinearAlgebra, "turing.jl"], + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, n_threads = 2)) # Blang: diff --git a/test/slice_sampler_test.jl b/test/slice_sampler_test.jl index bfac6dbeb..82c4d4acb 100644 --- a/test/slice_sampler_test.jl +++ b/test/slice_sampler_test.jl @@ -1,9 +1,3 @@ -using Pigeons -using Distributions -using Random -using Turing -using SplittableRandoms - import Pigeons: SliceSampler, slice_sample! include("turing.jl") diff --git a/test/turing.jl b/test/turing.jl index 2bfa1fca6..d15d80143 100644 --- a/test/turing.jl +++ b/test/turing.jl @@ -1,28 +1,25 @@ # Unconditioned coinflip model with `N` observations. -@model function coinflip(; N::Int) +@model function coinflip(y) p ~ Beta(1, 12) - y ~ filldist(Bernoulli(p), N) + y .~ Bernoulli(p) return y end; -coinflip(y::AbstractVector{<:Real}) = coinflip(; N=length(y)) | (; y) # *Unidentifiable* unconditioned coinflip model with `N` observations. -@model function coinflip_unidentifiable(; N::Int) +@model function coinflip_unidentifiable(y) p1 ~ Uniform(0, 1) p2 ~ Uniform(0, 1) - y ~ filldist(Bernoulli(p1*p2), N) + y .~ Bernoulli(p1*p2) return y end; -coinflip_unidentifiable(y::AbstractVector{<:Real}) = coinflip_unidentifiable(; N=length(y)) | (; y) -@model function coinflip_modified(; N::Int) +@model function coinflip_modified(y) p ~ Uniform(0.3, 0.7) # δ ~ Bernoulli(0.5) δ ~ DiscreteUniform(0, 2) - y ~ filldist(Bernoulli(p + 0.1*δ), N) + y .~ Bernoulli(p + 0.1*δ) return y end; -coinflip_modified(y::AbstractVector{<:Real}) = coinflip_modified(; N=length(y)) | (; y) function flip_model() From ee276fc9bf58b5b20bdb901f722d2f6883524b9b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 12:20:54 -0800 Subject: [PATCH 14/63] Another free --- src/mpi_utils/Entangler.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 7e3888937..224437abb 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -254,7 +254,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: dest_global_index = current_global - spacing dest_process = find_process(e.load, dest_global_index) dest_rank = dest_process - 1 - isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + dummy_request = isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + free(dummy_request) current_local += spacing did_send = true elseif current_global + spacing ≤ e.load.n_global_indices From 491c26bfe0b41d89fb87fcadd6703b6cae795acd Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Wed, 8 Mar 2023 14:58:34 -0800 Subject: [PATCH 15/63] Trying to run tests with more rounds --- test/runtests.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 20873ccc1..311ad231f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -78,7 +78,7 @@ end # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 4, + n_rounds = 13, checked_round = 3, multithreaded = true, recorder_builders = recorder_builders, @@ -92,7 +92,7 @@ end Pigeons.setup_blang("blangDemos") pigeons(; target = Pigeons.blang_ising(), - n_rounds = 4, + n_rounds = 13, checked_round = 3, recorder_builders = recorder_builders, multithreaded = true, @@ -108,7 +108,7 @@ end recorder_builders = [] pigeons( target = Pigeons.TestSwapper(0.5), - n_rounds = 12, + n_rounds = 14, checked_round = 12, n_chains = 200, multithreaded = false, From fab79345cd7d9b356228dc66e329fa2e6a2d8972 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:41:39 -0800 Subject: [PATCH 16/63] system MPI test --- .github/workflows/CI.yml | 62 ++++++++++++++++++++++++++++++++++++++-- test/runtests.jl | 7 +++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 1c631be32..af99d7d9a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -1,17 +1,20 @@ name: CI + on: push: branches: - main tags: ['*'] pull_request: + concurrency: # Skip intermediate builds: always. # Cancel intermediate builds: only if it is a pull request build. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} + jobs: - test: + test-default: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -27,7 +30,7 @@ jobs: arch: - x64 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-java@v3 with: distribution: 'temurin' @@ -43,13 +46,66 @@ jobs: - uses: codecov/codecov-action@v2 with: files: lcov.info + + + # adapted from MPI.jl + test-system-MPI-apt: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + os: + - ubuntu-latest + mpi: + - libmpich-dev + - libopenmpi-dev + steps: + - uses: actions/checkout@v3 + + - name: Install MPI via apt + run: | + sudo apt-get update + sudo apt-get install $MPI + env: + MPI: ${{ matrix.mpi }} + + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: x64 + + - uses: julia-actions/cache@v1 + + - name: add MPIPreferences + shell: julia --color=yes --project=. {0} + run: | + using Pkg + Pkg.develop(path="lib/MPIPreferences") + + - name: use system MPI + shell: julia --color=yes --project=. {0} + run: | + using MPIPreferences + MPIPreferences.use_system_binary() + + - uses: julia-actions/julia-runtest@v1 + + docs: name: Documentation runs-on: ubuntu-latest permissions: contents: write steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions/setup-java@v3 with: distribution: 'temurin' diff --git a/test/runtests.jl b/test/runtests.jl index 311ad231f..aee31e4f9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,6 +18,7 @@ using OnlineStats using LinearAlgebra using DynamicPPL using SplittableRandoms +using MPIPreferences import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, find_process, split_slice @@ -72,6 +73,12 @@ end end end +@testset "MPI" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end +end + @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] From 2ed0aae88ad6a9a84e3b7c8b2917cfce57ce41cb Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:43:57 -0800 Subject: [PATCH 17/63] fix typos --- .github/workflows/CI.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index af99d7d9a..ad00fb1e8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,10 +91,10 @@ jobs: Pkg.develop(path="lib/MPIPreferences") - name: use system MPI - shell: julia --color=yes --project=. {0} - run: | - using MPIPreferences - MPIPreferences.use_system_binary() + shell: julia --color=yes --project=. {0} + run: | + using MPIPreferences + MPIPreferences.use_system_binary() - uses: julia-actions/julia-runtest@v1 From c271261cfeb0853d943fc1fe3ad04f4d26f5f58c Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:47:07 -0800 Subject: [PATCH 18/63] remove step that does not make sense outside MPI.jl --- .github/workflows/CI.yml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ad00fb1e8..25e285d96 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -84,19 +84,13 @@ jobs: - uses: julia-actions/cache@v1 - - name: add MPIPreferences - shell: julia --color=yes --project=. {0} - run: | - using Pkg - Pkg.develop(path="lib/MPIPreferences") - - name: use system MPI shell: julia --color=yes --project=. {0} run: | using MPIPreferences MPIPreferences.use_system_binary() - - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-runtest@latest docs: From e3f569ae3702ddc6bf1d09eec10df604495d3cb2 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 16:52:59 -0800 Subject: [PATCH 19/63] setup MPIPreferences in the test env --- .github/workflows/CI.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 25e285d96..13fdc294d 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -85,8 +85,10 @@ jobs: - uses: julia-actions/cache@v1 - name: use system MPI - shell: julia --color=yes --project=. {0} + shell: julia --color=yes --project=test {0} run: | + using Pkg + Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() From 77824eac6dfc0aed77e479ceab94b1401ade53f7 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:04:08 -0800 Subject: [PATCH 20/63] possible fix to Pkg missing --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 13fdc294d..110263bb8 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,6 +91,7 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@latest From c69b76a335f7aacbd29ccc5eccc257ff1c8290b2 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:10:44 -0800 Subject: [PATCH 21/63] possible fix to Pkg missing --- .github/workflows/CI.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 110263bb8..e73835d36 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -91,10 +91,11 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Project.toml") rm("test/Manifest.toml") - - uses: julia-actions/julia-runtest@latest - + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 docs: name: Documentation From 523f9c120de94551621ec8bafc0c9cae7f2543fb Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:23:35 -0800 Subject: [PATCH 22/63] add missing env --- .github/workflows/CI.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index e73835d36..5860d9b1a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -62,6 +62,8 @@ jobs: mpi: - libmpich-dev - libopenmpi-dev + env: + JULIA_MPI_TEST_BINARY: system steps: - uses: actions/checkout@v3 @@ -91,10 +93,10 @@ jobs: Pkg.add("MPIPreferences") using MPIPreferences MPIPreferences.use_system_binary() + # need to remove these to avoid getting a 'Package "Pkg" missing' error rm("test/Project.toml") rm("test/Manifest.toml") - - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 docs: From 949b62880b9460f20178cfc1b3d0cbc348a4c909 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 17:33:50 -0800 Subject: [PATCH 23/63] another try --- .github/workflows/CI.yml | 5 +---- test/runtests.jl | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 5860d9b1a..8bbee8e2a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -90,12 +90,9 @@ jobs: shell: julia --color=yes --project=test {0} run: | using Pkg - Pkg.add("MPIPreferences") + Pkg.add(["Pkg","MPIPreferences"]) using MPIPreferences MPIPreferences.use_system_binary() - # need to remove these to avoid getting a 'Package "Pkg" missing' error - rm("test/Project.toml") - rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 diff --git a/test/runtests.jl b/test/runtests.jl index aee31e4f9..d1981f8d7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -36,6 +36,12 @@ function test_load_balance(n_processes, n_tasks) end end +@testset "System MPI" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end +end + @testset "Stepping stone" begin pt = pigeons(target = toy_mvn_target(100)); p = stepping_stone_pair(pt) @@ -73,12 +79,6 @@ end end end -@testset "MPI" begin - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end -end - @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] From 895a45fed65c5a968e46faa6df8c7f3371a7c060 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 18:33:42 -0800 Subject: [PATCH 24/63] test now has its own Project.toml --- .github/workflows/CI.yml | 2 +- Project.toml | 6 ------ src/utils/misc.jl | 14 -------------- test/Project.toml | 14 ++++++++++++++ test/misc.jl | 13 +++++++++++++ test/runtests.jl | 29 ++++++++++++----------------- test/single_cell_example.jl | 2 -- 7 files changed, 40 insertions(+), 40 deletions(-) create mode 100644 test/Project.toml create mode 100644 test/misc.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 8bbee8e2a..b61b43228 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -90,7 +90,7 @@ jobs: shell: julia --color=yes --project=test {0} run: | using Pkg - Pkg.add(["Pkg","MPIPreferences"]) + Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() diff --git a/Project.toml b/Project.toml index 662a88ceb..615e602cc 100644 --- a/Project.toml +++ b/Project.toml @@ -52,9 +52,3 @@ SpecialFunctions = "2" SplittableRandoms = "0.1" StatsBase = "0.33" julia = "1.6" - -[extras] -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[targets] -test = ["Test"] diff --git a/src/utils/misc.jl b/src/utils/misc.jl index 91a19e68c..d3535b2d5 100644 --- a/src/utils/misc.jl +++ b/src/utils/misc.jl @@ -97,20 +97,6 @@ is attempted). """ macro abstract() quote error("Attempted to call an abstract function.") end end -function mpi_test(n_processes::Int, test_file::String; options = []) - project_folder = dirname(Base.current_project()) - # handle 2 different "modes" that tests can be ran (for julia 1.0,1.1 vs. >1.1) - resolved_test_file = - if isfile("$project_folder/$test_file") - "$project_folder/$test_file" - else - "$project_folder/test/$test_file" - end - mpiexec() do exe - run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) - end -end - """ @weighted(w, x) diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 000000000..f71ba28fa --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,14 @@ +[deps] +ArgMacros = "dbc42088-9de8-42a0-8ec8-2cd114e1ea3e" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" +MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267" +OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +SplittableRandoms = "8efc31e9-3fb0-4277-b18c-5a3d5d07abad" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/misc.jl b/test/misc.jl new file mode 100644 index 000000000..4100a32ba --- /dev/null +++ b/test/misc.jl @@ -0,0 +1,13 @@ +function mpi_test(n_processes::Int, test_file::String; options = []) + project_folder = dirname(Base.current_project()) + # handle 2 different "modes" that tests can be ran (for julia 1.0,1.1 vs. >1.1) + resolved_test_file = + if isfile("$project_folder/$test_file") + "$project_folder/$test_file" + else + "$project_folder/test/$test_file" + end + mpiexec() do exe + run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index d1981f8d7..17d571edb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,27 +1,22 @@ using Pigeons -using Pkg -#= -Rationale for this hack: -- putting those in the [extras] section will lead to ChildProcess not - having access to it -- the other method, a second toml file, seems more promising but - proved challenging to get to work on CI -=# -Pkg.add(["Test", "LinearAlgebra", "DynamicPPL", "ArgMacros", "Plots"]) - -using Test +using ArgMacros using Distributions -using Random -using Statistics -using OnlineStats -using LinearAlgebra using DynamicPPL -using SplittableRandoms +using LinearAlgebra +using MPI using MPIPreferences -import Pigeons: mpi_test, my_global_indices, LoadBalance, my_load, +using OnlineStats +using Random +using Serialization +using SplittableRandoms +using Statistics +using Test + +import Pigeons: my_global_indices, LoadBalance, my_load, find_process, split_slice +include("misc.jl") include("slice_sampler_test.jl") include("turing.jl") diff --git a/test/single_cell_example.jl b/test/single_cell_example.jl index 062a24e70..e83a0e954 100644 --- a/test/single_cell_example.jl +++ b/test/single_cell_example.jl @@ -1,5 +1,3 @@ -using Pigeons - if !isdir("nowellpack") && !islink(blang_repo) # Download and compile the Blang model used in https://www.biorxiv.org/content/10.1101/2020.05.06.058180 println("cloning and building nowellpack") From 13c9cd2c91398e1eff54c80ca24076fcd22352d9 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:00:32 -0800 Subject: [PATCH 25/63] force Pkg.instantiate in ChildProcess --- src/submission/ChildProcess.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 8a8718cbb..8ccf7a8d2 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -124,6 +124,8 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ + using Pkg + Pkg.instantiate() $dependency_declarations $silence_code From 1772d82510a2fe30ad4b0db24d5aeec0a9483bc7 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:40:05 -0800 Subject: [PATCH 26/63] dont use --project in ChildProcess --- src/submission/ChildProcess.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 8ccf7a8d2..6251a4a45 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -74,7 +74,6 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil julia_bin = Base.julia_cmd() script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin - --project --threads=$n_threads $script_path` end @@ -124,8 +123,6 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - using Pkg - Pkg.instantiate() $dependency_declarations $silence_code From bdf71ce6aab21eee1d967475d73d8531c1b0fcab Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 19:59:41 -0800 Subject: [PATCH 27/63] ChildProcess inherits the exact same active project --- src/submission/ChildProcess.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 6251a4a45..ea76a2f26 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -73,7 +73,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) - return `$julia_bin + return `$julia_bin + --project=$(dirname(Base.active_project())) --threads=$n_threads $script_path` end From 2bcd82541510571dd34f99a024be098518b7931e Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Wed, 8 Mar 2023 20:36:43 -0800 Subject: [PATCH 28/63] new approach --- .github/workflows/CI.yml | 7 +++++++ src/submission/ChildProcess.jl | 2 ++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b61b43228..fb73805ad 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -41,6 +41,13 @@ jobs: arch: ${{ matrix.arch }} - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@v1 + + - name: instantiate the test environment + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - uses: codecov/codecov-action@v2 diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index ea76a2f26..2579c1176 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -72,6 +72,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() + active_proj = dirname(Base.active_project()) + run(`$julia_bin --project=$active_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$(dirname(Base.active_project())) From eadbfcdb43289afd0dd1b5abc1983517365a45dd Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 06:34:26 -0800 Subject: [PATCH 29/63] Changing approach for Isend/isend: explicit Waitall for all requests A crash seemed to occur in CI during garbage collection julia:3364 terminated with signal 11 at PC=7f997aaec971 SP=7f9936ccb970. Backtrace: /opt/hostedtoolcache/julia/1.8.5/x64/bin/../lib/julia/libjulia-internal.so.1(ijl_gc_safepoint+0x11)[0x7f997aaec971] This looks like a crash during garbage collection code (this would be consistent with the fact that it only shows up with allocation-heavy test, i.e. Turing). So even though the finalizer checks first before calling free, maybe there is something flaky there. https://github.com/Julia-Tempering/Pigeons.jl/pull/30#issuecomment-1461317764 --- src/mpi_utils/Entangler.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index 224437abb..a883b0dfe 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -157,6 +157,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic # indicators of whether each local index is to be received over MPI e.current_received_bits .= true at_least_one_mpi = false + + requests = RequestSet() # send (or copy if local) for local_index in 1:myload @@ -174,14 +176,13 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: - dummy_request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) - free(dummy_request) # <-- critical - see https://github.com/pmodels/mpich/issues/6432#issue-1612064302 + request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) + push!(requests, request) end end # receive if at_least_one_mpi - requests = RequestSet() my_globals = my_global_indices(e.load) for local_index in 1:myload if e.current_received_bits[local_index] @@ -241,6 +242,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: # outer loop is over the levels of a binary tree over the global indices iteration = 1 + requests = RequestSet() + while n_remaining_to_reduce > 1 transmit_index = next_transmit_index!(e) current_local = my_first_remaining_local @@ -254,8 +257,8 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: dest_global_index = current_global - spacing dest_process = find_process(e.load, dest_global_index) dest_rank = dest_process - 1 - dummy_request = isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) - free(dummy_request) + request = isend(work_array[current_local], e.communicator; dest = dest_rank, tag = tag(e, transmit_index, iteration)) + push!(requests, request) current_local += spacing did_send = true elseif current_global + spacing ≤ e.load.n_global_indices @@ -278,6 +281,7 @@ function reduce_deterministically(operation, source_data::AbstractVector{T}, e:: if did_send my_first_remaining_local += spacing + Waitall(requests) end n_global_indices_remaining_before = ceil(Int, n_global_indices_remaining_before/2) spacing = spacing * 2 From 57d0921b22026d90c961933158b51cee6f027003 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 06:48:25 -0800 Subject: [PATCH 30/63] Is this a Turing multi-threading issue? --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 17d571edb..db4e0b206 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -88,7 +88,7 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 1)) # Blang: if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf Pigeons.setup_blang("blangDemos") From 63aad9766d55bc8789aa9e9363a00b791d685744 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 07:15:55 -0800 Subject: [PATCH 31/63] Testing single-thread for all Multi-threading issue seems not specific to Turing as crash occurred for Blang in last commit. --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index db4e0b206..5971d0e4a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -101,7 +101,7 @@ end checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 1)) end end From 7a8de7e0f38a42c8a566ede0a65b047c5538c585 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 07:41:31 -0800 Subject: [PATCH 32/63] Adding a temporary "dry run" to see if problem cause by some compilation --- test/runtests.jl | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 5971d0e4a..8a3bf86d8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,6 +77,36 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] + + # temp + # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 13, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = 1, + n_threads = 1)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 13, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = 1, + n_threads = 1)) + end + + # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), From 5812127c3b94caae6dad9035e58e282192201e4d Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Thu, 9 Mar 2023 08:05:12 -0800 Subject: [PATCH 33/63] Commenting out Turing and Blang, to see behaviour on toy_mvn and swap(0.5) --- test/runtests.jl | 97 +++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 8a3bf86d8..79d3b78fa 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -78,10 +78,9 @@ end n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # temp - # Turing: + pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), + target = toy_mvn_target(1), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -89,27 +88,12 @@ end checkpoint = true, on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = 1, + n_local_mpi_processes = n_mpis, n_threads = 1)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = 1, - n_threads = 1)) - end - # Turing: pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), + target = Pigeons.TestSwapper(0.5), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -119,20 +103,65 @@ end dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, n_threads = 1)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 1)) - end + + + + + # # temp + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 13, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = 1, + # n_threads = 1)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 13, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = 1, + # n_threads = 1)) + # end + + + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 13, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = n_mpis, + # n_threads = 1)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 13, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = n_mpis, + # n_threads = 1)) + # end end @testset "Longer MPI" begin From 270333f4070c30c46744cf9017a841c61542d91f Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:08:58 -0800 Subject: [PATCH 34/63] openmpi_jll test --- .github/workflows/CI.yml | 42 ++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 8 +++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index fb73805ad..ccf71748e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -53,6 +53,48 @@ jobs: - uses: codecov/codecov-action@v2 with: files: lcov.info + + # test OpenMPI_jll + test-openmpi-jll: + strategy: + matrix: + version: + - '1.8' + - 'nightly' + os: + - ubuntu-latest + - macos-latest + - windows-latest + arch: + - x64 + + fail-fast: false + env: + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI + + runs-on: ${{ matrix.os }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: julia-actions/setup-julia@latest + with: + arch: ${{ matrix.julia_arch }} + version: ${{ matrix.julia_version }} + - uses: julia-actions/cache@v1 + + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_system_binary() + + - uses: julia-actions/julia-runtest@latest + # adapted from MPI.jl diff --git a/test/runtests.jl b/test/runtests.jl index 79d3b78fa..52026e480 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,10 +31,16 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "System MPI" begin +@testset "MPI" begin if haskey(ENV,"JULIA_MPI_TEST_BINARY") @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary end + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end + if haskey(ENV,"JULIA_MPI_TEST_ABI") + @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi + end end @testset "Stepping stone" begin From 2b49008f382f56d582f84478791da62b6dab1b6a Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:12:08 -0800 Subject: [PATCH 35/63] openmpi_jll test --- .github/workflows/CI.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index ccf71748e..caf161ff3 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -55,16 +55,14 @@ jobs: files: lcov.info # test OpenMPI_jll - test-openmpi-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} + runs-on: ${{ matrix.os }} strategy: matrix: version: - '1.8' - - 'nightly' os: - ubuntu-latest - - macos-latest - - windows-latest arch: - x64 @@ -72,9 +70,6 @@ jobs: env: JULIA_MPI_TEST_BINARY: OpenMPI_jll JULIA_MPI_TEST_ABI: OpenMPI - - runs-on: ${{ matrix.os }} - steps: - name: Checkout uses: actions/checkout@v3 @@ -92,7 +87,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() - + - uses: julia-actions/julia-runtest@latest From 9c6497f6b4584baacd6d3415117159c471beddc0 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:14:03 -0800 Subject: [PATCH 36/63] openmpi_jll test --- .github/workflows/CI.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index caf161ff3..afabef73a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -55,6 +55,7 @@ jobs: files: lcov.info # test OpenMPI_jll + test-OpenMPI-jll: name: Julia OpenMPI_jll - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -86,7 +87,7 @@ jobs: using Pkg Pkg.instantiate() using MPIPreferences - MPIPreferences.use_system_binary() + MPIPreferences.use_jll_binary("OpenMPI_jll") - uses: julia-actions/julia-runtest@latest From d68dc6c394c168cdb7a17ce4f7486d64bc32ddcf Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 08:15:41 -0800 Subject: [PATCH 37/63] openmpi_jll test --- .github/workflows/CI.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index afabef73a..0ab775a57 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -77,8 +77,8 @@ jobs: - uses: julia-actions/setup-julia@latest with: - arch: ${{ matrix.julia_arch }} - version: ${{ matrix.julia_version }} + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} - uses: julia-actions/cache@v1 - name: use OpenMPI_jll From 38de7cfb2d34f0458715b417f7131788c918ac43 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 09:53:10 -0800 Subject: [PATCH 38/63] more messages from Child processes --- .gitignore | 3 +- src/submission/ChildProcess.jl | 42 +++++++++++--- test/runtests.jl | 103 ++++++--------------------------- 3 files changed, 54 insertions(+), 94 deletions(-) diff --git a/.gitignore b/.gitignore index 8a9900510..fb28d4e81 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ .vscode/settings.json build .interfaces.md - +*.log +*.err machines.txt results .includes_bu.jl diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 2579c1176..5d9982eff 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -126,18 +126,42 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - $dependency_declarations - $silence_code - - Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") - pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") - - pt = PT(pt_arguments, exec_folder = raw"$exec_folder") - pigeons(pt) + prefix=string(getpid()) + println("hello from PID " * prefix) + open(prefix * ".log", "a") do out + open(prefix * ".err", "a") do err + redirect_stdout(out) do + redirect_stderr(err) do + $dependency_declarations + $silence_code + println("using Pigeons located @ " * pathof(Pigeons)) + end + end + end + end + # need to do this in order to be able to use declarations, since they happened inside a function + open(prefix * ".log", "a") do out + open(prefix * ".err", "a") do err + redirect_stdout(out) do + redirect_stderr(err) do + print("deserializing...") + Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") + pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") + println("done!") + print("running PT...") + pt = PT(pt_arguments, exec_folder = raw"$exec_folder") + println("done!") + print("running pigeons(pt)...") + pigeons(pt) + println("done") + end + end + end + end """ end -add_dependency(dependency::Module) = "using $dependency" +add_dependency(dependency::Module) = "@eval using $dependency" function add_dependency(dependency::String) abs_path = abspath(dependency) return """include(raw"$abs_path")""" diff --git a/test/runtests.jl b/test/runtests.jl index 52026e480..83e88c587 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,16 +31,10 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "MPI" begin +@testset "System MPI" begin if haskey(ENV,"JULIA_MPI_TEST_BINARY") @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary end - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end - if haskey(ENV,"JULIA_MPI_TEST_ABI") - @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi - end end @testset "Stepping stone" begin @@ -83,10 +77,9 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - - + # Turing: pigeons( - target = toy_mvn_target(1), + target = TuringLogPotential(flip_model_unidentifiable()), n_rounds = 13, checked_round = 3, multithreaded = true, @@ -95,79 +88,21 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 1)) - - - pigeons( - target = Pigeons.TestSwapper(0.5), - n_rounds = 13, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 1)) - - - - - # # temp - # # Turing: - # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), - # n_rounds = 13, - # checked_round = 3, - # multithreaded = true, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - # n_local_mpi_processes = 1, - # n_threads = 1)) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 13, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = 1, - # n_threads = 1)) - # end - - - # # Turing: - # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), - # n_rounds = 13, - # checked_round = 3, - # multithreaded = true, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - # n_local_mpi_processes = n_mpis, - # n_threads = 1)) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 13, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 1)) - # end + n_threads = 2)) + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 13, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2)) + end end @testset "Longer MPI" begin @@ -258,4 +193,4 @@ end @testset "SliceSampler" begin test_slice_sampler() -end \ No newline at end of file +end From 8537307e929e578790b72782d7bfd035613632d5 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:22:56 -0800 Subject: [PATCH 39/63] extra arg to OpenMPI --- .github/workflows/CI.yml | 73 ++++++++++++++--------------- src/submission/ChildProcess.jl | 24 +++++++--- test/runtests.jl | 85 ++++++++++++++++++---------------- 3 files changed, 98 insertions(+), 84 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 0ab775a57..017f76bcf 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -54,42 +54,42 @@ jobs: with: files: lcov.info - # test OpenMPI_jll - test-OpenMPI-jll: - name: Julia OpenMPI_jll - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - version: - - '1.8' - os: - - ubuntu-latest - arch: - - x64 - - fail-fast: false - env: - JULIA_MPI_TEST_BINARY: OpenMPI_jll - JULIA_MPI_TEST_ABI: OpenMPI - steps: - - name: Checkout - uses: actions/checkout@v3 - - - uses: julia-actions/setup-julia@latest - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 - - - name: use OpenMPI_jll - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_jll_binary("OpenMPI_jll") - - - uses: julia-actions/julia-runtest@latest + # # test OpenMPI_jll + # test-OpenMPI-jll: + # name: Julia OpenMPI_jll - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # arch: + # - x64 + + # fail-fast: false + # env: + # JULIA_MPI_TEST_BINARY: OpenMPI_jll + # JULIA_MPI_TEST_ABI: OpenMPI + # steps: + # - name: Checkout + # uses: actions/checkout@v3 + + # - uses: julia-actions/setup-julia@latest + # with: + # version: ${{ matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 + + # - name: use OpenMPI_jll + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_jll_binary("OpenMPI_jll") + + # - uses: julia-actions/julia-runtest@latest @@ -105,7 +105,6 @@ jobs: os: - ubuntu-latest mpi: - - libmpich-dev - libopenmpi-dev env: JULIA_MPI_TEST_BINARY: system diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 5d9982eff..eb7c8a3b5 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,21 +62,29 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe -n $(new_process.n_local_mpi_processes)` + println("Checking MPI version:") + run(`$exe -V`) + mpi_args = extra_mpi_args() + mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` - run(cmd, wait = new_process.wait) + logfile = "Pigeons.log" + println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") + run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) end end return Result{PT}(exec_folder) end +function extra_mpi_args() + MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 -v` : `` +end + function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - active_proj = dirname(Base.active_project()) - run(`$julia_bin --project=$active_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache + julia_bin = "julia"#Base.julia_cmd() + cur_proj = dirname(Base.current_project()) script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin - --project=$(dirname(Base.active_project())) + --project=$cur_proj --threads=$n_threads $script_path` end @@ -126,6 +134,8 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ + println("wd = " * pwd()) + println("active_proj = " * dirname(Base.active_project()) ) prefix=string(getpid()) println("hello from PID " * prefix) open(prefix * ".log", "a") do out @@ -153,7 +163,7 @@ function launch_code( println("done!") print("running pigeons(pt)...") pigeons(pt) - println("done") + println("done!") end end end diff --git a/test/runtests.jl b/test/runtests.jl index 83e88c587..b359afe46 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,48 +31,48 @@ function test_load_balance(n_processes, n_tasks) end end -@testset "System MPI" begin - if haskey(ENV,"JULIA_MPI_TEST_BINARY") - @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary - end -end - -@testset "Stepping stone" begin - pt = pigeons(target = toy_mvn_target(100)); - p = stepping_stone_pair(pt) - truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) - @test abs(p[1] - truth) < 1 - @test abs(p[2] - truth) < 1 -end - -@testset "Round trips" begin - n_chains = 4 - n_rounds = 5 +# @testset "System MPI" begin +# if haskey(ENV,"JULIA_MPI_TEST_BINARY") +# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary +# end +# end + +# @testset "Stepping stone" begin +# pt = pigeons(target = toy_mvn_target(100)); +# p = stepping_stone_pair(pt) +# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) +# @test abs(p[1] - truth) < 1 +# @test abs(p[2] - truth) < 1 +# end + +# @testset "Round trips" begin +# n_chains = 4 +# n_rounds = 5 - pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); +# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); - len = 2^(n_rounds) - truth = 0.0 - for i in 0:(n_chains-1) - truth += floor(max(len - i, 0) / n_chains / 2) - end - - @test truth == Pigeons.n_round_trips(pt) -end - -@testset "Moments" begin - pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); - for var_name in Pigeons.continuous_variables(pt) - m = mean(pt, var_name) - for i in eachindex(m) - @test abs(m[i] - 0.0) < 0.001 - end - v = var(pt, var_name) - for i in eachindex(v) - @test abs(v[i] - 0.1) < 0.001 - end - end -end +# len = 2^(n_rounds) +# truth = 0.0 +# for i in 0:(n_chains-1) +# truth += floor(max(len - i, 0) / n_chains / 2) +# end + +# @test truth == Pigeons.n_round_trips(pt) +# end + +# @testset "Moments" begin +# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); +# for var_name in Pigeons.continuous_variables(pt) +# m = mean(pt, var_name) +# for i in eachindex(m) +# @test abs(m[i] - 0.0) < 0.001 +# end +# v = var(pt, var_name) +# for i in eachindex(v) +# @test abs(v[i] - 0.1) < 0.001 +# end +# end +# end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf @@ -194,3 +194,8 @@ end @testset "SliceSampler" begin test_slice_sampler() end + +# clean-up logs +ls = readdir() +foreach(rm, filter(endswith(".log"), ls)) +foreach(rm, filter(endswith(".err"), ls)) From c2b732bb6d16a6b69d6203dc3193c89fd2716626 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:29:56 -0800 Subject: [PATCH 40/63] rm mpi version query --- .github/workflows/CI.yml | 140 ++++++++++++++++----------------- src/submission/ChildProcess.jl | 2 - 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 017f76bcf..10a28cb49 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -54,91 +54,91 @@ jobs: with: files: lcov.info - # # test OpenMPI_jll - # test-OpenMPI-jll: - # name: Julia OpenMPI_jll - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # arch: - # - x64 - - # fail-fast: false - # env: - # JULIA_MPI_TEST_BINARY: OpenMPI_jll - # JULIA_MPI_TEST_ABI: OpenMPI - # steps: - # - name: Checkout - # uses: actions/checkout@v3 - - # - uses: julia-actions/setup-julia@latest - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - - # - name: use OpenMPI_jll - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_jll_binary("OpenMPI_jll") - - # - uses: julia-actions/julia-runtest@latest - - - - # adapted from MPI.jl - test-system-MPI-apt: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # test OpenMPI_jll + test-OpenMPI-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: - fail-fast: false matrix: version: - '1.8' os: - ubuntu-latest - mpi: - - libopenmpi-dev + arch: + - x64 + + fail-fast: false env: - JULIA_MPI_TEST_BINARY: system + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI steps: - - uses: actions/checkout@v3 + - name: Checkout + uses: actions/checkout@v3 - - name: Install MPI via apt - run: | - sudo apt-get update - sudo apt-get install $MPI - env: - MPI: ${{ matrix.mpi }} + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_jll_binary("OpenMPI_jll") - - uses: julia-actions/setup-julia@v1 - with: - version: ${{ matrix.version }} - arch: x64 + - uses: julia-actions/julia-runtest@latest - - uses: julia-actions/cache@v1 - - name: use system MPI - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_system_binary() + + # # adapted from MPI.jl + # test-system-MPI-apt: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # mpi: + # - libopenmpi-dev + # env: + # JULIA_MPI_TEST_BINARY: system + # steps: + # - uses: actions/checkout@v3 + + # - name: Install MPI via apt + # run: | + # sudo apt-get update + # sudo apt-get install $MPI + # env: + # MPI: ${{ matrix.mpi }} + + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: x64 + + # - uses: julia-actions/cache@v1 + + # - name: use system MPI + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_system_binary() - - uses: julia-actions/julia-runtest@v1 + # - uses: julia-actions/julia-runtest@v1 docs: name: Documentation diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index eb7c8a3b5..0b0a1cbe8 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,8 +62,6 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - println("Checking MPI version:") - run(`$exe -V`) mpi_args = extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` From 0e82a84d6684ced6cd25bc15695b7d3f5d455707 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:40:26 -0800 Subject: [PATCH 41/63] use old julia_cmd + rm manifest as in MPI.jl CI --- .github/workflows/CI.yml | 1 + src/submission/ChildProcess.jl | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 10a28cb49..a7b3301aa 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -88,6 +88,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_jll_binary("OpenMPI_jll") + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@latest diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 0b0a1cbe8..cbd8bbc09 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -67,7 +67,13 @@ function pigeons(pt_arguments, new_process::ChildProcess) cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") - run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + try + run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + catch + open(logfile, "r") do f + println(read(f, String)) + end + end end end return Result{PT}(exec_folder) @@ -78,8 +84,8 @@ function extra_mpi_args() end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = "julia"#Base.julia_cmd() - cur_proj = dirname(Base.current_project()) + julia_bin = Base.julia_cmd() + cur_proj = dirname(Base.current_project()) script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj From 82a4210ca2dbc1035dfab90a469195ae03ed9ead Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 14:52:29 -0800 Subject: [PATCH 42/63] add --oversubscribe for OpenMPI + rethrow exception --- src/submission/ChildProcess.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index cbd8bbc09..01b68422d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -69,10 +69,11 @@ function pigeons(pt_arguments, new_process::ChildProcess) println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") try run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) - catch + catch e open(logfile, "r") do f println(read(f, String)) end + rethrow(e) end end end @@ -80,7 +81,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 -v` : `` + MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) From 700c71271548974ee0602a4e948a6cce594dc476 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 15:24:46 -0800 Subject: [PATCH 43/63] simplify logging --- src/submission/ChildProcess.jl | 54 ++++++++++++---------------------- 1 file changed, 18 insertions(+), 36 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 01b68422d..521f512fe 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -68,8 +68,9 @@ function pigeons(pt_arguments, new_process::ChildProcess) logfile = "Pigeons.log" println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") try - run(pipeline(cmd; stdout = logfile, stderr = logfile, append = true), wait = new_process.wait) + run(pipeline(cmd; stdout = logfile, stderr = logfile), wait = new_process.wait) catch e + println("pipeline terminated with non-zero status. Dumping stdout+stderr:\n\n") open(logfile, "r") do f println(read(f, String)) end @@ -139,44 +140,25 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - println("wd = " * pwd()) - println("active_proj = " * dirname(Base.active_project()) ) - prefix=string(getpid()) - println("hello from PID " * prefix) - open(prefix * ".log", "a") do out - open(prefix * ".err", "a") do err - redirect_stdout(out) do - redirect_stderr(err) do - $dependency_declarations - $silence_code - println("using Pigeons located @ " * pathof(Pigeons)) - end - end - end - end - # need to do this in order to be able to use declarations, since they happened inside a function - open(prefix * ".log", "a") do out - open(prefix * ".err", "a") do err - redirect_stdout(out) do - redirect_stderr(err) do - print("deserializing...") - Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") - pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") - println("done!") - print("running PT...") - pt = PT(pt_arguments, exec_folder = raw"$exec_folder") - println("done!") - print("running pigeons(pt)...") - pigeons(pt) - println("done!") - end - end - end - end + pid=string(getpid()) + println("hello from PID " * pid) + + println(pid * ": wd = " * pwd()) + println(pid * ": active_proj = " * dirname(Base.active_project()) ) + + $dependency_declarations + println(pid * ": using Pigeons located @ " * dirname(pathof(Pigeons))) + + $silence_code + + Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") + pt_arguments = deserialize(raw"$path_to_serialized_pt_arguments") + pt = PT(pt_arguments, exec_folder = raw"$exec_folder") + pigeons(pt) """ end -add_dependency(dependency::Module) = "@eval using $dependency" +add_dependency(dependency::Module) = "using $dependency" function add_dependency(dependency::String) abs_path = abspath(dependency) return """include(raw"$abs_path")""" From ebcb3a6c46423884b7bd3d280ec2b786fc431921 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Thu, 9 Mar 2023 15:38:59 -0800 Subject: [PATCH 44/63] force instantiate + precompile --- src/submission/ChildProcess.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 521f512fe..94ba2ff3d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -88,6 +88,8 @@ end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) julia_bin = Base.julia_cmd() cur_proj = dirname(Base.current_project()) + @info "forcing instantiate + precompile on project $cur_proj" + run(`$julia_bin --project=$cur_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj From ad69b34e627d898eed0ec99a29409eaa291760f2 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 06:05:50 -0800 Subject: [PATCH 45/63] Add mpi_args to mpi_test --- test/misc.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/misc.jl b/test/misc.jl index 4100a32ba..7654799af 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,6 +8,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - run(`$exe -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) + mpi_args = extra_mpi_args() + run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end end \ No newline at end of file From 6bde8fe5fe9ad2fb51759f5513b22199973f874e Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 06:19:57 -0800 Subject: [PATCH 46/63] Temporary: trying to speed up some key tests --- .github/workflows/CI.yml | 127 ++++++++++++++++++++------------------- 1 file changed, 64 insertions(+), 63 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index a7b3301aa..577f0f2c2 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,45 +14,46 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: - test-default: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - version: - - '1.8' - - 'nightly' - os: - - ubuntu-latest - - macos-latest - - windows-latest - arch: - - x64 - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' - - uses: julia-actions/setup-julia@v1 - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 - - uses: julia-actions/julia-buildpkg@v1 - - - name: instantiate the test environment - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - - - uses: julia-actions/julia-runtest@v1 - - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v2 - with: - files: lcov.info + + # test-default: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # - 'nightly' + # os: + # - ubuntu-latest + # - macos-latest + # - windows-latest + # arch: + # - x64 + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 + # - uses: julia-actions/julia-buildpkg@v1 + + # - name: instantiate the test environment + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + + # - uses: julia-actions/julia-runtest@v1 + # - uses: julia-actions/julia-processcoverage@v1 + # - uses: codecov/codecov-action@v2 + # with: + # files: lcov.info # test OpenMPI_jll test-OpenMPI-jll: @@ -141,27 +142,27 @@ jobs: # - uses: julia-actions/julia-runtest@v1 - docs: - name: Documentation - runs-on: ubuntu-latest - permissions: - contents: write - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'temurin' - java-version: '11' - - uses: julia-actions/setup-julia@v1 - with: - version: '1' - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-docdeploy@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using Pigeons - DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) - doctest(Pigeons)' + # docs: + # name: Documentation + # runs-on: ubuntu-latest + # permissions: + # contents: write + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + # - uses: julia-actions/setup-julia@v1 + # with: + # version: '1' + # - uses: julia-actions/julia-buildpkg@v1 + # - uses: julia-actions/julia-docdeploy@v1 + # env: + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # - run: | + # julia --project=docs -e ' + # using Documenter: DocMeta, doctest + # using Pigeons + # DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) + # doctest(Pigeons)' From 7b5e7d41fa09d8b78e524b6a23df34f92baad904 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 07:00:18 -0800 Subject: [PATCH 47/63] Fix --- src/submission/ChildProcess.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 94ba2ff3d..b4a894aa3 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,7 +62,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = extra_mpi_args() + mpi_args = Pigeons.extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" From fc34b85bcfaa10b5a6299ef11ec415db65cc2e4f Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 07:51:01 -0800 Subject: [PATCH 48/63] Fix the fix + reintroducing the system-MPI tests --- .github/workflows/CI.yml | 90 +++++++++++++++++----------------- src/submission/ChildProcess.jl | 2 +- test/misc.jl | 2 +- 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 577f0f2c2..7df61a6c4 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -95,52 +95,52 @@ jobs: - # # adapted from MPI.jl - # test-system-MPI-apt: - # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # fail-fast: false - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # mpi: - # - libopenmpi-dev - # env: - # JULIA_MPI_TEST_BINARY: system - # steps: - # - uses: actions/checkout@v3 - - # - name: Install MPI via apt - # run: | - # sudo apt-get update - # sudo apt-get install $MPI - # env: - # MPI: ${{ matrix.mpi }} - - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - - # - uses: julia-actions/setup-julia@v1 - # with: - # version: ${{ matrix.version }} - # arch: x64 - - # - uses: julia-actions/cache@v1 - - # - name: use system MPI - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_system_binary() + # adapted from MPI.jl + test-system-MPI-apt: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + os: + - ubuntu-latest + mpi: + - libopenmpi-dev + env: + JULIA_MPI_TEST_BINARY: system + steps: + - uses: actions/checkout@v3 + + - name: Install MPI via apt + run: | + sudo apt-get update + sudo apt-get install $MPI + env: + MPI: ${{ matrix.mpi }} + + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: x64 + + - uses: julia-actions/cache@v1 + + - name: use system MPI + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_system_binary() - # - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-runtest@v1 # docs: # name: Documentation diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index b4a894aa3..94ba2ff3d 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,7 +62,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = Pigeons.extra_mpi_args() + mpi_args = extra_mpi_args() mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` logfile = "Pigeons.log" diff --git a/test/misc.jl b/test/misc.jl index 7654799af..ec24109d5 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,7 +8,7 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - mpi_args = extra_mpi_args() + mpi_args = Pigeons.extra_mpi_args() run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end end \ No newline at end of file From 5ba839e36ba7900f3ca2f130c325619d3df3f186 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 08:07:52 -0800 Subject: [PATCH 49/63] Add back libmpich-dev to resume investigation on ghostbug --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7df61a6c4..3c79cfa50 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -108,6 +108,7 @@ jobs: - ubuntu-latest mpi: - libopenmpi-dev + - libmpich-dev env: JULIA_MPI_TEST_BINARY: system steps: From 2b37c0f83e02296a1cb1768b3e8682aef4cc4717 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:02:42 -0800 Subject: [PATCH 50/63] Trying to simplify CI setup needed to reproduce ghostbug --- .github/workflows/CI.yml | 65 +++++++------- test/runtests.jl | 178 +++++++++++++++++++++------------------ 2 files changed, 127 insertions(+), 116 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3c79cfa50..b40ed6a01 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -56,42 +56,42 @@ jobs: # files: lcov.info # test OpenMPI_jll - test-OpenMPI-jll: - name: Julia OpenMPI_jll - ${{ github.event_name }} - runs-on: ${{ matrix.os }} - strategy: - matrix: - version: - - '1.8' - os: - - ubuntu-latest - arch: - - x64 + # test-OpenMPI-jll: + # name: Julia OpenMPI_jll - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # arch: + # - x64 - fail-fast: false - env: - JULIA_MPI_TEST_BINARY: OpenMPI_jll - JULIA_MPI_TEST_ABI: OpenMPI - steps: - - name: Checkout - uses: actions/checkout@v3 + # fail-fast: false + # env: + # JULIA_MPI_TEST_BINARY: OpenMPI_jll + # JULIA_MPI_TEST_ABI: OpenMPI + # steps: + # - name: Checkout + # uses: actions/checkout@v3 - - uses: julia-actions/setup-julia@latest - with: - version: ${{ matrix.version }} - arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 + # - uses: julia-actions/setup-julia@latest + # with: + # version: ${{ matrix.version }} + # arch: ${{ matrix.arch }} + # - uses: julia-actions/cache@v1 - - name: use OpenMPI_jll - shell: julia --color=yes --project=test {0} - run: | - using Pkg - Pkg.instantiate() - using MPIPreferences - MPIPreferences.use_jll_binary("OpenMPI_jll") - rm("test/Manifest.toml") + # - name: use OpenMPI_jll + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_jll_binary("OpenMPI_jll") + # rm("test/Manifest.toml") - - uses: julia-actions/julia-runtest@latest + # - uses: julia-actions/julia-runtest@latest @@ -107,7 +107,6 @@ jobs: os: - ubuntu-latest mpi: - - libopenmpi-dev - libmpich-dev env: JULIA_MPI_TEST_BINARY: system diff --git a/test/runtests.jl b/test/runtests.jl index b359afe46..0dad559cb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -77,94 +77,106 @@ end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] - # Turing: - pigeons( - target = TuringLogPotential(flip_model_unidentifiable()), - n_rounds = 13, - checked_round = 3, - multithreaded = true, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], - n_local_mpi_processes = n_mpis, - n_threads = 2)) - # Blang: - if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - Pigeons.setup_blang("blangDemos") - pigeons(; - target = Pigeons.blang_ising(), - n_rounds = 13, - checked_round = 3, - recorder_builders = recorder_builders, - multithreaded = true, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - end -end -@testset "Longer MPI" begin - n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf - recorder_builders = [] + # test swapper pigeons( target = Pigeons.TestSwapper(0.5), - n_rounds = 14, - checked_round = 12, - n_chains = 200, - multithreaded = false, + n_rounds = 10, + checked_round = 3, recorder_builders = recorder_builders, checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 1)) + n_threads = 2)) + + # # Turing: + # pigeons( + # target = TuringLogPotential(flip_model_unidentifiable()), + # n_rounds = 10, + # checked_round = 3, + # multithreaded = true, + # recorder_builders = recorder_builders, + # checkpoint = true, + # on = ChildProcess( + # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + # n_local_mpi_processes = n_mpis, + # n_threads = 2)) + # # Blang: + # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + # Pigeons.setup_blang("blangDemos") + # pigeons(; + # target = Pigeons.blang_ising(), + # n_rounds = 10, + # checked_round = 3, + # recorder_builders = recorder_builders, + # multithreaded = true, + # checkpoint = true, + # on = ChildProcess( + # n_local_mpi_processes = n_mpis, + # n_threads = 2)) + # end end -@testset "Entanglement" begin - mpi_test(1, "entanglement_test.jl") - mpi_test(2, "entanglement_test.jl") +# @testset "Longer MPI" begin +# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf +# recorder_builders = [] +# pigeons( +# target = Pigeons.TestSwapper(0.5), +# n_rounds = 14, +# checked_round = 12, +# n_chains = 200, +# multithreaded = false, +# recorder_builders = recorder_builders, +# checkpoint = true, +# on = ChildProcess( +# n_local_mpi_processes = n_mpis, +# n_threads = 1)) +# end - mpi_test(1, "reduce_test.jl") - mpi_test(2, "reduce_test.jl") - mpi_test(3, "reduce_test.jl") -end +# @testset "Entanglement" begin +# mpi_test(1, "entanglement_test.jl") +# mpi_test(2, "entanglement_test.jl") -@testset "PermutedDistributedArray" begin - mpi_test(1, "permuted_test.jl", options = ["-s"]) - mpi_test(1, "permuted_test.jl") - mpi_test(2, "permuted_test.jl") -end +# mpi_test(1, "reduce_test.jl") +# mpi_test(2, "reduce_test.jl") +# mpi_test(3, "reduce_test.jl") +# end -@testset "LoadBalance" begin - for i in 1:20 - for j in i:30 - test_load_balance(i, j) - end - end -end +# @testset "PermutedDistributedArray" begin +# mpi_test(1, "permuted_test.jl", options = ["-s"]) +# mpi_test(1, "permuted_test.jl") +# mpi_test(2, "permuted_test.jl") +# end + +# @testset "LoadBalance" begin +# for i in 1:20 +# for j in i:30 +# test_load_balance(i, j) +# end +# end +# end -@testset "LogSum" begin - m = Pigeons.LogSum() +# @testset "LogSum" begin +# m = Pigeons.LogSum() - fit!(m, 2.1) - fit!(m, 4) - v1 = value(m) - @assert v1 ≈ log(exp(2.1) + exp(4)) - - - fit!(m, 2.1) - fit!(m, 4) - m2 = Pigeons.LogSum() - fit!(m2, 50.1) - combined = merge(m, m2) - @assert value(combined) ≈ log(exp(v1) + exp(50.1)) - - fit!(m, 2.1) - fit!(m, 4) - empty!(m) - @assert value(m) == -Pigeons.inf(0.0) -end +# fit!(m, 2.1) +# fit!(m, 4) +# v1 = value(m) +# @assert v1 ≈ log(exp(2.1) + exp(4)) + + +# fit!(m, 2.1) +# fit!(m, 4) +# m2 = Pigeons.LogSum() +# fit!(m2, 50.1) +# combined = merge(m, m2) +# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + +# fit!(m, 2.1) +# fit!(m, 4) +# empty!(m) +# @assert value(m) == -Pigeons.inf(0.0) +# end function test_split_slice() # test disjoint random streams @@ -183,17 +195,17 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -@testset "split_test" begin - test_split_slice() -end +# @testset "split_test" begin +# test_split_slice() +# end -@testset "Serialize" begin - mpi_test(1, "serialization_test.jl") -end +# @testset "Serialize" begin +# mpi_test(1, "serialization_test.jl") +# end -@testset "SliceSampler" begin - test_slice_sampler() -end +# @testset "SliceSampler" begin +# test_slice_sampler() +# end # clean-up logs ls = readdir() From 08479015c02e32611ae9960567a944fcd2e1a912 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:03:57 -0800 Subject: [PATCH 51/63] Fix last commit --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 0dad559cb..91ae3ea66 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -80,7 +80,7 @@ end # test swapper pigeons( - target = Pigeons.TestSwapper(0.5), + target = toy_mvn_target(1), n_rounds = 10, checked_round = 3, recorder_builders = recorder_builders, From f75a280352ede950ef89f753f62718db9a311032 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Fri, 10 Mar 2023 09:16:58 -0800 Subject: [PATCH 52/63] toy_mvn not enough to manifest ghostbug, trying Turing --- test/runtests.jl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 91ae3ea66..9812eb08b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -79,28 +79,28 @@ end recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] # test swapper - pigeons( - target = toy_mvn_target(1), - n_rounds = 10, - checked_round = 3, - recorder_builders = recorder_builders, - checkpoint = true, - on = ChildProcess( - n_local_mpi_processes = n_mpis, - n_threads = 2)) - - # # Turing: # pigeons( - # target = TuringLogPotential(flip_model_unidentifiable()), + # target = toy_mvn_target(1), # n_rounds = 10, # checked_round = 3, - # multithreaded = true, # recorder_builders = recorder_builders, # checkpoint = true, # on = ChildProcess( - # dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], # n_local_mpi_processes = n_mpis, - # n_threads = 2)) + # n_threads = 2)) + + # # Turing: + pigeons( + target = TuringLogPotential(flip_model_unidentifiable()), + n_rounds = 10, + checked_round = 3, + multithreaded = true, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], + n_local_mpi_processes = n_mpis, + n_threads = 2)) # # Blang: # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf # Pigeons.setup_blang("blangDemos") From 93033fc3530dbfd7dc6141e0c88ede97c0fb9dad Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 12:55:38 -0800 Subject: [PATCH 53/63] test mpich+openmpi using brew --- .github/workflows/CI.yml | 68 ++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index b40ed6a01..6e16c0d8a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -92,11 +92,10 @@ jobs: # rm("test/Manifest.toml") # - uses: julia-actions/julia-runtest@latest - - - + + # test system MPI using Brew in macOS # adapted from MPI.jl - test-system-MPI-apt: + test-system-MPI-brew: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} runs-on: ${{ matrix.os }} strategy: @@ -105,18 +104,18 @@ jobs: version: - '1.8' os: - - ubuntu-latest + - macos-latest mpi: - - libmpich-dev + - mpich + - openmpi env: JULIA_MPI_TEST_BINARY: system + ZES_ENABLE_SYSMAN: 1 # https://github.com/open-mpi/ompi/issues/10142 steps: - uses: actions/checkout@v3 - - name: Install MPI via apt - run: | - sudo apt-get update - sudo apt-get install $MPI + - name: Install MPI via homebrew + run: brew install $MPI env: MPI: ${{ matrix.mpi }} @@ -139,9 +138,58 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() + rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 + # # adapted from MPI.jl + # test-system-MPI-apt: + # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} + # runs-on: ${{ matrix.os }} + # strategy: + # fail-fast: false + # matrix: + # version: + # - '1.8' + # os: + # - ubuntu-latest + # mpi: + # - libmpich-dev + # env: + # JULIA_MPI_TEST_BINARY: system + # steps: + # - uses: actions/checkout@v3 + + # - name: Install MPI via apt + # run: | + # sudo apt-get update + # sudo apt-get install $MPI + # env: + # MPI: ${{ matrix.mpi }} + + # - uses: actions/setup-java@v3 + # with: + # distribution: 'temurin' + # java-version: '11' + + # - uses: julia-actions/setup-julia@v1 + # with: + # version: ${{ matrix.version }} + # arch: x64 + + # - uses: julia-actions/cache@v1 + + # - name: use system MPI + # shell: julia --color=yes --project=test {0} + # run: | + # using Pkg + # Pkg.instantiate() + # using MPIPreferences + # MPIPreferences.use_system_binary() + # rm("test/Manifest.toml") + + # - uses: julia-actions/julia-runtest@v1 + # docs: # name: Documentation # runs-on: ubuntu-latest From d46d429151ff60d70a852faefed7326eecbd11a4 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 13:09:17 -0800 Subject: [PATCH 54/63] fix wrong abi detection for mpich --- .github/workflows/CI.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 6e16c0d8a..3eb9ee88a 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -138,6 +138,7 @@ jobs: Pkg.instantiate() using MPIPreferences MPIPreferences.use_system_binary() + run(`sed -i.bu 's/unknown/MPICH/' test/LocalPreferences.toml`) # fix wrong abi detection for mpich rm("test/Manifest.toml") - uses: julia-actions/julia-runtest@v1 From 7a110746587fde6e8b66adc4ae5a0ad0b46d5a75 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 14:11:55 -0800 Subject: [PATCH 55/63] move xtra args to MPI struct + remove prints + failsafe for empty current project --- src/submission/ChildProcess.jl | 40 ++++++++++------------------------ src/submission/MPI.jl | 7 +++++- test/misc.jl | 6 ++++- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 94ba2ff3d..34c47ed50 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -62,34 +62,24 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_args = extra_mpi_args() - mpi_cmd = `$exe $mpi_args -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` - logfile = "Pigeons.log" - println("Launching command\n\tcmd = $cmd\n\tlogfile = $logfile") - try - run(pipeline(cmd; stdout = logfile, stderr = logfile), wait = new_process.wait) - catch e - println("pipeline terminated with non-zero status. Dumping stdout+stderr:\n\n") - open(logfile, "r") do f - println(read(f, String)) - end - rethrow(e) - end + run(cmd, wait = new_process.wait) end end return Result{PT}(exec_folder) end -function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` -end - function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - cur_proj = dirname(Base.current_project()) - @info "forcing instantiate + precompile on project $cur_proj" - run(`$julia_bin --project=$cur_proj -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) # instantiate and precompile before spawning children. otherwise all of them would need to do this and we'd have race conditions on the compilation cache + julia_bin = Base.julia_cmd() + cur_proj = Base.current_project() + if !isnothing(cur_proj) + # instantiate the project to make sure dependencies exist + # also, precompile to issues with coordinating access to compilecache + dir = dirname(cur_proj) + @info "forcing instantiate + precompile on project $dir" + run(`$julia_bin --project=$dir -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) + end script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) return `$julia_bin --project=$cur_proj @@ -142,15 +132,7 @@ function launch_code( # But prototype quote-based syntax seemed more messy.. # NB: using raw".." below to work around windows problem: backslash in paths interpreted as escape, so using suggestion in https://discourse.julialang.org/t/windows-file-path-string-slash-direction-best-way-to-copy-paste/29204 """ - pid=string(getpid()) - println("hello from PID " * pid) - - println(pid * ": wd = " * pwd()) - println(pid * ": active_proj = " * dirname(Base.active_project()) ) - $dependency_declarations - println(pid * ": using Pigeons located @ " * dirname(pathof(Pigeons))) - $silence_code Pigeons.deserialize_immutables(raw"$path_to_serialized_immutables") diff --git a/src/submission/MPI.jl b/src/submission/MPI.jl index 8280fe749..eb74020e4 100644 --- a/src/submission/MPI.jl +++ b/src/submission/MPI.jl @@ -41,6 +41,11 @@ $FIELDS process. """ dependencies::Vector{Module} = [] + + """ + Extra arguments passed to mpiexec. + """ + mpiexec_args::String = "" end """ @@ -96,7 +101,7 @@ function mpi_submission_script(exec_folder, mpi_submission::MPI, julia_cmd) #PBS -e $info_folder/stderr.txt cd \$PBS_O_WORKDIR $(modules_string(mpi_settings)) - mpiexec --merge-stderr-to-stdout --output-filename $exec_folder $julia_cmd_str + mpiexec $(mpi_submission.mpiexec_args) --merge-stderr-to-stdout --output-filename $exec_folder $julia_cmd_str """ script_path = "$exec_folder/.submission_script.sh" write(script_path, code) diff --git a/test/misc.jl b/test/misc.jl index ec24109d5..6846d796d 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -8,7 +8,11 @@ function mpi_test(n_processes::Int, test_file::String; options = []) "$project_folder/test/$test_file" end mpiexec() do exe - mpi_args = Pigeons.extra_mpi_args() + mpi_args = extra_mpi_args() run(`$exe $mpi_args -n $n_processes $(Base.julia_cmd()) --project=$project_folder $resolved_test_file $options`) end +end + +function extra_mpi_args() + MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` end \ No newline at end of file From d89911a8c2c449f319cd0f23c2e3b59789ecdcda Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 14:36:48 -0800 Subject: [PATCH 56/63] mpiexec args for childprocess --- src/submission/ChildProcess.jl | 7 ++++++- test/misc.jl | 2 +- test/runtests.jl | 3 ++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 34c47ed50..7ee8f1f3e 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -41,6 +41,11 @@ $FIELDS When wait is false, the process' I/O streams are directed to devnull. """ wait = true + + """ + Extra arguments passed to mpiexec. + """ + mpiexec_args::String = "" end """ @@ -62,7 +67,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end diff --git a/test/misc.jl b/test/misc.jl index 6846d796d..2cc89ebe2 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -14,5 +14,5 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? `--mca orte_base_help_aggregate 0 --oversubscribe -v` : `` + MPIPreferences.abi == "OpenMPI" ? "--oversubscribe" : "" end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 9812eb08b..5599ad4d9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -100,7 +100,8 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 2)) + n_threads = 2, + mpiexec_args = extra_mpi_args())) # # Blang: # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf # Pigeons.setup_blang("blangDemos") From aa233e8a28a5e80253413635499ef29625622aca Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 19:40:45 -0800 Subject: [PATCH 57/63] re-introduce all CI tests + fix bug in building mpi cmd --- .github/workflows/CI.yml | 197 ++++++++++++++++----------------- src/submission/ChildProcess.jl | 6 +- test/runtests.jl | 5 - 3 files changed, 102 insertions(+), 106 deletions(-) diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3eb9ee88a..d72b641cd 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -14,84 +14,80 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: + + # default test + test-MPICH-jll: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - '1.8' + - 'nightly' + os: + - ubuntu-latest + - macos-latest + - windows-latest + arch: + - x64 + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v2 + with: + files: lcov.info - # test-default: - # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # fail-fast: false - # matrix: - # version: - # - '1.8' - # - 'nightly' - # os: - # - ubuntu-latest - # - macos-latest - # - windows-latest - # arch: - # - x64 - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - # - uses: julia-actions/setup-julia@v1 - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - # - uses: julia-actions/julia-buildpkg@v1 - - # - name: instantiate the test environment - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # - uses: julia-actions/julia-runtest@v1 - # - uses: julia-actions/julia-processcoverage@v1 - # - uses: codecov/codecov-action@v2 - # with: - # files: lcov.info - - # test OpenMPI_jll - # test-OpenMPI-jll: - # name: Julia OpenMPI_jll - ${{ github.event_name }} - # runs-on: ${{ matrix.os }} - # strategy: - # matrix: - # version: - # - '1.8' - # os: - # - ubuntu-latest - # arch: - # - x64 + # test OpenMPI by requesting it with MPIPreferences + # adapted from MPI.jl + test-OpenMPI-jll: + name: Julia OpenMPI_jll - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + version: + - '1.8' + os: + - ubuntu-latest + arch: + - x64 - # fail-fast: false - # env: - # JULIA_MPI_TEST_BINARY: OpenMPI_jll - # JULIA_MPI_TEST_ABI: OpenMPI - # steps: - # - name: Checkout - # uses: actions/checkout@v3 - - # - uses: julia-actions/setup-julia@latest - # with: - # version: ${{ matrix.version }} - # arch: ${{ matrix.arch }} - # - uses: julia-actions/cache@v1 - - # - name: use OpenMPI_jll - # shell: julia --color=yes --project=test {0} - # run: | - # using Pkg - # Pkg.instantiate() - # using MPIPreferences - # MPIPreferences.use_jll_binary("OpenMPI_jll") - # rm("test/Manifest.toml") - - # - uses: julia-actions/julia-runtest@latest + fail-fast: false + env: + JULIA_MPI_TEST_BINARY: OpenMPI_jll + JULIA_MPI_TEST_ABI: OpenMPI + steps: + - name: Checkout + uses: actions/checkout@v3 + + - uses: julia-actions/setup-julia@latest + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v1 + + - name: use OpenMPI_jll + shell: julia --color=yes --project=test {0} + run: | + using Pkg + Pkg.instantiate() + using MPIPreferences + MPIPreferences.use_jll_binary("OpenMPI_jll") + rm("test/Manifest.toml") + + - uses: julia-actions/julia-runtest@latest # test system MPI using Brew in macOS # adapted from MPI.jl @@ -143,7 +139,10 @@ jobs: - uses: julia-actions/julia-runtest@v1 + # # test system MPI using apt in ubuntu # # adapted from MPI.jl + # # TODO: commented out because apt has older versions of MPICH and OMPI that + # # segfault with multithreading. Re-introduce them when apt pkgs are upgraded # test-system-MPI-apt: # name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.mpi }} - ${{ github.event_name }} # runs-on: ${{ matrix.os }} @@ -191,27 +190,27 @@ jobs: # - uses: julia-actions/julia-runtest@v1 - # docs: - # name: Documentation - # runs-on: ubuntu-latest - # permissions: - # contents: write - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'temurin' - # java-version: '11' - # - uses: julia-actions/setup-julia@v1 - # with: - # version: '1' - # - uses: julia-actions/julia-buildpkg@v1 - # - uses: julia-actions/julia-docdeploy@v1 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # - run: | - # julia --project=docs -e ' - # using Documenter: DocMeta, doctest - # using Pigeons - # DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) - # doctest(Pigeons)' + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'temurin' + java-version: '11' + - uses: julia-actions/setup-julia@v1 + with: + version: '1' + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - run: | + julia --project=docs -e ' + using Documenter: DocMeta, doctest + using Pigeons + DocMeta.setdocmeta!(Pigeons, :DocTestSetup, :(using Pigeons); recursive=true) + doctest(Pigeons)' diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 7ee8f1f3e..2afda0c67 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -67,8 +67,10 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` - cmd = `$mpi_cmd $julia_cmd` + args = new_process.mpiexec_args + mpi_cmd = length(args)>0 ? `$exe $args` : `$exe` # need this because `$("")` == `''` != `` + mpi_cmd = `$mpi_cmd -n $(new_process.n_local_mpi_processes)` + cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end end diff --git a/test/runtests.jl b/test/runtests.jl index 5599ad4d9..7a4823274 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -207,8 +207,3 @@ test_split_slice_helper(range) = [rand(r) for r in split_slice(range, Splittabl # @testset "SliceSampler" begin # test_slice_sampler() # end - -# clean-up logs -ls = readdir() -foreach(rm, filter(endswith(".log"), ls)) -foreach(rm, filter(endswith(".err"), ls)) From 42993fdc7d1e5cf23a89ddd9ab4d72af9ee4cf47 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 20:08:55 -0800 Subject: [PATCH 58/63] add support for using without Project.toml --- src/Pigeons.jl | 2 +- src/submission/ChildProcess.jl | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/src/Pigeons.jl b/src/Pigeons.jl index 9d6cc8734..b8b59cf24 100644 --- a/src/Pigeons.jl +++ b/src/Pigeons.jl @@ -8,7 +8,7 @@ import MPI: Comm, Allreduce, Comm_rank, Comm_dup, Request, Waitall, RequestSet, mpiexec, Allreduce, Allgather, Comm_split, isend, recv, - bcast, tag_ub, free + bcast, tag_ub using Base: Forward diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 2afda0c67..d96d6222e 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -78,20 +78,18 @@ function pigeons(pt_arguments, new_process::ChildProcess) end function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, silence_mpi::Bool) - julia_bin = Base.julia_cmd() - cur_proj = Base.current_project() - if !isnothing(cur_proj) + script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) + jl_cmd = Base.julia_cmd() + project_file = Base.current_project() + if !isnothing(project_file) # instantiate the project to make sure dependencies exist - # also, precompile to issues with coordinating access to compilecache - dir = dirname(cur_proj) - @info "forcing instantiate + precompile on project $dir" - run(`$julia_bin --project=$dir -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) + # also, precompile to avoid issues with coordinating access to compile cache + project_dir = dirname(project_file) + jl_cmd = `$jl_cmd --project=$project_dir` + println("Instantiating and pre-compiling project on $project_dir") + run(`$jl_cmd -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) end - script_path = launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) - return `$julia_bin - --project=$cur_proj - --threads=$n_threads - $script_path` + return `$jl_cmd --threads=$n_threads $script_path` end function launch_script(pt_arguments, exec_folder, dependencies, silence_mpi) From 42bdd753f198a1a559ce37fcb326b7ad26c36f7d Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Fri, 10 Mar 2023 20:15:10 -0800 Subject: [PATCH 59/63] add comment explaining why we wait on Isend --- src/mpi_utils/Entangler.jl | 4 +++- src/submission/ChildProcess.jl | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mpi_utils/Entangler.jl b/src/mpi_utils/Entangler.jl index a883b0dfe..52c211425 100644 --- a/src/mpi_utils/Entangler.jl +++ b/src/mpi_utils/Entangler.jl @@ -158,7 +158,7 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic e.current_received_bits .= true at_least_one_mpi = false - requests = RequestSet() + requests = RequestSet() # non-blocking requests that will be waited on # send (or copy if local) for local_index in 1:myload @@ -176,6 +176,8 @@ function transmit!(e::Entangler, source_data::AbstractVector{T}, to_global_indic source_view = Ref{T}(source_datum) mpi_rank = process_index - 1 # asynchronously (non-blocking) send over MPI: + # note: we wait for the Isend request to avoid the application + # terminating in the last iteration without completing its request. request = Isend(source_view, e.communicator, dest = mpi_rank, tag = tag(e, transmit_index, global_index)) push!(requests, request) end diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index d96d6222e..6b7aad2be 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -86,7 +86,6 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil # also, precompile to avoid issues with coordinating access to compile cache project_dir = dirname(project_file) jl_cmd = `$jl_cmd --project=$project_dir` - println("Instantiating and pre-compiling project on $project_dir") run(`$jl_cmd -e "using Pkg; Pkg.instantiate(); Pkg.precompile()"`) end return `$jl_cmd --threads=$n_threads $script_path` From c5cb70a36a7f5f35975b42a621f321aa7b95d7c9 Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 10:54:07 -0800 Subject: [PATCH 60/63] mpiexec_args is a Cmd now --- src/submission/ChildProcess.jl | 16 ++++++++-------- src/submission/MPI.jl | 2 +- test/misc.jl | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/submission/ChildProcess.jl b/src/submission/ChildProcess.jl index 6b7aad2be..c3cf5ba3c 100644 --- a/src/submission/ChildProcess.jl +++ b/src/submission/ChildProcess.jl @@ -19,7 +19,7 @@ $FIELDS (if of type `String`) needed by the child process. """ - dependencies::Vector{Any} = [] + dependencies::Vector = [] # eventually, detect & save which # modules should be loaded? E.g. could use # https://stackoverflow.com/questions/25575406/list-of-loaded-imported-packages-in-julia @@ -34,20 +34,22 @@ $FIELDS third-party target distribution which somehow does not support multi-threading. """ - n_local_mpi_processes = 1 + n_local_mpi_processes::Int = 1 """ If wait is false, the process runs asynchronously. When wait is false, the process' I/O streams are directed to devnull. """ - wait = true + wait::Bool = true """ Extra arguments passed to mpiexec. """ - mpiexec_args::String = "" + mpiexec_args::Cmd = `` end + + """ $SIGNATURES @@ -67,9 +69,7 @@ function pigeons(pt_arguments, new_process::ChildProcess) run(julia_cmd, wait = new_process.wait) else mpiexec() do exe - args = new_process.mpiexec_args - mpi_cmd = length(args)>0 ? `$exe $args` : `$exe` # need this because `$("")` == `''` != `` - mpi_cmd = `$mpi_cmd -n $(new_process.n_local_mpi_processes)` + mpi_cmd = `$exe $(new_process.mpiexec_args) -n $(new_process.n_local_mpi_processes)` cmd = `$mpi_cmd $julia_cmd` run(cmd, wait = new_process.wait) end @@ -82,7 +82,7 @@ function launch_cmd(pt_arguments, exec_folder, dependencies, n_threads::Int, sil jl_cmd = Base.julia_cmd() project_file = Base.current_project() if !isnothing(project_file) - # instantiate the project to make sure dependencies exist + # forcing instantiate the project to make sure dependencies exist # also, precompile to avoid issues with coordinating access to compile cache project_dir = dirname(project_file) jl_cmd = `$jl_cmd --project=$project_dir` diff --git a/src/submission/MPI.jl b/src/submission/MPI.jl index eb74020e4..511a7d564 100644 --- a/src/submission/MPI.jl +++ b/src/submission/MPI.jl @@ -45,7 +45,7 @@ $FIELDS """ Extra arguments passed to mpiexec. """ - mpiexec_args::String = "" + mpiexec_args::Cmd = `` end """ diff --git a/test/misc.jl b/test/misc.jl index 2cc89ebe2..17e3432bb 100644 --- a/test/misc.jl +++ b/test/misc.jl @@ -14,5 +14,5 @@ function mpi_test(n_processes::Int, test_file::String; options = []) end function extra_mpi_args() - MPIPreferences.abi == "OpenMPI" ? "--oversubscribe" : "" + MPIPreferences.abi == "OpenMPI" ? `--oversubscribe` : `` end \ No newline at end of file From 9e31b49affe8e8b099dfc365c86edcb0be6de5bc Mon Sep 17 00:00:00 2001 From: Miguel Biron-Lattes Date: Sat, 11 Mar 2023 11:25:09 -0800 Subject: [PATCH 61/63] re-instate all tests --- test/runtests.jl | 269 ++++++++++++++++++++++++----------------------- test/turing.jl | 2 + 2 files changed, 140 insertions(+), 131 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7a4823274..379f3f6a1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,65 +31,69 @@ function test_load_balance(n_processes, n_tasks) end end -# @testset "System MPI" begin -# if haskey(ENV,"JULIA_MPI_TEST_BINARY") -# @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary -# end -# end - -# @testset "Stepping stone" begin -# pt = pigeons(target = toy_mvn_target(100)); -# p = stepping_stone_pair(pt) -# truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) -# @test abs(p[1] - truth) < 1 -# @test abs(p[2] - truth) < 1 -# end - -# @testset "Round trips" begin -# n_chains = 4 -# n_rounds = 5 +@testset "MPI backend" begin + if haskey(ENV,"JULIA_MPI_TEST_BINARY") + @test ENV["JULIA_MPI_TEST_BINARY"] == MPIPreferences.binary + end + if haskey(ENV,"JULIA_MPI_TEST_ABI") + @test ENV["JULIA_MPI_TEST_ABI"] == MPIPreferences.abi + end +end + +@testset "Stepping stone" begin + pt = pigeons(target = toy_mvn_target(100)); + p = stepping_stone_pair(pt) + truth = Pigeons.analytic_lognormalization(toy_mvn_target(100)) + @test abs(p[1] - truth) < 1 + @test abs(p[2] - truth) < 1 +end + +@testset "Round trips" begin + n_chains = 4 + n_rounds = 5 -# pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); + pt = pigeons(; target = Pigeons.TestSwapper(1.0), recorder_builders = [Pigeons.round_trip], n_chains, n_rounds); -# len = 2^(n_rounds) -# truth = 0.0 -# for i in 0:(n_chains-1) -# truth += floor(max(len - i, 0) / n_chains / 2) -# end - -# @test truth == Pigeons.n_round_trips(pt) -# end - -# @testset "Moments" begin -# pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); -# for var_name in Pigeons.continuous_variables(pt) -# m = mean(pt, var_name) -# for i in eachindex(m) -# @test abs(m[i] - 0.0) < 0.001 -# end -# v = var(pt, var_name) -# for i in eachindex(v) -# @test abs(v[i] - 0.1) < 0.001 -# end -# end -# end + len = 2^(n_rounds) + truth = 0.0 + for i in 0:(n_chains-1) + truth += floor(max(len - i, 0) / n_chains / 2) + end + + @test truth == Pigeons.n_round_trips(pt) +end + +@testset "Moments" begin + pt = pigeons(target = toy_mvn_target(2), recorder_builders = [Pigeons.target_online], n_rounds = 20); + for var_name in Pigeons.continuous_variables(pt) + m = mean(pt, var_name) + for i in eachindex(m) + @test abs(m[i] - 0.0) < 0.001 + end + v = var(pt, var_name) + for i in eachindex(v) + @test abs(v[i] - 0.1) < 0.001 + end + end +end @testset "Parallelism Invariance" begin n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf recorder_builders = [swap_acceptance_pr, index_process, log_sum_ratio, round_trip, energy_ac1] # test swapper - # pigeons( - # target = toy_mvn_target(1), - # n_rounds = 10, - # checked_round = 3, - # recorder_builders = recorder_builders, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 2)) - - # # Turing: + pigeons( + target = toy_mvn_target(1), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + + # Turing: pigeons( target = TuringLogPotential(flip_model_unidentifiable()), n_rounds = 10, @@ -102,82 +106,85 @@ end n_local_mpi_processes = n_mpis, n_threads = 2, mpiexec_args = extra_mpi_args())) - # # Blang: - # if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf - # Pigeons.setup_blang("blangDemos") - # pigeons(; - # target = Pigeons.blang_ising(), - # n_rounds = 10, - # checked_round = 3, - # recorder_builders = recorder_builders, - # multithreaded = true, - # checkpoint = true, - # on = ChildProcess( - # n_local_mpi_processes = n_mpis, - # n_threads = 2)) - # end + + # Blang: + if !Sys.iswindows() # JNI crashes on windows; see commit right after c016f59c84645346692f720854b7531743c728bf + Pigeons.setup_blang("blangDemos") + pigeons(; + target = Pigeons.blang_ising(), + n_rounds = 10, + checked_round = 3, + recorder_builders = recorder_builders, + multithreaded = true, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) + end end -# @testset "Longer MPI" begin -# n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf -# recorder_builders = [] -# pigeons( -# target = Pigeons.TestSwapper(0.5), -# n_rounds = 14, -# checked_round = 12, -# n_chains = 200, -# multithreaded = false, -# recorder_builders = recorder_builders, -# checkpoint = true, -# on = ChildProcess( -# n_local_mpi_processes = n_mpis, -# n_threads = 1)) -# end - -# @testset "Entanglement" begin -# mpi_test(1, "entanglement_test.jl") -# mpi_test(2, "entanglement_test.jl") - -# mpi_test(1, "reduce_test.jl") -# mpi_test(2, "reduce_test.jl") -# mpi_test(3, "reduce_test.jl") -# end - -# @testset "PermutedDistributedArray" begin -# mpi_test(1, "permuted_test.jl", options = ["-s"]) -# mpi_test(1, "permuted_test.jl") -# mpi_test(2, "permuted_test.jl") -# end - -# @testset "LoadBalance" begin -# for i in 1:20 -# for j in i:30 -# test_load_balance(i, j) -# end -# end -# end - -# @testset "LogSum" begin -# m = Pigeons.LogSum() - -# fit!(m, 2.1) -# fit!(m, 4) -# v1 = value(m) -# @assert v1 ≈ log(exp(2.1) + exp(4)) +@testset "Longer MPI" begin + n_mpis = Sys.iswindows() ? 1 : 4 # MPI on child process crashes on windows; see c016f59c84645346692f720854b7531743c728bf + recorder_builders = [] + pigeons( + target = Pigeons.TestSwapper(0.5), + n_rounds = 14, + checked_round = 12, + n_chains = 200, + multithreaded = false, + recorder_builders = recorder_builders, + checkpoint = true, + on = ChildProcess( + n_local_mpi_processes = n_mpis, + n_threads = 2, + mpiexec_args = extra_mpi_args())) +end +@testset "Entanglement" begin + mpi_test(1, "entanglement_test.jl") + mpi_test(2, "entanglement_test.jl") -# fit!(m, 2.1) -# fit!(m, 4) -# m2 = Pigeons.LogSum() -# fit!(m2, 50.1) -# combined = merge(m, m2) -# @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + mpi_test(1, "reduce_test.jl") + mpi_test(2, "reduce_test.jl") + mpi_test(3, "reduce_test.jl") +end -# fit!(m, 2.1) -# fit!(m, 4) -# empty!(m) -# @assert value(m) == -Pigeons.inf(0.0) -# end +@testset "PermutedDistributedArray" begin + mpi_test(1, "permuted_test.jl", options = ["-s"]) + mpi_test(1, "permuted_test.jl") + mpi_test(2, "permuted_test.jl") +end + +@testset "LoadBalance" begin + for i in 1:20 + for j in i:30 + test_load_balance(i, j) + end + end +end + +@testset "LogSum" begin + m = Pigeons.LogSum() + + fit!(m, 2.1) + fit!(m, 4) + v1 = value(m) + @assert v1 ≈ log(exp(2.1) + exp(4)) + + + fit!(m, 2.1) + fit!(m, 4) + m2 = Pigeons.LogSum() + fit!(m2, 50.1) + combined = merge(m, m2) + @assert value(combined) ≈ log(exp(v1) + exp(50.1)) + + fit!(m, 2.1) + fit!(m, 4) + empty!(m) + @assert value(m) == -Pigeons.inf(0.0) +end function test_split_slice() # test disjoint random streams @@ -196,14 +203,14 @@ end test_split_slice_helper(range) = [rand(r) for r in split_slice(range, SplittableRandom(1))] -# @testset "split_test" begin -# test_split_slice() -# end +@testset "split_test" begin + test_split_slice() +end -# @testset "Serialize" begin -# mpi_test(1, "serialization_test.jl") -# end +@testset "Serialize" begin + mpi_test(1, "serialization_test.jl") +end -# @testset "SliceSampler" begin -# test_slice_sampler() -# end +@testset "SliceSampler" begin + test_slice_sampler() +end diff --git a/test/turing.jl b/test/turing.jl index d15d80143..f894bd40a 100644 --- a/test/turing.jl +++ b/test/turing.jl @@ -1,3 +1,5 @@ +# note: the models here don't use `filldist` in order to avoid importing +# Turing, which was crashing as of 2023-03-06 # Unconditioned coinflip model with `N` observations. @model function coinflip(y) p ~ Beta(1, 12) From f93548d105926c5d9dfa155416ec21b8b097142b Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Sat, 11 Mar 2023 16:01:33 -0800 Subject: [PATCH 62/63] Do we have the same bug with 1 thread --- test/runtests.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 379f3f6a1..8e63491ed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -90,7 +90,7 @@ end checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 2, + n_threads = 1, mpiexec_args = extra_mpi_args())) # Turing: @@ -104,7 +104,7 @@ end on = ChildProcess( dependencies = [Distributions, DynamicPPL, LinearAlgebra, "turing.jl"], n_local_mpi_processes = n_mpis, - n_threads = 2, + n_threads = 1, mpiexec_args = extra_mpi_args())) # Blang: @@ -119,7 +119,7 @@ end checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 2, + n_threads = 1, mpiexec_args = extra_mpi_args())) end end @@ -137,7 +137,7 @@ end checkpoint = true, on = ChildProcess( n_local_mpi_processes = n_mpis, - n_threads = 2, + n_threads = 1, mpiexec_args = extra_mpi_args())) end From 495b4ebaad99a56294209fb3519f4f9f28e6ef43 Mon Sep 17 00:00:00 2001 From: Alexandre Bouchard Date: Sat, 11 Mar 2023 16:42:35 -0800 Subject: [PATCH 63/63] Explicitly disable GC in all pigeons runs --- src/pt/pigeons.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pt/pigeons.jl b/src/pt/pigeons.jl index 8e0c8055e..9ba12a97b 100644 --- a/src/pt/pigeons.jl +++ b/src/pt/pigeons.jl @@ -10,6 +10,8 @@ This will also call [`report()`](@ref), [`write_checkpoint()`](@ref), and [`run_checks()`](@ref) between rounds. """ function pigeons(pt::PT) + @warn "TEMPORARY - GC DISABLED FOR TESTING" + GC.enable(false) preflight_checks(pt) while next_round!(pt) # NB: while-loop instead of for-loop to support resuming from checkpoint reduced_recorders = run_one_round!(pt)