diff --git a/base/sysimg.jl b/base/sysimg.jl index 7d3826eb13bdc..6f7219afda550 100644 --- a/base/sysimg.jl +++ b/base/sysimg.jl @@ -27,6 +27,7 @@ let task.rngState1 = 0x7431eaead385992c task.rngState2 = 0x503e1d32781c2608 task.rngState3 = 0x3a77f7189200c20b + task.rngState4 = 0x5502376d099035ae # Stdlibs sorted in dependency, then alphabetical, order by contrib/print_sorted_stdlibs.jl # Run with the `--exclude-jlls` option to filter out all JLL packages diff --git a/src/gc.c b/src/gc.c index 9d5d49fa4fc53..c780529406b8c 100644 --- a/src/gc.c +++ b/src/gc.c @@ -501,9 +501,9 @@ static void jl_gc_run_finalizers_in_list(jl_task_t *ct, arraylist_t *list) JL_NO ct->sticky = sticky; } -static uint64_t finalizer_rngState[4]; +static uint64_t finalizer_rngState[JL_RNG_SIZE]; -void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT; +void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_gc_init_finalizer_rng_state(void) { @@ -532,7 +532,7 @@ static void run_finalizers(jl_task_t *ct) jl_atomic_store_relaxed(&jl_gc_have_pending_finalizers, 0); arraylist_new(&to_finalize, 0); - uint64_t save_rngState[4]; + uint64_t save_rngState[JL_RNG_SIZE]; memcpy(&save_rngState[0], &ct->rngState[0], sizeof(save_rngState)); jl_rng_split(ct->rngState, finalizer_rngState); diff --git a/src/jltypes.c b/src/jltypes.c index 0439c8d034e6b..b4146aae4d44c 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -2768,7 +2768,7 @@ void jl_init_types(void) JL_GC_DISABLED NULL, jl_any_type, jl_emptysvec, - jl_perm_symsvec(15, + jl_perm_symsvec(16, "next", "queue", "storage", @@ -2780,11 +2780,12 @@ void jl_init_types(void) JL_GC_DISABLED "rngState1", "rngState2", "rngState3", + "rngState4", "_state", "sticky", "_isexception", "priority"), - jl_svec(15, + jl_svec(16, jl_any_type, jl_any_type, jl_any_type, @@ -2796,6 +2797,7 @@ void jl_init_types(void) JL_GC_DISABLED jl_uint64_type, jl_uint64_type, jl_uint64_type, + jl_uint64_type, jl_uint8_type, jl_bool_type, jl_bool_type, diff --git a/src/julia.h b/src/julia.h index 2186ee346418d..acfc4fc1e1caf 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1910,6 +1910,8 @@ typedef struct _jl_handler_t { size_t world_age; } jl_handler_t; +#define JL_RNG_SIZE 5 // xoshiro 4 + splitmix 1 + typedef struct _jl_task_t { JL_DATA_TYPE jl_value_t *next; // invasive linked list for scheduler @@ -1921,7 +1923,7 @@ typedef struct _jl_task_t { jl_function_t *start; // 4 byte padding on 32-bit systems // uint32_t padding0; - uint64_t rngState[4]; + uint64_t rngState[JL_RNG_SIZE]; _Atomic(uint8_t) _state; uint8_t sticky; // record whether this Task can be migrated to a new thread _Atomic(uint8_t) _isexception; // set if `result` is an exception to throw or that we exited with diff --git a/src/task.c b/src/task.c index 7102cdec35d08..4228da9f32b07 100644 --- a/src/task.c +++ b/src/task.c @@ -866,28 +866,187 @@ uint64_t jl_genrandom(uint64_t rngState[4]) JL_NOTSAFEPOINT return res; } -void jl_rng_split(uint64_t to[4], uint64_t from[4]) JL_NOTSAFEPOINT +/* +The jl_rng_split function forks a task's RNG state in a way that is essentially +guaranteed to avoid collisions between the RNG streams of all tasks. The main +RNG is the xoshiro256++ RNG whose state is stored in rngState[0..3]. There is +also a small internal RNG used for task forking stored in rngState[4]. This +state is used to iterate a LCG (linear congruential generator), which is then +put through four different variations of the strongest PCG output function, +referred to as PCG-RXS-M-XS-64 [1]. This output function is invertible: it maps +a 64-bit state to 64-bit output; which is one of the reasons it's not +recommended for general purpose RNGs unless space is at a premium, but in our +usage invertibility is actually a benefit, as is explained below. + +The goal of jl_rng_split is to perturb the state of each child task's RNG in +such a way each that for an entire tree of tasks spawned starting with a given +state in a root task, no two tasks have the same RNG state. Moreover, we want to +do this in a way that is deterministic and repeatable based on (1) the root +task's seed, (2) how many random numbers are generated, and (3) the task tree +structure. The RNG state of a parent task is allowed to affect the initial RNG +state of a child task, but the mere fact that a child was spawned should not +alter the RNG output of the parent. This second requirement rules out using the +main RNG to seed children -- some separate state must be maintained and changed +upon forking a child task while leaving the main RNG state unchanged. + +The basic approach is that used by the DotMix [2] and SplitMix [3] RNG systems: +each task is uniquely identified by a sequence of "pedigree" numbers, indicating +where in the task tree it was spawned. This vector of pedigree coordinates is +then reduced to a single value by computing a dot product with a common vector +of random weights. The DotMix paper provides a proof that this dot product hash +value (referred to as a "compression function") is collision resistant in the +sense the the pairwise collision probability of two distinct tasks is 1/N where +N is the number of possible weight values. Both DotMix and SplitMix use a prime +value of N because the proof requires that the difference between two distinct +pedigree coordinates must be invertible, which is guaranteed by N being prime. +We take a different approach: we instead limit pedigree coordinates to being +binary instead -- when a task spawns a child, both tasks share the same pedigree +prefix, with the parent appending a zero and the child appending a one. This way +a binary pedigree vector uniquely identifies each task. Moreover, since the +coordinates are binary, the difference between coordinates is always one which +is its own inverse regardless of whether N is prime or not. This allows us to +compute the dot product modulo 2^64 using native machine arithmetic, which is +considerably more efficient and simpler to implement than arithmetic in a prime +modulus. It also means that when accumulating the dot product incrementally, as +described in SplitMix, we don't need to multiply weights by anything, we simply +add the random weight for the current task tree depth to the parent's dot +product to derive the child's dot product. + +We use the LCG in rngState[4] to derive generate pseudorandom weights for the +dot product. Each time a child is forked, we update the LCG in both parent and +child tasks. In the parent, that's all we have to do -- the main RNG state +remains unchanged (recall that spawning a child should *not* affect subsequence +RNG draws in the parent). The next time the parent forks a child, the dot +product weight used will be different, corresponding to being a level deeper in +the binary task tree. In the child, we use the LCG state to generate four +pseudorandom 64-bit weights (more below) and add each weight to one of the +xoshiro256 state registers, rngState[0..3]. If we assume the main RNG remains +unused in all tasks, then each register rngState[0..3] accumulates a different +Dot/SplitMix dot product hash as additional child tasks are spawned. Each one is +collision resistant with a pairwise collision chance of only 1/2^64. Assuming +that the four pseudorandom 64-bit weight streams are sufficiently independent, +the pairwise collision probability for distinct tasks is 1/2^256. If we somehow +managed to spawn a trillion tasks, the probability of a collision would be on +the order of 1/10^54. Practically impossible. Put another way, this is the same +as the probability of two SHA256 hash values accidentally colliding, which we +generally consider so unlikely as not to be worth worrying about. + +What about the random "junk" that's in the xoshiro256 state registers from +normal use of the RNG? For a tree of tasks spawned with no intervening samples +taken from the main RNG, all tasks start with the same junk which doesn't affect +the chance of collision. The Dot/SplitMix papers even suggest adding a random +base value to the dot product, so we can consider whatever happens to be in the +xoshiro256 registers to be that. What if the main RNG gets used between task +forks? In that case, the initial state registers will be different. The DotMix +collision resistance proof doesn't apply without modification, but we can +generalize the setup by adding a different base constant to each compression +function and observe that we still have a 1/N chance of the weight value +matching that exact difference. This proves collision resistance even between +tasks whose dot product hashes are computed with arbitrary offsets. We can +conclude that this scheme provides collision resistance even in the face of +different starting states of the main RNG. Does this seem too good to be true? +Perhaps another way of thinking about it will help. Suppose we seeded each task +completely randomly. Then there would also be a 1/2^256 chance of collision, +just as the DotMix proof gives. Essentially what the proof is telling us is that +if the weights are chosen uniformly and uncorrelated with the rest of the +compression function, then the dot product construction is a good enough way to +pseudorandomly seed each task. From that perspective, it's easier to believe +that adding an arbitrary constant to each seed doesn't worsen its randomness. + +This leaves us with the question of how to generate four pseudorandom weights to +add to the rngState[0..3] registers at each depth of the task tree. The scheme +used here is that a single 64-bit LCG state is iterated in both parent and child +at each task fork, and four different variations of the PCG-RXS-M-XS-64 output +function are applied to that state to generate four different pseudorandom +weights. Another obvious way to generate four weights would be to iterate the +LCG four times per task split. There are two main reasons we've chosen to use +four output variants instead: + +1. Advancing four times per fork reduces the set of possible weights that each + register can be perturbed by from 2^64 to 2^60. Since collision resistance is + proportional to the number of possible weight values, that would reduce + collision resistance. + +2. It's easier to compute four PCG output variants in parallel. Iterating the + LCG is inherently sequential. Each PCG variant can be computed independently + from the LCG state. All four can even be computed at once with SIMD vector + instructions, but the compiler doesn't currently choose to do that. + +A key question is whether the approach of using four variations of PCG-RXS-M-XS +is sufficiently random both within and between streams to provide the collision +resistance we expect. We obviously can't test that with 256 bits, but we have +tested it with a reduced state analogue using four PCG-RXS-M-XS-8 output +variations applied to a common 8-bit LCG. Test results do indicate sufficient +independence: a single register has collisions at 2^5 while four registers only +start having collisions at 2^20, which is actually better scaling of collision +resistance than we expect in theory. In theory, with one byte of resistance we +have a 50% chance of some collision at 20, which matches, but four bytes gives a +50% chance of collision at 2^17 and our (reduced size analogue) construction is +still collision free at 2^19. This may be due to the next observation, which guarantees collision avoidance for certain shapes of task trees as a result of using an +invertible RNG to generate weights. + +In the specific case where a parent task spawns a sequence of child tasks with +no intervening usage of its main RNG, the parent and child tasks are actually +_guaranteed_ to have different RNG states. This is true because the four PCG +streams each produce every possible 2^64 bit output exactly once in the full +2^64 period of the LCG generator. This is considered a weakness of PCG-RXS-M-XS +when used as a general purpose RNG, but is quite beneficial in this application. +Since each of up to 2^64 children will be perturbed by different weights, they +cannot have hash collisions. What about parent colliding with child? That can +only happen if all four main RNG registers are perturbed by exactly zero. This +seems unlikely, but could it occur? Consider this part of each output function: + + p ^= p >> ((p >> 59) + 5); + p *= m[i]; + p ^= p >> 43 + +It's easy to check that this maps zero to zero. An unchanged parent RNG can only +happen if all four `p` values are zero at the end of this, which implies that +they were all zero at the beginning. However, that is impossible since the four +`p` values differ from `x` by different additive constants, so they cannot all +be zero. Stated more generally, this non-collision property: assuming the main +RNG isn't used between task forks, sibling and parent tasks cannot have RNG +collisions. If the task tree structure is more deeply nested or if there are +intervening uses of the main RNG, we're back to relying on "merely" 256 bits of +collision resistance, but it's nice to know that in what is likely the most +common case, RNG collisions are actually impossible. This fact may also explain +better-than-theoretical collision resistance observed in our experiment with a +reduced size analogue of our hashing system. + +[1]: https://www.pcg-random.org/pdf/hmc-cs-2014-0905.pdf + +[2]: http://supertech.csail.mit.edu/papers/dprng.pdf + +[3]: https://gee.cs.oswego.edu/dl/papers/oopsla14.pdf +*/ +void jl_rng_split(uint64_t dst[JL_RNG_SIZE], uint64_t src[JL_RNG_SIZE]) { - /* TODO: consider a less ad-hoc construction - Ideally we could just use the output of the random stream to seed the initial - state of the child. Out of an overabundance of caution we multiply with - effectively random coefficients, to break possible self-interactions. - - It is not the goal to mix bits -- we work under the assumption that the - source is well-seeded, and its output looks effectively random. - However, xoshiro has never been studied in the mode where we seed the - initial state with the output of another xoshiro instance. - - Constants have nothing up their sleeve: - 0x02011ce34bce797f == hash(UInt(1))|0x01 - 0x5a94851fb48a6e05 == hash(UInt(2))|0x01 - 0x3688cf5d48899fa7 == hash(UInt(3))|0x01 - 0x867b4bb4c42e5661 == hash(UInt(4))|0x01 - */ - to[0] = 0x02011ce34bce797f * jl_genrandom(from); - to[1] = 0x5a94851fb48a6e05 * jl_genrandom(from); - to[2] = 0x3688cf5d48899fa7 * jl_genrandom(from); - to[3] = 0x867b4bb4c42e5661 * jl_genrandom(from); + // load and advance the internal LCG state + uint64_t x = src[4]; + src[4] = dst[4] = x * 0xd1342543de82ef95 + 1; + // high spectrum multiplier from https://arxiv.org/abs/2001.05304 + + static const uint64_t a[4] = { + 0xe5f8fa077b92a8a8, // random additive offsets... + 0x7a0cd918958c124d, + 0x86222f7d388588d4, + 0xd30cbd35f2b64f52 + }; + static const uint64_t m[4] = { + 0xaef17502108ef2d9, // standard PCG multiplier + 0xf34026eeb86766af, // random odd multipliers... + 0x38fd70ad58dd9fbb, + 0x6677f9b93ab0c04d + }; + + // PCG-RXS-M-XS output with four variants + for (int i = 0; i < 4; i++) { + uint64_t p = x + a[i]; + p ^= p >> ((p >> 59) + 5); + p *= m[i]; + p ^= p >> 43; + dst[i] = src[i] + p; // SplitMix dot product + } } JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, jl_value_t *completion_future, size_t ssize) diff --git a/stdlib/Random/src/Xoshiro.jl b/stdlib/Random/src/Xoshiro.jl index 5b8aa4644d140..58e9f988f56e7 100644 --- a/stdlib/Random/src/Xoshiro.jl +++ b/stdlib/Random/src/Xoshiro.jl @@ -113,12 +113,17 @@ struct TaskLocalRNG <: AbstractRNG end TaskLocalRNG(::Nothing) = TaskLocalRNG() rng_native_52(::TaskLocalRNG) = UInt64 -function setstate!(x::TaskLocalRNG, s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64) +function setstate!( + x::TaskLocalRNG, + s0::UInt64, s1::UInt64, s2::UInt64, s3::UInt64, # xoshiro256 state + s4::UInt64 = hash((s0, s1, s2, s3)), # splitmix weight rng state +) t = current_task() t.rngState0 = s0 t.rngState1 = s1 t.rngState2 = s2 t.rngState3 = s3 + t.rngState4 = s4 x end @@ -128,11 +133,11 @@ end tmp = s0 + s3 res = ((tmp << 23) | (tmp >> 41)) + s0 t = s1 << 17 - s2 = xor(s2, s0) - s3 = xor(s3, s1) - s1 = xor(s1, s2) - s0 = xor(s0, s3) - s2 = xor(s2, t) + s2 ⊻= s0 + s3 ⊻= s1 + s1 ⊻= s2 + s0 ⊻= s3 + s2 ⊻= t s3 = s3 << 45 | s3 >> 19 task.rngState0, task.rngState1, task.rngState2, task.rngState3 = s0, s1, s2, s3 res @@ -159,7 +164,7 @@ seed!(rng::Union{TaskLocalRNG, Xoshiro}, seed::Integer) = seed!(rng, make_seed(s @inline function rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{UInt128}) first = rand(rng, UInt64) second = rand(rng,UInt64) - second + UInt128(first)<<64 + second + UInt128(first) << 64 end @inline rand(rng::Union{TaskLocalRNG, Xoshiro}, ::SamplerType{Int128}) = rand(rng, UInt128) % Int128 @@ -178,14 +183,14 @@ end function copy!(dst::TaskLocalRNG, src::Xoshiro) t = current_task() - t.rngState0, t.rngState1, t.rngState2, t.rngState3 = src.s0, src.s1, src.s2, src.s3 - dst + setstate!(dst, src.s0, src.s1, src.s2, src.s3) + return dst end function copy!(dst::Xoshiro, src::TaskLocalRNG) t = current_task() - dst.s0, dst.s1, dst.s2, dst.s3 = t.rngState0, t.rngState1, t.rngState2, t.rngState3 - dst + setstate!(dst, t.rngState0, t.rngState1, t.rngState2, t.rngState3) + return dst end function ==(a::Xoshiro, b::TaskLocalRNG) diff --git a/stdlib/Random/test/runtests.jl b/stdlib/Random/test/runtests.jl index 4e8d3b4ffb39a..3f570d862b743 100644 --- a/stdlib/Random/test/runtests.jl +++ b/stdlib/Random/test/runtests.jl @@ -1018,3 +1018,50 @@ guardseed() do @test f42752(true) === val end end + +@testset "TaskLocalRNG: stream collision smoke test" begin + # spawn a trinary tree of tasks: + # - spawn three recursive child tasks in each + # - generate a random UInt64 in each before, after and between + # - collect and count all the generated random values + # these should all be distinct across all tasks + function gen(d) + r = rand(UInt64) + vals = [r] + if d ≥ 0 + append!(vals, gent(d - 1)) + isodd(r) && append!(vals, gent(d - 1)) + push!(vals, rand(UInt64)) + iseven(r) && append!(vals, gent(d - 1)) + end + push!(vals, rand(UInt64)) + end + gent(d) = fetch(@async gen(d)) + seeds = rand(RandomDevice(), UInt64, 5) + for seed in seeds + Random.seed!(seed) + vals = gen(6) + @test allunique(vals) + end +end + +@testset "TaskLocalRNG: child doesn't affect parent" begin + seeds = rand(RandomDevice(), UInt64, 5) + for seed in seeds + Random.seed!(seed) + x = rand(UInt64) + y = rand(UInt64) + n = 3 + for i = 1:n + Random.seed!(seed) + @sync for j = 0:i + @async rand(UInt64) + end + @test x == rand(UInt64) + @sync for j = 0:(n-i) + @async rand(UInt64) + end + @test y == rand(UInt64) + end + end +end diff --git a/test/llvmpasses/alloc-opt-gcframe.jl b/test/llvmpasses/alloc-opt-gcframe.jl index c5a5b2be86614..e7ddf12d79bc7 100644 --- a/test/llvmpasses/alloc-opt-gcframe.jl +++ b/test/llvmpasses/alloc-opt-gcframe.jl @@ -14,7 +14,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" # CHECK-LABEL: @return_obj # CHECK-NOT: @julia.gc_alloc_obj # CHECK: %current_task = getelementptr inbounds {}*, {}** %gcstack, i64 -12 -# CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +# CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 # CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 # CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** # CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* diff --git a/test/llvmpasses/late-lower-gc-addrspaces.ll b/test/llvmpasses/late-lower-gc-addrspaces.ll index 7497febf1e846..8bdcbdeacf8f3 100644 --- a/test/llvmpasses/late-lower-gc-addrspaces.ll +++ b/test/llvmpasses/late-lower-gc-addrspaces.ll @@ -45,7 +45,7 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 ; CHECK: %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* @@ -70,7 +70,7 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 ; CHECK: %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index 65a67c78d7810..04adfe72ff0b6 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -42,7 +42,7 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 ; CHECK: %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8* @@ -67,7 +67,7 @@ top: %0 = bitcast {}*** %pgcstack to {}** %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 ; CHECK: %current_task = getelementptr inbounds {}*, {}** %0, i64 -12 -; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 15 +; CHECK-NEXT: [[ptls_field:%.*]] = getelementptr inbounds {}*, {}** %current_task, i64 16 ; CHECK-NEXT: [[ptls_load:%.*]] = load {}*, {}** [[ptls_field]], align 8, !tbaa !0 ; CHECK-NEXT: [[ppjl_ptls:%.*]] = bitcast {}* [[ptls_load]] to {}** ; CHECK-NEXT: [[ptls_i8:%.*]] = bitcast {}** [[ppjl_ptls]] to i8*