diff --git a/numba_dpex/kernel_api_impl/common/__init__.py b/numba_dpex/kernel_api_impl/common/__init__.py new file mode 100644 index 0000000000..ee20629a35 --- /dev/null +++ b/numba_dpex/kernel_api_impl/common/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2024 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/numba_dpex/kernel_api_impl/common/random.py b/numba_dpex/kernel_api_impl/common/random.py new file mode 100644 index 0000000000..0bb15eb690 --- /dev/null +++ b/numba_dpex/kernel_api_impl/common/random.py @@ -0,0 +1,341 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Julien Jerphanion +# SPDX-FileCopyrightText: 2022 - 2023 Olivier Grisel +# SPDX-FileCopyrightText: 2022 - 2023 Franck Charras <29153872+fcharras@users.noreply.github.com> +# SPDX-FileCopyrightText: 2024 Intel Corporation +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +This code is largely inspired from the numba.cuda.random module and the +# numba/cuda/random.py where it's defined (v<0.57), and by the implementation of the +# same algorithm in the package `randomgen`. + +# numba.cuda.random: https://github.com/numba/numba/blob/0.56.3/numba/cuda/random.py +# randomgen: https://github.com/bashtage/randomgen/blob/v1.26.0/randomgen/xoroshiro128.pyx # noqa + +# NB1: we implement xoroshiro128++ rather than just xoroshiro128+, which is preferred. +# Reference resource about PRNG: https://prng.di.unimi.it/ + +# NB2: original numba.cuda.random code also includes functions for generating normally +# distributed floats but we don't include it here as long as it's not needed. +""" + +import random +import warnings +from functools import lru_cache + +import dpctl +import dpctl.tensor as dpt +import numpy as np +from numba import float32, float64, int64, uint32, uint64 + +import numba_dpex as dpex + + +def _get_sequential_processing_device(device: dpctl.SyclDevice): + """Returns a device most fitted for sequential processing. + + Selects a cpu rather than a gpu for sequential processing. If such a cpu + device is not found, returns the input device instead. + + Also returns a boolean that informs on wether the returned device is + different than the input device. + """ + if device.has_aspect_cpu: + return device, False + + try: + return dpctl.SyclDevice("cpu"), True + except dpctl.SyclDeviceCreationError: + warnings.warn( + "No CPU found, falling back to GPU for sequential instructions." + ) + return device, False + + +zero_idx = int64(0) +one_idx = int64(1) + + +def get_random_raw(states): + """Returns a single pseudo-random `uint64` integer value. + + Similar to numpy.random.BitGenerator.random_raw(size=1). + + Note, this always uses and updates state states[0]. + """ + result = dpt.empty((1,), dtype=np.uint64, device=states.device) + dpex.call_kernel(make_random_raw_kernel(), dpex.Range(1), states, result) + return result + + +@lru_cache +def make_random_raw_kernel(): + """Returns a single pseudo-random `uint64` integer value. + Similar to numpy.random.BitGenerator.random_raw(size=1). + + Note, this always uses and updates state states[0]. + """ + + @dpex.kernel + def _get_random_raw_kernel( + item, states, result + ): # pylint: disable=unused-argument + result[zero_idx] = _xoroshiro128pp_next(states, zero_idx) + + return _get_random_raw_kernel + + +def make_rand_uniform_kernel_func(dtype): + """Instantiate a kernel function that returns a random float in [0, 1) + + This factory returns a kernel function specialized to either generate + float32 or float64 values. Care has been taken so that the float32 + variant can be compiled to target a device that does not support the + float64 aspect. + + The returned kernel function takes two arguments: + + - states : a state array. See create_xoroshiro128pp_states for details. + + - state_idx : the index of the RNG state to use to generate the next + random float. + + """ + if not hasattr(dtype, "name"): + raise ValueError( + "dtype is expected to have an attribute 'name', like np.dtype " + "or numba types." + ) + + if dtype.name == "float64": + convert_rshift = uint32(11) + convert_const = float64(uint64(1) << uint32(53)) + convert_const_one = float64(1) + + @dpex.device_func + def uint64_to_unit_float(x): + """Convert uint64 to float64 value in the range [0.0, 1.0)""" + return float64(x >> convert_rshift) * ( + convert_const_one / convert_const + ) + + elif dtype.name == "float32": + convert_rshift = uint32(40) + convert_const = float32(uint32(1) << uint32(24)) + convert_const_one = float32(1) + + @dpex.device_func + def uint64_to_unit_float(x): + """Convert uint64 to float32 value in the range [0.0, 1.0) + + NB: this is different than original numba.cuda.random code. Instead + of generating a float64 random number before casting it to float32, + a float32 number is generated from uint64 without intermediate + float64. This change enables compatibility with devices that do not + support float64 numbers. However is seems to be exactly equivalent + e.g it passes the float precision test in sklearn. + """ + return float32(x >> convert_rshift) * ( + convert_const_one / convert_const + ) + + else: + raise ValueError( + "Expected dtype.name in {float32, float64} but got " + f"dtype.name == {dtype.name}" + ) + + @dpex.device_func + def xoroshiro128pp_uniform(states, state_idx): + """Return one random float in [0, 1) + + Calling this function advances the states[state_idx] by a single RNG + step and leaves the other states unchanged. + """ + return uint64_to_unit_float(_xoroshiro128pp_next(states, state_idx)) + + return xoroshiro128pp_uniform + + +def create_xoroshiro128pp_states( + n_states, subsequence_start=0, seed=None, device=None +): + """Returns a new device array initialized for n random number generators. + + This initializes the RNG states so that states in the array correspond to + subsequences separated by 2**64 steps from each other in the main sequence. + Therefore, as long as no thread requests more than 2**64 random numbers, all + the RNG states produced by this function are guaranteed to be independent. + + Parameters + ---------- + n_states : int + Number of RNG states to create. Each RNG state is meant to be used by a + distinct thread in the xoroshiro128pp RNG. Therefore n_states controls + the amount of parallelism when using the RNG to generate a large enough + sequence of pseudo-random values. Subsequent states are initialized + 2**64 RNG steps away from one another. + + subsequence_start : int + Advance the first RNG state by a multiple of 2**64 steps after the + state induced by the seed. The subsequent RNG states controlled by + `n_states` are each initialized 2**64 steps further from their + predecessor in the states array. + + seed : int or None + Starting seed for the list of generators. + + device : str or None (default) + A SYCL device or if None, takes the default sycl device. + """ + if seed is None: + seed = uint64(random.randint(0, np.iinfo(np.int64).max - 1)) + + if hasattr(seed, "randint"): + seed = uint64(seed.randint(0, np.iinfo(np.int64).max - 1)) + + init_xoroshiro128pp_states_kernel = _make_init_xoroshiro128pp_states_kernel( + n_states, subsequence_start + ) + + # Initialization is purely sequential so it will be faster on CPU, if a + # cpu device is available make sure to use it. + if device is None: + device = dpctl.SyclDevice() + + ( + sequential_processing_device, + sequential_processing_on_different_device, + ) = _get_sequential_processing_device(device) + + states = dpt.empty( + (n_states, 2), dtype=np.uint64, device=sequential_processing_device + ) + + seed = dpt.asarray( + [seed], dtype=np.uint64, device=sequential_processing_device + ) + + dpex.call_kernel( + init_xoroshiro128pp_states_kernel, dpex.Range(1), states, seed + ) + + if sequential_processing_on_different_device: + return states.to_device(device) + + return states + + +@lru_cache +def _make_init_xoroshiro128pp_states_kernel( + n_states, subsequence_start +): # pylint: disable=too-many-locals + n_states = int64(n_states) + + splitmix64_const_1 = uint64(0x9E3779B97F4A7C15) + splitmix64_const_2 = uint64(0xBF58476D1CE4E5B9) + splitmix64_const_3 = uint64(0x94D049BB133111EB) + splitmix64_rshift_1 = uint32(30) + splitmix64_rshift_2 = uint32(27) + splitmix64_rshift_3 = uint32(31) + + @dpex.device_func + def _splitmix64_next(state): + new_state = z = state + splitmix64_const_1 + z = (z ^ (z >> splitmix64_rshift_1)) * splitmix64_const_2 + z = (z ^ (z >> splitmix64_rshift_2)) * splitmix64_const_3 + return new_state, z ^ (z >> splitmix64_rshift_3) + + jump_const_1 = uint64(0x2BD7A6A6E99C2DDC) + jump_const_2 = uint64(0x0992CCAF6A6FCA05) + jump_const_3 = uint64(1) + jump_init = uint64(0) + long_2 = int64(2) + long_64 = int64(64) + + @dpex.device_func + def _xoroshiro128pp_jump(states, state_idx): + """Advance the RNG in ``states[state_idx]`` by 2**64 steps.""" + s0 = jump_init + s1 = jump_init + + for i in range(long_2): + if i == zero_idx: + jump_const = jump_const_1 + else: + jump_const = jump_const_2 + for b in range(long_64): + if jump_const & jump_const_3 << uint32(b): + s0 ^= states[state_idx, zero_idx] + s1 ^= states[state_idx, one_idx] + _xoroshiro128pp_next(states, state_idx) + + states[state_idx, zero_idx] = s0 + states[state_idx, one_idx] = s1 + + init_const_1 = np.uint64(0) + + @dpex.kernel + def init_xoroshiro128pp_states( + item, states, seed + ): # pylint: disable=unused-argument + """ + Use SplitMix64 to generate an xoroshiro128p state from a uint64 seed. + + This ensures that manually set small seeds don't result in a predictable + initial sequence from the random number generator. + """ + if n_states < one_idx: + return + + splitmix64_state = init_const_1 ^ seed[zero_idx] + splitmix64_state, states[zero_idx, zero_idx] = _splitmix64_next( + splitmix64_state + ) + _, states[zero_idx, one_idx] = _splitmix64_next(splitmix64_state) + + # advance to starting subsequence number + for _ in range(subsequence_start): + _xoroshiro128pp_jump(states, zero_idx) + + # populate the rest of the array + for idx in range(one_idx, n_states): + # take state of previous generator + states[idx, zero_idx] = states[idx - one_idx, zero_idx] + states[idx, one_idx] = states[idx - one_idx, one_idx] + # and jump forward 2**64 steps + _xoroshiro128pp_jump(states, idx) + + return init_xoroshiro128pp_states + + +_64_as_uint32 = uint32(64) + + +@dpex.device_func +def _rotl(x, k): + """Left rotate x by k bits. x is expected to be a uint64 integer.""" + return (x << k) | (x >> (_64_as_uint32 - k)) + + +next_rot_1 = uint32(17) +next_rot_2 = uint32(49) +next_rot_3 = uint32(28) +shift_1 = uint32(21) + + +@dpex.device_func +def _xoroshiro128pp_next(states, state_idx): + """ + Returns the next random uint64 and advance the RNG in states[state_idx]. + """ + s0 = states[state_idx, zero_idx] + s1 = states[state_idx, one_idx] + result = _rotl(s0 + s1, next_rot_1) + s0 + + s1 ^= s0 + states[state_idx, zero_idx] = _rotl(s0, next_rot_2) ^ s1 ^ (s1 << shift_1) + states[state_idx, one_idx] = _rotl(s1, next_rot_3) + + return result