15 changes: 15 additions & 0 deletions CHANGES.rst
@@ -0,0 +1,15 @@
Changelog
=========

0.1.1 (2021-12-18)
------------------

Fixes
~~~~~
- ``standard_normal`` sometimes produces ``inf`` [#1]


0.1 (2021-12-06)
----------------

- Initial release
12 changes: 6 additions & 6 deletions README.md
@@ -17,13 +17,13 @@ seed = 123
parallel_rng = parallel_numpy_rng.default_rng(seed)
numpy_rng = np.random.default_rng(seed)

%timeit numpy_rng.random(size=10**9, dtype=np.float32) # 2.89 s
%timeit parallel_rng.random(size=10**9, dtype=np.float32, nthread=1) # 3.35 s
%timeit parallel_rng.random(size=10**9, dtype=np.float32, nthread=128) # 73.9 ms
%timeit numpy_rng.random(size=10**9, dtype=np.float32) # 2.85 s
%timeit parallel_rng.random(size=10**9, dtype=np.float32, nthread=1) # 3.34 s
%timeit parallel_rng.random(size=10**9, dtype=np.float32, nthread=128) # 67.8 ms

%timeit numpy_rng.standard_normal(size=10**8, dtype=np.float32) # 1.13 s
%timeit parallel_rng.standard_normal(size=10**8,dtype=np.float32, nthread=1) # 1.87 s
%timeit parallel_rng.standard_normal(size=10**8, dtype=np.float32, nthread=128) # 36.6 ms
%timeit numpy_rng.standard_normal(size=10**8, dtype=np.float32) # 1.12 s
%timeit parallel_rng.standard_normal(size=10**8,dtype=np.float32, nthread=1) # 1.85 s
%timeit parallel_rng.standard_normal(size=10**8, dtype=np.float32, nthread=128) # 43.5 ms
```

Note that the `parallel_rng` is slower than Numpy when using a single thread, because the parallel implementation requires a slower algorithm in some cases (this is especially noticeable for float64 and for normals).
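
The payoff for that single-thread overhead is reproducibility across thread counts: the values a given seed produces are intended not to depend on `nthread`. Below is a minimal sketch of that property, assuming the `default_rng` API shown above; the equality check and variable names are illustrative, not part of the README.

```python
import numpy as np
import parallel_numpy_rng

seed = 123
rng_a = parallel_numpy_rng.default_rng(seed)
rng_b = parallel_numpy_rng.default_rng(seed)

# Same seed, different thread counts: the streams should be identical
# if the generator is thread-count independent, as designed
a = rng_a.random(size=10**6, dtype=np.float32, nthread=1)
b = rng_b.random(size=10**6, dtype=np.float32, nthread=8)
assert np.array_equal(a, b)
```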
69 changes: 43 additions & 26 deletions parallel_numpy_rng.py
@@ -27,7 +27,7 @@ class MTGenerator:

Usage
-----
```python
```
p = np.random.PCG64(123) # or PCG64DXSM
mtg = MTGenerator(p)
r1 = mtg.random(size=16, nthread=2, dtype=np.float32)
@@ -58,7 +58,9 @@ def __init__(self, bit_generator, nthread=-1):
            nthread = len(os.sched_getaffinity(0))
        self.nthread = nthread

        self._next_float = _next_float[bit_generator.state['bit_generator']]
        # each is a dict, keyed on dtype
        self._next_float = _next_float[bit_generator.state['bit_generator']]['zero']
        self._next_float_nonzero = _next_float[bit_generator.state['bit_generator']]['nonzero']

        self._cached_normal = None

@@ -81,12 +83,8 @@ def random(self, size=None, nthread=None, out=None, verify_rng=True, dtype=np.fl

        if out is None:
            out = np.empty(size, dtype=dtype)
        if dtype == np.float64:
            _random(states, starts, out, self.bitgen.ctypes.next_double)
        elif dtype == np.float32:
            _random(states, starts, out, self._next_float)
        else:
            raise ValueError(dtype)
        next_float = self._next_float[dtype]
        _random(states, starts, out, next_float)

        if verify_rng:
            # Did we advance each RNG by the right amount?
@@ -159,12 +157,8 @@ def standard_normal(self, size=None, nthread=None, out=None, verify_rng=True, dt

        # now offset the out array
        _out = out[first:]
        if dtype == np.float64:
            _cached_normal = _boxmuller(states, ff, _out, self.bitgen.ctypes.next_double)
        elif dtype == np.float32:
            _cached_normal = _boxmuller(states, ff, _out, self._next_float)
        else:
            raise ValueError(dtype)
        next_float_nonzero = self._next_float_nonzero[dtype]
        _cached_normal = _boxmuller(states, ff, _out, next_float_nonzero)
        if not np.isnan(_cached_normal):
            self._cached_normal = _cached_normal
        del _out
@@ -236,19 +230,42 @@ def _boxmuller(states, starts, out, next_double):

    return cache[0]


# initialize some numba functions
_next_uint32_pcg64 = np.random.PCG64().ctypes.next_uint32
@njit(fastmath=True)
def _next_float_pcg64(state):
    return np.float32(np.int32(_next_uint32_pcg64(state) >> np.uint32(8)) * (np.float32(1.) / np.float32(16777216.)))
# TODO: there are now enough of these that they should live in their own module
# TODO: for very low latency cases, there may be benefit to inlining these

_next_float={'PCG64':_next_float_pcg64}
def _generate_int_to_float(bitgen):
    # initialize the numba functions to convert ints to floats
    _next_uint32_pcg64 = bitgen().ctypes.next_uint32
    _next_uint64_pcg64 = bitgen().ctypes.next_uint64

    @njit(fastmath=True)
    def _next_float_pcg64(state):
        '''Random float in the semi-open interval [0,1)'''
        return np.float32(np.int32(_next_uint32_pcg64(state) >> np.uint32(8)) * (np.float32(1.) / np.float32(16777216.)))

    @njit(fastmath=True)
    def _next_float_pcg64_nonzero(state):
        '''Random float in the semi-open interval (0,1]'''
        return np.float32((np.int32(_next_uint32_pcg64(state) >> np.uint32(8)) + np.int32(1)) * (np.float32(1.) / np.float32(16777216.)))

    @njit(fastmath=True)
    def _next_double_pcg64(state):
        '''Random double in the semi-open interval [0,1)'''
        return np.float64(np.int64(_next_uint64_pcg64(state) >> np.uint64(11)) * (np.float64(1.) / np.float64(9007199254740992.)))

    @njit(fastmath=True)
    def _next_double_pcg64_nonzero(state):
        '''Random double in the semi-open interval (0,1]'''
        return np.float64((np.int64(_next_uint64_pcg64(state) >> np.uint64(11)) + np.int64(1)) * (np.float64(1.) / np.float64(9007199254740992.)))

    _next_float = {'zero': {np.float32: _next_float_pcg64, np.float64: _next_double_pcg64},
                   'nonzero': {np.float32: _next_float_pcg64_nonzero, np.float64: _next_double_pcg64_nonzero},
                   }
    return _next_float

_next_float = {}
_next_float['PCG64'] = _generate_int_to_float(np.random.PCG64)

if hasattr(np.random, 'PCG64DXSM'):
    # Numpy >= 1.21
    _next_uint32_pcg64dxsm = np.random.PCG64DXSM().ctypes.next_uint32
    @njit(fastmath=True)
    def _next_float_pcg64dxsm(state):
        return np.float32(np.int32(_next_uint32_pcg64dxsm(state) >> np.uint32(8)) * (np.float32(1.) / np.float32(16777216.)))
    _next_float['PCG64DXSM'] = _next_float_pcg64dxsm
    _next_float['PCG64DXSM'] = _generate_int_to_float(np.random.PCG64DXSM)
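
For reference on the conversion the new helpers implement: a 32-bit draw is shifted right by 8 to keep 24 random bits (the width of a float32 mantissa) and scaled by 2**-24, which lands in [0, 1); adding 1 before scaling instead yields (0, 1], so Box-Muller never sees an exact zero. The 64-bit path does the same with a shift of 11 and a scale of 2**-53. A quick worked check of the two float32 ranges, separate from the diff above (`SCALE32` is just a local name for the demo):

```python
import numpy as np

SCALE32 = np.float32(1.0) / np.float32(16777216.0)   # 2**-24

# [0, 1): the smallest 24-bit value (0) maps to exactly 0.0
print(np.float32(np.int32(0)) * SCALE32)              # 0.0
print(np.float32(np.int32(2**24 - 1)) * SCALE32)      # 0.99999994

# (0, 1]: adding 1 before scaling means exact 0 can no longer occur
print(np.float32(np.int32(0) + np.int32(1)) * SCALE32)          # 5.9604645e-08
print(np.float32(np.int32(2**24 - 1) + np.int32(1)) * SCALE32)  # 1.0
```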
18 changes: 18 additions & 0 deletions test_parallel_numpy_rng.py
@@ -135,3 +135,21 @@ def test_uniform_matches_numpy(someN, seed, nthread, dtype):
        assert np.allclose(a, b, atol=1e-7)
    else:
        raise ValueError(dtype)


def test_finite_normals_float32():
    '''
    If the floats fed to Box-Muller can include 0, it will produce infinity.
    We use the interval (0,1] to avoid this.

    In theory, we ought to test float64 the same way. But it's hard to find
    a PCG state that produces 53 zeros...

    https://github.com/lgarrison/parallel-numpy-rng/issues/1
    '''
    from parallel_numpy_rng import MTGenerator
    pcg = np.random.PCG64(1194)
    mtg = MTGenerator(pcg)
    a = mtg.standard_normal(size=20000, nthread=maxthreads, dtype=np.float32)
    assert np.all(np.isfinite(a))
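
The failure mode this test guards against follows directly from the Box-Muller transform, which computes sqrt(-2 * ln(u)) from a uniform draw u: if u can be exactly 0, the result is infinite, while the smallest value in (0, 1] stays finite. A small illustration of that edge case, not part of the test suite (the variable names are only for the demo):

```python
import numpy as np

# u = 0 is the problematic input: log(0) -> -inf, so the radius term blows up
with np.errstate(divide='ignore'):
    r_bad = np.sqrt(np.float32(-2.0) * np.log(np.float32(0.0)))
print(r_bad)  # inf

# The smallest float32 the (0, 1] conversion can produce keeps the radius finite
u_min = np.float32(1.0) / np.float32(2**24)
r_ok = np.sqrt(np.float32(-2.0) * np.log(u_min))
print(r_ok)  # ~5.77
```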