diff --git a/csep/core/catalogs.py b/csep/core/catalogs.py index b4257bb2..dc1d880c 100644 --- a/csep/core/catalogs.py +++ b/csep/core/catalogs.py @@ -395,7 +395,7 @@ def to_dataframe(self, with_datetime=False): def get_mag_idx(self): """ Return magnitude index from region magnitudes """ try: - return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, tol=0.00001, right_continuous=True) + return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, right_continuous=True) except AttributeError: raise CSEPCatalogException("Cannot return magnitude index without self.region.magnitudes") @@ -699,16 +699,21 @@ def spatial_event_probability(self): event_flag[idx] = 1 return event_flag - def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False): - """ Computes the count of events within mag_bins - + def magnitude_counts(self, mag_bins=None, tol=None, retbins=False): + """ Computes the count of events within magnitude bins Args: - mag_bins: uses csep.utils.constants.CSEP_MW_BINS as default magnitude bins - retbins (bool): if this is true, return the bins used + mag_bins (array-like): magnitude bin edges, by default tries to use magnitude bin edges + associated with region, otherwise :data:`csep.utils.constants.CSEP_MW_BINS` + tol (float): overwrite numerical tolerance, by default determined automatically from the + magnitudes' dtype to account for the limited precision of floating-point values. + Only necessary to specify if the magnitudes were subject to some + floating-point operations after loading or generating them + (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`). + retbins (bool): if true, return the used bins in a tuple together with the counts. Returns: - numpy.ndarray: showing the counts of hte events in each magnitude bin + numpy.ndarray: events counts in each magnitude bin """ # todo: keep track of events that are ignored if mag_bins is None: @@ -734,18 +739,24 @@ def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False): else: return out - def spatial_magnitude_counts(self, mag_bins=None, tol=0.00001): - """ Return counts of events in space-magnitude region. + def spatial_magnitude_counts(self, mag_bins=None, tol=None): + """ Return counts of events in space-magnitude bins. - We figure out the index of the polygons and create a map that relates the spatial coordinate in the - Cartesian grid with the polygon in region. + It figures out the index of the polygons and maps the spatial coordinates in the Cartesian + grid with the polygon in region. Args: - mag_bins (list, numpy.array): magnitude bins (optional), if empty tries to use magnitude bins associated with region - tol (float): tolerance for comparisons within magnitude bins + mag_bins (array-like): magnitude bin edges (optional), by default uses magnitude bin edges + associated with region. + tol (float): overwrite numerical tolerance, by default determined automatically from the + magnitudes' dtype to account for the limited precision of floating-point values. + Only necessary to specify if the magnitudes were subject to some + floating-point operations after loading or generating them + (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`). Returns: - output: unnormalized event count in each bin, 1d ndarray where index corresponds to midpoints + numpy.ndarray: unnormalized event count in each space-magnitude bin (2d, with indices + corresponding to spatial midpoints and magnitude bin, respectively) """ # make sure region is specified with catalog diff --git a/csep/core/forecasts.py b/csep/core/forecasts.py index 4b2f863f..37860e91 100644 --- a/csep/core/forecasts.py +++ b/csep/core/forecasts.py @@ -213,16 +213,21 @@ def magnitude_counts(self): """ Returns counts of events in magnitude bins """ return numpy.sum(self.data, axis=0) - def get_magnitude_index(self, mags, tol=0.00001): + def get_magnitude_index(self, mags, tol=None): """ Returns the indices into the magnitude bins of selected magnitudes Note: the right-most bin is treated as extending to infinity. Args: - mags (array-like): list of magnitudes + mags (array-like): magnitudes bin edges. + tol (float): overwrite numerical tolerance, by default determined automatically from the + magnitudes' dtype to account for the limited precision of floating-point values. + Only necessary to specify if the magnitudes were subject to some + floating-point operations after loading or generating them + (increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`). Returns: - idm (array-like): indices corresponding to mags + numpy.ndarray: indices corresponding to the magnitude bins Raises: ValueError diff --git a/csep/core/regions.py b/csep/core/regions.py index 04795c01..21f7d8e1 100644 --- a/csep/core/regions.py +++ b/csep/core/regions.py @@ -313,12 +313,7 @@ def magnitude_bins(start_magnitude, end_magnitude, dmw): Returns: bin_edges (numpy.ndarray) """ - # convert to integers to prevent accumulating floating point errors - const = 10000 - start = numpy.floor(const * start_magnitude) - end = numpy.floor(const * end_magnitude) - d = const * dmw - return numpy.arange(start, end + d / 2, d) / const + return cleaner_range(start_magnitude, end_magnitude, dmw) def create_space_magnitude_region(region, magnitudes): """Simple wrapper to create space-magnitude region """ @@ -495,7 +490,7 @@ def compute_vertices(origin_points, dh, tol=numpy.finfo(float).eps): """ return list(map(lambda x: compute_vertex(x, dh, tol=tol), origin_points)) -def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=0.00001): +def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=None): """ Returns a list of event counts as ndarray with shape (n_poly, n_cat) where each value represents the event counts within the polygon. @@ -625,8 +620,8 @@ def get_index_of(self, lons, lats): Returns: idx: ndarray-like """ - idx = bin1d_vec(numpy.array(lons), self.xs) - idy = bin1d_vec(numpy.array(lats), self.ys) + idx = bin1d_vec(lons, self.xs) + idy = bin1d_vec(lats, self.ys) if numpy.any(idx == -1) or numpy.any(idy == -1): raise ValueError("at least one lon and lat pair contain values that are outside of the valid region.") if numpy.any(self.bbox_mask[idy, idx] == 1): @@ -1061,7 +1056,7 @@ def get_cell_area(self): self.cell_area = cell_area return self.cell_area - def get_index_of(self, lons, lats): + def get_index_of(self, lons, lats): """ Returns the index of lons, lats in self.polygons Args: @@ -1182,7 +1177,7 @@ def _get_spatial_magnitude_counts(self, catalog, mag_bins=None): out = numpy.zeros([len(self.quadkeys), len(mag_bins)]) idx_loc = self.get_index_of(lon, lat) - idx_mag = bin1d_vec(mag, mag_bins, tol=0.00001, right_continuous=True) + idx_mag = bin1d_vec(mag, mag_bins, right_continuous=True) numpy.add.at(out, (idx_loc, idx_mag), 1) diff --git a/csep/utils/calc.py b/csep/utils/calc.py index dc208cda..ad46b59b 100644 --- a/csep/utils/calc.py +++ b/csep/utils/calc.py @@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False): x_new = bin_edges[idx] return x_new +def _get_tolerance(v): + """Determine numerical tolerance due to limited precision of floating-point values. + + ... to account for roundoff error. + + In other words, returns a maximum possible difference that can be considered negligible. + Only relevant for floating-point values. + """ + if issubclass(v.dtype.type, numpy.floating): + return numpy.abs(v) * numpy.finfo(v.dtype).eps + return 0 # assuming it's an int + def bin1d_vec(p, bins, tol=None, right_continuous=False): """Efficient implementation of binning routine on 1D Cartesian Grid. - Returns the indices of the points into bins. Bins are inclusive on the lower bound + Bins are inclusive on the lower bound and exclusive on the upper bound. In the case where a point does not fall within the bins a -1 will be returned. The last bin extends to infinity when right_continuous is set as true. + To correctly bin points that are practically on a bin edge, this function accounts for the + limited precision of floating-point numbers (the roundoff error) with a numerical tolerance. + If the provided points were subject to some floating-point operations after loading or + generating them, the roundoff error increases (which is not accounted for) and requires + overwriting the `tol` argument. + Args: - p (array-like): Point(s) to be placed into b - bins (array-like): bins to considering for binning, must be monotonically increasing - right_continuous (bool): if true, consider last bin extending to infinity + p (array-like): Point(s) to be placed into bins. + bins (array-like): bin edges; must be sorted (monotonically increasing) + tol (float): overwrite numerical tolerance, by default determined automatically from the + points' dtype to account for the roundoff error. + right_continuous (bool): if true, consider last bin extending to infinity. Returns: - idx (array-like): indexes hashed into grid + numpy.ndarray: indices for the points corresponding to the bins. Raises: ValueError: """ - bins = numpy.array(bins) - p = numpy.array(p) - a0 = numpy.min(bins) - # if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary + bins = numpy.asarray(bins) + p = numpy.asarray(p) + + # if not np.all(bins[:-1] <= bins[1:]): # check for sorted bins, which is a requirement + # raise ValueError("Bin edges are not sorted.") # (pyCSEP always passes sorted bins) + a0 = bins[0] if bins.size == 1: + # for a single bin, set `right_continuous` to true; h is now arbitrary right_continuous = True - h = 1 + h = 1. else: h = bins[1] - bins[0] - a0_tol = numpy.abs(a0) * numpy.finfo(numpy.float64).eps - h_tol = numpy.abs(h) * numpy.finfo(numpy.float64).eps - p_tol = numpy.abs(p) * numpy.finfo(numpy.float64).eps - - # absolute tolerance - if tol is None: - idx = numpy.floor((p + (p_tol + a0_tol) - a0) / (h - h_tol)) - else: - idx = numpy.floor((p + (tol + a0_tol) - a0) / (h - h_tol)) if h < 0: raise ValueError("grid spacing must be positive and monotonically increasing.") - # account for floating point uncertainties by considering extreme case + + # account for floating point precision + a0_tol = _get_tolerance(a0) + h_tol = a0_tol # must be based on *involved* numbers + p_tol = tol or _get_tolerance(p) + + idx = numpy.floor((p - a0 + p_tol + a0_tol) / (h - h_tol)) + idx = numpy.asarray(idx) # assure idx is an array if right_continuous: # set upper bin index to last - try: - idx[(idx < 0)] = -1 - idx[(idx >= len(bins) - 1)] = len(bins) - 1 - except TypeError: - if idx >= len(bins) - 1: - idx = len(bins) - 1 - if idx < 0: - idx = -1 + idx[idx < 0] = -1 + idx[idx >= len(bins) - 1] = len(bins) - 1 else: - try: - idx[((idx < 0) | (idx >= len(bins)))] = -1 - except TypeError: - if idx < 0 or idx >= len(bins): - idx = -1 - try: - idx = idx.astype(numpy.int64) - except AttributeError: - idx = int(idx) + idx[(idx < 0) | (idx >= len(bins))] = -1 + idx = idx.astype(numpy.int64) return idx def _compute_likelihood(gridded_data, apprx_rate_density, expected_cond_count, n_obs): @@ -215,12 +223,13 @@ def cleaner_range(start, end, h): Returns: bin_edges (numpy.ndarray) """ - # convert to integers to prevent accumulating floating point errors - const = 100000 - start = numpy.floor(const * start) - end = numpy.floor(const * end) - d = const * h - return numpy.arange(start, end + d / 2, d) / const + # determine required scaling to account for decimal places of bin edges and stepping + num_decimals_bins = len(str(float(start)).split('.')[1]) + scale = max(10**num_decimals_bins, 1 / h) + start = numpy.round(scale * start) + end = numpy.round(scale * end) + d = scale * h + return numpy.arange(start, end + d / 2, d) / scale def first_nonnan(arr, axis=0, invalid_val=-1): mask = arr==arr diff --git a/tests/test_calc.py b/tests/test_calc.py index 618443d1..000d14cb 100644 --- a/tests/test_calc.py +++ b/tests/test_calc.py @@ -106,6 +106,13 @@ def test_bin1d_single_bin1(self): expected = [-1, -1, 0, 0, 0, 0, 0, -1] self.assertListEqual(test.tolist(), expected) + def test_bin1d_single_bin2(self): + data = [4.0] + bin_edges = [3.0] + test = bin1d_vec(data, bin_edges) + expected = [0] + self.assertListEqual(test.tolist(), expected) + def test_upper_limit_right_continuous(self): data = [40, 40, 40] bin_edges = [0, 10, 20, 30] @@ -134,18 +141,27 @@ def test_less_and_greater_than(self): expected = [-1, 3, -1] self.assertListEqual(test.tolist(), expected) - def test_scalar_outside(self): - from csep.utils.calc import bin1d_vec - mbins = numpy.arange(5.95, 9, 0.1) # This gives bins from 5.95 to 8.95 - idx = bin1d_vec(5.95, mbins, tol=0.00001, right_continuous=True) - self.assertEqual(idx, 0) + def test_scalar_inside(self): + mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95 - idx = bin1d_vec(6, mbins, tol=0.00001, right_continuous=True) # This would give 0: Which is fine. - self.assertEqual(idx, 0) + for i, m in enumerate(mbins): + idx = bin1d_vec(m, mbins, right_continuous=True) # corner cases + self.assertEqual(idx, i) - idx = bin1d_vec(5, mbins, tol=0.00001, right_continuous=True) - self.assertEqual(idx, -1) + idx = bin1d_vec(m + 0.05, mbins, right_continuous=True) # center bins + self.assertEqual(idx, i) - idx = bin1d_vec(4, mbins, tol=0.00001, right_continuous=True) + idx = bin1d_vec(m + 0.099999999, mbins, right_continuous=True) # end of bins + self.assertEqual(idx, i) + + idx = bin1d_vec(10, mbins, right_continuous=True) # larger than last bin edge + self.assertEqual(idx, mbins.size - 1) + + def test_scalar_outside(self): + mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95 + + idx = bin1d_vec(5, mbins, right_continuous=True) self.assertEqual(idx, -1) + idx = bin1d_vec(4, mbins, right_continuous=True) + self.assertEqual(idx, -1) diff --git a/tests/test_regions.py b/tests/test_regions.py index a1e2cced..ccf4fd34 100644 --- a/tests/test_regions.py +++ b/tests/test_regions.py @@ -3,7 +3,11 @@ import numpy -from csep.core.regions import italy_csep_region, california_relm_region, nz_csep_region +from csep.core.regions import ( + italy_csep_region, italy_csep_collection_region, + california_relm_region, california_relm_collection_region, + nz_csep_region, nz_csep_collection_region, +) def get_italy_region_fname(): root_dir = os.path.dirname(os.path.abspath(__file__)) @@ -86,3 +90,76 @@ def test_origins(self): r = nz_csep_region() # they dont have to be in the same order, but they need numpy.testing.assert_array_equal(r.midpoints().sort(), self.from_dat.sort()) + +class TestBin2d(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(TestBin2d, cls).setUpClass() + + # (loading those is the bottleneck of this test case) + cls.regions = [ + italy_csep_region(), + italy_csep_collection_region(), + california_relm_region(), + california_relm_collection_region(), + nz_csep_region(), + nz_csep_collection_region(), + # global_region() # extreme slow-down (~2min loading + ~5min per loop + ~4s per vect) + ] + + def test_bin2d_regions_origins(self): + """every origin must be inside its own bin + """ + + for region in self.regions: + origins = region.origins() + self._test_bin2d_region_loop(region, origins) + self._test_bin2d_region_vect(region, origins) + self._test_bin2d_region_vect(region, origins.astype(numpy.float32)) + + def test_bin2d_regions_midpoints(self): + """every midpoint must be inside its own bin + """ + + for region in self.regions: + midpoints = region.midpoints() + self._test_bin2d_region_loop(region, midpoints) + self._test_bin2d_region_vect(region, midpoints) + self._test_bin2d_region_vect(region, midpoints.astype(numpy.float32)) + + def test_bin2d_regions_endcorner(self): + """every corner point (~opposite end of the origin) must be inside its own bin + """ + + for region in self.regions: + frac = 0.9999999999 + endcorners = region.origins() + frac*region.dh + self._test_bin2d_region_loop(region, endcorners) + self._test_bin2d_region_vect(region, endcorners) + frac = 0.999 # decrease frac for float32 due to its lower resolution + endcorners = region.origins() + frac*region.dh + self._test_bin2d_region_vect(region, endcorners.astype(numpy.float32)) + + def _test_bin2d_region_loop(self, region, coords): + """(slow) loop over origins; each time, calls bin1d_vec for lat & lon scalars + """ + + for i, origin in enumerate(coords): + idx = region.get_index_of( + origin[0], + origin[1], + ) + self.assertEqual(i, idx) + + def _test_bin2d_region_vect(self, region, coords): + """call bin1d_vec once for all lat origins & all lon origins + + Besides, also tests if vectors with ndim=2 are consumed properly + (returns 2nd-order/nested list ([[...]]). + """ + + lons, lats = numpy.split(coords.T, 2) # both have ndim=2! + test = region.get_index_of(lons, lats).tolist() # nested list ([[...]]) + expected = [numpy.arange(len(region.origins())).tolist()] # embed in another list + self.assertListEqual(test, expected)