Skip to content

Fix and optimize bin1d_vec + extend units tests + fix imprecise bin edge creation #270

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 25 additions & 14 deletions csep/core/catalogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def to_dataframe(self, with_datetime=False):
def get_mag_idx(self):
""" Return magnitude index from region magnitudes """
try:
return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, tol=0.00001, right_continuous=True)
return bin1d_vec(self.get_magnitudes(), self.region.magnitudes, right_continuous=True)
except AttributeError:
raise CSEPCatalogException("Cannot return magnitude index without self.region.magnitudes")

Expand Down Expand Up @@ -699,16 +699,21 @@ def spatial_event_probability(self):
event_flag[idx] = 1
return event_flag

def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
""" Computes the count of events within mag_bins

def magnitude_counts(self, mag_bins=None, tol=None, retbins=False):
""" Computes the count of events within magnitude bins

Args:
mag_bins: uses csep.utils.constants.CSEP_MW_BINS as default magnitude bins
retbins (bool): if this is true, return the bins used
mag_bins (array-like): magnitude bin edges, by default tries to use magnitude bin edges
associated with region, otherwise :data:`csep.utils.constants.CSEP_MW_BINS`
tol (float): overwrite numerical tolerance, by default determined automatically from the
magnitudes' dtype to account for the limited precision of floating-point values.
Only necessary to specify if the magnitudes were subject to some
floating-point operations after loading or generating them
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).
retbins (bool): if true, return the used bins in a tuple together with the counts.

Returns:
numpy.ndarray: showing the counts of hte events in each magnitude bin
numpy.ndarray: events counts in each magnitude bin
"""
# todo: keep track of events that are ignored
if mag_bins is None:
Expand All @@ -734,18 +739,24 @@ def magnitude_counts(self, mag_bins=None, tol=0.00001, retbins=False):
else:
return out

def spatial_magnitude_counts(self, mag_bins=None, tol=0.00001):
""" Return counts of events in space-magnitude region.
def spatial_magnitude_counts(self, mag_bins=None, tol=None):
""" Return counts of events in space-magnitude bins.

We figure out the index of the polygons and create a map that relates the spatial coordinate in the
Cartesian grid with the polygon in region.
It figures out the index of the polygons and maps the spatial coordinates in the Cartesian
grid with the polygon in region.

Args:
mag_bins (list, numpy.array): magnitude bins (optional), if empty tries to use magnitude bins associated with region
tol (float): tolerance for comparisons within magnitude bins
mag_bins (array-like): magnitude bin edges (optional), by default uses magnitude bin edges
associated with region.
tol (float): overwrite numerical tolerance, by default determined automatically from the
magnitudes' dtype to account for the limited precision of floating-point values.
Only necessary to specify if the magnitudes were subject to some
floating-point operations after loading or generating them
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).

Returns:
output: unnormalized event count in each bin, 1d ndarray where index corresponds to midpoints
numpy.ndarray: unnormalized event count in each space-magnitude bin (2d, with indices
corresponding to spatial midpoints and magnitude bin, respectively)

"""
# make sure region is specified with catalog
Expand Down
11 changes: 8 additions & 3 deletions csep/core/forecasts.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,16 +213,21 @@ def magnitude_counts(self):
""" Returns counts of events in magnitude bins """
return numpy.sum(self.data, axis=0)

def get_magnitude_index(self, mags, tol=0.00001):
def get_magnitude_index(self, mags, tol=None):
""" Returns the indices into the magnitude bins of selected magnitudes

Note: the right-most bin is treated as extending to infinity.

Args:
mags (array-like): list of magnitudes
mags (array-like): magnitudes bin edges.
tol (float): overwrite numerical tolerance, by default determined automatically from the
magnitudes' dtype to account for the limited precision of floating-point values.
Only necessary to specify if the magnitudes were subject to some
floating-point operations after loading or generating them
(increased roundoff error, see :func:`csep.utils.calc.bin1d_vec`).

Returns:
idm (array-like): indices corresponding to mags
numpy.ndarray: indices corresponding to the magnitude bins

Raises:
ValueError
Expand Down
17 changes: 6 additions & 11 deletions csep/core/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,12 +313,7 @@ def magnitude_bins(start_magnitude, end_magnitude, dmw):
Returns:
bin_edges (numpy.ndarray)
"""
# convert to integers to prevent accumulating floating point errors
const = 10000
start = numpy.floor(const * start_magnitude)
end = numpy.floor(const * end_magnitude)
d = const * dmw
return numpy.arange(start, end + d / 2, d) / const
return cleaner_range(start_magnitude, end_magnitude, dmw)

def create_space_magnitude_region(region, magnitudes):
"""Simple wrapper to create space-magnitude region """
Expand Down Expand Up @@ -495,7 +490,7 @@ def compute_vertices(origin_points, dh, tol=numpy.finfo(float).eps):
"""
return list(map(lambda x: compute_vertex(x, dh, tol=tol), origin_points))

def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=0.00001):
def _bin_catalog_spatio_magnitude_counts(lons, lats, mags, n_poly, mask, idx_map, binx, biny, mag_bins, tol=None):
"""
Returns a list of event counts as ndarray with shape (n_poly, n_cat) where each value
represents the event counts within the polygon.
Expand Down Expand Up @@ -625,8 +620,8 @@ def get_index_of(self, lons, lats):
Returns:
idx: ndarray-like
"""
idx = bin1d_vec(numpy.array(lons), self.xs)
idy = bin1d_vec(numpy.array(lats), self.ys)
idx = bin1d_vec(lons, self.xs)
idy = bin1d_vec(lats, self.ys)
if numpy.any(idx == -1) or numpy.any(idy == -1):
raise ValueError("at least one lon and lat pair contain values that are outside of the valid region.")
if numpy.any(self.bbox_mask[idy, idx] == 1):
Expand Down Expand Up @@ -1061,7 +1056,7 @@ def get_cell_area(self):
self.cell_area = cell_area
return self.cell_area

def get_index_of(self, lons, lats):
def get_index_of(self, lons, lats):
""" Returns the index of lons, lats in self.polygons

Args:
Expand Down Expand Up @@ -1182,7 +1177,7 @@ def _get_spatial_magnitude_counts(self, catalog, mag_bins=None):
out = numpy.zeros([len(self.quadkeys), len(mag_bins)])

idx_loc = self.get_index_of(lon, lat)
idx_mag = bin1d_vec(mag, mag_bins, tol=0.00001, right_continuous=True)
idx_mag = bin1d_vec(mag, mag_bins, right_continuous=True)

numpy.add.at(out, (idx_loc, idx_mag), 1)

Expand Down
95 changes: 52 additions & 43 deletions csep/utils/calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,67 +51,75 @@ def discretize(data, bin_edges, right_continuous=False):
x_new = bin_edges[idx]
return x_new

def _get_tolerance(v):
"""Determine numerical tolerance due to limited precision of floating-point values.

... to account for roundoff error.

In other words, returns a maximum possible difference that can be considered negligible.
Only relevant for floating-point values.
"""
if issubclass(v.dtype.type, numpy.floating):
return numpy.abs(v) * numpy.finfo(v.dtype).eps
return 0 # assuming it's an int

def bin1d_vec(p, bins, tol=None, right_continuous=False):
"""Efficient implementation of binning routine on 1D Cartesian Grid.

Returns the indices of the points into bins. Bins are inclusive on the lower bound
Bins are inclusive on the lower bound
and exclusive on the upper bound. In the case where a point does not fall within the bins a -1
will be returned. The last bin extends to infinity when right_continuous is set as true.

To correctly bin points that are practically on a bin edge, this function accounts for the
limited precision of floating-point numbers (the roundoff error) with a numerical tolerance.
If the provided points were subject to some floating-point operations after loading or
generating them, the roundoff error increases (which is not accounted for) and requires
overwriting the `tol` argument.

Args:
p (array-like): Point(s) to be placed into b
bins (array-like): bins to considering for binning, must be monotonically increasing
right_continuous (bool): if true, consider last bin extending to infinity
p (array-like): Point(s) to be placed into bins.
bins (array-like): bin edges; must be sorted (monotonically increasing)
tol (float): overwrite numerical tolerance, by default determined automatically from the
points' dtype to account for the roundoff error.
right_continuous (bool): if true, consider last bin extending to infinity.

Returns:
idx (array-like): indexes hashed into grid
numpy.ndarray: indices for the points corresponding to the bins.

Raises:
ValueError:
"""
bins = numpy.array(bins)
p = numpy.array(p)
a0 = numpy.min(bins)
# if user supplies only a single bin, do 2 things: 1) fix right continuous to true, and use of h is arbitrary
bins = numpy.asarray(bins)
p = numpy.asarray(p)

# if not np.all(bins[:-1] <= bins[1:]): # check for sorted bins, which is a requirement
# raise ValueError("Bin edges are not sorted.") # (pyCSEP always passes sorted bins)
a0 = bins[0]
if bins.size == 1:
# for a single bin, set `right_continuous` to true; h is now arbitrary
right_continuous = True
h = 1
h = 1.
else:
h = bins[1] - bins[0]

a0_tol = numpy.abs(a0) * numpy.finfo(numpy.float64).eps
h_tol = numpy.abs(h) * numpy.finfo(numpy.float64).eps
p_tol = numpy.abs(p) * numpy.finfo(numpy.float64).eps

# absolute tolerance
if tol is None:
idx = numpy.floor((p + (p_tol + a0_tol) - a0) / (h - h_tol))
else:
idx = numpy.floor((p + (tol + a0_tol) - a0) / (h - h_tol))
if h < 0:
raise ValueError("grid spacing must be positive and monotonically increasing.")
# account for floating point uncertainties by considering extreme case

# account for floating point precision
a0_tol = _get_tolerance(a0)
h_tol = a0_tol # must be based on *involved* numbers
p_tol = tol or _get_tolerance(p)

idx = numpy.floor((p - a0 + p_tol + a0_tol) / (h - h_tol))
idx = numpy.asarray(idx) # assure idx is an array

if right_continuous:
# set upper bin index to last
try:
idx[(idx < 0)] = -1
idx[(idx >= len(bins) - 1)] = len(bins) - 1
except TypeError:
if idx >= len(bins) - 1:
idx = len(bins) - 1
if idx < 0:
idx = -1
idx[idx < 0] = -1
idx[idx >= len(bins) - 1] = len(bins) - 1
else:
try:
idx[((idx < 0) | (idx >= len(bins)))] = -1
except TypeError:
if idx < 0 or idx >= len(bins):
idx = -1
try:
idx = idx.astype(numpy.int64)
except AttributeError:
idx = int(idx)
idx[(idx < 0) | (idx >= len(bins))] = -1
idx = idx.astype(numpy.int64)
return idx

def _compute_likelihood(gridded_data, apprx_rate_density, expected_cond_count, n_obs):
Expand Down Expand Up @@ -215,12 +223,13 @@ def cleaner_range(start, end, h):
Returns:
bin_edges (numpy.ndarray)
"""
# convert to integers to prevent accumulating floating point errors
const = 100000
start = numpy.floor(const * start)
end = numpy.floor(const * end)
d = const * h
return numpy.arange(start, end + d / 2, d) / const
# determine required scaling to account for decimal places of bin edges and stepping
num_decimals_bins = len(str(float(start)).split('.')[1])
scale = max(10**num_decimals_bins, 1 / h)
start = numpy.round(scale * start)
end = numpy.round(scale * end)
d = scale * h
return numpy.arange(start, end + d / 2, d) / scale

def first_nonnan(arr, axis=0, invalid_val=-1):
mask = arr==arr
Expand Down
36 changes: 26 additions & 10 deletions tests/test_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ def test_bin1d_single_bin1(self):
expected = [-1, -1, 0, 0, 0, 0, 0, -1]
self.assertListEqual(test.tolist(), expected)

def test_bin1d_single_bin2(self):
data = [4.0]
bin_edges = [3.0]
test = bin1d_vec(data, bin_edges)
expected = [0]
self.assertListEqual(test.tolist(), expected)

def test_upper_limit_right_continuous(self):
data = [40, 40, 40]
bin_edges = [0, 10, 20, 30]
Expand Down Expand Up @@ -134,18 +141,27 @@ def test_less_and_greater_than(self):
expected = [-1, 3, -1]
self.assertListEqual(test.tolist(), expected)

def test_scalar_outside(self):
from csep.utils.calc import bin1d_vec
mbins = numpy.arange(5.95, 9, 0.1) # This gives bins from 5.95 to 8.95
idx = bin1d_vec(5.95, mbins, tol=0.00001, right_continuous=True)
self.assertEqual(idx, 0)
def test_scalar_inside(self):
mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95

idx = bin1d_vec(6, mbins, tol=0.00001, right_continuous=True) # This would give 0: Which is fine.
self.assertEqual(idx, 0)
for i, m in enumerate(mbins):
idx = bin1d_vec(m, mbins, right_continuous=True) # corner cases
self.assertEqual(idx, i)

idx = bin1d_vec(5, mbins, tol=0.00001, right_continuous=True)
self.assertEqual(idx, -1)
idx = bin1d_vec(m + 0.05, mbins, right_continuous=True) # center bins
self.assertEqual(idx, i)

idx = bin1d_vec(4, mbins, tol=0.00001, right_continuous=True)
idx = bin1d_vec(m + 0.099999999, mbins, right_continuous=True) # end of bins
self.assertEqual(idx, i)

idx = bin1d_vec(10, mbins, right_continuous=True) # larger than last bin edge
self.assertEqual(idx, mbins.size - 1)

def test_scalar_outside(self):
mbins = numpy.arange(5.95, 9, 0.1) # (magnitude) bins from 5.95 to 8.95

idx = bin1d_vec(5, mbins, right_continuous=True)
self.assertEqual(idx, -1)

idx = bin1d_vec(4, mbins, right_continuous=True)
self.assertEqual(idx, -1)
Loading