Skip to content

Commit

Permalink
Allow using cx indexer without spatial index (#54)
Browse files Browse the repository at this point in the history
* Allow using cx indexer without spatial index

* Apply suggestions from code review

* Add explicit build_sindex method

* Revert change to DaskGeoDataFrame.cx

* Add build_sindex arg to read_parquet_dask

* Make build sindex twice a no-op
  • Loading branch information
philippjfr authored Jan 26, 2021
1 parent 8c156af commit 6bd3311
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 14 deletions.
12 changes: 12 additions & 0 deletions spatialpandas/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ def cx(self):
def cx_partitions(self):
return _DaskPartitionCoordinateIndexer(self, self.partition_sindex)

def build_sindex(self):
def build_sindex(series):
series.build_sindex()
return series
return self.map_partitions(build_sindex, meta=self._meta)

def intersects_bounds(self, bounds):
return self.map_partitions(lambda s: s.intersects_bounds(bounds))

Expand Down Expand Up @@ -515,6 +521,12 @@ def _propagate_props_to_series(self, new_series):
new_series._partition_sindex = self._partition_sindex[new_series.name]
return new_series

def build_sindex(self):
def build_sindex(df):
df.build_sindex()
return df
return self.map_partitions(build_sindex, meta=self._meta)

def persist(self, **kwargs):
return self._propagate_props_to_dataframe(
super().persist(**kwargs)
Expand Down
3 changes: 3 additions & 0 deletions spatialpandas/geodataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ def cx(self):
self.geometry.array, parent=self
)

def build_sindex(self):
self.geometry.build_sindex()

def _ensure_type(self, obj):
# Override because a GeoDataFrame operation may result in a regular DataFrame,
# and that's ok
Expand Down
43 changes: 30 additions & 13 deletions spatialpandas/geometry/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,9 +536,13 @@ def fillna(self, value=None, method=None, limit=None):
@property
def sindex(self):
if self._sindex is None:
self._sindex = HilbertRtree(self.bounds)
self.build_sindex()
return self._sindex

def build_sindex(self):
if self._sindex is None:
self._sindex = HilbertRtree(self.bounds)

@property
def cx(self):
"""
Expand Down Expand Up @@ -636,7 +640,10 @@ def _get_bounds(self, key):
"intervals in continuous coordinate space, and a slice step has no "
"clear interpretation in this context."
)
xmin, ymin, xmax, ymax = self._sindex.total_bounds
if self._sindex:
xmin, ymin, xmax, ymax = self._sindex.total_bounds
else:
xmin, ymin, xmax, ymax = self._obj.total_bounds
x0, y0, x1, y1 = (
xs.start if xs.start is not None else xmin,
ys.start if ys.start is not None else ymin,
Expand All @@ -652,7 +659,10 @@ def _get_bounds(self, key):

def __getitem__(self, key):
x0, x1, y0, y1 = self._get_bounds(key)
covers_inds, overlaps_inds = self._sindex.covers_overlaps((x0, y0, x1, y1))
if self._sindex:
covers_inds, overlaps_inds = self._sindex.covers_overlaps((x0, y0, x1, y1))
else:
covers_inds, overlaps_inds = None, None
return self._perform_get_item(covers_inds, overlaps_inds, x0, x1, y0, y1)

def _perform_get_item(self, covers_inds, overlaps_inds, x0, x1, y0, y1):
Expand All @@ -661,24 +671,31 @@ def _perform_get_item(self, covers_inds, overlaps_inds, x0, x1, y0, y1):

class _CoordinateIndexer(_BaseCoordinateIndexer):
def __init__(self, obj, parent=None):
super().__init__(obj.sindex)
super().__init__(obj._sindex)
self._obj = obj
self._parent = parent

def _perform_get_item(self, covers_inds, overlaps_inds, x0, x1, y0, y1):
overlaps_inds_mask = self._obj.intersects_bounds(
(x0, y0, x1, y1), overlaps_inds
)
selected_inds = np.sort(
np.concatenate([covers_inds, overlaps_inds[overlaps_inds_mask]])
)
if self._parent is not None:
if len(self._parent) > 0:
return self._parent.iloc[selected_inds]
else:
return self._parent
else:
if covers_inds is not None:
selected_inds = np.sort(
np.concatenate([covers_inds, overlaps_inds[overlaps_inds_mask]])
)
if self._parent is not None:
if len(self._parent) > 0:
return self._parent.iloc[selected_inds]
else:
return self._parent
return self._obj[selected_inds]
else:
if self._parent is not None:
if len(self._parent) > 0:
return self._parent[overlaps_inds_mask]
else:
return self._parent
return self._obj[overlaps_inds_mask]


@ngjit
Expand Down
3 changes: 3 additions & 0 deletions spatialpandas/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ def cx(self):
from .geometry.base import _CoordinateIndexer
return _CoordinateIndexer(self.array, parent=self)

def build_sindex(self):
self.array.build_sindex()

def intersects_bounds(self, bounds):
return pd.Series(
self.array.intersects_bounds(bounds), index=self.index
Expand Down
7 changes: 6 additions & 1 deletion spatialpandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def to_parquet_dask(

def read_parquet_dask(
path, columns=None, filesystem=None, load_divisions=False,
geometry=None, bounds=None, categories=None,
geometry=None, bounds=None, categories=None, build_sindex=False
):
"""
Read spatialpandas parquet dataset(s) as DaskGeoDataFrame. Datasets are assumed to
Expand Down Expand Up @@ -207,6 +207,8 @@ def read_parquet_dask(
If a list, assumes up to 2**16-1 labels; if a dict, specify the number
of labels expected; if None, will load categories automatically for
data written by dask/fastparquet, not otherwise.
build_sindex : boolean
Whether to build partition level spatial indexes to speed up indexing.
Returns:
DaskGeoDataFrame
"""
Expand All @@ -233,6 +235,9 @@ def read_parquet_dask(
categories=categories
)

if build_sindex:
result = result.build_sindex()

return result


Expand Down

0 comments on commit 6bd3311

Please sign in to comment.