Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GeoDataFrame slicing issue by adding `_slice` method. #680

Merged
Merged
52 changes: 51 additions & 1 deletion python/cuspatial/cuspatial/core/geodataframe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION
from typing import Dict, Tuple, TypeVar, Union

import pandas as pd
from geopandas import GeoDataFrame as gpGeoDataFrame
from geopandas.geoseries import is_geometry_type as gp_is_geometry_type

Expand All @@ -9,13 +11,15 @@
from cuspatial.core.geoseries import GeoSeries
from cuspatial.io.geopandas_reader import GeoPandasReader

T = TypeVar("T", bound="GeoDataFrame")


class GeoDataFrame(cudf.DataFrame):
"""
A GPU GeoDataFrame object.
"""

def __init__(self, data: gpGeoDataFrame = None):
def __init__(self, data: Union[Dict, gpGeoDataFrame] = None):
"""
Constructs a GPU GeoDataFrame from a GeoPandas dataframe.

Expand All @@ -34,6 +38,8 @@ def __init__(self, data: gpGeoDataFrame = None):
self._data[col] = column
else:
self._data[col] = data[col]
elif isinstance(data, dict):
super()._init_from_dict_like(data)
elif data is None:
pass
else:
Expand Down Expand Up @@ -120,6 +126,50 @@ def _copy_type_metadata(

return type_copied

def _split_out_geometry_columns(self) -> Tuple:
    """
    Break the geometry columns and non-geometry columns into
    separate dataframes and return them separated.

    Returns
    -------
    tuple
        ``(geo_columns, data_columns)``. ``geo_columns`` is the result of
        selecting from `self` every column that is a `GeoSeries`;
        ``data_columns`` is a `cudf.DataFrame` built from the remaining
        columns.
    """
    columns_mask = pd.Series(self.columns)
    # Force a boolean dtype. When the frame has no columns the list below
    # is empty and pandas would otherwise infer a non-boolean dtype
    # (float64 or object, depending on the pandas version), so that
    # `~geocolumn_mask` either raises or no longer acts as a boolean mask.
    geocolumn_mask = pd.Series(
        [isinstance(self[col], GeoSeries) for col in self.columns],
        dtype="bool",
    )
    geo_columns = self[columns_mask[geocolumn_mask]]
    # Send the rest of the columns to `cudf` to slice.
    data_columns = cudf.DataFrame(
        self[columns_mask[~geocolumn_mask].values]
    )
    return (geo_columns, data_columns)

def _recombine_columns(self, geo_columns, data_columns):
    """
    Merge geometry and non-geometry columns back into a single mapping.

    Walks the columns of `self` in order and, for each name, takes the
    column from `geo_columns` when the corresponding column of `self` is
    a `GeoSeries`, otherwise from `data_columns`. Returns a dict keyed by
    column name, in the same order as the columns in `self`.
    """
    names = pd.Series(self.columns).values
    is_geometry = pd.Series(
        [isinstance(self[col], GeoSeries) for col in self.columns]
    ).values

    combined = {}
    for name, from_geo in zip(names, is_geometry):
        source = geo_columns if from_geo else data_columns
        combined[name] = source[name]
    return combined

def _slice(self: T, arg: slice) -> T:
    """
    Row-slice this GeoDataFrame, overriding the `_slice` provided by
    cudf's frame members.

    Geometry columns are sliced one at a time through `.iloc`, the
    remaining columns are sliced by calling `_slice` on the `cudf`
    dataframe that holds them, and the two parts are recombined in the
    column order of `self` before a new instance of this class is built.
    """
    geo_part, data_part = self._split_out_geometry_columns()

    geo_slices = {}
    for name in geo_part.columns:
        geo_slices[name] = geo_part[name].iloc[arg]
    sliced_geo = GeoDataFrame(geo_slices)

    sliced_data = data_part._slice(arg)
    return self.__class__(
        self._recombine_columns(sliced_geo, sliced_data)
    )


class _GeoSeriesUtility:
@classmethod
Expand Down
11 changes: 8 additions & 3 deletions python/cuspatial/cuspatial/core/geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ def __init__(
index = data.index
if index is None:
index = cudf.RangeIndex(0, len(column))
super().__init__(column, index, dtype, name, nan_as_null)
super().__init__(
column, index, dtype=dtype, name=name, nan_as_null=nan_as_null
)

@property
def type(self):
Expand Down Expand Up @@ -272,9 +274,11 @@ def __getitem__(self, item):
)

if isinstance(item, Integral):
return GeoSeries(column).to_shapely()
return GeoSeries(column, name=self._sr.name).to_shapely()
else:
return GeoSeries(column, index=self._sr.index[indexes])
return GeoSeries(
column, index=self._sr.index[indexes], name=self._sr.name
)

def from_arrow(union):
column = GeoColumn(
Expand Down Expand Up @@ -316,6 +320,7 @@ def to_geopandas(self, nullable=False):
return gpGeoSeries(
final_union_slice.to_shapely(),
index=self.index.to_pandas(),
name=self.name,
)

def to_pandas(self):
Expand Down
72 changes: 67 additions & 5 deletions python/cuspatial/cuspatial/tests/test_geodataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,13 @@ def test_pre_slice(gpdf, pre_slice):

@pytest.mark.parametrize(
"post_slice",
[slice(0, 12)]
+ [slice(0, 10, 1)]
+ [slice(0, 3, 1)]
+ [slice(3, 6, 1)]
+ [slice(6, 9, 1)],
[
(slice(0, 12)),
(slice(0, 10, 1)),
(slice(0, 3, 1)),
(slice(3, 6, 1)),
(slice(6, 9, 1)),
],
)
def test_post_slice(gpdf, post_slice):
geometries = gpdf
Expand All @@ -241,6 +243,66 @@ def test_post_slice(gpdf, post_slice):
assert_eq_geo_df(gi[post_slice], cugpdf_back[post_slice])


@pytest.mark.parametrize(
    "inline_slice",
    [
        slice(0, 12),
        slice(0, 10, 1),
        slice(0, 3, 1),
        slice(3, 6, 1),
        slice(6, 9, 1),
    ],
)
def test_inline_slice(gpdf, inline_slice):
    """
    Slicing a cuspatial GeoDataFrame directly and then converting it to
    pandas must match slicing the source GeoPandas dataframe.
    """
    expected_frame = gpd.GeoDataFrame(gpdf)
    gpu_frame = cuspatial.from_geopandas(expected_frame)
    expected = expected_frame[inline_slice]
    got = gpu_frame[inline_slice].to_pandas()
    assert_eq_geo_df(expected, got)


def test_slice_column_order(gpdf):
    """
    Slicing must work for several interleavings of geometry and data
    columns.

    Each case lists the column names in construction order together with
    the row slice to apply. Names starting with "geo" are filled with the
    source geometry column; all others get an integer range as long as
    the frame.
    """
    cugpdf = cuspatial.from_geopandas(gpd.GeoDataFrame(gpdf))

    cases = [
        (["geo1", "data1", "geo2", "data2"], slice(0, 5)),
        (["data1", "geo1", "geo2", "data2"], slice(5, None)),
        (
            [
                "data1",
                "geo4",
                "data2",
                "geo3",
                "data3",
                "geo2",
                "geo1",
                "data4",
                "data5",
                "data6",
            ],
            slice(5, None),
        ),
    ]

    for column_names, rows in cases:
        slice_df = cuspatial.core.geodataframe.GeoDataFrame(
            {
                name: (
                    cugpdf["geometry"]
                    if name.startswith("geo")
                    else np.arange(len(cugpdf))
                )
                for name in column_names
            }
        )
        slice_gi = slice_df.to_pandas()
        assert_eq_geo_df(slice_gi[rows], slice_df[rows].to_pandas())


@pytest.mark.parametrize(
"df_boolmask",
[
Expand Down
22 changes: 12 additions & 10 deletions python/cuspatial/cuspatial/tests/test_geoseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,11 @@ def test_to_shapely_random():
"pre_slice",
[
list(np.arange(10)),
(slice(0, 12)),
(slice(0, 10, 1)),
(slice(0, 3, 1)),
(slice(3, 6, 1)),
(slice(6, 9, 1)),
slice(0, 12),
slice(0, 10, 1),
slice(0, 3, 1),
slice(3, 6, 1),
slice(6, 9, 1),
],
)
def test_to_shapely(gs, pre_slice):
Expand Down Expand Up @@ -333,11 +333,13 @@ def test_getitem_slice_mpolygons(gs):

@pytest.mark.parametrize(
"series_slice",
list(np.arange(10))
+ [slice(0, 10, 1)]
+ [slice(0, 3, 1)]
+ [slice(3, 6, 1)]
+ [slice(6, 9, 1)],
[
list(np.arange(10)),
slice(0, 10, 1),
slice(0, 3, 1),
slice(3, 6, 1),
slice(6, 9, 1),
],
)
def test_size(gs, series_slice):
geometries = gs[series_slice]
Expand Down