diff --git a/.gitignore b/.gitignore index ccecca937..347bbea87 100644 --- a/.gitignore +++ b/.gitignore @@ -146,4 +146,4 @@ ENV/ notebooks/taxi2016.csv notebooks/taxi2017.csv notebooks/tzones_lonlat.json -notebooks/cu_taxi_zones.* +notebooks/cu_taxi.zones.* diff --git a/python/cuspatial/cuspatial/geometry/geoarrowbuffers.py b/python/cuspatial/cuspatial/geometry/geoarrowbuffers.py index 7336a3c09..e39acee5b 100644 --- a/python/cuspatial/cuspatial/geometry/geoarrowbuffers.py +++ b/python/cuspatial/cuspatial/geometry/geoarrowbuffers.py @@ -1,9 +1,10 @@ -# Copyright (c) 2021 NVIDIA CORPORATION +# Copyright (c) 2021-2022 NVIDIA CORPORATION from typing import TypeVar, Union import numpy as np import pandas as pd +import pyarrow as pa import cudf @@ -83,10 +84,12 @@ def __init__(self, data: Union[dict, T], data_locale: object = cudf): self._multipoints = None self._lines = None self._polygons = None + self.data = None + if isinstance(data, dict): if data.get("points_xy") is not None: self._points = CoordinateArray( - data["points_xy"], data_locale=data_locale + pa.array(data["points_xy"]), data_locale=data_locale ) if data.get("mpoints_xy") is not None: if data.get("mpoints_offsets") is None: @@ -102,7 +105,9 @@ def __init__(self, data: Union[dict, T], data_locale: object = cudf): self._lines = LineArray( data["lines_xy"], data["lines_offsets"], - data.get("mlines"), + data["mlines"] + if data.get("mlines") + else np.arange(len(data["lines_offsets"])), data_locale=data_locale, ) if data.get("polygons_xy") is not None: @@ -115,11 +120,14 @@ def __init__(self, data: Union[dict, T], data_locale: object = cudf): ) self._polygons = PolygonArray( data["polygons_xy"], - data["polygons_polygons"], data["polygons_rings"], - data.get("mpolygons"), + data["polygons_polygons"], + data["mpolygons"] + if data.get("mpolygons") + else np.arange(len(data["polygons_polygons"])), data_locale=data_locale, ) + elif isinstance(data, GeoArrowBuffers): if data.points is not None: self._points = CoordinateArray( @@ -143,12 +151,39 @@ def __init__(self, data: Union[dict, T], data_locale: object = cudf): if data.polygons is not None: self._polygons = PolygonArray( data.polygons.xy, - data.polygons.polys, data.polygons.rings, + data.polygons.polys, data.polygons.mpolys, data.polygons.z, data_locale=data_locale, ) + + elif isinstance(data, pa.lib.UnionArray): + self.data = data + self._points = CoordinateArray( + data.field(0).values, [], data_locale=data_locale + ) + self._multipoints = MultiPointArray( + data.field(1).values.values, + data.field(1).offsets, + [], + data_locale=data_locale, + ) + self._lines = LineArray( + data.field(2).values.values.values, + data.field(2).values.offsets, + data.field(2).offsets, + [], + data_locale=data_locale, + ) + self._polygons = PolygonArray( + data.field(3).values.values.values.values, + data.field(3).values.values.offsets, + data.field(3).values.offsets, + data.field(3).offsets, + [], + data_locale=data_locale, + ) else: raise TypeError( f"Invalid type passed to GeoArrowBuffers ctor {type(data)}" @@ -200,13 +235,13 @@ def __len__(self): """ The numer of unique geometries stored in this GeoArrowBuffers. """ - points_length = len(self._points) if self.points is not None else 0 - lines_length = len(self._lines) if self.lines is not None else 0 + points_length = len(self.points) if self.points is not None else 0 + lines_length = len(self.lines) if self.lines is not None else 0 multipoints_length = ( - len(self._multipoints) if self.multipoints is not None else 0 + len(self.multipoints) if self.multipoints is not None else 0 ) polygons_length = ( - len(self._polygons) if self.polygons is not None else 0 + len(self.polygons) if self.polygons is not None else 0 ) return ( points_length + lines_length + multipoints_length + polygons_length @@ -279,9 +314,9 @@ def data_location(self): @data_location.setter def data_location(self, data_location): - if data_location not in (cudf, pd): + if data_location not in (cudf, pd, pa): raise NotImplementedError( - "only cudf and pandas CoordinateArrays " + "only cudf, pandas, and pa CoordinateArrays " "are supported at this time" ) else: @@ -289,7 +324,9 @@ def data_location(self, data_location): def _serialize(self, data): try: - if self._data_location == pd: + if self._data_location == pa: + return data + elif self._data_location == pd: if isinstance(data, cudf.Series): return data.to_pandas() else: @@ -361,14 +398,14 @@ def x(self): """ Return packed x-coordinates of this GeometryArray object. """ - return self.xy[slice(0, None, 2)].reset_index(drop=True) + return self.xy[::2] @property def y(self): """ Return packed y-coordinates of this GeometryArray object. """ - return self.xy[slice(1, None, 2)].reset_index(drop=True) + return self.xy[1::2] def __len__(self): return len(self.xy) // 2 @@ -415,9 +452,9 @@ def __getitem__(self, index): rindex = index else: rindex = slice(index, index + 1, 1) - new_slice = slice(self.offsets[rindex.start], None) + new_slice = slice(self.offsets[rindex.start] * 2, None) if rindex.stop < len(self.offsets): - new_slice = slice(new_slice.start, self.offsets[rindex.stop]) + new_slice = slice(new_slice.start, self.offsets[rindex.stop] * 2) result = self.xy[new_slice] return result @@ -490,22 +527,7 @@ def copy(self, deep=True): return result def __len__(self): - if len(self._mlines) > 0: - mlength = ( - self._mlines.values[ - np.arange( - 1, len(self._mlines), 2, like=self._mlines.values - ) - ] - - self._mlines.values[ - np.arange( - 0, len(self._mlines), 2, like=self._mlines.values - ) - ] - ).sum() - (len(self._mlines) // 2) - else: - mlength = 0 - return (len(self.offsets) - 1) - int(mlength) + return len(self._mlines) - 1 class MultiPointArray(OffsetArray): @@ -525,7 +547,7 @@ def copy(self, deep=True): class PolygonArray(OffsetArray): - def __init__(self, xy, polys, rings, mpolys, z=None, data_locale=cudf): + def __init__(self, xy, rings, polys, mpolys, z=None, data_locale=cudf): """ The GeoArrow column format for Polygons uses the same scheme as the format for LineStrings - MultiPolygons and Polygons from the same @@ -603,19 +625,4 @@ def copy(self, deep=True): return result def __len__(self): - if len(self._mpolys) > 0: - mlength = ( - self._mpolys.values[ - np.arange( - 1, len(self._mpolys), 2, like=self._mpolys.values - ) - ] - - self._mpolys.values[ - np.arange( - 0, len(self._mpolys), 2, like=self._mpolys.values - ) - ] - ).sum() - (len(self._mpolys) // 2) - else: - mlength = 0 - return (len(self.polys) - 1) - int(mlength) + return len(self._mpolys) - 1 diff --git a/python/cuspatial/cuspatial/geometry/geocolumn.py b/python/cuspatial/cuspatial/geometry/geocolumn.py index d6d6d2094..40360b323 100644 --- a/python/cuspatial/cuspatial/geometry/geocolumn.py +++ b/python/cuspatial/cuspatial/geometry/geocolumn.py @@ -1,9 +1,9 @@ -# Copyright (c) 2021 NVIDIA CORPORATION +# Copyright (c) 2021-2022 NVIDIA CORPORATION import numbers from itertools import repeat from typing import TypeVar, Union -import numpy as np +import pyarrow as pa from shapely.geometry import ( LineString, MultiLineString, @@ -37,54 +37,51 @@ def __init__(self, meta: Union[GeoArrowBuffers, dict]): self.input_types = [] self.input_lengths = [] if buffers.points is not None: - self.input_types.extend(repeat("p", len(buffers.points))) + self.input_types.extend(repeat(0, len(buffers.points))) self.input_lengths.extend(repeat(1, len(buffers.points))) if buffers.multipoints is not None: - self.input_types.extend(repeat("mp", len(buffers.multipoints))) + self.input_types.extend(repeat(1, len(buffers.multipoints))) self.input_lengths.extend(repeat(1, len(buffers.multipoints))) if buffers.lines is not None: - if len(buffers.lines.mlines) > 0: - self.input_types.extend(repeat("l", buffers.lines.mlines[0])) - self.input_lengths.extend(repeat(1, buffers.lines.mlines[0])) - for ml_index in range(len(buffers.lines.mlines) // 2): - self.input_types.extend(["ml"]) - self.input_lengths += [1] - mline_size = ( - buffers.lines.mlines[ml_index * 2 + 1] - - 1 - - buffers.lines.mlines[ml_index * 2] - ) - self.input_types.extend(repeat("l", mline_size)) - self.input_lengths.extend(repeat(1, mline_size)) - else: - self.input_types.extend(repeat("l", len(buffers.lines))) - self.input_lengths.extend(repeat(1, len(buffers.lines))) - if buffers.polygons is not None: - if len(buffers.polygons.mpolys) > 0: - self.input_types.extend( - repeat("poly", buffers.polygons.mpolys[0]) + for index in range(len(buffers.lines.mlines) - 1): + line_len = ( + buffers.lines.mlines[index + 1] + - buffers.lines.mlines[index] ) - self.input_lengths.extend( - repeat(1, buffers.polygons.mpolys[0]) + if line_len > 1: + self.input_types.extend([3]) + self.input_lengths.extend([line_len]) + else: + self.input_types.extend([2]) + self.input_lengths.extend([1]) + if buffers.polygons is not None: + for index in range(len(buffers.polygons.mpolys) - 1): + poly_len = ( + buffers.polygons.mpolys[index + 1] + - buffers.polygons.mpolys[index] ) - for mp_index in range(len(buffers.polygons.mpolys) // 2): - mpoly_size = ( - buffers.polygons.mpolys[mp_index * 2 + 1] - - buffers.polygons.mpolys[mp_index * 2] - ) - self.input_types.extend(["mpoly"]) - self.input_lengths.extend([mpoly_size]) - self.input_types.extend(repeat("poly", mpoly_size)) - self.input_lengths.extend(repeat(1, mpoly_size)) - else: - self.input_types.extend(repeat("poly", len(buffers.polygons))) - self.input_lengths.extend(repeat(1, len(buffers.polygons))) + if poly_len > 1: + self.input_types.extend([5]) + self.input_lengths.extend([poly_len]) + else: + self.input_types.extend([4]) + self.input_lengths.extend([1]) + self.input_types = pa.array(self.input_types, type=pa.int8()) + self.input_lengths = pa.array(self.input_lengths).cast(pa.int32()) def copy(self): - return type(self)( + return self.__class__( { - "input_types": self.input_types.copy(), - "input_lengths": self.input_lengths.copy(), + "input_types": pa.Int8Array.from_buffers( + self.input_types.type, + len(self.input_types), + self.input_types.buffers(), + ), + "input_lengths": pa.Int32Array.from_buffers( + self.input_lengths.type, + len(self.input_lengths), + self.input_lengths.buffers(), + ), } ) @@ -198,6 +195,13 @@ def copy(self, deep=True): ) return result + def from_arrow(self): + """ + I know what to do! + """ + print("Not ready to convert from arrow") + breakpoint() + class GeoColumnLocIndexer: """ @@ -229,14 +233,16 @@ def __getitem__(self, index): def _getitem_int(self, index): type_map = { - "p": PointShapelySerializer, - "mp": MultiPointShapelySerializer, - "l": LineStringShapelySerializer, - "ml": MultiLineStringShapelySerializer, - "poly": PolygonShapelySerializer, - "mpoly": MultiPolygonShapelySerializer, + 0: PointShapelySerializer, + 1: MultiPointShapelySerializer, + 2: LineStringShapelySerializer, + 3: MultiLineStringShapelySerializer, + 4: PolygonShapelySerializer, + 5: MultiPolygonShapelySerializer, } - return type_map[self._sr._meta.input_types[index]](self._sr, index) + return type_map[self._sr._meta.input_types[index].as_py()]( + self._sr, index + ) class ShapelySerializer: @@ -287,7 +293,7 @@ def to_shapely(self): ) item_source = self._source.multipoints result = item_source[item_start] - return MultiPoint(np.array(result).reshape(item_length // 2, 2)) + return MultiPoint(result.to_numpy().reshape(item_length, 2)) class LineStringShapelySerializer(ShapelySerializer): @@ -297,31 +303,23 @@ def to_shapely(self): of the LineString referenced by `self._index`, creates one, and returns it. """ - ml_index = self._index - 1 - preceding_line_count = 0 - preceding_ml_count = 0 - # Skip over any LineStrings that are part of a MultiLineString - while ml_index >= 0: - if self._source._meta.input_types[ml_index] == "ml": - preceding_ml_count = preceding_ml_count + 1 - elif ( - self._source._meta.input_types[ml_index] == "l" - and preceding_ml_count == 0 + index = 0 + for i in range(self._index): + if ( + self._source._meta.input_types[i] + == pa.array([2]).cast(pa.int8())[0] + or self._source._meta.input_types[i] + == pa.array([3]).cast(pa.int8())[0] ): - preceding_line_count = preceding_line_count + 1 - ml_index = ml_index - 1 - preceding_multis = preceding_ml_count - if preceding_multis > 0: - multi_end = self._source.lines.mlines[preceding_multis * 2 - 1] - item_start = multi_end + preceding_line_count - else: - item_start = preceding_line_count - item_length = self._source._meta.input_lengths[self._index] - item_end = item_length + item_start - item_source = self._source.lines - result = item_source[item_start:item_end] + index = index + 1 + ring_start = self._source.lines.mlines[index] + ring_end = self._source.lines.mlines[index + 1] + rings = self._source.lines.offsets * 2 + item_start = rings[ring_start] + item_end = rings[ring_end] + result = self._source.lines.xy[item_start:item_end] return LineString( - np.array(result).reshape(2 * (item_start - item_end), 2) + result.to_numpy().reshape(2 * (item_start - item_end), 2) ) @@ -333,21 +331,25 @@ def to_shapely(self): `self._index`, then return the MultiLineString at that position packed with the LineStrings in its range. """ - item_type = self._source._meta.input_types[self._index] index = 0 for i in range(self._index): - if self._source._meta.input_types[i] == item_type: + if ( + self._source._meta.input_types[i] + == pa.array([2]).cast(pa.int8())[0] + or self._source._meta.input_types[i] + == pa.array([3]).cast(pa.int8())[0] + ): index = index + 1 line_indices = slice( - self._source.lines.mlines[index * 2], - self._source.lines.mlines[index * 2 + 1], + self._source.lines.mlines[index], + self._source.lines.mlines[index + 1], ) return MultiLineString( [ LineString( - np.array(self._source.lines[i]).reshape( - int(len(self._source.lines[i]) / 2), 2 - ) + self._source.lines[i] + .to_numpy() + .reshape(int(len(self._source.lines[i]) / 2), 2) ) for i in range(line_indices.start, line_indices.stop, 1) ] @@ -365,36 +367,28 @@ class PolygonShapelySerializer(ShapelySerializer): """ def to_shapely(self): - mp_index = self._index - 1 - preceding_poly_count = 0 - preceding_mp_count = 0 - while mp_index >= 0: - if self._source._meta.input_types[mp_index] == "mpoly": - preceding_mp_count = preceding_mp_count + 1 - elif ( - self._source._meta.input_types[mp_index] == "poly" - and preceding_mp_count == 0 + index = 0 + for i in range(self._index): + if ( + self._source._meta.input_types[i] + == pa.array([4]).cast(pa.int8())[0] + or self._source._meta.input_types[i] + == pa.array([5]).cast(pa.int8())[0] ): - preceding_poly_count = preceding_poly_count + 1 - mp_index = mp_index - 1 - preceding_multis = preceding_mp_count - multi_index = ( - self._source.polygons.mpolys[preceding_multis * 2 - 1] - if preceding_multis > 0 - else 0 - ) - preceding_polys = preceding_poly_count - ring_start = self._source.polygons.polys[multi_index + preceding_polys] - ring_end = self._source.polygons.polys[ - multi_index + preceding_polys + 1 - ] - rings = self._source.polygons.rings + index = index + 1 + polygon_start = self._source.polygons.mpolys[index] + polygon_end = self._source.polygons.mpolys[index + 1] + ring_start = self._source.polygons.polys[polygon_start] + ring_end = self._source.polygons.polys[polygon_end] + rings = self._source.polygons.rings * 2 exterior_slice = slice(rings[ring_start], rings[ring_start + 1]) exterior = self._source.polygons.xy[exterior_slice] return Polygon( - np.array(exterior).reshape(2 * (ring_start - ring_end), 2), + exterior.to_numpy().reshape(2 * (ring_start - ring_end), 2), [ - np.array(self._source.polygons.xy[interior_slice]).reshape( + self._source.polygons.xy[interior_slice] + .to_numpy() + .reshape( int((interior_slice.stop - interior_slice.start + 1) / 2), 2, ) @@ -414,29 +408,35 @@ def to_shapely(self): subsequent interior rings of all polygons that around bound by the mpolygon specified by self._index. """ - item_type = self._source._meta.input_types[self._index] index = 0 for i in range(self._index): - if self._source._meta.input_types[i] == item_type: + if ( + self._source._meta.input_types[i] + == pa.array([4]).cast(pa.int8())[0] + or self._source._meta.input_types[i] + == pa.array([5]).cast(pa.int8())[0] + ): index = index + 1 poly_indices = slice( - self._source.polygons.mpolys[index * 2], - self._source.polygons.mpolys[index * 2 + 1], + self._source.polygons.mpolys[index], + self._source.polygons.mpolys[index + 1], ) polys = [] for i in range(poly_indices.start, poly_indices.stop): ring_start = self._source.polygons.polys[i] ring_end = self._source.polygons.polys[i + 1] - rings = self._source.polygons.rings + rings = self._source.polygons.rings * 2 exterior_slice = slice(rings[ring_start], rings[ring_start + 1]) exterior = self._source.polygons.xy[exterior_slice] polys.append( Polygon( - np.array(exterior).reshape(2 * (ring_start - ring_end), 2), + exterior.to_numpy().reshape( + 2 * (ring_start - ring_end), 2 + ), [ - np.array( - self._source.polygons.xy[interior_slice] - ).reshape( + self._source.polygons.xy[interior_slice] + .to_numpy() + .reshape( int( ( interior_slice.stop diff --git a/python/cuspatial/cuspatial/geometry/geodataframe.py b/python/cuspatial/cuspatial/geometry/geodataframe.py index 49dda32f7..c25274091 100644 --- a/python/cuspatial/cuspatial/geometry/geodataframe.py +++ b/python/cuspatial/cuspatial/geometry/geodataframe.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION +import pyarrow as pa from geopandas import GeoDataFrame as gpGeoDataFrame import cudf @@ -31,7 +32,7 @@ def __init__(self, data: gpGeoDataFrame = None): if is_geometry_type(data[col]): adapter = GeoPandasAdapter(data[col]) buffers = GeoArrowBuffers( - adapter.get_geoarrow_host_buffers() + adapter.get_geoarrow_union(), data_locale=pa ) pandas_meta = GeoMeta(adapter.get_geopandas_meta()) column = GeoColumn(buffers, pandas_meta) diff --git a/python/cuspatial/cuspatial/geometry/geoseries.py b/python/cuspatial/cuspatial/geometry/geoseries.py index 7794c9a15..5d1357aba 100644 --- a/python/cuspatial/cuspatial/geometry/geoseries.py +++ b/python/cuspatial/cuspatial/geometry/geoseries.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION +# Copyright (c) 2020-2022, NVIDIA CORPORATION from typing import TypeVar, Union import geopandas as gpd import pandas as pd +import pyarrow as pa from geopandas.geoseries import GeoSeries as gpGeoSeries import cudf @@ -55,7 +56,9 @@ def __init__( column = data._column elif isinstance(data, gpGeoSeries): adapter = GeoPandasAdapter(data) - buffers = GeoArrowBuffers(adapter.get_geoarrow_host_buffers()) + buffers = GeoArrowBuffers( + adapter.get_geoarrow_union(), data_locale=pa + ) pandas_meta = GeoMeta(adapter.get_geopandas_meta()) column = GeoColumn(buffers, pandas_meta) else: diff --git a/python/cuspatial/cuspatial/geometry/pygeoarrow.py b/python/cuspatial/cuspatial/geometry/pygeoarrow.py new file mode 100644 index 000000000..b14d97861 --- /dev/null +++ b/python/cuspatial/cuspatial/geometry/pygeoarrow.py @@ -0,0 +1,189 @@ +# Copyright (c) 2021 NVIDIA CORPORATION + +import geopandas as gpd +import pyarrow as pa +from shapely.geometry import ( + LineString, + MultiLineString, + MultiPoint, + MultiPolygon, + Point, + Polygon, +) + + +def named_list(name, x, size=-1): + return pa.list_(pa.field(name, x, nullable=False), list_size=size) + + +XYType: pa.ListType = named_list("xy", pa.float64(), size=2) + +ArrowPolygonsType: pa.ListType = named_list( + "polygons", named_list("rings", named_list("vertices", XYType)) +) + +ArrowLinestringsType: pa.ListType = named_list( + "lines", named_list("offsets", XYType) +) + +ArrowMultiPointsType: pa.ListType = named_list("points", XYType) + +ArrowPointsType: pa.ListType = XYType + + +def getGeoArrowUnionRootType() -> pa.union: + return pa.union( + [ + ArrowPointsType, + ArrowMultiPointsType, + ArrowLinestringsType, + ArrowPolygonsType, + ], + mode="dense", + ) + + +def from_geopandas(geoseries: gpd.GeoSeries) -> pa.lib.UnionArray: + def get_coordinates(data) -> tuple: + point_coords = [] + mpoint_coords = [] + line_coords = [] + polygon_coords = [] + all_offsets = [0] + type_buffer = [] + + for geom in data: + coords = geom.__geo_interface__["coordinates"] + if isinstance(geom, Point): + point_coords.append(coords) + all_offsets.append(1) + elif isinstance(geom, MultiPoint): + mpoint_coords.append(coords) + all_offsets.append(len(coords)) + elif isinstance(geom, LineString): + line_coords.append([coords]) + all_offsets.append(1) + elif isinstance(geom, MultiLineString): + line_coords.append(coords) + all_offsets.append(len(coords)) + elif isinstance(geom, Polygon): + polygon_coords.append([coords]) + all_offsets.append(1) + elif isinstance(geom, MultiPolygon): + polygon_coords.append(coords) + all_offsets.append(len(coords)) + else: + raise TypeError(type(geom)) + type_buffer.append( + { + Point: 0, + MultiPoint: 1, + LineString: 2, + MultiLineString: 3, + Polygon: 4, + MultiPolygon: 5, + }[type(geom)] + ) + return ( + type_buffer, + point_coords, + mpoint_coords, + line_coords, + polygon_coords, + all_offsets, + ) + + buffers = get_coordinates(geoseries) + type_buffer = pa.array(buffers[0]).cast(pa.int8()) + all_offsets = pa.array(buffers[5]).cast(pa.int32()) + children = [ + pa.array(buffers[1], type=ArrowPointsType), + pa.array(buffers[2], type=ArrowMultiPointsType), + pa.array(buffers[3], type=ArrowLinestringsType), + pa.array(buffers[4], type=ArrowPolygonsType), + ] + + return pa.UnionArray.from_dense( + type_buffer, + all_offsets, + children, + ["points", "mpoints", "lines", "polygons"], + ) + + +class GeoArrow(pa.UnionArray): + """The GeoArrow specification.""" + + def __init__(self, types, offsets, children): + super().__init__(self, types, offsets, children) + + @property + def points(self): + """ + A simple numeric column. x and y coordinates are interleaved such that + even coordinates are x axis and odd coordinates are y axis. + """ + return self.fields(0) + + @property + def multipoints(self): + """ + Similar to the Points column with the addition of an offsets column. + The offsets column stores the comparable sizes and coordinates of each + MultiPoint in the GeoArrowBuffers. + """ + return self.fields(1) + + @property + def lines(self): + """ + Contains the coordinates column, an offsets column, + and a mlines column. + The mlines column is optional. The mlines column stores + the indices of the offsets that indicate the beginning and end of each + MultiLineString segment. The absence of an `mlines` column indicates + there are no `MultiLineStrings` in the data source, + only `LineString` s. + """ + return self.fields(2) + + @property + def polygons(self): + """ + Contains the coordinates column, a rings column specifying + the beginning and end of every polygon, a polygons column specifying + the beginning, or exterior, ring of each polygon and the end ring. + All rings after the first ring are interior rings. Finally a + mpolygons column stores the offsets of the polygons that should be + grouped into MultiPolygons. + """ + return self.fields(3) + + def copy(self): + """ + Duplicates the UnionArray + """ + return pa.UnionArray.from_dense( + self.type_codes, + self.offsets, + [ + self.field(0).values, + self.field(1).values, + self.field(2).values, + self.field(3).values, + ], + ) + + def to_host(self): + """ + Return a copy of GeoArrowBuffers backed by host data structures. + """ + raise NotImplementedError + + def __repr__(self): + return ( + f"{self.points}\n" + f"{self.multipoints}\n" + f"{self.lines}\n" + f"{self.polygons}\n" + ) diff --git a/python/cuspatial/cuspatial/io/geopandas_adapter.py b/python/cuspatial/cuspatial/io/geopandas_adapter.py index dc518bf44..6b771ad8d 100644 --- a/python/cuspatial/cuspatial/io/geopandas_adapter.py +++ b/python/cuspatial/cuspatial/io/geopandas_adapter.py @@ -1,18 +1,15 @@ -# Copyright (c) 2020-2021 NVIDIA CORPORATION. +# Copyright (c) 2020-2022 NVIDIA CORPORATION. -import numpy as np +import pyarrow as pa from geopandas import GeoSeries as gpGeoSeries -from shapely.geometry import ( - LineString, - MultiLineString, - MultiPoint, - MultiPolygon, - Point, - Polygon, -) + +from cuspatial.geometry import pygeoarrow class GeoPandasAdapter: + buffers = None + source = None + def __init__(self, geoseries: gpGeoSeries): """ GeoPandasAdapter copies a GeoPandas GeoSeries object iteratively into @@ -22,280 +19,24 @@ def __init__(self, geoseries: gpGeoSeries): ---------- geoseries : A GeoPandas GeoSeries """ - self.offsets = self._load_geometry_offsets(geoseries) - self.buffers = self._read_geometries(geoseries, self.offsets) - - def _load_geometry_offsets(self, geoseries: gpGeoSeries) -> dict: - """ - Computes the offet arrays and buffer sizes that will be required - to store the geometries. - - Parameters - ---------- - geoseries : A GeoPandas GeoSeries - """ - offsets = { - "points": [0], - "multipoints": [0], - "lines": [0], - "mlines": [], - "polygons": {"polygons": [0], "rings": [0], "mpolys": []}, - } - for geometry in geoseries: - if isinstance(geometry, Point): - # a single Point geometry will go into the GpuPoints - # structure. No offsets are required, but an index to the - # position in the GeoSeries is required. - current = offsets["points"][-1] - offsets["points"].append(len(geometry.xy) + current) - elif isinstance(geometry, MultiPoint): - # A MultiPoint geometry also is copied into the GpuPoints - # structure. A MultiPoint object must be created, containing - # the size of the number of points, the position they are - # stored in GpuPoints, and the index of the MultiPoint in the - # GeoSeries. - current = offsets["multipoints"][-1] - offsets["multipoints"].append(len(geometry) * 2 + current) - elif isinstance(geometry, LineString): - # A LineString geometry is stored in the GpuLines structure. - # Every LineString has a size which is stored in the GpuLines - # structure. The index of the LineString back into the - # GeoSeries is also stored. - current = offsets["lines"][-1] - offsets["lines"].append(2 * len(geometry.coords) + current) - elif isinstance(geometry, MultiLineString): - # A MultiLineString geometry is stored identically to - # LineString in the GpuLines structure. The index of the - # GeoSeries object is also stored. - offsets["mlines"].append(len(offsets["lines"]) - 1) - for linestring in geometry: - current = offsets["lines"][-1] - offsets["lines"].append( - 2 * len(linestring.coords) + current - ) - offsets["mlines"].append(len(offsets["lines"]) - 1) - elif isinstance(geometry, Polygon): - # A Polygon geometry is stored like a LineString and also - # contains a buffer of sizes for each inner ring. - num_rings = 1 - rings_current = offsets["polygons"]["rings"][-1] - offsets["polygons"]["rings"].append( - len(geometry.exterior.coords) * 2 + rings_current - ) - for interior in geometry.interiors: - rings_current = offsets["polygons"]["rings"][-1] - offsets["polygons"]["rings"].append( - len(interior.coords) * 2 + rings_current - ) - num_rings = num_rings + 1 - current = offsets["polygons"]["polygons"][-1] - offsets["polygons"]["polygons"].append(num_rings + current) - elif isinstance(geometry, MultiPolygon): - current = offsets["polygons"]["polygons"][-1] - offsets["polygons"]["mpolys"].append( - len(offsets["polygons"]["polygons"]) - 1 - ) - for poly in geometry: - current = offsets["polygons"]["polygons"][-1] - num_rings = 1 - rings_current = offsets["polygons"]["rings"][-1] - offsets["polygons"]["rings"].append( - len(poly.exterior.coords) * 2 + rings_current - ) - for interior in poly.interiors: - rings_current = offsets["polygons"]["rings"][-1] - offsets["polygons"]["rings"].append( - len(interior.coords) * 2 + rings_current - ) - num_rings = num_rings + 1 - offsets["polygons"]["polygons"].append(num_rings + current) - offsets["polygons"]["mpolys"].append( - len(offsets["polygons"]["polygons"]) - 1 - ) - return offsets - - def _read_geometries( - self, - geoseries: gpGeoSeries, - offsets: dict, - ) -> dict: - """ - Creates a set of buffers sized to fit all of the geometries and - iteratively populates them with geometry coordinate values. - - Parameters - ---------- - geoseries : A GeoPandas GeoSeries object. - offsets : The set of offsets that correspond to the geoseries argument. - """ - buffers = { - "points": np.zeros(offsets["points"][-1]), - "multipoints": np.zeros(offsets["multipoints"][-1]), - "lines": np.zeros(offsets["lines"][-1]), - "polygons": { - "polygons": np.zeros(len(offsets["polygons"]["polygons"])), - "rings": np.zeros(len(offsets["polygons"]["rings"])), - "coords": np.zeros(offsets["polygons"]["rings"][-1]), - }, - } - read_count = { - "points": 0, - "multipoints": 0, - "lines": 0, - "polygons": 0, - } - inputs = [] - input_types = [] - input_lengths = [] - for geometry in geoseries: - if isinstance(geometry, Point): - # write a point to the points buffer - # increase read_count of points pass - i = read_count["points"] * 2 - buffers["points"][i] = geometry.x - buffers["points"][i + 1] = geometry.y - read_count["points"] = read_count["points"] + 1 - input_types.append("p") - input_lengths.append(1) - inputs.append({"type": "p", "length": 1}) - elif isinstance(geometry, MultiPoint): - points = np.array(geometry) - size = points.shape[0] * 2 - i = read_count["multipoints"] - buffers["multipoints"][slice(i, i + size, 2)] = points[:, 0] - buffers["multipoints"][slice(i + 1, i + size, 2)] = points[ - :, 1 - ] - read_count["multipoints"] = read_count["multipoints"] + size - input_types.append("mp") - input_lengths.append(len(geometry)) - inputs.append({"type": "mp", "length": len(geometry)}) - elif isinstance(geometry, LineString): - size = len(geometry.xy[0]) * 2 - i = read_count["lines"] - buffers["lines"][slice(i, i + size, 2)] = geometry.xy[0] - buffers["lines"][slice(i + 1, i + size, 2)] = geometry.xy[1] - read_count["lines"] = read_count["lines"] + size - input_types.append("l") - input_lengths.append(1) - inputs.append({"type": "l", "length": 1}) - elif isinstance(geometry, MultiLineString): - substrings = [] - for linestring in geometry: - size = len(linestring.xy[0]) * 2 - i = read_count["lines"] - buffers["lines"][slice(i, i + size, 2)] = linestring.xy[0] - buffers["lines"][ - slice(i + 1, i + size, 2) - ] = linestring.xy[1] - read_count["lines"] = read_count["lines"] + size - substrings.append({"type": "l", "length": size}) - input_types.append("ml") - input_lengths.append(len(geometry)) - inputs.append( - { - "type": "ml", - "length": len(geometry), - "children": substrings, - } - ) - elif isinstance(geometry, Polygon): - # copy exterior - exterior = geometry.exterior.coords.xy - size = len(exterior[0]) * 2 - i = read_count["polygons"] - buffers["polygons"]["coords"][ - slice(i, i + size, 2) - ] = exterior[0] - buffers["polygons"]["coords"][ - slice(i + 1, i + size, 2) - ] = exterior[1] - read_count["polygons"] = read_count["polygons"] + size - interiors = geometry.interiors - for interior in interiors: - interior_coords = interior.coords.xy - size = len(interior_coords[0]) * 2 - i = read_count["polygons"] - buffers["polygons"]["coords"][ - slice(i, i + size, 2) - ] = interior_coords[0] - buffers["polygons"]["coords"][ - slice(i + 1, i + size, 2) - ] = interior_coords[1] - read_count["polygons"] = read_count["polygons"] + size - input_types.append("poly") - input_lengths.append(1) - inputs.append({"type": "poly", "length": 1}) - elif isinstance(geometry, MultiPolygon): - subpolys = [] - for polygon in geometry: - exterior = polygon.exterior.coords.xy - size = len(exterior[0]) * 2 - i = read_count["polygons"] - buffers["polygons"]["coords"][ - slice(i, i + size, 2) - ] = exterior[0] - buffers["polygons"]["coords"][ - slice(i + 1, i + size, 2) - ] = exterior[1] - read_count["polygons"] = read_count["polygons"] + size - interiors = polygon.interiors - for interior in interiors: - interior_coords = interior.coords.xy - size = len(interior_coords[0]) * 2 - i = read_count["polygons"] - buffers["polygons"]["coords"][ - slice(i, i + size, 2) - ] = interior_coords[0] - buffers["polygons"]["coords"][ - slice(i + 1, i + size, 2) - ] = interior_coords[1] - read_count["polygons"] = read_count["polygons"] + size - subpolys.append({"type": "poly", "length": 1}) - input_types.append("mpoly") - input_lengths.append(len(geometry)) - inputs.append( - { - "type": "ml", - "length": len(geometry), - "children": subpolys, - } - ) - else: - raise NotImplementedError - return { - "buffers": buffers, - "input_types": input_types, - "input_lengths": input_lengths, - "inputs": inputs, - } + self.buffers = pygeoarrow.from_geopandas(geoseries) + self.source = geoseries def get_geoarrow_host_buffers(self) -> dict: """ Returns a set of host buffers containing the geopandas object converted to GeoArrow format. """ - points_xy = [] - mpoints_xy = [] - mpoints_offsets = [] - lines_xy = [] - lines_offsets = [] - mlines = [] - polygons_xy = [] - polygons_polygons = [] - polygons_rings = [] - mpolygons = [] - buffers = self.buffers["buffers"] - points_xy = buffers["points"] - mpoints_xy = buffers["multipoints"] - mpoints_offsets = self.offsets["multipoints"] - lines_xy = buffers["lines"] - lines_offsets = self.offsets["lines"] - mlines = self.offsets["mlines"] - polygons_xy = buffers["polygons"]["coords"] - polygons_polygons = self.offsets["polygons"]["polygons"] - polygons_rings = self.offsets["polygons"]["rings"] - mpolygons = self.offsets["polygons"]["mpolys"] + points_xy = self.buffers.field(0).values + mpoints_xy = self.buffers.field(1).values + mpoints_offsets = self.buffers.field(1).offsets + lines_xy = self.buffers.field(2).values.values + lines_offsets = self.buffers.field(2).offsets + mlines = lines_offsets + polygons_xy = self.buffers.field(3).values.values.values + mpolygons = self.buffers.field(3).offsets + polygons_polygons = self.buffers.field(3).values.offsets + polygons_rings = self.buffers.field(3).values.values.offsets return { "points_xy": points_xy, "mpoints_xy": mpoints_xy, @@ -309,6 +50,9 @@ def get_geoarrow_host_buffers(self) -> dict: "mpolygons": mpolygons, } + def get_geoarrow_union(self) -> pa.UnionArray: + return self.buffers + def get_geopandas_meta(self) -> dict: """ Returns the metadata that was created converting the GeoSeries into @@ -319,7 +63,7 @@ def get_geopandas_meta(self) -> dict: """ buffers = self.buffers return { - "input_types": buffers["input_types"], - "input_lengths": buffers["input_lengths"], - "inputs": buffers["inputs"], + "input_types": buffers.type_codes, + "input_lengths": buffers.offsets, + "inputs": self.source, } diff --git a/python/cuspatial/cuspatial/tests/test_from_geopandas.py b/python/cuspatial/cuspatial/tests/test_from_geopandas.py index 352cad9d2..f8abd5f00 100644 --- a/python/cuspatial/cuspatial/tests/test_from_geopandas.py +++ b/python/cuspatial/cuspatial/tests/test_from_geopandas.py @@ -36,38 +36,42 @@ def test_dataframe_column_access(gs): def test_from_geoseries_complex(gs): cugs = cuspatial.from_geopandas(gs) - assert cugs.points.xy.sum() == 18 - assert cugs.lines.xy.sum() == 540 - assert cugs.multipoints.xy.sum() == 36 - assert cugs.polygons.xy.sum() == 7436 - assert cugs.polygons.polys.sum() == 38 - assert cugs.polygons.rings.sum() == 654 + assert cugs.points.xy.sum().as_py() == 18 + assert cugs.lines.xy.sum().as_py() == 540 + assert cugs.multipoints.xy.sum().as_py() == 36 + assert cugs.polygons.xy.sum().as_py() == 7436 + assert cugs.polygons.polys.sum().as_py() == 38 + assert cugs.polygons.rings.sum().as_py() == 327 def test_from_geopandas_point(): gs = gpd.GeoSeries(Point(1.0, 2.0)) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal(cugs.points.xy, cudf.Series([1.0, 2.0])) + cudf.testing.assert_series_equal( + cudf.Series(cugs.points.xy), cudf.Series([1.0, 2.0], dtype="float64") + ) def test_from_geopandas_multipoint(): gs = gpd.GeoSeries(MultiPoint([(1.0, 2.0), (3.0, 4.0)])) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( - cugs.multipoints.xy, cudf.Series([1.0, 2.0, 3.0, 4.0]) + cudf.testing._utils.assert_eq( + cugs.multipoints.xy, cudf.Series([1.0, 2.0, 3.0, 4.0], dtype="float64") ) - cudf.testing.assert_series_equal( - cugs.multipoints.offsets, cudf.Series([0, 4]) + cudf.testing._utils.assert_eq( + cugs.multipoints.offsets, cudf.Series([0, 2], dtype="int32") ) def test_from_geopandas_linestring(): gs = gpd.GeoSeries(LineString(((4.0, 3.0), (2.0, 1.0)))) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( - cugs.lines.xy, cudf.Series([4.0, 3.0, 2.0, 1.0]) + cudf.testing._utils.assert_eq( + cugs.lines.xy, cudf.Series([4.0, 3.0, 2.0, 1.0], dtype="float64") + ) + cudf.testing._utils.assert_eq( + cugs.lines.offsets, cudf.Series([0, 2], dtype="int32") ) - cudf.testing.assert_series_equal(cugs.lines.offsets, cudf.Series([0, 4])) def test_from_geopandas_multilinestring(): @@ -80,12 +84,12 @@ def test_from_geopandas_multilinestring(): ) ) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( + cudf.testing._utils.assert_eq( cugs.lines.xy, - cudf.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]), + cudf.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0], dtype="float64"), ) - cudf.testing.assert_series_equal( - cugs.lines.offsets, cudf.Series([0, 4, 8]) + cudf.testing._utils.assert_eq( + cugs.lines.offsets, cudf.Series([0, 2, 4], dtype="int32") ) @@ -96,12 +100,16 @@ def test_from_geopandas_polygon(): ) ) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( + cudf.testing._utils.assert_eq( cugs.polygons.xy, - cudf.Series([0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]), + cudf.Series([0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0], dtype="float64"), + ) + cudf.testing._utils.assert_eq( + cugs.polygons.polys, cudf.Series([0, 1], dtype="int32").to_arrow() + ) + cudf.testing._utils.assert_eq( + cugs.polygons.rings, cudf.Series([0, 4], dtype="int32").to_arrow() ) - cudf.testing.assert_series_equal(cugs.polygons.polys, cudf.Series([0, 1])) - cudf.testing.assert_series_equal(cugs.polygons.rings, cudf.Series([0, 8])) def test_from_geopandas_polygon_hole(): @@ -112,7 +120,7 @@ def test_from_geopandas_polygon_hole(): ) ) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( + cudf.testing._utils.assert_eq( cugs.polygons.xy, cudf.Series( [ @@ -132,12 +140,15 @@ def test_from_geopandas_polygon_hole(): 0.0, 1.0, 1.0, - ] + ], + dtype="float64", ), ) - cudf.testing.assert_series_equal(cugs.polygons.polys, cudf.Series([0, 2])) - cudf.testing.assert_series_equal( - cugs.polygons.rings, cudf.Series([0, 8, 16]) + cudf.testing._utils.assert_eq( + cugs.polygons.polys, cudf.Series([0, 2], dtype="int32") + ) + cudf.testing._utils.assert_eq( + cugs.polygons.rings, cudf.Series([0, 4, 8], dtype="int32") ) @@ -153,7 +164,7 @@ def test_from_geopandas_multipolygon(): ) ) cugs = cuspatial.from_geopandas(gs) - cudf.testing.assert_series_equal( + cudf.testing._utils.assert_eq( cugs.polygons.xy, cudf.Series( [ @@ -173,10 +184,13 @@ def test_from_geopandas_multipolygon(): 0.0, 1.0, 1.0, - ] + ], + dtype="float64", ), ) - cudf.testing.assert_series_equal(cugs.polygons.polys, cudf.Series([0, 2])) - cudf.testing.assert_series_equal( - cugs.polygons.rings, cudf.Series([0, 8, 16]) + cudf.testing._utils.assert_eq( + cugs.polygons.polys, cudf.Series([0, 2], dtype="int32") + ) + cudf.testing._utils.assert_eq( + cugs.polygons.rings, cudf.Series([0, 4, 8], dtype="int32") ) diff --git a/python/cuspatial/cuspatial/tests/test_geoarrowbuffers.py b/python/cuspatial/cuspatial/tests/test_geoarrowbuffers.py index 339216f69..6c9977c0a 100644 --- a/python/cuspatial/cuspatial/tests/test_geoarrowbuffers.py +++ b/python/cuspatial/cuspatial/tests/test_geoarrowbuffers.py @@ -31,7 +31,7 @@ def test_points(): def test_multipoints(): buffers = GeoArrowBuffers( - {"mpoints_xy": np.arange(0, 16), "mpoints_offsets": [0, 4, 8, 12, 16]} + {"mpoints_xy": np.arange(0, 16), "mpoints_offsets": [0, 2, 4, 6, 8]} ) cudf.testing.assert_series_equal( cudf.Series(np.arange(0, 16)), buffers.multipoints.xy @@ -53,10 +53,10 @@ def test_multipoints(): def test_homogeneous_lines(): buffers = GeoArrowBuffers( - {"lines_xy": range(24), "lines_offsets": np.array(range(5)) * 6} + {"lines_xy": range(24), "lines_offsets": np.array(range(5)) * 3} ) cudf.testing.assert_series_equal(cudf.Series(range(24)), buffers.lines.xy) - assert len(buffers.lines) == 4 + assert len(buffers.lines.offsets) == 5 column = GeoColumn(buffers) pd.testing.assert_series_equal( GeoSeries(column).to_pandas(), @@ -75,12 +75,12 @@ def test_mixed_lines(): buffers = GeoArrowBuffers( { "lines_xy": range(24), - "lines_offsets": np.array(range(5)) * 6, - "mlines": [1, 3], + "lines_offsets": np.array(range(5)) * 3, + "mlines": [0, 1, 3, 4], } ) cudf.testing.assert_series_equal(cudf.Series(range(24)), buffers.lines.xy) - assert len(buffers.lines) == 3 + assert len(buffers.lines.offsets) == 5 column = GeoColumn(buffers) pd.testing.assert_series_equal( GeoSeries(column).to_pandas(), @@ -100,17 +100,13 @@ def test_mixed_lines(): def test_homogeneous_polygons(): - polygons_xy = np.array( - [ - np.concatenate((x[0:6], x[0:2]), axis=None) - for x in np.arange(60).reshape(10, 6) - ] - ) + polygons_xy = np.arange(60) buffers = GeoArrowBuffers( { - "polygons_xy": polygons_xy.flatten(), - "polygons_polygons": np.array([0, 1, 3, 5, 7, 9, 10]), - "polygons_rings": np.arange(11) * 8, + "polygons_xy": polygons_xy, + "polygons_polygons": [0, 1, 3, 5, 7, 9, 10], + "polygons_rings": np.arange(11) * 3, + "mpolygons": [0, 1, 2, 3, 4, 5, 6], } ) cudf.testing.assert_series_equal( @@ -146,18 +142,13 @@ def test_homogeneous_polygons(): def test_polygons(): - polygons_xy = np.array( - [ - np.concatenate((x[0:6], x[0:2]), axis=None) - for x in np.arange(60).reshape(10, 6) - ] - ) + polygons_xy = np.arange(60) buffers = GeoArrowBuffers( { - "polygons_xy": polygons_xy.flatten(), - "polygons_polygons": np.array([0, 1, 3, 5, 7, 9, 10]), - "polygons_rings": np.arange(11) * 8, - "mpolygons": [2, 4], + "polygons_xy": polygons_xy, + "polygons_polygons": [0, 1, 3, 5, 7, 9, 10], + "polygons_rings": np.arange(11) * 3, + "mpolygons": [0, 1, 2, 4, 5, 6], } ) cudf.testing.assert_series_equal( @@ -208,7 +199,7 @@ def test_full(): "polygons_xy": range(12), "polygons_polygons": np.array(range(5)), "polygons_rings": np.array(range(5)) * 3, - "mpolygons": [1, 3], + "mpolygons": [0, 1, 3, 5], } ) - assert len(buffers) == 11 + assert len(buffers) == 9 diff --git a/python/cuspatial/cuspatial/tests/test_geodataframe.py b/python/cuspatial/cuspatial/tests/test_geodataframe.py index c53336a19..f10fac67f 100644 --- a/python/cuspatial/cuspatial/tests/test_geodataframe.py +++ b/python/cuspatial/cuspatial/tests/test_geodataframe.py @@ -137,7 +137,7 @@ def test_interleaved_point(gpdf, polys): gs[gs.type == "Point"].y.reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.multipoints.x, + cudf.Series.from_arrow(cugs.multipoints.x), cudf.Series( np.array( [np.array(p)[:, 0] for p in gs[gs.type == "MultiPoint"]] @@ -145,7 +145,7 @@ def test_interleaved_point(gpdf, polys): ), ) cudf.testing.assert_series_equal( - cugs.multipoints.y, + cudf.Series.from_arrow(cugs.multipoints.y), cudf.Series( np.array( [np.array(p)[:, 1] for p in gs[gs.type == "MultiPoint"]] @@ -153,24 +153,26 @@ def test_interleaved_point(gpdf, polys): ), ) cudf.testing.assert_series_equal( - cugs.lines.x, + cudf.Series.from_arrow(cugs.lines.x), cudf.Series( np.array([range(11, 34, 2)]).flatten(), dtype="float64", ), ) cudf.testing.assert_series_equal( - cugs.lines.y, + cudf.Series.from_arrow(cugs.lines.y), cudf.Series( np.array([range(12, 35, 2)]).flatten(), dtype="float64", ), ) cudf.testing.assert_series_equal( - cugs.polygons.x, cudf.Series(polys[:, 0], dtype="float64") + cudf.Series.from_arrow(cugs.polygons.x), + cudf.Series(polys[:, 0], dtype="float64"), ) cudf.testing.assert_series_equal( - cugs.polygons.y, cudf.Series(polys[:, 1], dtype="float64") + cudf.Series.from_arrow(cugs.polygons.y), + cudf.Series(polys[:, 1], dtype="float64"), ) diff --git a/python/cuspatial/cuspatial/tests/test_geoseries.py b/python/cuspatial/cuspatial/tests/test_geoseries.py index 8b7c5900d..3f59163b5 100644 --- a/python/cuspatial/cuspatial/tests/test_geoseries.py +++ b/python/cuspatial/cuspatial/tests/test_geoseries.py @@ -114,48 +114,52 @@ def assert_eq_geo(geo1, geo2): def test_interleaved_point(gs, polys): cugs = cuspatial.from_geopandas(gs) pd.testing.assert_series_equal( - cugs.points.x.to_pandas(), - gs[gs.type == "Point"].x.reset_index(drop=True), + pd.Series(cugs.points.x, dtype="float64"), + gs[gs.type == "Point"].x, + check_index=False, ) pd.testing.assert_series_equal( cugs.points.y.to_pandas(), - gs[gs.type == "Point"].y.reset_index(drop=True), + gs[gs.type == "Point"].y, + check_index=False, ) cudf.testing.assert_series_equal( - cugs.multipoints.x, + cudf.Series(cugs.multipoints.x).reset_index(drop=True), cudf.Series( np.array( [np.array(p)[:, 0] for p in gs[gs.type == "MultiPoint"]] ).flatten() - ), + ).reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.multipoints.y, + cudf.Series(cugs.multipoints.y).reset_index(drop=True), cudf.Series( np.array( [np.array(p)[:, 1] for p in gs[gs.type == "MultiPoint"]] ).flatten() - ), + ).reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.lines.x, + cudf.Series(cugs.lines.x).reset_index(drop=True), cudf.Series( np.array([range(11, 34, 2)]).flatten(), dtype="float64", - ), + ).reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.lines.y, + cudf.Series(cugs.lines.y).reset_index(drop=True), cudf.Series( np.array([range(12, 35, 2)]).flatten(), dtype="float64", - ), + ).reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.polygons.x, cudf.Series(polys[:, 0], dtype="float64") + cudf.Series(cugs.polygons.x).reset_index(drop=True), + cudf.Series(polys[:, 0], dtype="float64").reset_index(drop=True), ) cudf.testing.assert_series_equal( - cugs.polygons.y, cudf.Series(polys[:, 1], dtype="float64") + cudf.Series(cugs.polygons.y).reset_index(drop=True), + cudf.Series(polys[:, 1], dtype="float64").reset_index(drop=True), ) diff --git a/python/cuspatial/cuspatial/tests/test_points_in_spatial_window.py b/python/cuspatial/cuspatial/tests/test_points_in_spatial_window.py index becbcbd65..677f37eca 100644 --- a/python/cuspatial/cuspatial/tests/test_points_in_spatial_window.py +++ b/python/cuspatial/cuspatial/tests/test_points_in_spatial_window.py @@ -61,7 +61,6 @@ def test_half(): cudf.Series([-1.0, 1.0, 3.0, -3.0]), cudf.Series([1.0, -1.0, 3.0, -3.0]), ) - print(result) cudf.testing.assert_frame_equal( result, cudf.DataFrame({"x": [-1.0, 1.0], "y": [1.0, -1.0]}) )