Skip to content

Commit

Permalink
Merge branch 'main' into force_feature_count
Browse files Browse the repository at this point in the history
  • Loading branch information
brendan-ward committed Aug 22, 2023
2 parents d4630f3 + a847c27 commit 4701cc2
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 39 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

### Improvements

- Support writing dataframes without geometry column (#267)

- Calculate feature count by iterating over features if GDAL returns an
unknown count for a data layer (e.g., OSM driver); this may have significant
performance impacts for some data sources that would otherwise return an
Expand Down
44 changes: 32 additions & 12 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1569,16 +1569,34 @@ def ogr_write(
cdef OGRwkbGeometryType geometry_code
cdef int err = 0
cdef int i = 0
cdef int num_records = len(geometry)
cdef int num_fields = len(field_data) if field_data else 0

if len(field_data) != len(fields):
raise ValueError("field_data and fields must be same length")

if num_fields:
cdef int num_records = -1
cdef int num_field_data = len(field_data) if field_data is not None else 0
cdef int num_fields = len(fields) if fields is not None else 0

if num_fields != num_field_data:
raise ValueError("field_data array needs to be same length as fields array")

if num_fields == 0 and geometry is None:
raise ValueError("You must provide at least a geometry column or a field")

if num_fields > 0:
num_records = len(field_data[0])
for i in range(1, len(field_data)):
if len(field_data[i]) != num_records:
raise ValueError("field_data arrays must be same length as geometry array")
raise ValueError("field_data arrays must be same length")

if geometry is None:
# If no geometry data, we ignore the geometry_type and don't create a geometry
# column
geometry_type = None
else:
if num_fields > 0:
if len(geometry) != num_records:
raise ValueError(
"field_data arrays must be same length as geometry array"
)
else:
num_records = len(geometry)

if field_mask is not None:
if len(field_data) != len(field_mask):
Expand All @@ -1587,7 +1605,7 @@ def ogr_write(
if field_mask[i] is not None and len(field_mask[i]) != num_records:
raise ValueError("field_mask arrays must be same length as geometry array")
else:
field_mask = [None] * len(field_data)
field_mask = [None] * num_fields

path_b = path.encode('UTF-8')
path_c = path_b
Expand Down Expand Up @@ -1676,7 +1694,7 @@ def ogr_write(
### Get geometry type
# TODO: this is brittle for 3D / ZM / M types
# TODO: fail on M / ZM types
geometry_code = get_geometry_type_code(geometry_type or "Unknown")
geometry_code = get_geometry_type_code(geometry_type)

try:
if create_layer:
Expand Down Expand Up @@ -1713,7 +1731,9 @@ def ogr_write(
layer_options = NULL

### Create the fields
field_types = infer_field_types([field.dtype for field in field_data])
field_types = None
if num_fields > 0:
field_types = infer_field_types([field.dtype for field in field_data])

### Create the fields
if create_layer:
Expand Down Expand Up @@ -1771,7 +1791,7 @@ def ogr_write(

# create the geometry based on specific WKB type (there might be mixed types in geometries)
# TODO: geometry must not be null or errors
wkb = geometry[i]
wkb = None if geometry is None else geometry[i]
if wkb is not None:
wkbtype = <int>bytearray(wkb)[1]
# may need to consider all 4 bytes: int.from_bytes(wkb[0][1:4], byteorder="little")
Expand Down
6 changes: 0 additions & 6 deletions pyogrio/_ogr.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,6 @@ def get_gdal_config_option(str name):


def ogr_driver_supports_write(driver):
# exclude drivers known to be unsupported by pyogrio even though they are
# supported for write by GDAL
if driver in {"XLSX"}:
return False


# check metadata for driver to see if it supports write
if _get_driver_metadata_item(driver, "DCAP_CREATE") == 'YES':
return True
Expand Down
33 changes: 20 additions & 13 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def write_dataframe(
Parameters
----------
df : GeoDataFrame
df : GeoDataFrame or DataFrame
The data to write. For attribute columns of the "object" dtype,
all values will be converted to strings to be written to the
output file, except None and np.nan, which will be set to NULL
Expand Down Expand Up @@ -294,7 +294,6 @@ def write_dataframe(
"""
# TODO: add examples to the docstring (e.g. OGR kwargs)
try:
import geopandas as gp
from geopandas.array import to_wkb
import pandas as pd

Expand All @@ -306,25 +305,27 @@ def write_dataframe(

path = str(path)

if not isinstance(df, gp.GeoDataFrame):
raise ValueError("'df' must be a GeoDataFrame")
if not isinstance(df, pd.DataFrame):
raise ValueError("'df' must be a DataFrame or GeoDataFrame")

if driver is None:
driver = detect_driver(path)

geometry_columns = df.columns[df.dtypes == "geometry"]
if len(geometry_columns) == 0:
raise ValueError("'df' does not have a geometry column")

if len(geometry_columns) > 1:
raise ValueError(
"'df' must have only one geometry column. "
"Multiple geometry columns are not supported for output using OGR."
)

geometry_column = geometry_columns[0]
geometry = df[geometry_column]
fields = [c for c in df.columns if not c == geometry_column]
if len(geometry_columns) > 0:
geometry_column = geometry_columns[0]
geometry = df[geometry_column]
fields = [c for c in df.columns if not c == geometry_column]
else:
geometry_column = None
geometry = None
fields = list(df.columns)

# TODO: may need to fill in pd.NA, etc
field_data = []
Expand All @@ -345,7 +346,9 @@ def write_dataframe(
field_mask.append(None)

# Determine geometry_type and/or promote_to_multi
if geometry_type is None or promote_to_multi is None:
if geometry_column is not None and (
geometry_type is None or promote_to_multi is None
):
tmp_geometry_type = "Unknown"
has_z = False

Expand Down Expand Up @@ -402,7 +405,7 @@ def write_dataframe(
geometry_type = f"{geometry_type} Z"

crs = None
if geometry.crs:
if geometry_column is not None and geometry.crs:
# TODO: this may need to be WKT1, due to issues
# if possible use EPSG codes instead
epsg = geometry.crs.to_epsg()
Expand All @@ -411,11 +414,15 @@ def write_dataframe(
else:
crs = geometry.crs.to_wkt(WktVersion.WKT1_GDAL)

# If there is geometry data, prepare it to be written
if geometry_column is not None:
geometry = to_wkb(geometry.values)

write(
path,
layer=layer,
driver=driver,
geometry=to_wkb(geometry.values),
geometry=geometry,
field_data=field_data,
field_mask=field_mask,
fields=fields,
Expand Down
7 changes: 2 additions & 5 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,6 @@ def write(
layer_options=None,
**kwargs,
):
if geometry_type is None:
raise ValueError("geometry_type must be provided")

if driver is None:
driver = detect_driver(path)

Expand Down Expand Up @@ -421,13 +418,13 @@ def write(
if not isinstance(v, str):
raise ValueError(f"metadata value {v} must be a string")

if promote_to_multi is None:
if geometry is not None and promote_to_multi is None:
promote_to_multi = (
geometry_type.startswith("Multi")
and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
)

if crs is None:
if geometry is not None and crs is None:
warnings.warn(
"'crs' was not provided. The output dataset will not have "
"projection information defined and may not be usable in other "
Expand Down
2 changes: 0 additions & 2 deletions pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,6 @@ def test_gdal_geos_version():
# drivers not supported for write by GDAL
("HTTP", False),
("OAPIF", False),
# drivers currently unsupported for write even though GDAL can write them
("XLSX", False),
],
)
def test_ogr_driver_supports_write(driver, expected):
Expand Down
47 changes: 47 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,53 @@ def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
)


@pytest.mark.filterwarnings("ignore:.*No SRS set on layer.*")
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS + [".xlsx"] if ext != ".fgb"])
def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
    """Round-trip an attribute-only dataframe through each parametrized format.

    FlatGeobuf (.fgb) is excluded from the parametrization: it doesn't seem to
    support writing without geometry, and just writes an empty file.
    """
    # Attribute-only input: read the fixture without its geometry.
    input_df = read_dataframe(naturalearth_lowres, read_geometry=False)

    target = tmp_path / f"test{ext}"
    if ext == ".shp":
        # A shapefile without a geometry column results in only a .dbf file.
        target = target.with_suffix(".dbf")

    # The XLSX driver is named explicitly rather than looked up in DRIVERS.
    driver = "XLSX" if ext == ".xlsx" else DRIVERS[ext]

    write_dataframe(input_df, target, driver=driver)
    assert target.exists()

    result_df = read_dataframe(target)
    assert isinstance(result_df, pd.DataFrame)

    # Some dtypes do not round-trip precisely through these file types.
    check_dtype = ext not in [".json", ".geojson", ".geojsonl", ".xlsx"]

    if ext not in [".gpkg", ".shp", ".xlsx"]:
        # These file types return a GeoDataFrame with None geometries when
        # read, so compare against the input with an all-None geometry column.
        expected_gdf = gp.GeoDataFrame(
            input_df, geometry=np.repeat(None, len(input_df)), crs=4326
        )
        assert_geodataframe_equal(
            result_df,
            expected_gdf,
            check_index_type=False,
            check_dtype=check_dtype,
        )
        return

    # These file types return a plain DataFrame when read.
    assert not isinstance(result_df, gp.GeoDataFrame)
    pd.testing.assert_frame_equal(
        result_df, input_df, check_index_type=False, check_dtype=check_dtype
    )


@pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read")
@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
def test_write_empty_dataframe(tmp_path, ext):
Expand Down
Loading

0 comments on commit 4701cc2

Please sign in to comment.