Support writing dataframes without a geometry column (#267).

Note on pyogrio/raw.py: the "geometry_type must be provided" check is kept,
but it only applies when geometry data is passed, because write() calls
geometry_type.startswith() in that case. The raw.py "index" line below was
not recomputed for this adjustment.

diff --git a/CHANGES.md b/CHANGES.md
index 9ad6b5cf..e614d343 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -4,6 +4,8 @@
 
 ### Improvements
 
+-   Support writing dataframes without geometry column (#267)
+
 -   Calculate feature count by iterating over features if GDAL returns an
     unknown count for a data layer (e.g., OSM driver); this may have signficant
     performance impacts for some data sources that would otherwise return an
diff --git a/pyogrio/_io.pyx b/pyogrio/_io.pyx
index 26582a67..2540961e 100644
--- a/pyogrio/_io.pyx
+++ b/pyogrio/_io.pyx
@@ -1569,16 +1569,34 @@ def ogr_write(
     cdef OGRwkbGeometryType geometry_code
     cdef int err = 0
     cdef int i = 0
-    cdef int num_records = len(geometry)
-    cdef int num_fields = len(field_data) if field_data else 0
-
-    if len(field_data) != len(fields):
-        raise ValueError("field_data and fields must be same length")
-
-    if num_fields:
+    cdef int num_records = -1
+    cdef int num_field_data = len(field_data) if field_data is not None else 0
+    cdef int num_fields = len(fields) if fields is not None else 0
+
+    if num_fields != num_field_data:
+        raise ValueError("field_data array needs to be same length as fields array")
+
+    if num_fields == 0 and geometry is None:
+        raise ValueError("You must provide at least a geometry column or a field")
+
+    if num_fields > 0:
+        num_records = len(field_data[0])
         for i in range(1, len(field_data)):
             if len(field_data[i]) != num_records:
-                raise ValueError("field_data arrays must be same length as geometry array")
+                raise ValueError("field_data arrays must be same length")
+
+    if geometry is None:
+        # If no geometry data, we ignore the geometry_type and don't create a geometry
+        # column
+        geometry_type = None
+    else:
+        if num_fields > 0:
+            if len(geometry) != num_records:
+                raise ValueError(
+                    "field_data arrays must be same length as geometry array"
+                )
+        else:
+            num_records = len(geometry)
 
     if field_mask is not None:
         if len(field_data) != len(field_mask):
@@ -1587,7 +1605,7 @@ def ogr_write(
             if field_mask[i] is not None and len(field_mask[i]) != num_records:
                 raise ValueError("field_mask arrays must be same length as geometry array")
     else:
-        field_mask = [None] * len(field_data)
+        field_mask = [None] * num_fields
 
     path_b = path.encode('UTF-8')
     path_c = path_b
@@ -1676,7 +1694,7 @@ def ogr_write(
     ### Get geometry type
     # TODO: this is brittle for 3D / ZM / M types
     # TODO: fail on M / ZM types
-    geometry_code = get_geometry_type_code(geometry_type or "Unknown")
+    geometry_code = get_geometry_type_code(geometry_type)
 
     try:
         if create_layer:
@@ -1713,7 +1731,9 @@ def ogr_write(
             layer_options = NULL
 
     ### Create the fields
-    field_types = infer_field_types([field.dtype for field in field_data])
+    field_types = None
+    if num_fields > 0:
+        field_types = infer_field_types([field.dtype for field in field_data])
 
     ### Create the fields
     if create_layer:
@@ -1771,7 +1791,7 @@ def ogr_write(
 
             # create the geometry based on specific WKB type (there might be mixed types in geometries)
             # TODO: geometry must not be null or errors
-            wkb = geometry[i]
+            wkb = None if geometry is None else geometry[i]
             if wkb is not None:
                 wkbtype = bytearray(wkb)[1]
                 # may need to consider all 4 bytes: int.from_bytes(wkb[0][1:4], byteorder="little")
diff --git a/pyogrio/_ogr.pyx b/pyogrio/_ogr.pyx
index fd938ef8..e5be3be6 100644
--- a/pyogrio/_ogr.pyx
+++ b/pyogrio/_ogr.pyx
@@ -101,12 +101,6 @@ def get_gdal_config_option(str name):
 
 
 def ogr_driver_supports_write(driver):
-    # exclude drivers known to be unsupported by pyogrio even though they are
-    # supported for write by GDAL
-    if driver in {"XLSX"}:
-        return False
-
-
     # check metadata for driver to see if it supports write
     if _get_driver_metadata_item(driver, "DCAP_CREATE") == 'YES':
         return True
diff --git a/pyogrio/geopandas.py b/pyogrio/geopandas.py
index ef7de377..88c8ea1b 100644
--- a/pyogrio/geopandas.py
+++ b/pyogrio/geopandas.py
@@ -219,7 +219,7 @@ def write_dataframe(
 
     Parameters
     ----------
-    df : GeoDataFrame
+    df : GeoDataFrame or DataFrame
         The data to write. For attribute columns of the "object" dtype,
         all values will be converted to strings to be written to the
         output file, except None and np.nan, which will be set to NULL
@@ -294,7 +294,6 @@ def write_dataframe(
     """
     # TODO: add examples to the docstring (e.g. OGR kwargs)
     try:
-        import geopandas as gp
         from geopandas.array import to_wkb
         import pandas as pd
 
@@ -306,25 +305,27 @@ def write_dataframe(
 
     path = str(path)
 
-    if not isinstance(df, gp.GeoDataFrame):
-        raise ValueError("'df' must be a GeoDataFrame")
+    if not isinstance(df, pd.DataFrame):
+        raise ValueError("'df' must be a DataFrame or GeoDataFrame")
 
     if driver is None:
         driver = detect_driver(path)
 
     geometry_columns = df.columns[df.dtypes == "geometry"]
-    if len(geometry_columns) == 0:
-        raise ValueError("'df' does not have a geometry column")
-
     if len(geometry_columns) > 1:
         raise ValueError(
             "'df' must have only one geometry column. "
             "Multiple geometry columns are not supported for output using OGR."
         )
 
-    geometry_column = geometry_columns[0]
-    geometry = df[geometry_column]
-    fields = [c for c in df.columns if not c == geometry_column]
+    if len(geometry_columns) > 0:
+        geometry_column = geometry_columns[0]
+        geometry = df[geometry_column]
+        fields = [c for c in df.columns if not c == geometry_column]
+    else:
+        geometry_column = None
+        geometry = None
+        fields = list(df.columns)
 
     # TODO: may need to fill in pd.NA, etc
     field_data = []
@@ -345,7 +346,9 @@ def write_dataframe(
             field_mask.append(None)
 
     # Determine geometry_type and/or promote_to_multi
-    if geometry_type is None or promote_to_multi is None:
+    if geometry_column is not None and (
+        geometry_type is None or promote_to_multi is None
+    ):
         tmp_geometry_type = "Unknown"
         has_z = False
 
@@ -402,7 +405,7 @@ def write_dataframe(
                 geometry_type = f"{geometry_type} Z"
 
     crs = None
-    if geometry.crs:
+    if geometry_column is not None and geometry.crs:
         # TODO: this may need to be WKT1, due to issues
         # if possible use EPSG codes instead
         epsg = geometry.crs.to_epsg()
@@ -411,11 +414,15 @@ def write_dataframe(
         else:
             crs = geometry.crs.to_wkt(WktVersion.WKT1_GDAL)
 
+    # If there is geometry data, prepare it to be written
+    if geometry_column is not None:
+        geometry = to_wkb(geometry.values)
+
     write(
         path,
         layer=layer,
         driver=driver,
-        geometry=to_wkb(geometry.values),
+        geometry=geometry,
         field_data=field_data,
         field_mask=field_mask,
         fields=fields,
diff --git a/pyogrio/raw.py b/pyogrio/raw.py
index 27bc5ed4..3a0e8076 100644
--- a/pyogrio/raw.py
+++ b/pyogrio/raw.py
@@ -387,9 +387,10 @@ def write(
     layer_options=None,
     **kwargs,
 ):
-    if geometry_type is None:
+    # geometry_type is only required when there is geometry data to write
+    if geometry is not None and geometry_type is None:
         raise ValueError("geometry_type must be provided")
 
     if driver is None:
         driver = detect_driver(path)
 
@@ -421,13 +422,13 @@ def write(
                 if not isinstance(v, str):
                     raise ValueError(f"metadata value {v} must be a string")
 
-    if promote_to_multi is None:
+    if geometry is not None and promote_to_multi is None:
         promote_to_multi = (
             geometry_type.startswith("Multi")
             and driver in DRIVERS_NO_MIXED_SINGLE_MULTI
         )
 
-    if crs is None:
+    if geometry is not None and crs is None:
         warnings.warn(
             "'crs' was not provided. The output dataset will not have "
             "projection information defined and may not be usable in other "
diff --git a/pyogrio/tests/test_core.py b/pyogrio/tests/test_core.py
index 6db0eff7..538d6512 100644
--- a/pyogrio/tests/test_core.py
+++ b/pyogrio/tests/test_core.py
@@ -55,8 +55,6 @@ def test_gdal_geos_version():
         # drivers not supported for write by GDAL
         ("HTTP", False),
         ("OAPIF", False),
-        # drivers currently unsupported for write even though GDAL can write them
-        ("XLSX", False),
     ],
 )
 def test_ogr_driver_supports_write(driver, expected):
diff --git a/pyogrio/tests/test_geopandas_io.py b/pyogrio/tests/test_geopandas_io.py
index 8e9fd23a..5f261d1a 100644
--- a/pyogrio/tests/test_geopandas_io.py
+++ b/pyogrio/tests/test_geopandas_io.py
@@ -460,6 +460,53 @@ def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
     )
 
 
+@pytest.mark.filterwarnings("ignore:.*No SRS set on layer.*")
+@pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS + [".xlsx"] if ext != ".fgb"])
+def test_write_dataframe_no_geom(tmp_path, naturalearth_lowres, ext):
+    """Test writing a dataframe without a geometry column.
+
+    FlatGeobuf (.fgb) doesn't seem to support this, and just writes an empty file.
+    """
+    # Prepare test data
+    input_df = read_dataframe(naturalearth_lowres, read_geometry=False)
+    output_path = tmp_path / f"test{ext}"
+
+    # A shapefile without geometry column results in only a .dbf file.
+    if ext == ".shp":
+        output_path = output_path.with_suffix(".dbf")
+
+    # Determine driver
+    driver = DRIVERS[ext] if ext != ".xlsx" else "XLSX"
+
+    write_dataframe(input_df, output_path, driver=driver)
+
+    assert output_path.exists()
+    result_df = read_dataframe(output_path)
+
+    assert isinstance(result_df, pd.DataFrame)
+
+    # some dtypes do not round-trip precisely through these file types
+    check_dtype = ext not in [".json", ".geojson", ".geojsonl", ".xlsx"]
+
+    if ext in [".gpkg", ".shp", ".xlsx"]:
+        # These file types return a DataFrame when read.
+        assert not isinstance(result_df, gp.GeoDataFrame)
+        pd.testing.assert_frame_equal(
+            result_df, input_df, check_index_type=False, check_dtype=check_dtype
+        )
+    else:
+        # These file types return a GeoDataFrame with None Geometries when read.
+        input_none_geom_gdf = gp.GeoDataFrame(
+            input_df, geometry=np.repeat(None, len(input_df)), crs=4326
+        )
+        assert_geodataframe_equal(
+            result_df,
+            input_none_geom_gdf,
+            check_index_type=False,
+            check_dtype=check_dtype,
+        )
+
+
 @pytest.mark.filterwarnings("ignore:.*Layer .* does not have any features to read")
 @pytest.mark.parametrize("ext", [ext for ext in ALL_EXTS if ext not in ".geojsonl"])
 def test_write_empty_dataframe(tmp_path, ext):
diff --git a/pyogrio/tests/test_raw_io.py b/pyogrio/tests/test_raw_io.py
index e5915a91..e07384e3 100644
--- a/pyogrio/tests/test_raw_io.py
+++ b/pyogrio/tests/test_raw_io.py
@@ -213,7 +213,6 @@ def test_read_fids(naturalearth_lowres):
 
 
 def test_return_fids(naturalearth_lowres):
-
     # default is to not return fids
     fids = read(naturalearth_lowres)[1]
     assert fids is None
@@ -316,6 +315,111 @@ def test_write_geojson(tmpdir, naturalearth_lowres):
     )
 
 
+def test_write_no_fields(tmp_path, naturalearth_lowres):
+    """Test writing file with no fields/attribute columns."""
+    # Prepare test data
+    meta, _, geometry, field_data = read(naturalearth_lowres)
+    field_data = None
+    meta["fields"] = None
+
+    # Test
+    filename = tmp_path / "test.gpkg"
+    write(filename, geometry, field_data, driver="GPKG", **meta)
+
+    # Check result
+    assert os.path.exists(filename)
+    meta, _, geometry, fields = read(filename)
+
+    assert meta["crs"] == "EPSG:4326"
+    assert meta["geometry_type"] == "Polygon"
+    assert meta["encoding"] == "UTF-8"
+    assert meta["fields"].shape == (0,)
+    assert len(fields) == 0
+    assert len(geometry) == 177
+
+    # quick test that WKB is a Polygon type
+    assert geometry[0][:6] == b"\x01\x06\x00\x00\x00\x03"
+
+
+def test_write_no_geom(tmp_path, naturalearth_lowres):
+    """Test writing file with no geometry column."""
+    # Prepare test data
+    meta, _, geometry, field_data = read(naturalearth_lowres)
+    geometry = None
+    meta["geometry_type"] = None
+
+    # Test
+    filename = tmp_path / "test.gpkg"
+    write(filename, geometry, field_data, driver="GPKG", **meta)
+
+    # Check result
+    assert os.path.exists(filename)
+    meta, _, geometry, fields = read(filename)
+
+    assert meta["crs"] is None
+    assert meta["geometry_type"] is None
+    assert meta["encoding"] == "UTF-8"
+    assert meta["fields"].shape == (5,)
+
+    assert meta["fields"].tolist() == [
+        "pop_est",
+        "continent",
+        "name",
+        "iso_a3",
+        "gdp_md_est",
+    ]
+
+    assert len(fields) == 5
+    assert len(fields[0]) == 177
+
+
+def test_write_no_geom_data(tmp_path, naturalearth_lowres):
+    """Test writing file with no geometry data passed but a geometry_type specified.
+
+    In this case the geometry_type is ignored, so a file without geometry column is
+    written.
+    """
+    # Prepare test data
+    meta, _, geometry, field_data = read(naturalearth_lowres)
+    # If geometry data is set to None, meta["geometry_type"] is ignored and so no
+    # geometry column will be created.
+    geometry = None
+
+    # Test
+    filename = tmp_path / "test.gpkg"
+    write(filename, geometry, field_data, driver="GPKG", **meta)
+
+    # Check result
+    assert os.path.exists(filename)
+    result_meta, _, result_geometry, result_field_data = read(filename)
+
+    assert result_meta["crs"] is None
+    assert result_meta["geometry_type"] is None
+    assert result_meta["encoding"] == "UTF-8"
+    assert result_meta["fields"].shape == (5,)
+
+    assert result_meta["fields"].tolist() == [
+        "pop_est",
+        "continent",
+        "name",
+        "iso_a3",
+        "gdp_md_est",
+    ]
+
+    assert len(result_field_data) == 5
+    assert len(result_field_data[0]) == 177
+    assert result_geometry is None
+
+
+def test_write_no_geom_no_fields():
+    """Test writing file with no geometry column nor fields -> error."""
+    with pytest.raises(
+        ValueError,
+        match="You must provide at least a geometry column or a field",
+    ):
+        write("test.gpkg", geometry=None, field_data=None, fields=None)
+
+
 @pytest.mark.skipif(
     __gdal_version__ < (3, 6, 0),
     reason="OpenFileGDB write support only available for GDAL >= 3.6.0",