Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add append support to write functions #197

Merged
merged 5 commits into from
Jan 13, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 89 additions & 73 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ cdef void* ogr_open(const char* path_c, int mode, options) except NULL:
else:
flags |= GDAL_OF_READONLY

# TODO: other open opts from fiona
open_opts = CSLAddNameValue(open_opts, "VALIDATE_OPEN_OPTIONS", "NO")

try:
Expand Down Expand Up @@ -1062,7 +1061,7 @@ def ogr_read_arrow(

IF CTE_GDAL_VERSION < (3, 6, 0):
raise RuntimeError("Need GDAL>=3.6 for Arrow support")

if not OGR_L_GetArrowStream(ogr_layer, &stream, options):
raise RuntimeError("Failed to open ArrowArrayStream from Layer")

Expand Down Expand Up @@ -1316,10 +1315,10 @@ cdef infer_field_types(list dtypes):
return field_types


# TODO: handle updateable data sources, like GPKG
# TODO: set geometry and field data as memory views?
def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
str crs, str geometry_type, str encoding, bint promote_to_multi=False, **kwargs):
str crs, str geometry_type, str encoding, bint promote_to_multi=False,
bint append=False, **kwargs):

cdef const char *path_c = NULL
cdef const char *layer_c = NULL
Expand Down Expand Up @@ -1361,42 +1360,50 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
if not layer:
layer = os.path.splitext(os.path.split(path)[1])[0]

layer_b = layer.encode('UTF-8')
layer_c = layer_b

# if shapefile, GeoJSON, or FlatGeobuf, always delete first
# for other types, check if we can create layers
# GPKG might be the only multi-layer writeable type. TODO: check this
if driver in ('ESRI Shapefile', 'GeoJSON', 'GeoJSONSeq', 'FlatGeobuf') and os.path.exists(path):
os.unlink(path)
if not append:
os.unlink(path)

# TODO: invert this: if exists then try to update it, if that doesn't work then always create
layer_exists = False
if os.path.exists(path):
try:
ogr_dataset = ogr_open(path_c, 1, None)

# If layer exists, delete it.
for i in range(GDALDatasetGetLayerCount(ogr_dataset)):
name = OGR_L_GetName(GDALDatasetGetLayer(ogr_dataset, i))
if layer == name.decode('UTF-8'):
layer_idx = i
break

if layer_idx >= 0:
GDALDatasetDeleteLayer(ogr_dataset, layer_idx)
layer_exists = True

if not append:
GDALDatasetDeleteLayer(ogr_dataset, layer_idx)

except DataSourceError as e:
# open failed
if append:
raise e

except DataSourceError:
# open failed, so create from scratch
# force delete it first
# otherwise create from scratch
os.unlink(path)
ogr_dataset = NULL

# either it didn't exist or could not open it in write mode
if ogr_dataset == NULL:
ogr_dataset = ogr_create(path_c, driver_c)

# if we are not appending to an existing layer, we need to create
# the layer and all associated properties (CRS, field defs, etc)
create_layer = not (append and layer_exists)

### Create the CRS
if crs is not None:
if create_layer and crs is not None:
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
try:
ogr_crs = create_crs(crs)

Expand All @@ -1405,42 +1412,48 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
raise exc


### Create options
if not encoding:
encoding = locale.getpreferredencoding()

if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
options = CSLSetNameValue(options, "ENCODING", encoding_c)

# Setup other layer creation options
for k, v in kwargs.items():
if v is None:
continue

k = k.upper().encode('UTF-8')

if isinstance(v, bool):
v = ('ON' if v else 'OFF').encode('utf-8')
else:
v = str(v).encode('utf-8')
### Create the layer
if create_layer:
# Setup layer creation options
if not encoding:
encoding = locale.getpreferredencoding()

if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
options = CSLSetNameValue(options, "ENCODING", encoding_c)

for k, v in kwargs.items():
if v is None:
continue

k = k.upper().encode('UTF-8')

if isinstance(v, bool):
v = ('ON' if v else 'OFF').encode('utf-8')
else:
v = str(v).encode('utf-8')

options = CSLAddNameValue(options, <const char *>k, <const char *>v)
options = CSLAddNameValue(options, <const char *>k, <const char *>v)

layer_b = layer.encode('UTF-8')
layer_c = layer_b

### Get geometry type
# TODO: this is brittle for 3D / ZM / M types
# TODO: fail on M / ZM types
geometry_code = get_geometry_type_code(geometry_type or "Unknown")
### Get geometry type
# TODO: this is brittle for 3D / ZM / M types
# TODO: fail on M / ZM types
geometry_code = get_geometry_type_code(geometry_type or "Unknown")

### Create the layer
try:
ogr_layer = exc_wrap_pointer(
GDALDatasetCreateLayer(ogr_dataset, layer_c, ogr_crs,
<OGRwkbGeometryType>geometry_code, options))
if create_layer:
ogr_layer = exc_wrap_pointer(
GDALDatasetCreateLayer(ogr_dataset, layer_c, ogr_crs,
geometry_code, options))

else:
ogr_layer = exc_wrap_pointer(get_ogr_layer(ogr_dataset, layer))

except Exception as exc:
OGRReleaseDataSource(ogr_dataset)
Expand All @@ -1456,44 +1469,47 @@ def ogr_write(str path, str layer, str driver, geometry, field_data, fields,
CSLDestroy(<char**>options)
options = NULL

### Create the fields

field_types = infer_field_types([field.dtype for field in field_data])
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]

name_b = fields[i].encode(encoding)
try:
ogr_fielddef = exc_wrap_pointer(OGR_Fld_Create(name_b, field_type))
### Create the fields
if create_layer:
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]

# subtypes, see: https://gdal.org/development/rfc/rfc50_ogr_field_subtype.html
if field_subtype != OFSTNone:
OGR_Fld_SetSubType(ogr_fielddef, field_subtype)
name_b = fields[i].encode(encoding)
try:
ogr_fielddef = exc_wrap_pointer(OGR_Fld_Create(name_b, field_type))

if width:
OGR_Fld_SetWidth(ogr_fielddef, width)
# subtypes, see: https://gdal.org/development/rfc/rfc50_ogr_field_subtype.html
if field_subtype != OFSTNone:
OGR_Fld_SetSubType(ogr_fielddef, field_subtype)

# TODO: set precision
if width:
OGR_Fld_SetWidth(ogr_fielddef, width)

except:
if ogr_fielddef != NULL:
OGR_Fld_Destroy(ogr_fielddef)
ogr_fielddef = NULL
# TODO: set precision

OGRReleaseDataSource(ogr_dataset)
ogr_dataset = NULL
raise FieldError(f"Error creating field '{fields[i]}' from field_data") from None
except:
if ogr_fielddef != NULL:
OGR_Fld_Destroy(ogr_fielddef)
ogr_fielddef = NULL

try:
exc_wrap_int(OGR_L_CreateField(ogr_layer, ogr_fielddef, 1))
OGRReleaseDataSource(ogr_dataset)
ogr_dataset = NULL
raise FieldError(f"Error creating field '{fields[i]}' from field_data") from None

except:
OGRReleaseDataSource(ogr_dataset)
ogr_dataset = NULL
raise FieldError(f"Error adding field '{fields[i]}' to layer") from None
try:
exc_wrap_int(OGR_L_CreateField(ogr_layer, ogr_fielddef, 1))

finally:
if ogr_fielddef != NULL:
OGR_Fld_Destroy(ogr_fielddef)
except:
OGRReleaseDataSource(ogr_dataset)
ogr_dataset = NULL
raise FieldError(f"Error adding field '{fields[i]}' to layer") from None

finally:
if ogr_fielddef != NULL:
OGR_Fld_Destroy(ogr_fielddef)


### Create the features
Expand Down
6 changes: 6 additions & 0 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def write_dataframe(
encoding=None,
geometry_type=None,
promote_to_multi=None,
append=False,
**kwargs,
):
"""
Expand Down Expand Up @@ -232,6 +233,10 @@ def write_dataframe(
types will not be promoted, which may result in errors or invalid files when
attempting to write mixed singular and multi geometry types to drivers that do
not support such combinations.
append : bool, optional (default: False)
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something else: I like append=True/False, but note that geopandas / fiona uses mode="w"/"a" (but fiona has this because it mimics the open(..) API, and in geopandas we can easily map the keyword for compat; we currently even explicitly raise an error for mode="a" on the geopandas side in case of pyogrio engine)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that pandas supports the same modes as the open() API for things like to_csv(). So we may want to be consistent but only support a subset: write_dataframe would never support mode="r", for instance.

I'm not sure what the meaning of mode="r+" would be here, since we don't expose a stateful API, and we don't (yet) let you update existing records an existing data layer.

In which case, read_dataframe would only ever use mode="r" (so parameter is not needed), and write_dataframe could only ever use mode="w" or mode="a" - which ends up being an awful lot like our boolean append.

Would be nice not to change this API later, so I'm certainly open to using mode in order to make this more compatible with GeoPandas if that is what you'd like here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, for a direct user of pyogrio, I find append=True the nicer and easier API.

If True, the data source specified by path already exists, and the
driver supports appending to an existing data source, the data will be
appended to the existing records in the data source.
**kwargs
The kwargs passed to OGR.
"""
Expand Down Expand Up @@ -335,5 +340,6 @@ def write_dataframe(
geometry_type=geometry_type,
encoding=encoding,
promote_to_multi=promote_to_multi,
append=append,
**kwargs,
)
2 changes: 2 additions & 0 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ def write(
crs=None,
encoding=None,
promote_to_multi=None,
append=False,
**kwargs,
):
if geometry_type is None:
Expand Down Expand Up @@ -215,5 +216,6 @@ def write(
crs=crs,
encoding=encoding,
promote_to_multi=promote_to_multi,
append=append,
**kwargs,
)
30 changes: 30 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,36 @@ def test_write_read_empty_dataframe_unsupported(tmp_path, ext):
_ = read_dataframe(filename)


def test_write_dataframe_gpkg_multiple_layers(tmp_path, naturalearth_lowres):
    """Writing a second layer to an existing GPKG keeps the first layer intact."""
    source = read_dataframe(naturalearth_lowres)
    target = tmp_path / "test.gpkg"

    write_dataframe(source, target, layer="first", promote_to_multi=True)

    assert os.path.exists(target)
    assert np.array_equal(list_layers(target), [["first", "MultiPolygon"]])

    # a second write with a new layer name should add, not replace
    write_dataframe(source, target, layer="second", promote_to_multi=True)
    expected = [["first", "MultiPolygon"], ["second", "MultiPolygon"]]
    assert np.array_equal(list_layers(target), expected)


@pytest.mark.parametrize("ext", ALL_EXTS)
def test_write_dataframe_append(tmp_path, naturalearth_lowres, ext):
    """Appending the same frame doubles the feature count for every extension."""
    gdf = read_dataframe(naturalearth_lowres)
    out = tmp_path / f"test{ext}"

    write_dataframe(gdf, out)
    assert os.path.exists(out)

    # initial write contains the full naturalearth_lowres record set
    assert len(read_dataframe(out)) == 177

    write_dataframe(gdf, out, append=True)

    # appending the identical records should exactly double the count
    assert len(read_dataframe(out)) == 354


def test_write_dataframe_gdalparams(tmp_path, naturalearth_lowres):
original_df = read_dataframe(naturalearth_lowres)

Expand Down
56 changes: 56 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,24 @@ def test_write_gpkg(tmpdir, naturalearth_lowres):
assert os.path.exists(filename)


def test_write_gpkg_multiple_layers(tmpdir, naturalearth_lowres):
    """Raw write() supports adding multiple layers to one GeoPackage."""
    meta, _, geometry, field_data = read(naturalearth_lowres)
    # coerce to MultiPolygon so mixed Polygon/MultiPolygon input is accepted
    meta["geometry_type"] = "MultiPolygon"

    filename = os.path.join(str(tmpdir), "test.gpkg")

    write(filename, geometry, field_data, driver="GPKG", layer="first", **meta)
    assert os.path.exists(filename)
    assert np.array_equal(list_layers(filename), [["first", "MultiPolygon"]])

    # a second layer must be added alongside the first, not overwrite it
    write(filename, geometry, field_data, driver="GPKG", layer="second", **meta)
    expected = [["first", "MultiPolygon"], ["second", "MultiPolygon"]]
    assert np.array_equal(list_layers(filename), expected)


def test_write_geojson(tmpdir, naturalearth_lowres):
meta, _, geometry, field_data = read(naturalearth_lowres)

Expand All @@ -293,6 +311,44 @@ def test_write_geojson(tmpdir, naturalearth_lowres):
)


@pytest.mark.parametrize("ext", DRIVERS)
def test_write_append(tmpdir, naturalearth_lowres, ext):
    """append=True on raw write() adds records to an existing data source."""
    meta, _, geometry, field_data = read(naturalearth_lowres)

    # coerce output layer to MultiPolygon to avoid mixed type errors
    meta["geometry_type"] = "MultiPolygon"

    filename = os.path.join(str(tmpdir), f"test{ext}")

    write(filename, geometry, field_data, **meta)
    assert os.path.exists(filename)
    assert read_info(filename)["features"] == 177

    # writing the same records again in append mode should double the count
    write(filename, geometry, field_data, append=True, **meta)
    assert read_info(filename)["features"] == 354


def test_write_append_unsupported(tmpdir, naturalearth_lowres):
    """Appending to a driver without append support (GML) raises DataSourceError."""
    meta, _, geometry, field_data = read(naturalearth_lowres)

    # GML does not support append functionality
    filename = os.path.join(str(tmpdir), "test.gml")

    write(filename, geometry, field_data, driver="GML", **meta)
    assert os.path.exists(filename)
    assert read_info(filename)["features"] == 177

    # NOTE: the raw error from GDAL is confusing:
    # ("not recognized as a supported file format")
    # TODO: we should probably check for append support ourselves
    with pytest.raises(DataSourceError):
        write(filename, geometry, field_data, driver="GML", append=True, **meta)


@pytest.mark.parametrize(
"driver",
[
Expand Down