ENH: support for reading and writing datetimes with timezones (#253)
m-richards authored Oct 20, 2023
1 parent 4fe4abe commit 2f6a679
Showing 12 changed files with 253 additions and 56 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests-conda.yml
@@ -66,4 +66,4 @@ jobs:

- name: Test
run: |
pytest -v -r s pyogrio/tests
pytest -v --color=yes -r s pyogrio/tests
1 change: 1 addition & 0 deletions CHANGES.md
@@ -4,6 +4,7 @@

### Improvements

- Support reading and writing datetimes with timezones (#253).
- Support writing dataframes without geometry column (#267).
- Calculate feature count by iterating over features if GDAL returns an
unknown count for a data layer (e.g., OSM driver); this may have significant
21 changes: 21 additions & 0 deletions docs/source/introduction.md
@@ -464,6 +464,27 @@ You can also read from a URL with this syntax:
>>> read_dataframe("zip+https://s3.amazonaws.com/bucket/shapefile.zip")
```

## Reading and writing DateTimes

GDAL only supports datetimes at millisecond resolution, so reading data yields
at most millisecond precision (`datetime64[ms]` data type). With pandas 2.0 or
later, `pyogrio.read_dataframe()` accordingly returns datetime data as
`datetime64[ms]`. With earlier versions of pandas, `datetime64[ns]` is used
instead, as millisecond resolution was not yet supported. When writing, only
precision up to milliseconds is retained.
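
For example, reading a hypothetical file `example.gpkg` with a datetime column
`ts` (both names are illustrative):

```python
from pyogrio import read_dataframe

df = read_dataframe("example.gpkg")
print(df["ts"].dtype)
# datetime64[ms] with pandas >= 2.0, datetime64[ns] with older pandas
```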

Not all file formats have dedicated support for storing datetime data; ESRI
Shapefile is a notable example. For such formats, or if you need more than
millisecond precision, a workaround is to convert the datetimes to strings.
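
A minimal sketch of that workaround (the column name and output path are
illustrative):

```python
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

from pyogrio import write_dataframe

df = gpd.GeoDataFrame(
    {"ts": pd.to_datetime(["2023-10-20 14:30:00.123456"])},
    geometry=[Point(0, 0)],
    crs="EPSG:4326",
)

# Shapefile has no datetime field type, so write the values as
# ISO-formatted strings; this also preserves sub-millisecond precision.
df["ts"] = df["ts"].astype(str)
write_dataframe(df, "example.shp")
```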

Timezone information is preserved where possible. However, GDAL only represents
time zones as UTC offsets, whereas pandas uses IANA time zones (via `pytz` or
`zoneinfo`). This means that a column containing multiple offsets (e.g., a time
zone that switches from standard time to summer time) is written correctly, but
reading it back via `pyogrio.read_dataframe()` returns a UTC datetime column,
as the original time zone cannot be reconstructed from the individual offsets.
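
A sketch of this round-trip (the file path and column name are illustrative):

```python
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

from pyogrio import read_dataframe, write_dataframe

# Two timestamps in one IANA zone but on either side of a DST switch,
# i.e. with two different UTC offsets (+01:00 and +02:00):
ts = pd.to_datetime(["2023-03-25 12:00", "2023-03-26 12:00"]).tz_localize(
    "Europe/Brussels"
)
df = gpd.GeoDataFrame(
    {"ts": ts}, geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326"
)
write_dataframe(df, "example.gpkg")

# The offsets round-trip, but the column comes back normalized to UTC:
result = read_dataframe("example.gpkg")
print(result["ts"])
```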

## Dataset and layer creation options

It is possible to use dataset and layer creation options available for a given
14 changes: 0 additions & 14 deletions docs/source/known_issues.md
@@ -52,20 +52,6 @@ Pyogrio does not currently validate attribute values or geometry types before
attempting to write to the output file. Invalid types may crash during writing
with obscure error messages.

## Support for reading and writing DateTimes

GDAL only supports datetimes at a millisecond resolution. Reading data will thus
give at most millisecond resolution (`datetime64[ms]` data type), even though
the data is cast to the `datetime64[ns]` data type when reading into a data frame
using `pyogrio.read_dataframe()`. When writing, only precision up to ms is retained.

Not all file formats have dedicated support for storing datetime data (e.g., ESRI
Shapefile). For such formats, or if you require precision > ms, a workaround is to
convert the datetimes to strings.

Timezone information is ignored at the moment, both when reading and when writing
datetime columns.

## Support for OpenStreetMap (OSM) data

OpenStreetMap data do not natively support calculating the feature count by data
7 changes: 7 additions & 0 deletions pyogrio/_compat.py
@@ -18,11 +18,18 @@
except ImportError:
geopandas = None

try:
import pandas
except ImportError:
pandas = None


HAS_ARROW_API = __gdal_version__ >= (3, 6, 0) and pyarrow is not None

HAS_GEOPANDAS = geopandas is not None

PANDAS_GE_20 = pandas is not None and Version(pandas.__version__) >= Version("2.0.0")

HAS_GDAL_GEOS = __gdal_geos_version__ is not None

HAS_SHAPELY = shapely is not None and Version(shapely.__version__) >= Version("2.0.0")
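
A hypothetical illustration (not code from this commit) of how the `PANDAS_GE_20` flag can gate the datetime dtype used by the pandas layer:

```python
from pyogrio._compat import PANDAS_GE_20

# pandas >= 2.0 supports non-nanosecond datetime resolutions, so millisecond
# values from GDAL can be kept as-is; older pandas requires nanoseconds.
datetime_dtype = "datetime64[ms]" if PANDAS_GE_20 else "datetime64[ns]"
```
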
77 changes: 51 additions & 26 deletions pyogrio/_io.pyx
@@ -724,7 +724,8 @@ cdef process_fields(
object field_data_view,
object field_indexes,
object field_ogr_types,
encoding
encoding,
bint datetime_as_string
):
cdef int j
cdef int success
@@ -756,7 +757,7 @@
else:
data[i] = np.nan

elif field_type in (OFTDate, OFTDateTime):
elif field_type in (OFTDate, OFTDateTime) and not datetime_as_string:
data[i] = np.datetime64('NaT')

else:
@@ -782,22 +783,28 @@
data[i] = bin_value[:ret_length]

elif field_type == OFTDateTime or field_type == OFTDate:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

if datetime_as_string:
# defer datetime parsing to the user / pandas layer
# update to OGR_F_GetFieldAsISO8601DateTime once only GDAL 3.7+ is supported
data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding)
else:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000
ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000

if not success:
data[i] = np.datetime64('NaT')
if not success:
data[i] = np.datetime64('NaT')

elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()
elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()

elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()


@cython.boundscheck(False) # Deactivate bounds checking
@@ -810,7 +817,8 @@
uint8_t force_2d,
int skip_features,
int num_features,
uint8_t return_fids
uint8_t return_fids,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -843,7 +851,9 @@

field_data = [
np.empty(shape=(num_features, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype=("object" if datetime_as_string and
fields[field_index,3].startswith("datetime") else fields[field_index,3])
) for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -884,7 +894,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
i += 1
finally:
@@ -914,7 +924,8 @@
object[:,:] fields,
encoding,
uint8_t read_geometry,
uint8_t force_2d
uint8_t force_2d,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -937,10 +948,11 @@
n_fields = fields.shape[0]
field_indexes = fields[:,0]
field_ogr_types = fields[:,1]

field_data = [
np.empty(shape=(count, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
else fields[field_index,3]))
for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -963,7 +975,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
finally:
if ogr_feature != NULL:
@@ -1063,7 +1075,9 @@ def ogr_read(
object fids=None,
str sql=None,
str sql_dialect=None,
int return_fids=False):
int return_fids=False,
bint datetime_as_string=False
):

cdef int err = 0
cdef const char *path_c = NULL
@@ -1161,6 +1175,7 @@
encoding,
read_geometry=read_geometry and geometry_type is not None,
force_2d=force_2d,
datetime_as_string=datetime_as_string
)

# bypass reading fids since these should match fids used for read
@@ -1193,13 +1208,15 @@
force_2d=force_2d,
skip_features=skip_features,
num_features=num_features,
return_fids=return_fids
return_fids=return_fids,
datetime_as_string=datetime_as_string
)

meta = {
'crs': crs,
'encoding': encoding,
'fields': fields[:,2], # return only names
'dtypes': fields[:,3],
'geometry_type': geometry_type,
}
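
With `datetime_as_string=True`, datetime fields are returned as the strings GDAL produces (ISO 8601 once `OGR_F_GetFieldAsISO8601DateTime` can be used), deferring parsing to the pandas layer. A sketch, with illustrative values, of how mixed-offset strings can then be normalized:

```python
import pandas as pd

# Two offsets in one column cannot be represented by a single IANA time
# zone, so parse straight to UTC:
raw = ["2023-03-25 12:00:00+01:00", "2023-03-26 12:00:00+02:00"]
parsed = pd.to_datetime(raw, utc=True)
```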

@@ -1667,7 +1684,8 @@ def ogr_write(
str path, str layer, str driver, geometry, fields, field_data, field_mask,
str crs, str geometry_type, str encoding, object dataset_kwargs,
object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
bint append=False, dataset_metadata=None, layer_metadata=None
bint append=False, dataset_metadata=None, layer_metadata=None,
gdal_tz_offsets=None
):
cdef const char *path_c = NULL
cdef const char *layer_c = NULL
@@ -1738,6 +1756,9 @@
if not layer:
layer = os.path.splitext(os.path.split(path)[1])[0]

if gdal_tz_offsets is None:
gdal_tz_offsets = {}


# if shapefile, GeoJSON, or FlatGeobuf, always delete first
# for other types, check if we can create layers
@@ -2010,8 +2031,12 @@
if np.isnat(field_value):
OGR_F_SetFieldNull(ogr_feature, field_idx)
else:
# TODO: add support for timezones
datetime = field_value.astype("datetime64[ms]").item()
tz_array = gdal_tz_offsets.get(fields[field_idx], None)
if tz_array is None:
gdal_tz = 0
else:
gdal_tz = tz_array[i]
OGR_F_SetFieldDateTimeEx(
ogr_feature,
field_idx,
Expand All @@ -2021,7 +2046,7 @@ def ogr_write(
datetime.hour,
datetime.minute,
datetime.second + datetime.microsecond / 10**6,
0
gdal_tz
)

else:
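
The `gdal_tz` values passed to `OGR_F_SetFieldDateTimeEx` follow OGR's timezone-flag convention: 0 is unknown, 1 is local time, 100 is UTC, and each unit above or below 100 represents 15 minutes of UTC offset. A sketch of deriving per-value flags from a timezone-aware pandas Series (the helper name is hypothetical, not part of this commit):

```python
import pandas as pd

def gdal_tz_flags(ser: pd.Series):
    """Map each timestamp's UTC offset to an OGR timezone flag."""
    offsets = pd.to_timedelta([ts.utcoffset() for ts in ser])
    # 100 == UTC; one unit per 15 minutes, e.g. +02:00 -> 108, -05:00 -> 80
    return 100 + (offsets.total_seconds() // (15 * 60)).astype(int)

ts = pd.to_datetime(["2023-03-25 12:00", "2023-03-26 12:00"]).tz_localize(
    "Europe/Brussels"
)
print(list(gdal_tz_flags(pd.Series(ts))))  # [104, 108]
```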