ENH: support for reading and writing datetimes with timezones #253

Merged 55 commits on Oct 20, 2023

Commits (55):
e075091  minimal working pandas layer without timezones (m-richards, May 7, 2023)
3df7936  implement datetime_as_string toggle to get numpy layer working (m-richards, May 7, 2023)
d68b473  make tests pass (m-richards, May 8, 2023)
9aa5a8c  add tests showing existing behaviour no tz (m-richards, May 8, 2023)
1a2af4d  working read (m-richards, May 8, 2023)
fbd2898  commit my test file (m-richards, May 9, 2023)
127d0a7  actually fix tests with read working (m-richards, May 10, 2023)
016778a  good enough wip progress for now (m-richards, May 21, 2023)
faa0631  make these failures easier to read (m-richards, May 21, 2023)
a8c200e  fix for non tz (m-richards, May 21, 2023)
6047375  fix some tests (m-richards, May 22, 2023)
6061563  run pre commit (m-richards, May 22, 2023)
3ba42cf  maybe old pandas, can't reproduce locally (m-richards, May 23, 2023)
d983140  try and find something pandas 1.5 also happy with (m-richards, May 23, 2023)
e9993bd  lint (m-richards, May 23, 2023)
b6ca5cf  simple answer (m-richards, May 23, 2023)
05cc1cf  cleanup (m-richards, May 25, 2023)
a78a76c  wip, use strings to make multi timezones round trip (m-richards, Jun 3, 2023)
b681656  use tmp path fixture (m-richards, Jun 3, 2023)
3426fdc  cleanups (m-richards, Jun 3, 2023)
bb6fd4e  try cleanup datetime parsing (m-richards, Jun 3, 2023)
87419ac  more cleanup, realise we can get dt resolution (m-richards, Jun 3, 2023)
fc78bd9  more careful pandas 1.5 compat (m-richards, Jun 3, 2023)
5fab348  delete line (m-richards, Jun 3, 2023)
26c403a  replace write support with working datetime object solution (m-richards, Aug 8, 2023)
ebdb71b  fixes (m-richards, Aug 8, 2023)
f46e716  rewrite datetime reading to handle mixed offset to utc (m-richards, Aug 8, 2023)
44686f9  fix nat handling for datetime as string (m-richards, Aug 8, 2023)
6b946f5  don't expose datetime_as_string in pandas layer (m-richards, Aug 8, 2023)
ec16ed3  incorrect variable in 1.5.3 compat (m-richards, Aug 8, 2023)
da0639a  CLN: tidy up pandas 2.0 compat (m-richards, Aug 9, 2023)
85a67c2  suggested alternative implementation (m-richards, Sep 24, 2023)
d96d67e  code review suggestion (m-richards, Sep 24, 2023)
3eb70dc  Update pyogrio/tests/test_geopandas_io.py (m-richards, Sep 24, 2023)
c37c1ed  Merge remote-tracking branch 'upstream/main' into matt/timezones_redo (m-richards, Sep 28, 2023)
4064f25  Merge branches 'matt/timezones_redo' and 'matt/timezones_redo' of git… (m-richards, Sep 28, 2023)
3df12c0  time tests and suggestions (m-richards, Sep 28, 2023)
8fd30a5  remove breakpoint (m-richards, Sep 28, 2023)
55293c0  catch warning (m-richards, Sep 30, 2023)
8040c21  really need to fix my local gdal (m-richards, Sep 30, 2023)
fccc8fb  fix fix (m-richards, Sep 30, 2023)
200cc1d  Apply suggestions from code review (m-richards, Sep 30, 2023)
ebfc01c  add suggested exception handling (m-richards, Sep 30, 2023)
c8c186a  move pandas compat to _compat (m-richards, Oct 7, 2023)
95030c0  address review comments (m-richards, Oct 7, 2023)
c5c272b  Merge remote-tracking branch 'upstream/main' into matt/timezones_redo (m-richards, Oct 7, 2023)
086e52e  update known issues (m-richards, Oct 7, 2023)
2b2dd5f  reword (m-richards, Oct 7, 2023)
2167d0f  move documentation (m-richards, Oct 17, 2023)
ab0fbf6  rename field as suggested (m-richards, Oct 17, 2023)
e3f4d6a  Merge remote-tracking branch 'upstream/main' into matt/timezones_redo (m-richards, Oct 17, 2023)
0f02115  final missing gdal tz offset change (m-richards, Oct 17, 2023)
52a922d  Update pyogrio/tests/test_geopandas_io.py (m-richards, Oct 17, 2023)
7c99e51  Apply suggestions from code review (m-richards, Oct 17, 2023)
a5f5f9d  add changelog entry (brendan-ward, Oct 20, 2023)
2 changes: 1 addition & 1 deletion .github/workflows/tests-conda.yml
@@ -66,4 +66,4 @@ jobs:

- name: Test
run: |
pytest -v -r s pyogrio/tests
pytest -v --color=yes -r s pyogrio/tests
85 changes: 59 additions & 26 deletions pyogrio/_io.pyx
@@ -599,7 +599,8 @@ cdef process_fields(
object field_data_view,
object field_indexes,
object field_ogr_types,
encoding
encoding,
bint datetime_as_string
):
cdef int j
cdef int success
@@ -631,7 +632,7 @@
else:
data[i] = np.nan

elif field_type in ( OFTDate, OFTDateTime):
elif field_type in ( OFTDate, OFTDateTime) and not datetime_as_string:
data[i] = np.datetime64('NaT')

else:
@@ -657,22 +658,27 @@
data[i] = bin_value[:ret_length]

elif field_type == OFTDateTime or field_type == OFTDate:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

if datetime_as_string:
# defer datetime parsing to user/ pandas layer
data[i] = get_string(OGR_F_GetFieldAsString(ogr_feature, field_index), encoding=encoding)
else:
success = OGR_F_GetFieldAsDateTimeEx(
ogr_feature, field_index, &year, &month, &day, &hour, &minute, &fsecond, &timezone)

ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000
ms, ss = math.modf(fsecond)
second = int(ss)
# fsecond has millisecond accuracy
microsecond = round(ms * 1000) * 1000

if not success:
data[i] = np.datetime64('NaT')
if not success:
data[i] = np.datetime64('NaT')

elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()
elif field_type == OFTDate:
data[i] = datetime.date(year, month, day).isoformat()

elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()
elif field_type == OFTDateTime:
data[i] = datetime.datetime(year, month, day, hour, minute, second, microsecond).isoformat()


@cython.boundscheck(False) # Deactivate bounds checking
@@ -685,7 +691,8 @@ cdef get_features(
uint8_t force_2d,
int skip_features,
int num_features,
uint8_t return_fids
uint8_t return_fids,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -718,7 +725,9 @@

field_data = [
np.empty(shape=(num_features, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype = ("object" if datetime_as_string and
fields[field_index,3].startswith("datetime") else fields[field_index,3])
) for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -758,7 +767,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
i += 1
finally:
@@ -788,7 +797,8 @@ cdef get_features_by_fid(
object[:,:] fields,
encoding,
uint8_t read_geometry,
uint8_t force_2d
uint8_t force_2d,
bint datetime_as_string
):

cdef OGRFeatureH ogr_feature = NULL
@@ -811,10 +821,11 @@
n_fields = fields.shape[0]
field_indexes = fields[:,0]
field_ogr_types = fields[:,1]

field_data = [
np.empty(shape=(count, ),
dtype=fields[field_index,3]) for field_index in range(n_fields)
dtype=("object" if datetime_as_string and fields[field_index,3].startswith("datetime")
else fields[field_index,3]))
for field_index in range(n_fields)
]

field_data_view = [field_data[field_index][:] for field_index in range(n_fields)]
@@ -837,7 +848,7 @@

process_fields(
ogr_feature, i, n_fields, field_data, field_data_view,
field_indexes, field_ogr_types, encoding
field_indexes, field_ogr_types, encoding, datetime_as_string
)
finally:
if ogr_feature != NULL:
@@ -939,7 +950,9 @@ def ogr_read(
object fids=None,
str sql=None,
str sql_dialect=None,
int return_fids=False):
int return_fids=False,
bint datetime_as_string=False
):

cdef int err = 0
cdef const char *path_c = NULL
@@ -1022,6 +1035,7 @@ def ogr_read(
encoding,
read_geometry=read_geometry and geometry_type is not None,
force_2d=force_2d,
datetime_as_string=datetime_as_string
)

# bypass reading fids since these should match fids used for read
@@ -1051,13 +1065,15 @@
force_2d=force_2d,
skip_features=skip_features,
num_features=num_features,
return_fids=return_fids
return_fids=return_fids,
datetime_as_string=datetime_as_string
)

meta = {
'crs': crs,
'encoding': encoding,
'fields': fields[:,2], # return only names
'dtypes':fields[:,3],
'geometry_type': geometry_type,
}

@@ -1468,12 +1484,22 @@ cdef infer_field_types(list dtypes):
return field_types


FIFTEEN_MINUTE_DELTA = datetime.timedelta(minutes=15)

cdef int timezone_to_gdal_offset(tz_as_datetime):
"""Convert to GDAL timezone offset representation.

https://gdal.org/development/rfc/rfc56_millisecond_precision.html#core-changes
"""
return tz_as_datetime.utcoffset() / FIFTEEN_MINUTE_DELTA + 100
Member Author commented:

tz_as_datetime.utcoffset() is permitted to return None; perhaps it makes sense to check for that explicitly. I believe that, given the way this is supplied via to_pydatetime(), that shouldn't happen, but there could be edge cases I'm not aware of.
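
A minimal sketch of such an explicit check (hypothetical; per GDAL RFC 56, 0 means unknown timezone, 100 means UTC, and each unit above or below 100 is 15 minutes of offset):

    import datetime

    FIFTEEN_MINUTE_DELTA = datetime.timedelta(minutes=15)

    def timezone_to_gdal_offset(tz_as_datetime):
        """Convert a datetime's UTC offset to GDAL's RFC 56 timezone flag."""
        offset = tz_as_datetime.utcoffset()
        if offset is None:
            # naive datetime: fall back to GDAL's "unknown timezone" flag
            return 0
        # 100 == UTC; one unit per 15 minutes of offset
        return int(offset / FIFTEEN_MINUTE_DELTA) + 100

For example, UTC+02:00 maps to 108 and UTC-05:00 to 80.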


# TODO: set geometry and field data as memory views?
def ogr_write(
str path, str layer, str driver, geometry, fields, field_data, field_mask,
str crs, str geometry_type, str encoding, object dataset_kwargs,
object layer_kwargs, bint promote_to_multi=False, bint nan_as_null=True,
bint append=False, dataset_metadata=None, layer_metadata=None
bint append=False, dataset_metadata=None, layer_metadata=None,
timezone_cols_metadata=None
):
cdef const char *path_c = NULL
cdef const char *layer_c = NULL
@@ -1526,6 +1552,9 @@ def ogr_write(
if not layer:
layer = os.path.splitext(os.path.split(path)[1])[0]

if timezone_cols_metadata is None:
timezone_cols_metadata = {}


# if shapefile, GeoJSON, or FlatGeobuf, always delete first
# for other types, check if we can create layers
@@ -1796,8 +1825,12 @@
if np.isnat(field_value):
OGR_F_SetFieldNull(ogr_feature, field_idx)
else:
# TODO: add support for timezones
datetime = field_value.astype("datetime64[ms]").item()
tz_array = timezone_cols_metadata.get(fields[field_idx], None)
if tz_array is None:
gdal_tz = 0
else:
gdal_tz = timezone_to_gdal_offset(tz_array[i])
OGR_F_SetFieldDateTimeEx(
ogr_feature,
field_idx,
@@ -1807,7 +1840,7 @@
datetime.hour,
datetime.minute,
datetime.second + datetime.microsecond / 10**6,
0
gdal_tz
)

else:
54 changes: 52 additions & 2 deletions pyogrio/geopandas.py
@@ -2,6 +2,16 @@
from pyogrio.raw import DRIVERS_NO_MIXED_SINGLE_MULTI, DRIVERS_NO_MIXED_DIMENSIONS
from pyogrio.raw import detect_driver, read, read_arrow, write
from pyogrio.errors import DataSourceError
from packaging.version import Version


try:
import pandas

PANDAS_GE_20 = Version(pandas.__version__) >= Version("2.0.0")

except ImportError:
PANDAS_GE_20 = None


def _stringify_path(path):
@@ -19,6 +29,26 @@ def _stringify_path(path):
return path


def _try_parse_datetime(ser):
import pandas as pd # only called when pandas is known to be installed

if PANDAS_GE_20:
datetime_kwargs = dict(format="ISO8601", errors="ignore")
else:
datetime_kwargs = dict(yearfirst=True)
res = pd.to_datetime(ser, **datetime_kwargs)
Member commented:

This first attempt will already raise a warning with the latest pandas, so we will need to catch that warning:

FutureWarning: In a future version of pandas, parsing datetimes with mixed time zones will raise a warning unless utc=True. Please specify utc=True to opt in to the new behaviour and silence this warning. To create a Series with mixed offsets and object dtype, please use apply and datetime.datetime.strptime

(I have to check whether the warning is actually wrong about the future behaviour; I would assume the deprecation warns about a future error, in which case we should already start catching that too.)

Member Author replied:

Good catch, I noticed this a while back in the geopandas equivalent test, so we should fix it there too.

Based on the discussion in pandas-dev/pandas#50887 and pandas-dev/pandas#54014, it does seem the warning is supposed to read "will raise an error" rather than "will raise a warning".
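
A hedged sketch of catching that warning in _try_parse_datetime (assumes pandas >= 2.0 for format="ISO8601"; the message filter is inferred from the warning text quoted above):

    import warnings

    import pandas as pd

    def _try_parse_datetime_quiet(ser):
        # hypothetical variant that silences the mixed-timezone FutureWarning
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message=".*parsing datetimes with mixed time zones.*",
                category=FutureWarning,
            )
            res = pd.to_datetime(ser, format="ISO8601", errors="ignore")
        if res.dtype == "object":
            # mixed offsets could not be unified; re-parse as UTC instead
            res = pd.to_datetime(ser, utc=True, format="ISO8601", errors="ignore")
        return res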

# if object dtype, try parse as utc instead
if res.dtype == "object":
res = pd.to_datetime(ser, utc=True, **datetime_kwargs)

if res.dtype != "object":
if PANDAS_GE_20:
res = res.dt.as_unit("ms")
else:
res = res.dt.round(freq="ms")
return res


def read_dataframe(
path_or_buffer,
/,
@@ -146,6 +176,8 @@ def read_dataframe(
path_or_buffer = _stringify_path(path_or_buffer)

read_func = read_arrow if use_arrow else read
if not use_arrow:
kwargs["datetime_as_string"] = True
result = read_func(
path_or_buffer,
layer=layer,
@@ -182,8 +214,10 @@
index = pd.Index(index, name="fid")
else:
index = None

df = pd.DataFrame(data, columns=columns, index=index)
for dtype, c in zip(meta["dtypes"], df.columns):
if dtype.startswith("datetime"):
df[c] = _try_parse_datetime(df[c])

if geometry is None or not read_geometry:
return df
@@ -326,8 +360,23 @@ def write_dataframe(
# TODO: may need to fill in pd.NA, etc
field_data = []
field_mask = []
# dict[str, np.array(datetime.datetime)] special case for dt-tz fields
timezone_cols_metadata = {}
Member commented:
Suggested change:
-    timezone_cols_metadata = {}
+    gdal_tz_offsets = {}

Slight preference to prefix params specifically intended to pass things down to GDAL as gdal_, and since the only thing being stored here is the GDAL offset rather than other metadata, this rename seems a bit more clear (provided I follow correctly).

Member Author replied:

Yes, you follow correctly; the new name makes more sense. I think the old name predates passing down only the offset.

for name in fields:
col = df[name].values
ser = df[name]
col = ser.values
if isinstance(ser.dtype, pd.DatetimeTZDtype):
# Deal with datetimes with timezones by passing down timezone separately
# pass down naive datetime
col = ser.dt.tz_localize(None).values
# pandas only supports a single offset per column
# access via array since we want a numpy array not a series
# (only care about the utc offset, not actually the date)
# but ser.array.timetz won't have valid utc offset for pytz time zones
# (per https://docs.python.org/3/library/datetime.html#datetime.time.utcoffset) # noqa
Member Author commented:

This is probably a bit verbose right now. The basic idea is that I can't find a way to produce a series/numpy array of timezone offsets in a vectorised way through pandas.

The best I could do is ser.array.to_pydatetime(), which uses the cython function ints_to_pydatetime. Note that DatetimeArray is marked as experimental and doesn't actually document any methods on the website, which isn't great. The equivalent Series / DatetimeIndex method is another option, but it throws unavoidable UserWarnings about a change in behaviour.

This seems wasteful in terms of duplication (basically the same datetime information twice), but it seemed better to pass this down and compute the offsets in cython (I haven't profiled it either way).

Member commented:

A potentially faster alternative:

In [20]: arr = pd.date_range("2012-10-25 09:00", periods=5, freq="D", tz="Europe/Brussels")

In [21]: arr.tz_localize(None) - arr.tz_convert("UTC").tz_localize(None)
Out[21]: 
TimedeltaIndex(['0 days 02:00:00', '0 days 02:00:00', '0 days 02:00:00',
                '0 days 01:00:00', '0 days 01:00:00'],
               dtype='timedelta64[ns]', freq=None)

Although it has some more steps, it's vectorized and avoids creating python datetime.datetime objects, and based on a quick test is much faster. And I think it should give the same result?

m-richards (Member Author) replied on Sep 24, 2023:

Thanks, this looks much nicer! We can also do the conversion into the gdal representation directly rather than in cython. I've pushed a commit which should implement this - tested locally and it looks like it works, but my local setup doesn't have the best version of GDAL to test against. I will do some performance comparisons but haven't got to that yet.
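
For reference, a sketch of that idea (hypothetical variable names; computes the offsets vectorised and converts them straight to the RFC 56 representation):

    import pandas as pd

    ser = pd.Series(
        pd.date_range("2012-10-25 09:00", periods=5, freq="D", tz="Europe/Brussels")
    )
    naive = ser.dt.tz_localize(None)                     # wall-clock times
    utc = ser.dt.tz_convert("UTC").dt.tz_localize(None)  # same instants in UTC
    utc_offset = naive - utc                             # timedelta64 per row
    # RFC 56: 100 == UTC, one unit per 15 minutes of offset
    gdal_tz_offsets = (utc_offset // pd.Timedelta(minutes=15) + 100).to_numpy()
    # array([108, 108, 108, 104, 104]) across the DST change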

timezone_cols_metadata[name] = ser.array.to_pydatetime()
else:
col = ser.values
if isinstance(col, pd.api.extensions.ExtensionArray):
from pandas.arrays import IntegerArray, FloatingArray, BooleanArray

@@ -427,5 +476,6 @@ def write_dataframe(
metadata=metadata,
dataset_options=dataset_options,
layer_options=layer_options,
timezone_cols_metadata=timezone_cols_metadata,
**kwargs,
)
9 changes: 9 additions & 0 deletions pyogrio/raw.py
@@ -53,6 +53,7 @@ def read(
sql=None,
sql_dialect=None,
return_fids=False,
datetime_as_string=False,
**kwargs,
):
"""Read OGR data source into numpy arrays.
@@ -108,6 +109,10 @@
number of features using FIDs is also driver specific.
return_fids : bool, optional (default: False)
If True, will return the FIDs of the feature that were read.
datetime_as_string : bool, optional (default: False)
If True, will return datetime dtypes as detected by GDAL as a string
array, instead of a datetime64 array (used to extract timezone info).

**kwargs
Additional driver-specific dataset open options passed to OGR. Invalid
options will trigger a warning.
@@ -150,6 +155,7 @@
sql_dialect=sql_dialect,
return_fids=return_fids,
dataset_kwargs=dataset_kwargs,
datetime_as_string=datetime_as_string,
)
finally:
if buffer is not None:
@@ -385,8 +391,10 @@ def write(
metadata=None,
dataset_options=None,
layer_options=None,
timezone_cols_metadata=None,
**kwargs,
):
kwargs.pop("dtypes", None)
if geometry_type is None:
raise ValueError("geometry_type must be provided")

@@ -471,4 +479,5 @@
layer_metadata=layer_metadata,
dataset_kwargs=dataset_kwargs,
layer_kwargs=layer_kwargs,
timezone_cols_metadata=timezone_cols_metadata,
)
5 changes: 5 additions & 0 deletions pyogrio/tests/conftest.py
@@ -97,3 +97,8 @@ def test_ogr_types_list():
@pytest.fixture(scope="session")
def test_datetime():
return _data_dir / "test_datetime.geojson"


@pytest.fixture(scope="session")
def test_datetime_tz():
return _data_dir / "test_datetime_tz.geojson"
7 changes: 7 additions & 0 deletions pyogrio/tests/fixtures/test_datetime_tz.geojson
@@ -0,0 +1,7 @@
{
"type": "FeatureCollection",
"features": [
{ "type": "Feature", "properties": { "col": "2020-01-01T09:00:00.123-05:00" }, "geometry": { "type": "Point", "coordinates": [ 1.0, 1.0 ] } },
{ "type": "Feature", "properties": { "col": "2020-01-01T10:00:00-05:00" }, "geometry": { "type": "Point", "coordinates": [ 2.0, 2.0 ] } }
]
}
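
As a quick sanity check, reading this fixture through the pandas layer should now give a tz-aware column (a sketch; the exact dtype depends on the pandas version):

    from pyogrio import read_dataframe

    df = read_dataframe("pyogrio/tests/fixtures/test_datetime_tz.geojson")
    # both rows share the -05:00 offset, so pandas keeps a single tz-aware
    # dtype, e.g. datetime64[ms, UTC-05:00] on pandas >= 2.0
    print(df["col"].dtype)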