Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of datetime dtypes #661

Merged
merged 5 commits into from
Jun 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions rdt/transformers/datetime.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Transformer for datetime data."""
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype
from pandas.api.types import is_datetime64_dtype, is_numeric_dtype
from pandas.core.tools.datetimes import _guess_datetime_format_for_array

from rdt.transformers.base import BaseTransformer
Expand Down Expand Up @@ -55,7 +55,26 @@ def __init__(self, missing_value_replacement='mean', model_missing_values=None,
self._dtype = None

def _convert_to_datetime(self, data):
amontanez24 marked this conversation as resolved.
Show resolved Hide resolved
if data.dtype == 'object':
"""Convert datetime column into datetime dtype.

Convert the datetime column to datetime dtype using the ``datetime_format``.
All non-numeric columns will automatically be cast to datetimes. Numeric columns
with a ``datetime_format`` will be treated as strings and cast to datetime. Numeric
columns without a ``datetime_format`` will be treated as already converted datetimes.

Args:
data (pandas.Series):
The datetime column.

Raises:
- ``TypeError`` if data cannot be converted to datetime.
- ``ValueError`` if data does not match the specified datetime format.

Returns:
pandas.Series:
The datetime column converted to the datetime dtype.
"""
if self.datetime_format or not is_numeric_dtype(data):
try:
pandas_datetime_format = None
if self.datetime_format:
Expand Down Expand Up @@ -138,13 +157,16 @@ def _reverse_transform(self, data):
data = self._reverse_transform_helper(data)
datetime_data = pd.to_datetime(data)
if self.datetime_format:
if self._dtype == 'object':
datetime_data = datetime_data.dt.strftime(self.datetime_format)
elif is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format:
if is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format:
datetime_data = pd.to_datetime(
datetime_data.dt.strftime(self.datetime_format),
format=self.datetime_format,
)
else:
datetime_data = datetime_data.dt.strftime(self.datetime_format).astype(self._dtype)
elif is_numeric_dtype(self._dtype):
datetime_data = pd.to_numeric(datetime_data.astype('object'), errors='coerce')
datetime_data = datetime_data.astype(self._dtype)

return datetime_data

Expand Down
41 changes: 41 additions & 0 deletions tests/integration/transformers/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,47 @@ def test_unixtimestampencoder_with_model_missing_values(self):
pd.testing.assert_frame_equal(expected_transformed, transformed)
pd.testing.assert_frame_equal(reverted, data)

def test_unixtimestampencoder_with_integer_datetimes(self):
    """Check the fit/transform/reverse_transform round trip on an integer column.

    The encoder is built with ``datetime_format='%m%d%Y'``. The transformed
    output must match the expected float values, and reverting it must give
    back the original integer data.
    """
    # Setup
    data = pd.DataFrame({'column': [1201992, 11022028, 10011990]})
    expected_transformed = pd.DataFrame({
        'column': [6.958656e+17, 1.856736e+18, 6.547392e+17],
    })
    encoder = UnixTimestampEncoder('mean', True, datetime_format='%m%d%Y')

    # Run
    encoder.fit(data, column='column')
    encoder.set_random_state(np.random.RandomState(7), 'reverse_transform')
    transformed = encoder.transform(data)
    reverted = encoder.reverse_transform(transformed)

    # Assert
    pd.testing.assert_frame_equal(expected_transformed, transformed)
    pd.testing.assert_frame_equal(reverted, data)

def test_unixtimestampencoder_with_nans(self):
    """Check the fit/transform/reverse_transform round trip on an all-NaN column.

    The transformed output must contain zeros in ``column`` and ones in
    ``column.is_null``, and reverting it must give back the original data.
    """
    # Setup
    data = pd.DataFrame({'column': [np.nan, np.nan, np.nan]})
    expected_transformed = pd.DataFrame({
        'column': [0., 0., 0.],
        'column.is_null': [1., 1., 1.],
    })
    encoder = UnixTimestampEncoder('mean', True)

    # Run
    encoder.fit(data, column='column')
    encoder.set_random_state(np.random.RandomState(7), 'reverse_transform')
    transformed = encoder.transform(data)
    reverted = encoder.reverse_transform(transformed)

    # Assert
    pd.testing.assert_frame_equal(expected_transformed, transformed)
    pd.testing.assert_frame_equal(reverted, data)


class TestOptimizedTimestampEncoder:
def test_optimizedtimestampencoder(self):
Expand Down
31 changes: 31 additions & 0 deletions tests/unit/transformers/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,37 @@ def test__reverse_transform_datetime_format_with_strftime_formats(self):
if 'windows' not in platform.system().lower():
pd.testing.assert_series_equal(output, expected)

def test__reverse_transform_datetime_format_with_nans(self):
    """Test ``_reverse_transform`` with a ``datetime_format`` and a NaN in the input.

    With ``_dtype`` set to ``'object'``, the two numeric inputs are expected
    to come back as strings in the ``'%b %d, %Y'`` format, and the NaN input
    is expected to stay NaN.
    """
    # Setup
    encoder = UnixTimestampEncoder()
    encoder.datetime_format = '%b %d, %Y'
    encoder._dtype = 'object'
    encoder.null_transformer = NullTransformer('mean')
    transformed = np.array([1.5778368e+18, 1.5805152e+18, np.nan])
    expected = pd.Series(['Jan 01, 2020', 'Feb 01, 2020', np.nan])

    # Run
    output = encoder._reverse_transform(transformed)

    # Assert
    pd.testing.assert_series_equal(output, expected)

def test__reverse_transform_only_nans(self):
    """Test ``_reverse_transform`` when every input value is NaN.

    No ``datetime_format`` is set and ``_dtype`` is ``'float'``. The output
    is expected to be a series containing only NaN values.
    """
    # Setup
    encoder = UnixTimestampEncoder()
    encoder._dtype = 'float'
    encoder.null_transformer = NullTransformer('mean')
    transformed = np.array([np.nan, np.nan, np.nan])
    expected = pd.Series([np.nan, np.nan, np.nan])

    # Run
    output = encoder._reverse_transform(transformed)

    # Assert
    pd.testing.assert_series_equal(output, expected)


class TestOptimizedTimestampEncoder:

Expand Down