From eb3b7ca18ee9402dbb18993f92394bb6d8a6b81c Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Tue, 27 Jun 2023 10:57:30 -0400 Subject: [PATCH 1/5] fix integer and null datetimes --- rdt/transformers/datetime.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 89c653074..769e012f4 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -1,7 +1,7 @@ """Transformer for datetime data.""" import numpy as np import pandas as pd -from pandas.api.types import is_datetime64_dtype +from pandas.api.types import is_datetime64_dtype, is_numeric_dtype from pandas.core.tools.datetimes import _guess_datetime_format_for_array from rdt.transformers.base import BaseTransformer @@ -55,7 +55,7 @@ def __init__(self, missing_value_replacement='mean', model_missing_values=None, self._dtype = None def _convert_to_datetime(self, data): - if data.dtype == 'object': + if self.datetime_format or not is_numeric_dtype(data): try: pandas_datetime_format = None if self.datetime_format: @@ -138,13 +138,16 @@ def _reverse_transform(self, data): data = self._reverse_transform_helper(data) datetime_data = pd.to_datetime(data) if self.datetime_format: - if self._dtype == 'object': - datetime_data = datetime_data.dt.strftime(self.datetime_format) - elif is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format: + if is_datetime64_dtype(self._dtype) and '.%f' not in self.datetime_format: datetime_data = pd.to_datetime( datetime_data.dt.strftime(self.datetime_format), format=self.datetime_format, ) + else: + datetime_data = datetime_data.dt.strftime(self.datetime_format).astype(self._dtype) + elif is_numeric_dtype(self._dtype): + datetime_data = pd.to_numeric(datetime_data.astype('object'), errors='coerce') + datetime_data = datetime_data.astype(self._dtype) return datetime_data From 74088941c6d0801f38e8eae502f59c0cd4063be7 Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Tue, 27 Jun 2023 10:57:46 -0400 Subject: [PATCH 2/5] add tests --- .../integration/transformers/test_datetime.py | 41 +++++++++++++++++++ tests/unit/transformers/test_datetime.py | 33 +++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/tests/integration/transformers/test_datetime.py b/tests/integration/transformers/test_datetime.py index 02dcb2c17..bf278569f 100644 --- a/tests/integration/transformers/test_datetime.py +++ b/tests/integration/transformers/test_datetime.py @@ -90,6 +90,47 @@ def test_unixtimestampencoder_with_model_missing_values(self): pd.testing.assert_frame_equal(expected_transformed, transformed) pd.testing.assert_frame_equal(reverted, data) + def test_unixtimestampencoder_with_integer_datetimes(self): + """Test that the transformer properly handles integer columns.""" + # Setup + ute = UnixTimestampEncoder('mean', True, datetime_format='%m%d%Y') + data = pd.DataFrame({'column': [1201992, 11022028, 10011990]}) + + # Run + ute.fit(data, column='column') + ute.set_random_state(np.random.RandomState(7), 'reverse_transform') + transformed = ute.transform(data) + reverted = ute.reverse_transform(transformed) + + # Asserts + expected_transformed = pd.DataFrame({ + 'column': [6.958656e+17, 1.856736e+18, 6.547392e+17], + }) + + pd.testing.assert_frame_equal(expected_transformed, transformed) + pd.testing.assert_frame_equal(reverted, data) + + def test_unixtimestampencoder_with_nans(self): + """Test that the transformer properly handles null columns.""" + # Setup + ute = UnixTimestampEncoder('mean', True) + data = pd.DataFrame({'column': [np.nan, np.nan, np.nan]}) + + # Run + ute.fit(data, column='column') + ute.set_random_state(np.random.RandomState(7), 'reverse_transform') + transformed = ute.transform(data) + reverted = ute.reverse_transform(transformed) + + # Asserts + expected_transformed = pd.DataFrame({ + 'column': [0., 0., 0.], + 'column.is_null': [1., 1., 1.] + }) + + pd.testing.assert_frame_equal(expected_transformed, transformed) + pd.testing.assert_frame_equal(reverted, data) + class TestOptimizedTimestampEncoder: def test_optimizedtimestampencoder(self): diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index b3b217357..c8448269f 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -430,6 +430,39 @@ def test__reverse_transform_datetime_format_with_strftime_formats(self): if 'windows' not in platform.system().lower(): pd.testing.assert_series_equal(output, expected) + def test__reverse_transform_datetime_format_with_nans(self): + """Test the ``_reverse_transform`` method returns the correct datetime format with nans.""" + # Setup + ute = UnixTimestampEncoder() + ute.datetime_format = '%b %-d, %Y' + transformed = np.array([1.5778368e+18, 1.5805152e+18, np.nan]) + ute._dtype = 'object' + ute.null_transformer = NullTransformer('mean') + + # Run + output = ute._reverse_transform(transformed) + + # Assert + expected = pd.Series(['Jan 1, 2020', 'Feb 1, 2020', np.nan]) + if 'windows' not in platform.system().lower(): + pd.testing.assert_series_equal(output, expected) + + def test__reverse_transform_only_nans(self): + """Test the ``_reverse_transform`` method returns the correct datetime format with nans.""" + # Setup + ute = UnixTimestampEncoder() + transformed = np.array([np.nan, np.nan, np.nan]) + ute._dtype = 'float' + ute.null_transformer = NullTransformer('mean') + + # Run + output = ute._reverse_transform(transformed) + + # Assert + expected = pd.Series([np.nan, np.nan, np.nan]) + if 'windows' not in platform.system().lower(): + pd.testing.assert_series_equal(output, expected) + class TestOptimizedTimestampEncoder: From 72975fac0c163b37b26a1e5f5c224fe3540ed62d Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Wed, 28 Jun 2023 09:54:21 -0400 Subject: [PATCH 3/5] add docstring and add windows back to test --- rdt/transformers/datetime.py | 19 +++++++++++++++++++ tests/unit/transformers/test_datetime.py | 4 ---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/rdt/transformers/datetime.py b/rdt/transformers/datetime.py index 769e012f4..00b2c7f11 100644 --- a/rdt/transformers/datetime.py +++ b/rdt/transformers/datetime.py @@ -55,6 +55,25 @@ def __init__(self, missing_value_replacement='mean', model_missing_values=None, self._dtype = None def _convert_to_datetime(self, data): + """Convert datetime column into datetime dtype. + + Convert the datetime column to datetime dtype using the ``datetime_format``. + All non-numeric columns will automatically be cast to datetimes. Numeric columns + with a ``datetime_format`` will be treated as strings and cast to datetime. Numeric + columns without a ``datetime_format`` will be treated as already converted datetimes. + + Args: + data (pandas.Series): + The datetime column. + + Raises: + - ``TypeError`` if data cannot be converted to datetime. + - ``ValueError`` if data does not match the specified datetime format + + Returns: + pandas.Series: + The datetime column converted to the datetime dtype. + """ if self.datetime_format or not is_numeric_dtype(data): try: pandas_datetime_format = None diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index c8448269f..3bba7845b 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -444,8 +444,6 @@ def test__reverse_transform_datetime_format_with_nans(self): # Assert expected = pd.Series(['Jan 1, 2020', 'Feb 1, 2020', np.nan]) - if 'windows' not in platform.system().lower(): - pd.testing.assert_series_equal(output, expected) def test__reverse_transform_only_nans(self): """Test the ``_reverse_transform`` method returns the correct datetime format with nans.""" @@ -460,8 +458,6 @@ def test__reverse_transform_only_nans(self): # Assert expected = pd.Series([np.nan, np.nan, np.nan]) - if 'windows' not in platform.system().lower(): - pd.testing.assert_series_equal(output, expected) class TestOptimizedTimestampEncoder: From 0ce6a464045807927e0c4a73245f5a9814deb5fb Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Wed, 28 Jun 2023 10:31:27 -0400 Subject: [PATCH 4/5] fix test --- tests/unit/transformers/test_datetime.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index 3bba7845b..b8a142b12 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -444,6 +444,7 @@ def test__reverse_transform_datetime_format_with_nans(self): # Assert expected = pd.Series(['Jan 1, 2020', 'Feb 1, 2020', np.nan]) + pd.testing.assert_series_equal(output, expected) def test__reverse_transform_only_nans(self): """Test the ``_reverse_transform`` method returns the correct datetime format with nans.""" @@ -458,6 +459,7 @@ def test__reverse_transform_only_nans(self): # Assert expected = pd.Series([np.nan, np.nan, np.nan]) + pd.testing.assert_series_equal(output, expected) class TestOptimizedTimestampEncoder: From 07c31f9764bc55478ae6ce72269403641fb27328 Mon Sep 17 00:00:00 2001 From: Frances Hartwell Date: Thu, 29 Jun 2023 15:01:19 -0400 Subject: [PATCH 5/5] actually fix test --- tests/unit/transformers/test_datetime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/transformers/test_datetime.py b/tests/unit/transformers/test_datetime.py index b8a142b12..685504ff2 100644 --- a/tests/unit/transformers/test_datetime.py +++ b/tests/unit/transformers/test_datetime.py @@ -434,7 +434,7 @@ def test__reverse_transform_datetime_format_with_nans(self): """Test the ``_reverse_transform`` method returns the correct datetime format with nans.""" # Setup ute = UnixTimestampEncoder() - ute.datetime_format = '%b %-d, %Y' + ute.datetime_format = '%b %d, %Y' transformed = np.array([1.5778368e+18, 1.5805152e+18, np.nan]) ute._dtype = 'object' ute.null_transformer = NullTransformer('mean') @@ -443,7 +443,7 @@ def test__reverse_transform_datetime_format_with_nans(self): output = ute._reverse_transform(transformed) # Assert - expected = pd.Series(['Jan 1, 2020', 'Feb 1, 2020', np.nan]) + expected = pd.Series(['Jan 01, 2020', 'Feb 01, 2020', np.nan]) pd.testing.assert_series_equal(output, expected) def test__reverse_transform_only_nans(self):