Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[feat] Support for numeric date feature inputs #3517

Merged
merged 10 commits into from
Aug 11, 2023
22 changes: 12 additions & 10 deletions ludwig/features/date_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,12 @@

import numpy as np
import torch
from dateutil.parser import parse

from ludwig.constants import COLUMN, DATE, PROC_COLUMN
from ludwig.features.base_feature import BaseFeatureMixin, InputFeature
from ludwig.schema.features.date_feature import DateInputFeatureConfig
from ludwig.types import FeatureConfigDict, FeatureMetadataDict, PreprocessingConfigDict, TrainingSetMetadataDict
from ludwig.utils.date_utils import create_vector_from_datetime_obj
from ludwig.utils.date_utils import create_vector_from_datetime_obj, parse_datetime
from ludwig.utils.types import DataFrame, TorchscriptPreprocessingInput

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -63,17 +62,20 @@ def get_feature_meta(
return {"preprocessing": preprocessing_parameters}

@staticmethod
def date_to_list(date_str, datetime_format, preprocessing_parameters):
def date_to_list(date_value, datetime_format, preprocessing_parameters):
try:
if isinstance(date_str, datetime):
datetime_obj = date_str
elif datetime_format is not None:
datetime_obj = datetime.strptime(date_str, datetime_format)
if isinstance(date_value, datetime):
datetime_obj = date_value
elif isinstance(date_value, str) and datetime_format is not None:
try:
datetime_obj = datetime.strptime(date_value, datetime_format)
except ValueError:
datetime_obj = parse_datetime(date_value)
else:
datetime_obj = parse(date_str)
datetime_obj = parse_datetime(date_value)
except Exception as e:
logger.error(
f"Error parsing date: '{date_str}' with error '{e}' "
f"Error parsing date: '{date_value}' with error '{e}' "
"Please provide a datetime format that parses it "
"in the preprocessing section of the date feature "
"in the config. "
Expand All @@ -83,7 +85,7 @@ def date_to_list(date_str, datetime_format, preprocessing_parameters):
)
fill_value = preprocessing_parameters["fill_value"]
if fill_value != "":
datetime_obj = parse(fill_value)
datetime_obj = parse_datetime(fill_value)
else:
datetime_obj = datetime.now()

Expand Down
60 changes: 59 additions & 1 deletion ludwig/utils/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from datetime import date
import time
from datetime import date, datetime
from typing import Union

import numpy as np
from dateutil.parser import parse, ParserError

from ludwig.api_annotations import DeveloperAPI

# Order of magnitude (floor of log10) of the current POSIX time in seconds, evaluated once when this module is imported.
# `convert_number_to_datetime` compares a timestamp's own order of magnitude against this value to decide whether the
# timestamp has to be rescaled to seconds.
SCALE_S = np.floor(np.log10(time.time()))


@DeveloperAPI
def create_vector_from_datetime_obj(datetime_obj):
Expand All @@ -36,3 +43,54 @@ def create_vector_from_datetime_obj(datetime_obj):
datetime_obj.second,
second_of_day,
]


@DeveloperAPI
def parse_datetime(timestamp: Union[float, int, str]) -> datetime:
    """Build a `datetime` from a date string or a numeric timestamp.

    The value is first handed to `dateutil.parser.parse`. If that raises `OverflowError`, `ParserError` or
    `TypeError`, the value is treated as a numeric timestamp and passed to `convert_number_to_datetime` instead.

    Args:
        timestamp: A datetime string or numeric timestamp.

    Returns:
        A datetime representation of `timestamp`.
    """
    try:
        return parse(timestamp)
    except (OverflowError, ParserError, TypeError):
        # Not a parseable date string: fall back to interpreting the value as a number.
        return convert_number_to_datetime(timestamp)


@DeveloperAPI
def convert_number_to_datetime(timestamp: Union[float, int, str]) -> datetime:
    """Convert a numeric timestamp to a datetime object.

    `datetime` objects can be created from POSIX timestamps like those returned by `time.time()`. A timestamp whose
    order of magnitude is larger than that of the current time in seconds (`SCALE_S`) is assumed to be expressed in a
    finer unit (e.g. milliseconds) and is divided by the corresponding power of ten before conversion.

    Args:
        timestamp: A numeric timestamp, or a string containing one.

    Returns:
        A naive datetime holding the UTC representation of `timestamp`.

    Raises:
        ValueError: Raised if `timestamp` is not a number or not a valid datetime.
    """
    try:
        timestamp = float(timestamp)
    except TypeError as e:
        # `float` already raises `ValueError` for unparseable strings; normalize unsupported types (None, list,
        # datetime, ...) to `ValueError` as well so callers only have to handle one exception type.
        raise ValueError(f"Provided value {timestamp} is not a valid numeric timestamp") from e

    # NaN and +/-inf can never be turned into a datetime; reject them before doing any arithmetic on them.
    if not np.isfinite(timestamp):
        raise ValueError(f"Provided value {timestamp} is not a valid numeric timestamp")

    # Determine the unit of the timestamp. The logarithm is only defined for positive values, so zero and negative
    # timestamps are left untouched (this also avoids NumPy runtime warnings from `log10`).
    if timestamp > 0:
        ts_scale = np.floor(np.log10(timestamp))

        # `datetime.utcfromtimestamp` expects a timestamp in seconds. Rescale the timestamp if it is not in seconds.
        if SCALE_S < ts_scale:
            timestamp = timestamp / np.power(10, ts_scale - SCALE_S)

    # Convert the timestamp to a datetime object. Out-of-range values may surface as `OverflowError` or `OSError`
    # depending on the platform; report them as `ValueError` to honor the documented contract.
    try:
        return datetime.utcfromtimestamp(timestamp)
    except (OverflowError, OSError) as e:
        raise ValueError(f"Provided value {timestamp} is not a valid numeric timestamp") from e
104 changes: 104 additions & 0 deletions tests/integration_tests/test_date_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import datetime
import time

import pandas as pd
import pytest
from dateutil.parser import parse

from ludwig.api import LudwigModel
from ludwig.constants import (
BACKEND,
BINARY,
DATE,
EPOCHS,
FILL_WITH_CONST,
INPUT_FEATURES,
MISSING_VALUE_STRATEGY,
NAME,
OUTPUT_FEATURES,
PREPROCESSING,
RAY,
TRAINER,
TYPE,
)
from ludwig.utils.date_utils import create_vector_from_datetime_obj

ray = pytest.importorskip("ray")

pytestmark = [
pytest.mark.distributed,
]


@pytest.fixture(scope="module")
def string_date_df() -> "pd.DataFrame":
    """Frame of 100 rows: dates as `str(datetime.datetime.now())` strings plus an alternating 0/1 column."""
    num_rows = 100
    columns = {
        "date_feature": [str(datetime.datetime.now()) for _ in range(num_rows)],
        "binary_feature": [row % 2 for row in range(num_rows)],
    }
    return pd.DataFrame.from_dict(columns)


@pytest.fixture(scope="module")
def int_date_df() -> "pd.DataFrame":
    """Frame of 100 rows: dates as integer `time.time_ns()` readings plus an alternating 0/1 column."""
    num_rows = 100
    columns = {
        "date_feature": [time.time_ns() for _ in range(num_rows)],
        "binary_feature": [row % 2 for row in range(num_rows)],
    }
    return pd.DataFrame.from_dict(columns)


@pytest.fixture(scope="module")
def float_date_df() -> "pd.DataFrame":
    """Frame of 100 rows: dates as float `time.time()` readings plus an alternating 0/1 column."""
    num_rows = 100
    columns = {
        "date_feature": [time.time() for _ in range(num_rows)],
        "binary_feature": [row % 2 for row in range(num_rows)],
    }
    return pd.DataFrame.from_dict(columns)


@pytest.mark.parametrize(
    "date_df",
    [pytest.param(f"{kind}_date_df", id=f"{kind}_date") for kind in ("string", "int", "float")],
)
def test_date_feature_formats(date_df, request, ray_cluster_2cpu):
    """Preprocess string, int and float date columns and check that no row fell back to the fill value.

    Args:
        date_df: Name of the fixture that provides the input dataframe.
        request: pytest request fixture, used to resolve `date_df` by name.
        ray_cluster_2cpu: Fixture requested by name only; its value is not used in the test body.
    """
    df = request.getfixturevalue(date_df)

    fill_date = "1970-01-01 00:00:00"
    date_input = {
        NAME: "date_feature",
        TYPE: DATE,
        PREPROCESSING: {MISSING_VALUE_STRATEGY: FILL_WITH_CONST, "fill_value": fill_date},
    }
    binary_output = {NAME: "binary_feature", TYPE: BINARY}
    config = {
        INPUT_FEATURES: [date_input],
        OUTPUT_FEATURES: [binary_output],
        TRAINER: {EPOCHS: 2},
        BACKEND: {TYPE: RAY, "processor": {TYPE: "dask"}},
    }

    fill_value = create_vector_from_datetime_obj(parse("1970-01-01 00:00:00"))

    model = LudwigModel(config)
    preprocessed = model.preprocess(df)

    # Because parsing errors are suppressed, we want to ensure that the data was preprocessed correctly. Sample data is
    # drawn from the current time, so the recorded years should not match the fill value's year.
    for split_name in ("training_set", "validation_set", "test_set"):
        split = getattr(preprocessed, split_name)
        for date in split.to_df().compute().iloc[:, 0].values:
            assert date[0] != fill_value[0]
69 changes: 68 additions & 1 deletion tests/ludwig/features/test_date_feature.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
from copy import deepcopy
from datetime import datetime
from typing import Any, List

import pytest
import torch
from dateutil.parser import parse

from ludwig.constants import ENCODER_OUTPUT
from ludwig.constants import ENCODER_OUTPUT, FILL_WITH_CONST, MISSING_VALUE_STRATEGY
from ludwig.features import date_feature
from ludwig.features.date_feature import DateInputFeature
from ludwig.schema.features.date_feature import DateInputFeatureConfig
from ludwig.schema.utils import load_config_with_kwargs
from ludwig.types import FeatureConfigDict
from ludwig.utils.date_utils import create_vector_from_datetime_obj
from ludwig.utils.misc_utils import merge_dict
from ludwig.utils.torch_utils import get_torch_device

Expand Down Expand Up @@ -59,6 +62,70 @@ def test_date_to_list(date_str, datetime_format, expected_list):
)


@pytest.fixture(scope="module")
def reference_date_list() -> List[int]:
    """Date vector for the reference POSIX timestamp 1691600953.443032, interpreted as UTC."""
    reference_dt = datetime.utcfromtimestamp(1691600953.443032)
    return create_vector_from_datetime_obj(reference_dt)


@pytest.fixture(scope="module")
def fill_value() -> str:
    """Constant date string used as the fallback fill value in these tests."""
    fallback_date = "1970-01-01 00:00:00"
    return fallback_date


@pytest.fixture(scope="module")
def fill_value_list(fill_value: str) -> List[int]:
    """Date vector obtained by parsing the `fill_value` fixture."""
    fill_dt = parse(fill_value)
    return create_vector_from_datetime_obj(fill_dt)


@pytest.mark.parametrize(
    "timestamp,datetime_format,expected_list",
    [
        pytest.param(1691600953.443032, None, "reference_date_list", id="float-s"),
        pytest.param(1691600953443.032, None, "reference_date_list", id="float-ms"),
        pytest.param(1691600953, None, "reference_date_list", id="int-s"),
        pytest.param(1691600953443, None, "reference_date_list", id="int-ms"),
        pytest.param(1691600953.443032, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="float-s-fmt"),
        pytest.param(1691600953443.032, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="float-ms-fmt"),
        pytest.param(1691600953, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="int-s-fmt"),
        pytest.param(1691600953443, "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="int-ms-fmt"),
        pytest.param("1691600953.443032", None, "reference_date_list", id="string[float]-s"),
        pytest.param("1691600953443.0032", None, "reference_date_list", id="string[float]-ms"),
        pytest.param("1691600953", None, "reference_date_list", id="string[int]-s"),
        pytest.param("1691600953443", None, "reference_date_list", id="string[int]-ms"),
        pytest.param("1691600953.443032", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[float]-s-fmt"),
        pytest.param("1691600953443.0032", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[float]-ms-fmt"),
        pytest.param("1691600953", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[int]-s-fmt"),
        pytest.param("1691600953443", "%d/%m/%y %H:%M:%S.%f", "reference_date_list", id="string[int]-ms-fmt"),
        pytest.param("foo", None, "fill_value_list", id="string error"),
        pytest.param([1691600953.443032], None, "fill_value_list", id="list error"),
        pytest.param(None, None, "fill_value_list", id="NoneType error"),
    ],
)
def test_date_to_list_numeric(timestamp: Any, datetime_format: str, expected_list: List[int], fill_value: str, request):
    """Check that numeric timestamps are turned into the expected date vectors.

    Int, float, and string representations of POSIX timestamps in seconds and milliseconds are covered. Valid
    timestamps are expected to yield the same vector that `ludwig.utils.date_utils.create_vector_from_datetime_obj`
    produces for the reference date, regardless of any format string that is passed along. Inputs that cannot be
    parsed are expected to yield the vector of the fill value.

    Args:
        timestamp: Input to be converted to a date vector.
        datetime_format: Optional format string; expected to have no effect for these timestamps.
        expected_list: Name of the fixture holding the expected output of `DateInputFeature.date_to_list`.
        fill_value: Date to be used as fallback.
        request: pytest request fixture, used to resolve `expected_list` by name.
    """
    wanted = request.getfixturevalue(expected_list)

    # Pin the fallback to a constant; the default fill value is `datetime.now`, which cannot be compared against.
    params = {MISSING_VALUE_STRATEGY: FILL_WITH_CONST, "fill_value": fill_value}

    # A parsing error must never escape `date_to_list`: the expected behavior is to fall back to the fill value.
    actual = date_feature.DateInputFeature.date_to_list(timestamp, datetime_format, params)
    assert actual == wanted


def test_date_to_list__DatetimeObjectFromParsedJSON():
preprocessing_parameters = None
datetime_obj = datetime.fromisoformat("2022-06-25")
Expand Down
49 changes: 49 additions & 0 deletions tests/ludwig/utils/test_date_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import datetime
from contextlib import nullcontext as does_not_raise
from typing import Any, ContextManager

import pytest

from ludwig.utils.date_utils import convert_number_to_datetime


@pytest.fixture(scope="module")
def reference_datetime() -> datetime.datetime:
    """Datetime for the reference POSIX timestamp 1691600953.443032, interpreted as UTC."""
    reference_ts = 1691600953.443032
    return datetime.datetime.utcfromtimestamp(reference_ts)


@pytest.mark.parametrize(
    "timestamp,raises",
    [
        pytest.param(1691600953.443032, does_not_raise(), id="float-s"),
        pytest.param(1691600953443.032, does_not_raise(), id="float-ms"),
        pytest.param(1691600953, does_not_raise(), id="int-s"),
        pytest.param(1691600953443, does_not_raise(), id="int-ms"),
        pytest.param("1691600953.443032", does_not_raise(), id="string[float]-s"),
        pytest.param("1691600953443.0032", does_not_raise(), id="string[float]-ms"),
        pytest.param("1691600953", does_not_raise(), id="string[int]-s"),
        pytest.param("1691600953443", does_not_raise(), id="string[int]-ms"),
        pytest.param("foo", pytest.raises(ValueError), id="string error"),
        pytest.param([1691600953.443032], pytest.raises(ValueError), id="list error"),
        pytest.param(datetime.datetime(2023, 8, 9, 13, 9, 13), pytest.raises(ValueError), id="datetime error"),
        pytest.param(None, pytest.raises(ValueError), id="NoneType error"),
    ],
)
def test_convert_number_to_datetime(reference_datetime: datetime.datetime, timestamp: Any, raises: ContextManager):
    """Verify the conversion of numeric timestamps, and the rejection of everything else.

    Args:
        reference_datetime: A datetime object with the expected date/time.
        timestamp: The value to convert; valid cases are in seconds or milliseconds.
        raises: Context manager stating whether the call is expected to raise.
    """
    with raises:
        converted = convert_number_to_datetime(timestamp)

        # Compare down to whole seconds only; sub-second precision is not checked.
        for field in ("year", "month", "day", "hour", "minute", "second"):
            assert getattr(converted, field) == getattr(reference_datetime, field)