Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stats for datetimes #3007

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions libs/libcommon/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ module = [
"moto.*",
"aiobotocore.*",
"requests.*",
"dateutil.*"
]
# ^ huggingface_hub is not typed since version 0.13.0
ignore_missing_imports = true
Expand Down
13 changes: 13 additions & 0 deletions libs/libcommon/src/libcommon/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import orjson
import pandas as pd
import pytz
from dateutil import parser
from huggingface_hub import constants, hf_hub_download
from requests.exceptions import ReadTimeout

Expand Down Expand Up @@ -93,6 +94,18 @@ def get_datetime(days: Optional[float] = None) -> datetime:
return date


def is_datetime(string: str) -> bool:
    """Check if a string can be parsed as a datetime by `dateutil`.

    Args:
        string: the candidate value, e.g. "2024-01-31 12:00:00".

    Returns:
        `True` if `dateutil.parser` accepts the string, `False` otherwise.
    """
    try:
        parser.parse(string)
        return True
    except (ValueError, OverflowError):
        # dateutil raises ParserError (a ValueError subclass) for unparsable
        # strings, and OverflowError when parsed components exceed the
        # platform's integer/datetime limits — both mean "not a datetime" here.
        return False


def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
    """Format a datetime as a string.

    Args:
        dt: the datetime to render.
        format: a `strftime`-style format string; the default renders e.g.
            "2024-01-31 12:00:00+0000" (the offset part is empty for naive
            datetimes).

    Returns:
        The formatted string.
    """
    # datetime.__format__ delegates to strftime for non-empty format specs
    return f"{dt:{format}}"


def get_duration(started_at: datetime) -> float:
"""
Get time in seconds that has passed from `started_at` until now.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
AudioColumn,
BoolColumn,
ClassLabelColumn,
DatetimeColumn,
FloatColumn,
ImageColumn,
IntColumn,
Expand All @@ -57,7 +58,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):


SupportedColumns = Union[
ClassLabelColumn, IntColumn, FloatColumn, StringColumn, BoolColumn, ListColumn, AudioColumn, ImageColumn
ClassLabelColumn,
IntColumn,
FloatColumn,
StringColumn,
BoolColumn,
ListColumn,
AudioColumn,
ImageColumn,
DatetimeColumn,
]


Expand Down Expand Up @@ -215,29 +224,34 @@ def _column_from_feature(
return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if isinstance(dataset_feature, dict):
if dataset_feature.get("_type") == "ClassLabel":
_type = dataset_feature.get("_type")
if _type == "ClassLabel":
return ClassLabelColumn(
feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
)

if dataset_feature.get("_type") == "Audio":
if _type == "Audio":
return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Image":
if _type == "Image":
return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("_type") == "Value":
if dataset_feature.get("dtype") in INTEGER_DTYPES:
if _type == "Value":
dtype = dataset_feature.get("dtype", "")
if dtype in INTEGER_DTYPES:
return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in FLOAT_DTYPES:
if dtype in FLOAT_DTYPES:
return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") in STRING_DTYPES:
if dtype in STRING_DTYPES:
return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dataset_feature.get("dtype") == "bool":
if dtype == "bool":
return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)

if dtype.startswith("timestamp"):
return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
return None

columns: list[SupportedColumns] = []
Expand All @@ -249,7 +263,7 @@ def _column_from_feature(
if not columns:
raise NoSupportedFeaturesError(
"No columns for statistics computation found. Currently supported feature types are: "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
)

column_names_str = ", ".join([column.name for column in columns])
Expand Down
130 changes: 125 additions & 5 deletions services/worker/src/worker/statistics_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.
import datetime
import enum
import io
import logging
Expand All @@ -14,6 +15,7 @@
from libcommon.exceptions import (
StatisticsComputationError,
)
from libcommon.utils import datetime_to_string, is_datetime
from PIL import Image
from tqdm.contrib.concurrent import thread_map

Expand Down Expand Up @@ -50,24 +52,41 @@ class ColumnType(str, enum.Enum):
STRING_TEXT = "string_text"
AUDIO = "audio"
IMAGE = "image"
DATETIME = "datetime"


class Histogram(TypedDict):
    """Histogram of a numerical column."""

    # number of values falling into each bin
    hist: list[int]
    # numeric bin boundaries (presumably one more edge than bins — TODO confirm)
    bin_edges: list[Union[int, float]]


class DatetimeHistogram(TypedDict):
    """Histogram of a datetime column."""

    # number of values falling into each bin
    hist: list[int]
    bin_edges: list[str]  # edges are string representations of dates


class NumericalStatisticsItem(TypedDict):
    """Statistics for a numerical (int or float) column.

    NOTE(review): the diff residue declared `min`/`max` twice (old
    `Optional[float]` lines next to the new `Optional[Union[int, float]]`
    ones); only the widened declarations are kept.
    """

    # number and proportion of null values in the column
    nan_count: int
    nan_proportion: float
    min: Optional[Union[int, float]]  # might be None in very rare cases when the whole column is only None values
    max: Optional[Union[int, float]]
    mean: Optional[float]
    median: Optional[float]
    std: Optional[float]
    histogram: Optional[Histogram]


class DatetimeStatisticsItem(TypedDict):
    """Statistics for a datetime column; all datetime values are rendered as strings."""

    # number and proportion of null values in the column
    nan_count: int
    nan_proportion: float
    min: Optional[str]  # might be None in very rare cases when the whole column is only None values
    max: Optional[str]
    mean: Optional[str]
    median: Optional[str]
    std: Optional[str]  # string representation of timedelta
    histogram: Optional[DatetimeHistogram]


class CategoricalStatisticsItem(TypedDict):
nan_count: int
nan_proportion: float
Expand All @@ -83,7 +102,9 @@ class BoolStatisticsItem(TypedDict):
frequencies: dict[str, int]


SupportedStatistics = Union[NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem]
SupportedStatistics = Union[
NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
]


class StatisticsPerColumnItem(TypedDict):
Expand Down Expand Up @@ -456,6 +477,13 @@ def is_class(n_unique: int, n_samples: int) -> bool:
n_unique / n_samples <= MAX_PROPORTION_STRING_LABELS and n_unique <= MAX_NUM_STRING_LABELS
) or n_unique <= NUM_BINS

@staticmethod
def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
    """Check if first 1000 non-null samples in a column match datetime format."""
    # drop nulls on the series directly, then sample at most 1000 values
    sample = data[column_name].drop_nulls().head(1000).to_list()
    # module-level is_datetime (from libcommon.utils), not this staticmethod;
    # note: an all-null column yields an empty sample, and all() is then True
    return all(is_datetime(candidate) for candidate in sample)

@classmethod
def compute_transformed_data(
cls,
Expand All @@ -473,7 +501,7 @@ def _compute_statistics(
data: pl.DataFrame,
column_name: str,
n_samples: int,
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem]:
) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
n_unique = data[column_name].n_unique()
if cls.is_class(n_unique, n_samples):
Expand All @@ -489,6 +517,13 @@ def _compute_statistics(
n_unique=len(labels2counts),
frequencies=labels2counts,
)
if cls.is_datetime(data, column_name):
datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
data.select(pl.col(column_name).cast(pl.Datetime)),
column_name=column_name,
n_samples=n_samples,
)
return datetime_stats

lengths_column_name = f"{column_name}_len"
lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
Expand All @@ -499,7 +534,12 @@ def _compute_statistics(

def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
    """Compute string-column statistics and wrap them in a response item.

    The reported column type is derived from the computed statistics:
    - "frequencies" present -> the strings were treated as class labels;
    - string-typed histogram bin edges -> the strings were parsed as datetimes;
    - otherwise -> free text.
    """
    stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
    if "frequencies" in stats:
        string_type = ColumnType.STRING_LABEL
    else:
        # BUGFIX: TypedDict classes (like DatetimeHistogram) raise TypeError on
        # isinstance() checks at runtime, so detect the datetime case by its
        # string bin edges instead.
        histogram = stats.get("histogram")
        if histogram is not None and any(isinstance(edge, str) for edge in histogram["bin_edges"]):
            string_type = ColumnType.DATETIME
        else:
            string_type = ColumnType.STRING_TEXT
    return StatisticsPerColumnItem(
        column_name=self.name,
        column_type=string_type,
        column_statistics=stats,
    )
Expand Down Expand Up @@ -699,3 +739,83 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
@classmethod
def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
    """Reduce an image example to the scalar used for statistics: its width."""
    width = cls.get_width(example)
    return width


class DatetimeColumn(Column):
    """Statistics computation for datetime columns.

    Values are transformed into "seconds elapsed since the column minimum",
    numerical statistics are computed on those seconds via `transform_column`,
    and the results are converted back into datetime strings.
    """

    # numerical statistics of the seconds-since-min values are delegated to
    # this column type
    transform_column = IntColumn

    @classmethod
    def compute_transformed_data(
        cls,
        data: pl.DataFrame,
        column_name: str,
        transformed_column_name: str,
        min_date: datetime.datetime,
    ) -> pl.DataFrame:
        """Return a one-column frame with the seconds elapsed from `min_date`.

        Nulls propagate through the subtraction, so null datetimes stay null.
        """
        return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))

    @staticmethod
    def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
        """Convert a "seconds since `base_date`" statistic back to a datetime string."""
        return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))

    @classmethod
    def _compute_statistics(
        cls,
        data: pl.DataFrame,
        column_name: str,
        n_samples: int,
    ) -> DatetimeStatisticsItem:
        """Compute nan counts, min/max/mean/median/std and a histogram.

        All datetime statistics are returned as strings; `std` is the string
        representation of a timedelta.
        """
        nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
        if nan_count == n_samples:  # all values are None
            return DatetimeStatisticsItem(
                nan_count=n_samples,
                nan_proportion=1.0,
                min=None,
                max=None,
                mean=None,
                median=None,
                std=None,
                histogram=None,
            )

        # at least one non-null value exists here, so .min() is not None
        min_date: datetime.datetime = data[column_name].min()  # type: ignore # mypy infers type of datetime column .min() incorrectly
        timedelta_column_name = f"{column_name}_timedelta"
        # compute distribution of time passed from min date in **seconds**
        timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
        timedelta_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
            timedelta_df,
            column_name=timedelta_column_name,
            n_samples=n_samples,
        )
        # to assure mypy that these values are not None to pass to conversion functions:
        assert timedelta_stats["histogram"] is not None  # nosec
        assert timedelta_stats["max"] is not None  # nosec
        assert timedelta_stats["mean"] is not None  # nosec
        assert timedelta_stats["median"] is not None  # nosec
        assert timedelta_stats["std"] is not None  # nosec

        # map the seconds-based bin edges back to datetime strings
        datetime_bin_edges = [
            cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
        ]

        return DatetimeStatisticsItem(
            nan_count=nan_count,
            nan_proportion=nan_proportion,
            min=datetime_to_string(min_date),
            max=cls.shift_and_convert_to_string(min_date, timedelta_stats["max"]),
            mean=cls.shift_and_convert_to_string(min_date, timedelta_stats["mean"]),
            median=cls.shift_and_convert_to_string(min_date, timedelta_stats["median"]),
            std=str(datetime.timedelta(seconds=timedelta_stats["std"])),
            histogram=DatetimeHistogram(
                hist=timedelta_stats["histogram"]["hist"],
                bin_edges=datetime_bin_edges,
            ),
        )

    def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
        """Compute the column's statistics and wrap them in a DATETIME response item."""
        stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
        return StatisticsPerColumnItem(
            column_name=self.name,
            column_type=ColumnType.DATETIME,
            column_statistics=stats,
        )
2 changes: 2 additions & 0 deletions services/worker/tests/fixtures/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from .statistics_dataset import (
audio_dataset,
datetime_dataset,
image_dataset,
null_column,
statistics_dataset,
Expand Down Expand Up @@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
"descriptive_statistics_not_supported": statistics_not_supported_dataset,
"audio_statistics": audio_dataset,
"image_statistics": image_dataset,
"datetime_statistics": datetime_dataset,
}
20 changes: 20 additions & 0 deletions services/worker/tests/fixtures/hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
delete_hub_dataset_repo(repo_id=repo_id)


@pytest.fixture(scope="session")
def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
    """Session-scoped Hub repo containing the datetime statistics dataset."""
    dataset_repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
    yield dataset_repo_id
    # teardown: remove the repo once the session is over
    delete_hub_dataset_repo(repo_id=dataset_repo_id)


@pytest.fixture(scope="session")
def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
default_config_name, _ = get_default_config_split()
Expand Down Expand Up @@ -1177,6 +1184,19 @@ def hub_responses_image_statistics(
}


@pytest.fixture
def hub_responses_datetime_statistics(
    hub_public_datetime_statistics: str,
) -> HubDatasetTest:
    """Canned hub responses for the datetime statistics dataset repo."""
    repo_id = hub_public_datetime_statistics
    responses: HubDatasetTest = {
        "name": repo_id,
        "config_names_response": create_config_names_response(repo_id),
        "splits_response": create_splits_response(repo_id),
        "first_rows_response": None,
        "parquet_and_info_response": None,
    }
    return responses


@pytest.fixture
def hub_responses_descriptive_statistics_parquet_builder(
hub_public_descriptive_statistics_parquet_builder: str,
Expand Down
Loading
Loading