From 79790e0ed3cded901a5e22ad7adeaac0ce9e3d67 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Wed, 31 Jul 2024 14:37:42 +0200
Subject: [PATCH 01/10] compute stats for datetimes

---
 .../worker/src/worker/statistics_utils.py     | 109 +++++++++++++++++-
 1 file changed, 107 insertions(+), 2 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index f2651bb091..ccb28ace6b 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
+import datetime
 import enum
 import io
 import logging
@@ -50,11 +51,12 @@ class ColumnType(str, enum.Enum):
     STRING_TEXT = "string_text"
     AUDIO = "audio"
     IMAGE = "image"
+    DATETIME = "datetime"
 
 
 class Histogram(TypedDict):
     hist: list[int]
-    bin_edges: list[Union[int, float]]
+    bin_edges: list[Union[int, float, str]]
 
 
 class NumericalStatisticsItem(TypedDict):
@@ -68,6 +70,17 @@ class NumericalStatisticsItem(TypedDict):
     histogram: Optional[Histogram]
 
 
+class DatetimeStatisticsItem(TypedDict):
+    nan_count: int
+    nan_proportion: float
+    min: Optional[str]  # might be None in very rare cases when the whole column is only None values
+    max: Optional[str]
+    mean: Optional[str]
+    median: Optional[str]
+    std: Optional[str]  # string representation of timedelta
+    histogram: Optional[Histogram]
+
+
 class CategoricalStatisticsItem(TypedDict):
     nan_count: int
     nan_proportion: float
@@ -83,7 +96,9 @@ class BoolStatisticsItem(TypedDict):
     frequencies: dict[str, int]
 
 
-SupportedStatistics = Union[NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem]
+SupportedStatistics = Union[
+    NumericalStatisticsItem, CategoricalStatisticsItem, BoolStatisticsItem, DatetimeStatisticsItem
+]
 
 
 class StatisticsPerColumnItem(TypedDict):
@@ -699,3 +714,93 @@ def get_shape(example: Optional[Union[bytes, dict[str, Any]]]) -> Union[tuple[No
     @classmethod
     def transform(cls, example: Optional[Union[bytes, dict[str, Any]]]) -> Optional[int]:
         return cls.get_width(example)
+
+
+class DatetimeColumn(Column):
+    transform_column = IntColumn
+
+    @classmethod
+    def compute_transformed_data(
+        cls,
+        data: pl.DataFrame,
+        column_name: str,
+        transformed_column_name: str,
+        min_date: datetime.datetime,
+    ) -> pl.DataFrame:
+        return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
+
+    @staticmethod
+    def shift_and_convert_to_string(min_date, seconds) -> str:
+        return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+
+    @classmethod
+    def _compute_statistics(
+        cls,
+        data: pl.DataFrame,
+        column_name: str,
+        n_samples: int,
+    ) -> DatetimeStatisticsItem:
+        nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
+        if nan_count == n_samples:  # all values are None
+            return DatetimeStatisticsItem(
+                nan_count=n_samples,
+                nan_proportion=1.0,
+                min=None,
+                max=None,
+                mean=None,
+                median=None,
+                std=None,
+                histogram=None,
+            )
+
+        min_date = data[column_name].min()
+        timedelta_column_name = f"{column_name}_timedelta"
+        # compute distribution of time passed from min date in **seconds**
+        timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
+        timedelta_stats: NumericalStatisticsItem = cls.transform_column.compute_statistics(
+            timedelta_df,
+            column_name=timedelta_column_name,
+            n_samples=n_samples,
+        )
+        for stat in ("max", "mean", "median"):
+            timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
+
+        bin_edges = [
+            cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
+        ]
+
+        return DatetimeStatisticsItem(
+            nan_count=nan_count,
+            nan_proportion=nan_proportion,
+            min=datetime_to_string(min_date),
+            max=timedelta_stats["max"],
+            mean=timedelta_stats["mean"],
+            median=timedelta_stats["median"],
+            std=str(timedelta_stats["std"]),
+            histogram=Histogram(
+                hist=timedelta_stats["histogram"]["hist"],
+                bin_edges=bin_edges,
+            ),
+        )
+
+    def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
+        stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
+        return StatisticsPerColumnItem(
+            column_name=self.name,
+            column_type=ColumnType.DATETIME,
+            column_statistics=stats,
+        )
+
+
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+    """
+    Convert a datetime.datetime object to a string.
+
+    Args:
+        dt (datetime): The datetime object to convert.
+        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+
+    Returns:
+        str: The datetime object as a string.
+    """
+    return dt.strftime(format)

From 851ec1b434a586e92e04de90e3ad4967ca674bc2 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Wed, 31 Jul 2024 16:42:59 +0200
Subject: [PATCH 02/10] fix typing

---
 .../worker/src/worker/statistics_utils.py     | 43 +++++++++++--------
 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index ccb28ace6b..dfd2599164 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -56,14 +56,19 @@ class ColumnType(str, enum.Enum):
 
 class Histogram(TypedDict):
     hist: list[int]
-    bin_edges: list[Union[int, float, str]]
+    bin_edges: list[Union[int, float]]
+
+
+class DatetimeHistogram(TypedDict):
+    hist: list[int]
+    bin_edges: list[str]  # edges are string representations of dates
 
 
 class NumericalStatisticsItem(TypedDict):
     nan_count: int
     nan_proportion: float
-    min: Optional[float]  # might be None in very rare cases when the whole column is only None values
-    max: Optional[float]
+    min: Optional[Union[int, float]]  # might be None in very rare cases when the whole column is only None values
+    max: Optional[Union[int, float]]
     mean: Optional[float]
     median: Optional[float]
     std: Optional[float]
@@ -78,7 +83,7 @@ class DatetimeStatisticsItem(TypedDict):
     mean: Optional[str]
     median: Optional[str]
     std: Optional[str]  # string representation of timedelta
-    histogram: Optional[Histogram]
+    histogram: Optional[DatetimeHistogram]
 
 
 class CategoricalStatisticsItem(TypedDict):
@@ -730,8 +735,8 @@ def compute_transformed_data(
         return data.select((pl.col(column_name) - min_date).dt.total_seconds().alias(transformed_column_name))
 
     @staticmethod
-    def shift_and_convert_to_string(min_date, seconds) -> str:
-        return datetime_to_string(min_date + datetime.timedelta(seconds=seconds))
+    def shift_and_convert_to_string(base_date: datetime.datetime, seconds: Union[int, float]) -> str:
+        return datetime_to_string(base_date + datetime.timedelta(seconds=seconds))
 
     @classmethod
     def _compute_statistics(
@@ -753,7 +758,7 @@ def _compute_statistics(
                 histogram=None,
             )
 
-        min_date = data[column_name].min()
+        min_date: datetime.datetime = data[column_name].min()  # type: ignore   # mypy infers type of datetime column .min() incorrectly
         timedelta_column_name = f"{column_name}_timedelta"
         # compute distribution of time passed from min date in **seconds**
         timedelta_df = cls.compute_transformed_data(data, column_name, timedelta_column_name, min_date)
@@ -762,10 +767,14 @@ def _compute_statistics(
             column_name=timedelta_column_name,
             n_samples=n_samples,
         )
-        for stat in ("max", "mean", "median"):
-            timedelta_stats[stat] = cls.shift_and_convert_to_string(min_date, timedelta_stats[stat])
-
-        bin_edges = [
+        # to assure mypy that there values are not None to pass to conversion functions:
+        assert timedelta_stats["histogram"] is not None
+        assert timedelta_stats["max"] is not None
+        assert timedelta_stats["mean"] is not None
+        assert timedelta_stats["median"] is not None
+        assert timedelta_stats["std"] is not None
+
+        datetime_bin_edges = [
             cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]
         ]
 
@@ -773,13 +782,13 @@ def _compute_statistics(
             nan_count=nan_count,
             nan_proportion=nan_proportion,
             min=datetime_to_string(min_date),
-            max=timedelta_stats["max"],
-            mean=timedelta_stats["mean"],
-            median=timedelta_stats["median"],
-            std=str(timedelta_stats["std"]),
-            histogram=Histogram(
+            max=cls.shift_and_convert_to_string(min_date, timedelta_stats["max"]),
+            mean=cls.shift_and_convert_to_string(min_date, timedelta_stats["mean"]),
+            median=cls.shift_and_convert_to_string(min_date, timedelta_stats["median"]),
+            std=str(datetime.timedelta(seconds=timedelta_stats["std"])),
+            histogram=DatetimeHistogram(
                 hist=timedelta_stats["histogram"]["hist"],
-                bin_edges=bin_edges,
+                bin_edges=datetime_bin_edges,
             ),
         )
 

From 3347c134fa2d062a9d4e4844f14118758d428838 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Thu, 1 Aug 2024 17:11:12 +0200
Subject: [PATCH 03/10] add testcase

---
 services/worker/tests/fixtures/datasets.py    |  2 +
 .../tests/fixtures/statistics_dataset.py      | 25 ++++++++
 .../worker/tests/test_statistics_utils.py     | 57 ++++++++++++++++++-
 3 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/services/worker/tests/fixtures/datasets.py b/services/worker/tests/fixtures/datasets.py
index 77e41e2ae4..2b471a9861 100644
--- a/services/worker/tests/fixtures/datasets.py
+++ b/services/worker/tests/fixtures/datasets.py
@@ -28,6 +28,7 @@
 
 from .statistics_dataset import (
     audio_dataset,
+    datetime_dataset,
     image_dataset,
     null_column,
     statistics_dataset,
@@ -238,4 +239,5 @@ def datasets() -> Mapping[str, Dataset]:
         "descriptive_statistics_not_supported": statistics_not_supported_dataset,
         "audio_statistics": audio_dataset,
         "image_statistics": image_dataset,
+        "datetime_statistics": datetime_dataset,
     }
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index f32e404131..7d60fd100c 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
 
+from datetime import datetime
 from pathlib import Path
 from typing import Optional
 
@@ -1698,3 +1699,27 @@ def null_column(n_samples: int) -> list[None]:
         }
     ),
 )
+
+
+datetime_dataset = Dataset.from_dict(
+    {
+        "datetime": [
+            datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-02 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-04 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-06 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-08 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        ]
+    },
+    features=Features(
+        {
+            "datetime": Value("timestamp[s]"),
+        }
+    ),
+)
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 80f41f317f..29abdfb3eb 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright 2024 The HuggingFace Authors.
+import datetime
 from collections.abc import Mapping
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -22,6 +23,7 @@
     BoolColumn,
     ClassLabelColumn,
     ColumnType,
+    DatetimeColumn,
     FloatColumn,
     ImageColumn,
     IntColumn,
@@ -470,3 +472,56 @@ def test_image_statistics(
         n_samples=4,
     )
     assert computed == expected
+
+
+def count_expected_statistics_for_datetime() -> dict[str, Any]:
+    seconds_in_day = 24 * 60 * 60
+    timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+    std = timedeltas.std()
+    std_str = str(datetime.timedelta(seconds=std))
+    std_str = std_str.split(".")[0]  # check precision up to seconds
+    return {
+        "nan_count": 0,
+        "nan_proportion": 0.0,
+        "min": "2024-01-01 00:00:00",
+        "max": "2024-01-11 00:00:00",
+        "mean": "2024-01-06 00:00:00",
+        "median": "2024-01-06 00:00:00",
+        "std": std_str,
+        "histogram": {
+            "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+            "bin_edges": [
+                "2024-01-01 00:00:00",
+                "2024-01-02 00:00:01",
+                "2024-01-03 00:00:02",
+                "2024-01-04 00:00:03",
+                "2024-01-05 00:00:04",
+                "2024-01-06 00:00:05",
+                "2024-01-07 00:00:06",
+                "2024-01-08 00:00:07",
+                "2024-01-09 00:00:08",
+                "2024-01-10 00:00:09",
+                "2024-01-11 00:00:00",
+            ],
+        },
+    }
+
+
+@pytest.mark.parametrize(
+    "column_name",
+    ["datetime_column"],
+)
+def test_datetime_statistics(
+    column_name: str,
+    datasets: Mapping[str, Dataset],
+) -> None:
+    column_name = "datetime"
+    expected = count_expected_statistics_for_datetime()
+    data = datasets["datetime_statistics"].to_pandas()
+    computed = DatetimeColumn.compute_statistics(
+        data=pl.from_pandas(data),
+        column_name=column_name,
+        n_samples=len(data[column_name]),
+    )
+    assert computed.pop("std").split(".")[0] == expected.pop("std")
+    assert computed == expected

From 0340b54c25bd47d2af51b4eaf553139c8023fe1b Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Mon, 5 Aug 2024 15:37:00 +0200
Subject: [PATCH 04/10] more tests: column with nulls and all-null column

---
 .../worker/src/worker/statistics_utils.py     |  4 +-
 .../tests/fixtures/statistics_dataset.py      | 18 +++-
 .../worker/tests/test_statistics_utils.py     | 89 +++++++++++++------
 3 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index dfd2599164..9b4a2302b2 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -801,13 +801,13 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
         )
 
 
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S") -> str:
+def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
     """
     Convert a datetime.datetime object to a string.
 
     Args:
         dt (datetime): The datetime object to convert.
-        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S".
+        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
 
     Returns:
         str: The datetime object as a string.
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index 7d60fd100c..c00c63afc5 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1715,11 +1715,27 @@ def null_column(n_samples: int) -> list[None]:
             datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
-        ]
+        ],
+        "datetime_null": [
+            datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-03 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-05 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-07 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-09 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            None,
+            datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        ],
+        "datetime_all_null": [None] * 11,
     },
     features=Features(
         {
             "datetime": Value("timestamp[s]"),
+            "datetime_null": Value("timestamp[s]"),
+            "datetime_all_null": Value("timestamp[s]"),
         }
     ),
 )
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 29abdfb3eb..84eee81448 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -2,7 +2,7 @@
 # Copyright 2024 The HuggingFace Authors.
 import datetime
 from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -474,54 +474,87 @@ def test_image_statistics(
     assert computed == expected
 
 
-def count_expected_statistics_for_datetime() -> dict[str, Any]:
+def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict:  # type: ignore
+    n_samples = column.shape[0]
+    nan_count = column.isna().sum()
+    if nan_count == n_samples:
+        return {
+            "nan_count": n_samples,
+            "nan_proportion": 1.0,
+            "min": None,
+            "max": None,
+            "mean": None,
+            "median": None,
+            "std": None,
+            "histogram": None,
+        }
+
+    # hardcode expected values
+    minv = "2024-01-01 00:00:00"
+    maxv = "2024-01-11 00:00:00"
+    mean = "2024-01-06 00:00:00"
+    median = "2024-01-06 00:00:00"
+    bin_edges = [
+        "2024-01-01 00:00:00",
+        "2024-01-02 00:00:01",
+        "2024-01-03 00:00:02",
+        "2024-01-04 00:00:03",
+        "2024-01-05 00:00:04",
+        "2024-01-06 00:00:05",
+        "2024-01-07 00:00:06",
+        "2024-01-08 00:00:07",
+        "2024-01-09 00:00:08",
+        "2024-01-10 00:00:09",
+        "2024-01-11 00:00:00",
+    ]
+
+    # compute std
     seconds_in_day = 24 * 60 * 60
-    timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+    if column_name == "datetime":
+        timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
+        hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    elif column_name == "datetime_null":
+        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every second day
+        hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+    else:
+        raise ValueError("Incorrect column")
+
     std = timedeltas.std()
     std_str = str(datetime.timedelta(seconds=std))
-    std_str = std_str.split(".")[0]  # check precision up to seconds
+
     return {
-        "nan_count": 0,
-        "nan_proportion": 0.0,
-        "min": "2024-01-01 00:00:00",
-        "max": "2024-01-11 00:00:00",
-        "mean": "2024-01-06 00:00:00",
-        "median": "2024-01-06 00:00:00",
+        "nan_count": nan_count,
+        "nan_proportion": np.round(nan_count / n_samples, DECIMALS).item() if nan_count else 0.0,
+        "min": minv,
+        "max": maxv,
+        "mean": mean,
+        "median": median,
         "std": std_str,
         "histogram": {
-            "hist": [2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-            "bin_edges": [
-                "2024-01-01 00:00:00",
-                "2024-01-02 00:00:01",
-                "2024-01-03 00:00:02",
-                "2024-01-04 00:00:03",
-                "2024-01-05 00:00:04",
-                "2024-01-06 00:00:05",
-                "2024-01-07 00:00:06",
-                "2024-01-08 00:00:07",
-                "2024-01-09 00:00:08",
-                "2024-01-10 00:00:09",
-                "2024-01-11 00:00:00",
-            ],
+            "hist": hist,
+            "bin_edges": bin_edges,
         },
     }
 
 
 @pytest.mark.parametrize(
     "column_name",
-    ["datetime_column"],
+    ["datetime", "datetime_null", "datetime_all_null"],
 )
 def test_datetime_statistics(
     column_name: str,
     datasets: Mapping[str, Dataset],
 ) -> None:
-    column_name = "datetime"
-    expected = count_expected_statistics_for_datetime()
     data = datasets["datetime_statistics"].to_pandas()
+    expected = count_expected_statistics_for_datetime(data[column_name], column_name)
     computed = DatetimeColumn.compute_statistics(
         data=pl.from_pandas(data),
         column_name=column_name,
         n_samples=len(data[column_name]),
     )
-    assert computed.pop("std").split(".")[0] == expected.pop("std")
+    computed_std, expected_std = computed.pop("std"), expected.pop("std")
+    if computed_std:
+        assert computed_std.split(".")[0] == expected_std.split(".")[0]  # check with precision up to seconds
+    else:
+        assert computed_std == expected_std
     assert computed == expected

From 434b2d8a0d487425d0e8078f9ff3c9392de69a3c Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Thu, 8 Aug 2024 14:07:01 +0200
Subject: [PATCH 05/10] add datetime to worker

---
 .../job_runners/split/descriptive_statistics.py   | 15 ++++++++++++++-
 services/worker/src/worker/statistics_utils.py    | 11 ++++++-----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
index ed8bb6aa17..06d485d5bb 100644
--- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py
+++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
@@ -32,6 +32,7 @@
 from worker.dtos import CompleteJobResult
 from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache
 from worker.statistics_utils import (
+    DATETIME_DTYPES,
     FLOAT_DTYPES,
     INTEGER_DTYPES,
     NUMERICAL_DTYPES,
@@ -39,6 +40,7 @@
     AudioColumn,
     BoolColumn,
     ClassLabelColumn,
+    DatetimeColumn,
     FloatColumn,
     ImageColumn,
     IntColumn,
@@ -57,7 +59,15 @@ class SplitDescriptiveStatisticsResponse(TypedDict):
 
 
 SupportedColumns = Union[
-    ClassLabelColumn, IntColumn, FloatColumn, StringColumn, BoolColumn, ListColumn, AudioColumn, ImageColumn
+    ClassLabelColumn,
+    IntColumn,
+    FloatColumn,
+    StringColumn,
+    BoolColumn,
+    ListColumn,
+    AudioColumn,
+    ImageColumn,
+    DatetimeColumn,
 ]
 
 
@@ -238,6 +248,9 @@ def _column_from_feature(
 
                 if dataset_feature.get("dtype") == "bool":
                     return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)
+
+                if dataset_feature.get("dtype") in DATETIME_DTYPES:
+                    return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
         return None
 
     columns: list[SupportedColumns] = []
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 9b4a2302b2..23e80ab775 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -39,6 +39,7 @@
 FLOAT_DTYPES = ["float16", "float32", "float64"]
 NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES
 STRING_DTYPES = ["string", "large_string"]
+DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]
 
 
 class ColumnType(str, enum.Enum):
@@ -768,11 +769,11 @@ def _compute_statistics(
             n_samples=n_samples,
         )
         # to assure mypy that there values are not None to pass to conversion functions:
-        assert timedelta_stats["histogram"] is not None
-        assert timedelta_stats["max"] is not None
-        assert timedelta_stats["mean"] is not None
-        assert timedelta_stats["median"] is not None
-        assert timedelta_stats["std"] is not None
+        assert timedelta_stats["histogram"] is not None  # nosec
+        assert timedelta_stats["max"] is not None  # nosec
+        assert timedelta_stats["mean"] is not None  # nosec
+        assert timedelta_stats["median"] is not None  # nosec
+        assert timedelta_stats["std"] is not None  # nosec
 
         datetime_bin_edges = [
             cls.shift_and_convert_to_string(min_date, seconds) for seconds in timedelta_stats["histogram"]["bin_edges"]

From 260458758c3f64c7af1bd17eef7475444a3414ce Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Thu, 8 Aug 2024 14:07:12 +0200
Subject: [PATCH 06/10] add test

---
 services/worker/tests/fixtures/hub.py         | 20 ++++++
 .../split/test_descriptive_statistics.py      | 67 ++++++++++++++-----
 .../worker/tests/test_statistics_utils.py     |  4 +-
 3 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/services/worker/tests/fixtures/hub.py b/services/worker/tests/fixtures/hub.py
index 62046b66c7..d5b890dbcc 100644
--- a/services/worker/tests/fixtures/hub.py
+++ b/services/worker/tests/fixtures/hub.py
@@ -354,6 +354,13 @@ def hub_public_image_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str
     delete_hub_dataset_repo(repo_id=repo_id)
 
 
+@pytest.fixture(scope="session")
+def hub_public_datetime_statistics(datasets: Mapping[str, Dataset]) -> Iterator[str]:
+    repo_id = create_hub_dataset_repo(prefix="datetime_statistics", dataset=datasets["datetime_statistics"])
+    yield repo_id
+    delete_hub_dataset_repo(repo_id=repo_id)
+
+
 @pytest.fixture(scope="session")
 def hub_public_n_configs_with_default(datasets: Mapping[str, Dataset]) -> Iterator[str]:
     default_config_name, _ = get_default_config_split()
@@ -1207,6 +1214,19 @@ def hub_responses_image_statistics(
     }
 
 
+@pytest.fixture
+def hub_responses_datetime_statistics(
+    hub_public_datetime_statistics: str,
+) -> HubDatasetTest:
+    return {
+        "name": hub_public_datetime_statistics,
+        "config_names_response": create_config_names_response(hub_public_datetime_statistics),
+        "splits_response": create_splits_response(hub_public_datetime_statistics),
+        "first_rows_response": None,
+        "parquet_and_info_response": None,
+    }
+
+
 @pytest.fixture
 def hub_responses_descriptive_statistics_parquet_builder(
     hub_public_descriptive_statistics_parquet_builder: str,
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 54f1f53954..a95932d67a 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -3,7 +3,7 @@
 from collections.abc import Callable, Mapping
 from dataclasses import replace
 from http import HTTPStatus
-from typing import Optional
+from typing import Any, Optional
 
 import pandas as pd
 import polars as pl
@@ -30,6 +30,7 @@
 from ...test_statistics_utils import (
     count_expected_statistics_for_bool_column,
     count_expected_statistics_for_categorical_column,
+    count_expected_statistics_for_datetime_column,
     count_expected_statistics_for_list_column,
     count_expected_statistics_for_numerical_column,
     count_expected_statistics_for_string_column,
@@ -215,7 +216,7 @@ def _get_job_runner(
 
 
 @pytest.fixture
-def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics"]
     df = ds.to_pandas()
     expected_statistics = {}
@@ -253,7 +254,7 @@ def descriptive_statistics_expected(datasets: Mapping[str, Dataset]) -> dict:  #
 
 
 @pytest.fixture
-def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics_string_text"]
     df = ds.to_pandas()
     expected_statistics = {}
@@ -270,7 +271,7 @@ def descriptive_statistics_string_text_expected(datasets: Mapping[str, Dataset])
 
 
 @pytest.fixture
-def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict:  # type: ignore
+def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
     ds = datasets["descriptive_statistics_string_text"]
     df = ds.to_pandas()[:50]  # see `fixtures.hub.hub_public_descriptive_statistics_parquet_builder`
     expected_statistics = {}
@@ -287,7 +288,7 @@ def descriptive_statistics_string_text_partial_expected(datasets: Mapping[str, D
 
 
 @pytest.fixture
-def audio_statistics_expected() -> dict:  # type: ignore
+def audio_statistics_expected() -> dict[str, Any]:
     column_names_to_durations = [
         ("audio", [1.0, 2.0, 3.0, 4.0]),  # datasets consists of 4 audio files of 1, 2, 3, 4 seconds lengths
         ("audio_null", [1.0, None, 3.0, None]),  # take first and third audio file for this testcase
@@ -312,7 +313,7 @@ def audio_statistics_expected() -> dict:  # type: ignore
 
 
 @pytest.fixture
-def image_statistics_expected() -> dict:  # type: ignore
+def image_statistics_expected() -> dict[str, Any]:
     column_names_to_widths = [
         ("image", [640, 1440, 520, 1240]),  # datasets consists of 4 image files
         ("image_null", [640, None, 520, None]),  # take first and third image file for this testcase
@@ -334,6 +335,21 @@ def image_statistics_expected() -> dict:  # type: ignore
     }
 
 
+@pytest.fixture
+def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, Any]:
+    ds = datasets["datetime_statistics"]
+    df = ds.to_pandas()
+    expected_statistics = {}
+    for column_name in df.columns:
+        statistics = count_expected_statistics_for_datetime_column(column=df[column_name], column_name=column_name)
+        expected_statistics[column_name] = {
+            "column_name": column_name,
+            "column_type": ColumnType.DATETIME,
+            "column_statistics": statistics,
+        }
+    return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
+
+
 @pytest.fixture
 def struct_thread_panic_error_parquet_file(tmp_path_factory: pytest.TempPathFactory) -> str:
     repo_id = "__DUMMY_TRANSFORMERS_USER__/test_polars_panic_error"
@@ -369,13 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file
 @pytest.mark.parametrize(
     "hub_dataset_name,expected_error_code",
     [
-        ("descriptive_statistics", None),
-        ("descriptive_statistics_string_text", None),
-        ("descriptive_statistics_string_text_partial", None),
-        ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
-        ("audio_statistics", None),
-        ("image_statistics", None),
-        ("gated", None),
+        # ("descriptive_statistics", None),
+        # ("descriptive_statistics_string_text", None),
+        # ("descriptive_statistics_string_text_partial", None),
+        # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+        # ("audio_statistics", None),
+        # ("image_statistics", None),
+        ("datetime_statistics", None),
+        # ("gated", None),
     ],
 )
 def test_compute(
@@ -391,13 +408,15 @@ def test_compute(
     hub_responses_descriptive_statistics_not_supported: HubDatasetTest,
     hub_responses_audio_statistics: HubDatasetTest,
     hub_responses_image_statistics: HubDatasetTest,
+    hub_responses_datetime_statistics: HubDatasetTest,
     hub_dataset_name: str,
     expected_error_code: Optional[str],
-    descriptive_statistics_expected: dict,  # type: ignore
-    descriptive_statistics_string_text_expected: dict,  # type: ignore
-    descriptive_statistics_string_text_partial_expected: dict,  # type: ignore
-    audio_statistics_expected: dict,  # type: ignore
-    image_statistics_expected: dict,  # type: ignore
+    descriptive_statistics_expected: dict[str, Any],
+    descriptive_statistics_string_text_expected: dict[str, Any],
+    descriptive_statistics_string_text_partial_expected: dict[str, Any],
+    audio_statistics_expected: dict[str, Any],
+    image_statistics_expected: dict[str, Any],
+    datetime_statistics_expected: dict[str, Any],
 ) -> None:
     hub_datasets = {
         "descriptive_statistics": hub_responses_descriptive_statistics,
@@ -407,6 +426,7 @@ def test_compute(
         "gated": hub_responses_gated_descriptive_statistics,
         "audio_statistics": hub_responses_audio_statistics,
         "image_statistics": hub_responses_image_statistics,
+        "datetime_statistics": hub_responses_datetime_statistics,
     }
     expected = {
         "descriptive_statistics": descriptive_statistics_expected,
@@ -416,6 +436,7 @@ def test_compute(
         "descriptive_statistics_string_text_partial": descriptive_statistics_string_text_partial_expected,
         "audio_statistics": audio_statistics_expected,
         "image_statistics": image_statistics_expected,
+        "datetime_statistics": datetime_statistics_expected,
     }
     dataset = hub_datasets[hub_dataset_name]["name"]
     splits_response = hub_datasets[hub_dataset_name]["splits_response"]
@@ -534,5 +555,15 @@ def test_compute(
                     column_response_stats.pop("nan_proportion")
                 ) == expected_column_response_stats.pop("nan_proportion")
                 assert column_response_stats == expected_column_response_stats
+            elif column_response["column_type"] is ColumnType.DATETIME:
+                std, expected_std = (
+                    column_response_stats.pop("std"),
+                    expected_column_response_stats.pop("std"),
+                )
+                if std:
+                    assert std.split(".")[0] == expected_std.split(".")[0]
+                else:
+                    assert std == expected_std
+                assert column_response_stats == expected_column_response_stats
             else:
                 raise ValueError("Incorrect data type")
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 84eee81448..377cb47c86 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -474,7 +474,7 @@ def test_image_statistics(
     assert computed == expected
 
 
-def count_expected_statistics_for_datetime(column: pd.Series, column_name: str) -> dict:  # type: ignore
+def count_expected_statistics_for_datetime_column(column: pd.Series, column_name: str) -> dict:  # type: ignore
     n_samples = column.shape[0]
     nan_count = column.isna().sum()
     if nan_count == n_samples:
@@ -546,7 +546,7 @@ def test_datetime_statistics(
     datasets: Mapping[str, Dataset],
 ) -> None:
     data = datasets["datetime_statistics"].to_pandas()
-    expected = count_expected_statistics_for_datetime(data[column_name], column_name)
+    expected = count_expected_statistics_for_datetime_column(data[column_name], column_name)
     computed = DatetimeColumn.compute_statistics(
         data=pl.from_pandas(data),
         column_name=column_name,

From 913f812f472e30ca1eca102ac0eaa5eecb7814b3 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Fri, 9 Aug 2024 13:23:39 +0200
Subject: [PATCH 07/10] include timezone-aware timestamps

---
 .../split/descriptive_statistics.py           | 23 ++++++++++---------
 .../worker/src/worker/statistics_utils.py     |  1 -
 .../tests/fixtures/statistics_dataset.py      | 14 +++++++++++
 .../split/test_descriptive_statistics.py      | 14 +++++------
 .../worker/tests/test_statistics_utils.py     |  7 ++++--
 5 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/services/worker/src/worker/job_runners/split/descriptive_statistics.py b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
index 06d485d5bb..3c3886d703 100644
--- a/services/worker/src/worker/job_runners/split/descriptive_statistics.py
+++ b/services/worker/src/worker/job_runners/split/descriptive_statistics.py
@@ -32,7 +32,6 @@
 from worker.dtos import CompleteJobResult
 from worker.job_runners.split.split_job_runner import SplitJobRunnerWithCache
 from worker.statistics_utils import (
-    DATETIME_DTYPES,
     FLOAT_DTYPES,
     INTEGER_DTYPES,
     NUMERICAL_DTYPES,
@@ -225,31 +224,33 @@ def _column_from_feature(
                 return ListColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
         if isinstance(dataset_feature, dict):
-            if dataset_feature.get("_type") == "ClassLabel":
+            _type = dataset_feature.get("_type")
+            if _type == "ClassLabel":
                 return ClassLabelColumn(
                     feature_name=dataset_feature_name, n_samples=num_examples, feature_dict=dataset_feature
                 )
 
-            if dataset_feature.get("_type") == "Audio":
+            if _type == "Audio":
                 return AudioColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-            if dataset_feature.get("_type") == "Image":
+            if _type == "Image":
                 return ImageColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-            if dataset_feature.get("_type") == "Value":
-                if dataset_feature.get("dtype") in INTEGER_DTYPES:
+            if _type == "Value":
+                dtype = dataset_feature.get("dtype", "")
+                if dtype in INTEGER_DTYPES:
                     return IntColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-                if dataset_feature.get("dtype") in FLOAT_DTYPES:
+                if dtype in FLOAT_DTYPES:
                     return FloatColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-                if dataset_feature.get("dtype") in STRING_DTYPES:
+                if dtype in STRING_DTYPES:
                     return StringColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-                if dataset_feature.get("dtype") == "bool":
+                if dtype == "bool":
                     return BoolColumn(feature_name=dataset_feature_name, n_samples=num_examples)
 
-                if dataset_feature.get("dtype") in DATETIME_DTYPES:
+                if dtype.startswith("timestamp"):
                     return DatetimeColumn(feature_name=dataset_feature_name, n_samples=num_examples)
         return None
 
@@ -262,7 +263,7 @@ def _column_from_feature(
     if not columns:
         raise NoSupportedFeaturesError(
             "No columns for statistics computation found. Currently supported feature types are: "
-            f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, list/Sequence and bool. "
+            f"{NUMERICAL_DTYPES}, {STRING_DTYPES}, ClassLabel, Image, Audio, list/Sequence, datetime and bool. "
         )
 
     column_names_str = ", ".join([column.name for column in columns])
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index 23e80ab775..d514cec931 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -39,7 +39,6 @@
 FLOAT_DTYPES = ["float16", "float32", "float64"]
 NUMERICAL_DTYPES = INTEGER_DTYPES + FLOAT_DTYPES
 STRING_DTYPES = ["string", "large_string"]
-DATETIME_DTYPES = ["timestamp[s]", "timestamp[ms]", "timestamp[us]", "timestamp[ns]"]
 
 
 class ColumnType(str, enum.Enum):
diff --git a/services/worker/tests/fixtures/statistics_dataset.py b/services/worker/tests/fixtures/statistics_dataset.py
index c00c63afc5..c233e61639 100644
--- a/services/worker/tests/fixtures/statistics_dataset.py
+++ b/services/worker/tests/fixtures/statistics_dataset.py
@@ -1716,6 +1716,19 @@ def null_column(n_samples: int) -> list[None]:
             datetime.strptime("2024-01-10 00:00:00", "%Y-%m-%d %H:%M:%S"),
             datetime.strptime("2024-01-11 00:00:00", "%Y-%m-%d %H:%M:%S"),
         ],
+        "datetime_tz": [
+            datetime.strptime("2024-01-01 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-02 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-03 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-04 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-05 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-06 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-07 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-08 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-09 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-10 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+            datetime.strptime("2024-01-11 00:00:00+0200", "%Y-%m-%d %H:%M:%S%z"),
+        ],
         "datetime_null": [
             datetime.strptime("2024-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
             None,
@@ -1734,6 +1747,7 @@ def null_column(n_samples: int) -> list[None]:
     features=Features(
         {
             "datetime": Value("timestamp[s]"),
+            "datetime_tz": Value("timestamp[s, tz=+02:00]"),
             "datetime_null": Value("timestamp[s]"),
             "datetime_all_null": Value("timestamp[s]"),
         }
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index a95932d67a..7cdd785def 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -385,14 +385,14 @@ def test_polars_struct_thread_panic_error(struct_thread_panic_error_parquet_file
 @pytest.mark.parametrize(
     "hub_dataset_name,expected_error_code",
     [
-        # ("descriptive_statistics", None),
-        # ("descriptive_statistics_string_text", None),
-        # ("descriptive_statistics_string_text_partial", None),
-        # ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
-        # ("audio_statistics", None),
-        # ("image_statistics", None),
+        ("descriptive_statistics", None),
+        ("descriptive_statistics_string_text", None),
+        ("descriptive_statistics_string_text_partial", None),
+        ("descriptive_statistics_not_supported", "NoSupportedFeaturesError"),
+        ("audio_statistics", None),
+        ("image_statistics", None),
         ("datetime_statistics", None),
-        # ("gated", None),
+        ("gated", None),
     ],
 )
 def test_compute(
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 377cb47c86..1a34fadce1 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -507,10 +507,13 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
         "2024-01-10 00:00:09",
         "2024-01-11 00:00:00",
     ]
+    if column_name == "datetime_tz":
+        bin_edges = [f"{bin_edge}+0200" for bin_edge in bin_edges]
+        minv, maxv, mean, median = f"{minv}+0200", f"{maxv}+0200", f"{mean}+0200", f"{median}+0200"
 
     # compute std
     seconds_in_day = 24 * 60 * 60
-    if column_name == "datetime":
+    if column_name in ["datetime", "datetime_tz"]:
         timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
         hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     elif column_name == "datetime_null":
@@ -539,7 +542,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
 
 @pytest.mark.parametrize(
     "column_name",
-    ["datetime", "datetime_null", "datetime_all_null"],
+    ["datetime", "datetime_tz", "datetime_null", "datetime_all_null"],
 )
 def test_datetime_statistics(
     column_name: str,

From d51739356a2834ef2df49fb8f8ae86dd1c9561e6 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Mon, 14 Oct 2024 15:10:19 +0200
Subject: [PATCH 08/10] refactor: move datetime_to_string to libcommon.utils and detect datetime-formatted string columns

---
 libs/libcommon/src/libcommon/utils.py         | 14 +++++++
 .../worker/src/worker/statistics_utils.py     | 38 +++++++++++--------
 .../split/test_descriptive_statistics.py      |  1 -
 .../worker/tests/test_statistics_utils.py     |  2 +-
 4 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index c85079b697..3a08ebf8d1 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -2,12 +2,14 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import base64
+import datetime
 import functools
 import logging
 import mimetypes
 import time
 from collections.abc import Callable, Sequence
 from datetime import datetime, timedelta, timezone
+from dateutil import parser
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Optional, TypeVar, Union, cast
@@ -93,6 +95,18 @@ def get_datetime(days: Optional[float] = None) -> datetime:
     return date
 
 
+def is_datetime(string: str):
+    try:
+        parser.parse(string)
+        return True
+    except ValueError:
+        return False
+
+
+def datetime_to_string(dt: datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
+    return dt.strftime(format)
+
+
 def get_duration(started_at: datetime) -> float:
     """
     Get time in seconds that has passed from `started_at` until now.
diff --git a/services/worker/src/worker/statistics_utils.py b/services/worker/src/worker/statistics_utils.py
index d514cec931..28d340faa5 100644
--- a/services/worker/src/worker/statistics_utils.py
+++ b/services/worker/src/worker/statistics_utils.py
@@ -15,6 +15,7 @@
 from libcommon.exceptions import (
     StatisticsComputationError,
 )
+from libcommon.utils import datetime_to_string, is_datetime
 from PIL import Image
 from tqdm.contrib.concurrent import thread_map
 
@@ -476,6 +477,13 @@ def is_class(n_unique: int, n_samples: int) -> bool:
             n_unique / n_samples <= MAX_PROPORTION_STRING_LABELS and n_unique <= MAX_NUM_STRING_LABELS
         ) or n_unique <= NUM_BINS
 
+    @staticmethod
+    def is_datetime(data: pl.DataFrame, column_name: str) -> bool:
+        """Check if first 1000 non-null samples in a column match datetime format."""
+
+        values = data.filter(pl.col(column_name).is_not_null()).head(1000)[column_name].to_list()
+        return all(is_datetime(value) for value in values)
+
     @classmethod
     def compute_transformed_data(
         cls,
@@ -493,7 +501,7 @@ def _compute_statistics(
         data: pl.DataFrame,
         column_name: str,
         n_samples: int,
-    ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem]:
+    ) -> Union[CategoricalStatisticsItem, NumericalStatisticsItem, DatetimeStatisticsItem]:
         nan_count, nan_proportion = nan_count_proportion(data, column_name, n_samples)
         n_unique = data[column_name].n_unique()
         if cls.is_class(n_unique, n_samples):
@@ -509,6 +517,13 @@ def _compute_statistics(
                 n_unique=len(labels2counts),
                 frequencies=labels2counts,
             )
+        if cls.is_datetime(data, column_name):
+            datetime_stats: DatetimeStatisticsItem = DatetimeColumn.compute_statistics(
+                data.select(pl.col(column_name).cast(pl.Datetime)),
+                column_name=column_name,
+                n_samples=n_samples,
+            )
+            return datetime_stats
 
         lengths_column_name = f"{column_name}_len"
         lengths_df = cls.compute_transformed_data(data, column_name, transformed_column_name=lengths_column_name)
@@ -519,7 +534,12 @@ def _compute_statistics(
 
     def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColumnItem:
         stats = self.compute_statistics(data, column_name=self.name, n_samples=self.n_samples)
-        string_type = ColumnType.STRING_LABEL if "frequencies" in stats else ColumnType.STRING_TEXT
+        if "frequencies" in stats:
+            string_type = ColumnType.STRING_LABEL
+        elif isinstance(stats["histogram"], DatetimeHistogram):  # type: ignore
+            string_type = ColumnType.DATETIME
+        else:
+            string_type = ColumnType.STRING_TEXT
         return StatisticsPerColumnItem(
             column_name=self.name,
             column_type=string_type,
@@ -799,17 +819,3 @@ def compute_and_prepare_response(self, data: pl.DataFrame) -> StatisticsPerColum
             column_type=ColumnType.DATETIME,
             column_statistics=stats,
         )
-
-
-def datetime_to_string(dt: datetime.datetime, format: str = "%Y-%m-%d %H:%M:%S%z") -> str:
-    """
-    Convert a datetime.datetime object to a string.
-
-    Args:
-        dt (datetime): The datetime object to convert.
-        format (str, optional): The format of the output string. Defaults to "%Y-%m-%d %H:%M:%S%z".
-
-    Returns:
-        str: The datetime object as a string.
-    """
-    return dt.strftime(format)
diff --git a/services/worker/tests/job_runners/split/test_descriptive_statistics.py b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
index 14fb9dbf3a..4aa1c68900 100644
--- a/services/worker/tests/job_runners/split/test_descriptive_statistics.py
+++ b/services/worker/tests/job_runners/split/test_descriptive_statistics.py
@@ -347,7 +347,6 @@ def datetime_statistics_expected(datasets: Mapping[str, Dataset]) -> dict[str, A
     return {"num_examples": df.shape[0], "statistics": expected_statistics, "partial": False}
 
 
-
 @pytest.mark.parametrize(
     "hub_dataset_name,expected_error_code",
     [
diff --git a/services/worker/tests/test_statistics_utils.py b/services/worker/tests/test_statistics_utils.py
index 1a34fadce1..dc74d9a31c 100644
--- a/services/worker/tests/test_statistics_utils.py
+++ b/services/worker/tests/test_statistics_utils.py
@@ -517,7 +517,7 @@ def count_expected_statistics_for_datetime_column(column: pd.Series, column_name
         timedeltas = pd.Series(range(0, 11 * seconds_in_day, seconds_in_day))
         hist = [2, 1, 1, 1, 1, 1, 1, 1, 1, 1]
     elif column_name == "datetime_null":
-        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every second day
+        timedeltas = pd.Series(range(0, 6 * 2 * seconds_in_day, 2 * seconds_in_day))  # take every other day
         hist = [1, 1, 0, 1, 0, 1, 0, 1, 0, 1]
     else:
         raise ValueError("Incorrect column")

From 7046d8b7d67d1926d2e3e41b80420395b1f0f647 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Mon, 14 Oct 2024 15:21:10 +0200
Subject: [PATCH 09/10] fix imports in libcommon.utils and annotate is_datetime return type

---
 libs/libcommon/src/libcommon/utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/libs/libcommon/src/libcommon/utils.py b/libs/libcommon/src/libcommon/utils.py
index 3a08ebf8d1..b81ff70fff 100644
--- a/libs/libcommon/src/libcommon/utils.py
+++ b/libs/libcommon/src/libcommon/utils.py
@@ -2,14 +2,12 @@
 # Copyright 2022 The HuggingFace Authors.
 
 import base64
-import datetime
 import functools
 import logging
 import mimetypes
 import time
 from collections.abc import Callable, Sequence
 from datetime import datetime, timedelta, timezone
-from dateutil import parser
 from fnmatch import fnmatch
 from pathlib import Path
 from typing import Any, Optional, TypeVar, Union, cast
@@ -17,6 +15,7 @@
 import orjson
 import pandas as pd
 import pytz
+from dateutil import parser
 from huggingface_hub import constants, hf_hub_download
 from requests.exceptions import ReadTimeout
 
@@ -95,7 +94,7 @@ def get_datetime(days: Optional[float] = None) -> datetime:
     return date
 
 
-def is_datetime(string: str):
+def is_datetime(string: str) -> bool:
     try:
         parser.parse(string)
         return True

From 945dff0378043a9ae4ab79ef56fa82f1b8abab44 Mon Sep 17 00:00:00 2001
From: polinaeterna <polina@huggingface.co>
Date: Mon, 14 Oct 2024 15:37:04 +0200
Subject: [PATCH 10/10] do not typecheck dateutil

---
 libs/libcommon/pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libs/libcommon/pyproject.toml b/libs/libcommon/pyproject.toml
index 48cc7629bb..c6c0b9e679 100644
--- a/libs/libcommon/pyproject.toml
+++ b/libs/libcommon/pyproject.toml
@@ -76,6 +76,7 @@ module = [
     "moto.*",
     "aiobotocore.*",
     "requests.*",
+    "dateutil.*"
 ]
 # ^ huggingface_hub is not typed since version 0.13.0
 ignore_missing_imports = true