ohadmata · ohadmata · Jan 18, 2024 · Jan 18, 2024 · Jan 17, 2024 · Jan 18, 2024
diff --git a/README.md b/README.md
@@ -118,7 +118,9 @@ df = Shmessy().read_csv('/tmp/file.csv')
 ### Constructor
 ```python
 shmessy = Shmessy(
-    sample_size: Optional[int] = 1000
+    sample_size: Optional[int] = 1000,
+    reader_encoding: Optional[str] = "UTF-8",
+    locale_formatter: Optional[str] = "en_US"
 )
 ```
 

diff --git a/src/shmessy/__init__.py b/src/shmessy/__init__.py
@@ -1,4 +1,5 @@
 import csv
+import locale
 import logging
 import time
 from typing import BinaryIO, Optional, TextIO, Union
@@ -20,12 +21,22 @@
 
 
 class Shmessy:
-    def __init__(self, sample_size: Optional[int] = 1000) -> None:
+    def __init__(
+        self,
+        sample_size: Optional[int] = 1000,
+        reader_encoding: Optional[str] = "UTF-8",  # CSV encoding + locale codeset
+        locale_formatter: Optional[str] = "en_US",  # locale name for setlocale
+    ) -> None:
         self.__types_handler = TypesHandler()
         self.__sample_size = sample_size
-        self.__csv_reader_encoding: str = "UTF-8"
+        self.__reader_encoding = reader_encoding
+        self.__locale_formatter = locale_formatter
         self.__inferred_schema: Optional[ShmessySchema] = None
 
+        locale.setlocale(  # NOTE: process-wide; raises locale.Error if unsupported
+            locale.LC_ALL, f"{self.__locale_formatter}.{self.__reader_encoding}"
+        )
+
     def get_inferred_schema(self) -> ShmessySchema:
         return self.__inferred_schema
 
@@ -81,7 +92,7 @@ def read_csv(
                 sample=_get_sample_from_csv(
                     filepath_or_buffer=filepath_or_buffer,
                     sample_size=self.__sample_size,
-                    encoding=self.__csv_reader_encoding,
+                    encoding=self.__reader_encoding,
                 ),
                 delimiters="".join([",", "\t", ";", " ", ":"]),
             )

diff --git a/src/shmessy/types/date.py b/src/shmessy/types/date.py
@@ -14,7 +14,6 @@ class DateType(BaseType):
         "%m/%d/%Y",  # 12/01/2022
         "%m-%d-%Y",  # 12-01-2022
         "%m.%d.%Y",  # 12.01.2022
-        "%-d/%m/%Y",  # 1/12/2023
         "%m/%d/%y",  # 12/01/22
         "%m-%d-%y",  # 12.01.2022
         "%m.%d.%y",  # 12.01.22
@@ -26,16 +25,7 @@ class DateType(BaseType):
         "%d.%m.%Y",  # 01.12.2022
         "%d/%b/%Y",  # 01/Mar/2022
         "%d-%b-%Y",  # 01-Mar-2022
-        "%-d-%b-%y",  # 1-Mar-22
         "%Y-%m",  # 2022-07
-        "%-m-%-d-%Y",
-        "%-d-%-m-%Y",
-        "%Y-%-m-%-d",
-        "%-m/%-d/%Y",
-        "%-d/%-m/%Y",
-        "%Y/%-m/%-d",
-        "%-d.%-m.%Y",
-        "%Y.%-m.%-d",
     ]
 
     def validate(self, data: ndarray) -> Optional[InferredField]:

diff --git a/src/shmessy/types/datetime_.py b/src/shmessy/types/datetime_.py
@@ -11,9 +11,7 @@
 class DatetimeType(BaseType):
     weight = 3
     patterns: list[str] = [
-        "%m/%d/%Y %-H:%M",  # 11/14/2003 0:00
         "%d-%m-%Y %H:%M",  # 11-14-2003 00:00
-        "%d-%m-%Y %-H:%M",  # 11-14-2003 0:00
         "%m/%d/%y %H:%M:%S",  # 12/15/22 00:00:00
         "%m-%d-%y %H:%M:%S",  # 12-30-2022 00:00:00
         "%m/%d/%Y %H:%M:%S",  # 12/30/2022 00:00:00
@@ -26,7 +24,6 @@ class DatetimeType(BaseType):
         "%Y-%m-%d %H:%M:%S.%fZ",  # 2022-12-30 00:00:00.000Z
         "%Y-%m-%d %H:%M:%S.%f",  # 2022-12-30 00:00:00.000
         "%Y-%m-%dT%H:%M:%S.%fZ",  # 2022-12-30T00:00:00.000Z
-        "%b %-d, %Y %H:%M %p",  # Jul 3, 2023 12:10 PM
         "%Y-%m-%dT%H:%M:%S",
     ]
 

diff --git a/src/shmessy/types/float.py b/src/shmessy/types/float.py
@@ -1,8 +1,10 @@
+import locale
 import logging
 from typing import Optional
 
 from numpy import ndarray
-from pandas import Series
+from pandas import Series, to_numeric
+from pandas.api.types import is_numeric_dtype
 
 from ..schema import InferredField
 from .base import BaseType
@@ -16,14 +18,36 @@ class FloatType(BaseType):
     def validate(self, data: ndarray) -> Optional[InferredField]:
+        """Infer the float type when every value in ``data`` casts to a float.
+
+        Strings are parsed with ``locale.atof`` so the active locale's
+        grouping and decimal symbols are honoured; other values go through
+        ``float``. Returns ``None`` as soon as one value cannot be cast.
+        """
         for value in data:
             try:
-                float(value)
+                if isinstance(value, str):
+                    locale.atof(value)
+                else:
+                    float(value)
             except Exception:  # noqa
                 logger.debug(f"Cannot cast the value '{value}' to {self.name}")
                 return None
         return InferredField(inferred_type=self.name)
 
     def fix(self, column: Series, inferred_field: InferredField) -> Series:
-        raise NotImplementedError()
+        """Return ``column`` converted to a numeric dtype.
+
+        Numeric columns are returned untouched. Otherwise only string cells
+        are parsed with ``locale.atof``; ``validate`` also accepts non-string
+        cells, and ``locale.atof`` raises on them, so they are passed through
+        to ``to_numeric`` as they are.
+        """
+        if is_numeric_dtype(column):
+            return column
+        return to_numeric(
+            column.apply(
+                lambda cell: locale.atof(cell) if isinstance(cell, str) else cell
+            )
+        )
 
 
 def get_type() -> FloatType:

diff --git a/src/shmessy/types/integer.py b/src/shmessy/types/integer.py
@@ -1,8 +1,10 @@
+import locale
 import logging
 from typing import Optional
 
 from numpy import ndarray
-from pandas import Series
+from pandas import Series, to_numeric
+from pandas.api.types import is_numeric_dtype
 
 from ..schema import InferredField
 from .base import BaseType
@@ -16,14 +18,36 @@ class IntegerType(BaseType):
     def validate(self, data: ndarray) -> Optional[InferredField]:
+        """Infer the integer type when every value in ``data`` casts to an int.
+
+        Strings are parsed with ``locale.atoi`` so the active locale's
+        grouping symbol is honoured; other values go through ``int``.
+        Returns ``None`` as soon as one value cannot be cast.
+        """
         for value in data:
             try:
-                int(value)
+                if isinstance(value, str):
+                    locale.atoi(value)
+                else:
+                    int(value)
             except Exception:  # noqa
                 logger.debug(f"Cannot cast the value '{value}' to {self.name}")
                 return None
         return InferredField(inferred_type=self.name)
 
     def fix(self, column: Series, inferred_field: InferredField) -> Series:
-        raise NotImplementedError()
+        """Return ``column`` converted to a numeric dtype.
+
+        Numeric columns are returned untouched. Otherwise only string cells
+        are parsed with ``locale.atoi``; ``validate`` also accepts non-string
+        cells, and ``locale.atoi`` raises on them, so they are passed through
+        to ``to_numeric`` as they are.
+        """
+        if is_numeric_dtype(column):
+            return column
+        return to_numeric(
+            column.apply(
+                lambda cell: locale.atoi(cell) if isinstance(cell, str) else cell
+            )
+        )
 
 
 def get_type() -> IntegerType:

diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/intg/__init__.py b/tests/intg/__init__.py
diff --git a/tests/intg/validator_boolean.py b/tests/intg/validator_boolean.py
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
diff --git a/tests/intg/test_boolean_type.py → tests/unit/test_boolean_type.py b/tests/intg/test_boolean_type.py → tests/unit/test_boolean_type.py
@@ -2,6 +2,7 @@
 from parametrization import Parametrization
 
 from shmessy import Shmessy
+import numpy as np
 
 
 @Parametrization.autodetect_parameters()
@@ -10,54 +11,73 @@
     df_data={
         "test_column": [1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
     },
-    expected_result="Boolean"
+    expected_result=[True, False, True, True, False, True, True, False, True, False],
+    expected_shmessy_type="Boolean",
+    expected_numpy_type=np.dtype("bool")
 )
 @Parametrization.case(
     name="Base case - Yes / No",
     df_data={
         "test_column": ["yes", "no", "yes", "yes", "no"]
     },
-    expected_result="Boolean"
+    expected_result=[True, False, True, True, False],
+    expected_shmessy_type="Boolean",
+    expected_numpy_type=np.dtype("bool")
 )
 @Parametrization.case(
     name="1 / 0 with bad value",
     df_data={
         "test_column": [1, 0, 1, 1, 4, 1, 1, 0, 1, 0]
     },
-    expected_result="Integer"
+    expected_result=[1, 0, 1, 1, 4, 1, 1, 0, 1, 0],
+    expected_shmessy_type="Integer",
+    expected_numpy_type=np.dtype("int64")
 )
 @Parametrization.case(
     name="1 / 0 with bad string value",
     df_data={
         "test_column": [1, 0, 1, 1, "hello", 1, 1, 0, 1, 0]
     },
-    expected_result="String"
+    expected_result=[1, 0, 1, 1, "hello", 1, 1, 0, 1, 0],
+    expected_shmessy_type="String",
+    expected_numpy_type=np.dtype("object")
 )
 @Parametrization.case(
     name="Only 1 should be identify as integer",
     df_data={
         "test_column": [1, 1, 1, 1, 1, 1, 1]
     },
-    expected_result="Integer"
+    expected_result=[1, 1, 1, 1, 1, 1, 1],
+    expected_shmessy_type="Integer",
+    expected_numpy_type=np.dtype("int64")
 )
 @Parametrization.case(
     name="Only no should be identify as String",
     df_data={
         "test_column": ["no", "no", "no", "no", "no"]
     },
-    expected_result="String"
+    expected_result=["no", "no", "no", "no", "no"],
+    expected_shmessy_type="String",
+    expected_numpy_type=np.dtype("object")
 )
 @Parametrization.case(
     name="Only no with single yes should be identify as bool",
     df_data={
         "test_column": ["no", "no", "no", "no", "no", "yes"]
     },
-    expected_result="Boolean"
+    expected_result=[False, False, False, False, False, True],
+    expected_shmessy_type="Boolean",
+    expected_numpy_type=np.dtype("bool")
 )
-def test_boolean_match_at_least_once_for_each_value(df_data, expected_result):
+def test_boolean_match_at_least_once_for_each_value(df_data, expected_shmessy_type, expected_numpy_type, expected_result):
+    """Check the inferred type, the fixed dtype and the fixed values of the column."""
+    shmessy = Shmessy()
     df = pd.DataFrame(df_data)
-    result = Shmessy().infer_schema(df=df)
-    assert result.columns[0].inferred_type == expected_result
+    fixed_df = shmessy.fix_schema(df)
+    result = shmessy.get_inferred_schema()
+    assert result.columns[0].inferred_type == expected_shmessy_type
+    assert fixed_df["test_column"].dtype.type == expected_numpy_type.type
+    assert list(fixed_df["test_column"]) == list(expected_result)
 
 
 def test_read_bool_from_csv_only_true_values():

diff --git a/tests/unit/test_date_type.py b/tests/unit/test_date_type.py
@@ -0,0 +1,51 @@
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+from parametrization import Parametrization
+
+from shmessy import Shmessy
+
+
+@Parametrization.autodetect_parameters()
+@Parametrization.case(
+    name="Base case",
+    df_data={
+        "test_column": ["23-11-2023", "21-04-2022", "11-08-2021"]
+    },
+    expected_pattern="%d-%m-%Y",
+    expected_result=[
+        datetime(2023, 11, 23),
+        datetime(2022, 4, 21),
+        datetime(2021, 8, 11)
+    ],
+    expected_shmessy_type="Date",
+    expected_numpy_type=np.dtype("datetime64")
+)
+@Parametrization.case(
+    name="Date without leading zeros in the day part",
+    df_data={
+        "test_column": ["1-11-2023", "9-04-2022", "11-12-2021", "2-01-2020", "23-04-2019"]
+    },
+    expected_pattern="%d-%m-%Y",
+    expected_result=[
+        datetime(2023, 11, 1),
+        datetime(2022, 4, 9),
+        datetime(2021, 12, 11),
+        datetime(2020, 1, 2),
+        datetime(2019, 4, 23)
+    ],
+    expected_shmessy_type="Date",
+    expected_numpy_type=np.dtype("datetime64")
+)
+def test_date_type(df_data, expected_shmessy_type, expected_numpy_type, expected_result, expected_pattern):
+    """Check the inferred pattern and type, plus the dtype and values after fixing."""
+    shmessy = Shmessy()
+    df = pd.DataFrame(df_data)
+    fixed_df = shmessy.fix_schema(df)
+    inferred_schema = shmessy.get_inferred_schema()
+
+    assert inferred_schema.columns[0].inferred_pattern == expected_pattern
+    assert inferred_schema.columns[0].inferred_type == expected_shmessy_type
+    assert fixed_df["test_column"].dtype.type == expected_numpy_type.type
+    assert list(fixed_df["test_column"]) == list(expected_result)