Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove unsupported strftypes #45

Merged
merged 6 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,9 @@ df = Shmessy().read_csv('/tmp/file.csv')
### Constructor
```python
shmessy = Shmessy(
sample_size: Optional[int] = 1000
sample_size: Optional[int] = 1000,
reader_encoding: Optional[str] = "UTF-8",
locale_formatter: Optional[str] = "en_US"
)
```

Expand Down
17 changes: 14 additions & 3 deletions src/shmessy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import csv
import locale
import logging
import time
from typing import BinaryIO, Optional, TextIO, Union
Expand All @@ -20,12 +21,22 @@


class Shmessy:
def __init__(self, sample_size: Optional[int] = 1000) -> None:
def __init__(
self,
sample_size: Optional[int] = 1000,
reader_encoding: Optional[str] = "UTF-8",
locale_formatter: Optional[str] = "en_US",
) -> None:
self.__types_handler = TypesHandler()
self.__sample_size = sample_size
self.__csv_reader_encoding: str = "UTF-8"
self.__reader_encoding = reader_encoding
self.__locale_formatter = locale_formatter
self.__inferred_schema: Optional[ShmessySchema] = None

locale.setlocale(
locale.LC_ALL, f"{self.__locale_formatter}.{self.__reader_encoding}"
)

def get_inferred_schema(self) -> ShmessySchema:
return self.__inferred_schema

Expand Down Expand Up @@ -81,7 +92,7 @@ def read_csv(
sample=_get_sample_from_csv(
filepath_or_buffer=filepath_or_buffer,
sample_size=self.__sample_size,
encoding=self.__csv_reader_encoding,
encoding=self.__reader_encoding,
),
delimiters="".join([",", "\t", ";", " ", ":"]),
)
Expand Down
10 changes: 0 additions & 10 deletions src/shmessy/types/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class DateType(BaseType):
"%m/%d/%Y", # 12/01/2022
"%m-%d-%Y", # 12-01-2022
"%m.%d.%Y", # 12.01.2022
"%-d/%m/%Y", # 1/12/2023
"%m/%d/%y", # 12/01/22
"%m-%d-%y", # 12-01-22
"%m.%d.%y", # 12.01.22
Expand All @@ -26,16 +25,7 @@ class DateType(BaseType):
"%d.%m.%Y", # 01.12.2022
"%d/%b/%Y", # 01/Mar/2022
"%d-%b-%Y", # 01-Mar-2022
"%-d-%b-%y", # 1-Mar-22
"%Y-%m", # 2022-07
"%-m-%-d-%Y",
"%-d-%-m-%Y",
"%Y-%-m-%-d",
"%-m/%-d/%Y",
"%-d/%-m/%Y",
"%Y/%-m/%-d",
"%-d.%-m.%Y",
"%Y.%-m.%-d",
]

def validate(self, data: ndarray) -> Optional[InferredField]:
Expand Down
3 changes: 0 additions & 3 deletions src/shmessy/types/datetime_.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@
class DatetimeType(BaseType):
weight = 3
patterns: list[str] = [
"%m/%d/%Y %-H:%M", # 11/14/2003 0:00
"%d-%m-%Y %H:%M", # 14-11-2003 00:00
"%d-%m-%Y %-H:%M", # 14-11-2003 0:00
"%m/%d/%y %H:%M:%S", # 12/15/22 00:00:00
"%m-%d-%y %H:%M:%S", # 12-30-22 00:00:00
"%m/%d/%Y %H:%M:%S", # 12/30/2022 00:00:00
Expand All @@ -26,7 +24,6 @@ class DatetimeType(BaseType):
"%Y-%m-%d %H:%M:%S.%fZ", # 2022-12-30 00:00:00.000Z
"%Y-%m-%d %H:%M:%S.%f", # 2022-12-30 00:00:00.000
"%Y-%m-%dT%H:%M:%S.%fZ", # 2022-12-30T00:00:00.000Z
"%b %-d, %Y %H:%M %p", # Jul 3, 2023 12:10 PM
"%Y-%m-%dT%H:%M:%S",
]

Expand Down
13 changes: 10 additions & 3 deletions src/shmessy/types/float.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import locale
import logging
from typing import Optional

from numpy import ndarray
from pandas import Series
from pandas import Series, to_numeric
from pandas.api.types import is_numeric_dtype

from ..schema import InferredField
from .base import BaseType
Expand All @@ -16,14 +18,19 @@ class FloatType(BaseType):
def validate(self, data: ndarray) -> Optional[InferredField]:
for value in data:
try:
float(value)
if isinstance(value, str):
float(locale.atof(value))
else:
float(value)
except Exception: # noqa
logger.debug(f"Cannot cast the value '{value}' to {self.name}")
return None
return InferredField(inferred_type=self.name)

def fix(self, column: Series, inferred_field: InferredField) -> Series:
raise NotImplementedError()
if is_numeric_dtype(column):
return column
return to_numeric(column.apply(locale.atof))


def get_type() -> FloatType:
Expand Down
13 changes: 10 additions & 3 deletions src/shmessy/types/integer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import locale
import logging
from typing import Optional

from numpy import ndarray
from pandas import Series
from pandas import Series, to_numeric
from pandas.api.types import is_numeric_dtype

from ..schema import InferredField
from .base import BaseType
Expand All @@ -16,14 +18,19 @@ class IntegerType(BaseType):
def validate(self, data: ndarray) -> Optional[InferredField]:
for value in data:
try:
int(value)
if isinstance(value, str):
int(locale.atoi(value))
else:
int(value)
except Exception: # noqa
logger.debug(f"Cannot cast the value '{value}' to {self.name}")
return None
return InferredField(inferred_type=self.name)

def fix(self, column: Series, inferred_field: InferredField) -> Series:
raise NotImplementedError()
if is_numeric_dtype(column):
return column
return to_numeric(column.apply(locale.atoi))


def get_type() -> IntegerType:
Expand Down
Empty file added tests/__init__.py
Empty file.
Empty file added tests/intg/__init__.py
Empty file.
70 changes: 0 additions & 70 deletions tests/intg/validator_boolean.py

This file was deleted.

Empty file added tests/unit/__init__.py
Empty file.
40 changes: 30 additions & 10 deletions tests/intg/test_boolean_type.py → tests/unit/test_boolean_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from parametrization import Parametrization

from shmessy import Shmessy
import numpy as np


@Parametrization.autodetect_parameters()
Expand All @@ -10,54 +11,73 @@
df_data={
"test_column": [1, 0, 1, 1, 0, 1, 1, 0, 1, 0]
},
expected_result="Boolean"
expected_result=[True, False, True, True, False, True, True, False, True, False],
expected_shmessy_type="Boolean",
expected_numpy_type=np.dtype("bool")
)
@Parametrization.case(
name="Base case - Yes / No",
df_data={
"test_column": ["yes", "no", "yes", "yes", "no"]
},
expected_result="Boolean"
expected_result=[True, False, True, True, False],
expected_shmessy_type="Boolean",
expected_numpy_type=np.dtype("bool")
)
@Parametrization.case(
name="1 / 0 with bad value",
df_data={
"test_column": [1, 0, 1, 1, 4, 1, 1, 0, 1, 0]
},
expected_result="Integer"
expected_result=[1, 0, 1, 1, 4, 1, 1, 0, 1, 0],
expected_shmessy_type="Integer",
expected_numpy_type=np.dtype("int64")
)
@Parametrization.case(
name="1 / 0 with bad string value",
df_data={
"test_column": [1, 0, 1, 1, "hello", 1, 1, 0, 1, 0]
},
expected_result="String"
expected_result=[1, 0, 1, 1, "hello", 1, 1, 0, 1, 0],
expected_shmessy_type="String",
expected_numpy_type=np.dtype("object")
)
@Parametrization.case(
name="Only 1 should be identify as integer",
df_data={
"test_column": [1, 1, 1, 1, 1, 1, 1]
},
expected_result="Integer"
expected_result=[1, 1, 1, 1, 1, 1, 1],
expected_shmessy_type="Integer",
expected_numpy_type=np.dtype("int64")
)
@Parametrization.case(
name="Only no should be identify as String",
df_data={
"test_column": ["no", "no", "no", "no", "no"]
},
expected_result="String"
expected_result=["no", "no", "no", "no", "no"],
expected_shmessy_type="String",
expected_numpy_type=np.dtype("object")
)
@Parametrization.case(
name="Only no with single yes should be identify as bool",
df_data={
"test_column": ["no", "no", "no", "no", "no", "yes"]
},
expected_result="Boolean"
expected_result=[False, False, False, False, False, True],
expected_shmessy_type="Boolean",
expected_numpy_type=np.dtype("bool")
)
def test_boolean_match_at_least_once_for_each_value(df_data, expected_result):
def test_boolean_match_at_least_once_for_each_value(df_data, expected_shmessy_type, expected_numpy_type, expected_result):
shmessy = Shmessy()
df = pd.DataFrame(df_data)
result = Shmessy().infer_schema(df=df)
assert result.columns[0].inferred_type == expected_result
fixed_df = shmessy.fix_schema(df)
result = shmessy.get_inferred_schema()

assert result.columns[0].inferred_type == expected_shmessy_type
assert fixed_df["test_column"].dtype.type == expected_numpy_type.type
assert [x for x in df["test_column"]] == [x for x in expected_result]


def test_read_bool_from_csv_only_true_values():
Expand Down
50 changes: 50 additions & 0 deletions tests/unit/test_date_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from datetime import datetime

import numpy as np
import pandas as pd
from parametrization import Parametrization

from shmessy import Shmessy


@Parametrization.autodetect_parameters()
@Parametrization.case(
name="Base case",
df_data={
"test_column": ["23-11-2023", "21-04-2022", "11-08-2021"]
},
expected_pattern="%d-%m-%Y",
expected_result=[
datetime(2023, 11, 23),
datetime(2022, 4, 21),
datetime(2021, 8, 11)
],
expected_shmessy_type="Date",
expected_numpy_type=np.dtype("datetime64")
)
@Parametrization.case(
name="Date without leading zeros in the day part",
df_data={
"test_column": ["1-11-2023", "9-04-2022", "11-12-2021", "2-01-2020", "23-04-2019"]
},
expected_pattern="%d-%m-%Y",
expected_result=[
datetime(2023, 11, 1),
datetime(2022, 4, 9),
datetime(2021, 12, 11),
datetime(2020, 1, 2),
datetime(2019, 4, 23)
],
expected_shmessy_type="Date",
expected_numpy_type=np.dtype("datetime64")
)
def test_date_type(df_data, expected_shmessy_type, expected_numpy_type, expected_result, expected_pattern):
shmessy = Shmessy()
df = pd.DataFrame(df_data)
fixed_df = shmessy.fix_schema(df)
inferred_schema = shmessy.get_inferred_schema()

assert inferred_schema.columns[0].inferred_pattern == expected_pattern
assert inferred_schema.columns[0].inferred_type == expected_shmessy_type
assert fixed_df["test_column"].dtype.type == expected_numpy_type.type
assert [x for x in df["test_column"]] == [x for x in expected_result]
Loading