Skip to content

Commit

Permalink
tests and pre commits
Browse files Browse the repository at this point in the history
  • Loading branch information
ohadmata committed Dec 28, 2023
1 parent 1f417d2 commit bab2391
Show file tree
Hide file tree
Showing 15 changed files with 116 additions and 87 deletions.
14 changes: 7 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
files: src

repos:
- repo: local
hooks:
- id: custom-script-sh
name: custom-script-sh
entry: pre-commit.sh
language: script
pass_filenames: false
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
Expand Down Expand Up @@ -28,13 +35,6 @@ repos:
types: [python]
args: ["-rn", "-sn", "--rcfile=.pylintrc", "--fail-on=I"]
exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/
# - id: mypy
# name: mypy
# entry: mypy
# language: system
# types: [python]
# args: ["--no-strict-optional", "--ignore-missing-imports"]
# exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/
- id: isort
name: isort
entry: isort
Expand Down
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ disable=
missing-class-docstring,
logging-fstring-interpolation,
too-few-public-methods,
R1710,
R0801,

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifiers separated by comma (,) or put this option
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
[![PyPI version](https://badge.fury.io/py/shmessy.svg)](https://badge.fury.io/py/shmessy)
[![PyPI - Downloads](https://img.shields.io/pypi/dm/shmessy)](https://pypi.org/project/shmessy/)
[![License](https://img.shields.io/:license-MIT-blue.svg)](https://opensource.org/license/mit/)
![Coverage report](./coverage.svg)
### If your data is messy - Use Shmessy!

Shmessy is designed to deal with messy pandas DataFrames.
Expand Down
4 changes: 4 additions & 0 deletions pre-commit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
#
# Local pre-commit hook: run the test suite under coverage, then regenerate
# the coverage badge (coverage.svg) that README.md displays.

# Stop at the first failing command so that a failing test run fails the hook.
# Without this, the script's exit status is that of the last command
# (coverage-badge), and a commit with broken tests would be accepted.
set -euo pipefail

coverage run -m pytest
coverage-badge -f -o coverage.svg
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ pytest-mock = "^3.10.0"
pylint = "^2.14.5"
mypy = "^0.991"
pylint-junit = "^0.3.2"
pytest-parametrization = "^2022.2"
coverage-badge = "^1.1.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
18 changes: 6 additions & 12 deletions src/shmessy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,13 @@

from pandas import DataFrame

from .schema import Field, ShmessySchema, InferredField
from .validators.base import BaseValidator
from .schema import ShmessySchema
from .validators_handler import ValidatorsHandler

logger = logging.getLogger(__name__)


class Shmessy:

def __init__(self, sample_size: Optional[int] = 1000) -> None:
self.__validators_handler = ValidatorsHandler()
self.__sample_size = sample_size
Expand All @@ -28,22 +26,18 @@ def infer_schema(self, df: DataFrame) -> ShmessySchema:
df = self._get_sampled_df(df)
columns = [
self.__validators_handler.infer_field(
field_name=column,
data=df[column].values
) for column in df
field_name=column, data=df[column].values
)
for column in df
]
infer_duration_ms = int((time.time() - start_time) * 1000)

return ShmessySchema(
columns=columns,
infer_duration_ms=infer_duration_ms
)
return ShmessySchema(columns=columns, infer_duration_ms=infer_duration_ms)

def fix_schema(self, df: DataFrame) -> DataFrame:
for column in df:
df[column] = self.__validators_handler.fix_field(
column=df[column],
sample_size=self.__sample_size
column=df[column], sample_size=self.__sample_size
)

return df
17 changes: 2 additions & 15 deletions src/shmessy/schema.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from enum import Enum
from typing import Optional, List
from typing import Type
from typing import List, Optional, Type

from pydantic import BaseModel, field_serializer
from pydantic import BaseModel


class ValidatorTypes(str, Enum):
Expand All @@ -14,24 +13,12 @@ class BaseField(BaseModel):
field_name: str
source_type: Type

@field_serializer('source_type')
def serialize_source_type(self, source_type: Type, _info):
return str(source_type)


class InferredField(BaseModel):
inferred_type: Optional[Type] = None
inferred_virtual_type: Optional[Type] = None
inferred_pattern: Optional[str] = None

@field_serializer('inferred_type')
def serialize_inferred_type(self, inferred_type: Type, _info):
return str(inferred_type)

@field_serializer('inferred_virtual_type')
def serialize_inferred_virtual_type(self, inferred_virtual_type: Type, _info):
return str(inferred_virtual_type)


class Field(InferredField, BaseField):
pass
Expand Down
14 changes: 9 additions & 5 deletions src/shmessy/validators/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
from typing import Optional, Type, Any
from typing import Optional, Type

from numpy import issubdtype, ndarray, number, object_, str_
from pandas import Series
from numpy import ndarray, issubdtype, number, object_, str_

from ..schema import InferredField, ValidatorTypes

Expand All @@ -18,9 +19,12 @@ def fix(self, column: Series, sample_size: int) -> Series:
pass

def check_validation_type(self, dtype: Type) -> None:
if self.validator_type == ValidatorTypes.NUMERIC and not issubdtype(dtype, number):
if self.validator_type == ValidatorTypes.NUMERIC and not issubdtype(
dtype, number
):
raise ValueError(f"NUMERIC validation is not supported for {dtype}")

if (self.validator_type == ValidatorTypes.STRING and
not (issubdtype(dtype, object_) or issubdtype(dtype, str_))):
if self.validator_type == ValidatorTypes.STRING and not (
issubdtype(dtype, object_) or issubdtype(dtype, str_)
):
raise ValueError(f"STRING validation is not supported for {dtype}")
32 changes: 19 additions & 13 deletions src/shmessy/validators/date.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
from datetime import datetime, date
from datetime import date, datetime
from typing import Optional

from numpy import ndarray
from pandas import to_datetime, Series
from pandas import Series, to_datetime

from .base import BaseValidator
from ..schema import InferredField, ValidatorTypes
from .base import BaseValidator


class Validator(BaseValidator):
validator_type = ValidatorTypes.STRING
ignore_nan: bool = True
patterns: list[str] = [
"%m/%d/%Y", "%m-%d-%Y", "%m.%d.%Y",
"%m/%d/%y", "%m-%d-%y", "%m.%d.%y",
"%Y/%m/%d", "%Y-%m-%d", "%Y.%m.%d",
"%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y",
"%d/%b/%Y", "%d-%b-%Y",
"%Y-%m"
"%m/%d/%Y",
"%m-%d-%Y",
"%m.%d.%Y",
"%m/%d/%y",
"%m-%d-%y",
"%m.%d.%y",
"%Y/%m/%d",
"%Y-%m-%d",
"%Y.%m.%d",
"%d/%m/%Y",
"%d-%m-%Y",
"%d.%m.%Y",
"%d/%b/%Y",
"%d-%b-%Y",
"%Y-%m",
]

def validate(self, data: ndarray) -> Optional[InferredField]:
Expand All @@ -38,10 +47,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]:
if not self.ignore_nan:
valid = False
if valid:
return InferredField(
inferred_type=date,
inferred_pattern=pattern
)
return InferredField(inferred_type=date, inferred_pattern=pattern)

def fix(self, column: Series, sample_size: int) -> Series:
sample_data = column[:sample_size]
Expand Down
23 changes: 13 additions & 10 deletions src/shmessy/validators/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,25 @@
from numpy import ndarray
from pandas import Series, to_datetime

from .base import BaseValidator
from ..schema import InferredField, ValidatorTypes
from .base import BaseValidator


class Validator(BaseValidator):
validator_type = ValidatorTypes.STRING
ignore_nan: bool = True
patterns: list[str] = [
"%m/%d/%y %H:%M:%S", "%m-%d-%y %H:%M:%S",
"%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S",
"%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M:%SZ", "%Y-%m-%dT%H:%M:%SZ",
"%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%d %H:%M:%S.%fZ", "%Y-%m-%d %H:%M:%S.%f",
"%m/%d/%y %H:%M:%S",
"%m-%d-%y %H:%M:%S",
"%m/%d/%Y %H:%M:%S",
"%m-%d-%Y %H:%M:%S",
"%Y/%m/%d %H:%M:%S",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M:%SZ",
"%Y-%m-%dT%H:%M:%SZ",
"%Y-%m-%dT%H:%M:%S.%f",
"%Y-%m-%d %H:%M:%S.%fZ",
"%Y-%m-%d %H:%M:%S.%f",
"%Y-%m-%dT%H:%M:%S.%fZ",
]

Expand All @@ -38,10 +44,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]:
if not self.ignore_nan:
valid = False
if valid:
return InferredField(
inferred_type=datetime,
inferred_pattern=pattern
)
return InferredField(inferred_type=datetime, inferred_pattern=pattern)

def fix(self, column: Series, sample_size: int) -> Series:
sample_data = column[:sample_size]
Expand Down
9 changes: 3 additions & 6 deletions src/shmessy/validators/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@

from numpy import ndarray
from pandas import Series
from pydantic import EmailStr, BaseModel
from pydantic import BaseModel, EmailStr

from .base import BaseValidator
from ..schema import InferredField, ValidatorTypes
from .base import BaseValidator


class Model(BaseModel):
Expand All @@ -26,10 +26,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]:
Model(email=value)
except ValueError:
return None
return InferredField(
inferred_type=str,
inferred_virtual_type=EmailStr
)
return InferredField(inferred_type=str, inferred_virtual_type=EmailStr)

def fix(self, column: Series, sample_size: int) -> Series:
sample_data = column[:sample_size]
Expand Down
7 changes: 2 additions & 5 deletions src/shmessy/validators/ipv4_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from pydantic import BaseModel
from pydantic.networks import IPv4Address # noqa

from .base import BaseValidator
from ..schema import InferredField, ValidatorTypes
from .base import BaseValidator


class Model(BaseModel):
Expand All @@ -27,10 +27,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]:
Model(ip=value)
except ValueError:
return None
return InferredField(
inferred_type=str,
inferred_virtual_type=IPv4Address
)
return InferredField(inferred_type=str, inferred_virtual_type=IPv4Address)

def fix(self, column: Series, sample_size: int) -> Series:
sample_data = column[:sample_size]
Expand Down
22 changes: 14 additions & 8 deletions src/shmessy/validators/unix_timestamp.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import logging
import math
from datetime import datetime
from enum import Enum
from typing import Optional
import math

from numpy import ndarray
from pandas import Series, to_datetime
from enum import Enum

from .base import BaseValidator
from ..schema import InferredField, ValidatorTypes
from .base import BaseValidator

logger = logging.getLogger(__name__)

Expand All @@ -35,7 +35,9 @@ def _unix_timestamp_resolution(value: float) -> TimestampResolution:
return TimestampResolution.NANOSECONDS

@staticmethod
def _fix_input_resolution(value: float, selected_resolution: TimestampResolution) -> float:
def _fix_input_resolution(
value: float, selected_resolution: TimestampResolution
) -> float:
if selected_resolution == TimestampResolution.SECONDS:
return value
if selected_resolution == TimestampResolution.MILLISECONDS:
Expand All @@ -51,13 +53,17 @@ def validate(self, data: ndarray) -> Optional[InferredField]:
return None
for value in data:
if not math.isnan(value):
parsed_value = datetime.utcfromtimestamp(self._fix_input_resolution(value, selected_resolution))
if parsed_value.year < self.min_valid_year or parsed_value.year > self.max_valid_year:
parsed_value = datetime.utcfromtimestamp(
self._fix_input_resolution(value, selected_resolution)
)
if (
parsed_value.year < self.min_valid_year
or parsed_value.year > self.max_valid_year
):
return None

return InferredField(
inferred_type=datetime,
inferred_pattern=selected_resolution
inferred_type=datetime, inferred_pattern=selected_resolution
)
except ValueError:
return None
Expand Down
9 changes: 3 additions & 6 deletions src/shmessy/validators_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from importlib import import_module
from types import ModuleType
from typing import List, Optional, Any
from typing import Any, List, Optional

from numpy import ndarray

Expand All @@ -21,7 +21,7 @@ def __init__(self):

@classmethod
def _discover_validators(cls) -> List[BaseValidator]:
validators: List[BaseValidator] = list()
validators: List[BaseValidator] = []
root_directory = os.path.join(os.path.dirname(__file__))
validators_directory = os.path.join(root_directory, cls.VALIDATORS_DIR)

Expand Down Expand Up @@ -65,7 +65,4 @@ def infer_field(self, field_name: str, data: ndarray) -> Field:
inferred_pattern=inferred.inferred_pattern,
)

return Field(
field_name=field_name,
source_type=data.dtype.type
)
return Field(field_name=field_name, source_type=data.dtype.type)
Loading

0 comments on commit bab2391

Please sign in to comment.