From bab239107c4723b14b011ba53692f97dfc6a767b Mon Sep 17 00:00:00 2001 From: ohadmata Date: Thu, 28 Dec 2023 23:41:05 +0200 Subject: [PATCH] tests and pre commits --- .pre-commit-config.yaml | 14 +++++------ .pylintrc | 2 ++ README.md | 1 + pre-commit.sh | 4 +++ pyproject.toml | 2 ++ src/shmessy/__init__.py | 18 +++++-------- src/shmessy/schema.py | 17 ++----------- src/shmessy/validators/base.py | 14 +++++++---- src/shmessy/validators/date.py | 32 ++++++++++++++---------- src/shmessy/validators/datetime.py | 23 +++++++++-------- src/shmessy/validators/email.py | 9 +++---- src/shmessy/validators/ipv4_address.py | 7 ++---- src/shmessy/validators/unix_timestamp.py | 22 ++++++++++------ src/shmessy/validators_handler.py | 9 +++---- tests/unit/test_validators_handler.py | 29 +++++++++++++++++++++ 15 files changed, 116 insertions(+), 87 deletions(-) create mode 100755 pre-commit.sh create mode 100644 tests/unit/test_validators_handler.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dbad91a..cd53714 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,13 @@ files: src repos: +- repo: local + hooks: + - id: custom-script-sh + name: custom-script-sh + entry: pre-commit.sh + language: script + pass_filenames: false - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: @@ -28,13 +35,6 @@ repos: types: [python] args: ["-rn", "-sn", "--rcfile=.pylintrc", "--fail-on=I"] exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/ -# - id: mypy -# name: mypy -# entry: mypy -# language: system -# types: [python] -# args: ["--no-strict-optional", "--ignore-missing-imports"] -# exclude: tests(/\w*)*/functional/|tests/input|tests(/\w*)*data/|doc/ - id: isort name: isort entry: isort diff --git a/.pylintrc b/.pylintrc index 5906bfa..ecdd717 100644 --- a/.pylintrc +++ b/.pylintrc @@ -68,6 +68,8 @@ disable= missing-class-docstring, logging-fstring-interpolation, too-few-public-methods, + R1710, + R0801, # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/README.md b/README.md index 7e96430..00227d2 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![PyPI version](https://badge.fury.io/py/shmessy.svg)](https://badge.fury.io/py/shmessy) [![PyPI - Downloads](https://img.shields.io/pypi/dm/shmessy)](https://pypi.org/project/shmessy/) [![License](https://img.shields.io/:license-MIT-blue.svg)](https://opensource.org/license/mit/) +![Coverage report](./coverage.svg) ### If your data is messy - Use Shmessy! Shmessy designed to deal with messy pandas dataframes. diff --git a/pre-commit.sh b/pre-commit.sh new file mode 100755 index 0000000..b8c7996 --- /dev/null +++ b/pre-commit.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +coverage run -m pytest +coverage-badge -f -o coverage.svg \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index bc1b650..cb67f37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,8 @@ pytest-mock = "^3.10.0" pylint = "^2.14.5" mypy = "^0.991" pylint-junit = "^0.3.2" +pytest-parametrization = "^2022.2" +coverage-badge = "^1.1.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/src/shmessy/__init__.py b/src/shmessy/__init__.py index 1c11236..ed13d52 100644 --- a/src/shmessy/__init__.py +++ b/src/shmessy/__init__.py @@ -4,15 +4,13 @@ from pandas import DataFrame -from .schema import Field, ShmessySchema, InferredField -from .validators.base import BaseValidator +from .schema import ShmessySchema from .validators_handler import ValidatorsHandler logger = logging.getLogger(__name__) class Shmessy: - def __init__(self, sample_size: Optional[int] = 1000) -> None: self.__validators_handler = ValidatorsHandler() self.__sample_size = sample_size @@ -28,22 +26,18 @@ def infer_schema(self, df: DataFrame) -> ShmessySchema: df = self._get_sampled_df(df) columns = [ self.__validators_handler.infer_field( - field_name=column, - data=df[column].values - ) for column in df + field_name=column, data=df[column].values + ) + for column in df ] infer_duration_ms = int((time.time() - start_time) * 1000) - return ShmessySchema( - columns=columns, - infer_duration_ms=infer_duration_ms - ) + return ShmessySchema(columns=columns, infer_duration_ms=infer_duration_ms) def fix_schema(self, df: DataFrame) -> DataFrame: for column in df: df[column] = self.__validators_handler.fix_field( - column=df[column], - sample_size=self.__sample_size + column=df[column], sample_size=self.__sample_size ) return df diff --git a/src/shmessy/schema.py b/src/shmessy/schema.py index f028789..df17075 100644 --- a/src/shmessy/schema.py +++ b/src/shmessy/schema.py @@ -1,8 +1,7 @@ from enum import Enum -from typing import Optional, List -from typing import Type +from typing import List, Optional, Type -from pydantic import BaseModel, field_serializer +from pydantic import BaseModel class ValidatorTypes(str, Enum): @@ -14,24 +13,12 @@ class BaseField(BaseModel): field_name: str source_type: Type - @field_serializer('source_type') - def serialize_source_type(self, source_type: Type, _info): - return str(source_type) - class InferredField(BaseModel): inferred_type: Optional[Type] = None inferred_virtual_type: Optional[Type] = None inferred_pattern: Optional[str] = None - @field_serializer('inferred_type') - def serialize_inferred_type(self, inferred_type: Type, _info): - return str(inferred_type) - - @field_serializer('inferred_virtual_type') - def serialize_inferred_virtual_type(self, inferred_virtual_type: Type, _info): - return str(inferred_virtual_type) - class Field(InferredField, BaseField): pass diff --git a/src/shmessy/validators/base.py b/src/shmessy/validators/base.py index c048c28..aac1456 100644 --- a/src/shmessy/validators/base.py +++ b/src/shmessy/validators/base.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod -from typing import Optional, Type, Any +from typing import Optional, Type + +from numpy import issubdtype, ndarray, number, object_, str_ from pandas import Series -from numpy import ndarray, issubdtype, number, object_, str_ from ..schema import InferredField, ValidatorTypes @@ -18,9 +19,12 @@ def fix(self, column: Series, sample_size: int) -> Series: pass def check_validation_type(self, dtype: Type) -> None: - if self.validator_type == ValidatorTypes.NUMERIC and not issubdtype(dtype, number): + if self.validator_type == ValidatorTypes.NUMERIC and not issubdtype( + dtype, number + ): raise ValueError(f"NUMERIC validation is not supported for {dtype}") - if (self.validator_type == ValidatorTypes.STRING and - not (issubdtype(dtype, object_) or issubdtype(dtype, str_))): + if self.validator_type == ValidatorTypes.STRING and not ( + issubdtype(dtype, object_) or issubdtype(dtype, str_) + ): raise ValueError(f"STRING validation is not supported for {dtype}") diff --git a/src/shmessy/validators/date.py b/src/shmessy/validators/date.py index 9304025..1ceb12c 100644 --- a/src/shmessy/validators/date.py +++ b/src/shmessy/validators/date.py @@ -1,23 +1,32 @@ -from datetime import datetime, date +from datetime import date, datetime from typing import Optional from numpy import ndarray -from pandas import to_datetime, Series +from pandas import Series, to_datetime -from .base import BaseValidator from ..schema import InferredField, ValidatorTypes +from .base import BaseValidator class Validator(BaseValidator): validator_type = ValidatorTypes.STRING ignore_nan: bool = True patterns: list[str] = [ - "%m/%d/%Y", "%m-%d-%Y", "%m.%d.%Y", - "%m/%d/%y", "%m-%d-%y", "%m.%d.%y", - "%Y/%m/%d", "%Y-%m-%d", "%Y.%m.%d", - "%d/%m/%Y", "%d-%m-%Y", "%d.%m.%Y", - "%d/%b/%Y", "%d-%b-%Y", - "%Y-%m" + "%m/%d/%Y", + "%m-%d-%Y", + "%m.%d.%Y", + "%m/%d/%y", + "%m-%d-%y", + "%m.%d.%y", + "%Y/%m/%d", + "%Y-%m-%d", + "%Y.%m.%d", + "%d/%m/%Y", + "%d-%m-%Y", + "%d.%m.%Y", + "%d/%b/%Y", + "%d-%b-%Y", + "%Y-%m", ] def validate(self, data: ndarray) -> Optional[InferredField]: @@ -38,10 +47,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]: if not self.ignore_nan: valid = False if valid: - return InferredField( - inferred_type=date, - inferred_pattern=pattern - ) + return InferredField(inferred_type=date, inferred_pattern=pattern) def fix(self, column: Series, sample_size: int) -> Series: sample_data = column[:sample_size] diff --git a/src/shmessy/validators/datetime.py b/src/shmessy/validators/datetime.py index fe494ad..bf8966e 100644 --- a/src/shmessy/validators/datetime.py +++ b/src/shmessy/validators/datetime.py @@ -4,19 +4,25 @@ from numpy import ndarray from pandas import Series, to_datetime -from .base import BaseValidator from ..schema import InferredField, ValidatorTypes +from .base import BaseValidator class Validator(BaseValidator): validator_type = ValidatorTypes.STRING ignore_nan: bool = True patterns: list[str] = [ - "%m/%d/%y %H:%M:%S", "%m-%d-%y %H:%M:%S", - "%m/%d/%Y %H:%M:%S", "%m-%d-%Y %H:%M:%S", - "%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M:%S", - "%Y-%m-%d %H:%M:%SZ", "%Y-%m-%dT%H:%M:%SZ", - "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%d %H:%M:%S.%fZ", "%Y-%m-%d %H:%M:%S.%f", + "%m/%d/%y %H:%M:%S", + "%m-%d-%y %H:%M:%S", + "%m/%d/%Y %H:%M:%S", + "%m-%d-%Y %H:%M:%S", + "%Y/%m/%d %H:%M:%S", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%d %H:%M:%SZ", + "%Y-%m-%dT%H:%M:%SZ", + "%Y-%m-%dT%H:%M:%S.%f", + "%Y-%m-%d %H:%M:%S.%fZ", + "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%fZ", ] @@ -38,10 +44,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]: if not self.ignore_nan: valid = False if valid: - return InferredField( - inferred_type=datetime, - inferred_pattern=pattern - ) + return InferredField(inferred_type=datetime, inferred_pattern=pattern) def fix(self, column: Series, sample_size: int) -> Series: sample_data = column[:sample_size] diff --git a/src/shmessy/validators/email.py b/src/shmessy/validators/email.py index 4430e3c..cf14f75 100644 --- a/src/shmessy/validators/email.py +++ b/src/shmessy/validators/email.py @@ -2,10 +2,10 @@ from numpy import ndarray from pandas import Series -from pydantic import EmailStr, BaseModel +from pydantic import BaseModel, EmailStr -from .base import BaseValidator from ..schema import InferredField, ValidatorTypes +from .base import BaseValidator class Model(BaseModel): @@ -26,10 +26,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]: Model(email=value) except ValueError: return None - return InferredField( - inferred_type=str, - inferred_virtual_type=EmailStr - ) + return InferredField(inferred_type=str, inferred_virtual_type=EmailStr) def fix(self, column: Series, sample_size: int) -> Series: sample_data = column[:sample_size] diff --git a/src/shmessy/validators/ipv4_address.py b/src/shmessy/validators/ipv4_address.py index 636f666..e2c006d 100644 --- a/src/shmessy/validators/ipv4_address.py +++ b/src/shmessy/validators/ipv4_address.py @@ -5,8 +5,8 @@ from pydantic import BaseModel from pydantic.networks import IPv4Address # noqa -from .base import BaseValidator from ..schema import InferredField, ValidatorTypes +from .base import BaseValidator class Model(BaseModel): @@ -27,10 +27,7 @@ def validate(self, data: ndarray) -> Optional[InferredField]: Model(ip=value) except ValueError: return None - return InferredField( - inferred_type=str, - inferred_virtual_type=IPv4Address - ) + return InferredField(inferred_type=str, inferred_virtual_type=IPv4Address) def fix(self, column: Series, sample_size: int) -> Series: sample_data = column[:sample_size] diff --git a/src/shmessy/validators/unix_timestamp.py b/src/shmessy/validators/unix_timestamp.py index 9c2e694..3b49d3d 100644 --- a/src/shmessy/validators/unix_timestamp.py +++ b/src/shmessy/validators/unix_timestamp.py @@ -1,14 +1,14 @@ import logging +import math from datetime import datetime +from enum import Enum from typing import Optional -import math from numpy import ndarray from pandas import Series, to_datetime -from enum import Enum -from .base import BaseValidator from ..schema import InferredField, ValidatorTypes +from .base import BaseValidator logger = logging.getLogger(__name__) @@ -35,7 +35,9 @@ def _unix_timestamp_resolution(value: float) -> TimestampResolution: return TimestampResolution.NANOSECONDS @staticmethod - def _fix_input_resolution(value: float, selected_resolution: TimestampResolution) -> float: + def _fix_input_resolution( + value: float, selected_resolution: TimestampResolution + ) -> float: if selected_resolution == TimestampResolution.SECONDS: return value if selected_resolution == TimestampResolution.MILLISECONDS: @@ -51,13 +53,17 @@ def validate(self, data: ndarray) -> Optional[InferredField]: return None for value in data: if not math.isnan(value): - parsed_value = datetime.utcfromtimestamp(self._fix_input_resolution(value, selected_resolution)) - if parsed_value.year < self.min_valid_year or parsed_value.year > self.max_valid_year: + parsed_value = datetime.utcfromtimestamp( + self._fix_input_resolution(value, selected_resolution) + ) + if ( + parsed_value.year < self.min_valid_year + or parsed_value.year > self.max_valid_year + ): return None return InferredField( - inferred_type=datetime, - inferred_pattern=selected_resolution + inferred_type=datetime, inferred_pattern=selected_resolution ) except ValueError: return None diff --git a/src/shmessy/validators_handler.py b/src/shmessy/validators_handler.py index fa47ea5..209be6c 100644 --- a/src/shmessy/validators_handler.py +++ b/src/shmessy/validators_handler.py @@ -2,7 +2,7 @@ import os from importlib import import_module from types import ModuleType -from typing import List, Optional, Any +from typing import Any, List, Optional from numpy import ndarray @@ -21,7 +21,7 @@ def __init__(self): @classmethod def _discover_validators(cls) -> List[BaseValidator]: - validators: List[BaseValidator] = list() + validators: List[BaseValidator] = [] root_directory = os.path.join(os.path.dirname(__file__)) validators_directory = os.path.join(root_directory, cls.VALIDATORS_DIR) @@ -65,7 +65,4 @@ def infer_field(self, field_name: str, data: ndarray) -> Field: inferred_pattern=inferred.inferred_pattern, ) - return Field( - field_name=field_name, - source_type=data.dtype.type - ) + return Field(field_name=field_name, source_type=data.dtype.type) diff --git a/tests/unit/test_validators_handler.py b/tests/unit/test_validators_handler.py new file mode 100644 index 0000000..1ed775c --- /dev/null +++ b/tests/unit/test_validators_handler.py @@ -0,0 +1,29 @@ +import pytest + +from shmessy.validators_handler import ValidatorsHandler +from parametrization import Parametrization + + +@pytest.fixture +def validators_handler() -> ValidatorsHandler: + return ValidatorsHandler() + + +@Parametrization.autodetect_parameters() +@Parametrization.case( + name="init file should not identified as validator", + filename="__init__.py", + expected_output=False, +) +@Parametrization.case( + name="Ignore base class - Should not considered as validator", + filename="base.py", + expected_output=False, +) +@Parametrization.case( + name="Legit validator", + filename="datetime.py", + expected_output=True, +) +def test_is_validator(filename, expected_output, validators_handler): + assert validators_handler._is_validator(filename) == expected_output