Skip to content

Commit

Permalink
Merge pull request #50 from factly/feat/check_index_in_columns
Browse files Browse the repository at this point in the history
Added a check for 'index' among the column names
  • Loading branch information
paul-tharun authored Nov 13, 2024
2 parents eb92bd5 + f8e9411 commit 541cfe0
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 5 deletions.
10 changes: 8 additions & 2 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,12 @@ class NoteSettings(BaseSettings):

class CustomExpectationsSettings(BaseSettings):

INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME: str = (
"Index not in Column Names"
)
INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
"Column names should not have 'index' as a column so please rename - {column}"
)
NULL_DATETIME_VALUE_NAME: str = "Null date values Flag - {column}"
NULL_DATETIME_VALUE_MSG: str = (
"Null values should not be permitted for datetime values"
Expand All @@ -330,7 +336,7 @@ class CustomExpectationsSettings(BaseSettings):
"Numeric values in specific pattern - {column}"
)
NUMERIC_EXPECTATION_ERR_MSG: str = (
"Numeric values should be in proper format both integer and float(roundoff to two decimal places)"
"Numeric values should be in proper format both integer and float(round-off to two decimal places)"
)

NEGATIVE_NUMERIC_VALUES_PATTERN = re.compile(r"^-\d+(\.\d{1,})?$")
Expand All @@ -344,7 +350,7 @@ class CustomExpectationsSettings(BaseSettings):
COLUMN_NAMES_PATTERN = re.compile(r"^[a-z]+(?:_[a-z]+)*$")
COLUMN_NAMES_EXPECTATION_NAME: str = "Column names in specific pattern"
COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
"Column names should be in lower case and separated by underscore - {column}"
"Column names should be in lower case and separated by underscore - Example 'Sub Category' column should be written as 'sub_category' The improper columns list is: {column}"
)

TRAIL_OR_LEAD_WHITESPACE_PATTERN = re.compile(r"^\s+.*|.*\s+$")
Expand Down
27 changes: 25 additions & 2 deletions app/expectations/custom_expectations.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,34 @@ def expect_column_names_to_be_in_specific_pattern(
include_meta=True,
find_columns=False,
):
boolean_list = (
boolean_value = (
pd.Series(column_list.columns)
.apply(lambda x: True if pattern.match(str(x)) else False)
.all()
)
boolean_list = pd.Series([boolean_list] * len(column_list))
if not boolean_value:
boolean_list = pd.Series([False] + [True] * (len(column_list) - 1))
else:
boolean_list = pd.Series([boolean_value] * len(column_list))
return boolean_list

@MetaPandasDataset.multicolumn_map_expectation
def expect_index_not_in_column_values(
    self,
    column_list,
    meta={
        "expectation_name": "Column names should not have index as a column",
    },
    include_meta=True,
    find_columns=False,
):
    """Build a per-row boolean Series flagging a column literally named "index".

    Args:
        column_list: Object exposing ``.columns`` and ``len()`` -- presumably
            the DataFrame slice handed in by the decorator; TODO confirm.
        meta: Metadata carrying the expectation name (not read in this body).
        include_meta: Not read in this body.
        find_columns: Not read in this body.

    Returns:
        A ``pd.Series`` of booleans. When no column is named "index" every
        entry is True. Otherwise the first entry is False and the remaining
        ``len(column_list) - 1`` entries are True.
    """
    row_count = len(column_list)
    has_index_column = any(name == "index" for name in column_list.columns)

    if has_index_column:
        # Only the first position is marked False; this is a dataset-level
        # check, so presumably a single failing row is enough to fail it.
        return pd.Series([False] + [True] * (row_count - 1))
    return pd.Series([True] * row_count)
41 changes: 40 additions & 1 deletion app/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,15 @@ async def column_names_expectation_suite(dataset, result_format):
"cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
"expectation_name": custom_settings.COLUMN_NAMES_EXPECTATION_NAME,
"expectation_error_message": custom_settings.COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
column=dataset.columns.tolist()
column=list(
set(dataset.columns.tolist())
- set(
[
i.lower().replace(" ", "_")
for i in dataset.columns.tolist()
]
)
)
),
}
response = {
Expand All @@ -354,6 +362,36 @@ async def column_names_expectation_suite(dataset, result_format):
return response


async def index_not_in_columns_expectation_suite(dataset, result_format):
    """Run the "index not in column names" expectation and package its result.

    Args:
        dataset: Object passed to ``ge.from_pandas``; its ``.columns`` are
            read, so presumably a pandas DataFrame -- TODO confirm.
        result_format: Forwarded unchanged to the expectation call.

    Returns:
        A single-entry dict keyed by
        ``custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME`` whose
        value is the expectation's JSON dict, with its ``meta`` replaced and
        both partial-unexpected lists in ``result`` reset to empty lists.
    """
    suite_name = custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_NAME

    validator = ge.from_pandas(dataset, dataset_class=GenericCustomExpectations)
    outcome = validator.expect_index_not_in_column_values(
        column_list=dataset.columns.tolist(),
        result_format=result_format,
    ).to_json_dict()

    # Columns literally named "index" are interpolated into the error message.
    offending_columns = [
        name for name in dataset.columns.tolist() if name == "index"
    ]
    error_message = (
        custom_settings.INDEX_NOT_IN_COLUMN_NAMES_EXPECTATION_ERR_MSG.format(
            column=offending_columns
        )
    )
    outcome["expectation_config"]["meta"] = {
        "cleaning_pdf_link": settings.DATA_CLEANING_GUIDE_LINK,
        "expectation_name": suite_name,
        "expectation_error_message": error_message,
    }

    # The partial unexpected lists are always emptied before returning.
    result_section = outcome["result"]
    result_section["partial_unexpected_index_list"] = []
    result_section["partial_unexpected_list"] = []

    return {suite_name: outcome}


async def general_table_expectation_suite(dataset, result_format):
"""Chaining all general expectaion suites for Datasets
Expand Down Expand Up @@ -407,6 +445,7 @@ async def general_table_expectation_suite(dataset, result_format):
],
column_names_expectation_suite(dataset, result_format),
observation_more_than_thresh_expectation_suite(dataset, result_format),
index_not_in_columns_expectation_suite(dataset, result_format),
)
expectations = ChainMap(*expectations)
return expectations
Expand Down

0 comments on commit 541cfe0

Please sign in to comment.