Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

fix/metadata_from_g_sheet: Added checks suggested in google sheet #44

Merged
merged 3 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 62 additions & 18 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ class Settings(BaseSettings):
SERVICE_ACCOUNT_CONF: Dict[str, str] = {"<CHANGE_ME>": "<CHANGE_ME>"}
GSHEET_SCOPES: List[str] = ["https://www.googleapis.com/auth/spreadsheets"]

# Metadata File Parameters
METADATA_COLUMN_ORDER_STRING = ""

class Config:
env_file = ".env"

Expand Down Expand Up @@ -397,7 +400,7 @@ class MetadataSettings(BaseSettings):

SECTOR_KEYWORD = "sector"
ORGANIZATION_KEYWORD = "organization"
SHORT_FORM_KEYWORD = "short_form"
# SHORT_FORM_KEYWORD = "short_form"

DESCRIPTION_KEYWORD = "description"
DATASET_NAME_FOR_FACTLY_KEYWORD = "dataset_name_for_factly"
Expand All @@ -412,66 +415,107 @@ class MetadataSettings(BaseSettings):
VARIABLE_MEASURED_KEYWORD = "variable_measured"
DATA_NEXT_UPDATE_KEYWORD = "data_next_update"
SOURCE_KEYWORD = "source"
SECTOR_EXPECTATION = {
DATASET_NAME_FOR_FACTLY_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "sector_expectation_suite",
"expectation_suite_name": "dataset_name_for_factly_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"expectation_type": "expect_column_value_lengths_to_be_between",
"kwargs": {
"column": "sector",
"value_set": [],
"column": "dataset_name_for_factly",
"min_value": 5,
"max_value": 200,
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Sector Name in set of values",
"expectation_name": "Dataset Name For Factly Length",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Sector Name should be from the Data Dictionary",
"expectation_error_message": "Dataset Name For Factly Length should be less than 200",
},
}
],
}

ORGANIZATION_EXPECTATION = {
DESCRIPTION_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "organization_expectation_suite",
"expectation_suite_name": "description_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_value_lengths_to_be_between",
"kwargs": {
"column": "description",
"min_value": 50,
"max_value": 5000,
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Description Length",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Description should be grater than 50",
},
}
],
}
SECTOR_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "sector_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"kwargs": {
"column": "organization",
"column": "sector",
"value_set": [],
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Organization Name in set of values",
"expectation_name": "Sector Name in set of values",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Organization Name should be from the Data Dictionary",
"expectation_error_message": "Sector Name should be from the Data Dictionary",
},
}
],
}

SHORT_FORM_EXPECTATION = {
ORGANIZATION_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "short_form_expectation_suite",
"expectation_suite_name": "organization_expectation_suite",
"expectations": [
{
"expectation_type": "expect_column_values_to_be_in_set",
"kwargs": {
"column": "short_form",
"column": "organization",
"value_set": [],
"result_format": "SUMMARY",
},
"meta": {
"expectation_name": "Short Form in set of values",
"expectation_name": "Organization Name in set of values",
"cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
"expectation_error_message": "Short Form should be from the Data Dictionary",
"expectation_error_message": "Organization Name should be from the Data Dictionary",
},
}
],
}

# SHORT_FORM_EXPECTATION = {
# "data_asset_type": None,
# "expectation_suite_name": "short_form_expectation_suite",
# "expectations": [
# {
# "expectation_type": "expect_column_values_to_be_in_set",
# "kwargs": {
# "column": "short_form",
# "value_set": [],
# "result_format": "SUMMARY",
# },
# "meta": {
# "expectation_name": "Short Form in set of values",
# "cleaning_pdf_link": "https://wp.me/ad1WQ9-dvg",
# "expectation_error_message": "Short Form should be from the Data Dictionary",
# },
# }
# ],
# }

FREQUENCY_OF_UPDATE_EXPECTATION = {
"data_asset_type": None,
"expectation_suite_name": "frequency_of_update_expectation_suite",
Expand Down
14 changes: 7 additions & 7 deletions app/utils/column_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ async def find_metadata_columns(columns: set):
organization_pattern = re.compile(
r".*({}).*".format(metadata_settings.ORGANIZATION_KEYWORD)
)
short_form_pattern = re.compile(
r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD)
)
# short_form_pattern = re.compile(
# r".*({}).*".format(metadata_settings.SHORT_FORM_KEYWORD)
# )
description_pattern = re.compile(
r".*({}).*".format(metadata_settings.DESCRIPTION_KEYWORD)
)
Expand Down Expand Up @@ -217,9 +217,9 @@ async def find_metadata_columns(columns: set):
organization_column, columns = extract_pattern_from_columns(
columns, organization_pattern
)
short_form_column, columns = extract_pattern_from_columns(
columns, short_form_pattern
)
# short_form_column, columns = extract_pattern_from_columns(
# columns, short_form_pattern
# )
description_column, columns = extract_pattern_from_columns(
columns, description_pattern
)
Expand Down Expand Up @@ -261,7 +261,7 @@ async def find_metadata_columns(columns: set):
return {
"sector": list(sector_column),
"organization": list(organization_column),
"short_form": list(short_form_column),
# "short_form": list(short_form_column),
"description": list(description_column),
"tags": list(tags_column),
"temporal_coverage": list(temporal_coverage_column),
Expand Down
29 changes: 29 additions & 0 deletions app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,21 @@ async def modify_default_expectation_suite(
return expectation_suite


async def modify_column_order_expectation_suite(
    expectation_suite: dict, column_order: list
):
    """Set the expected column order on an expectation suite.

    Every expectation whose ``expectation_type`` is
    ``expect_table_columns_to_match_ordered_list`` gets its
    ``kwargs["column_list"]`` replaced with ``column_order``. All other
    expectations are left untouched and remain in the suite.

    Args:
        expectation_suite: Suite dictionary with an ``"expectations"`` list;
            each entry must have ``"expectation_type"`` and ``"kwargs"`` keys.
            The dictionary is modified in place.
        column_order: Column names in the order the table is expected to
            have them.

    Returns:
        The same ``expectation_suite`` object that was passed in.
    """
    ordered_list_type = "expect_table_columns_to_match_ordered_list"
    for expectation in expectation_suite["expectations"]:
        if expectation["expectation_type"] == ordered_list_type:
            # Store a copy so that later changes to the caller's list (or to
            # another matching expectation) do not leak into this suite.
            expectation["kwargs"]["column_list"] = list(column_order)
    return expectation_suite


async def modify_values_to_be_in_between(
changed_config: dict, default_config: str
):
Expand All @@ -126,6 +141,20 @@ async def modify_values_to_be_in_between(
return default_config


async def modify_values_length_to_be_between(
    changed_config: dict, default_config: dict
):
    """Override the kwargs of value-length expectations in a suite.

    For every expectation in ``default_config["expectations"]`` whose
    ``expectation_type`` is ``expect_column_value_lengths_to_be_between``,
    the expectation's ``kwargs`` are updated with the mapping found under
    that same key in ``changed_config``. Other expectations are not touched.

    Args:
        changed_config: Mapping that holds the override kwargs under the key
            ``"expect_column_value_lengths_to_be_between"``.
        default_config: Suite dictionary with an ``"expectations"`` list.
            It is modified in place.

    Returns:
        The same ``default_config`` object that was passed in.

    Raises:
        KeyError: If a matching expectation exists but ``changed_config``
            has no ``"expect_column_value_lengths_to_be_between"`` entry.
    """
    length_type = "expect_column_value_lengths_to_be_between"
    for expectation in default_config["expectations"]:
        if expectation["expectation_type"] != length_type:
            continue
        # Looked up only when a match exists, so a config without overrides
        # is accepted as long as the suite has no length expectation.
        expectation["kwargs"].update(changed_config[length_type])
    return default_config


async def modify_values_to_be_in_set(
changed_config: dict, default_config: str
):
Expand Down
Loading