From dc54846f487ada4f5f58e00fbb1e0eedf833461d Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Thu, 15 Feb 2024 08:28:07 +0100 Subject: [PATCH 1/2] fix: column value test for SQA types --- .../validations/table/base/tableColumnToMatchSet.py | 3 ++- .../validations/table/pandas/tableColumnToMatchSet.py | 4 ++-- .../validations/table/sqlalchemy/tableColumnToMatchSet.py | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py index d135109351af..983b40313e79 100644 --- a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py @@ -16,6 +16,7 @@ import collections import traceback from abc import abstractmethod +from typing import List from metadata.data_quality.validations.base_test_handler import BaseTestValidator from metadata.generated.schema.tests.basic import ( @@ -93,5 +94,5 @@ def run_validation(self) -> TestCaseResult: ) @abstractmethod - def _run_results(self): + def _run_results(self) -> List[str]: raise NotImplementedError diff --git a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py index ad1718c50c99..42cf6e9dd293 100644 --- a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py @@ -13,7 +13,7 @@ Validator for table column name to match set test case """ - +from typing import List from metadata.data_quality.validations.mixins.pandas_validator_mixin import ( PandasValidatorMixin, ) @@ -30,7 +30,7 @@ class TableColumnToMatchSetValidator( ): """Validator table column name to match set test case""" - def _run_results(self): + def _run_results(self) -> List[str]: """compute result of the test case""" names = list(self.runner[0].columns) if not names: diff --git a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py index 32743f2d441e..8c6a76cb2114 100644 --- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py @@ -14,9 +14,10 @@ """ -from typing import Optional +from typing import List, cast from sqlalchemy import inspect +from sqlalchemy.sql.base import ColumnCollection from metadata.data_quality.validations.mixins.sqa_validator_mixin import ( SQAValidatorMixin, @@ -34,11 +35,13 @@ class TableColumnToMatchSetValidator( ): """Validator for table column name to match set test case""" - def _run_results(self) -> Optional[int]: + def _run_results(self) -> List[str]: """compute result of the test case""" names = inspect(self.runner.table).c if not names: raise ValueError( f"Column names for test case {self.test_case.name} returned None" ) + names = cast(ColumnCollection, names) # satisfy type checker for names.keys() access + names = list(names.keys()) return names From b1dbfbeee93d19b2a1cbae11e7b705c2c3c23e55 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Thu, 15 Feb 2024 08:38:50 +0100 Subject: [PATCH 2/2] style: ran python linting --- .../validations/table/pandas/tableColumnToMatchSet.py | 1 + .../table/sqlalchemy/tableColumnToMatchSet.py | 4 +++- .../ingestion/source/database/bigtable/metadata.py | 4 ++-- .../ingestion/source/database/bigtable/models.py | 10 +++++----- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py index 42cf6e9dd293..efc09df5776a 100644 --- a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py @@ -14,6 +14,7 @@ """ from typing import List + from metadata.data_quality.validations.mixins.pandas_validator_mixin import ( PandasValidatorMixin, ) diff --git a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py index 8c6a76cb2114..4fc678f23417 100644 --- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py +++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py @@ -42,6 +42,8 @@ def _run_results(self) -> List[str]: raise ValueError( f"Column names for test case {self.test_case.name} returned None" ) - names = cast(ColumnCollection, names) # satisfy type checker for names.keys() access + names = cast( + ColumnCollection, names + ) # satisfy type checker for names.keys() access names = list(names.keys()) return names diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py index a2a072db53f0..f635b9de7d65 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py @@ -158,10 +158,10 @@ def get_table_columns_dict( records = [{"row_key": b"row_key"}] # In order to get a "good" sample of data, we try to distribute the sampling # across multiple column families. - for cf in list(column_families.keys())[:MAX_COLUMN_FAMILIES]: + for column_family in list(column_families.keys())[:MAX_COLUMN_FAMILIES]: records.extend( self._get_records_for_column_family( - table, cf, SAMPLES_PER_COLUMN_FAMILY + table, column_family, SAMPLES_PER_COLUMN_FAMILY ) ) if len(records) >= GLOBAL_SAMPLE_SIZE: diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py index f8da387c8a55..146cc658b00b 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py @@ -39,22 +39,22 @@ class Row(BaseModel): @classmethod def from_partial_row(cls, row: PartialRowData): cells = {} - for cf, cf_cells in row.cells.items(): - cells.setdefault(cf, {}) + for column_family, cf_cells in row.cells.items(): + cells.setdefault(column_family, {}) for column, cell in cf_cells.items(): - cells[cf][column] = Cell( + cells[column_family][column] = Cell( values=[Value(timestamp=c.timestamp, value=c.value) for c in cell] ) return cls(cells=cells, row_key=row.row_key) def to_record(self) -> Dict[str, bytes]: record = {} - for cf, cells in self.cells.items(): + for column_family, cells in self.cells.items(): for column, cell in cells.items(): # Since each cell can have multiple values and the API returns them in descending order # from latest to oldest, we only take the latest value. This probably does not matter since # all we care about is data types and all data stored in BigTable is of type `bytes`. - record[f"{cf}.{column.decode()}"] = cell.values[0].value + record[f"{column_family}.{column.decode()}"] = cell.values[0].value record["row_key"] = self.row_key return record