Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MINOR - Fix column to match set test #15186

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import collections
import traceback
from abc import abstractmethod
from typing import List

from metadata.data_quality.validations.base_test_handler import BaseTestValidator
from metadata.generated.schema.tests.basic import (
Expand Down Expand Up @@ -93,5 +94,5 @@ def run_validation(self) -> TestCaseResult:
)

@abstractmethod
def _run_results(self):
def _run_results(self) -> List[str]:
raise NotImplementedError
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Validator for table column name to match set test case
"""

from typing import List

from metadata.data_quality.validations.mixins.pandas_validator_mixin import (
PandasValidatorMixin,
Expand All @@ -30,7 +31,7 @@ class TableColumnToMatchSetValidator(
):
"""Validator table column name to match set test case"""

def _run_results(self):
def _run_results(self) -> List[str]:
"""compute result of the test case"""
names = list(self.runner[0].columns)
if not names:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
"""


from typing import Optional
from typing import List, cast

from sqlalchemy import inspect
from sqlalchemy.sql.base import ColumnCollection

from metadata.data_quality.validations.mixins.sqa_validator_mixin import (
SQAValidatorMixin,
Expand All @@ -34,11 +35,15 @@ class TableColumnToMatchSetValidator(
):
"""Validator for table column name to match set test case"""

def _run_results(self) -> Optional[int]:
def _run_results(self) -> List[str]:
"""compute result of the test case"""
names = inspect(self.runner.table).c
if not names:
raise ValueError(
f"Column names for test case {self.test_case.name} returned None"
)
names = cast(
ColumnCollection, names
) # satisfy type checker for names.keys() access
names = list(names.keys())
return names
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,10 @@ def get_table_columns_dict(
records = [{"row_key": b"row_key"}]
# In order to get a "good" sample of data, we try to distribute the sampling
# across multiple column families.
for cf in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
for column_family in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
records.extend(
self._get_records_for_column_family(
table, cf, SAMPLES_PER_COLUMN_FAMILY
table, column_family, SAMPLES_PER_COLUMN_FAMILY
)
)
if len(records) >= GLOBAL_SAMPLE_SIZE:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,22 @@ class Row(BaseModel):
@classmethod
def from_partial_row(cls, row: PartialRowData):
cells = {}
for cf, cf_cells in row.cells.items():
cells.setdefault(cf, {})
for column_family, cf_cells in row.cells.items():
cells.setdefault(column_family, {})
for column, cell in cf_cells.items():
cells[cf][column] = Cell(
cells[column_family][column] = Cell(
values=[Value(timestamp=c.timestamp, value=c.value) for c in cell]
)
return cls(cells=cells, row_key=row.row_key)

def to_record(self) -> Dict[str, bytes]:
record = {}
for cf, cells in self.cells.items():
for column_family, cells in self.cells.items():
for column, cell in cells.items():
# Since each cell can have multiple values and the API returns them in descending order
# from latest to oldest, we only take the latest value. This probably does not matter since
# all we care about is data types and all data stored in BigTable is of type `bytes`.
record[f"{cf}.{column.decode()}"] = cell.values[0].value
record[f"{column_family}.{column.decode()}"] = cell.values[0].value
record["row_key"] = self.row_key

return record
Loading