From dc54846f487ada4f5f58e00fbb1e0eedf833461d Mon Sep 17 00:00:00 2001
From: Teddy Crepineau <teddy.crepineau@gmail.com>
Date: Thu, 15 Feb 2024 08:28:07 +0100
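Subject: [PATCH 1/2] fix: column value test for SQA types

The SQLAlchemy TableColumnToMatchSetValidator._run_results was annotated
as returning Optional[int] and returned the inspected table's column
collection as-is. It now returns the column names as a list
(list(names.keys())). The _run_results signatures in the base, pandas
and SQLAlchemy validators are annotated as returning List[str].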
---
 .../validations/table/base/tableColumnToMatchSet.py        | 3 ++-
 .../validations/table/pandas/tableColumnToMatchSet.py      | 4 ++--
 .../validations/table/sqlalchemy/tableColumnToMatchSet.py  | 7 +++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py
index d135109351af..983b40313e79 100644
--- a/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py
+++ b/ingestion/src/metadata/data_quality/validations/table/base/tableColumnToMatchSet.py
@@ -16,6 +16,7 @@
 import collections
 import traceback
 from abc import abstractmethod
+from typing import List
 
 from metadata.data_quality.validations.base_test_handler import BaseTestValidator
 from metadata.generated.schema.tests.basic import (
@@ -93,5 +94,5 @@ def run_validation(self) -> TestCaseResult:
         )
 
     @abstractmethod
-    def _run_results(self):
+    def _run_results(self) -> List[str]:
         raise NotImplementedError
diff --git a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
index ad1718c50c99..42cf6e9dd293 100644
--- a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
+++ b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
@@ -13,7 +13,7 @@
 Validator for table column name to match set test case
 """
 
-
+from typing import List
 from metadata.data_quality.validations.mixins.pandas_validator_mixin import (
     PandasValidatorMixin,
 )
@@ -30,7 +30,7 @@ class TableColumnToMatchSetValidator(
 ):
     """Validator table column name to match set test case"""
 
-    def _run_results(self):
+    def _run_results(self) -> List[str]:
         """compute result of the test case"""
         names = list(self.runner[0].columns)
         if not names:
diff --git a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
index 32743f2d441e..8c6a76cb2114 100644
--- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
+++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
@@ -14,9 +14,10 @@
 """
 
 
-from typing import Optional
+from typing import List, cast
 
 from sqlalchemy import inspect
+from sqlalchemy.sql.base import ColumnCollection
 
 from metadata.data_quality.validations.mixins.sqa_validator_mixin import (
     SQAValidatorMixin,
@@ -34,11 +35,13 @@ class TableColumnToMatchSetValidator(
 ):
     """Validator for table column name to match set test case"""
 
-    def _run_results(self) -> Optional[int]:
+    def _run_results(self) -> List[str]:
         """compute result of the test case"""
         names = inspect(self.runner.table).c
         if not names:
             raise ValueError(
                 f"Column names for test case {self.test_case.name} returned None"
             )
+        names = cast(ColumnCollection, names) # satisfy type checker for names.keys() access
+        names = list(names.keys())
         return names

From b1dbfbeee93d19b2a1cbae11e7b705c2c3c23e55 Mon Sep 17 00:00:00 2001
From: Teddy Crepineau <teddy.crepineau@gmail.com>
Date: Thu, 15 Feb 2024 08:38:50 +0100
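Subject: [PATCH 2/2] style: ran python linting

Linting cleanup only, no functional change intended:
- add a blank line after the typing import in the pandas
  tableColumnToMatchSet validator;
- wrap the cast() call in the SQLAlchemy tableColumnToMatchSet validator;
- rename the loop variable `cf` to `column_family` in the BigTable
  metadata source and models.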
---
 .../validations/table/pandas/tableColumnToMatchSet.py  |  1 +
 .../table/sqlalchemy/tableColumnToMatchSet.py          |  4 +++-
 .../ingestion/source/database/bigtable/metadata.py     |  4 ++--
 .../ingestion/source/database/bigtable/models.py       | 10 +++++-----
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
index 42cf6e9dd293..efc09df5776a 100644
--- a/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
+++ b/ingestion/src/metadata/data_quality/validations/table/pandas/tableColumnToMatchSet.py
@@ -14,6 +14,7 @@
 """
 
 from typing import List
+
 from metadata.data_quality.validations.mixins.pandas_validator_mixin import (
     PandasValidatorMixin,
 )
diff --git a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
index 8c6a76cb2114..4fc678f23417 100644
--- a/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
+++ b/ingestion/src/metadata/data_quality/validations/table/sqlalchemy/tableColumnToMatchSet.py
@@ -42,6 +42,8 @@ def _run_results(self) -> List[str]:
             raise ValueError(
                 f"Column names for test case {self.test_case.name} returned None"
             )
-        names = cast(ColumnCollection, names) # satisfy type checker for names.keys() access
+        names = cast(
+            ColumnCollection, names
+        )  # satisfy type checker for names.keys() access
         names = list(names.keys())
         return names
diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py
index a2a072db53f0..f635b9de7d65 100644
--- a/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/database/bigtable/metadata.py
@@ -158,10 +158,10 @@ def get_table_columns_dict(
             records = [{"row_key": b"row_key"}]
             # In order to get a "good" sample of data, we try to distribute the sampling
             # across multiple column families.
-            for cf in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
+            for column_family in list(column_families.keys())[:MAX_COLUMN_FAMILIES]:
                 records.extend(
                     self._get_records_for_column_family(
-                        table, cf, SAMPLES_PER_COLUMN_FAMILY
+                        table, column_family, SAMPLES_PER_COLUMN_FAMILY
                     )
                 )
                 if len(records) >= GLOBAL_SAMPLE_SIZE:
diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py
index f8da387c8a55..146cc658b00b 100644
--- a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py
+++ b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py
@@ -39,22 +39,22 @@ class Row(BaseModel):
     @classmethod
     def from_partial_row(cls, row: PartialRowData):
         cells = {}
-        for cf, cf_cells in row.cells.items():
-            cells.setdefault(cf, {})
+        for column_family, cf_cells in row.cells.items():
+            cells.setdefault(column_family, {})
             for column, cell in cf_cells.items():
-                cells[cf][column] = Cell(
+                cells[column_family][column] = Cell(
                     values=[Value(timestamp=c.timestamp, value=c.value) for c in cell]
                 )
         return cls(cells=cells, row_key=row.row_key)
 
     def to_record(self) -> Dict[str, bytes]:
         record = {}
-        for cf, cells in self.cells.items():
+        for column_family, cells in self.cells.items():
             for column, cell in cells.items():
                 # Since each cell can have multiple values and the API returns them in descending order
                 # from latest to oldest, we only take the latest value. This probably does not matter since
                 # all we care about is data types and all data stored in BigTable is of type `bytes`.
-                record[f"{cf}.{column.decode()}"] = cell.values[0].value
+                record[f"{column_family}.{column.decode()}"] = cell.values[0].value
         record["row_key"] = self.row_key
 
         return record