🐛Source Google Sheets: fix names conversion (airbytehq#34376)

ollionorg · Feb 26, 2024 · 6471eb2 · 6471eb2
1 parent 3fb8ebb
commit 6471eb2
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 12 deletions.
diff --git a/airbyte-integrations/connectors/source-google-sheets/metadata.yaml b/airbyte-integrations/connectors/source-google-sheets/metadata.yaml
@@ -10,7 +10,7 @@ data:
   connectorSubtype: file
   connectorType: source
   definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
-  dockerImageTag: 0.3.12
+  dockerImageTag: 0.3.13
   dockerRepository: airbyte/source-google-sheets
   documentationUrl: https://docs.airbyte.com/integrations/sources/google-sheets
   githubIssueLabel: source-google-sheets

diff --git a/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/helpers.py b/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/helpers.py
@@ -52,7 +52,7 @@ def headers_to_airbyte_stream(logger: AirbyteLogger, sheet_name: str, header_row
         """
         fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
         if duplicate_fields:
-            logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them :{duplicate_fields}")
+            logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")
 
         sheet_json_schema = {
             "$schema": "http://json-schema.org/draft-07/schema#",
@@ -85,8 +85,8 @@ def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str]
     @staticmethod
     def get_formatted_row_values(row_data: RowData) -> List[str]:
         """
-        Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet. It can be a raw
-        string input by the user, or the result of a sheets function call.
+        Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
+        It can be a raw string input by the user, or the result of a sheets function call.
         """
         return [value.formattedValue for value in row_data.values]
 
@@ -114,7 +114,8 @@ def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:
 
         first_row_data = all_row_data[0]
 
-        return Helpers.get_formatted_row_values(first_row_data)
+        # When a column is deleted by backspace, a cell with None value is returned, so should be filtered here
+        return [header_cell for header_cell in Helpers.get_formatted_row_values(first_row_data) if header_cell is not None]
 
     @staticmethod
     def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
@@ -151,6 +152,9 @@ def get_available_sheets_to_column_index_to_name(
                 first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
                 if names_conversion:
                     first_row = [safe_name_conversion(h) for h in first_row]
+                    # When performing names conversion, they won't match what is listed in catalog for the majority of cases,
+                    # so they should be cast here in order to have them in records
+                    columns = {safe_name_conversion(c) for c in columns if c is not None}
                 # Find the column index of each header value
                 idx = 0
                 for cell_value in first_row:

diff --git a/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/utils.py b/airbyte-integrations/connectors/source-google-sheets/source_google_sheets/utils.py
@@ -12,7 +12,7 @@
 DEFAULT_SEPARATOR = "_"
 
 
-def name_conversion(text):
+def name_conversion(text: str) -> str:
     """
     convert name using a set of rules, for example: '1MyName' -> '_1_my_name'
     """
@@ -36,7 +36,9 @@ def name_conversion(text):
     return text
 
 
-def safe_name_conversion(text):
+def safe_name_conversion(text: str) -> str:
+    if not text:
+        return text
     new = name_conversion(text)
     if not new:
         raise Exception(f"initial string '{text}' converted to empty")

diff --git a/airbyte-integrations/connectors/source-google-sheets/unit_tests/test_helpers.py b/airbyte-integrations/connectors/source-google-sheets/unit_tests/test_helpers.py
@@ -65,7 +65,7 @@ def test_headers_to_airbyte_stream(self):
         actual_stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_values)
         self.assertEqual(expected_stream, actual_stream)
 
-    def test_duplicate_headers_retrived(self):
+    def test_duplicate_headers_retrieved(self):
         header_values = ["h1", "h1", "h3"]
 
         expected_valid_header_values = ["h3"]
@@ -266,10 +266,35 @@ def mock_client_call(spreadsheetId, includeGridData, ranges=None):
         with patch.object(GoogleSheetsClient, "__init__", lambda s, credentials, scopes: None):
             sheet_client = GoogleSheetsClient({"fake": "credentials"}, ["auth_scopes"])
             sheet_client.client = client
+
+        expected = {sheet1: {0: "1", 1: "2", 2: "3", 3: "4"}}
+
+        # names_conversion = False
         actual = Helpers.get_available_sheets_to_column_index_to_name(
-            sheet_client, spreadsheet_id, {sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])}
+            client=sheet_client,
+            spreadsheet_id=spreadsheet_id,
+            requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
+        )
+        self.assertEqual(expected, actual)
+
+        # names_conversion = False, with null header cell
+        sheet1_first_row = ["1", "2", "3", "4", None]
+        actual = Helpers.get_available_sheets_to_column_index_to_name(
+            client=sheet_client,
+            spreadsheet_id=spreadsheet_id,
+            requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
+        )
+        self.assertEqual(expected, actual)
+
+        # names_conversion = True, with null header cell
+        sheet1_first_row = ["AB", "Some Header", "Header", "4", "1MyName", None]
+        expected = {sheet1: {0: "ab", 1: "some_header", 2: "header", 3: "_4", 4: "_1_my_name"}}
+        actual = Helpers.get_available_sheets_to_column_index_to_name(
+            client=sheet_client,
+            spreadsheet_id=spreadsheet_id,
+            requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
+            names_conversion=True,
         )
-        expected = {sheet1: {0: "1", 1: "2", 2: "3", 3: "4"}}
 
         self.assertEqual(expected, actual)
 

diff --git a/docs/integrations/sources/google-sheets.md b/docs/integrations/sources/google-sheets.md
@@ -151,8 +151,9 @@ Airbyte batches requests to the API in order to efficiently pull data and respec
 
 | Version | Date       | Pull Request                                             | Subject                                                                           |
 |---------|------------|----------------------------------------------------------|-----------------------------------------------------------------------------------|
-| 0.3.12 | 2023-12-14 | [33414](https://github.com/airbytehq/airbyte/pull/33414) | Prepare for airbyte-lib |
-| 0.3.11 | 2023-10-19 | [31599](https://github.com/airbytehq/airbyte/pull/31599) | Base image migration: remove Dockerfile and use the python-connector-base image |
+| 0.3.13  | 2024-01-19 | [34376](https://github.com/airbytehq/airbyte/pull/34376) | Fix names conversion                                                              |
+| 0.3.12  | 2023-12-14 | [33414](https://github.com/airbytehq/airbyte/pull/33414) | Prepare for airbyte-lib                                                           |
+| 0.3.11  | 2023-10-19 | [31599](https://github.com/airbytehq/airbyte/pull/31599) | Base image migration: remove Dockerfile and use the python-connector-base image   |
 | 0.3.10  | 2023-09-27 | [30487](https://github.com/airbytehq/airbyte/pull/30487) | Fix bug causing rows to be skipped when batch size increased due to rate limits.  |
 | 0.3.9   | 2023-09-25 | [30749](https://github.com/airbytehq/airbyte/pull/30749) | Performance testing - include socat binary in docker image                        |
 | 0.3.8   | 2023-09-25 | [30747](https://github.com/airbytehq/airbyte/pull/30747) | Performance testing - include socat binary in docker image                        |