Skip to content

Commit

Permalink
🐛Source Google Sheets: fix names conversion (airbytehq#34376)
Browse files Browse the repository at this point in the history
  • Loading branch information
Anton Karpets authored and jatinyadav-cc committed Feb 26, 2024
1 parent 3fb8ebb commit 6471eb2
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
dockerImageTag: 0.3.12
dockerImageTag: 0.3.13
dockerRepository: airbyte/source-google-sheets
documentationUrl: https://docs.airbyte.com/integrations/sources/google-sheets
githubIssueLabel: source-google-sheets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def headers_to_airbyte_stream(logger: AirbyteLogger, sheet_name: str, header_row
"""
fields, duplicate_fields = Helpers.get_valid_headers_and_duplicates(header_row_values)
if duplicate_fields:
logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them :{duplicate_fields}")
logger.warn(f"Duplicate headers found in {sheet_name}. Ignoring them: {duplicate_fields}")

sheet_json_schema = {
"$schema": "http://json-schema.org/draft-07/schema#",
Expand Down Expand Up @@ -85,8 +85,8 @@ def get_valid_headers_and_duplicates(header_row_values: List[str]) -> (List[str]
@staticmethod
def get_formatted_row_values(row_data: RowData) -> List[str]:
"""
Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet. It can be a raw
string input by the user, or the result of a sheets function call.
Gets the formatted values of all cell data in this row. A formatted value is the final value a user sees in a spreadsheet.
It can be a raw string input by the user, or the result of a sheets function call.
"""
return [value.formattedValue for value in row_data.values]

Expand Down Expand Up @@ -114,7 +114,8 @@ def get_first_row(client, spreadsheet_id: str, sheet_name: str) -> List[str]:

first_row_data = all_row_data[0]

return Helpers.get_formatted_row_values(first_row_data)
# When a column is deleted by backspace, a cell with None value is returned, so should be filtered here
return [header_cell for header_cell in Helpers.get_formatted_row_values(first_row_data) if header_cell is not None]

@staticmethod
def parse_sheet_and_column_names_from_catalog(catalog: ConfiguredAirbyteCatalog) -> Dict[str, FrozenSet[str]]:
Expand Down Expand Up @@ -151,6 +152,9 @@ def get_available_sheets_to_column_index_to_name(
first_row = Helpers.get_first_row(client, spreadsheet_id, sheet)
if names_conversion:
first_row = [safe_name_conversion(h) for h in first_row]
# When performing names conversion, they won't match what is listed in catalog for the majority of cases,
# so they should be cast here in order to have them in records
columns = {safe_name_conversion(c) for c in columns if c is not None}
# Find the column index of each header value
idx = 0
for cell_value in first_row:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
DEFAULT_SEPARATOR = "_"


def name_conversion(text):
def name_conversion(text: str) -> str:
"""
convert name using a set of rules, for example: '1MyName' -> '_1_my_name'
"""
Expand All @@ -36,7 +36,9 @@ def name_conversion(text):
return text


def safe_name_conversion(text):
def safe_name_conversion(text: str) -> str:
if not text:
return text
new = name_conversion(text)
if not new:
raise Exception(f"initial string '{text}' converted to empty")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_headers_to_airbyte_stream(self):
actual_stream = Helpers.headers_to_airbyte_stream(logger, sheet_name, header_values)
self.assertEqual(expected_stream, actual_stream)

def test_duplicate_headers_retrived(self):
def test_duplicate_headers_retrieved(self):
header_values = ["h1", "h1", "h3"]

expected_valid_header_values = ["h3"]
Expand Down Expand Up @@ -266,10 +266,35 @@ def mock_client_call(spreadsheetId, includeGridData, ranges=None):
with patch.object(GoogleSheetsClient, "__init__", lambda s, credentials, scopes: None):
sheet_client = GoogleSheetsClient({"fake": "credentials"}, ["auth_scopes"])
sheet_client.client = client

expected = {sheet1: {0: "1", 1: "2", 2: "3", 3: "4"}}

# names_conversion = False
actual = Helpers.get_available_sheets_to_column_index_to_name(
sheet_client, spreadsheet_id, {sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])}
client=sheet_client,
spreadsheet_id=spreadsheet_id,
requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
)
self.assertEqual(expected, actual)

# names_conversion = False, with null header cell
sheet1_first_row = ["1", "2", "3", "4", None]
actual = Helpers.get_available_sheets_to_column_index_to_name(
client=sheet_client,
spreadsheet_id=spreadsheet_id,
requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
)
self.assertEqual(expected, actual)

# names_conversion = True, with null header cell
sheet1_first_row = ["AB", "Some Header", "Header", "4", "1MyName", None]
expected = {sheet1: {0: "ab", 1: "some_header", 2: "header", 3: "_4", 4: "_1_my_name"}}
actual = Helpers.get_available_sheets_to_column_index_to_name(
client=sheet_client,
spreadsheet_id=spreadsheet_id,
requested_sheets_and_columns={sheet1: frozenset(sheet1_first_row), "doesnotexist": frozenset(["1", "2"])},
names_conversion=True,
)
expected = {sheet1: {0: "1", 1: "2", 2: "3", 3: "4"}}

self.assertEqual(expected, actual)

Expand Down
5 changes: 3 additions & 2 deletions docs/integrations/sources/google-sheets.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,9 @@ Airbyte batches requests to the API in order to efficiently pull data and respec

| Version | Date | Pull Request | Subject |
|---------|------------|----------------------------------------------------------|-----------------------------------------------------------------------------------|
| 0.3.12 | 2023-12-14 | [33414](https://github.com/airbytehq/airbyte/pull/33414) | Prepare for airbyte-lib |
| 0.3.11 | 2023-10-19 | [31599](https://github.com/airbytehq/airbyte/pull/31599) | Base image migration: remove Dockerfile and use the python-connector-base image |
| 0.3.13 | 2024-01-19 | [34376](https://github.com/airbytehq/airbyte/pull/34376) | Fix names conversion |
| 0.3.12 | 2023-12-14 | [33414](https://github.com/airbytehq/airbyte/pull/33414) | Prepare for airbyte-lib |
| 0.3.11 | 2023-10-19 | [31599](https://github.com/airbytehq/airbyte/pull/31599) | Base image migration: remove Dockerfile and use the python-connector-base image |
| 0.3.10 | 2023-09-27 | [30487](https://github.com/airbytehq/airbyte/pull/30487) | Fix bug causing rows to be skipped when batch size increased due to rate limits. |
| 0.3.9 | 2023-09-25 | [30749](https://github.com/airbytehq/airbyte/pull/30749) | Performance testing - include socat binary in docker image |
| 0.3.8 | 2023-09-25 | [30747](https://github.com/airbytehq/airbyte/pull/30747) | Performance testing - include socat binary in docker image |
Expand Down

0 comments on commit 6471eb2

Please sign in to comment.