Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Source Google Sheets: added logic to skip modified sheet #29246

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,5 @@ COPY source_google_sheets ./source_google_sheets
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

LABEL io.airbyte.version=0.3.1
LABEL io.airbyte.version=0.3.2
LABEL io.airbyte.name=airbyte/source-google-sheets
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
dockerImageTag: 0.3.1
dockerImageTag: 0.3.2
dockerRepository: airbyte/source-google-sheets
githubIssueLabel: source-google-sheets
icon: google-sheets.svg
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import re
from collections import defaultdict
from datetime import datetime
from typing import Dict, FrozenSet, Iterable, List
from typing import Dict, FrozenSet, Iterable, List, Tuple

from airbyte_cdk.logger import AirbyteLogger
from airbyte_cdk.models.airbyte_protocol import AirbyteRecordMessage, AirbyteStream, ConfiguredAirbyteCatalog, SyncMode
Expand Down Expand Up @@ -218,3 +218,11 @@ def get_spreadsheet_id(id_or_url: str) -> str:
return m.group(2)
else:
return id_or_url

@staticmethod
def check_sheet_is_valid(client, spreadsheet_id: str, sheet_name: str) -> Tuple[bool, str]:
try:
Helpers.get_first_row(client, spreadsheet_id, sheet_name)
return True, ""
except Exception as e:
return False, str(e)
Original file line number Diff line number Diff line change
Expand Up @@ -159,32 +159,40 @@ def read(
logger.info(f"Row counts: {sheet_row_counts}")
for sheet in sheet_to_column_index_to_name.keys():
logger.info(f"Syncing sheet {sheet}")
column_index_to_name = sheet_to_column_index_to_name[sheet]
row_cursor = 2 # we start syncing past the header row
# For the loop, it is necessary that the initial row exists when we send a request to the API,
# if the last row of the interval goes outside the sheet - this is normal, we will return
# only the real data of the sheet and in the next iteration we will loop out.
while row_cursor <= sheet_row_counts[sheet]:
range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
logger.info(f"Fetching range {range}")
row_batch = SpreadsheetValues.parse_obj(
client.get_values(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS")
)

row_cursor += row_batch_size + 1
# there should always be one range since we requested only one
value_ranges = row_batch.valueRanges[0]

if not value_ranges.values:
break

row_values = value_ranges.values
if len(row_values) == 0:
break

for row in row_values:
if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
yield AirbyteMessage(type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name))
# We revalidate the sheet here to avoid errors in case the sheet was changed after the sync started
is_valid, reason = Helpers.check_sheet_is_valid(client, spreadsheet_id, sheet)
if is_valid:
column_index_to_name = sheet_to_column_index_to_name[sheet]
row_cursor = 2 # we start syncing past the header row
# For the loop, it is necessary that the initial row exists when we send a request to the API,
# if the last row of the interval goes outside the sheet - this is normal, we will return
# only the real data of the sheet and in the next iteration we will loop out.
while row_cursor <= sheet_row_counts[sheet]:
range = f"{sheet}!{row_cursor}:{row_cursor + row_batch_size}"
logger.info(f"Fetching range {range}")
row_batch = SpreadsheetValues.parse_obj(
client.get_values(spreadsheetId=spreadsheet_id, ranges=range, majorDimension="ROWS")
)

row_cursor += row_batch_size + 1
# there should always be one range since we requested only one
value_ranges = row_batch.valueRanges[0]

if not value_ranges.values:
break

row_values = value_ranges.values
if len(row_values) == 0:
break

for row in row_values:
if not Helpers.is_row_empty(row) and Helpers.row_contains_relevant_data(row, column_index_to_name.keys()):
yield AirbyteMessage(
type=Type.RECORD, record=Helpers.row_data_to_record_message(sheet, row, column_index_to_name)
)
else:
logger.info(f"Skipping syncing sheet {sheet}: {reason}")

logger.info(f"Finished syncing spreadsheet {spreadsheet_id}")

@staticmethod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ def google_sheet_client(row_data, spreadsheet_id, client):
return sheet_client


def google_sheet_invalid_client(spreadsheet_id, client):
fake_response = Spreadsheet(
spreadsheetId=spreadsheet_id,
sheets=[Sheet(data=[])],
)
client.get.return_value.execute.return_value = fake_response
with patch.object(GoogleSheetsClient, "__init__", lambda s, credentials, scopes: None):
sheet_client = GoogleSheetsClient({"fake": "credentials"}, ["auth_scopes"])
sheet_client.client = client
return sheet_client


class TestHelpers(unittest.TestCase):
def test_headers_to_airbyte_stream(self):
sheet_name = "sheet1"
Expand Down Expand Up @@ -190,6 +202,26 @@ def test_get_first_row_empty_sheet(self):
self.assertEqual(Helpers.get_first_row(sheet_client, spreadsheet_id, sheet), [])
client.get.assert_called_with(spreadsheetId=spreadsheet_id, includeGridData=True, ranges=f"{sheet}!1:1")

def test_check_sheet_is_valid(self):
spreadsheet_id = "123"
sheet = "s1"
expected_first_row = ["1", "2", "3", "4"]
row_data = [RowData(values=[CellData(formattedValue=v) for v in expected_first_row])]
client = Mock()
sheet_client = google_sheet_client(row_data, spreadsheet_id, client)
is_valid, reason = Helpers.check_sheet_is_valid(sheet_client, spreadsheet_id, sheet)
self.assertTrue(is_valid)
self.assertEqual(reason, "")

def test_check_sheet_is_valid_empty(self):
spreadsheet_id = "123"
sheet = "s1"
client = Mock()
sheet_client = google_sheet_invalid_client(spreadsheet_id, client)
is_valid, reason = Helpers.check_sheet_is_valid(sheet_client, spreadsheet_id, sheet)
self.assertFalse(is_valid)
self.assertEqual(reason, "Expected data for exactly one range for sheet s1")

def test_get_sheets_in_spreadsheet(self):
spreadsheet_id = "id1"
expected_sheets = ["s1", "s2"]
Expand Down
77 changes: 39 additions & 38 deletions docs/integrations/sources/google-sheets.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,43 +74,44 @@ The [Google API rate limit](https://developers.google.com/sheets/api/limits) is

## Changelog

| Version | Date | Pull Request | Subject |
|---------|------------|----------------------------------------------------------|------------------------------------------------------------------------------------|
| 0.3.1 | 2023-07-06 | [28033](https://github.com/airbytehq/airbyte/pull/28033) | Fixed several reported vulnerabilities (25 total), CVE-2022-37434, CVE-2022-42898 |
| 0.3.0 | 2023-06-26 | [27738](https://github.com/airbytehq/airbyte/pull/27738) | License Update: Elv2 |
| 0.2.39 | 2023-05-31 | [26833](https://github.com/airbytehq/airbyte/pull/26833) | Remove authSpecification in favour of advancedAuth in specification |
| 0.2.38 | 2023-05-16 | [26097](https://github.com/airbytehq/airbyte/pull/26097) | Refactor config error |
| 0.2.37 | 2023-02-21 | [23292](https://github.com/airbytehq/airbyte/pull/23292) | Skip non grid sheets. |
| 0.2.36 | 2023-02-21 | [23272](https://github.com/airbytehq/airbyte/pull/23272) | Handle empty sheets gracefully. |
| 0.2.35 | 2023-02-23 | [23057](https://github.com/airbytehq/airbyte/pull/23057) | Slugify column names |
| 0.2.34 | 2023-02-15 | [23071](https://github.com/airbytehq/airbyte/pull/23071) | Change min spreadsheet id size to 20 symbols |
| 0.2.33 | 2023-02-13 | [23278](https://github.com/airbytehq/airbyte/pull/23278) | Handle authentication errors |
| 0.2.32 | 2023-02-13 | [22884](https://github.com/airbytehq/airbyte/pull/22884) | Do not consume http spreadsheets. |
| 0.2.31 | 2022-10-09 | [19574](https://github.com/airbytehq/airbyte/pull/19574) | Revert 'Add row_id to rows and use as primary key' |
| 0.2.30 | 2022-10-09 | [19215](https://github.com/airbytehq/airbyte/pull/19215) | Add row_id to rows and use as primary key |
| 0.2.21 | 2022-10-04 | [15591](https://github.com/airbytehq/airbyte/pull/15591) | Clean instantiation of AirbyteStream |
| 0.2.20 | 2022-10-10 | [17766](https://github.com/airbytehq/airbyte/pull/17766) | Fix null pointer exception when parsing the spreadsheet id. |
| 0.2.19 | 2022-09-29 | [17410](https://github.com/airbytehq/airbyte/pull/17410) | Use latest CDK. |
| 0.2.18 | 2022-09-28 | [17326](https://github.com/airbytehq/airbyte/pull/17326) | Migrate to per-stream states. |
| 0.2.17 | 2022-08-03 | [15107](https://github.com/airbytehq/airbyte/pull/15107) | Expose Row Batch Size in Connector Specification |
| 0.2.16 | 2022-07-07 | [13729](https://github.com/airbytehq/airbyte/pull/13729) | Improve configuration field description |
| 0.2.15 | 2022-06-02 | [13446](https://github.com/airbytehq/airbyte/pull/13446) | Retry requests resulting in a server error |
| 0.2.13 | 2022-05-06 | [12685](https://github.com/airbytehq/airbyte/pull/12685) | Update CDK to v0.1.56 to emit an `AirbyeTraceMessage` on uncaught exceptions |
| 0.2.12 | 2022-04-20 | [12230](https://github.com/airbytehq/airbyte/pull/12230) | Update connector to use a `spec.yaml` |
| 0.2.11 | 2022-04-13 | [11977](https://github.com/airbytehq/airbyte/pull/11977) | Replace leftover print statement with airbyte logger |
| 0.2.10 | 2022-03-25 | [11404](https://github.com/airbytehq/airbyte/pull/11404) | Allow using Spreadsheet Link/URL instead of Spreadsheet ID |
| 0.2.9 | 2022-01-25 | [9208](https://github.com/airbytehq/airbyte/pull/9208) | Update title and descriptions |
| 0.2.7 | 2021-09-27 | [8470](https://github.com/airbytehq/airbyte/pull/8470) | Migrate to the CDK |
| 0.2.6 | 2021-09-27 | [6354](https://github.com/airbytehq/airbyte/pull/6354) | Support connecting via Oauth webflow |
| 0.2.5 | 2021-09-12 | [5972](https://github.com/airbytehq/airbyte/pull/5972) | Fix full_refresh test by adding supported_sync_modes to Stream initialization |
| 0.2.4 | 2021-08-05 | [5233](https://github.com/airbytehq/airbyte/pull/5233) | Fix error during listing sheets with diagram only |
| 0.2.3 | 2021-06-09 | [3973](https://github.com/airbytehq/airbyte/pull/3973) | Add AIRBYTE_ENTRYPOINT for Kubernetes support |
| 0.2.2 | 2021-04-20 | [2994](https://github.com/airbytehq/airbyte/pull/2994) | Formatting spec |
| 0.2.1 | 2021-04-03 | [2726](https://github.com/airbytehq/airbyte/pull/2726) | Fix base connector versioning |
| 0.2.0 | 2021-03-09 | [2238](https://github.com/airbytehq/airbyte/pull/2238) | Protocol allows future/unknown properties |
| 0.1.7 | 2021-01-21 | [1762](https://github.com/airbytehq/airbyte/pull/1762) | Fix issue large spreadsheet |
| 0.1.6 | 2021-01-27 | [1668](https://github.com/airbytehq/airbyte/pull/1668) | Adopt connector best practices |
| 0.1.5 | 2020-12-30 | [1438](https://github.com/airbytehq/airbyte/pull/1438) | Implement backoff |
| 0.1.4 | 2020-11-30 | [1046](https://github.com/airbytehq/airbyte/pull/1046) | Add connectors using an index YAML file |
| Version | Date | Pull Request | Subject |
|---------|------------|----------------------------------------------------------|-----------------------------------------------------------------------------------|
| 0.3.2 | 2023-08-09 | [29246](https://github.com/airbytehq/airbyte/pull/29246) | Add checking while reading to skip modified sheets |
| 0.3.1 | 2023-07-06 | [28033](https://github.com/airbytehq/airbyte/pull/28033) | Fixed several reported vulnerabilities (25 total), CVE-2022-37434, CVE-2022-42898 |
| 0.3.0 | 2023-06-26 | [27738](https://github.com/airbytehq/airbyte/pull/27738) | License Update: Elv2 |
| 0.2.39 | 2023-05-31 | [26833](https://github.com/airbytehq/airbyte/pull/26833) | Remove authSpecification in favour of advancedAuth in specification |
| 0.2.38 | 2023-05-16 | [26097](https://github.com/airbytehq/airbyte/pull/26097) | Refactor config error |
| 0.2.37 | 2023-02-21 | [23292](https://github.com/airbytehq/airbyte/pull/23292) | Skip non grid sheets. |
| 0.2.36 | 2023-02-21 | [23272](https://github.com/airbytehq/airbyte/pull/23272) | Handle empty sheets gracefully. |
| 0.2.35 | 2023-02-23 | [23057](https://github.com/airbytehq/airbyte/pull/23057) | Slugify column names |
| 0.2.34 | 2023-02-15 | [23071](https://github.com/airbytehq/airbyte/pull/23071) | Change min spreadsheet id size to 20 symbols |
| 0.2.33 | 2023-02-13 | [23278](https://github.com/airbytehq/airbyte/pull/23278) | Handle authentication errors |
| 0.2.32 | 2023-02-13 | [22884](https://github.com/airbytehq/airbyte/pull/22884) | Do not consume http spreadsheets. |
| 0.2.31 | 2022-10-09 | [19574](https://github.com/airbytehq/airbyte/pull/19574) | Revert 'Add row_id to rows and use as primary key' |
| 0.2.30 | 2022-10-09 | [19215](https://github.com/airbytehq/airbyte/pull/19215) | Add row_id to rows and use as primary key |
| 0.2.21 | 2022-10-04 | [15591](https://github.com/airbytehq/airbyte/pull/15591) | Clean instantiation of AirbyteStream |
| 0.2.20 | 2022-10-10 | [17766](https://github.com/airbytehq/airbyte/pull/17766) | Fix null pointer exception when parsing the spreadsheet id. |
| 0.2.19 | 2022-09-29 | [17410](https://github.com/airbytehq/airbyte/pull/17410) | Use latest CDK. |
| 0.2.18 | 2022-09-28 | [17326](https://github.com/airbytehq/airbyte/pull/17326) | Migrate to per-stream states. |
| 0.2.17 | 2022-08-03 | [15107](https://github.com/airbytehq/airbyte/pull/15107) | Expose Row Batch Size in Connector Specification |
| 0.2.16 | 2022-07-07 | [13729](https://github.com/airbytehq/airbyte/pull/13729) | Improve configuration field description |
| 0.2.15 | 2022-06-02 | [13446](https://github.com/airbytehq/airbyte/pull/13446) | Retry requests resulting in a server error |
| 0.2.13 | 2022-05-06 | [12685](https://github.com/airbytehq/airbyte/pull/12685) | Update CDK to v0.1.56 to emit an `AirbyeTraceMessage` on uncaught exceptions |
| 0.2.12 | 2022-04-20 | [12230](https://github.com/airbytehq/airbyte/pull/12230) | Update connector to use a `spec.yaml` |
| 0.2.11 | 2022-04-13 | [11977](https://github.com/airbytehq/airbyte/pull/11977) | Replace leftover print statement with airbyte logger |
| 0.2.10 | 2022-03-25 | [11404](https://github.com/airbytehq/airbyte/pull/11404) | Allow using Spreadsheet Link/URL instead of Spreadsheet ID |
| 0.2.9 | 2022-01-25 | [9208](https://github.com/airbytehq/airbyte/pull/9208) | Update title and descriptions |
| 0.2.7 | 2021-09-27 | [8470](https://github.com/airbytehq/airbyte/pull/8470) | Migrate to the CDK |
| 0.2.6 | 2021-09-27 | [6354](https://github.com/airbytehq/airbyte/pull/6354) | Support connecting via Oauth webflow |
| 0.2.5 | 2021-09-12 | [5972](https://github.com/airbytehq/airbyte/pull/5972) | Fix full_refresh test by adding supported_sync_modes to Stream initialization |
| 0.2.4 | 2021-08-05 | [5233](https://github.com/airbytehq/airbyte/pull/5233) | Fix error during listing sheets with diagram only |
| 0.2.3 | 2021-06-09 | [3973](https://github.com/airbytehq/airbyte/pull/3973) | Add AIRBYTE_ENTRYPOINT for Kubernetes support |
| 0.2.2 | 2021-04-20 | [2994](https://github.com/airbytehq/airbyte/pull/2994) | Formatting spec |
| 0.2.1 | 2021-04-03 | [2726](https://github.com/airbytehq/airbyte/pull/2726) | Fix base connector versioning |
| 0.2.0 | 2021-03-09 | [2238](https://github.com/airbytehq/airbyte/pull/2238) | Protocol allows future/unknown properties |
| 0.1.7 | 2021-01-21 | [1762](https://github.com/airbytehq/airbyte/pull/1762) | Fix issue large spreadsheet |
| 0.1.6 | 2021-01-27 | [1668](https://github.com/airbytehq/airbyte/pull/1668) | Adopt connector best practices |
| 0.1.5 | 2020-12-30 | [1438](https://github.com/airbytehq/airbyte/pull/1438) | Implement backoff |
| 0.1.4 | 2020-11-30 | [1046](https://github.com/airbytehq/airbyte/pull/1046) | Add connectors using an index YAML file |

<!-- /env:oss -->