From ef41bf928a20fd70ebc70522c656a6be1d1b950d Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 11 Jul 2024 14:26:32 +0300 Subject: [PATCH 01/21] test --- .bumpversion.cfg | 13 --- .dockerignore | 8 -- .editorconfig | 226 ---------------------------------------- .gitignore | 105 ------------------- .pre-commit-config.yaml | 10 -- .prettierignore | 3 - .prettierrc | 10 -- .python-version | 1 - .readthedocs.yaml | 24 ----- .root | 0 10 files changed, 400 deletions(-) delete mode 100644 .bumpversion.cfg delete mode 100644 .dockerignore delete mode 100644 .editorconfig delete mode 100644 .gitignore delete mode 100644 .pre-commit-config.yaml delete mode 100644 .prettierignore delete mode 100644 .prettierrc delete mode 100644 .python-version delete mode 100644 .readthedocs.yaml delete mode 100644 .root diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index a63968d6735a..000000000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,13 +0,0 @@ -[bumpversion] -current_version = 0.63.6 -commit = False -tag = False -parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-[a-z]+)? -serialize = - {major}.{minor}.{patch} - -[bumpversion:file:.bumpversion.cfg] - -[bumpversion:file:gradle.properties] - -[bumpversion:file:run-ab-platform.sh] diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index cc026d5dc6c1..000000000000 --- a/.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ -.dockerignore -.git -.idea -.gradle -**/build -**/node_modules -Dockerfile.* -docker-compose*.yaml diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 9b977f3960d5..000000000000 --- a/.editorconfig +++ /dev/null @@ -1,226 +0,0 @@ -root = true - -[{*.bash,*.sh,*.zsh}] -indent_size = 2 -tab_width = 2 -ij_shell_binary_ops_start_line = false -ij_shell_keep_column_alignment_padding = false -ij_shell_minify_program = false -ij_shell_redirect_followed_by_space = false -ij_shell_switch_cases_indented = false -ij_shell_use_unix_line_separator = true - -[{*.har,*.jsb2,*.jsb3,*.json,.babelrc,.eslintrc,.stylelintrc,bowerrc,jest.config}] -indent_size = 2 -ij_json_keep_blank_lines_in_code = 0 -ij_json_keep_indents_on_empty_lines = false -ij_json_keep_line_breaks = true -ij_json_space_after_colon = true -ij_json_space_after_comma = true -ij_json_space_before_colon = true -ij_json_space_before_comma = false -ij_json_spaces_within_braces = false -ij_json_spaces_within_brackets = false -ij_json_wrap_long_lines = false - -[{*.kt,*.kts}] -indent_style = space -insert_final_newline = true -max_line_length = 100 -indent_size = 4 -ij_continuation_indent_size = 4 -ij_java_names_count_to_use_import_on_demand = 9999 -ij_kotlin_align_in_columns_case_branch = false -ij_kotlin_align_multiline_binary_operation = false -ij_kotlin_align_multiline_extends_list = false -ij_kotlin_align_multiline_method_parentheses = false -ij_kotlin_align_multiline_parameters = true -ij_kotlin_align_multiline_parameters_in_calls = false -ij_kotlin_allow_trailing_comma = true -ij_kotlin_allow_trailing_comma_on_call_site = true -ij_kotlin_assignment_wrap = normal -ij_kotlin_blank_lines_after_class_header = 0 -ij_kotlin_blank_lines_around_block_when_branches = 0 -ij_kotlin_blank_lines_before_declaration_with_comment_or_annotation_on_separate_line = 1 -ij_kotlin_block_comment_at_first_column = true -ij_kotlin_call_parameters_new_line_after_left_paren = true -ij_kotlin_call_parameters_right_paren_on_new_line = false -ij_kotlin_call_parameters_wrap = on_every_item -ij_kotlin_catch_on_new_line = false -ij_kotlin_class_annotation_wrap = 
split_into_lines -ij_kotlin_code_style_defaults = KOTLIN_OFFICIAL -ij_kotlin_continuation_indent_for_chained_calls = true -ij_kotlin_continuation_indent_for_expression_bodies = true -ij_kotlin_continuation_indent_in_argument_lists = true -ij_kotlin_continuation_indent_in_elvis = false -ij_kotlin_continuation_indent_in_if_conditions = false -ij_kotlin_continuation_indent_in_parameter_lists = false -ij_kotlin_continuation_indent_in_supertype_lists = false -ij_kotlin_else_on_new_line = false -ij_kotlin_enum_constants_wrap = off -ij_kotlin_extends_list_wrap = normal -ij_kotlin_field_annotation_wrap = off -ij_kotlin_finally_on_new_line = false -ij_kotlin_if_rparen_on_new_line = false -ij_kotlin_import_nested_classes = false -ij_kotlin_imports_layout = * -ij_kotlin_insert_whitespaces_in_simple_one_line_method = true -ij_kotlin_keep_blank_lines_before_right_brace = 2 -ij_kotlin_keep_blank_lines_in_code = 2 -ij_kotlin_keep_blank_lines_in_declarations = 2 -ij_kotlin_keep_first_column_comment = true -ij_kotlin_keep_indents_on_empty_lines = false -ij_kotlin_keep_line_breaks = true -ij_kotlin_lbrace_on_next_line = false -ij_kotlin_line_comment_add_space = false -ij_kotlin_line_comment_at_first_column = true -ij_kotlin_method_annotation_wrap = split_into_lines -ij_kotlin_method_call_chain_wrap = normal -ij_kotlin_method_parameters_new_line_after_left_paren = true -ij_kotlin_method_parameters_right_paren_on_new_line = true -ij_kotlin_method_parameters_wrap = on_every_item -ij_kotlin_name_count_to_use_star_import = 9999 -ij_kotlin_name_count_to_use_star_import_for_members = 9999 -ij_kotlin_parameter_annotation_wrap = off -ij_kotlin_space_after_comma = true -ij_kotlin_space_after_extend_colon = true -ij_kotlin_space_after_type_colon = true -ij_kotlin_space_before_catch_parentheses = true -ij_kotlin_space_before_comma = false -ij_kotlin_space_before_extend_colon = true -ij_kotlin_space_before_for_parentheses = true -ij_kotlin_space_before_if_parentheses = true -ij_kotlin_space_before_lambda_arrow = true -ij_kotlin_space_before_type_colon = false -ij_kotlin_space_before_when_parentheses = true -ij_kotlin_space_before_while_parentheses = true -ij_kotlin_spaces_around_additive_operators = true -ij_kotlin_spaces_around_assignment_operators = true -ij_kotlin_spaces_around_equality_operators = true -ij_kotlin_spaces_around_function_type_arrow = true -ij_kotlin_spaces_around_logical_operators = true -ij_kotlin_spaces_around_multiplicative_operators = true -ij_kotlin_spaces_around_range = false -ij_kotlin_spaces_around_relational_operators = true -ij_kotlin_spaces_around_unary_operator = false -ij_kotlin_spaces_around_when_arrow = true -ij_kotlin_variable_annotation_wrap = off -ij_kotlin_while_on_new_line = false -ij_kotlin_wrap_elvis_expressions = 1 -ij_kotlin_wrap_expression_body_functions = 1 -ij_kotlin_wrap_first_method_in_call_chain = false - -[{*.markdown,*.md}] -ij_markdown_force_one_space_after_blockquote_symbol = true -ij_markdown_force_one_space_after_header_symbol = true -ij_markdown_force_one_space_after_list_bullet = true -ij_markdown_force_one_space_between_words = true -ij_markdown_keep_indents_on_empty_lines = false -ij_markdown_max_lines_around_block_elements = 1 -ij_markdown_max_lines_around_header = 1 -ij_markdown_max_lines_between_paragraphs = 1 -ij_markdown_min_lines_around_block_elements = 1 -ij_markdown_min_lines_around_header = 1 -ij_markdown_min_lines_between_paragraphs = 1 - -[{*.py,*.pyw,Tiltfile}] -charset = utf-8 -indent_size = 4 -indent_style = space -insert_final_newline = true 
-max_line_length = 140 -tab_width = 4 -ij_continuation_indent_size = 8 -ij_formatter_off_tag = @formatter:off -ij_formatter_on_tag = @formatter:on -ij_formatter_tags_enabled = false -ij_smart_tabs = false -ij_visual_guides = none -ij_wrap_on_typing = false - -ij_python_align_collections_and_comprehensions = true -ij_python_align_multiline_imports = true -ij_python_align_multiline_parameters = true -ij_python_align_multiline_parameters_in_calls = true -ij_python_blank_line_at_file_end = true -ij_python_blank_lines_after_imports = 1 -ij_python_blank_lines_after_local_imports = 0 -ij_python_blank_lines_around_class = 1 -ij_python_blank_lines_around_method = 1 -ij_python_blank_lines_around_top_level_classes_functions = 2 -ij_python_blank_lines_before_first_method = 0 -ij_python_call_parameters_new_line_after_left_paren = false -ij_python_call_parameters_right_paren_on_new_line = false -ij_python_call_parameters_wrap = normal -ij_python_dict_alignment = 0 -ij_python_dict_new_line_after_left_brace = false -ij_python_dict_new_line_before_right_brace = false -ij_python_dict_wrapping = 1 -ij_python_from_import_new_line_after_left_parenthesis = false -ij_python_from_import_new_line_before_right_parenthesis = false -ij_python_from_import_parentheses_force_if_multiline = false -ij_python_from_import_trailing_comma_if_multiline = false -ij_python_from_import_wrapping = 1 -ij_python_hang_closing_brackets = false -ij_python_keep_blank_lines_in_code = 1 -ij_python_keep_blank_lines_in_declarations = 1 -ij_python_keep_indents_on_empty_lines = false -ij_python_keep_line_breaks = true -ij_python_method_parameters_new_line_after_left_paren = false -ij_python_method_parameters_right_paren_on_new_line = false -ij_python_method_parameters_wrap = normal -ij_python_new_line_after_colon = false -ij_python_new_line_after_colon_multi_clause = true -ij_python_optimize_imports_always_split_from_imports = false -ij_python_optimize_imports_case_insensitive_order = false -ij_python_optimize_imports_join_from_imports_with_same_source = false -ij_python_optimize_imports_sort_by_type_first = true -ij_python_optimize_imports_sort_imports = true -ij_python_optimize_imports_sort_names_in_from_imports = false -ij_python_space_after_comma = true -ij_python_space_after_number_sign = true -ij_python_space_after_py_colon = true -ij_python_space_before_backslash = true -ij_python_space_before_comma = false -ij_python_space_before_for_semicolon = false -ij_python_space_before_lbracket = false -ij_python_space_before_method_call_parentheses = false -ij_python_space_before_method_parentheses = false -ij_python_space_before_number_sign = true -ij_python_space_before_py_colon = false -ij_python_space_within_empty_method_call_parentheses = false -ij_python_space_within_empty_method_parentheses = false -ij_python_spaces_around_additive_operators = true -ij_python_spaces_around_assignment_operators = true -ij_python_spaces_around_bitwise_operators = true -ij_python_spaces_around_eq_in_keyword_argument = false -ij_python_spaces_around_eq_in_named_parameter = false -ij_python_spaces_around_equality_operators = true -ij_python_spaces_around_multiplicative_operators = true -ij_python_spaces_around_power_operator = true -ij_python_spaces_around_relational_operators = true -ij_python_spaces_around_shift_operators = true -ij_python_spaces_within_braces = false -ij_python_spaces_within_brackets = false -ij_python_spaces_within_method_call_parentheses = false -ij_python_spaces_within_method_parentheses = false 
-ij_python_use_continuation_indent_for_arguments = false -ij_python_use_continuation_indent_for_collection_and_comprehensions = false -ij_python_use_continuation_indent_for_parameters = true -ij_python_wrap_long_lines = false - -[{*.toml,Cargo.lock,Cargo.toml.orig,Gopkg.lock,Pipfile,poetry.lock}] -ij_toml_keep_indents_on_empty_lines = false - -[{*.yaml,*.yml}] -indent_size = 2 -ij_yaml_align_values_properties = do_not_align -ij_yaml_autoinsert_sequence_marker = true -ij_yaml_block_mapping_on_new_line = false -ij_yaml_indent_sequence_value = true -ij_yaml_keep_indents_on_empty_lines = false -ij_yaml_keep_line_breaks = true -ij_yaml_sequence_on_new_line = false -ij_yaml_space_before_colon = false -ij_yaml_spaces_within_braces = true -ij_yaml_spaces_within_brackets = true diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 67b21afbcb8b..000000000000 --- a/.gitignore +++ /dev/null @@ -1,105 +0,0 @@ -.gradle -.idea -*.iml -*.swp -build -out -.DS_Store -.dockerversions -.classpath -.project -.settings -.vscode -**/gmon.out -static_checker_reports/ - -# Logs -acceptance_tests_logs/ -airbyte_ci_logs/ -live_tests_debug_reports/ -dagger_engine_logs* - -# Secrets -secrets -updated_configurations -!airbyte-integrations/connector-templates/**/secrets - -# Connector debug configs -airbyte-integrations/connectors/**/src/test/resources/debug_resources - -# Python -*.egg-info -__pycache__ -.eggs -.venv -.mypy_cache -.ipynb_checkpoints -.pytest_ - -# Python unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# dbt -profiles.yml - -# Terraform -.terraform/ -crash.log -*.tfstate -*.tfstate.backup -*.lock.hcl - -# Airflow Demo -resources/examples/airflow/logs/* -!resources/examples/airflow/logs/.gitkeep - -# Summary.md keeps getting added and we just don't like it -docs/SUMMARY.md - -# Output Files generated by scripts -lowcode_connector_names.txt -num_lowcode_connectors.csv - -# Helm charts .tgz dependencies -charts/**/charts - -# Snyk -.dccache - -# Datadog -dd-java-agent.jar - -# Files needed to run airbyte-platform that are downloaded on-the-fly via run-ab-platform.sh -/docker-compose.yaml -/docker-compose.debug.yaml -/.env -/.env.dev -/flags.yml -/temporal/dynamicconfig/development.yaml - -# Ignore generated credentials from google-github-actions/auth -gha-creds-*.json - -# Legacy pipeline reports path -tools/ci_connector_ops/pipeline_reports/ - -# ignore local build scan uri output -scan-journal.log - -# connectors' cache -*.sqlite - -# mvn -target/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index 4fd1a68cdc3b..000000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -repos: - - repo: local - hooks: - - id: format-fix-all-on-push - always_run: true - entry: airbyte-ci --disable-update-check format fix all - language: system - name: Run airbyte-ci format fix on git push (~30s) - pass_filenames: false - stages: [push] diff --git a/.prettierignore b/.prettierignore deleted file mode 100644 index 9579ba1a2fc9..000000000000 --- a/.prettierignore +++ /dev/null @@ -1,3 +0,0 @@ -airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output -airbyte-ci/connectors/pipelines/tests/test_changelog/result_files -airbyte-integrations/bases/connector-acceptance-test/unit_tests/data/docs diff --git a/.prettierrc b/.prettierrc deleted file mode 100644 index 31cda2d92570..000000000000 --- 
a/.prettierrc +++ /dev/null @@ -1,10 +0,0 @@ -{ - "overrides": [ - { - "files": "*.md", - "options": { - "proseWrap": "preserve" - } - } - ] -} diff --git a/.python-version b/.python-version deleted file mode 100644 index c8cfe3959183..000000000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10 diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index 5fb58f45882d..000000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# .readthedocs.yaml -# Read the Docs configuration file -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -build: - os: ubuntu-20.04 - tools: - python: "3.9" - -# Build documentation in the docs/ directory with Sphinx -sphinx: - builder: html - configuration: airbyte-cdk/python/reference_docs/_source/conf.py - -# Set the version of Python and requirements required to build your docs -python: - install: - - method: pip - path: airbyte-cdk/python - extra_requirements: - - sphinx-docs diff --git a/.root b/.root deleted file mode 100644 index e69de29bb2d1..000000000000 From 706d5079c96c01a5ceef22303fe789a2022cb535 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 18 Jul 2024 02:51:25 +0300 Subject: [PATCH 02/21] added checkpointing to BULK --- .../shopify_graphql/bulk/job.py | 73 +++++++++--- .../source_shopify/streams/base_streams.py | 7 +- .../source-shopify/unit_tests/conftest.py | 112 ++++++++++++++++++ .../unit_tests/graphql_bulk/test_job.py | 54 +++++++++ 4 files changed, 229 insertions(+), 17 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index cf117b97d73e..9c9920f59b06 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -66,8 +66,15 @@ class ShopifyBulkManager: # 0.1 ~= P2H, default value, lower boundary for slice size _job_size_min: Final[float] = 0.1 - # P365D, upper boundary for slice size - _job_size_max: Final[float] = 365.0 + + # last running job object count + _job_last_rec_count: int = field(init=False, default=0) + # how many records should be collected before we use the checkpoining + _job_checkpoint_interval: Final[int] = 200000 + # the flag to adjust the next slice from the checkpointed cursor vaue + _job_adjust_slice_from_checkpoint: bool = field(init=False, default=False) + # flag to mark the long running BULK job + _job_long_running_cancelation: bool = field(init=False, default=False) # expand slice factor _job_size_expand_factor: int = field(init=False, default=2) @@ -82,6 +89,8 @@ class ShopifyBulkManager: def __post_init__(self): self._http_client = HttpClient(self.stream_name, self.logger, ShopifyErrorHandler(), session=self.session) self._job_size = self.job_size + # The upper boundary for slice size is limited by the value from the config, default value is `P30D` + self._job_size_max = self.job_size # Each job ideally should be executed within the specified time (in sec), # to maximize the performance for multi-connection syncs and control the bulk job size within +- 1 hours (3600 sec), # Ideally the source will balance on it's own rate, based on the time taken to return the data for the slice. 
@@ -143,6 +152,10 @@ def _is_long_running_job(self) -> bool: self._job_should_revert_slice = False return False + @property + def _job_should_checkpoint(self) -> bool: + return self._job_last_rec_count >= self._job_checkpoint_interval + def _expand_job_size(self) -> None: self._job_size += self._job_size_adjusted_expand_factor @@ -176,6 +189,8 @@ def __reset_state(self) -> None: self._job_self_canceled = False # set the running job message counter to default self._log_job_msg_count = 0 + # set the running job object count to default + self._job_last_rec_count = 0 def _job_completed(self) -> bool: return self._job_state == ShopifyBulkJobStatus.COMPLETED.value @@ -206,6 +221,9 @@ def _log_job_state_with_count(self) -> None: self._log_job_msg_count += 1 else: message = f"Elapsed time: {self._job_elapsed_time_in_state} sec" + if self._job_last_rec_count > 0: + count_message = f". Lines collected: {self._job_last_rec_count}" + message = message + count_message self._log_state(message) self._log_job_msg_count = 0 @@ -218,7 +236,7 @@ def _log_state(self, message: Optional[str] = None) -> None: def _job_get_result(self, response: Optional[requests.Response] = None) -> Optional[str]: parsed_response = response.json().get("data", {}).get("node", {}) if response else None - job_result_url = parsed_response.get("url") if parsed_response and not self._job_self_canceled else None + job_result_url = parsed_response.get("url") if parsed_response else None if job_result_url: # save to local file using chunks to avoid OOM filename = self._tools.filename_from_url(job_result_url) @@ -234,8 +252,16 @@ def _job_get_result(self, response: Optional[requests.Response] = None) -> Optio def _job_update_state(self, response: Optional[requests.Response] = None) -> None: if response: self._job_state = response.json().get("data", {}).get("node", {}).get("status") - if self._job_state in [ShopifyBulkJobStatus.RUNNING.value, ShopifyBulkJobStatus.CANCELING.value]: + self._job_last_rec_count = int(response.json().get("data", {}).get("node", {}).get("objectCount", 0)) + + if self._job_state == ShopifyBulkJobStatus.RUNNING.value: self._log_job_state_with_count() + elif self._job_state in [ShopifyBulkJobStatus.CANCELED.value, ShopifyBulkJobStatus.CANCELING.value]: + # do not emit `CANCELED / CANCELING` Bulk Job status, while checkpointing + if self._job_should_checkpoint: + pass + else: + self._log_job_state_with_count() else: self._log_state() @@ -245,19 +271,36 @@ def _on_created_job(self, **kwargs) -> None: def _on_canceled_job(self, response: requests.Response) -> Optional[AirbyteTracedException]: if not self._job_self_canceled: raise ShopifyBulkExceptions.BulkJobCanceled( - f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}", + f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}" ) + else: + if self._job_should_checkpoint: + # set the flag to adjust the next slice from the checkpointed cursor value + self._job_adjust_slice_from_checkpoint = True + # fetch the collected records from CANCELED Job on checkpointing + self._job_result_filename = self._job_get_result(response) def _on_canceling_job(self, **kwargs) -> None: sleep(self._job_check_interval) + def _cancel_on_long_running_job(self) -> None: + self.logger.info( + f"Stream: `{self.stream_name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." 
+ ) + self._job_long_running_cancelation = True + self._job_cancel() + + def _cancel_on_checkpointing(self) -> None: + self.logger.info(f"Stream: `{self.stream_name}`, checkpointing after >= `{self._job_checkpoint_interval}` lines collected.") + # set the flag to adjust the next slice from the checkpointed cursor value + self._job_adjust_slice_from_checkpoint = True + self._job_cancel() + def _on_running_job(self, **kwargs) -> None: if self._is_long_running_job: - self.logger.info( - f"Stream: `{self.stream_name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." - ) - # cancel the long-running bulk job - self._job_cancel() + self._cancel_on_long_running_job() + elif self._job_should_checkpoint: + self._cancel_on_checkpointing() else: sleep(self._job_check_interval) @@ -311,7 +354,6 @@ def _job_track_running(self) -> None: request_kwargs={}, ) self._job_healthcheck(response) - self._job_update_state(response) self._job_state_to_fn_map.get(self._job_state)(response=response) @@ -396,12 +438,15 @@ def get_adjusted_job_start(self, slice_start: datetime) -> datetime: step = self._job_size if self._job_size else self._job_size_min return slice_start.add(days=step) - def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime) -> datetime: + def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, checkpointed_cursor: Optional[datetime] = None) -> datetime: + if self._job_adjust_slice_from_checkpoint: + return pdm.parse(checkpointed_cursor) if checkpointed_cursor else slice_end + if self._is_long_running_job: self._job_size_reduce_next() return slice_start - else: - return slice_end + + return slice_end @limiter.balance_rate_limit(api_type=ApiTypeEnum.graphql.value) def job_check_for_completion(self) -> Optional[str]: diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index cfffc4f34892..f119b784add7 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -215,7 +215,7 @@ def track_checkpoint_cursor(self, record_value: Union[str, int]) -> None: if self.filter_by_state_checkpoint: # set checkpoint cursor if not self._checkpoint_cursor: - self._checkpoint_cursor = self.config.get("start_date") + self._checkpoint_cursor = self.default_state_comparison_value # track checkpoint cursor if record_value >= self._checkpoint_cursor: self._checkpoint_cursor = record_value @@ -629,6 +629,7 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream): data_field = "graphql" parent_stream_class: Optional[Union[ShopifyStream, IncrementalShopifyStream]] = None + filter_by_state_checkpoint = True def __init__(self, config: Dict) -> None: super().__init__(config) @@ -642,7 +643,7 @@ def __init__(self, config: Dict) -> None: query=self.query, job_termination_threshold=float(config.get("job_termination_threshold", 3600)), # overide the default job slice size, if provided (it's auto-adjusted, later on) - job_size=config.get("bulk_window_in_days", 0.0), + job_size=config.get("bulk_window_in_days", 30.0), ) # define Record Producer instance @@ -738,7 +739,7 @@ def stream_slices(self, stream_state: Optional[Mapping[str, Any]] = None, **kwar self.emit_slice_message(start, slice_end) yield {"start": 
start.to_rfc3339_string(), "end": slice_end.to_rfc3339_string()} # increment the end of the slice or reduce the next slice - start = self.job_manager.get_adjusted_job_end(start, slice_end) + start = self.job_manager.get_adjusted_job_end(start, slice_end, self._checkpoint_cursor) else: # for the streams that don't support filtering yield {} diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py index 0eea96d11371..4eac8f69fe76 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py @@ -408,6 +408,118 @@ def bulk_job_running_response(): } }, } + + +@pytest.fixture +def bulk_job_running_with_object_count_and_url_response(): + return { + "data": { + "node": { + "id": "gid://shopify/BulkOperation/4047052112061", + "status": "RUNNING", + "errorCode": None, + "objectCount": "15", + "fileSize": None, + "url": 'https://some_url?response-content-disposition=attachment;+filename="bulk-123456789.jsonl";+filename*=UTF-8', + "partialDataUrl": None, + } + }, + "extensions": { + "cost": { + "requestedQueryCost": 1, + "actualQueryCost": 1, + "throttleStatus": { + "maximumAvailable": 1000.0, + "currentlyAvailable": 999, + "restoreRate": 50.0, + }, + } + }, + } + + +@pytest.fixture +def bulk_job_canceled_with_object_count_and_url_response(): + return { + "data": { + "node": { + "id": "gid://shopify/BulkOperation/4047052112061", + "status": "CANCELED", + "errorCode": None, + "objectCount": "15", + "fileSize": None, + "url": 'https://some_url?response-content-disposition=attachment;+filename="bulk-123456789.jsonl";+filename*=UTF-8', + "partialDataUrl": None, + } + }, + "extensions": { + "cost": { + "requestedQueryCost": 1, + "actualQueryCost": 1, + "throttleStatus": { + "maximumAvailable": 1000.0, + "currentlyAvailable": 999, + "restoreRate": 50.0, + }, + } + }, + } + + +@pytest.fixture +def bulk_job_running_with_object_count_no_url_response(): + return { + "data": { + "node": { + "id": "gid://shopify/BulkOperation/4047052112061", + "status": "RUNNING", + "errorCode": None, + "objectCount": "4", + "fileSize": None, + "url": None, + "partialDataUrl": None, + } + }, + "extensions": { + "cost": { + "requestedQueryCost": 1, + "actualQueryCost": 1, + "throttleStatus": { + "maximumAvailable": 1000.0, + "currentlyAvailable": 999, + "restoreRate": 50.0, + }, + } + }, + } + + +@pytest.fixture +def bulk_job_canceled_with_object_count_no_url_response(): + return { + "data": { + "node": { + "id": "gid://shopify/BulkOperation/4047052112061", + "status": "CANCELED", + "errorCode": None, + "objectCount": "4", + "fileSize": None, + "url": None, + "partialDataUrl": None, + } + }, + "extensions": { + "cost": { + "requestedQueryCost": 1, + "actualQueryCost": 1, + "throttleStatus": { + "maximumAvailable": 1000.0, + "currentlyAvailable": 999, + "restoreRate": 50.0, + }, + } + }, + } @pytest.fixture diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py index dd8985f4ac9e..bfc97eb29d94 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py @@ -3,6 +3,8 @@ # +from os import remove + import pytest import requests from airbyte_protocol.models import SyncMode @@ -237,6 +239,58 @@ def 
test_job_check_with_running_scenario(request, requests_mock, job_response, a assert stream.job_manager._job_state == expected +@pytest.mark.parametrize( + "running_job_response, canceled_job_response, expected", + [ + ( + "bulk_job_running_with_object_count_and_url_response", + "bulk_job_canceled_with_object_count_and_url_response", + "bulk-123456789.jsonl", + ), + ( + "bulk_job_running_with_object_count_no_url_response", + "bulk_job_canceled_with_object_count_no_url_response", + None, + ), + ], + ids=[ + "self-canceled with url", + "self-canceled with no url", + ], +) +def test_job_running_with_canceled_scenario(request, requests_mock, running_job_response, canceled_job_response, auth_config, expected) -> None: + stream = MetafieldOrders(auth_config) + # modify the sleep time for the test + stream.job_manager._job_check_interval = 0 + # get job_id from FIXTURE + job_id = request.getfixturevalue(running_job_response).get("data", {}).get("node", {}).get("id") + # mocking the response for STATUS CHECKS + requests_mock.post( + stream.job_manager.base_url, + [ + {"json": request.getfixturevalue(running_job_response)}, + {"json": request.getfixturevalue(canceled_job_response)}, + ], + ) + job_result_url = request.getfixturevalue(canceled_job_response).get("data", {}).get("node", {}).get("url") + # test the state of the job isn't assigned + assert stream.job_manager._job_state == None + + stream.job_manager._job_id = job_id + stream.job_manager._job_checkpoint_interval = 5 + # faking self-canceled job + stream.job_manager._job_self_canceled = True + # mocking the nested request call to retrieve the data from result URL + requests_mock.get(job_result_url, json=request.getfixturevalue(canceled_job_response)) + # calling the sceario processing + assert not stream.job_manager._job_long_running_cancelation + assert stream.job_manager.job_check_for_completion() == expected + # clean up + if expected: + remove(expected) + + + def test_job_read_file_invalid_filename(mocker, auth_config) -> None: stream = MetafieldOrders(auth_config) expected = "An error occured while producing records from BULK Job result" From aa9b31e130d4bb3ea8188fac3befdc525ee6235f Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 18 Jul 2024 03:13:58 +0300 Subject: [PATCH 03/21] added url_base switch to redirect --- .../connectors/source-shopify/metadata.yaml | 2 +- .../connectors/source-shopify/pyproject.toml | 2 +- .../shopify_graphql/bulk/exceptions.py | 5 +++ .../shopify_graphql/bulk/job.py | 42 +++++++++++++++++-- .../shopify_graphql/bulk/retry.py | 5 +++ .../source_shopify/streams/base_streams.py | 2 +- .../source-shopify/unit_tests/unit_test.py | 12 ++---- 7 files changed, 55 insertions(+), 15 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/metadata.yaml b/airbyte-integrations/connectors/source-shopify/metadata.yaml index 2cf3a19fd9ec..cee6367b3c50 100644 --- a/airbyte-integrations/connectors/source-shopify/metadata.yaml +++ b/airbyte-integrations/connectors/source-shopify/metadata.yaml @@ -11,7 +11,7 @@ data: connectorSubtype: api connectorType: source definitionId: 9da77001-af33-4bcd-be46-6252bf9342b9 - dockerImageTag: 2.4.13 + dockerImageTag: 2.4.14 dockerRepository: airbyte/source-shopify documentationUrl: https://docs.airbyte.com/integrations/sources/shopify githubIssueLabel: source-shopify diff --git a/airbyte-integrations/connectors/source-shopify/pyproject.toml b/airbyte-integrations/connectors/source-shopify/pyproject.toml index 93756757bbd0..393e7b482749 100644 --- 
a/airbyte-integrations/connectors/source-shopify/pyproject.toml +++ b/airbyte-integrations/connectors/source-shopify/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "2.4.13" +version = "2.4.14" name = "source-shopify" description = "Source CDK implementation for Shopify." authors = [ "Airbyte ",] diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/exceptions.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/exceptions.py index 3dcc00d14e52..d666d86a40d6 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/exceptions.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/exceptions.py @@ -52,6 +52,11 @@ class BulkJobCreationFailedConcurrentError(BaseBulkException): failure_type: FailureType = FailureType.transient_error + class BulkJobRedirectToOtherShopError(BaseBulkException): + """Raised when the response contains another shop name""" + + failure_type: FailureType = FailureType.transient_error + class BulkJobConcurrentError(BaseBulkException): """Raised when failing the job after hitting too many BulkJobCreationFailedConcurrentError.""" diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index 9c9920f59b06..0fa34dba5fcb 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -328,12 +328,17 @@ def _on_job_with_errors(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedE def _on_non_handable_job_error(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedException: raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.stream_name}`, Non-handable error occured: {errors}") + def _get_server_errors(self, response: requests.Response) -> List[Optional[dict]]: + server_errors = response.json().get("errors", []) + return [server_errors] if isinstance(server_errors, str) else server_errors + + def _get_user_errors(self, response: requests.Response) -> List[Optional[dict]]: + user_errors = response.json().get("data", {}).get("bulkOperationRunQuery", {}).get("userErrors", []) + return [user_errors] if isinstance(user_errors, str) else user_errors + def _collect_bulk_errors(self, response: requests.Response) -> List[Optional[dict]]: try: - server_errors = response.json().get("errors", []) - user_errors = response.json().get("data", {}).get("bulkOperationRunQuery", {}).get("userErrors", []) - errors = server_errors + user_errors - return errors + return self._get_server_errors(response) + self._get_user_errors(response) except (Exception, JSONDecodeError) as e: raise ShopifyBulkExceptions.BulkJobBadResponse( f"Couldn't check the `response` for `errors`, status: {response.status_code}, response: `{response.text}`. Trace: {repr(e)}." 
@@ -381,6 +386,32 @@ def _has_running_concurrent_job(self, errors: Optional[Iterable[Mapping[str, Any def _has_reached_max_concurrency(self) -> bool: return self._concurrent_attempt == self._concurrent_max_retry + def _switch_base_url(self) -> None: + if self._new_base_url: + self.base_url = self._new_base_url + else: + self.logger.warning(f"Failed switching the `base url`, no `new base url` has been retrieved.") + + def _should_switch_shop_name(self, response: requests.Response) -> bool: + """ + Sometimes the API returns the redirected response that points to the same Store but with different Name: + >> case: + -- The user inputs the `shop name` as "A": + while attempting to place the BULK Job + -- The response contains the redirected results to the `shop name` as "B", like: + response.url == "https://B.myshopify.com" + + This redirection is related to: + 1) `aliased` or `hidden` store names from being exposed + 2) migrated to data to the new store, but referenced within the old one stil. + + reference issue: https://github.com/airbytehq/oncall/issues/5866 + """ + if self.base_url != response.url: + self._new_base_url = response.url + return True + return False + @bulk_retry_on_exception(logger) def _job_check_state(self) -> None: while not self._job_completed(): @@ -408,6 +439,9 @@ def create_job(self, stream_slice: Mapping[str, str], filter_field: str) -> None # when the concurrent job takes place, another job could not be created # we typically need to wait and retry, but no longer than 10 min. (see retry in `bulk_retry_on_exception`) raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.stream_name}") + elif self._should_switch_shop_name(response): + # assign new shop name, since the one that specified in `config` was redirected to the different one. + raise ShopifyBulkExceptions.BulkJobRedirectToOtherShopError(f"Switching the `store` name, redirected to: {response.url}") else: # There were no concurrent error for this job so even if there were other errors, we can reset this self._concurrent_attempt = 0 diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py index 140d77e91ad5..080e4f7b7185 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py @@ -54,6 +54,11 @@ def wrapper(self, *args, **kwargs) -> Any: f"Stream: `{self.stream_name}`, the BULK concurrency limit has reached. Waiting {self._concurrent_interval} sec before retry, attempt: {self._concurrent_attempt}.", ) sleep(self._concurrent_interval) + except ShopifyBulkExceptions.BulkJobRedirectToOtherShopError: + logger.warning( + f"Stream: `{self.stream_name}`, the `shop name` differs from the provided by the User: `{self.base_url}`. 
Switching to the `{self._new_base_url}`.", + ) + self._switch_base_url() return wrapper diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index f119b784add7..0eb23df22352 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -217,7 +217,7 @@ def track_checkpoint_cursor(self, record_value: Union[str, int]) -> None: if not self._checkpoint_cursor: self._checkpoint_cursor = self.default_state_comparison_value # track checkpoint cursor - if record_value >= self._checkpoint_cursor: + if str(record_value) >= str(self._checkpoint_cursor): self._checkpoint_cursor = record_value def should_checkpoint(self, index: int) -> bool: diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/unit_test.py b/airbyte-integrations/connectors/source-shopify/unit_tests/unit_test.py index 157fe1d2be05..ca9e306e6698 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/unit_test.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/unit_test.py @@ -100,11 +100,9 @@ def test_privileges_validation(requests_mock, fetch_transactions_user_id, basic_ "Internal Server Error for slice (500)", ], ) -def test_unavailable_stream(requests_mock, basic_config, stream, slice: Optional[Mapping[str, Any]], status_code: int, +def test_unavailable_stream(requests_mock, auth_config, stream, slice: Optional[Mapping[str, Any]], status_code: int, json_response: Mapping[str, Any]): - config = basic_config - config["authenticator"] = None - stream = stream(config) + stream = stream(auth_config) url = stream.url_base + stream.path(stream_slice=slice) requests_mock.get(url=url, json=json_response, status_code=status_code) response = requests.get(url) @@ -112,10 +110,8 @@ def test_unavailable_stream(requests_mock, basic_config, stream, slice: Optional assert stream.get_error_handler().interpret_response(response) == expected_error_resolution -def test_filter_records_newer_than_state(basic_config): - config = basic_config - config["authenticator"] = None - stream = DiscountCodes(config) +def test_filter_records_newer_than_state(auth_config): + stream = DiscountCodes(auth_config) records_slice = [ # present cursor older than state - record should be omitted {"id": 1, "updated_at": "2022-01-01T01:01:01-07:00"}, From f03bf111204f7f76c97aba10e7136725f260461c Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 18 Jul 2024 15:16:58 +0300 Subject: [PATCH 04/21] updated and formated --- .../source-shopify/acceptance-test-config.yml | 5 ++ .../shopify_graphql/bulk/job.py | 33 +++++++------ .../shopify_graphql/bulk/retry.py | 15 ++---- .../source-shopify/source_shopify/spec.json | 11 ++++- .../source_shopify/streams/base_streams.py | 14 ++++-- .../unit_tests/graphql_bulk/test_job.py | 49 ++++++++++++++++++- 6 files changed, 95 insertions(+), 32 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml index 5473d1c9d319..2107558e8c03 100644 --- a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml @@ -4,6 +4,11 @@ acceptance_tests: spec: tests: - spec_path: "source_shopify/spec.json" + 
backward_compatibility_tests_config: + # This is the intentional change. + # Added new field `job_checkpoint_interval`, + # to provide the ability to override this value by the User. + disable_for_version: 2.4.13 connection: tests: - config_path: "secrets/config.json" diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index 0fa34dba5fcb..a74b66ed91a0 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -25,12 +25,12 @@ @dataclass class ShopifyBulkManager: - session: requests.Session + http_client: HttpClient base_url: str - stream_name: str query: ShopifyBulkQuery job_termination_threshold: float job_size: float + job_checkpoint_interval: int # default logger logger: Final[logging.Logger] = logging.getLogger("airbyte") @@ -69,8 +69,6 @@ class ShopifyBulkManager: # last running job object count _job_last_rec_count: int = field(init=False, default=0) - # how many records should be collected before we use the checkpoining - _job_checkpoint_interval: Final[int] = 200000 # the flag to adjust the next slice from the checkpointed cursor vaue _job_adjust_slice_from_checkpoint: bool = field(init=False, default=False) # flag to mark the long running BULK job @@ -87,7 +85,6 @@ class ShopifyBulkManager: _job_last_elapsed_time: float = field(init=False, default=2.0) def __post_init__(self): - self._http_client = HttpClient(self.stream_name, self.logger, ShopifyErrorHandler(), session=self.session) self._job_size = self.job_size # The upper boundary for slice size is limited by the value from the config, default value is `P30D` self._job_size_max = self.job_size @@ -96,6 +93,8 @@ def __post_init__(self): # Ideally the source will balance on it's own rate, based on the time taken to return the data for the slice. # This behaviour could be overidden by providing the `BULK Job termination threshold` option in the `config`. self._job_max_elapsed_time = self.job_termination_threshold + # how many records should be collected before we use the checkpoining + self._job_checkpoint_interval = self.job_checkpoint_interval @property def _tools(self) -> BulkTools: @@ -199,7 +198,7 @@ def _job_canceled(self) -> bool: return self._job_state == ShopifyBulkJobStatus.CANCELED.value def _job_cancel(self) -> None: - _, canceled_response = self._http_client.send_request( + _, canceled_response = self.http_client.send_request( http_method="POST", url=self.base_url, data=ShopifyBulkTemplates.cancel(self._job_id), @@ -228,7 +227,7 @@ def _log_job_state_with_count(self) -> None: self._log_job_msg_count = 0 def _log_state(self, message: Optional[str] = None) -> None: - pattern = f"Stream: `{self.stream_name}`, the BULK Job: `{self._job_id}` is {self._job_state}" + pattern = f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` is {self._job_state}" if message: self.logger.info(f"{pattern}. 
{message}.") else: @@ -240,7 +239,7 @@ def _job_get_result(self, response: Optional[requests.Response] = None) -> Optio if job_result_url: # save to local file using chunks to avoid OOM filename = self._tools.filename_from_url(job_result_url) - _, response = self._http_client.send_request(http_method="GET", url=job_result_url, request_kwargs={"stream": True}) + _, response = self.http_client.send_request(http_method="GET", url=job_result_url, request_kwargs={"stream": True}) response.raise_for_status() with open(filename, "wb") as file: for chunk in response.iter_content(chunk_size=self._retrieve_chunk_size): @@ -285,13 +284,13 @@ def _on_canceling_job(self, **kwargs) -> None: def _cancel_on_long_running_job(self) -> None: self.logger.info( - f"Stream: `{self.stream_name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." + f"Stream: `{self.http_client._name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." ) self._job_long_running_cancelation = True self._job_cancel() def _cancel_on_checkpointing(self) -> None: - self.logger.info(f"Stream: `{self.stream_name}`, checkpointing after >= `{self._job_checkpoint_interval}` lines collected.") + self.logger.info(f"Stream: `{self.http_client._name}`, checkpointing after >= `{self._job_checkpoint_interval}` lines collected.") # set the flag to adjust the next slice from the checkpointed cursor value self._job_adjust_slice_from_checkpoint = True self._job_cancel() @@ -326,7 +325,7 @@ def _on_job_with_errors(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedE raise ShopifyBulkExceptions.BulkJobError(f"Could not validate the status of the BULK Job `{self._job_id}`. Errors: {errors}.") def _on_non_handable_job_error(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedException: - raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.stream_name}`, Non-handable error occured: {errors}") + raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client._name}`, Non-handable error occured: {errors}") def _get_server_errors(self, response: requests.Response) -> List[Optional[dict]]: server_errors = response.json().get("errors", []) @@ -351,7 +350,7 @@ def _job_healthcheck(self, response: requests.Response) -> Optional[Exception]: self._on_job_with_errors(errors) def _job_track_running(self) -> None: - _, response = self._http_client.send_request( + _, response = self.http_client.send_request( http_method="POST", url=self.base_url, data=ShopifyBulkTemplates.status(self._job_id), @@ -427,7 +426,7 @@ def create_job(self, stream_slice: Mapping[str, str], filter_field: str) -> None else: query = self.query.get() - _, response = self._http_client.send_request( + _, response = self.http_client.send_request( http_method="POST", url=self.base_url, json={"query": ShopifyBulkTemplates.prepare(query)}, @@ -438,7 +437,7 @@ def create_job(self, stream_slice: Mapping[str, str], filter_field: str) -> None if self._has_running_concurrent_job(errors): # when the concurrent job takes place, another job could not be created # we typically need to wait and retry, but no longer than 10 min. 
(see retry in `bulk_retry_on_exception`) - raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.stream_name}") + raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.http_client._name}") elif self._should_switch_shop_name(response): # assign new shop name, since the one that specified in `config` was redirected to the different one. raise ShopifyBulkExceptions.BulkJobRedirectToOtherShopError(f"Switching the `store` name, redirected to: {response.url}") @@ -460,7 +459,7 @@ def _job_process_created(self, response: requests.Response) -> None: self._job_id = bulk_response.get("id") self._job_created_at = bulk_response.get("createdAt") self._job_state = ShopifyBulkJobStatus.CREATED.value - self.logger.info(f"Stream: `{self.stream_name}`, the BULK Job: `{self._job_id}` is {ShopifyBulkJobStatus.CREATED.value}") + self.logger.info(f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` is {ShopifyBulkJobStatus.CREATED.value}") def job_size_normalize(self, start: datetime, end: datetime) -> datetime: # adjust slice size when it's bigger than the loop point when it should end, @@ -505,7 +504,9 @@ def job_check_for_completion(self) -> Optional[str]: raise bulk_job_error finally: job_current_elapsed_time = round((time() - job_started), 3) - self.logger.info(f"Stream: `{self.stream_name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec.") + self.logger.info( + f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." + ) # check whether or not we should expand or reduce the size of the slice self.__adjust_job_size(job_current_elapsed_time) # reset the state for COMPLETED job diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py index 080e4f7b7185..d5f166b5da40 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py @@ -24,25 +24,20 @@ def bulk_retry_on_exception(logger: logging.Logger, more_exceptions: Optional[Tu def decorator(func: Callable) -> Callable: @wraps(func) def wrapper(self, *args, **kwargs) -> Any: - # mandatory class attributes - max_retries = self._job_max_retries - stream_name = self.stream_name - backoff_time = self._job_backoff_time - current_retries = 0 while True: try: return func(self, *args, **kwargs) except BULK_RETRY_ERRORS or more_exceptions as ex: current_retries += 1 - if current_retries > max_retries: + if current_retries > self._job_max_retries: logger.error("Exceeded retry limit. Giving up.") raise else: logger.warning( - f"Stream `{stream_name}`: {ex}. Retrying {current_retries}/{max_retries} after {backoff_time} seconds." + f"Stream `{self.http_client._name}`: {ex}. Retrying {current_retries}/{self._job_max_retries} after {self._job_backoff_time} seconds." ) - sleep(backoff_time) + sleep(self._job_backoff_time) except ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError: if self._concurrent_attempt == self._concurrent_max_retry: message = f"The BULK Job couldn't be created at this time, since another job is running." 
@@ -51,12 +46,12 @@ def wrapper(self, *args, **kwargs) -> Any: self._concurrent_attempt += 1 logger.warning( - f"Stream: `{self.stream_name}`, the BULK concurrency limit has reached. Waiting {self._concurrent_interval} sec before retry, attempt: {self._concurrent_attempt}.", + f"Stream: `{self.http_client._name}`, the BULK concurrency limit has reached. Waiting {self._concurrent_interval} sec before retry, attempt: {self._concurrent_attempt}.", ) sleep(self._concurrent_interval) except ShopifyBulkExceptions.BulkJobRedirectToOtherShopError: logger.warning( - f"Stream: `{self.stream_name}`, the `shop name` differs from the provided by the User: `{self.base_url}`. Switching to the `{self._new_base_url}`.", + f"Stream: `{self.http_client._name}`, the `shop name` differs from the provided by the User: `{self.base_url}`. Switching to the `{self._new_base_url}`.", ) self._switch_base_url() diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json index 9f5b1ca82a85..00108355a1ca 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json @@ -103,7 +103,16 @@ "title": "BULK Job termination threshold", "description": "The max time in seconds, after which the single BULK Job should be `CANCELED` and retried. The bigger the value the longer the BULK Job is allowed to run.", "default": 3600, - "minimum": 1 + "minimum": 3600, + "maximum": 21600 + }, + "job_checkpoint_interval": { + "type": "integer", + "title": "BULK Job checkpoint (lines collected)", + "description": "The threshold, after which the single BULK Job should be `CANCELED` and checkpointed.", + "default": 200000, + "minimum": 50000, + "maximum": 1000000 } } }, diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index 0eb23df22352..96f82031aadb 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -13,13 +13,14 @@ import pendulum as pdm import requests from airbyte_cdk.sources.streams.core import StreamData -from airbyte_cdk.sources.streams.http import HttpStream +from airbyte_cdk.sources.streams.http import HttpClient, HttpStream from airbyte_cdk.sources.streams.http.error_handlers import ErrorHandler, HttpStatusErrorHandler from airbyte_cdk.sources.streams.http.error_handlers.default_error_mapping import DEFAULT_ERROR_MAPPING from airbyte_protocol.models import SyncMode from requests.exceptions import RequestException +from source_shopify.http_request import ShopifyErrorHandler from source_shopify.shopify_graphql.bulk.job import ShopifyBulkManager -from source_shopify.shopify_graphql.bulk.query import ShopifyBulkQuery, ShopifyBulkTemplates +from source_shopify.shopify_graphql.bulk.query import ShopifyBulkQuery from source_shopify.shopify_graphql.bulk.record import ShopifyBulkRecord from source_shopify.transform import DataTypeEnforcer from source_shopify.utils import EagerlyCachedStreamState as stream_state_cache @@ -637,18 +638,23 @@ def __init__(self, config: Dict) -> None: self.query = self.bulk_query(shop_id=config.get("shop_id")) # define BULK Manager instance self.job_manager: ShopifyBulkManager = ShopifyBulkManager( - session=self._http_client._session, + 
http_client=self.bulk_http_client, base_url=f"{self.url_base}{self.path()}", - stream_name=self.name, query=self.query, job_termination_threshold=float(config.get("job_termination_threshold", 3600)), # overide the default job slice size, if provided (it's auto-adjusted, later on) job_size=config.get("bulk_window_in_days", 30.0), + # provide the job checkpoint interval value, default value is 200k lines collected + job_checkpoint_interval=config.get("job_checkpoint_interval", 200000), ) # define Record Producer instance self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord(self.query) + @property + def bulk_http_client(self) -> HttpClient: + return HttpClient(self.name, self.logger, ShopifyErrorHandler(), session=self._http_client._session) + @cached_property def parent_stream(self) -> object: """ diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py index bfc97eb29d94..ddccf67f5810 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py @@ -29,6 +29,54 @@ _ANY_FILTER_FIELD = "any_filter_field" +def test_job_manager_default_values(auth_config) -> None: + stream = Products(auth_config) + + # 10Mb chunk size to save the file + assert stream.job_manager._retrieve_chunk_size == 10485760 # 1024 * 1024 * 10 + assert stream.job_manager._job_max_retries == 6 + assert stream.job_manager._job_backoff_time == 5 + # running job logger constrain, every 100-ish message will be printed + assert stream.job_manager._log_job_msg_frequency == 100 + assert stream.job_manager._log_job_msg_count == 0 + # attempt counter + assert stream.job_manager._concurrent_attempt == 0 + # sleep time per creation attempt + assert stream.job_manager._concurrent_interval == 30 + # max attempts for job creation + assert stream.job_manager._concurrent_max_retry == 120 + # currents: _job_id, _job_state, _job_created_at, _job_self_canceled + assert not stream.job_manager._job_id + # this string is based on ShopifyBulkJobStatus + assert not stream.job_manager._job_state + # completed and saved Bulk Job result filename + assert not stream.job_manager._job_result_filename + # date-time when the Bulk Job was created on the server + assert not stream.job_manager._job_created_at + # indicated whether or not we manually force-cancel the current job + assert not stream.job_manager._job_self_canceled + # time between job status checks + assert stream.job_manager. 
_job_check_interval == 3 + # 0.1 ~= P2H, default value, lower boundary for slice size + assert stream.job_manager._job_size_min == 0.1 + # last running job object count + assert stream.job_manager._job_last_rec_count == 0 + # how many records should be collected before we use the checkpoining + assert stream.job_manager._job_checkpoint_interval == 200000 + # the flag to adjust the next slice from the checkpointed cursor vaue + assert not stream.job_manager._job_adjust_slice_from_checkpoint + # flag to mark the long running BULK job + assert not stream.job_manager._job_long_running_cancelation + # expand slice factor + assert stream.job_manager._job_size_expand_factor == 2 + # reduce slice factor + assert stream.job_manager._job_size_reduce_factor == 2 + # whether or not the slicer should revert the previous start value + assert not stream.job_manager._job_should_revert_slice + # 2 sec is set as default value to cover the case with the empty-fast-completed jobs + assert stream.job_manager._job_last_elapsed_time == 2.0 + + def test_get_errors_from_response_invalid_response(auth_config) -> None: expected = "Couldn't check the `response` for `errors`" stream = MetafieldOrders(auth_config) @@ -290,7 +338,6 @@ def test_job_running_with_canceled_scenario(request, requests_mock, running_job_ remove(expected) - def test_job_read_file_invalid_filename(mocker, auth_config) -> None: stream = MetafieldOrders(auth_config) expected = "An error occured while producing records from BULK Job result" From d809c17a1db2eaad45a0ef40ce689f282ec08152 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 18 Jul 2024 16:28:00 +0300 Subject: [PATCH 05/21] updated changelog --- docs/integrations/sources/shopify.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/integrations/sources/shopify.md b/docs/integrations/sources/shopify.md index a1d4224d9db0..4d9e3f5172e1 100644 --- a/docs/integrations/sources/shopify.md +++ b/docs/integrations/sources/shopify.md @@ -212,6 +212,7 @@ For all `Shopify GraphQL BULK` api requests these limitations are applied: https | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 2.4.14 | 2024-07-18 | [42095](https://github.com/airbytehq/airbyte/pull/42095) | Added the `Checkpointing` for the `BULK` streams, fixed the `store` redirection | | 2.4.13 | 2024-07-13 | [41809](https://github.com/airbytehq/airbyte/pull/41809) | Update dependencies | | 2.4.12 | 2024-07-10 | [41103](https://github.com/airbytehq/airbyte/pull/41103) | Update dependencies | | 2.4.11 | 2024-07-09 | [41068](https://github.com/airbytehq/airbyte/pull/41068) | Added `options` field to `Product Variants` stream | From 67e23f86ee8b08833eda18eb5e2478d289ba9151 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Fri, 19 Jul 2024 15:35:39 +0300 Subject: [PATCH 06/21] updated after review --- .../connectors/source-shopify/poetry.lock | 31 ++++++++++--------- .../shopify_graphql/bulk/job.py | 31 ++++++++++--------- .../source_shopify/streams/base_streams.py | 3 ++ .../unit_tests/graphql_bulk/test_job.py | 4 --- 4 files changed, 36 insertions(+), 33 deletions(-) 
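The checkpointing flow that this patch refines can be summarized with a small standalone sketch placed here as commentary (it is illustrative only and not part of the connector code: the class name, the plain `datetime.fromisoformat` parsing, and the simplified fields are assumptions, while the real logic lives in `ShopifyBulkManager` with pendulum-based cursors). The idea shown: a BULK job that has collected lines is self-canceled for checkpointing, its partial result is kept, and the next slice end is taken from the checkpointed cursor instead of the originally planned slice end.

# illustrative sketch only: simplified names, no Shopify API calls
from dataclasses import dataclass
from datetime import datetime
from typing import Optional


@dataclass
class BulkJobCheckpointSketch:
    # how many collected lines trigger a checkpoint (mirrors the `job_checkpoint_interval` spec option)
    checkpoint_interval: int = 200_000
    # lines collected so far by the running BULK job
    lines_collected: int = 0
    # set when the job is canceled for checkpointing; the next slice then starts from the checkpointed cursor
    adjust_from_checkpoint: bool = False

    @property
    def any_lines_collected(self) -> bool:
        return self.lines_collected > 0

    @property
    def should_checkpoint(self) -> bool:
        return self.lines_collected >= self.checkpoint_interval

    def on_canceled(self) -> None:
        # keep partial results whenever something was collected, not only when the full threshold was hit
        if self.any_lines_collected or self.should_checkpoint:
            self.adjust_from_checkpoint = True

    def adjusted_slice_end(self, slice_end: datetime, checkpointed_cursor: Optional[str]) -> datetime:
        # prefer the checkpointed cursor over the originally planned slice end
        if self.adjust_from_checkpoint and checkpointed_cursor:
            return datetime.fromisoformat(checkpointed_cursor)
        return slice_end


if __name__ == "__main__":
    job = BulkJobCheckpointSketch(checkpoint_interval=200_000, lines_collected=250_000)
    job.on_canceled()
    # prints 2024-07-20 00:00:00 because the checkpointed cursor wins over the planned slice end
    print(job.adjusted_slice_end(datetime(2024, 7, 31), "2024-07-20T00:00:00"))
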
diff --git a/airbyte-integrations/connectors/source-shopify/poetry.lock b/airbyte-integrations/connectors/source-shopify/poetry.lock index ae103848695f..4e0792d724dd 100644 --- a/airbyte-integrations/connectors/source-shopify/poetry.lock +++ b/airbyte-integrations/connectors/source-shopify/poetry.lock @@ -1,14 +1,14 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "airbyte-cdk" -version = "3.4.1" +version = "3.8.2" description = "A framework for writing Airbyte Connectors." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "airbyte_cdk-3.4.1-py3-none-any.whl", hash = "sha256:423b238da33480f1a464ee92277be55b3661612cd9bfeacc82313bc97792e775"}, - {file = "airbyte_cdk-3.4.1.tar.gz", hash = "sha256:cf66f4bdc85fa6a66a822ce074130f0d82eb3e8e8c85508b14b9fc1784b9416e"}, + {file = "airbyte_cdk-3.8.2-py3-none-any.whl", hash = "sha256:66503cd86d8d515aac256a77f00d158b118365b465eb80c9e798e6d4fd36b7ba"}, + {file = "airbyte_cdk-3.8.2.tar.gz", hash = "sha256:ce8abb5d208338d8d1084ab4d2afa9463fab6336f7ef6b431d80c7bcf9136599"}, ] [package.dependencies] @@ -108,13 +108,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.3" +version = "5.4.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.3-py3-none-any.whl", hash = "sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945"}, - {file = "cachetools-5.3.3.tar.gz", hash = "sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105"}, + {file = "cachetools-5.4.0-py3-none-any.whl", hash = "sha256:3ae3b49a3d5e28a77a0be2b37dbcb89005058959cb2323858c2657c4a8cab474"}, + {file = "cachetools-5.4.0.tar.gz", hash = "sha256:b8adc2e7c07f105ced7bc56dbb6dfbe7c4a00acce20e2227b3f355be89bc6827"}, ] [[package]] @@ -612,13 +612,13 @@ extended-testing = ["jinja2 (>=3,<4)"] [[package]] name = "langsmith" -version = "0.1.85" +version = "0.1.92" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.85-py3-none-any.whl", hash = "sha256:c1f94384f10cea96f7b4d33fd3db7ec180c03c7468877d50846f881d2017ff94"}, - {file = "langsmith-0.1.85.tar.gz", hash = "sha256:acff31f9e53efa48586cf8e32f65625a335c74d7c4fa306d1655ac18452296f6"}, + {file = "langsmith-0.1.92-py3-none-any.whl", hash = "sha256:8acb27844ff5263bde14b23425f83ee63996f4d5a8e9998cdeef07fd913137ff"}, + {file = "langsmith-0.1.92.tar.gz", hash = "sha256:681a613a4dc8c8e57c8961c347a39ffcb64d6c697e8ddde1fd8458fcfaef6c13"}, ] [package.dependencies] @@ -1240,18 +1240,19 @@ fixture = ["fixtures"] [[package]] name = "setuptools" -version = "70.3.0" +version = "71.0.3" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-70.3.0-py3-none-any.whl", hash = "sha256:fe384da74336c398e0d956d1cae0669bc02eed936cdb1d49b57de1990dc11ffc"}, - {file = "setuptools-70.3.0.tar.gz", hash = "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5"}, + {file = "setuptools-71.0.3-py3-none-any.whl", hash = "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207"}, + {file = "setuptools-71.0.3.tar.gz", hash = "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d"}, ] [package.extras] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "ordered-set (>=3.1.1)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (<7.4)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "sgqlc" diff --git 
a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index a74b66ed91a0..ed7980b438d0 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -15,7 +15,6 @@ from source_shopify.utils import ApiTypeEnum from source_shopify.utils import ShopifyRateLimiter as limiter -from ...http_request import ShopifyErrorHandler from .exceptions import AirbyteTracedException, ShopifyBulkExceptions from .query import ShopifyBulkQuery, ShopifyBulkTemplates from .retry import bulk_retry_on_exception @@ -71,8 +70,6 @@ class ShopifyBulkManager: _job_last_rec_count: int = field(init=False, default=0) # the flag to adjust the next slice from the checkpointed cursor vaue _job_adjust_slice_from_checkpoint: bool = field(init=False, default=False) - # flag to mark the long running BULK job - _job_long_running_cancelation: bool = field(init=False, default=False) # expand slice factor _job_size_expand_factor: int = field(init=False, default=2) @@ -155,6 +152,10 @@ def _is_long_running_job(self) -> bool: def _job_should_checkpoint(self) -> bool: return self._job_last_rec_count >= self._job_checkpoint_interval + @property + def _job_any_lines_collected(self) -> bool: + return self._job_last_rec_count > 0 + def _expand_job_size(self) -> None: self._job_size += self._job_size_adjusted_expand_factor @@ -257,9 +258,7 @@ def _job_update_state(self, response: Optional[requests.Response] = None) -> Non self._log_job_state_with_count() elif self._job_state in [ShopifyBulkJobStatus.CANCELED.value, ShopifyBulkJobStatus.CANCELING.value]: # do not emit `CANCELED / CANCELING` Bulk Job status, while checkpointing - if self._job_should_checkpoint: - pass - else: + if not self._job_should_checkpoint: self._log_job_state_with_count() else: self._log_state() @@ -273,7 +272,7 @@ def _on_canceled_job(self, response: requests.Response) -> Optional[AirbyteTrace f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}" ) else: - if self._job_should_checkpoint: + if self._job_any_lines_collected or self._job_should_checkpoint: # set the flag to adjust the next slice from the checkpointed cursor value self._job_adjust_slice_from_checkpoint = True # fetch the collected records from CANCELED Job on checkpointing @@ -286,13 +285,11 @@ def _cancel_on_long_running_job(self) -> None: self.logger.info( f"Stream: `{self.http_client._name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." 
) - self._job_long_running_cancelation = True self._job_cancel() def _cancel_on_checkpointing(self) -> None: self.logger.info(f"Stream: `{self.http_client._name}`, checkpointing after >= `{self._job_checkpoint_interval}` lines collected.") # set the flag to adjust the next slice from the checkpointed cursor value - self._job_adjust_slice_from_checkpoint = True self._job_cancel() def _on_running_job(self, **kwargs) -> None: @@ -327,15 +324,15 @@ def _on_job_with_errors(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedE def _on_non_handable_job_error(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedException: raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client._name}`, Non-handable error occured: {errors}") - def _get_server_errors(self, response: requests.Response) -> List[Optional[dict]]: + def _get_server_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]: server_errors = response.json().get("errors", []) return [server_errors] if isinstance(server_errors, str) else server_errors - def _get_user_errors(self, response: requests.Response) -> List[Optional[dict]]: + def _get_user_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]: user_errors = response.json().get("data", {}).get("bulkOperationRunQuery", {}).get("userErrors", []) return [user_errors] if isinstance(user_errors, str) else user_errors - def _collect_bulk_errors(self, response: requests.Response) -> List[Optional[dict]]: + def _collect_bulk_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]: try: return self._get_server_errors(response) + self._get_user_errors(response) except (Exception, JSONDecodeError) as e: @@ -471,9 +468,15 @@ def get_adjusted_job_start(self, slice_start: datetime) -> datetime: step = self._job_size if self._job_size else self._job_size_min return slice_start.add(days=step) - def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, checkpointed_cursor: Optional[datetime] = None) -> datetime: + def _adjust_slice_end(self, slice_end: datetime, checkpointed_cursor: Optional[str] = None) -> datetime: + """ + Choose between the existing `slice_end` value or `checkpointed_cursor` value, if provided. + """ + return pdm.parse(checkpointed_cursor) if checkpointed_cursor else slice_end + + def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, checkpointed_cursor: Optional[str] = None) -> datetime: if self._job_adjust_slice_from_checkpoint: - return pdm.parse(checkpointed_cursor) if checkpointed_cursor else slice_end + return self._adjust_slice_end(slice_end, checkpointed_cursor) if self._is_long_running_job: self._job_size_reduce_next() diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index 96f82031aadb..e3bd1e4fcc94 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -653,6 +653,9 @@ def __init__(self, config: Dict) -> None: @property def bulk_http_client(self) -> HttpClient: + """ + Returns the instance of the `HttpClient`, with the stream info. 
+ """ return HttpClient(self.name, self.logger, ShopifyErrorHandler(), session=self._http_client._session) @cached_property diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py index ddccf67f5810..5cceb6da3651 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py @@ -65,8 +65,6 @@ def test_job_manager_default_values(auth_config) -> None: assert stream.job_manager._job_checkpoint_interval == 200000 # the flag to adjust the next slice from the checkpointed cursor vaue assert not stream.job_manager._job_adjust_slice_from_checkpoint - # flag to mark the long running BULK job - assert not stream.job_manager._job_long_running_cancelation # expand slice factor assert stream.job_manager._job_size_expand_factor == 2 # reduce slice factor @@ -330,8 +328,6 @@ def test_job_running_with_canceled_scenario(request, requests_mock, running_job_ stream.job_manager._job_self_canceled = True # mocking the nested request call to retrieve the data from result URL requests_mock.get(job_result_url, json=request.getfixturevalue(canceled_job_response)) - # calling the sceario processing - assert not stream.job_manager._job_long_running_cancelation assert stream.job_manager.job_check_for_completion() == expected # clean up if expected: From aff67d60f31c72e778628f99d2260b81755108f9 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 13:13:07 +0300 Subject: [PATCH 07/21] added more BULK related info to the log messages, changed the min value for checkpointing to 15k lines --- .../connectors/source-shopify/poetry.lock | 47 ++++--------------- .../shopify_graphql/bulk/job.py | 29 ++++++++---- .../source-shopify/source_shopify/spec.json | 2 +- .../source_shopify/streams/base_streams.py | 5 +- 4 files changed, 32 insertions(+), 51 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/poetry.lock b/airbyte-integrations/connectors/source-shopify/poetry.lock index 2648beacf163..c40071323bce 100644 --- a/airbyte-integrations/connectors/source-shopify/poetry.lock +++ b/airbyte-integrations/connectors/source-shopify/poetry.lock @@ -2,22 +2,13 @@ [[package]] name = "airbyte-cdk" -<<<<<<< HEAD -version = "3.8.2" -======= version = "3.9.0" ->>>>>>> origin/master description = "A framework for writing Airbyte Connectors." optional = false python-versions = "<4.0,>=3.9" files = [ -<<<<<<< HEAD - {file = "airbyte_cdk-3.8.2-py3-none-any.whl", hash = "sha256:66503cd86d8d515aac256a77f00d158b118365b465eb80c9e798e6d4fd36b7ba"}, - {file = "airbyte_cdk-3.8.2.tar.gz", hash = "sha256:ce8abb5d208338d8d1084ab4d2afa9463fab6336f7ef6b431d80c7bcf9136599"}, -======= {file = "airbyte_cdk-3.9.0-py3-none-any.whl", hash = "sha256:5f8b800f75b3bba1ae1950eee001e12633826e338e6427fa99c89d11c861d283"}, {file = "airbyte_cdk-3.9.0.tar.gz", hash = "sha256:cc54a63ef76896a18133b4bcb2191013c92d1ce1b405fe5261e8479b79076d63"}, ->>>>>>> origin/master ] [package.dependencies] @@ -621,22 +612,13 @@ extended-testing = ["jinja2 (>=3,<4)"] [[package]] name = "langsmith" -<<<<<<< HEAD -version = "0.1.92" -======= version = "0.1.93" ->>>>>>> origin/master description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false python-versions = "<4.0,>=3.8.1" files = [ -<<<<<<< HEAD - {file = "langsmith-0.1.92-py3-none-any.whl", hash = "sha256:8acb27844ff5263bde14b23425f83ee63996f4d5a8e9998cdeef07fd913137ff"}, - {file = "langsmith-0.1.92.tar.gz", hash = "sha256:681a613a4dc8c8e57c8961c347a39ffcb64d6c697e8ddde1fd8458fcfaef6c13"}, -======= {file = "langsmith-0.1.93-py3-none-any.whl", hash = "sha256:811210b9d5f108f36431bd7b997eb9476a9ecf5a2abd7ddbb606c1cdcf0f43ce"}, {file = "langsmith-0.1.93.tar.gz", hash = "sha256:285b6ad3a54f50fa8eb97b5f600acc57d0e37e139dd8cf2111a117d0435ba9b4"}, ->>>>>>> origin/master ] [package.dependencies] @@ -1055,13 +1037,13 @@ files = [ [[package]] name = "pytest" -version = "8.2.2" +version = "8.3.1" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, - {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, + {file = "pytest-8.3.1-py3-none-any.whl", hash = "sha256:e9600ccf4f563976e2c99fa02c7624ab938296551f280835ee6516df8bc4ae8c"}, + {file = "pytest-8.3.1.tar.gz", hash = "sha256:7e8e5c5abd6e93cb1cc151f23e57adc31fcf8cfd2a3ff2da63e23f732de35db6"}, ] [package.dependencies] @@ -1069,7 +1051,7 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.5,<2.0" +pluggy = ">=1.5,<2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] @@ -1258,32 +1240,19 @@ fixture = ["fixtures"] [[package]] name = "setuptools" -<<<<<<< HEAD -version = "71.0.3" -======= -version = "71.0.4" ->>>>>>> origin/master +version = "71.1.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ -<<<<<<< HEAD - {file = "setuptools-71.0.3-py3-none-any.whl", hash = "sha256:f501b6e6db709818dc76882582d9c516bf3b67b948864c5fa1d1624c09a49207"}, - {file = "setuptools-71.0.3.tar.gz", hash = "sha256:3d8531791a27056f4a38cd3e54084d8b1c4228ff9cf3f2d7dd075ec99f9fd70d"}, -======= - {file = "setuptools-71.0.4-py3-none-any.whl", hash = "sha256:ed2feca703be3bdbd94e6bb17365d91c6935c6b2a8d0bb09b66a2c435ba0b1a5"}, - {file = "setuptools-71.0.4.tar.gz", hash = "sha256:48297e5d393a62b7cb2a10b8f76c63a73af933bd809c9e0d0d6352a1a0135dd8"}, ->>>>>>> origin/master + {file = "setuptools-71.1.0-py3-none-any.whl", hash = "sha256:33874fdc59b3188304b2e7c80d9029097ea31627180896fb549c578ceb8a0855"}, + {file = "setuptools-71.1.0.tar.gz", hash = "sha256:032d42ee9fb536e33087fb66cac5f840eb9391ed05637b3f2a76a7c8fb477936"}, ] [package.extras] core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.text (>=3.7)", "more-itertools (>=8.8)", "ordered-set (>=3.1.1)", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] -<<<<<<< HEAD -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (<7.4)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -======= doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker 
(>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] ->>>>>>> origin/master -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.10.0)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "mypy (==1.11.*)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf", "pytest-ruff (<0.4)", "pytest-ruff (>=0.2.1)", "pytest-ruff (>=0.3.2)", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "sgqlc" diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index ed7980b438d0..dbf030e87343 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -236,7 +236,8 @@ def _log_state(self, message: Optional[str] = None) -> None: def _job_get_result(self, response: Optional[requests.Response] = None) -> Optional[str]: parsed_response = response.json().get("data", {}).get("node", {}) if response else None - job_result_url = parsed_response.get("url") if parsed_response else None + # get `complete` or `partial` result from collected Bulk Job results + job_result_url = parsed_response.get("url", parsed_response.get("partialDataUrl")) if parsed_response else None if job_result_url: # save to local file using chunks to avoid OOM filename = self._tools.filename_from_url(job_result_url) @@ -249,6 +250,13 @@ def _job_get_result(self, response: Optional[requests.Response] = None) -> Optio file.write(END_OF_FILE.encode()) return filename + def _job_get_checkpointed_result(self, response: Optional[requests.Response]) -> None: + if self._job_any_lines_collected or self._job_should_checkpoint: + # set the flag to adjust the next slice from the checkpointed cursor value + self._job_adjust_slice_from_checkpoint = True + # fetch the collected records from CANCELED Job on checkpointing + self._job_result_filename = self._job_get_result(response) + def _job_update_state(self, response: Optional[requests.Response] = None) -> None: if response: self._job_state = response.json().get("data", {}).get("node", {}).get("status") @@ -272,11 +280,7 @@ def _on_canceled_job(self, response: requests.Response) -> Optional[AirbyteTrace f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}" ) else: - if self._job_any_lines_collected or self._job_should_checkpoint: - # set the 
flag to adjust the next slice from the checkpointed cursor value - self._job_adjust_slice_from_checkpoint = True - # fetch the collected records from CANCELED Job on checkpointing - self._job_result_filename = self._job_get_result(response) + self._job_get_checkpointed_result(response) def _on_canceling_job(self, **kwargs) -> None: sleep(self._job_check_interval) @@ -304,9 +308,14 @@ def _on_completed_job(self, response: Optional[requests.Response] = None) -> Non self._job_result_filename = self._job_get_result(response) def _on_failed_job(self, response: requests.Response) -> AirbyteTracedException: - raise ShopifyBulkExceptions.BulkJobFailed( - f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}", - ) + if not self._job_any_lines_collected: + raise ShopifyBulkExceptions.BulkJobFailed( + f"The BULK Job: `{self._job_id}` exited with {self._job_state}, details: {response.text}", + ) + else: + # when the Bulk Job fails, usually there is a `partialDataUrl` available, + # we leverage the checkpointing in this case + self._job_get_checkpointed_result(response) def _on_timeout_job(self, **kwargs) -> AirbyteTracedException: raise ShopifyBulkExceptions.BulkJobTimout( @@ -508,7 +517,7 @@ def job_check_for_completion(self) -> Optional[str]: finally: job_current_elapsed_time = round((time() - job_started), 3) self.logger.info( - f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." + f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec. Lines collected: `{self._job_last_rec_count}`." ) # check whether or not we should expand or reduce the size of the slice self.__adjust_job_size(job_current_elapsed_time) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json index 00108355a1ca..2bbdaf8a4965 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json @@ -111,7 +111,7 @@ "title": "BULK Job checkpoint (lines collected)", "description": "The threshold, after which the single BULK Job should be `CANCELED` and checkpointed.", "default": 200000, - "minimum": 50000, + "minimum": 15000, "maximum": 1000000 } } diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index e3bd1e4fcc94..bba24cf06931 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -734,7 +734,10 @@ def get_state_value(self, stream_state: Mapping[str, Any] = None) -> Optional[Un def emit_slice_message(self, slice_start: datetime, slice_end: datetime) -> None: slice_size_message = f"Slice size: `P{round(self.job_manager._job_size, 1)}D`" - self.logger.info(f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}") + checkpointing_message = f"Checkpoint after `{self.job_manager.job_checkpoint_interval}` lines" + self.logger.info( + f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}. {checkpointing_message}." 
+ ) @stream_state_cache.cache_stream_state def stream_slices(self, stream_state: Optional[Mapping[str, Any]] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: From af063d1b5359fae70258e8257d96735ed4f12648 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 20:12:01 +0300 Subject: [PATCH 08/21] added default ASC sorting for BUlk streams --- .../shopify_graphql/bulk/job.py | 18 +++++++-- .../shopify_graphql/bulk/record.py | 37 ++++++++++++++++++- .../source_shopify/streams/base_streams.py | 12 +++++- .../source_shopify/streams/streams.py | 2 - 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index dbf030e87343..a7ab9b65ecfd 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -491,8 +491,21 @@ def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, check self._job_size_reduce_next() return slice_start + # reseting the checkpoint flag, if bulk job has completed normally + self._job_adjust_slice_from_checkpoint = False + return slice_end + def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: + final_message = f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." + + if self._job_any_lines_collected: + lines_collected_message = f" Rows collected: `{self._job_last_rec_count}`." + final_message = final_message + lines_collected_message + + # emit final Bulk job status message + self.logger.info(f"{final_message}") + @limiter.balance_rate_limit(api_type=ApiTypeEnum.graphql.value) def job_check_for_completion(self) -> Optional[str]: """ @@ -516,9 +529,8 @@ def job_check_for_completion(self) -> Optional[str]: raise bulk_job_error finally: job_current_elapsed_time = round((time() - job_started), 3) - self.logger.info( - f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec. Lines collected: `{self._job_last_rec_count}`." 
- ) + # emit the final Bulk Job log message + self._emit_final_job_message(job_current_elapsed_time) # check whether or not we should expand or reduce the size of the slice self.__adjust_job_size(job_current_elapsed_time) # reset the state for COMPLETED job diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index 27641669d942..c5740476901b 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -18,6 +18,8 @@ @dataclass class ShopifyBulkRecord: query: ShopifyBulkQuery + checkpoint_interval: int + cursor_field: Optional[Union[int, str]] # default buffer buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) @@ -34,6 +36,13 @@ def __post_init__(self) -> None: def tools(self) -> BulkTools: return BulkTools() + @property + def default_cursor_comparison_value(self) -> Union[int, str]: + if self.cursor_field: + return 0 if self.cursor_field == "id" else "" + else: + return None + @staticmethod def check_type(record: Mapping[str, Any], types: Union[List[str], str]) -> bool: record_type = record.get("__typename") @@ -69,6 +78,21 @@ def buffer_flush(self) -> Iterable[Mapping[str, Any]]: yield from self.record_process_components(record) # clean the buffer self.buffer.clear() + + def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]]) -> Iterable[Mapping[str, Any]]: + """ + Apply sorting for collected records, to guarantee the `asc` output. + This handles the STATE and CHECKPOINTING correctly, for the `incremental` streams. + """ + if self.cursor_field: + yield from sorted( + non_sorted_records, + key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_cursor_comparison_value, + ) + # clear sorted output + non_sorted_records.clear() + else: + yield from non_sorted_records def record_compose(self, record: Mapping[str, Any]) -> Optional[Iterable[MutableMapping[str, Any]]]: """ @@ -124,11 +148,20 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: The filename example: `bulk-4039263649981.jsonl`, where `4039263649981` is the `id` of the COMPLETED BULK Jobw with `result_url`. Note: typically the `filename` is taken from the `result_url` string provided in the response. + + The output is sorted by ASC, if `cursor_field` has been provided to the `ShopifyBulkRecord` instance. + Otherwise, the records are emitted `as is`. 
""" - + output_buffer: List[Mapping[str, Any]] = [] + with open(filename, "r") as jsonl_file: for record in self.process_line(jsonl_file): - yield self.tools.fields_names_to_snake_case(record) + output_buffer.append(self.tools.fields_names_to_snake_case(record)) + if len(output_buffer) == self.checkpoint_interval: + yield from self.sort_output_asc(output_buffer) + + # emit what's left in the output buffer, typically last record + yield from self.sort_output_asc(output_buffer) def read_file(self, filename: str, remove_file: Optional[bool] = True) -> Iterable[Mapping[str, Any]]: try: diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index bba24cf06931..c5d3021d6ca5 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -649,7 +649,11 @@ def __init__(self, config: Dict) -> None: ) # define Record Producer instance - self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord(self.query) + self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord( + self.query, + self.state_checkpoint_interval, + self.cursor_field, + ) @property def bulk_http_client(self) -> HttpClient: @@ -738,6 +742,10 @@ def emit_slice_message(self, slice_start: datetime, slice_end: datetime) -> None self.logger.info( f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}. {checkpointing_message}." ) + + def emit_checkpoint_message(self) -> None: + if self.job_manager._job_adjust_slice_from_checkpoint: + self.logger.info(f"Stream {self.name}, continue from checkpoint: `{self._checkpoint_cursor}`.") @stream_state_cache.cache_stream_state def stream_slices(self, stream_state: Optional[Mapping[str, Any]] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]: @@ -775,3 +783,5 @@ def read_records( self.record_producer.read_file(filename) ) yield from self.filter_records_newer_than_state(stream_state, records) + # add log message about the checkpoint value + self.emit_checkpoint_message() diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/streams.py index 5de98f1798ea..a5708d32594d 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/streams.py @@ -116,7 +116,6 @@ class MetafieldDraftOrders(IncrementalShopifyGraphQlBulkStream): class Products(IncrementalShopifyGraphQlBulkStream): bulk_query: Product = Product - # pin the api version class ProductsGraphQl(IncrementalShopifyStream): @@ -274,7 +273,6 @@ class OrderRefunds(IncrementalShopifyNestedStream): class OrderRisks(IncrementalShopifyGraphQlBulkStream): bulk_query: OrderRisk = OrderRisk - # the updated stream works only with >= `2024-04` shopify api version class Transactions(IncrementalShopifySubstream): From edfe43ffed5d0130bfb8a035712c5ae898378d11 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 20:14:39 +0300 Subject: [PATCH 09/21] formatted --- .../source_shopify/shopify_graphql/bulk/job.py | 4 ++-- .../source_shopify/shopify_graphql/bulk/record.py | 8 ++++---- .../source-shopify/source_shopify/streams/base_streams.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index a7ab9b65ecfd..21c2b4e854d7 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -493,7 +493,7 @@ def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, check # reseting the checkpoint flag, if bulk job has completed normally self._job_adjust_slice_from_checkpoint = False - + return slice_end def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: @@ -502,7 +502,7 @@ def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: if self._job_any_lines_collected: lines_collected_message = f" Rows collected: `{self._job_last_rec_count}`." final_message = final_message + lines_collected_message - + # emit final Bulk job status message self.logger.info(f"{final_message}") diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index c5740476901b..fb14e67ac0d3 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -78,7 +78,7 @@ def buffer_flush(self) -> Iterable[Mapping[str, Any]]: yield from self.record_process_components(record) # clean the buffer self.buffer.clear() - + def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]]) -> Iterable[Mapping[str, Any]]: """ Apply sorting for collected records, to guarantee the `asc` output. @@ -86,7 +86,7 @@ def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]]) -> It """ if self.cursor_field: yield from sorted( - non_sorted_records, + non_sorted_records, key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_cursor_comparison_value, ) # clear sorted output @@ -148,12 +148,12 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: The filename example: `bulk-4039263649981.jsonl`, where `4039263649981` is the `id` of the COMPLETED BULK Jobw with `result_url`. Note: typically the `filename` is taken from the `result_url` string provided in the response. - + The output is sorted by ASC, if `cursor_field` has been provided to the `ShopifyBulkRecord` instance. Otherwise, the records are emitted `as is`. """ output_buffer: List[Mapping[str, Any]] = [] - + with open(filename, "r") as jsonl_file: for record in self.process_line(jsonl_file): output_buffer.append(self.tools.fields_names_to_snake_case(record)) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index c5d3021d6ca5..08054e82632c 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -742,7 +742,7 @@ def emit_slice_message(self, slice_start: datetime, slice_end: datetime) -> None self.logger.info( f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}. {checkpointing_message}." 
) - + def emit_checkpoint_message(self) -> None: if self.job_manager._job_adjust_slice_from_checkpoint: self.logger.info(f"Stream {self.name}, continue from checkpoint: `{self._checkpoint_cursor}`.") From 260bdcc5eb2ed386263aea984114a557941cc77b Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 20:16:38 +0300 Subject: [PATCH 10/21] reverted .dockerignore --- .dockerignore | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000000..cc026d5dc6c1 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.dockerignore +.git +.idea +.gradle +**/build +**/node_modules +Dockerfile.* +docker-compose*.yaml From f5fbbc713654751c41ec7a0b12bab20163d54cad Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 20:18:45 +0300 Subject: [PATCH 11/21] reverted non-functional changes --- .editorconfig | 226 ++++++++++++++++++++++++++++++++++++++++ .gitignore | 105 +++++++++++++++++++ .pre-commit-config.yaml | 10 ++ .prettierignore | 3 + .prettierrc | 10 ++ .python-version | 1 + .readthedocs.yaml | 24 +++++ .root | 0 8 files changed, 379 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 .prettierignore create mode 100644 .prettierrc create mode 100644 .python-version create mode 100644 .readthedocs.yaml create mode 100644 .root diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000000..9b977f3960d5 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,226 @@ +root = true + +[{*.bash,*.sh,*.zsh}] +indent_size = 2 +tab_width = 2 +ij_shell_binary_ops_start_line = false +ij_shell_keep_column_alignment_padding = false +ij_shell_minify_program = false +ij_shell_redirect_followed_by_space = false +ij_shell_switch_cases_indented = false +ij_shell_use_unix_line_separator = true + +[{*.har,*.jsb2,*.jsb3,*.json,.babelrc,.eslintrc,.stylelintrc,bowerrc,jest.config}] +indent_size = 2 +ij_json_keep_blank_lines_in_code = 0 +ij_json_keep_indents_on_empty_lines = false +ij_json_keep_line_breaks = true +ij_json_space_after_colon = true +ij_json_space_after_comma = true +ij_json_space_before_colon = true +ij_json_space_before_comma = false +ij_json_spaces_within_braces = false +ij_json_spaces_within_brackets = false +ij_json_wrap_long_lines = false + +[{*.kt,*.kts}] +indent_style = space +insert_final_newline = true +max_line_length = 100 +indent_size = 4 +ij_continuation_indent_size = 4 +ij_java_names_count_to_use_import_on_demand = 9999 +ij_kotlin_align_in_columns_case_branch = false +ij_kotlin_align_multiline_binary_operation = false +ij_kotlin_align_multiline_extends_list = false +ij_kotlin_align_multiline_method_parentheses = false +ij_kotlin_align_multiline_parameters = true +ij_kotlin_align_multiline_parameters_in_calls = false +ij_kotlin_allow_trailing_comma = true +ij_kotlin_allow_trailing_comma_on_call_site = true +ij_kotlin_assignment_wrap = normal +ij_kotlin_blank_lines_after_class_header = 0 +ij_kotlin_blank_lines_around_block_when_branches = 0 +ij_kotlin_blank_lines_before_declaration_with_comment_or_annotation_on_separate_line = 1 +ij_kotlin_block_comment_at_first_column = true +ij_kotlin_call_parameters_new_line_after_left_paren = true +ij_kotlin_call_parameters_right_paren_on_new_line = false +ij_kotlin_call_parameters_wrap = on_every_item +ij_kotlin_catch_on_new_line = false +ij_kotlin_class_annotation_wrap = split_into_lines +ij_kotlin_code_style_defaults = KOTLIN_OFFICIAL 
+ij_kotlin_continuation_indent_for_chained_calls = true +ij_kotlin_continuation_indent_for_expression_bodies = true +ij_kotlin_continuation_indent_in_argument_lists = true +ij_kotlin_continuation_indent_in_elvis = false +ij_kotlin_continuation_indent_in_if_conditions = false +ij_kotlin_continuation_indent_in_parameter_lists = false +ij_kotlin_continuation_indent_in_supertype_lists = false +ij_kotlin_else_on_new_line = false +ij_kotlin_enum_constants_wrap = off +ij_kotlin_extends_list_wrap = normal +ij_kotlin_field_annotation_wrap = off +ij_kotlin_finally_on_new_line = false +ij_kotlin_if_rparen_on_new_line = false +ij_kotlin_import_nested_classes = false +ij_kotlin_imports_layout = * +ij_kotlin_insert_whitespaces_in_simple_one_line_method = true +ij_kotlin_keep_blank_lines_before_right_brace = 2 +ij_kotlin_keep_blank_lines_in_code = 2 +ij_kotlin_keep_blank_lines_in_declarations = 2 +ij_kotlin_keep_first_column_comment = true +ij_kotlin_keep_indents_on_empty_lines = false +ij_kotlin_keep_line_breaks = true +ij_kotlin_lbrace_on_next_line = false +ij_kotlin_line_comment_add_space = false +ij_kotlin_line_comment_at_first_column = true +ij_kotlin_method_annotation_wrap = split_into_lines +ij_kotlin_method_call_chain_wrap = normal +ij_kotlin_method_parameters_new_line_after_left_paren = true +ij_kotlin_method_parameters_right_paren_on_new_line = true +ij_kotlin_method_parameters_wrap = on_every_item +ij_kotlin_name_count_to_use_star_import = 9999 +ij_kotlin_name_count_to_use_star_import_for_members = 9999 +ij_kotlin_parameter_annotation_wrap = off +ij_kotlin_space_after_comma = true +ij_kotlin_space_after_extend_colon = true +ij_kotlin_space_after_type_colon = true +ij_kotlin_space_before_catch_parentheses = true +ij_kotlin_space_before_comma = false +ij_kotlin_space_before_extend_colon = true +ij_kotlin_space_before_for_parentheses = true +ij_kotlin_space_before_if_parentheses = true +ij_kotlin_space_before_lambda_arrow = true +ij_kotlin_space_before_type_colon = false +ij_kotlin_space_before_when_parentheses = true +ij_kotlin_space_before_while_parentheses = true +ij_kotlin_spaces_around_additive_operators = true +ij_kotlin_spaces_around_assignment_operators = true +ij_kotlin_spaces_around_equality_operators = true +ij_kotlin_spaces_around_function_type_arrow = true +ij_kotlin_spaces_around_logical_operators = true +ij_kotlin_spaces_around_multiplicative_operators = true +ij_kotlin_spaces_around_range = false +ij_kotlin_spaces_around_relational_operators = true +ij_kotlin_spaces_around_unary_operator = false +ij_kotlin_spaces_around_when_arrow = true +ij_kotlin_variable_annotation_wrap = off +ij_kotlin_while_on_new_line = false +ij_kotlin_wrap_elvis_expressions = 1 +ij_kotlin_wrap_expression_body_functions = 1 +ij_kotlin_wrap_first_method_in_call_chain = false + +[{*.markdown,*.md}] +ij_markdown_force_one_space_after_blockquote_symbol = true +ij_markdown_force_one_space_after_header_symbol = true +ij_markdown_force_one_space_after_list_bullet = true +ij_markdown_force_one_space_between_words = true +ij_markdown_keep_indents_on_empty_lines = false +ij_markdown_max_lines_around_block_elements = 1 +ij_markdown_max_lines_around_header = 1 +ij_markdown_max_lines_between_paragraphs = 1 +ij_markdown_min_lines_around_block_elements = 1 +ij_markdown_min_lines_around_header = 1 +ij_markdown_min_lines_between_paragraphs = 1 + +[{*.py,*.pyw,Tiltfile}] +charset = utf-8 +indent_size = 4 +indent_style = space +insert_final_newline = true +max_line_length = 140 +tab_width = 4 +ij_continuation_indent_size = 8 
+ij_formatter_off_tag = @formatter:off +ij_formatter_on_tag = @formatter:on +ij_formatter_tags_enabled = false +ij_smart_tabs = false +ij_visual_guides = none +ij_wrap_on_typing = false + +ij_python_align_collections_and_comprehensions = true +ij_python_align_multiline_imports = true +ij_python_align_multiline_parameters = true +ij_python_align_multiline_parameters_in_calls = true +ij_python_blank_line_at_file_end = true +ij_python_blank_lines_after_imports = 1 +ij_python_blank_lines_after_local_imports = 0 +ij_python_blank_lines_around_class = 1 +ij_python_blank_lines_around_method = 1 +ij_python_blank_lines_around_top_level_classes_functions = 2 +ij_python_blank_lines_before_first_method = 0 +ij_python_call_parameters_new_line_after_left_paren = false +ij_python_call_parameters_right_paren_on_new_line = false +ij_python_call_parameters_wrap = normal +ij_python_dict_alignment = 0 +ij_python_dict_new_line_after_left_brace = false +ij_python_dict_new_line_before_right_brace = false +ij_python_dict_wrapping = 1 +ij_python_from_import_new_line_after_left_parenthesis = false +ij_python_from_import_new_line_before_right_parenthesis = false +ij_python_from_import_parentheses_force_if_multiline = false +ij_python_from_import_trailing_comma_if_multiline = false +ij_python_from_import_wrapping = 1 +ij_python_hang_closing_brackets = false +ij_python_keep_blank_lines_in_code = 1 +ij_python_keep_blank_lines_in_declarations = 1 +ij_python_keep_indents_on_empty_lines = false +ij_python_keep_line_breaks = true +ij_python_method_parameters_new_line_after_left_paren = false +ij_python_method_parameters_right_paren_on_new_line = false +ij_python_method_parameters_wrap = normal +ij_python_new_line_after_colon = false +ij_python_new_line_after_colon_multi_clause = true +ij_python_optimize_imports_always_split_from_imports = false +ij_python_optimize_imports_case_insensitive_order = false +ij_python_optimize_imports_join_from_imports_with_same_source = false +ij_python_optimize_imports_sort_by_type_first = true +ij_python_optimize_imports_sort_imports = true +ij_python_optimize_imports_sort_names_in_from_imports = false +ij_python_space_after_comma = true +ij_python_space_after_number_sign = true +ij_python_space_after_py_colon = true +ij_python_space_before_backslash = true +ij_python_space_before_comma = false +ij_python_space_before_for_semicolon = false +ij_python_space_before_lbracket = false +ij_python_space_before_method_call_parentheses = false +ij_python_space_before_method_parentheses = false +ij_python_space_before_number_sign = true +ij_python_space_before_py_colon = false +ij_python_space_within_empty_method_call_parentheses = false +ij_python_space_within_empty_method_parentheses = false +ij_python_spaces_around_additive_operators = true +ij_python_spaces_around_assignment_operators = true +ij_python_spaces_around_bitwise_operators = true +ij_python_spaces_around_eq_in_keyword_argument = false +ij_python_spaces_around_eq_in_named_parameter = false +ij_python_spaces_around_equality_operators = true +ij_python_spaces_around_multiplicative_operators = true +ij_python_spaces_around_power_operator = true +ij_python_spaces_around_relational_operators = true +ij_python_spaces_around_shift_operators = true +ij_python_spaces_within_braces = false +ij_python_spaces_within_brackets = false +ij_python_spaces_within_method_call_parentheses = false +ij_python_spaces_within_method_parentheses = false +ij_python_use_continuation_indent_for_arguments = false 
+ij_python_use_continuation_indent_for_collection_and_comprehensions = false +ij_python_use_continuation_indent_for_parameters = true +ij_python_wrap_long_lines = false + +[{*.toml,Cargo.lock,Cargo.toml.orig,Gopkg.lock,Pipfile,poetry.lock}] +ij_toml_keep_indents_on_empty_lines = false + +[{*.yaml,*.yml}] +indent_size = 2 +ij_yaml_align_values_properties = do_not_align +ij_yaml_autoinsert_sequence_marker = true +ij_yaml_block_mapping_on_new_line = false +ij_yaml_indent_sequence_value = true +ij_yaml_keep_indents_on_empty_lines = false +ij_yaml_keep_line_breaks = true +ij_yaml_sequence_on_new_line = false +ij_yaml_space_before_colon = false +ij_yaml_spaces_within_braces = true +ij_yaml_spaces_within_brackets = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000000..67b21afbcb8b --- /dev/null +++ b/.gitignore @@ -0,0 +1,105 @@ +.gradle +.idea +*.iml +*.swp +build +out +.DS_Store +.dockerversions +.classpath +.project +.settings +.vscode +**/gmon.out +static_checker_reports/ + +# Logs +acceptance_tests_logs/ +airbyte_ci_logs/ +live_tests_debug_reports/ +dagger_engine_logs* + +# Secrets +secrets +updated_configurations +!airbyte-integrations/connector-templates/**/secrets + +# Connector debug configs +airbyte-integrations/connectors/**/src/test/resources/debug_resources + +# Python +*.egg-info +__pycache__ +.eggs +.venv +.mypy_cache +.ipynb_checkpoints +.pytest_ + +# Python unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# dbt +profiles.yml + +# Terraform +.terraform/ +crash.log +*.tfstate +*.tfstate.backup +*.lock.hcl + +# Airflow Demo +resources/examples/airflow/logs/* +!resources/examples/airflow/logs/.gitkeep + +# Summary.md keeps getting added and we just don't like it +docs/SUMMARY.md + +# Output Files generated by scripts +lowcode_connector_names.txt +num_lowcode_connectors.csv + +# Helm charts .tgz dependencies +charts/**/charts + +# Snyk +.dccache + +# Datadog +dd-java-agent.jar + +# Files needed to run airbyte-platform that are downloaded on-the-fly via run-ab-platform.sh +/docker-compose.yaml +/docker-compose.debug.yaml +/.env +/.env.dev +/flags.yml +/temporal/dynamicconfig/development.yaml + +# Ignore generated credentials from google-github-actions/auth +gha-creds-*.json + +# Legacy pipeline reports path +tools/ci_connector_ops/pipeline_reports/ + +# ignore local build scan uri output +scan-journal.log + +# connectors' cache +*.sqlite + +# mvn +target/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000000..4fd1a68cdc3b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: + - repo: local + hooks: + - id: format-fix-all-on-push + always_run: true + entry: airbyte-ci --disable-update-check format fix all + language: system + name: Run airbyte-ci format fix on git push (~30s) + pass_filenames: false + stages: [push] diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 000000000000..9579ba1a2fc9 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,3 @@ +airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output +airbyte-ci/connectors/pipelines/tests/test_changelog/result_files +airbyte-integrations/bases/connector-acceptance-test/unit_tests/data/docs diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 000000000000..31cda2d92570 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,10 @@ +{ + "overrides": [ + { + "files": 
"*.md", + "options": { + "proseWrap": "preserve" + } + } + ] +} diff --git a/.python-version b/.python-version new file mode 100644 index 000000000000..c8cfe3959183 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000000..5fb58f45882d --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,24 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +build: + os: ubuntu-20.04 + tools: + python: "3.9" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + builder: html + configuration: airbyte-cdk/python/reference_docs/_source/conf.py + +# Set the version of Python and requirements required to build your docs +python: + install: + - method: pip + path: airbyte-cdk/python + extra_requirements: + - sphinx-docs diff --git a/.root b/.root new file mode 100644 index 000000000000..e69de29bb2d1 From e04faed7fd88f469ab418470dc3051e117eac989 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 21:27:11 +0300 Subject: [PATCH 12/21] fixed unit_tests up to the changes --- .../shopify_graphql/bulk/record.py | 53 ++++++++-------- .../source-shopify/unit_tests/conftest.py | 61 ++++++++++--------- .../unit_tests/graphql_bulk/test_record.py | 18 +++--- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index fb14e67ac0d3..7fd38dcbecc8 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -18,11 +18,13 @@ @dataclass class ShopifyBulkRecord: query: ShopifyBulkQuery - checkpoint_interval: int - cursor_field: Optional[Union[int, str]] + checkpoint_interval: Optional[int] = 250 + cursor_field: Optional[Union[int, str]] = None - # default buffer - buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) + # default instance records collector + input_buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) + # default instance output buffer + output_buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) # default logger logger: Final[logging.Logger] = logging.getLogger("airbyte") @@ -54,13 +56,13 @@ def check_type(record: Mapping[str, Any], types: Union[List[str], str]) -> bool: def record_new(self, record: MutableMapping[str, Any]) -> None: record = self.component_prepare(record) record.pop("__typename") - self.buffer.append(record) + self.input_buffer.append(record) def record_new_component(self, record: MutableMapping[str, Any]) -> None: component = record.get("__typename") record.pop("__typename") # add component to its placeholder in the components list - self.buffer[-1]["record_components"][component].append(record) + self.input_buffer[-1]["record_components"][component].append(record) def component_prepare(self, record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: if self.components: @@ -69,30 +71,33 @@ def component_prepare(self, record: MutableMapping[str, Any]) -> MutableMapping[ record["record_components"][component] = [] return record - def buffer_flush(self) -> Iterable[Mapping[str, Any]]: - if len(self.buffer) > 0: - for record in self.buffer: + 
def flush_input_buffer(self) -> Iterable[Mapping[str, Any]]: + if len(self.input_buffer) > 0: + for record in self.input_buffer: # resolve id from `str` to `int` record = self.record_resolve_id(record) # process record components yield from self.record_process_components(record) - # clean the buffer - self.buffer.clear() + # clean the input_buffer + self.input_buffer.clear() - def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]]) -> Iterable[Mapping[str, Any]]: + def output_sort_asc(self) -> Iterable[Mapping[str, Any]]: """ Apply sorting for collected records, to guarantee the `asc` output. This handles the STATE and CHECKPOINTING correctly, for the `incremental` streams. """ if self.cursor_field: yield from sorted( - non_sorted_records, + self.output_buffer, key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_cursor_comparison_value, ) # clear sorted output - non_sorted_records.clear() + self.output_buffer.clear() else: - yield from non_sorted_records + # emit without sorting + yield from self.output_buffer + # clear non-sorted output + self.output_buffer.clear() def record_compose(self, record: Mapping[str, Any]) -> Optional[Iterable[MutableMapping[str, Any]]]: """ @@ -102,7 +107,7 @@ def record_compose(self, record: Mapping[str, Any]) -> Optional[Iterable[Mutable """ if self.check_type(record, self.composition.get("new_record")): # emit from previous iteration, if present - yield from self.buffer_flush() + yield from self.flush_input_buffer() # register the record self.record_new(record) # components check @@ -118,8 +123,8 @@ def process_line(self, jsonl_file: TextIOWrapper) -> Iterable[MutableMapping[str elif line != "": yield from self.record_compose(loads(line)) - # emit what's left in the buffer, typically last record - yield from self.buffer_flush() + # emit what's left in the input_buffer, typically last record + yield from self.flush_input_buffer() def record_resolve_id(self, record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """ @@ -152,16 +157,16 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: The output is sorted by ASC, if `cursor_field` has been provided to the `ShopifyBulkRecord` instance. Otherwise, the records are emitted `as is`. 
""" - output_buffer: List[Mapping[str, Any]] = [] with open(filename, "r") as jsonl_file: for record in self.process_line(jsonl_file): - output_buffer.append(self.tools.fields_names_to_snake_case(record)) - if len(output_buffer) == self.checkpoint_interval: - yield from self.sort_output_asc(output_buffer) + self.output_buffer.append(self.tools.fields_names_to_snake_case(record)) + if len(self.output_buffer) == self.checkpoint_interval: + # emit records from output_buffer, sorted ASC + yield from self.output_sort_asc() - # emit what's left in the output buffer, typically last record - yield from self.sort_output_asc(output_buffer) + # emit what's left in the output output_buffer, typically last record + yield from self.output_sort_asc() def read_file(self, filename: str, remove_file: Optional[bool] = True) -> Iterable[Mapping[str, Any]]: try: diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py index 4eac8f69fe76..ce1898b8de19 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py @@ -932,36 +932,7 @@ def product_images_response_expected_result(): @pytest.fixture def product_variants_response_expected_result(): return [ - { - "id": 40091751448765, - "title": "Metal", - "price": 64.0, - "sku": "", - "position": 1, - "inventory_policy": "DENY", - "compare_at_price": None, - "inventory_management": "SHOPIFY", - "created_at": "2021-06-23T06:04:41+00:00", - "updated_at": "2023-10-27T16:56:50+00:00", - "taxable": True, - "barcode": None, - "weight": 0.0, - "weight_unit": "GRAMS", - "inventory_quantity": 6, - "requires_shipping": False, - "available_for_sale": True, - "display_name": "Waterproof iPhone Speaker - Metal", - "tax_code": "", - "grams": 0, - "old_inventory_quantity": 6, - "fulfillment_service": "manual", - "admin_graphql_api_id": "gid://shopify/ProductVariant/40091751448765", - "presentment_prices": [{"price": {"amount": 64.0, "currency_code": "USD"}, "compare_at_price": {"amount": None}}], - "product_id": 6796825198781, - "inventory_item_id": 42186366255293, - "image_id": None, - "shop_url": "test_shop", - }, + # sorted records in ASC, check the `updated_at` field { "id": 41561955827901, "title": "Test Variant 1", @@ -992,6 +963,36 @@ def product_variants_response_expected_result(): "image_id": None, "shop_url": "test_shop", }, + { + "id": 40091751448765, + "title": "Metal", + "price": 64.0, + "sku": "", + "position": 1, + "inventory_policy": "DENY", + "compare_at_price": None, + "inventory_management": "SHOPIFY", + "created_at": "2021-06-23T06:04:41+00:00", + "updated_at": "2023-10-27T16:56:50+00:00", + "taxable": True, + "barcode": None, + "weight": 0.0, + "weight_unit": "GRAMS", + "inventory_quantity": 6, + "requires_shipping": False, + "available_for_sale": True, + "display_name": "Waterproof iPhone Speaker - Metal", + "tax_code": "", + "grams": 0, + "old_inventory_quantity": 6, + "fulfillment_service": "manual", + "admin_graphql_api_id": "gid://shopify/ProductVariant/40091751448765", + "presentment_prices": [{"price": {"amount": 64.0, "currency_code": "USD"}, "compare_at_price": {"amount": None}}], + "product_id": 6796825198781, + "inventory_item_id": 42186366255293, + "image_id": None, + "shop_url": "test_shop", + }, ] diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py 
b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py index 0f18c1965949..0c45d8257b91 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py @@ -80,7 +80,7 @@ def test_record_new(record, expected) -> None: query = ShopifyBulkQuery(shop_id=0) record_instance = ShopifyBulkRecord(query) record_instance.record_new(record) - assert record_instance.buffer == [expected] + assert record_instance.input_buffer == [expected] @pytest.mark.parametrize( @@ -116,15 +116,15 @@ def test_record_new_component(records_from_jsonl, record_components, expected) - record_instance.components = record_components.get("record_components") # register new record first record_instance.record_new(records_from_jsonl[0]) - assert len(record_instance.buffer) > 0 + assert len(record_instance.input_buffer) > 0 # check the components placeholder was created for new record registered - assert "record_components" in record_instance.buffer[-1].keys() + assert "record_components" in record_instance.input_buffer[-1].keys() # register record component record_instance.record_new_component(records_from_jsonl[1]) # check the component was proccessed - assert len(record_instance.buffer[-1]["record_components"]["RecordComponent"]) > 0 + assert len(record_instance.input_buffer[-1]["record_components"]["RecordComponent"]) > 0 # general check - assert record_instance.buffer == expected + assert record_instance.input_buffer == expected @pytest.mark.parametrize( @@ -164,9 +164,9 @@ def test_record_new_component(records_from_jsonl, record_components, expected) - def test_buffer_flush(buffered_record, expected) -> None: query = ShopifyBulkQuery(shop_id=0) record_instance = ShopifyBulkRecord(query) - # populate the buffer with record - record_instance.buffer.append(buffered_record) - assert list(record_instance.buffer_flush()) == expected + # populate the input_buffer with record + record_instance.input_buffer.append(buffered_record) + assert list(record_instance.flush_input_buffer()) == expected @pytest.mark.parametrize( @@ -206,4 +206,4 @@ def test_record_compose(records_from_jsonl, record_composition, expected) -> Non for record in records_from_jsonl: list(record_instance.record_compose(record)) - assert record_instance.buffer == expected + assert record_instance.input_buffer == expected From e3a1c41f7643bcef23f815294fd89e46d3e1b01c Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 22 Jul 2024 21:29:59 +0300 Subject: [PATCH 13/21] formatted --- .../source_shopify/shopify_graphql/bulk/record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index 7fd38dcbecc8..053ae284e662 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -162,7 +162,7 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: for record in self.process_line(jsonl_file): self.output_buffer.append(self.tools.fields_names_to_snake_case(record)) if len(self.output_buffer) == self.checkpoint_interval: - # emit records from output_buffer, sorted ASC + # emit records from output_buffer, sorted ASC yield from 
self.output_sort_asc() # emit what's left in the output output_buffer, typically last record From 0b6c107b74af230595bbb526037d3358550bee26 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Tue, 23 Jul 2024 17:02:36 +0300 Subject: [PATCH 14/21] updated --- .../source-shopify/acceptance-test-config.yml | 2 +- .../shopify_graphql/bulk/job.py | 16 ++++- .../shopify_graphql/bulk/record.py | 64 ++++--------------- .../source_shopify/streams/base_streams.py | 51 ++++++++------- .../unit_tests/graphql_bulk/test_job.py | 15 +++-- .../unit_tests/graphql_bulk/test_record.py | 18 +++--- 6 files changed, 74 insertions(+), 92 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml index 2107558e8c03..4df7a28f2086 100644 --- a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml @@ -8,7 +8,7 @@ acceptance_tests: # This is the intentional change. # Added new field `job_checkpoint_interval`, # to provide the ability to override this value by the User. - disable_for_version: 2.4.13 + disable_for_version: 2.4.14 connection: tests: - config_path: "secrets/config.json" diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index 21c2b4e854d7..17088b4a0998 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -17,6 +17,7 @@ from .exceptions import AirbyteTracedException, ShopifyBulkExceptions from .query import ShopifyBulkQuery, ShopifyBulkTemplates +from .record import ShopifyBulkRecord from .retry import bulk_retry_on_exception from .status import ShopifyBulkJobStatus from .tools import END_OF_FILE, BulkTools @@ -81,7 +82,7 @@ class ShopifyBulkManager: # 2 sec is set as default value to cover the case with the empty-fast-completed jobs _job_last_elapsed_time: float = field(init=False, default=2.0) - def __post_init__(self): + def __post_init__(self) -> None: self._job_size = self.job_size # The upper boundary for slice size is limited by the value from the config, default value is `P30D` self._job_size_max = self.job_size @@ -92,6 +93,8 @@ def __post_init__(self): self._job_max_elapsed_time = self.job_termination_threshold # how many records should be collected before we use the checkpoining self._job_checkpoint_interval = self.job_checkpoint_interval + # define Record Producer instance + self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord(self.query) @property def _tools(self) -> BulkTools: @@ -506,8 +509,15 @@ def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: # emit final Bulk job status message self.logger.info(f"{final_message}") + def _process_bulk_results(self) -> Iterable[Mapping[str, Any]]: + if self._job_result_filename: + # produce records from saved bulk job result + yield from self.record_producer.read_file(self._job_result_filename) + else: + yield from [] + @limiter.balance_rate_limit(api_type=ApiTypeEnum.graphql.value) - def job_check_for_completion(self) -> Optional[str]: + def job_get_results(self) -> Optional[Iterable[Mapping[str, Any]]]: """ This method checks the status for the `CREATED` Shopify BULK Job, using it's `ID`. 
The time spent for the Job execution is tracked to understand the effort. @@ -517,7 +527,7 @@ def job_check_for_completion(self) -> Optional[str]: try: # track created job until it's COMPLETED self._job_check_state() - return self._job_result_filename + yield from self._process_bulk_results() except ( ShopifyBulkExceptions.BulkJobFailed, ShopifyBulkExceptions.BulkJobTimout, diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index 053ae284e662..27641669d942 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -18,13 +18,9 @@ @dataclass class ShopifyBulkRecord: query: ShopifyBulkQuery - checkpoint_interval: Optional[int] = 250 - cursor_field: Optional[Union[int, str]] = None - # default instance records collector - input_buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) - # default instance output buffer - output_buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) + # default buffer + buffer: List[MutableMapping[str, Any]] = field(init=False, default_factory=list) # default logger logger: Final[logging.Logger] = logging.getLogger("airbyte") @@ -38,13 +34,6 @@ def __post_init__(self) -> None: def tools(self) -> BulkTools: return BulkTools() - @property - def default_cursor_comparison_value(self) -> Union[int, str]: - if self.cursor_field: - return 0 if self.cursor_field == "id" else "" - else: - return None - @staticmethod def check_type(record: Mapping[str, Any], types: Union[List[str], str]) -> bool: record_type = record.get("__typename") @@ -56,13 +45,13 @@ def check_type(record: Mapping[str, Any], types: Union[List[str], str]) -> bool: def record_new(self, record: MutableMapping[str, Any]) -> None: record = self.component_prepare(record) record.pop("__typename") - self.input_buffer.append(record) + self.buffer.append(record) def record_new_component(self, record: MutableMapping[str, Any]) -> None: component = record.get("__typename") record.pop("__typename") # add component to its placeholder in the components list - self.input_buffer[-1]["record_components"][component].append(record) + self.buffer[-1]["record_components"][component].append(record) def component_prepare(self, record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: if self.components: @@ -71,33 +60,15 @@ def component_prepare(self, record: MutableMapping[str, Any]) -> MutableMapping[ record["record_components"][component] = [] return record - def flush_input_buffer(self) -> Iterable[Mapping[str, Any]]: - if len(self.input_buffer) > 0: - for record in self.input_buffer: + def buffer_flush(self) -> Iterable[Mapping[str, Any]]: + if len(self.buffer) > 0: + for record in self.buffer: # resolve id from `str` to `int` record = self.record_resolve_id(record) # process record components yield from self.record_process_components(record) - # clean the input_buffer - self.input_buffer.clear() - - def output_sort_asc(self) -> Iterable[Mapping[str, Any]]: - """ - Apply sorting for collected records, to guarantee the `asc` output. - This handles the STATE and CHECKPOINTING correctly, for the `incremental` streams. 
- """ - if self.cursor_field: - yield from sorted( - self.output_buffer, - key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_cursor_comparison_value, - ) - # clear sorted output - self.output_buffer.clear() - else: - # emit without sorting - yield from self.output_buffer - # clear non-sorted output - self.output_buffer.clear() + # clean the buffer + self.buffer.clear() def record_compose(self, record: Mapping[str, Any]) -> Optional[Iterable[MutableMapping[str, Any]]]: """ @@ -107,7 +78,7 @@ def record_compose(self, record: Mapping[str, Any]) -> Optional[Iterable[Mutable """ if self.check_type(record, self.composition.get("new_record")): # emit from previous iteration, if present - yield from self.flush_input_buffer() + yield from self.buffer_flush() # register the record self.record_new(record) # components check @@ -123,8 +94,8 @@ def process_line(self, jsonl_file: TextIOWrapper) -> Iterable[MutableMapping[str elif line != "": yield from self.record_compose(loads(line)) - # emit what's left in the input_buffer, typically last record - yield from self.flush_input_buffer() + # emit what's left in the buffer, typically last record + yield from self.buffer_flush() def record_resolve_id(self, record: MutableMapping[str, Any]) -> MutableMapping[str, Any]: """ @@ -153,20 +124,11 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: The filename example: `bulk-4039263649981.jsonl`, where `4039263649981` is the `id` of the COMPLETED BULK Jobw with `result_url`. Note: typically the `filename` is taken from the `result_url` string provided in the response. - - The output is sorted by ASC, if `cursor_field` has been provided to the `ShopifyBulkRecord` instance. - Otherwise, the records are emitted `as is`. 
""" with open(filename, "r") as jsonl_file: for record in self.process_line(jsonl_file): - self.output_buffer.append(self.tools.fields_names_to_snake_case(record)) - if len(self.output_buffer) == self.checkpoint_interval: - # emit records from output_buffer, sorted ASC - yield from self.output_sort_asc() - - # emit what's left in the output output_buffer, typically last record - yield from self.output_sort_asc() + yield self.tools.fields_names_to_snake_case(record) def read_file(self, filename: str, remove_file: Optional[bool] = True) -> Iterable[Mapping[str, Any]]: try: diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index 08054e82632c..31555f29a352 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -21,7 +21,6 @@ from source_shopify.http_request import ShopifyErrorHandler from source_shopify.shopify_graphql.bulk.job import ShopifyBulkManager from source_shopify.shopify_graphql.bulk.query import ShopifyBulkQuery -from source_shopify.shopify_graphql.bulk.record import ShopifyBulkRecord from source_shopify.transform import DataTypeEnforcer from source_shopify.utils import EagerlyCachedStreamState as stream_state_cache from source_shopify.utils import ShopifyNonRetryableErrors @@ -630,12 +629,13 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream): data_field = "graphql" parent_stream_class: Optional[Union[ShopifyStream, IncrementalShopifyStream]] = None + filter_by_state_checkpoint = True def __init__(self, config: Dict) -> None: super().__init__(config) # init BULK Query instance, pass `shop_id` from config - self.query = self.bulk_query(shop_id=config.get("shop_id")) + self.query: ShopifyBulkQuery = self.bulk_query(shop_id=config.get("shop_id")) # define BULK Manager instance self.job_manager: ShopifyBulkManager = ShopifyBulkManager( http_client=self.bulk_http_client, @@ -648,13 +648,6 @@ def __init__(self, config: Dict) -> None: job_checkpoint_interval=config.get("job_checkpoint_interval", 200000), ) - # define Record Producer instance - self.record_producer: ShopifyBulkRecord = ShopifyBulkRecord( - self.query, - self.state_checkpoint_interval, - self.cursor_field, - ) - @property def bulk_http_client(self) -> HttpClient: """ @@ -663,7 +656,7 @@ def bulk_http_client(self) -> HttpClient: return HttpClient(self.name, self.logger, ShopifyErrorHandler(), session=self._http_client._session) @cached_property - def parent_stream(self) -> object: + def parent_stream(self) -> Union[ShopifyStream, IncrementalShopifyStream]: """ Returns the instance of parent stream, if the substream has a `parent_stream_class` dependency. """ @@ -764,6 +757,23 @@ def stream_slices(self, stream_state: Optional[Mapping[str, Any]] = None, **kwar # for the streams that don't support filtering yield {} + def sort_output_asc(self, non_sorted_records: Iterable[Mapping[str, Any]] = None) -> Iterable[Mapping[str, Any]]: + """ + Apply sorting for collected records, to guarantee the `ASC` output. + This handles the STATE and CHECKPOINTING correctly, for the `incremental` streams. 
+ """ + if non_sorted_records: + if not self.cursor_field: + yield from non_sorted_records + else: + yield from sorted( + non_sorted_records, + key=lambda x: x.get(self.cursor_field) if x.get(self.cursor_field) else self.default_state_comparison_value, + ) + else: + # always return an empty iterable, if no records + return [] + def read_records( self, sync_mode: SyncMode, @@ -773,15 +783,12 @@ def read_records( ) -> Iterable[StreamData]: self.job_manager.create_job(stream_slice, self.filter_field) stream_state = stream_state_cache.cached_state.get(self.name, {self.cursor_field: self.default_state_comparison_value}) - - filename = self.job_manager.job_check_for_completion() - # the `filename` could be `None`, meaning there are no data available for the slice period. - if filename: - # add `shop_url` field to each record produced - records = self.add_shop_url_field( - # produce records from saved bulk job result - self.record_producer.read_file(filename) - ) - yield from self.filter_records_newer_than_state(stream_state, records) - # add log message about the checkpoint value - self.emit_checkpoint_message() + # add `shop_url` field to each record produced + records = self.add_shop_url_field( + # produce records from saved bulk job result + self.job_manager.job_get_results() + ) + # emit records in ASC order + yield from self.filter_records_newer_than_state(stream_state, self.sort_output_asc(records)) + # add log message about the checkpoint value + self.emit_checkpoint_message() diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py index 5cceb6da3651..647f187300f9 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_job.py @@ -189,14 +189,15 @@ def test_job_check_for_completion(mocker, request, requests_mock, job_response, job_result_url = test_job_status_response.json().get("data", {}).get("node", {}).get("url") if error_type: with pytest.raises(error_type) as error: - stream.job_manager.job_check_for_completion() + list(stream.job_manager.job_get_results()) assert expected in repr(error.value) else: if job_result_url: # mocking the nested request call to retrieve the data from result URL requests_mock.get(job_result_url, json=request.getfixturevalue(job_response)) - result = stream.job_manager.job_check_for_completion() - assert expected == result + mocker.patch("source_shopify.shopify_graphql.bulk.record.ShopifyBulkRecord.read_file", return_value=[]) + stream.job_manager._job_check_state() + assert expected == stream.job_manager._job_result_filename @pytest.mark.parametrize( @@ -304,7 +305,7 @@ def test_job_check_with_running_scenario(request, requests_mock, job_response, a "self-canceled with no url", ], ) -def test_job_running_with_canceled_scenario(request, requests_mock, running_job_response, canceled_job_response, auth_config, expected) -> None: +def test_job_running_with_canceled_scenario(mocker, request, requests_mock, running_job_response, canceled_job_response, auth_config, expected) -> None: stream = MetafieldOrders(auth_config) # modify the sleep time for the test stream.job_manager._job_check_interval = 0 @@ -328,7 +329,9 @@ def test_job_running_with_canceled_scenario(request, requests_mock, running_job_ stream.job_manager._job_self_canceled = True # mocking the nested request call to retrieve the data from result URL 
requests_mock.get(job_result_url, json=request.getfixturevalue(canceled_job_response)) - assert stream.job_manager.job_check_for_completion() == expected + mocker.patch("source_shopify.shopify_graphql.bulk.record.ShopifyBulkRecord.read_file", return_value=[]) + stream.job_manager._job_check_state() + assert stream.job_manager._job_result_filename == expected # clean up if expected: remove(expected) @@ -340,7 +343,7 @@ def test_job_read_file_invalid_filename(mocker, auth_config) -> None: # patching the method to get the filename mocker.patch("source_shopify.shopify_graphql.bulk.record.ShopifyBulkRecord.produce_records", side_effect=Exception) with pytest.raises(ShopifyBulkExceptions.BulkRecordProduceError) as error: - list(stream.record_producer.read_file("test.jsonl")) + list(stream.job_manager.record_producer.read_file("test.jsonl")) assert expected in repr(error.value) diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py index 0c45d8257b91..0f18c1965949 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py @@ -80,7 +80,7 @@ def test_record_new(record, expected) -> None: query = ShopifyBulkQuery(shop_id=0) record_instance = ShopifyBulkRecord(query) record_instance.record_new(record) - assert record_instance.input_buffer == [expected] + assert record_instance.buffer == [expected] @pytest.mark.parametrize( @@ -116,15 +116,15 @@ def test_record_new_component(records_from_jsonl, record_components, expected) - record_instance.components = record_components.get("record_components") # register new record first record_instance.record_new(records_from_jsonl[0]) - assert len(record_instance.input_buffer) > 0 + assert len(record_instance.buffer) > 0 # check the components placeholder was created for new record registered - assert "record_components" in record_instance.input_buffer[-1].keys() + assert "record_components" in record_instance.buffer[-1].keys() # register record component record_instance.record_new_component(records_from_jsonl[1]) # check the component was proccessed - assert len(record_instance.input_buffer[-1]["record_components"]["RecordComponent"]) > 0 + assert len(record_instance.buffer[-1]["record_components"]["RecordComponent"]) > 0 # general check - assert record_instance.input_buffer == expected + assert record_instance.buffer == expected @pytest.mark.parametrize( @@ -164,9 +164,9 @@ def test_record_new_component(records_from_jsonl, record_components, expected) - def test_buffer_flush(buffered_record, expected) -> None: query = ShopifyBulkQuery(shop_id=0) record_instance = ShopifyBulkRecord(query) - # populate the input_buffer with record - record_instance.input_buffer.append(buffered_record) - assert list(record_instance.flush_input_buffer()) == expected + # populate the buffer with record + record_instance.buffer.append(buffered_record) + assert list(record_instance.buffer_flush()) == expected @pytest.mark.parametrize( @@ -206,4 +206,4 @@ def test_record_compose(records_from_jsonl, record_composition, expected) -> Non for record in records_from_jsonl: list(record_instance.record_compose(record)) - assert record_instance.input_buffer == expected + assert record_instance.buffer == expected From 1883317ceceed1af7e4126cd2686bc36c10a1eaa Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Wed, 24 Jul 2024 
01:46:34 +0300 Subject: [PATCH 15/21] fixed inf.loop --- .../shopify_graphql/bulk/job.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index 17088b4a0998..ce18effe0092 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -195,6 +195,14 @@ def __reset_state(self) -> None: # set the running job object count to default self._job_last_rec_count = 0 + def set_checkpointing(self) -> None: + # set the flag to adjust the next slice from the checkpointed cursor value + self._job_adjust_slice_from_checkpoint = True + + def reset_checkpointing(self) -> None: + # reseting the checkpoint flag, if bulk job has completed normally + self._job_adjust_slice_from_checkpoint = False + def _job_completed(self) -> bool: return self._job_state == ShopifyBulkJobStatus.COMPLETED.value @@ -225,7 +233,7 @@ def _log_job_state_with_count(self) -> None: else: message = f"Elapsed time: {self._job_elapsed_time_in_state} sec" if self._job_last_rec_count > 0: - count_message = f". Lines collected: {self._job_last_rec_count}" + count_message = f". Rows collected: {self._job_last_rec_count}" message = message + count_message self._log_state(message) self._log_job_msg_count = 0 @@ -256,7 +264,7 @@ def _job_get_result(self, response: Optional[requests.Response] = None) -> Optio def _job_get_checkpointed_result(self, response: Optional[requests.Response]) -> None: if self._job_any_lines_collected or self._job_should_checkpoint: # set the flag to adjust the next slice from the checkpointed cursor value - self._job_adjust_slice_from_checkpoint = True + self.set_checkpointing() # fetch the collected records from CANCELED Job on checkpointing self._job_result_filename = self._job_get_result(response) @@ -295,7 +303,7 @@ def _cancel_on_long_running_job(self) -> None: self._job_cancel() def _cancel_on_checkpointing(self) -> None: - self.logger.info(f"Stream: `{self.http_client._name}`, checkpointing after >= `{self._job_checkpoint_interval}` lines collected.") + self.logger.info(f"Stream: `{self.http_client._name}`, checkpointing after >= `{self._job_checkpoint_interval}` rows collected.") # set the flag to adjust the next slice from the checkpointed cursor value self._job_cancel() @@ -488,15 +496,14 @@ def _adjust_slice_end(self, slice_end: datetime, checkpointed_cursor: Optional[s def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, checkpointed_cursor: Optional[str] = None) -> datetime: if self._job_adjust_slice_from_checkpoint: + # set the checkpointing to default, before the next slice is emitted, to avoid inf.loop + self.reset_checkpointing() return self._adjust_slice_end(slice_end, checkpointed_cursor) if self._is_long_running_job: self._job_size_reduce_next() return slice_start - # reseting the checkpoint flag, if bulk job has completed normally - self._job_adjust_slice_from_checkpoint = False - return slice_end def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: From b499cb009c2bd6377673acdd3434faa4e90ffc1c Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Wed, 24 Jul 2024 15:17:47 +0300 Subject: [PATCH 16/21] minor corrections --- .../connectors/source-shopify/source_shopify/spec.json | 6 +++--- 
.../source-shopify/source_shopify/streams/base_streams.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json index 2bbdaf8a4965..87692dc4d740 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json @@ -102,16 +102,16 @@ "type": "integer", "title": "BULK Job termination threshold", "description": "The max time in seconds, after which the single BULK Job should be `CANCELED` and retried. The bigger the value the longer the BULK Job is allowed to run.", - "default": 3600, + "default": 7200, "minimum": 3600, "maximum": 21600 }, "job_checkpoint_interval": { "type": "integer", "title": "BULK Job checkpoint (lines collected)", - "description": "The threshold, after which the single BULK Job should be `CANCELED` and checkpointed.", + "description": "The threshold, after which the single BULK Job should be checkpointed.", "default": 200000, - "minimum": 15000, + "minimum": 20000, "maximum": 1000000 } } diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index 31555f29a352..3f1f4efacbf1 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -645,7 +645,7 @@ def __init__(self, config: Dict) -> None: # overide the default job slice size, if provided (it's auto-adjusted, later on) job_size=config.get("bulk_window_in_days", 30.0), # provide the job checkpoint interval value, default value is 200k lines collected - job_checkpoint_interval=config.get("job_checkpoint_interval", 200000), + job_checkpoint_interval=config.get("job_checkpoint_interval", 200_000), ) @property From 521ad5a9da647325d8b6aaf71f67dfc822a4284c Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 25 Jul 2024 21:39:37 +0300 Subject: [PATCH 17/21] updated --- .../shopify_graphql/bulk/job.py | 19 ++- .../shopify_graphql/bulk/query.py | 139 ++++++++++-------- .../shopify_graphql/bulk/record.py | 6 + .../source-shopify/source_shopify/spec.json | 14 +- .../source_shopify/streams/base_streams.py | 35 +++-- .../source-shopify/unit_tests/conftest.py | 7 +- .../unit_tests/graphql_bulk/test_query.py | 9 +- .../unit_tests/graphql_bulk/test_record.py | 28 ++-- 8 files changed, 156 insertions(+), 101 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index ce18effe0092..62f816c19215 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -151,9 +151,16 @@ def _is_long_running_job(self) -> bool: self._job_should_revert_slice = False return False + @property + def _supports_checkpointing(self) -> bool: + """ + The flag to determine whether or not the BULK Stream supports the `BULK checkpointing`. 
+ """ + return self.query.supports_checkpointing + @property def _job_should_checkpoint(self) -> bool: - return self._job_last_rec_count >= self._job_checkpoint_interval + return self._supports_checkpointing and self._job_last_rec_count >= self._job_checkpoint_interval @property def _job_any_lines_collected(self) -> bool: @@ -195,11 +202,11 @@ def __reset_state(self) -> None: # set the running job object count to default self._job_last_rec_count = 0 - def set_checkpointing(self) -> None: + def _set_checkpointing(self) -> None: # set the flag to adjust the next slice from the checkpointed cursor value self._job_adjust_slice_from_checkpoint = True - def reset_checkpointing(self) -> None: + def _reset_checkpointing(self) -> None: # reseting the checkpoint flag, if bulk job has completed normally self._job_adjust_slice_from_checkpoint = False @@ -264,7 +271,7 @@ def _job_get_result(self, response: Optional[requests.Response] = None) -> Optio def _job_get_checkpointed_result(self, response: Optional[requests.Response]) -> None: if self._job_any_lines_collected or self._job_should_checkpoint: # set the flag to adjust the next slice from the checkpointed cursor value - self.set_checkpointing() + self._set_checkpointing() # fetch the collected records from CANCELED Job on checkpointing self._job_result_filename = self._job_get_result(response) @@ -497,7 +504,7 @@ def _adjust_slice_end(self, slice_end: datetime, checkpointed_cursor: Optional[s def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, checkpointed_cursor: Optional[str] = None) -> datetime: if self._job_adjust_slice_from_checkpoint: # set the checkpointing to default, before the next slice is emitted, to avoid inf.loop - self.reset_checkpointing() + self._reset_checkpointing() return self._adjust_slice_end(slice_end, checkpointed_cursor) if self._is_long_running_job: @@ -510,7 +517,7 @@ def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: final_message = f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." if self._job_any_lines_collected: - lines_collected_message = f" Rows collected: `{self._job_last_rec_count}`." + lines_collected_message = f" Rows collected: {self._job_last_rec_count} --> records: `{self.record_producer.record_composed}`." final_message = final_message + lines_collected_message # emit final Bulk job status message diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py index 777e716a29e4..1689eceebcc5 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py @@ -79,7 +79,11 @@ def prepare(query: str) -> str: @dataclass class ShopifyBulkQuery: - shop_id: int + config: Mapping[str, Any] + + @property + def shop_id(self) -> int: + return self.config.get("shop_id") @property def tools(self) -> BulkTools: @@ -112,6 +116,14 @@ def sort_key(self) -> Optional[str]: """ return None + @property + def supports_checkpointing(self) -> bool: + """ + The presence of `sort_key = "UPDATED_AT"` for a query instance, usually means, + the server-side BULK Job results are fetched and ordered correctly, suitable for checkpointing. 
+ """ + return self.sort_key == "UPDATED_AT" + @property def query_nodes(self) -> Optional[Union[List[Field], List[str]]]: """ @@ -2382,8 +2394,7 @@ class ProductVariant(ShopifyBulkQuery): """ { productVariants( - query: "updated_at:>='2019-04-13T00:00:00+00:00' AND updated_at:<='2024-04-30T12:16:17.273363+00:00'" - sortKey: UPDATED_AT + query: "updatedAt:>='2019-04-13T00:00:00+00:00' AND updatedAt:<='2024-04-30T12:16:17.273363+00:00'" ) { edges { node { @@ -2457,64 +2468,76 @@ class ProductVariant(ShopifyBulkQuery): """ query_name = "productVariants" - sort_key = "ID" - prices_fields: List[str] = ["amount", "currencyCode"] - presentment_prices_fields: List[Field] = [ - Field( - name="edges", - fields=[ - Field( - name="node", - fields=["__typename", Field(name="price", fields=prices_fields), Field(name="compareAtPrice", fields=prices_fields)], - ) - ], - ) - ] + @property + def _should_include_presentment_prices(self) -> bool: + return self.config.get("job_product_variants_include_pres_prices") - option_value_fields: List[Field] = [ - "id", - "name", - Field(name="hasVariants", alias="has_variants"), - Field(name="swatch", fields=["color", Field(name="image", fields=["id"])]), - ] - option_fields: List[Field] = [ - "name", - "value", - Field(name="optionValue", alias="option_value", fields=option_value_fields), - ] + @property + def query_nodes(self) -> Optional[Union[List[Field], List[str]]]: - # main query - query_nodes: List[Field] = [ - "__typename", - "id", - "title", - "price", - "sku", - "position", - "inventoryPolicy", - "compareAtPrice", - "inventoryManagement", - "createdAt", - "updatedAt", - "taxable", - "barcode", - "weight", - "weightUnit", - "inventoryQuantity", - "requiresShipping", - "availableForSale", - "displayName", - "taxCode", - Field(name="selectedOptions", alias="options", fields=option_fields), - Field(name="weight", alias="grams"), - Field(name="image", fields=[Field(name="id", alias="image_id")]), - Field(name="inventoryQuantity", alias="old_inventory_quantity"), - Field(name="product", fields=[Field(name="id", alias="product_id")]), - Field(name="fulfillmentService", fields=[Field(name="handle", alias="fulfillment_service")]), - Field(name="inventoryItem", fields=[Field(name="id", alias="inventory_item_id")]), - Field(name="presentmentPrices", fields=presentment_prices_fields), - ] + prices_fields: List[str] = ["amount", "currencyCode"] + presentment_prices_fields: List[Field] = [ + Field( + name="edges", + fields=[ + Field( + name="node", + fields=[ + "__typename", + Field(name="price", fields=prices_fields), + Field(name="compareAtPrice", fields=prices_fields), + ], + ) + ], + ) + ] + option_value_fields: List[Field] = [ + "id", + "name", + Field(name="hasVariants", alias="has_variants"), + Field(name="swatch", fields=["color", Field(name="image", fields=["id"])]), + ] + option_fields: List[Field] = [ + "name", + "value", + Field(name="optionValue", alias="option_value", fields=option_value_fields), + ] + presentment_prices = ( + [Field(name="presentmentPrices", fields=presentment_prices_fields)] if self._should_include_presentment_prices else [] + ) + + query_nodes: List[Field] = [ + "__typename", + "id", + "title", + "price", + "sku", + "position", + "inventoryPolicy", + "compareAtPrice", + "inventoryManagement", + "createdAt", + "updatedAt", + "taxable", + "barcode", + "weight", + "weightUnit", + "inventoryQuantity", + "requiresShipping", + "availableForSale", + "displayName", + "taxCode", + Field(name="selectedOptions", alias="options", 
fields=option_fields), + Field(name="weight", alias="grams"), + Field(name="image", fields=[Field(name="id", alias="image_id")]), + Field(name="inventoryQuantity", alias="old_inventory_quantity"), + Field(name="product", fields=[Field(name="id", alias="product_id")]), + Field(name="fulfillmentService", fields=[Field(name="handle", alias="fulfillment_service")]), + Field(name="inventoryItem", fields=[Field(name="id", alias="inventory_item_id")]), + ] + presentment_prices + + return query_nodes record_composition = { "new_record": "ProductVariant", diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py index 27641669d942..acbfd472942b 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/record.py @@ -29,6 +29,8 @@ def __post_init__(self) -> None: self.composition: Optional[Mapping[str, Any]] = self.query.record_composition self.record_process_components: Optional[Callable[[MutableMapping], MutableMapping]] = self.query.record_process_components self.components: List[str] = self.composition.get("record_components", []) if self.composition else [] + # how many records composed + self.record_composed: int = 0 @property def tools(self) -> BulkTools: @@ -127,8 +129,12 @@ def produce_records(self, filename: str) -> Iterable[MutableMapping[str, Any]]: """ with open(filename, "r") as jsonl_file: + # reset the counter + self.record_composed = 0 + for record in self.process_line(jsonl_file): yield self.tools.fields_names_to_snake_case(record) + self.record_composed += 1 def read_file(self, filename: str, remove_file: Optional[bool] = True) -> Iterable[Mapping[str, Any]]: try: diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json index 87692dc4d740..af70d2ce01fe 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/spec.json @@ -98,6 +98,12 @@ "description": "Defines which API type (REST/BULK) to use to fetch `Transactions` data. 
If you are a `Shopify Plus` user, leave the default value to speed up the fetch.", "default": false }, + "job_product_variants_include_pres_prices": { + "type": "boolean", + "title": "Add `Presentment prices` to Product Variants", + "description": "If enabled, the `Product Variants` stream attempts to include `Presentment prices` field (may affect the performance).", + "default": true + }, "job_termination_threshold": { "type": "integer", "title": "BULK Job termination threshold", @@ -108,11 +114,11 @@ }, "job_checkpoint_interval": { "type": "integer", - "title": "BULK Job checkpoint (lines collected)", + "title": "BULK Job checkpoint (rows collected)", "description": "The threshold, after which the single BULK Job should be checkpointed.", - "default": 200000, - "minimum": 20000, - "maximum": 1000000 + "default": 100000, + "minimum": 15000, + "maximum": 200000 } } }, diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py index 3f1f4efacbf1..6c87d856dea5 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/streams/base_streams.py @@ -177,9 +177,14 @@ def request_params( class IncrementalShopifyStream(ShopifyStream, ABC): # Setting the check point interval to the limit of the records output state_checkpoint_interval = 250 - # guarantee for the NestedSubstreams to emit the STATE - # when we have the abnormal STATE distance between Parent and Substream - filter_by_state_checkpoint = False + + @property + def filter_by_state_checkpoint(self) -> bool: + """ + This filtering flag stands to guarantee for the NestedSubstreams to emit the STATE correctly, + when we have the abnormal STATE distance between Parent and Substream + """ + return False # Setting the default cursor field for all streams cursor_field = "updated_at" @@ -223,7 +228,7 @@ def track_checkpoint_cursor(self, record_value: Union[str, int]) -> None: def should_checkpoint(self, index: int) -> bool: return self.filter_by_state_checkpoint and index >= self.state_checkpoint_interval - # Parse the `stream_slice` with respect to `stream_state` for `Incremental refresh` + # Parse the `records` with respect to the `stream_state` for the `Incremental refresh` # cases where we slice the stream, the endpoints for those classes don't accept any other filtering, # but they provide us with the updated_at field in most cases, so we used that as incremental filtering during the order slicing. 
def filter_records_newer_than_state( @@ -630,17 +635,13 @@ class IncrementalShopifyGraphQlBulkStream(IncrementalShopifyStream): parent_stream_class: Optional[Union[ShopifyStream, IncrementalShopifyStream]] = None - filter_by_state_checkpoint = True - def __init__(self, config: Dict) -> None: super().__init__(config) - # init BULK Query instance, pass `shop_id` from config - self.query: ShopifyBulkQuery = self.bulk_query(shop_id=config.get("shop_id")) # define BULK Manager instance self.job_manager: ShopifyBulkManager = ShopifyBulkManager( http_client=self.bulk_http_client, base_url=f"{self.url_base}{self.path()}", - query=self.query, + query=self.bulk_query(config), job_termination_threshold=float(config.get("job_termination_threshold", 3600)), # overide the default job slice size, if provided (it's auto-adjusted, later on) job_size=config.get("bulk_window_in_days", 30.0), @@ -648,6 +649,10 @@ def __init__(self, config: Dict) -> None: job_checkpoint_interval=config.get("job_checkpoint_interval", 200_000), ) + @property + def filter_by_state_checkpoint(self) -> bool: + return self.job_manager._supports_checkpointing + @property def bulk_http_client(self) -> HttpClient: """ @@ -731,10 +736,14 @@ def get_state_value(self, stream_state: Mapping[str, Any] = None) -> Optional[Un def emit_slice_message(self, slice_start: datetime, slice_end: datetime) -> None: slice_size_message = f"Slice size: `P{round(self.job_manager._job_size, 1)}D`" - checkpointing_message = f"Checkpoint after `{self.job_manager.job_checkpoint_interval}` lines" - self.logger.info( - f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}. {checkpointing_message}." - ) + slice_message = f"Stream: `{self.name}` requesting BULK Job for period: {slice_start} -- {slice_end}. {slice_size_message}." + + if self.job_manager._supports_checkpointing: + checkpointing_message = f" The BULK checkpoint after `{self.job_manager.job_checkpoint_interval}` lines." + else: + checkpointing_message = f" The BULK checkpointing is not supported." 
+ + self.logger.info(slice_message + checkpointing_message) def emit_checkpoint_message(self) -> None: if self.job_manager._job_adjust_slice_from_checkpoint: diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py index ce1898b8de19..c82bed05d3f3 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/conftest.py @@ -38,7 +38,11 @@ def logger(): @pytest.fixture def basic_config(): - return {"shop": "test_shop", "credentials": {"auth_method": "api_password", "api_password": "api_password"}} + return { + "shop": "test_shop", + "credentials": {"auth_method": "api_password", "api_password": "api_password"}, + "shop_id": 0, + } @pytest.fixture @@ -48,6 +52,7 @@ def auth_config(): "start_date": "2023-01-01", "credentials": {"auth_method": "api_password", "api_password": "api_password"}, "authenticator": None, + } diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_query.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_query.py index 87781b8e5538..9f5baaf0ef94 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_query.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_query.py @@ -99,7 +99,7 @@ def test_bulk_query_cancel() -> None: ], ids=["simple query with filter and sort"] ) -def test_base_build_query(query_name, fields, filter_field, start, end, expected) -> None: +def test_base_build_query(basic_config, query_name, fields, filter_field, start, end, expected) -> None: """ Expected result rendered: ''' @@ -116,8 +116,7 @@ def test_base_build_query(query_name, fields, filter_field, start, end, expected ''' """ - - builder = ShopifyBulkQuery(shop_id=0) + builder = ShopifyBulkQuery(basic_config) filter_query = f"{filter_field}:>'{start}' AND {filter_field}:<='{end}'" built_query = builder.build(query_name, fields, filter_query) assert expected.render() == built_query.render() @@ -240,6 +239,6 @@ def test_base_build_query(query_name, fields, filter_field, start, end, expected "InventoryLevel query", ] ) -def test_bulk_query(query_class, filter_field, start, end, expected) -> None: - stream = query_class(shop_id=0) +def test_bulk_query(basic_config, query_class, filter_field, start, end, expected) -> None: + stream = query_class(basic_config) assert stream.get(filter_field, start, end) == expected.render() \ No newline at end of file diff --git a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py index 0f18c1965949..dff60ea605d5 100644 --- a/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py +++ b/airbyte-integrations/connectors/source-shopify/unit_tests/graphql_bulk/test_record.py @@ -16,8 +16,8 @@ ({"id": 123}, {"id": 123}), ], ) -def test_record_resolve_id(record, expected) -> None: - bulk_query = ShopifyBulkQuery(shop_id=0) +def test_record_resolve_id(basic_config, record, expected) -> None: + bulk_query = ShopifyBulkQuery(basic_config) assert ShopifyBulkRecord(bulk_query).record_resolve_id(record) == expected @@ -29,8 +29,8 @@ def test_record_resolve_id(record, expected) -> None: ({}, "Other", False), ], ) -def test_check_type(record, types, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def 
test_check_type(basic_config, record, types, expected) -> None: + query = ShopifyBulkQuery(basic_config) assert ShopifyBulkRecord(query).check_type(record, types) == expected @@ -61,8 +61,8 @@ def test_check_type(record, types, expected) -> None: ) ], ) -def test_record_resolver(record, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def test_record_resolver(basic_config, record, expected) -> None: + query = ShopifyBulkQuery(basic_config) record_instance = ShopifyBulkRecord(query) assert record_instance.record_resolve_id(record) == expected @@ -76,8 +76,8 @@ def test_record_resolver(record, expected) -> None: ), ], ) -def test_record_new(record, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def test_record_new(basic_config, record, expected) -> None: + query = ShopifyBulkQuery(basic_config) record_instance = ShopifyBulkRecord(query) record_instance.record_new(record) assert record_instance.buffer == [expected] @@ -110,8 +110,8 @@ def test_record_new(record, expected) -> None: ], ids=["add_component"], ) -def test_record_new_component(records_from_jsonl, record_components, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def test_record_new_component(basic_config, records_from_jsonl, record_components, expected) -> None: + query = ShopifyBulkQuery(basic_config) record_instance = ShopifyBulkRecord(query) record_instance.components = record_components.get("record_components") # register new record first @@ -161,8 +161,8 @@ def test_record_new_component(records_from_jsonl, record_components, expected) - ), ], ) -def test_buffer_flush(buffered_record, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def test_buffer_flush(basic_config, buffered_record, expected) -> None: + query = ShopifyBulkQuery(basic_config) record_instance = ShopifyBulkRecord(query) # populate the buffer with record record_instance.buffer.append(buffered_record) @@ -196,8 +196,8 @@ def test_buffer_flush(buffered_record, expected) -> None: ], ids=["test_compose"], ) -def test_record_compose(records_from_jsonl, record_composition, expected) -> None: - query = ShopifyBulkQuery(shop_id=0) +def test_record_compose(basic_config, records_from_jsonl, record_composition, expected) -> None: + query = ShopifyBulkQuery(basic_config) # query.record_composition = record_composition record_instance = ShopifyBulkRecord(query) record_instance.composition = record_composition From 8d51e42cde6f29c513ca818a6009a8a5b03bcd79 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 25 Jul 2024 21:40:03 +0300 Subject: [PATCH 18/21] updated CAT config --- .../connectors/source-shopify/acceptance-test-config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml index 4df7a28f2086..d9e2dda2a64f 100644 --- a/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml +++ b/airbyte-integrations/connectors/source-shopify/acceptance-test-config.yml @@ -6,7 +6,7 @@ acceptance_tests: - spec_path: "source_shopify/spec.json" backward_compatibility_tests_config: # This is the intentional change. - # Added new field `job_checkpoint_interval`, + # Added new fields: `job_checkpoint_interval`, `job_product_variants_include_pres_prices` # to provide the ability to override this value by the User. 
disable_for_version: 2.4.14 connection: From 25cb146358ab5ea7e33638d903bf66fc1a8cf333 Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Thu, 25 Jul 2024 22:26:08 +0300 Subject: [PATCH 19/21] updated switch url logic --- .../connectors/source-shopify/poetry.lock | 12 +++++----- .../shopify_graphql/bulk/job.py | 24 +++++++------------ .../shopify_graphql/bulk/retry.py | 7 +++--- .../shopify_graphql/bulk/tools.py | 10 ++++++++ 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/poetry.lock b/airbyte-integrations/connectors/source-shopify/poetry.lock index c40071323bce..2fd640aaeeda 100644 --- a/airbyte-integrations/connectors/source-shopify/poetry.lock +++ b/airbyte-integrations/connectors/source-shopify/poetry.lock @@ -2,13 +2,13 @@ [[package]] name = "airbyte-cdk" -version = "3.9.0" +version = "3.9.3" description = "A framework for writing Airbyte Connectors." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "airbyte_cdk-3.9.0-py3-none-any.whl", hash = "sha256:5f8b800f75b3bba1ae1950eee001e12633826e338e6427fa99c89d11c861d283"}, - {file = "airbyte_cdk-3.9.0.tar.gz", hash = "sha256:cc54a63ef76896a18133b4bcb2191013c92d1ce1b405fe5261e8479b79076d63"}, + {file = "airbyte_cdk-3.9.3-py3-none-any.whl", hash = "sha256:a899c32726bcea32426b83b79d0d33b417cc949f1e25c0e29441e5e0bd2dbe8d"}, + {file = "airbyte_cdk-3.9.3.tar.gz", hash = "sha256:21ff845a7e79cd374efa1c092083af3245520b00b9ba2179a2168df66d611118"}, ] [package.dependencies] @@ -1037,13 +1037,13 @@ files = [ [[package]] name = "pytest" -version = "8.3.1" +version = "8.3.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.3.1-py3-none-any.whl", hash = "sha256:e9600ccf4f563976e2c99fa02c7624ab938296551f280835ee6516df8bc4ae8c"}, - {file = "pytest-8.3.1.tar.gz", hash = "sha256:7e8e5c5abd6e93cb1cc151f23e57adc31fcf8cfd2a3ff2da63e23f732de35db6"}, + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, ] [package.dependencies] diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py index 62f816c19215..e1e620bd4959 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/job.py @@ -246,7 +246,7 @@ def _log_job_state_with_count(self) -> None: self._log_job_msg_count = 0 def _log_state(self, message: Optional[str] = None) -> None: - pattern = f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` is {self._job_state}" + pattern = f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` is {self._job_state}" if message: self.logger.info(f"{pattern}. {message}.") else: @@ -305,12 +305,12 @@ def _on_canceling_job(self, **kwargs) -> None: def _cancel_on_long_running_job(self) -> None: self.logger.info( - f"Stream: `{self.http_client._name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). Retry with the reduced `Slice Size` after self-cancelation." + f"Stream: `{self.http_client.name}` the BULK Job: {self._job_id} runs longer than expected ({self._job_max_elapsed_time} sec). 
Retry with the reduced `Slice Size` after self-cancelation." ) self._job_cancel() def _cancel_on_checkpointing(self) -> None: - self.logger.info(f"Stream: `{self.http_client._name}`, checkpointing after >= `{self._job_checkpoint_interval}` rows collected.") + self.logger.info(f"Stream: `{self.http_client.name}`, checkpointing after >= `{self._job_checkpoint_interval}` rows collected.") # set the flag to adjust the next slice from the checkpointed cursor value self._job_cancel() @@ -349,7 +349,7 @@ def _on_job_with_errors(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedE raise ShopifyBulkExceptions.BulkJobError(f"Could not validate the status of the BULK Job `{self._job_id}`. Errors: {errors}.") def _on_non_handable_job_error(self, errors: List[Mapping[str, Any]]) -> AirbyteTracedException: - raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client._name}`, Non-handable error occured: {errors}") + raise ShopifyBulkExceptions.BulkJobNonHandableError(f"The Stream: `{self.http_client.name}`, Non-handable error occured: {errors}") def _get_server_errors(self, response: requests.Response) -> List[Optional[Mapping[str, Any]]]: server_errors = response.json().get("errors", []) @@ -409,12 +409,6 @@ def _has_running_concurrent_job(self, errors: Optional[Iterable[Mapping[str, Any def _has_reached_max_concurrency(self) -> bool: return self._concurrent_attempt == self._concurrent_max_retry - def _switch_base_url(self) -> None: - if self._new_base_url: - self.base_url = self._new_base_url - else: - self.logger.warning(f"Failed switching the `base url`, no `new base url` has been retrieved.") - def _should_switch_shop_name(self, response: requests.Response) -> bool: """ Sometimes the API returns the redirected response that points to the same Store but with different Name: @@ -426,12 +420,12 @@ def _should_switch_shop_name(self, response: requests.Response) -> bool: This redirection is related to: 1) `aliased` or `hidden` store names from being exposed - 2) migrated to data to the new store, but referenced within the old one stil. + 2) `migrated` store data to the `new store`, but referenced within the old one stil reference issue: https://github.com/airbytehq/oncall/issues/5866 """ if self.base_url != response.url: - self._new_base_url = response.url + self.base_url = response.url return True return False @@ -461,7 +455,7 @@ def create_job(self, stream_slice: Mapping[str, str], filter_field: str) -> None if self._has_running_concurrent_job(errors): # when the concurrent job takes place, another job could not be created # we typically need to wait and retry, but no longer than 10 min. (see retry in `bulk_retry_on_exception`) - raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.http_client._name}") + raise ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError(f"Failed to create job for stream {self.http_client.name}") elif self._should_switch_shop_name(response): # assign new shop name, since the one that specified in `config` was redirected to the different one. 
raise ShopifyBulkExceptions.BulkJobRedirectToOtherShopError(f"Switching the `store` name, redirected to: {response.url}") @@ -483,7 +477,7 @@ def _job_process_created(self, response: requests.Response) -> None: self._job_id = bulk_response.get("id") self._job_created_at = bulk_response.get("createdAt") self._job_state = ShopifyBulkJobStatus.CREATED.value - self.logger.info(f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` is {ShopifyBulkJobStatus.CREATED.value}") + self.logger.info(f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` is {ShopifyBulkJobStatus.CREATED.value}") def job_size_normalize(self, start: datetime, end: datetime) -> datetime: # adjust slice size when it's bigger than the loop point when it should end, @@ -514,7 +508,7 @@ def get_adjusted_job_end(self, slice_start: datetime, slice_end: datetime, check return slice_end def _emit_final_job_message(self, job_current_elapsed_time: int) -> None: - final_message = f"Stream: `{self.http_client._name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." + final_message = f"Stream: `{self.http_client.name}`, the BULK Job: `{self._job_id}` time elapsed: {job_current_elapsed_time} sec." if self._job_any_lines_collected: lines_collected_message = f" Rows collected: {self._job_last_rec_count} --> records: `{self.record_producer.record_composed}`." diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py index d5f166b5da40..ec0c242a0b8b 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/retry.py @@ -35,7 +35,7 @@ def wrapper(self, *args, **kwargs) -> Any: raise else: logger.warning( - f"Stream `{self.http_client._name}`: {ex}. Retrying {current_retries}/{self._job_max_retries} after {self._job_backoff_time} seconds." + f"Stream `{self.http_client.name}`: {ex}. Retrying {current_retries}/{self._job_max_retries} after {self._job_backoff_time} seconds." ) sleep(self._job_backoff_time) except ShopifyBulkExceptions.BulkJobCreationFailedConcurrentError: @@ -46,14 +46,13 @@ def wrapper(self, *args, **kwargs) -> Any: self._concurrent_attempt += 1 logger.warning( - f"Stream: `{self.http_client._name}`, the BULK concurrency limit has reached. Waiting {self._concurrent_interval} sec before retry, attempt: {self._concurrent_attempt}.", + f"Stream: `{self.http_client.name}`, the BULK concurrency limit has reached. Waiting {self._concurrent_interval} sec before retry, attempt: {self._concurrent_attempt}.", ) sleep(self._concurrent_interval) except ShopifyBulkExceptions.BulkJobRedirectToOtherShopError: logger.warning( - f"Stream: `{self.http_client._name}`, the `shop name` differs from the provided by the User: `{self.base_url}`. Switching to the `{self._new_base_url}`.", + f"Stream: `{self.http_client.name}`, the `shop name` differs from the provided in `input configuration`. 
Switching to the `{self._tools.shop_name_from_url(self.base_url)}`.", ) - self._switch_base_url() return wrapper diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/tools.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/tools.py index bd5d26099edb..7a23aa1a2d02 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/tools.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/tools.py @@ -52,6 +52,16 @@ def filename_from_url(job_result_url: str) -> str: f"Could not extract the `filename` from `result_url` provided, details: {job_result_url}", ) + @staticmethod + def shop_name_from_url(url: str) -> str: + match = re.search(r"https://(.*?)(\.myshopify)", url) + if match: + return match.group(1) + else: + # safety net, if there is an error parsing url, + # on no match is found + return url + @staticmethod def from_iso8601_to_rfc3339(record: Mapping[str, Any], field: str) -> Mapping[str, Any]: """ From 6e39a8fff5142fde70681666e335a1f142cb727e Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Fri, 26 Jul 2024 12:32:47 +0300 Subject: [PATCH 20/21] fixed default behaviour for pres.prices --- .../source-shopify/source_shopify/shopify_graphql/bulk/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py index 1689eceebcc5..f31e4260b26e 100644 --- a/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py +++ b/airbyte-integrations/connectors/source-shopify/source_shopify/shopify_graphql/bulk/query.py @@ -2471,7 +2471,7 @@ class ProductVariant(ShopifyBulkQuery): @property def _should_include_presentment_prices(self) -> bool: - return self.config.get("job_product_variants_include_pres_prices") + return self.config.get("job_product_variants_include_pres_prices", True) @property def query_nodes(self) -> Optional[Union[List[Field], List[str]]]: From db50cfec128f8102c1d3331c83e03337625f7a8f Mon Sep 17 00:00:00 2001 From: Oleksandr Bazarnov Date: Mon, 29 Jul 2024 15:43:26 +0300 Subject: [PATCH 21/21] bumped version --- airbyte-integrations/connectors/source-shopify/metadata.yaml | 2 +- airbyte-integrations/connectors/source-shopify/pyproject.toml | 2 +- docs/integrations/sources/shopify.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/airbyte-integrations/connectors/source-shopify/metadata.yaml b/airbyte-integrations/connectors/source-shopify/metadata.yaml index 70b8565fe837..8c12e5835ff6 100644 --- a/airbyte-integrations/connectors/source-shopify/metadata.yaml +++ b/airbyte-integrations/connectors/source-shopify/metadata.yaml @@ -11,7 +11,7 @@ data: connectorSubtype: api connectorType: source definitionId: 9da77001-af33-4bcd-be46-6252bf9342b9 - dockerImageTag: 2.4.15 + dockerImageTag: 2.4.16 dockerRepository: airbyte/source-shopify documentationUrl: https://docs.airbyte.com/integrations/sources/shopify githubIssueLabel: source-shopify diff --git a/airbyte-integrations/connectors/source-shopify/pyproject.toml b/airbyte-integrations/connectors/source-shopify/pyproject.toml index d8189f715b1d..c6107064ff48 100644 --- a/airbyte-integrations/connectors/source-shopify/pyproject.toml +++ b/airbyte-integrations/connectors/source-shopify/pyproject.toml @@ -3,7 +3,7 @@ requires = [ 
"poetry-core>=1.0.0",] build-backend = "poetry.core.masonry.api" [tool.poetry] -version = "2.4.15" +version = "2.4.16" name = "source-shopify" description = "Source CDK implementation for Shopify." authors = [ "Airbyte ",] diff --git a/docs/integrations/sources/shopify.md b/docs/integrations/sources/shopify.md index 6d7dca32b063..e40c5e2ab7b5 100644 --- a/docs/integrations/sources/shopify.md +++ b/docs/integrations/sources/shopify.md @@ -212,7 +212,7 @@ For all `Shopify GraphQL BULK` api requests these limitations are applied: https | Version | Date | Pull Request | Subject | |:--------|:-----------|:---------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 2.4.15 | 2024-07-21 | [42095](https://github.com/airbytehq/airbyte/pull/42095) | Added the `Checkpointing` for the `BULK` streams, fixed the `store` redirection | +| 2.4.16 | 2024-07-21 | [42095](https://github.com/airbytehq/airbyte/pull/42095) | Added the `Checkpointing` for the `BULK` streams, fixed the `store` redirection | | 2.4.15 | 2024-07-27 | [42806](https://github.com/airbytehq/airbyte/pull/42806) | Update dependencies | | 2.4.14 | 2024-07-20 | [42150](https://github.com/airbytehq/airbyte/pull/42150) | Update dependencies | | 2.4.13 | 2024-07-13 | [41809](https://github.com/airbytehq/airbyte/pull/41809) | Update dependencies |