Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream returns AirbyteMessages #18572

Merged
merged 69 commits into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
5f88aa6
method yielding airbytemessage
girarda Oct 27, 2022
0de8395
move to Stream
girarda Oct 27, 2022
e5acdcf
update abstract source
girarda Oct 27, 2022
8a63af7
reset
girarda Oct 27, 2022
d0dcb52
missing file
girarda Oct 28, 2022
adf07fa
add test docker image
girarda Oct 31, 2022
c68e01d
script to run acceptance tests with local cdk
girarda Oct 31, 2022
258990d
Update conftest to use a different image
girarda Oct 31, 2022
b0f80a7
extract to method
girarda Oct 31, 2022
68528b2
don't use a different image tag
girarda Oct 31, 2022
b31f9bd
Always install local cdk
girarda Oct 31, 2022
893beab
break the cdk
girarda Oct 31, 2022
c3964c9
get path from current working directory
girarda Oct 31, 2022
c69774a
or
girarda Oct 31, 2022
152f5df
ignore unit test
girarda Oct 31, 2022
8ea5e85
debug log
girarda Oct 31, 2022
1684823
Revert "AMI change: ami-0f23be2f917510c26 -> ami-005924fb76f7477ce (#…
girarda Oct 31, 2022
b08a9c4
build from the top
girarda Oct 31, 2022
4fd9ed0
Update source-acceptance-test
girarda Oct 31, 2022
06b1c70
fix
girarda Oct 31, 2022
3e4b135
copy setup
girarda Oct 31, 2022
4702f4d
some work on the gradle plugin
girarda Oct 31, 2022
ae7b7fb
reset to master
girarda Nov 1, 2022
3d86bce
delete unused file
girarda Nov 1, 2022
9dc4486
delete unused file
girarda Nov 1, 2022
d3852f3
reset to master
girarda Nov 1, 2022
179d689
optional argument
girarda Nov 1, 2022
65c416c
delete dead code
girarda Nov 1, 2022
b13fe7a
use latest cdk with sendgrid
girarda Nov 2, 2022
0fff056
Merge branch 'master' into alex/read_with_logs
girarda Nov 2, 2022
01dcd2a
fix sendgrid dockerfile
girarda Nov 2, 2022
5d46bb0
Merge branch 'alex/always_install_local_cdk' into alex/read_with_logs
girarda Nov 2, 2022
600c195
break the cdk
girarda Nov 2, 2022
0b48fea
use local file
girarda Nov 2, 2022
613e876
Revert "break the cdk"
girarda Nov 2, 2022
ea2089d
don't raise an exception
girarda Nov 3, 2022
e2d11b3
reset to master
girarda Nov 3, 2022
d901e1c
unit tests
girarda Nov 3, 2022
0600e59
missing test
girarda Nov 3, 2022
82e46c5
more unit tests
girarda Nov 3, 2022
50bb3ed
Merge branch 'master' into alex/read_with_logs
girarda Nov 3, 2022
acfcec8
newline
girarda Nov 3, 2022
3fa6a00
reset to master
girarda Nov 3, 2022
048508d
Merge branch 'master' into alex/read_with_logs
girarda Nov 3, 2022
3e85dca
Merge branch 'master' into alex/read_with_logs
girarda Nov 3, 2022
6d9d0aa
remove files
girarda Nov 3, 2022
464b247
reset
girarda Nov 3, 2022
4acd199
Merge branch 'master' into alex/read_with_logs
girarda Nov 4, 2022
0c36dcb
Update abstract source
girarda Nov 4, 2022
46a807e
remove method from stream
girarda Nov 4, 2022
bad305b
convert to airbytemessage
girarda Nov 5, 2022
d78628f
unittests
girarda Nov 5, 2022
7fd5cc9
Update
girarda Nov 5, 2022
0cbd28e
unit test
girarda Nov 5, 2022
a1a139e
remove debug logs
girarda Nov 5, 2022
b1d62cd
Revert "remove debug logs"
girarda Nov 5, 2022
1da2ed1
Revert "Revert "remove debug logs""
girarda Nov 5, 2022
5dac7c2
Revert "reset to master"
girarda Nov 5, 2022
594ad06
Revert "Revert "reset to master""
girarda Nov 5, 2022
2d91e9a
reset to master
girarda Nov 5, 2022
cff30a1
reset to master
girarda Nov 5, 2022
2f91b80
test
girarda Nov 5, 2022
3fc697c
Revert "test"
girarda Nov 5, 2022
62d95eb
test
girarda Nov 5, 2022
003b860
Revert "test"
girarda Nov 5, 2022
27150ba
test
girarda Nov 5, 2022
26c3289
Revert "test"
girarda Nov 5, 2022
b36842a
format
girarda Nov 5, 2022
fe3b01a
Merge branch 'master' into alex/read_with_logs
girarda Nov 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 33 additions & 47 deletions airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@

import logging
from abc import ABC, abstractmethod
from datetime import datetime
from functools import lru_cache
from typing import Any, Dict, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union

from airbyte_cdk.models import (
AirbyteCatalog,
AirbyteConnectionStatus,
AirbyteMessage,
AirbyteRecordMessage,
AirbyteStateMessage,
ConfiguredAirbyteCatalog,
ConfiguredAirbyteStream,
Expand All @@ -24,8 +21,8 @@
from airbyte_cdk.sources.source import Source
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http.http import HttpStream
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
from airbyte_cdk.sources.utils.transform import TypeTransformer
from airbyte_cdk.utils.event_timing import create_timer
from airbyte_cdk.utils.traced_exception import AirbyteTracedException

Expand Down Expand Up @@ -241,20 +238,27 @@ def _read_incremental(
stream_state=stream_state,
cursor_field=configured_stream.cursor_field or None,
)
for record_counter, record_data in enumerate(records, start=1):
yield self._as_airbyte_record(stream_name, record_data)
stream_state = stream_instance.get_updated_state(stream_state, record_data)
checkpoint_interval = stream_instance.state_checkpoint_interval
if checkpoint_interval and record_counter % checkpoint_interval == 0:
yield self._checkpoint_state(stream_instance, stream_state, state_manager)

total_records_counter += 1
# This functionality should ideally live outside of this method
# but since state is managed inside this method, we keep track
# of it here.
if self._limit_reached(internal_config, total_records_counter):
# Break from slice loop to save state and exit from _read_incremental function.
break
record_counter = 0
for message_counter, record_data_or_message in enumerate(records, start=1):
message = stream_data_to_airbyte_message(
stream_name, record_data_or_message, stream_instance.transformer, stream_instance.get_json_schema()
)
yield message
if message.type == MessageType.RECORD:
record = message.record
stream_state = stream_instance.get_updated_state(stream_state, record.data)
Copy link
Contributor Author

@girarda girarda Oct 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same logic moved inside the if block because we only want to update the state and record counter on records

checkpoint_interval = stream_instance.state_checkpoint_interval
record_counter += 1
if checkpoint_interval and record_counter % checkpoint_interval == 0:
yield self._checkpoint_state(stream_instance, stream_state, state_manager)

total_records_counter += 1
# This functionality should ideally live outside of this method
# but since state is managed inside this method, we keep track
# of it here.
if self._limit_reached(internal_config, total_records_counter):
# Break from slice loop to save state and exit from _read_incremental function.
break

yield self._checkpoint_state(stream_instance, stream_state, state_manager)
if self._limit_reached(internal_config, total_records_counter):
Expand All @@ -277,50 +281,32 @@ def _read_full_refresh(
total_records_counter = 0
for _slice in slices:
logger.debug("Processing stream slice", extra={"slice": _slice})
records = stream_instance.read_records(
record_data_or_messages = stream_instance.read_records(
stream_slice=_slice,
sync_mode=SyncMode.full_refresh,
cursor_field=configured_stream.cursor_field,
)
for record in records:
yield self._as_airbyte_record(configured_stream.stream.name, record)
total_records_counter += 1
if self._limit_reached(internal_config, total_records_counter):
return
for record_data_or_message in record_data_or_messages:
message = stream_data_to_airbyte_message(
stream_instance.name, record_data_or_message, stream_instance.transformer, stream_instance.get_json_schema()
)
yield message
if message.type == MessageType.RECORD:
total_records_counter += 1
if self._limit_reached(internal_config, total_records_counter):
return

def _checkpoint_state(self, stream: Stream, stream_state, state_manager: ConnectorStateManager):
# First attempt to retrieve the current state using the stream's state property. We receive an AttributeError if the state
# property is not implemented by the stream instance and as a fallback, use the stream_state retrieved from the stream
# instance's deprecated get_updated_state() method.
try:
state_manager.update_state_for_stream(stream.name, stream.namespace, stream.state)

except AttributeError:
state_manager.update_state_for_stream(stream.name, stream.namespace, stream_state)
return state_manager.create_state_message(stream.name, stream.namespace, send_per_stream_state=self.per_stream_state_enabled)

@lru_cache(maxsize=None)
def _get_stream_transformer_and_schema(self, stream_name: str) -> Tuple[TypeTransformer, Mapping[str, Any]]:
"""
Lookup stream's transform object and jsonschema based on stream name.
This function would be called a lot so using caching to save on costly
get_json_schema operation.
:param stream_name name of stream from catalog.
:return tuple with stream transformer object and discover json schema.
"""
stream_instance = self._stream_to_instance_map[stream_name]
return stream_instance.transformer, stream_instance.get_json_schema()

def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extracted to a function so it can be reused by the SimpleRetriever

now_millis = int(datetime.now().timestamp() * 1000)
transformer, schema = self._get_stream_transformer_and_schema(stream_name)
# Transform object fields according to config. Most likely you will
# need it to normalize values against json schema. By default no action
# taken unless configured. See
# docs/connector-development/cdk-python/schemas.md for details.
transformer.transform(data, schema) # type: ignore
message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
return AirbyteMessage(type=MessageType.RECORD, record=message)

@staticmethod
def _apply_log_level_to_stream_logger(logger: logging.Logger, stream_instance: Stream):
"""
Expand Down
15 changes: 13 additions & 2 deletions airbyte-cdk/python/airbyte_cdk/sources/streams/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,24 @@
import inspect
import logging
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union

import airbyte_cdk.sources.utils.casing as casing
from airbyte_cdk.models import AirbyteStream, SyncMode
from airbyte_cdk.models import AirbyteLogMessage, AirbyteRecordMessage, AirbyteStream, AirbyteTraceMessage, SyncMode

# list of all possible HTTP methods which can be used for sending of request bodies
from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
from deprecated.classic import deprecated

# A stream's read method can return one of the following types:
# Mapping[str, Any]: The content of an AirbyteRecordMessage
# AirbyteRecordMessage: An AirbyteRecordMessage
# AirbyteLogMessage: A log message
# AirbyteTraceMessage: A trace message
StreamData = Union[Mapping[str, Any], AirbyteRecordMessage, AirbyteLogMessage, AirbyteTraceMessage]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not attached to this typedef, but the full union definition is quite cumbersome



def package_name_from_class(cls: object) -> str:
"""Find the package name given a class name"""
Expand Down Expand Up @@ -94,11 +104,12 @@ def read_records(
cursor_field: List[str] = None,
stream_slice: Mapping[str, Any] = None,
stream_state: Mapping[str, Any] = None,
) -> Iterable[Mapping[str, Any]]:
) -> Iterable[StreamData]:
"""
This method should be overridden by subclasses to read records based on the inputs
"""

@lru_cache(maxsize=None)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

def get_json_schema(self) -> Mapping[str, Any]:
"""
:return: A dict of the JSON schema representing this stream.
Expand Down
4 changes: 3 additions & 1 deletion airbyte-cdk/python/airbyte_cdk/sources/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

# Initialize Utils Package

__all__ = ["record_helper"]
40 changes: 40 additions & 0 deletions airbyte-cdk/python/airbyte_cdk/sources/utils/record_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#

import datetime
from typing import Any, Mapping

from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteRecordMessage, AirbyteTraceMessage
from airbyte_cdk.models import Type as MessageType
from airbyte_cdk.sources.streams.core import StreamData
from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer


def stream_data_to_airbyte_message(
    stream_name: str,
    data_or_message: StreamData,
    transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform),
    schema: Optional[Mapping[str, Any]] = None,
) -> AirbyteMessage:
    """Wrap a stream's emitted data into the appropriate AirbyteMessage.

    A stream's read_records may yield either raw record data (a dict) or an
    already-built Airbyte message (record, trace, or log). This normalizes all
    of those into an AirbyteMessage envelope.

    :param stream_name: name of the stream the data belongs to (used for dict records)
    :param data_or_message: raw record data or a pre-built Airbyte message part
    :param transformer: transformer applied to dict records (no-op by default)
    :param schema: JSON schema used by the transformer; defaults to an empty schema
    :return: the data wrapped in an AirbyteMessage
    :raises ValueError: if data_or_message is not one of the supported types
    """
    if schema is None:
        schema = {}

    if isinstance(data_or_message, dict):
        data = data_or_message
        now_millis = int(datetime.datetime.now().timestamp() * 1000)
        # Transform object fields according to config. Most likely you will
        # need it to normalize values against json schema. By default no action
        # taken unless configured. See
        # docs/connector-development/cdk-python/schemas.md for details.
        transformer.transform(data, schema)  # type: ignore
        message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
        return AirbyteMessage(type=MessageType.RECORD, record=message)
    elif isinstance(data_or_message, AirbyteRecordMessage):
        return AirbyteMessage(type=MessageType.RECORD, record=data_or_message)
    elif isinstance(data_or_message, AirbyteTraceMessage):
        return AirbyteMessage(type=MessageType.TRACE, trace=data_or_message)
    elif isinstance(data_or_message, AirbyteLogMessage):
        return AirbyteMessage(type=MessageType.LOG, log=data_or_message)
    else:
        raise ValueError(f"Unexpected type for data_or_message: {type(data_or_message)}")
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@


from typing import Any, Iterable, List, Mapping
from unittest import mock

import pytest
from airbyte_cdk.models import AirbyteStream, SyncMode
Expand Down Expand Up @@ -173,3 +174,11 @@ def test_wrapped_primary_key_various_argument(test_input, expected):
wrapped = Stream._wrapped_primary_key(test_input)

assert wrapped == expected


@mock.patch("airbyte_cdk.sources.utils.schema_helpers.ResourceSchemaLoader.get_schema")
def test_get_json_schema_is_cached(mocked_method):
    """get_json_schema is memoized: repeated calls must hit the schema loader only once."""
    stream = StreamStubFullRefresh()
    repeated_calls = 5
    for _ in range(repeated_calls):
        stream.get_json_schema()
    assert mocked_method.call_count == 1
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

manually confirmed this test fails if @lru_cache is deleted

Loading