airbytehq · clnoll · Aug 1, 2023 · Jul 27, 2023 · Jul 31, 2023 · Aug 1, 2023
diff --git a/airbyte-cdk/python/airbyte_cdk/sources/file_based/remote_file.py b/airbyte-cdk/python/airbyte_cdk/sources/file_based/remote_file.py
@@ -3,11 +3,17 @@
 #
 
 from datetime import datetime
+from enum import Enum
 from typing import Optional
 
 from pydantic import BaseModel
 
 
+class FileReadMode(Enum):
+    """
+    Modes in which a file can be opened for reading.
+
+    Each member's value is a mode string in the style accepted by `open()`.
+    """
+
+    READ = "r"
+    READ_BINARY = "rb"
+
+
 class RemoteFile(BaseModel):
     """
     A file in a file-based stream.

diff --git a/airbyte-integrations/connectors/source-s3/Dockerfile b/airbyte-integrations/connectors/source-s3/Dockerfile
@@ -17,5 +17,5 @@ COPY source_s3 ./source_s3
 ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=3.1.1
+LABEL io.airbyte.version=3.1.2
 LABEL io.airbyte.name=airbyte/source-s3
diff --git a/airbyte-integrations/connectors/source-s3/metadata.yaml b/airbyte-integrations/connectors/source-s3/metadata.yaml
@@ -5,7 +5,7 @@ data:
   connectorSubtype: file
   connectorType: source
   definitionId: 69589781-7828-43c5-9f63-8925b1c1ccc2
-  dockerImageTag: 3.1.1
+  dockerImageTag: 3.1.2
   dockerRepository: airbyte/source-s3
   githubIssueLabel: source-s3
   icon: s3.svg

diff --git a/airbyte-integrations/connectors/source-s3/setup.py b/airbyte-integrations/connectors/source-s3/setup.py
@@ -19,7 +19,7 @@
 TEST_REQUIREMENTS = [
     "pytest~=6.1",
     "connector-acceptance-test",
-    "pandas==1.3.1",
+    "pandas==2.0.3",
     "psutil",
     "pytest-order",
     "netifaces~=0.11.0",

diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/__init__.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/__init__.py
@@ -0,0 +1,8 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from .config import Config
+from .stream_reader import SourceS3StreamReader
+
+# Names exported by `from source_s3.v4 import *`.
+__all__ = ["Config", "SourceS3StreamReader"]
diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/config.py
@@ -0,0 +1,56 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from typing import Optional
+
+from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
+from pydantic import AnyUrl, Field, ValidationError, root_validator
+
+
+class Config(AbstractFileBasedSpec):
+    # Connector configuration for the v4 S3 source: the bucket to read from plus optional AWS credentials
+    # or a custom endpoint. (Kept as a comment rather than a docstring so the generated spec is unchanged.)
+    config_version: str = "0.1"
+
+    @classmethod
+    def documentation_url(cls) -> AnyUrl:
+        """
+        Return the URL of the connector's documentation page.
+        """
+        return AnyUrl("https://docs.airbyte.com/integrations/sources/s3", scheme="https")
+
+    bucket: str = Field(title="Bucket", description="Name of the S3 bucket where the file(s) exist.", order=0)
+
+    aws_access_key_id: Optional[str] = Field(
+        title="AWS Access Key ID",
+        default=None,
+        description="In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper "
+        "permissions. If accessing publicly available data, this field is not necessary.",
+        airbyte_secret=True,
+        order=1,
+    )
+
+    aws_secret_access_key: Optional[str] = Field(
+        title="AWS Secret Access Key",
+        default=None,
+        description="In order to access private Buckets stored on AWS S3, this connector requires credentials with the proper "
+        "permissions. If accessing publicly available data, this field is not necessary.",
+        airbyte_secret=True,
+        order=2,
+    )
+
+    endpoint: Optional[str] = Field(
+        "", title="Endpoint", description="Endpoint to an S3 compatible service. Leave empty to use AWS.", order=4
+    )
+
+    @root_validator
+    def validate_optional_args(cls, values):
+        """
+        Cross-field validation of the optional credential and endpoint settings.
+
+        Rules enforced:
+        - `aws_access_key_id` and `aws_secret_access_key` must be provided together.
+        - When credentials are provided, `endpoint` must be empty.
+
+        :param values: the field values validated so far.
+        :return: `values`, unchanged.
+        :raises ValueError: when a rule is violated; pydantic reports it as a `ValidationError` on construction.
+        """
+        aws_access_key_id = values.get("aws_access_key_id")
+        aws_secret_access_key = values.get("aws_secret_access_key")
+        endpoint = values.get("endpoint")
+        if aws_access_key_id or aws_secret_access_key:
+            if not (aws_access_key_id and aws_secret_access_key):
+                # Validators signal failure with ValueError; pydantic wraps it in a ValidationError whose message
+                # renders correctly (a hand-built ValidationError with a string payload does not).
+                raise ValueError("`aws_access_key_id` and `aws_secret_access_key` are both required to authenticate with AWS.")
+            if endpoint:
+                raise ValueError("Either `aws_access_key_id` and `aws_secret_access_key` or `endpoint` must be set, but not both.")
+        return values
diff --git a/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py b/airbyte-integrations/connectors/source-s3/source_s3/v4/stream_reader.py
@@ -0,0 +1,171 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import logging
+from contextlib import contextmanager
+from io import IOBase
+from typing import Iterable, List, Optional, Set
+
+import boto3.session
+import smart_open
+from airbyte_cdk.sources.file_based.exceptions import ErrorListingFiles, FileBasedSourceError
+from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
+from airbyte_cdk.sources.file_based.remote_file import FileReadMode, RemoteFile
+from botocore.client import BaseClient
+from botocore.client import Config as ClientConfig
+from source_s3.v4.config import Config
+
+
+class SourceS3StreamReader(AbstractFileBasedStreamReader):
+    """
+    Stream reader that lists and opens files stored in an S3 (or S3-compatible) bucket.
+
+    The boto3 client is created lazily, on first access to `s3_client`, from the `Config` assigned through the
+    `config` setter.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._s3_client = None
+
+    @property
+    def config(self) -> Config:
+        return self._config
+
+    @config.setter
+    def config(self, value: Config):
+        """
+        FileBasedSource reads the config from disk and parses it, and once parsed, the source sets the config on its StreamReader.
+
+        Note: FileBasedSource only requires the keys defined in the abstract config, whereas concrete implementations of StreamReader
+        will require keys that (for example) allow it to authenticate with the 3rd party.
+
+        Therefore, concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct
+        config type for that type of StreamReader.
+        """
+        assert isinstance(value, Config)
+        self._config = value
+
+    @property
+    def s3_client(self) -> BaseClient:
+        """
+        Return the boto3 S3 client, creating and caching it on first use.
+
+        When `config.endpoint` is set the client is built from `_get_s3_compatible_client_args`; otherwise it is
+        built from the configured AWS access key pair.
+
+        :raises ValueError: if no config has been set yet.
+        """
+        if self.config is None:
+            # We shouldn't hit this; config should always get set before attempting to
+            # list or read files.
+            raise ValueError("Source config is missing; cannot create the S3 client.")
+        if self._s3_client is None:
+            if self.config.endpoint:
+                client_kv_args = _get_s3_compatible_client_args(self.config)
+                self._s3_client = boto3.client("s3", **client_kv_args)
+            else:
+                self._s3_client = boto3.client(
+                    "s3",
+                    aws_access_key_id=self.config.aws_access_key_id,
+                    aws_secret_access_key=self.config.aws_secret_access_key,
+                )
+        return self._s3_client
+
+    def get_matching_files(self, globs: List[str], logger: logging.Logger) -> Iterable[RemoteFile]:
+        """
+        Get all files matching the specified glob patterns.
+
+        Objects are listed once per prefix derived from the globs (or once for the whole bucket when there are
+        no prefixes); each matching key is yielded at most once.
+
+        :raises ErrorListingFiles: if anything goes wrong while listing; the original exception is chained.
+        """
+        s3 = self.s3_client
+        prefixes = self.get_prefixes_from_globs(globs)
+        seen = set()
+        total_n_keys = 0
+
+        try:
+            # No prefixes means "list the whole bucket", which `_page` expresses as prefix=None.
+            for prefix in prefixes or [None]:
+                for remote_file in self._page(s3, globs, self.config.bucket, prefix, seen, logger):
+                    total_n_keys += 1
+                    yield remote_file
+
+            logger.info(f"Finished listing objects from S3. Found {total_n_keys} objects total ({len(seen)} unique objects).")
+        except Exception as exc:
+            raise ErrorListingFiles(
+                FileBasedSourceError.ERROR_LISTING_FILES,
+                source="s3",
+                bucket=self.config.bucket,
+                globs=globs,
+                endpoint=self.config.endpoint,
+            ) from exc
+
+    @contextmanager
+    def open_file(self, file: RemoteFile, mode: FileReadMode, logger: logging.Logger) -> IOBase:
+        """
+        Context manager that opens `file` from the configured bucket and closes it on exit.
+
+        :param file: the remote file to open; `file.uri` is used as the object key.
+        :param mode: read mode; its value is passed to `smart_open.open`.
+        :param logger: logger used for debug and warning messages.
+        :raises OSError: if the object cannot be opened (logged as a warning first).
+        """
+        params = {"client": self.s3_client}
+
+        logger.debug(f"try to open {file.uri}")
+        try:
+            result = smart_open.open(f"s3://{self.config.bucket}/{file.uri}", transport_params=params, mode=mode.value)
+        except OSError:
+            logger.warning(
+                f"We don't have access to {file.uri}. The file appears to have become unreachable during sync. "
+                f"Check whether key {file.uri} exists in `{self.config.bucket}` bucket and/or has proper ACL permissions"
+            )
+            # Propagate the failure: there is no handle to yield, and falling through would only
+            # mask the real error with an UnboundLocalError.
+            raise
+        # see https://docs.python.org/3/library/contextlib.html#contextlib.contextmanager for why we do this
+        try:
+            yield result
+        finally:
+            result.close()
+
+    @staticmethod
+    def _is_folder(file) -> bool:
+        """
+        Return True when the listed object's key ends with "/".
+        """
+        return file["Key"].endswith("/")
+
+    def _page(
+        self, s3: BaseClient, globs: List[str], bucket: str, prefix: Optional[str], seen: Set[str], logger: logging.Logger
+    ) -> Iterable[RemoteFile]:
+        """
+        Page through lists of S3 objects.
+
+        Yields a `RemoteFile` for every listed key that is not a folder, matches `globs` and is not already in
+        `seen`; yielded keys are added to `seen`.
+        """
+        total_n_keys_for_prefix = 0
+        kwargs = {"Bucket": bucket}
+        if prefix:
+            kwargs["Prefix"] = prefix
+        while True:
+            # Always call with `kwargs` so the continuation token set below is actually sent;
+            # otherwise a truncated listing would return the first page forever.
+            response = s3.list_objects_v2(**kwargs)
+            # Treat a missing KeyCount as zero instead of failing on `None`.
+            key_count = response.get("KeyCount") or 0
+            total_n_keys_for_prefix += key_count
+            logger.info(f"Received {key_count} objects from S3 for prefix '{prefix}'.")
+
+            if "Contents" in response:
+                for file in response["Contents"]:
+                    if self._is_folder(file):
+                        continue
+                    remote_file = RemoteFile(uri=file["Key"], last_modified=file["LastModified"])
+                    if self.file_matches_globs(remote_file, globs) and remote_file.uri not in seen:
+                        seen.add(remote_file.uri)
+                        yield remote_file
+            else:
+                logger.warning(f"Invalid response from S3; missing 'Contents' key. kwargs={kwargs}.")
+
+            if next_token := response.get("NextContinuationToken"):
+                kwargs["ContinuationToken"] = next_token
+            else:
+                logger.info(f"Finished listing objects from S3 for prefix={prefix}. Found {total_n_keys_for_prefix} objects.")
+                break
+
+
+# Used to specify anonymous (unsigned) request signature
+class UNSIGNED:
+    """
+    Marker type for an anonymous (unsigned) request signature.
+
+    Copying an instance, shallowly or deeply, returns the very same object.
+    """
+
+    # NOTE(review): botocore ships its own `botocore.UNSIGNED` sentinel; confirm that an instance of this
+    # local class is honoured as "unsigned" wherever it is passed as a signature version.
+
+    def __copy__(self):
+        # A shallow copy is the instance itself.
+        return self
+
+    def __deepcopy__(self, memodict):
+        # A deep copy is also the instance itself; `memodict` is not consulted.
+        return self
+
+
+def _get_s3_compatible_client_args(config: Config) -> dict:
+    """
+    Returns map of args used for creating s3 boto3 client.
+
+    :param config: the source config; only `config.endpoint` is read.
+    :return: keyword arguments for `boto3.client("s3", ...)`: `config`, `endpoint_url`, `use_ssl` and `verify`.
+    """
+    # NOTE(review): this function previously built `ClientConfig(signature_version=UNSIGNED())` first and then
+    # replaced it under the same "config" key, so the unsigned signature setting never reached the client.
+    # The dead store is removed and the effective arguments are unchanged; if anonymous access to the
+    # endpoint is actually intended, that needs to be added deliberately.
+    return {
+        "config": ClientConfig(s3={"addressing_style": "auto"}),
+        "endpoint_url": config.endpoint,
+        "use_ssl": True,
+        "verify": True,
+    }
diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/__init__.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/__init__.py
diff --git a/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_config.py b/airbyte-integrations/connectors/source-s3/unit_tests/v4/test_config.py
@@ -0,0 +1,26 @@
+
+import logging
+
+import pytest
+from pydantic import ValidationError
+from source_s3.v4.config import Config
+
+# Logger for this test module, constructed directly with an empty name.
+logger = logging.Logger("")
+
+
+@pytest.mark.parametrize(
+    "kwargs,expected_error",
+    [
+        pytest.param(
+            {"bucket": "test", "streams": []},
+            None,
+            id="required-fields",
+        ),
+        pytest.param(
+            {
+                "bucket": "test",
+                "streams": [],
+                "aws_access_key_id": "access_key",
+                "aws_secret_access_key": "secret_access_key",
+            },
+            None,
+            id="config-created-with-aws-info",
+        ),
+        pytest.param(
+            {"bucket": "test", "streams": [], "endpoint": "http://test.com"},
+            None,
+            id="config-created-with-endpoint",
+        ),
+        pytest.param(
+            {
+                "bucket": "test",
+                "streams": [],
+                "aws_access_key_id": "access_key",
+                "aws_secret_access_key": "secret_access_key",
+                "endpoint": "http://test.com",
+            },
+            ValidationError,
+            id="cannot-have-endpoint-and-aws-info",
+        ),
+        pytest.param(
+            {"streams": []},
+            ValidationError,
+            id="missing-bucket",
+        ),
+    ],
+)
+def test_config(kwargs, expected_error):
+    """
+    Build a `Config` from `kwargs`; expect `expected_error` to be raised when one is given, success otherwise.
+    """
+    if not expected_error:
+        # Happy path: construction must simply not raise.
+        Config(**kwargs)
+        return
+
+    with pytest.raises(expected_error):
+        Config(**kwargs)