Source S3: update block size for json #21210

Merged 3 commits on Jan 10, 2023
airbyte-integrations/connectors/source-s3/Dockerfile (1 addition, 1 deletion)
@@ -17,5 +17,5 @@ COPY source_s3 ./source_s3
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]

-LABEL io.airbyte.version=0.1.27
+LABEL io.airbyte.version=0.1.28
LABEL io.airbyte.name=airbyte/source-s3
@@ -198,7 +198,7 @@
"block_size": {
"title": "Block Size",
"description": "The chunk size in bytes to process at a time in memory from each file. If your data is particularly wide and failing during schema detection, increasing this should solve it. Beware of raising this too high as you could hit OOM errors.",
"default": 10000,
"default": 0,
"order": 2,
"type": "integer"
}
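The new default of 0 acts as a sentinel rather than a real chunk size: the parser (shown below) translates it into "no explicit block size" so PyArrow can fall back to its own default. A minimal sketch of that mapping, assuming `pyarrow.json.ReadOptions` treats `block_size=None` as "use the built-in default":

```python
from pyarrow import json as pa_json

# 0 is the connector's "disabled" sentinel; passing None instead lets
# PyArrow pick its own default block size.
user_block_size = 0

opts = pa_json.ReadOptions(
    block_size=user_block_size if user_block_size else None,
    use_threads=True,
)
print(opts.block_size)  # PyArrow's built-in default, not 0
```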
@@ -1,17 +1,18 @@
#
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
#


import logging
from typing import Any, BinaryIO, Iterator, Mapping, TextIO, Union

import pyarrow as pa
-from pyarrow import json as pa_json
+from pyarrow import json as pa_json, ArrowNotImplementedError
from source_s3.source_files_abstract.file_info import FileInfo

from .abstract_file_parser import AbstractFileParser
from .jsonl_spec import JsonlFormat

logger = logging.getLogger("airbyte")


class JsonlParser(AbstractFileParser):
TYPE_MAP = {
@@ -50,8 +51,10 @@ def _read_options(self) -> Mapping[str, str]:
"""
https://arrow.apache.org/docs/python/generated/pyarrow.json.ReadOptions.html
build ReadOptions object like: pa.json.ReadOptions(**self._read_options())
+Disable the block size parameter if it is set to 0.
"""
return {**{"block_size": self.format.block_size, "use_threads": True}}
return {**{"block_size": self.format.block_size if self.format.block_size else None,
"use_threads": True}}

def _parse_options(self, json_schema: Mapping[str, Any] = None) -> Mapping[str, str]:
"""
@@ -70,9 +73,14 @@ def _parse_options(self, json_schema: Mapping[str, Any] = None) -> Mapping[str,
return parse_options

def _read_table(self, file: Union[TextIO, BinaryIO], json_schema: Mapping[str, Any] = None) -> pa.Table:
-return pa_json.read_json(
-file, pa.json.ReadOptions(**self._read_options()), pa.json.ParseOptions(**self._parse_options(json_schema))
-)
+try:
+return pa_json.read_json(
+file, pa.json.ReadOptions(**self._read_options()), pa.json.ParseOptions(**self._parse_options(json_schema))
+)
+except ArrowNotImplementedError as e:
+message = "Possibly too small block size used. Please try to increase it or set it to 0 to disable this feature."
+logger.warning(message)
+raise ValueError(message) from e
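For context, here is a hedged reproduction of the failure mode this guards against, using a hypothetical oversized record; the exact exception type varies across PyArrow versions, so the sketch catches the common base class `pa.ArrowException`:

```python
import io

import pyarrow as pa
from pyarrow import json as pa_json

# A single JSON record far larger than the configured block size.
big_line = b'{"a": "' + b"x" * 4096 + b'"}\n'

try:
    pa_json.read_json(io.BytesIO(big_line), pa.json.ReadOptions(block_size=64))
except pa.ArrowException as e:
    # PyArrow rejects records that straddle block boundaries; the connector
    # catches ArrowNotImplementedError specifically and re-raises ValueError.
    print(f"Block size too small: {e}")
```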

def get_inferred_schema(self, file: Union[TextIO, BinaryIO], file_info: FileInfo) -> Mapping[str, Any]:
"""
@@ -38,9 +38,9 @@ class Config:
examples=["ignore", "infer", "error"],
order=1,
)

+# Block size defaults to 0 to disable this feature for less experienced users
block_size: int = Field(
-default=10000,
+default=0,
description="The chunk size in bytes to process at a time in memory from each file. If your data is particularly wide and failing during schema detection, increasing this should solve it. Beware of raising this too high as you could hit OOM errors.",
order=2,
)
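To see how that Field default surfaces in the connector spec shown above, here is a trimmed, hypothetical stand-in for `JsonlFormat` (assuming Pydantic v1, which copies extra `Field` kwargs such as `order` into the generated JSON schema):

```python
from pydantic import BaseModel, Field


class JsonlFormat(BaseModel):
    """Trimmed stand-in for the real model in jsonl_spec.py."""

    block_size: int = Field(
        default=0,
        description="The chunk size in bytes to process at a time in memory from each file.",
        order=2,
    )


# The generated schema mirrors the "block_size" property in the spec JSON above.
print(JsonlFormat.schema()["properties"]["block_size"]["default"])  # -> 0
```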
docs/integrations/sources/s3.md (2 additions, 1 deletion)
@@ -205,10 +205,11 @@ The avro parser uses [fastavro](https://fastavro.readthedocs.io/en/latest/). Cur

The Jsonl parser uses pyarrow, hence only the line-delimited JSON format is supported. For more detailed info, please refer to the [docs](https://arrow.apache.org/docs/python/generated/pyarrow.json.read_json.html).
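For example, a file with one JSON object per line parses cleanly, while a single pretty-printed JSON document would not (illustrative sketch):

```python
import io

from pyarrow import json as pa_json

# One JSON object per line -- the only layout the Jsonl parser supports.
data = b'{"id": 1, "name": "foo"}\n{"id": 2, "name": "bar"}\n'

table = pa_json.read_json(io.BytesIO(data))
print(table.to_pydict())  # {'id': [1, 2], 'name': ['foo', 'bar']}
```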

## Changelog

| Version | Date | Pull Request | Subject |
|:--------|:-----------|:----------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------|
| 0.1.28 | 2023-01-10 | [21210](https://github.com/airbytehq/airbyte/pull/21210) | Update block size for json file format |
| 0.1.27 | 2022-12-08 | [20262](https://github.com/airbytehq/airbyte/pull/20262) | Check config settings for CSV file format |
| 0.1.26 | 2022-11-08 | [19006](https://github.com/airbytehq/airbyte/pull/19006) | Add virtual-hosted-style option |
| 0.1.24 | 2022-10-28 | [18602](https://github.com/airbytehq/airbyte/pull/18602) | Wrap errors into AirbyteTracedException pointing to a problem file |