From 71ff82a29da13d82a5d5f7c2f516488f3582a59b Mon Sep 17 00:00:00 2001
From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com>
Date: Fri, 22 Mar 2024 14:31:18 +0100
Subject: [PATCH 1/2] docs

---
 python/deltalake/table.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/python/deltalake/table.py b/python/deltalake/table.py
index 064ee3a83c..7a80e1c5d1 100644
--- a/python/deltalake/table.py
+++ b/python/deltalake/table.py
@@ -1025,10 +1025,26 @@ def to_pyarrow_dataset(
         Args:
             partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax
             filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem
-            parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31
+            parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31]
                 More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.ParquetReadOptions.html
 
+        Example:
+            ``deltalake`` will work with any storage compliant with :class:`pyarrow.fs.FileSystem`, however the root of the filesystem has
+            to be adjusted to point at the root of the Delta table. We can achieve this by wrapping the custom filesystem into
+            a :class:`pyarrow.fs.SubTreeFileSystem`.
+            ```
+            import pyarrow.fs as fs
+            from deltalake import DeltaTable
+
+            table_uri = "s3://<bucket>/<path>"
+            raw_fs, normalized_path = fs.FileSystem.from_uri(table_uri)
+            filesystem = fs.SubTreeFileSystem(normalized_path, raw_fs)
+
+            dt = DeltaTable(table_uri)
+            ds = dt.to_pyarrow_dataset(filesystem=filesystem)
+            ```
+
         Returns:
             the PyArrow dataset in PyArrow
         """
@@ -1063,7 +1079,6 @@ def to_pyarrow_dataset(
                     self._table.table_uri(), self._storage_options, file_sizes
                 )
             )
-
         format = ParquetFileFormat(
             read_options=parquet_read_options,
             default_fragment_scan_options=ParquetFragmentScanOptions(pre_buffer=True),

From 4ce28d4844a5f522c565bbddaf29d4de098a5d73 Mon Sep 17 00:00:00 2001
From: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com>
Date: Fri, 22 Mar 2024 15:16:57 +0100
Subject: [PATCH 2/2] typo

---
 python/deltalake/table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/deltalake/table.py b/python/deltalake/table.py
index 7a80e1c5d1..026588036d 100644
--- a/python/deltalake/table.py
+++ b/python/deltalake/table.py
@@ -1025,7 +1025,7 @@ def to_pyarrow_dataset(
         Args:
             partitions: A list of partition filters, see help(DeltaTable.files_by_partitions) for filter syntax
             filesystem: A concrete implementation of the Pyarrow FileSystem or a fsspec-compatible interface. If None, the first file path will be used to determine the right FileSystem
-            parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31]
+            parquet_read_options: Optional read options for Parquet. Use this to handle INT96 to timestamp conversion for edge cases like 0001-01-01 or 9999-12-31
                 More info: https://arrow.apache.org/docs/python/generated/pyarrow.dataset.ParquetReadOptions.html
 
         Example:
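
The docstring touched by these patches points readers at `parquet_read_options` for INT96 timestamp edge cases. As a minimal sketch of how that option is typically passed (the table path and the millisecond unit below are illustrative assumptions, not part of the patch):

```
import pyarrow.dataset as ds
from deltalake import DeltaTable

# Coerce legacy INT96 timestamps to millisecond precision so that
# out-of-range values such as 0001-01-01 or 9999-12-31 can still be read.
read_options = ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms")

dt = DeltaTable("path/to/delta_table")  # placeholder table location
dataset = dt.to_pyarrow_dataset(parquet_read_options=read_options)
```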
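
The docstring also accepts "a fsspec-compatible interface" for `filesystem`. A sketch of combining an fsspec filesystem with the same `SubTreeFileSystem` re-rooting shown in the patch, assuming `s3fs` is installed and using placeholder bucket and prefix names:

```
import pyarrow.fs as fs
import s3fs  # assumed fsspec implementation; any fsspec filesystem follows the same pattern

from deltalake import DeltaTable

table_uri = "s3://my-bucket/my-table"  # placeholder location

# Wrap the fsspec filesystem so PyArrow can use it, then re-root it at the table.
fsspec_fs = s3fs.S3FileSystem()
wrapped_fs = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs))
filesystem = fs.SubTreeFileSystem("my-bucket/my-table", wrapped_fs)  # base path form depends on the filesystem

dt = DeltaTable(table_uri)
dataset = dt.to_pyarrow_dataset(filesystem=filesystem)
```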