From 14e7a1a7c2a59d7d1d21b3af378601fe608e69b1 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Thu, 12 Dec 2024 17:02:50 +1100 Subject: [PATCH 1/4] c --- .../src/python/user-guide/io/cloud-storage.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/source/src/python/user-guide/io/cloud-storage.py b/docs/source/src/python/user-guide/io/cloud-storage.py index 12b02df28e61..f1d1c686ea6f 100644 --- a/docs/source/src/python/user-guide/io/cloud-storage.py +++ b/docs/source/src/python/user-guide/io/cloud-storage.py @@ -77,20 +77,18 @@ def get_credentials() -> pl.CredentialProviderFunctionReturn: # --8<-- [end:scan_pyarrow_dataset] # --8<-- [start:write_parquet] - import polars as pl -import s3fs -df = pl.DataFrame({ - "foo": ["a", "b", "c", "d", "d"], - "bar": [1, 2, 3, 4, 5], -}) +df = pl.DataFrame( + { + "foo": ["a", "b", "c", "d", "d"], + "bar": [1, 2, 3, 4, 5], + } +) -fs = s3fs.S3FileSystem() destination = "s3://bucket/my_file.parquet" -# write parquet -with fs.open(destination, mode='wb') as f: - df.write_parquet(f) +df.write_parquet(destination) + # --8<-- [end:write_parquet] """ From d11ad1ac1bd9f58cb29bea235d81b1a958411f1f Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Mon, 23 Dec 2024 21:08:16 +1100 Subject: [PATCH 2/4] c --- docs/source/user-guide/io/cloud-storage.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md index f12ad4576ebd..d48ac40d6524 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -71,7 +71,8 @@ We first create a PyArrow dataset and then create a `LazyFrame` from the dataset ## Writing to cloud storage -We can write a `DataFrame` to cloud storage in Python using s3fs for S3, adlfs for Azure Blob -Storage and gcsfs for Google Cloud Storage. In this example, we write a Parquet file to S3. 
+`DataFrame`s can also be written to cloud storage by passing a cloud URL: {{code_block('user-guide/io/cloud-storage','write_parquet',['write_parquet'])}} + +Note that `DataFrame`s can also be written to any file-like Python object that supports writes. From f312b7990c49fab6e39f6ba29f64f6480265a472 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Mon, 23 Dec 2024 21:23:26 +1100 Subject: [PATCH 3/4] c --- .../src/python/user-guide/io/cloud-storage.py | 21 +++++++++++++++++++ .../src/rust/user-guide/io/cloud-storage.rs | 3 +++ docs/source/user-guide/io/cloud-storage.md | 4 +++- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/source/src/python/user-guide/io/cloud-storage.py b/docs/source/src/python/user-guide/io/cloud-storage.py index f1d1c686ea6f..f0cbe67a9a0c 100644 --- a/docs/source/src/python/user-guide/io/cloud-storage.py +++ b/docs/source/src/python/user-guide/io/cloud-storage.py @@ -91,4 +91,25 @@ def get_credentials() -> pl.CredentialProviderFunctionReturn: df.write_parquet(destination) # --8<-- [end:write_parquet] + +# --8<-- [start:write_file_object] +import polars as pl +import s3fs +import gzip + +df = pl.DataFrame( + { + "foo": ["a", "b", "c", "d", "d"], + "bar": [1, 2, 3, 4, 5], + } +) + +destination = "s3://bucket/my_file.csv.gz" + +fs = s3fs.S3FileSystem() + +with fs.open(destination, "wb") as cloud_f: + with gzip.open(cloud_f, "w") as f: + df.write_csv(f) +# --8<-- [end:write_file_object] """ diff --git a/docs/source/src/rust/user-guide/io/cloud-storage.rs b/docs/source/src/rust/user-guide/io/cloud-storage.rs index 2df882a39c00..19fe4e66b815 100644 --- a/docs/source/src/rust/user-guide/io/cloud-storage.rs +++ b/docs/source/src/rust/user-guide/io/cloud-storage.rs @@ -44,3 +44,6 @@ async fn main() { // --8<-- [start:write_parquet] // --8<-- [end:write_parquet] + +// --8<-- [start:write_file_object] +// --8<-- [end:write_file_object] diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md 
index d48ac40d6524..0bae6342c3c5 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -75,4 +75,6 @@ We first create a PyArrow dataset and then create a `LazyFrame` from the dataset {{code_block('user-guide/io/cloud-storage','write_parquet',['write_parquet'])}} -Note that `DataFrame`s can also be written to any file-like Python object that supports writes. +Note that `DataFrame`s can also be written to any Python file object that supports writes: + +{{code_block('user-guide/io/cloud-storage','write_csv',['write_file_object'])}} From b68d1189f546a091dbb4d1b33da851dd138d3182 Mon Sep 17 00:00:00 2001 From: Simon Lin Date: Mon, 23 Dec 2024 21:28:20 +1100 Subject: [PATCH 4/4] c --- docs/source/user-guide/io/cloud-storage.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/user-guide/io/cloud-storage.md b/docs/source/user-guide/io/cloud-storage.md index 0bae6342c3c5..5d1449c02c19 100644 --- a/docs/source/user-guide/io/cloud-storage.md +++ b/docs/source/user-guide/io/cloud-storage.md @@ -75,6 +75,8 @@ We first create a PyArrow dataset and then create a `LazyFrame` from the dataset {{code_block('user-guide/io/cloud-storage','write_parquet',['write_parquet'])}} -Note that `DataFrame`s can also be written to any Python file object that supports writes: +Note that `DataFrame`s can also be written to any Python file object that supports writes. This can +be helpful for performing operations that are not yet natively supported, e.g. writing a compressed +CSV directly to cloud storage: -{{code_block('user-guide/io/cloud-storage','write_csv',['write_file_object'])}} +{{code_block('user-guide/io/cloud-storage','write_file_object',['write_csv'])}}