From db26cc273cd93074d5c329ca768cb70e53fcc6c7 Mon Sep 17 00:00:00 2001
From: Kevin Tse
Date: Thu, 6 Oct 2022 19:12:00 -0400
Subject: [PATCH] Adding Cloud Storage Provider tutorial section

ghstack-source-id: e17389a9dd199503b6b1f2a8214867bccb30f3dc
Pull Request resolved: https://github.com/pytorch/data/pull/812
---
 docs/source/tutorial.rst | 74 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index fd33beac8..7c584729c 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -294,3 +294,77 @@ The stack of DataPipes can then be constructed using their functional forms (rec
In the above example, ``datapipes1`` and ``datapipes2`` represent the exact same stack of ``IterDataPipe``\s.

We recommend using the functional form of DataPipes.

Working with Cloud Storage Providers
---------------------------------------------

Accessing AWS S3 with ``fsspec`` DataPipes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This requires the installation of the libraries ``fsspec``
(`documentation <https://filesystem-spec.readthedocs.io/en/latest/>`_) and ``s3fs``
(`s3fs GitHub repo <https://github.com/fsspec/s3fs>`_).

You can list out the files within an S3 bucket directory by passing a path that starts
with ``"s3://BUCKET_NAME"`` to ``FSSpecFileLister``.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(["s3://BUCKET_NAME"]).list_files_by_fsspec()

You can also open files using ``FSSpecFileOpener`` and stream them (if supported by the file format).
Note that you can also provide additional parameters via the argument ``kwargs_for_open``; this can be
useful for purposes such as accessing a specific bucket version. The supported arguments vary by
the (cloud) file system that you are accessing.

In the example below, we stream the archive by using ``.load_from_tar(mode="r|")``
(in contrast with ``mode="r:"``). This allows us to begin processing data inside the archive
without first downloading the whole archive into memory.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(["s3://BUCKET_NAME/DIRECTORY/1.tar"])
    dp = dp.open_files_by_fsspec(mode="rb", anon=True).load_from_tar(mode="r|")  # Streaming version
    # The rest of the data processing logic goes here

Finally, ``FSSpecSaver`` is also available for writing data to the cloud.
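
Below is a minimal sketch of how the saver might be used, assuming its functional form
``save_by_fsspec``; the bucket path and the ``(name, data)`` samples here are hypothetical
placeholders rather than part of the original example.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    # Hypothetical (metadata, data) pairs to be written out
    dp = IterableWrapper([("file1.txt", b"DATA1"), ("file2.txt", b"DATA2")])
    # filepath_fn maps each sample's metadata (here, a file name) to a destination path
    dp = dp.save_by_fsspec(filepath_fn=lambda name: f"s3://BUCKET_NAME/{name}", mode="wb")
    # Iterating over the DataPipe performs the writes and yields the destination paths
    for path in dp:
        print(path)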

Accessing Google Cloud Storage (GCS) with ``fsspec`` DataPipes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This requires the installation of the libraries ``fsspec``
(`documentation <https://filesystem-spec.readthedocs.io/en/latest/>`_) and ``gcsfs``
(`gcsfs GitHub repo <https://github.com/fsspec/gcsfs>`_).

You can list out the files within a GCS bucket directory by specifying a path that starts
with ``"gcs://BUCKET_NAME"``. The bucket name in the example below is ``uspto-pair``.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(["gcs://uspto-pair/"]).list_files_by_fsspec()
    print(list(dp))
    # ['gcs://uspto-pair/applications', 'gcs://uspto-pair/docs', 'gcs://uspto-pair/prosecution-history-docs']

Here is an example of loading a zip file ``05900035.zip`` from a bucket named ``uspto-pair`` inside the
directory ``applications``.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(["gcs://uspto-pair/applications/05900035.zip"]) \
        .open_files_by_fsspec(mode="rb") \
        .load_from_zip()
    # The logic to process those archive files comes after
    for path, filestream in dp:
        print(path, filestream)
        # gcs:/uspto-pair/applications/05900035.zip/05900035/README.txt, StreamWrapper<...>
        # gcs:/uspto-pair/applications/05900035.zip/05900035/05900035-address_and_attorney_agent.tsv, StreamWrapper<...>
        # gcs:/uspto-pair/applications/05900035.zip/05900035/05900035-application_data.tsv, StreamWrapper<...>
        # gcs:/uspto-pair/applications/05900035.zip/05900035/05900035-continuity_data.tsv, StreamWrapper<...>
        # gcs:/uspto-pair/applications/05900035.zip/05900035/05900035-transaction_history.tsv, StreamWrapper<...>
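
As one possible way to continue from here (a sketch, not part of the original example), the
``.tsv`` members could be kept and parsed row by row using the ``filter`` and ``parse_csv``
DataPipes; the ``delimiter`` argument is forwarded to Python's ``csv`` module.

.. code:: python

    from torchdata.datapipes.iter import IterableWrapper

    dp = IterableWrapper(["gcs://uspto-pair/applications/05900035.zip"]) \
        .open_files_by_fsspec(mode="rb") \
        .load_from_zip() \
        .filter(lambda path_and_stream: path_and_stream[0].endswith(".tsv")) \
        .parse_csv(delimiter="\t")
    # Each element is now one parsed row (a list of strings) from one of the TSV files
    for row in dp:
        print(row)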