From 120c7a95ef82d40f7552e2797f74994d5adbee4e Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 14 Dec 2022 16:51:17 -0900 Subject: [PATCH 01/13] Add s3 bucket support --- setup.py | 1 + src/pudl_catalog/__init__.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a14dd91..82ca9c7 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ + "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", "intake_sqlite>=0.2.0", diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index 49afc3b..9378fa0 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -17,6 +17,7 @@ BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", + "s3": f"s3://{INTAKE_BUCKET}/{CATALOG_VERSION}", # HTTPS access doesn't really work well, so we're hiding it from users for now. "https": f"https://storage.googleapis.com/{INTAKE_BUCKET}/{CATALOG_VERSION}", } @@ -25,9 +26,9 @@ if os.getenv("PUDL_INTAKE_PATH") is None: logger.info( "Environment variable PUDL_INTAKE_PATH is not set. " - f"Defaulting to {BASE_URLS['gs']}" + f"Defaulting to {BASE_URLS['s3']}" ) - os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["gs"] + os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["s3"] if os.getenv("PUDL_INTAKE_CACHE") is None: logger.info( From 6f0f5fb05a3c98535639402529c4d8182eae1ab0 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 14 Dec 2022 18:25:35 -0900 Subject: [PATCH 02/13] Add installation instructions to README --- README.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.rst b/README.rst index 40e5db1..d49e590 100644 --- a/README.rst +++ b/README.rst @@ -68,6 +68,21 @@ See also: PUDL Catalog Usage ------------------ +Installation +~~~~~~~~~~~~ +You can install the PUDL Catalog using conda: + +.. code:: text + + conda install -c conda-forge catalystcoop.pudl + +or pip: + +.. code:: text + + pip install catalystcoop.pudl-catalog + + Accessing Public Cloud Data with Requester Pays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 811dea5a605ec7cdfe21abab0ab0da1d6bbfd184 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:36:40 -0900 Subject: [PATCH 03/13] Update catalog version to recent release --- src/pudl_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index 9378fa0..e124a35 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -13,7 +13,7 @@ INTAKE_BUCKET = "intake.catalyst.coop" # Ideally we would set this version automatically using setuptools_scm... 
-CATALOG_VERSION = "v0.1.1" +CATALOG_VERSION = "v2022.11.30" BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", From e1e48015c9ee0fd785ed808c070c33ffbc89e4c7 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:39:16 -0900 Subject: [PATCH 04/13] Remove requester pays documentation --- README.rst | 12 -- docs/index.rst | 1 - docs/requester_pays.rst | 268 ---------------------------------------- 3 files changed, 281 deletions(-) delete mode 100644 docs/requester_pays.rst diff --git a/README.rst b/README.rst index d49e590..d126eb2 100644 --- a/README.rst +++ b/README.rst @@ -82,18 +82,6 @@ or pip: pip install catalystcoop.pudl-catalog - -Accessing Public Cloud Data with Requester Pays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To control the cost of distributing potentially large quantities of public data, we are -using "requester pays." This means that whoever is downloading the data pays the modest -data egress fees. If you're not familiar with GCP or requester pays, we've written a -short guide to setting up a GCP project to work with this type of data. See the -`PUDL Catalog documentation `__. -for details, and also the -`GCP Documentation on accessing Requester Pays data `__ - Import the Intake Catalogs ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index c88bcbb..4359a93 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,6 @@ The PUDL Data Catalog :hidden: :maxdepth: 2 - Requester Pays Licensing Code of Conduct Release Notes diff --git a/docs/requester_pays.rst b/docs/requester_pays.rst deleted file mode 100644 index 436bd9a..0000000 --- a/docs/requester_pays.rst +++ /dev/null @@ -1,268 +0,0 @@ -======================================================================================= -Accessing Public Cloud Data with Requester Pays -======================================================================================= - -The data we're publishing in the PUDL Catalog is publicly accessible and distributed -under the permissive `CC-BY-4.0 `__ -license. Catalyst covers the cost of storing the data in Google cloud storage buckets. -However, there are also fees incurred when data leaves the Google Cloud Platform (GCP). -Depending where you're downloading from, it costs $0.10-0.20 (USD) per GB. In order to -share large amounts of public data without risking unexpectedly large data egress -charges due to someone maliciously or accidentally downloading a large volume of data, -we've set our cloud storage to use `requester pays -`__. - -"Requester pays" means the person downloading the data is responsible for those costs -instead. As a user, the cost of a single download is cheap. But as a data provider, -payihg for the downloads of all users can quickly get expensive! Downloading all of the -EPA CEMS, FERC 1, PUDL, and US Census data we're publishing from North America should -cost about $0.75, but if we had 1000 downloads in a month that would cost us $750. The -PUDL Intake catalog is also set up to try and cache the data locally so that it's not -downloaded again until a new version is released. - -Setting up a GCP project for billing ------------------------------------- - -The following instructions assume you don't currently have GCP set up, and just want to -be able to use public data that uses requester pays. If your organization already uses -GCP or you want to access the PUDL Catalog in the context of another project that's -running on GCP, you'll probably need to do something different. 
- -Create a GCP Account -~~~~~~~~~~~~~~~~~~~~ - -If you have never used GCP before, go to ``__ and click on the -"Get Started for Free" button. A prompt should appear asking you to choose which Google -account to use for your GCP-related activities. You should be able to log in with a -Gmail account or another Google ID. You don't need to use Gmail for your email. Your -Google ID is just used for identification and authentication. - -.. note:: - - If you already have a GCP account set up then you can just log in to your existing - account. Note that the free startup credits are only available for brand new accounts. - -Create a New GCP Project -~~~~~~~~~~~~~~~~~~~~~~~~ - -GCP allows you to organize services, access, and billing under various Projects, which -you name and define. In the context of accessing public requester pays data, the only -thing it needs to do is identify who you are, and how you'd like to pay for any charges -associated with the accessing the data. These charges are from Google. Catalyst doesn't -charge a markup or get any money from this, it's just a way to help cover the costs of -distributing the data. - -.. note:: - - If you have a pre-existing GCP project that you want to use the PUDL Catalog within, - you can skip this step. - -Let's create a project for use in accessing public data. Here's the -`GCP documentation on creating a new project -`__ - -After you've logged in, at the top of the main GCP home page you should see a search -field. Type in "Manage Resources" and at the top of the suggested searches that pop up -you should see a "Products and Pages" section, which contains "Manage Resources". Click -on that section, and you should come to the Manage Resources page. - -Click on the "Create Project" link near the top of the page: - -.. image:: images/01-manage-resources.png - -The only field you need to have filled in here is the Project Name. You can use the -random one that is suggested by Google. If you're creating a project outside of any -organization, it needs to be globally unique, so Google picks a couple of random words -and a random number (e.g. ``aerobic-star-352200``). - -Create a GCP Billing Account -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The billing account is separate from the project you just created. See the -`GCP docs on creating a billing account `__ -and navigate to ``__. At the top of the page -you should see a "Create Account" button. - -.. image:: images/02-create-billing-account.png - -.. note:: - - If you have a pre-existing GCP Billing Account that you want to use to cover the cost of - downloading data, then you can skip this step. - -Enable billing on your project -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now you need to associate the billing account you just created with the project you -made in the previous step. You need to `enable billing for the project -`__ -through this account. - -.. note:: - - If you are using a pre-existing Billing Account and Project, you might still need to - associate them with each other if they haven't been used together previously. - -At the top of the `billing page `__ there -should be two tabs: "My Billing Accounts" and "My Projects". Select "My Projects" and -then in the row for the new project you created above, click on the three dots in the -far righthand column labeled "Actions". The menu that pops up should have an -"Add Billing" or "Change Billing" option. Click on that, and you should be given the -choice of which billing account to associate with the selected project. - -.. 
image:: images/03-enable-project-billing.png - -Give yourself permission to spend -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, you need to give yourself (your Google ID) the power to spend money within -the new project, using the billing method you just specified. See these instructions -for `Using Google Cloud IAM `__ -(Identity and Access Management), and navigate to the -`IAM Admin page `__. - -Your Google ID (email address) should be listed as a "Principal" and you need to grant -that Principal the "Role" of **Service Usage Consumer**, which enables it to make -billed requests on the behalf of the currently selected project (which should be the -one you created at the beginning of this process -- the project should be visible in the -dropdown menu near the upper-left corner of the page). - -Click on the little pencil icon in the right column of the row associated with your -Google ID to "edit principal". - -.. image:: images/04-iam-admin.png - -This should give you the option to associate a role with a particular combination of -Principal and Project. - -.. image:: images/05-add-a-role.png - -Click on the "Role" field and type in "Service Usage Consumer" -and it should filter the very long list of available roles down to a few, one of which -is "Service Usage Consumer". Select that role, and click on the Save button. - -.. image:: images/06-add-service-usage-consumer.png - - -Check your GCP credit -~~~~~~~~~~~~~~~~~~~~~ -Google provides new accounts with $300 in free credits so you can experiment a bit -without spending any of your own money. Check out the -`docs on the free trial period `__ -to understand how it works. - -To check in on your remaining credits or overall billing status you can go to the -Billing Account Overview and click on the "Credit details" link in the Credits box on -the right hand side of the page: - -.. image:: images/07-billing-account-overview.png - -This will show you a table of any existing GCP credits that have been granted to your -account: - -.. image:: images/08-credits-status.png - -Set up the Google Cloud SDK ---------------------------- - -Now that you have a GCP project with billing enabled, and your Google ID is empowered to -make use of it, you need to install software that allows your computer to communicate -with GCP and make authenticated requests on your behalf. This is part of the Google -Cloud software Development Kit (SDK). - -Install the `gcloud utilities `__ on your -computer. There are several ways to do this. We recommend using ``conda`` or its faster -sibling ``mamba``, since you'll likely be using this catalog in the context of an -existing conda environment. If you're not using `conda` environments, there are other -ways to install the Google Cloud SDK explained in the link above. - -.. code:: - - conda install -c conda-forge google-cloud-sdk - -Log into the account you used to create your new project above by running: - -.. code:: - - gcloud auth login - -Initialize the ``gcloud`` command line interface and select the project you just created -as the default project. This allows the project to be used for requester pays access -through the command line or in software (like an Intake catalog) that has access to -shell environment variables. - -If it asks you whether you want to "re-initialize this configuration with new settings" -say yes. - -.. code:: - - gcloud init - -Finally, use ``gcloud`` to establish application default credentials; this will allow -the project to be used for requester pays access through applications: - -.. 
code:: - - gcloud auth application-default login - -To test whether your GCP account is set up correctly and authenticated you can run the -following command to list the contents of the cloud storage bucket containing the PUDL -catalog data. This doesn't actually download any data, but will show you the versions -that are available: - -.. code:: - - gsutil ls gs://intake.catalyst.coop - -.. code:: - - gs://intake.catalyst.coop/dev/ - gs://intake.catalyst.coop/v0.1.0/ - -Every night we attempt to build a new catalog based on the code and data associated with -the ``dev`` branch of the PUDL repository. If the nightly build and data validation -succeed, the outputs are copied to ``gs://intake.catalyst.coop/dev``. To see what's -available there, how fresh it is, and how big the files are, you can use ``gsutil`` -like this: - -.. code:: - - gsutil ls -l gs://intake.catalyst.coop/dev - - 843649024 2022-09-15T12:27:02Z gs://intake.catalyst.coop/dev/censusdp1tract.sqlite - 761257984 2022-09-15T12:27:04Z gs://intake.catalyst.coop/dev/ferc1.sqlite - 5110330869 2022-09-15T12:28:59Z gs://intake.catalyst.coop/dev/hourly_emissions_epacems.parquet - 702459904 2022-09-15T12:27:01Z gs://intake.catalyst.coop/dev/pudl.sqlite - gs://intake.catalyst.coop/dev/hourly_emissions_epacems/ - TOTAL: 4 objects, 7417697781 bytes (6.91 GiB) - -.. warning:: - - If you download the files directly with ``gsutil`` then you'll be responsible for - updating them, making sure you have the right version, putting them in the right - place on your computer, etc. You also won't benefit from the caching that the Intake - catalogs do. For easier automatic updates, data versioning and dependency management, - we recommend using the Intake catalog rather than direct downloads. But for - developent work it can often be convenient to grab the fresh nightly build outputs. - -If you want to copy these files down directly to your computer, rather than using the -PUDL Intake catalog, you can use the ``gsutil cp`` command, which behaves very much like -the Unix ``cp`` command: - -.. code:: - - gsutil cp gs://intake.catalyst.coop/dev/pudl.sqlite ./ - -If you wanted to download all of the build outputs (more than 10GB!) you could use ``cp --r`` on the whole directory: - -.. code:: - - gsutil cp -r gs://intake.catalyst.coop/dev/ ./ - -For more details on how to use ``gsutil`` in general see the -`online documentation `__ or run: - -.. code:: - - gsutil --help From 20d89dfde85657dd06379d625784c383e1569cc4 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:48:48 -0900 Subject: [PATCH 05/13] Add AWS to sponsors section --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index d126eb2..03306c6 100644 --- a/README.rst +++ b/README.rst @@ -393,3 +393,6 @@ Funding This work is supported by a generous grant from the `Alfred P. Sloan Foundation `__ and their `Energy & Environment Program `__ + +Storage and egress fees for this data are covered by `Amazon Web Services's +Open Data Sponsorship Program `__. 
\ No newline at end of file From e8620380233853da4e871537821508842761d556 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Dec 2022 20:49:21 +0000 Subject: [PATCH 06/13] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 03306c6..7a74497 100644 --- a/README.rst +++ b/README.rst @@ -394,5 +394,5 @@ This work is supported by a generous grant from the `Alfred P. Sloan Foundation `__ and their `Energy & Environment Program `__ -Storage and egress fees for this data are covered by `Amazon Web Services's -Open Data Sponsorship Program `__. \ No newline at end of file +Storage and egress fees for this data are covered by `Amazon Web Services's +Open Data Sponsorship Program `__. From 7a5153bfdb3e517730acbb7c62a77868ee77e51a Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Thu, 15 Dec 2022 15:27:42 -0600 Subject: [PATCH 07/13] Update expected number of EPA CEMS columns to match current outputs. --- tests/integration/hourly_emissions_epacems_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 4ee0196..3352bc1 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -58,7 +58,7 @@ def expected_df() -> pd.DataFrame: storage_options={"requester_pays": True}, ) is_dataframe_like(expected_df) - assert expected_df.shape == (70_272, 19) + assert expected_df.shape == (70_272, 16) return expected_df From e789728269067291e1dc2337a589d268bc3e5e2b Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 16 Dec 2022 09:01:15 -0900 Subject: [PATCH 08/13] Change CATALOG_VERSION to point to dev This way people can get access to the dev data by installing the pudl_catalog package from git. This value will be updated for tagged releases so people can access the release data from PyPI and conda. --- src/pudl_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index e124a35..bf1694d 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -13,7 +13,7 @@ INTAKE_BUCKET = "intake.catalyst.coop" # Ideally we would set this version automatically using setuptools_scm...
-CATALOG_VERSION = "v2022.11.30" +CATALOG_VERSION = "dev" BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", From c012fabb57c7ffcde54e77d177afc4634f740ca4 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 16 Dec 2022 11:41:20 -0900 Subject: [PATCH 09/13] Add anonymous option to intake so users don't need to authenticate with aws --- src/pudl_catalog/pudl_catalog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pudl_catalog/pudl_catalog.yaml b/src/pudl_catalog/pudl_catalog.yaml index d6752cb..af60917 100644 --- a/src/pudl_catalog/pudl_catalog.yaml +++ b/src/pudl_catalog/pudl_catalog.yaml @@ -45,6 +45,8 @@ sources: requester_pays: true gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" @@ -80,6 +82,8 @@ sources: requester_pays: true gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" @@ -93,6 +97,8 @@ sources: storage_options: gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" metadata: @@ -117,6 +123,8 @@ sources: storage_options: gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" metadata: @@ -136,6 +144,8 @@ sources: args: urlpath: "{{ env(PUDL_INTAKE_PATH) }}/censusdp1tract.sqlite" storage_options: + s3: + anon: true gs: requester_pays: true simplecache: From 088b929c2d66cae6086d5549c2b4809cd1bc3c42 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 15:57:34 -0800 Subject: [PATCH 10/13] Add s3 integration tests and removed cache disabling I removed caching disabling because fsspec starting throwing unexpected keyword argument errors when making requests to s3 with caching disabled. See https://github.com/intake/intake-parquet/issues/26 for the full explanation. --- setup.py | 3 +- src/pudl_catalog/pudl_catalog.yaml | 16 +------- tests/integration/file_existence_test.py | 38 ++++++++++++++++++- .../hourly_emissions_epacems_test.py | 35 +++++++++++++---- 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index d2ad80c..2d74e25 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ + "boto3>=1.24,<1.27", "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", @@ -72,7 +73,7 @@ "doc8>=0.9,<1.1", # Ensures clean documentation formatting "flake8>=4,<7", # A framework for linting & static analysis "flake8-builtins>=1.5,<3", # Avoid shadowing Python built-in names - "flake8-colors>=0.1,<0.2", # Produce colorful error / warning output + "flake8-colors>=0.1.9,<0.2", # Produce colorful error / warning output "flake8-docstrings>=1.5,<2", # Ensure docstrings are formatted well "flake8-rst-docstrings>=0.2,<0.4", # Allow use of ReST in docstrings "flake8-use-fstring>=1,<2", # Highlight use of old-style string formatting diff --git a/src/pudl_catalog/pudl_catalog.yaml b/src/pudl_catalog/pudl_catalog.yaml index af60917..9ad21e4 100644 --- a/src/pudl_catalog/pudl_catalog.yaml +++ b/src/pudl_catalog/pudl_catalog.yaml @@ -21,12 +21,6 @@ sources: gross power output. Hourly values reported by US EIA ORISPL code and emissions unit (smokestack) ID. driver: parquet - parameters: - cache_method: - description: "Whether to cache data locally; empty string to disable caching." 
- type: str - default: "simplecache::" - allowed: ["simplecache::", ""] metadata: title: Continuous Emissions Monitoring System (CEMS) Hourly Data type: application/parquet @@ -40,7 +34,7 @@ sources: engine: "pyarrow" split_row_groups: true index: false - urlpath: "{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet" + urlpath: "simplecache::{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet" storage_options: requester_pays: true gs: @@ -58,12 +52,6 @@ sources: gross power output. Hourly values reported by US EIA ORISPL code and emissions unit (smokestack) ID. driver: parquet - parameters: - cache_method: - description: "Whether to cache data locally; empty string to disable caching." - type: str - default: "simplecache::" - allowed: ["simplecache::", ""] metadata: title: Continuous Emissions Monitoring System (CEMS) Hourly Data type: application/parquet @@ -77,7 +65,7 @@ sources: engine: "pyarrow" split_row_groups: true index: false - urlpath: "{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet" + urlpath: "simplecache::{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet" storage_options: requester_pays: true gs: diff --git a/tests/integration/file_existence_test.py b/tests/integration/file_existence_test.py index 591be77..1a093cf 100644 --- a/tests/integration/file_existence_test.py +++ b/tests/integration/file_existence_test.py @@ -1,8 +1,11 @@ """Verify that expected catalog files are available in Google Cloud Storage.""" import logging +import boto3 import pytest from google.cloud import storage +from botocore import UNSIGNED +from botocore.config import Config from pudl_catalog import CATALOG_VERSION, INTAKE_BUCKET @@ -25,10 +28,43 @@ "hourly_emissions_epacems/epacems-2020-FL.parquet", ], ) -def test_file_exists(filename: str) -> None: +def test_gcs_file_exists(filename: str) -> None: """Test that files expected in the catalog are available in GCS.""" storage_client = storage.Client() bucket = storage_client.bucket(INTAKE_BUCKET, user_project=BILLING_PROJECT) fullname = CATALOG_VERSION + "/" + filename blob = storage.blob.Blob(name=fullname, bucket=bucket) assert blob.exists() + +@pytest.mark.parametrize( + "filename", + [ + "censusdp1tract.sqlite", + "ferc1.sqlite", + "pudl.sqlite", + "hourly_emissions_epacems.parquet", + "hourly_emissions_epacems/epacems-2020-NY.parquet", + "hourly_emissions_epacems/epacems-2020-CA.parquet", + "hourly_emissions_epacems/epacems-2020-TX.parquet", + "hourly_emissions_epacems/epacems-2020-FL.parquet", + ], +) +def test_s3_file_exists(filename: str) -> None: + """Test that files expected in the catalog are available in s3.""" + # Disable signing so we don't need to load credentials. + # From https://github.com/boto/boto3/issues/1200#issuecomment-319141394 + # Have to request all of the files because we get an authentication + # error when using Client().get_object_attributes(). + s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + + paginator = s3.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket=INTAKE_BUCKET) + files = [] + for page in page_iterator: + files += page['Contents'] + + filenames = [file["Key"] for file in files] + fullname = CATALOG_VERSION + "/" + filename + + assert fullname in filenames, f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." 
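The same anonymous access this test relies on can also be reproduced interactively with ``s3fs``, the filesystem fsspec uses when the catalog's ``s3: {anon: true}`` storage option is in effect. A rough sketch, assuming the ``dev`` prefix exists in the public bucket:

.. code:: python

   import s3fs

   # List the catalog files without any AWS credentials.
   fs = s3fs.S3FileSystem(anon=True)
   for path in fs.ls("intake.catalyst.coop/dev"):
       print(path)
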
+ diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 3352bc1..0851e3d 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,7 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os +from pathlib import Path import time +import typing from typing import Literal, Optional import dask.dataframe as dd @@ -24,9 +26,11 @@ os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["gs"] +InternetProtocol = Literal["gs", "https", "s3"] + def parquet_url( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, table_name: str, partition_suffix: Optional[str] = None, ) -> str: @@ -34,8 +38,9 @@ def parquet_url( try: url = BASE_URLS[protocol] except KeyError: + valid_protocols = typing.get_args(InternetProtocol) raise ValueError( - f"Received invalid protocol: {protocol}. Must be one of 'gs' or 'https'." + f"Received invalid protocol: {protocol}. Must be one of {' or '.join(valid_protocols)}." ) url = url + "/" + table_name if partition_suffix is None: @@ -48,14 +53,14 @@ def expected_df() -> pd.DataFrame: """Read parquet data directly for comparison with Intake outputs.""" logger.debug("Reading remote test data for comparison using pd.read_parquet().") epacems_url = parquet_url( - protocol="gs", + protocol="s3", table_name="hourly_emissions_epacems", partition_suffix=None, ) expected_df = pd.read_parquet( epacems_url, filters=TEST_FILTERS, - storage_options={"requester_pays": True}, + storage_options={"anon": True}, ) is_dataframe_like(expected_df) assert expected_df.shape == (70_272, 16) @@ -67,10 +72,12 @@ def expected_df() -> pd.DataFrame: [ ("gs", None), ("gs", "_partitioned"), + ("s3", None), + ("s3", "_partitioned"), ], ) def test_read_parquet( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, ) -> None: @@ -82,7 +89,14 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - actual_dd = dd.read_parquet(epacems_url, storage_options={"requester_pays": True}) + + storage_options = {} + if protocol == "gcs": + storage_options["requester_pays"] = True + elif protocol == "s3": + storage_options["anon"] = True + actual_dd = dd.read_parquet(epacems_url, storage_options=storage_options) + elapsed_time = time.time() - start_time logger.debug(f" elapsed time: {elapsed_time:.2f}s") is_dataframe_like(actual_dd) @@ -94,22 +108,27 @@ def test_read_parquet( [ ("gs", None), ("gs", "_partitioned"), + ("s3", None), + ("s3", "_partitioned"), ], ) def test_intake_catalog( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, + tmp_path: Path ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") os.environ["PUDL_INTAKE_PATH"] = BASE_URLS[protocol] + # Save the data to a temporary directory + os.environ["PUDL_INTAKE_CACHE"] = str(tmp_path) pudl_cat = intake.cat.pudl_cat src = "hourly_emissions_epacems" if partition_suffix is not None: src += partition_suffix start_time = time.time() - actual_dd = pudl_cat[src](cache_method="").to_dask() + actual_dd = pudl_cat[src].to_dask() elapsed_time = time.time() - start_time logger.debug(f" elapsed time: {elapsed_time:.2f}s") is_dataframe_like(actual_dd) From 7a23a19cbf1e89126625bd21faa1eb447547900c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Dec 2022 23:57:59 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- tests/integration/file_existence_test.py | 16 +++++++++------- .../integration/hourly_emissions_epacems_test.py | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/integration/file_existence_test.py b/tests/integration/file_existence_test.py index 1a093cf..6e3a3c9 100644 --- a/tests/integration/file_existence_test.py +++ b/tests/integration/file_existence_test.py @@ -3,9 +3,9 @@ import boto3 import pytest -from google.cloud import storage from botocore import UNSIGNED from botocore.config import Config +from google.cloud import storage from pudl_catalog import CATALOG_VERSION, INTAKE_BUCKET @@ -36,6 +36,7 @@ def test_gcs_file_exists(filename: str) -> None: blob = storage.blob.Blob(name=fullname, bucket=bucket) assert blob.exists() + @pytest.mark.parametrize( "filename", [ @@ -55,16 +56,17 @@ def test_s3_file_exists(filename: str) -> None: # From https://github.com/boto/boto3/issues/1200#issuecomment-319141394 # Have to request all of the files because we get an authentication # error when using Client().get_object_attributes(). - s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) - paginator = s3.get_paginator('list_objects_v2') + paginator = s3.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=INTAKE_BUCKET) files = [] for page in page_iterator: - files += page['Contents'] - + files += page["Contents"] + filenames = [file["Key"] for file in files] fullname = CATALOG_VERSION + "/" + filename - assert fullname in filenames, f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." - + assert ( + fullname in filenames + ), f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." 
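For reference, the anonymous S3 read path these integration tests exercise reduces to a call like the following sketch, assuming ``dask`` and ``s3fs`` are installed:

.. code:: python

   import dask.dataframe as dd

   # Lazily open the EPA CEMS parquet data over S3 with anonymous credentials,
   # mirroring the {"anon": True} storage option used for the "s3" protocol.
   epacems_dd = dd.read_parquet(
       "s3://intake.catalyst.coop/dev/hourly_emissions_epacems.parquet",
       storage_options={"anon": True},
   )
   print(epacems_dd.npartitions)
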
diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 0851e3d..5e75cda 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,9 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os -from pathlib import Path import time import typing +from pathlib import Path from typing import Literal, Optional import dask.dataframe as dd @@ -89,7 +89,7 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - + storage_options = {} if protocol == "gcs": storage_options["requester_pays"] = True @@ -116,7 +116,7 @@ def test_intake_catalog( protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, - tmp_path: Path + tmp_path: Path, ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") From 9c0c57ce0ff01233e49eaa73dcbcca8511f3073c Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 16:58:01 -0800 Subject: [PATCH 12/13] Remove boto3 version constraint and fix integration test bug --- README.rst | 5 +++++ setup.py | 2 +- tests/integration/hourly_emissions_epacems_test.py | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 7a74497..e667ae5 100644 --- a/README.rst +++ b/README.rst @@ -187,6 +187,11 @@ types: 'path': 'https://creativecommons.org/licenses/by/4.0'}, 'catalog_dir': '/home/zane/code/catalyst/pudl-catalog/src/pudl_catalog/'}} +.. note:: + + If the data has not been cached this method might take a while to finish depending + on your internet speed. The EPA CEMS parquet data is almost 5 GB. + Read some data from the catalog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/setup.py b/setup.py index 2d74e25..2f36349 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ - "boto3>=1.24,<1.27", + "boto3", # It takes almost 20 min to solve the environment when a boto3 version is specified "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 0851e3d..f030185 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,9 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os -from pathlib import Path import time import typing +from pathlib import Path from typing import Literal, Optional import dask.dataframe as dd @@ -89,9 +89,9 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - + storage_options = {} - if protocol == "gcs": + if protocol == "gs": storage_options["requester_pays"] = True elif protocol == "s3": storage_options["anon"] = True @@ -116,7 +116,7 @@ def test_intake_catalog( protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, - tmp_path: Path + tmp_path: Path, ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") From d027d827a79a424bfe76ecd0ac17c93e5c0f3d28 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 23:23:23 -0800 Subject: [PATCH 13/13] Specify boto3 version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f36349..d211560 100644 --- 
a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ - "boto3", # It takes almost 20 min to solve the environment when a boto3 version is specified + "boto3==1.24.59", "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3",
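Taken together, the changes in this series let a user point the catalog at the public S3 bucket and read data without any cloud credentials. A minimal end-to-end sketch, assuming the package from this branch is installed; the cache directory shown is a hypothetical example, and the environment variable names come from ``src/pudl_catalog/__init__.py`` above:

.. code:: python

   import os

   # Point the catalog at the public S3 bucket and choose a local cache location
   # before importing intake, so the catalog picks up both settings.
   os.environ["PUDL_INTAKE_PATH"] = "s3://intake.catalyst.coop/dev"
   os.environ["PUDL_INTAKE_CACHE"] = os.path.expanduser("~/.cache/pudl-intake")

   import intake

   pudl_cat = intake.cat.pudl_cat
   epacems = pudl_cat.hourly_emissions_epacems.to_dask()
   print(epacems.head())

On the first read the parquet files are downloaded into the cache directory; subsequent reads are served from the local copies.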