From 120c7a95ef82d40f7552e2797f74994d5adbee4e Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 14 Dec 2022 16:51:17 -0900 Subject: [PATCH 01/13] Add s3 bucket support --- setup.py | 1 + src/pudl_catalog/__init__.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a14dd91..82ca9c7 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ + "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", "intake_sqlite>=0.2.0", diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index 49afc3b..9378fa0 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -17,6 +17,7 @@ BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", + "s3": f"s3://{INTAKE_BUCKET}/{CATALOG_VERSION}", # HTTPS access doesn't really work well, so we're hiding it from users for now. "https": f"https://storage.googleapis.com/{INTAKE_BUCKET}/{CATALOG_VERSION}", } @@ -25,9 +26,9 @@ if os.getenv("PUDL_INTAKE_PATH") is None: logger.info( "Environment variable PUDL_INTAKE_PATH is not set. " - f"Defaulting to {BASE_URLS['gs']}" + f"Defaulting to {BASE_URLS['s3']}" ) - os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["gs"] + os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["s3"] if os.getenv("PUDL_INTAKE_CACHE") is None: logger.info( From 6f0f5fb05a3c98535639402529c4d8182eae1ab0 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Wed, 14 Dec 2022 18:25:35 -0900 Subject: [PATCH 02/13] Add installation instructions to README --- README.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.rst b/README.rst index 40e5db1..d49e590 100644 --- a/README.rst +++ b/README.rst @@ -68,6 +68,21 @@ See also: PUDL Catalog Usage ------------------ +Installation +~~~~~~~~~~~~ +You can install the PUDL Catalog using conda: + +.. code:: text + + conda install -c conda-forge catalystcoop.pudl + +or pip: + +.. code:: text + + pip install catalystcoop.pudl-catalog + + Accessing Public Cloud Data with Requester Pays ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 811dea5a605ec7cdfe21abab0ab0da1d6bbfd184 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:36:40 -0900 Subject: [PATCH 03/13] Update catalog version to recent release --- src/pudl_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index 9378fa0..e124a35 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -13,7 +13,7 @@ INTAKE_BUCKET = "intake.catalyst.coop" # Ideally we would set this version automatically using setuptools_scm... 
-CATALOG_VERSION = "v0.1.1" +CATALOG_VERSION = "v2022.11.30" BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", From e1e48015c9ee0fd785ed808c070c33ffbc89e4c7 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:39:16 -0900 Subject: [PATCH 04/13] Remove requester pays documentation --- README.rst | 12 -- docs/index.rst | 1 - docs/requester_pays.rst | 268 ---------------------------------------- 3 files changed, 281 deletions(-) delete mode 100644 docs/requester_pays.rst diff --git a/README.rst b/README.rst index d49e590..d126eb2 100644 --- a/README.rst +++ b/README.rst @@ -82,18 +82,6 @@ or pip: pip install catalystcoop.pudl-catalog - -Accessing Public Cloud Data with Requester Pays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To control the cost of distributing potentially large quantities of public data, we are -using "requester pays." This means that whoever is downloading the data pays the modest -data egress fees. If you're not familiar with GCP or requester pays, we've written a -short guide to setting up a GCP project to work with this type of data. See the -`PUDL Catalog documentation `__. -for details, and also the -`GCP Documentation on accessing Requester Pays data `__ - Import the Intake Catalogs ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/index.rst b/docs/index.rst index c88bcbb..4359a93 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,7 +9,6 @@ The PUDL Data Catalog :hidden: :maxdepth: 2 - Requester Pays Licensing Code of Conduct Release Notes diff --git a/docs/requester_pays.rst b/docs/requester_pays.rst deleted file mode 100644 index 436bd9a..0000000 --- a/docs/requester_pays.rst +++ /dev/null @@ -1,268 +0,0 @@ -======================================================================================= -Accessing Public Cloud Data with Requester Pays -======================================================================================= - -The data we're publishing in the PUDL Catalog is publicly accessible and distributed -under the permissive `CC-BY-4.0 `__ -license. Catalyst covers the cost of storing the data in Google cloud storage buckets. -However, there are also fees incurred when data leaves the Google Cloud Platform (GCP). -Depending where you're downloading from, it costs $0.10-0.20 (USD) per GB. In order to -share large amounts of public data without risking unexpectedly large data egress -charges due to someone maliciously or accidentally downloading a large volume of data, -we've set our cloud storage to use `requester pays -`__. - -"Requester pays" means the person downloading the data is responsible for those costs -instead. As a user, the cost of a single download is cheap. But as a data provider, -payihg for the downloads of all users can quickly get expensive! Downloading all of the -EPA CEMS, FERC 1, PUDL, and US Census data we're publishing from North America should -cost about $0.75, but if we had 1000 downloads in a month that would cost us $750. The -PUDL Intake catalog is also set up to try and cache the data locally so that it's not -downloaded again until a new version is released. - -Setting up a GCP project for billing ------------------------------------- - -The following instructions assume you don't currently have GCP set up, and just want to -be able to use public data that uses requester pays. If your organization already uses -GCP or you want to access the PUDL Catalog in the context of another project that's -running on GCP, you'll probably need to do something different. 
- -Create a GCP Account -~~~~~~~~~~~~~~~~~~~~ - -If you have never used GCP before, go to ``__ and click on the -"Get Started for Free" button. A prompt should appear asking you to choose which Google -account to use for your GCP-related activities. You should be able to log in with a -Gmail account or another Google ID. You don't need to use Gmail for your email. Your -Google ID is just used for identification and authentication. - -.. note:: - - If you already have a GCP account set up then you can just log in to your existing - account. Note that the free startup credits are only available for brand new accounts. - -Create a New GCP Project -~~~~~~~~~~~~~~~~~~~~~~~~ - -GCP allows you to organize services, access, and billing under various Projects, which -you name and define. In the context of accessing public requester pays data, the only -thing it needs to do is identify who you are, and how you'd like to pay for any charges -associated with the accessing the data. These charges are from Google. Catalyst doesn't -charge a markup or get any money from this, it's just a way to help cover the costs of -distributing the data. - -.. note:: - - If you have a pre-existing GCP project that you want to use the PUDL Catalog within, - you can skip this step. - -Let's create a project for use in accessing public data. Here's the -`GCP documentation on creating a new project -`__ - -After you've logged in, at the top of the main GCP home page you should see a search -field. Type in "Manage Resources" and at the top of the suggested searches that pop up -you should see a "Products and Pages" section, which contains "Manage Resources". Click -on that section, and you should come to the Manage Resources page. - -Click on the "Create Project" link near the top of the page: - -.. image:: images/01-manage-resources.png - -The only field you need to have filled in here is the Project Name. You can use the -random one that is suggested by Google. If you're creating a project outside of any -organization, it needs to be globally unique, so Google picks a couple of random words -and a random number (e.g. ``aerobic-star-352200``). - -Create a GCP Billing Account -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The billing account is separate from the project you just created. See the -`GCP docs on creating a billing account `__ -and navigate to ``__. At the top of the page -you should see a "Create Account" button. - -.. image:: images/02-create-billing-account.png - -.. note:: - - If you have a pre-existing GCP Billing Account that you want to use to cover the cost of - downloading data, then you can skip this step. - -Enable billing on your project -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now you need to associate the billing account you just created with the project you -made in the previous step. You need to `enable billing for the project -`__ -through this account. - -.. note:: - - If you are using a pre-existing Billing Account and Project, you might still need to - associate them with each other if they haven't been used together previously. - -At the top of the `billing page `__ there -should be two tabs: "My Billing Accounts" and "My Projects". Select "My Projects" and -then in the row for the new project you created above, click on the three dots in the -far righthand column labeled "Actions". The menu that pops up should have an -"Add Billing" or "Change Billing" option. Click on that, and you should be given the -choice of which billing account to associate with the selected project. - -.. 
image:: images/03-enable-project-billing.png - -Give yourself permission to spend -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Finally, you need to give yourself (your Google ID) the power to spend money within -the new project, using the billing method you just specified. See these instructions -for `Using Google Cloud IAM `__ -(Identity and Access Management), and navigate to the -`IAM Admin page `__. - -Your Google ID (email address) should be listed as a "Principal" and you need to grant -that Principal the "Role" of **Service Usage Consumer**, which enables it to make -billed requests on the behalf of the currently selected project (which should be the -one you created at the beginning of this process -- the project should be visible in the -dropdown menu near the upper-left corner of the page). - -Click on the little pencil icon in the right column of the row associated with your -Google ID to "edit principal". - -.. image:: images/04-iam-admin.png - -This should give you the option to associate a role with a particular combination of -Principal and Project. - -.. image:: images/05-add-a-role.png - -Click on the "Role" field and type in "Service Usage Consumer" -and it should filter the very long list of available roles down to a few, one of which -is "Service Usage Consumer". Select that role, and click on the Save button. - -.. image:: images/06-add-service-usage-consumer.png - - -Check your GCP credit -~~~~~~~~~~~~~~~~~~~~~ -Google provides new accounts with $300 in free credits so you can experiment a bit -without spending any of your own money. Check out the -`docs on the free trial period `__ -to understand how it works. - -To check in on your remaining credits or overall billing status you can go to the -Billing Account Overview and click on the "Credit details" link in the Credits box on -the right hand side of the page: - -.. image:: images/07-billing-account-overview.png - -This will show you a table of any existing GCP credits that have been granted to your -account: - -.. image:: images/08-credits-status.png - -Set up the Google Cloud SDK ---------------------------- - -Now that you have a GCP project with billing enabled, and your Google ID is empowered to -make use of it, you need to install software that allows your computer to communicate -with GCP and make authenticated requests on your behalf. This is part of the Google -Cloud software Development Kit (SDK). - -Install the `gcloud utilities `__ on your -computer. There are several ways to do this. We recommend using ``conda`` or its faster -sibling ``mamba``, since you'll likely be using this catalog in the context of an -existing conda environment. If you're not using `conda` environments, there are other -ways to install the Google Cloud SDK explained in the link above. - -.. code:: - - conda install -c conda-forge google-cloud-sdk - -Log into the account you used to create your new project above by running: - -.. code:: - - gcloud auth login - -Initialize the ``gcloud`` command line interface and select the project you just created -as the default project. This allows the project to be used for requester pays access -through the command line or in software (like an Intake catalog) that has access to -shell environment variables. - -If it asks you whether you want to "re-initialize this configuration with new settings" -say yes. - -.. code:: - - gcloud init - -Finally, use ``gcloud`` to establish application default credentials; this will allow -the project to be used for requester pays access through applications: - -.. 
code:: - - gcloud auth application-default login - -To test whether your GCP account is set up correctly and authenticated you can run the -following command to list the contents of the cloud storage bucket containing the PUDL -catalog data. This doesn't actually download any data, but will show you the versions -that are available: - -.. code:: - - gsutil ls gs://intake.catalyst.coop - -.. code:: - - gs://intake.catalyst.coop/dev/ - gs://intake.catalyst.coop/v0.1.0/ - -Every night we attempt to build a new catalog based on the code and data associated with -the ``dev`` branch of the PUDL repository. If the nightly build and data validation -succeed, the outputs are copied to ``gs://intake.catalyst.coop/dev``. To see what's -available there, how fresh it is, and how big the files are, you can use ``gsutil`` -like this: - -.. code:: - - gsutil ls -l gs://intake.catalyst.coop/dev - - 843649024 2022-09-15T12:27:02Z gs://intake.catalyst.coop/dev/censusdp1tract.sqlite - 761257984 2022-09-15T12:27:04Z gs://intake.catalyst.coop/dev/ferc1.sqlite - 5110330869 2022-09-15T12:28:59Z gs://intake.catalyst.coop/dev/hourly_emissions_epacems.parquet - 702459904 2022-09-15T12:27:01Z gs://intake.catalyst.coop/dev/pudl.sqlite - gs://intake.catalyst.coop/dev/hourly_emissions_epacems/ - TOTAL: 4 objects, 7417697781 bytes (6.91 GiB) - -.. warning:: - - If you download the files directly with ``gsutil`` then you'll be responsible for - updating them, making sure you have the right version, putting them in the right - place on your computer, etc. You also won't benefit from the caching that the Intake - catalogs do. For easier automatic updates, data versioning and dependency management, - we recommend using the Intake catalog rather than direct downloads. But for - developent work it can often be convenient to grab the fresh nightly build outputs. - -If you want to copy these files down directly to your computer, rather than using the -PUDL Intake catalog, you can use the ``gsutil cp`` command, which behaves very much like -the Unix ``cp`` command: - -.. code:: - - gsutil cp gs://intake.catalyst.coop/dev/pudl.sqlite ./ - -If you wanted to download all of the build outputs (more than 10GB!) you could use ``cp --r`` on the whole directory: - -.. code:: - - gsutil cp -r gs://intake.catalyst.coop/dev/ ./ - -For more details on how to use ``gsutil`` in general see the -`online documentation `__ or run: - -.. code:: - - gsutil --help From 20d89dfde85657dd06379d625784c383e1569cc4 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Thu, 15 Dec 2022 11:48:48 -0900 Subject: [PATCH 05/13] Add AWS to sponsors section --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index d126eb2..03306c6 100644 --- a/README.rst +++ b/README.rst @@ -393,3 +393,6 @@ Funding This work is supported by a generous grant from the `Alfred P. Sloan Foundation `__ and their `Energy & Environment Program `__ + +Storage and egress fees for this data are covered by `Amazon Web Services's +Open Data Sponsorship Program `__. 
\ No newline at end of file From e8620380233853da4e871537821508842761d556 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Dec 2022 20:49:21 +0000 Subject: [PATCH 06/13] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 03306c6..7a74497 100644 --- a/README.rst +++ b/README.rst @@ -394,5 +394,5 @@ This work is supported by a generous grant from the `Alfred P. Sloan Foundation `__ and their `Energy & Environment Program `__ -Storage and egress fees for this data are covered by `Amazon Web Services's -Open Data Sponsorship Program `__. \ No newline at end of file +Storage and egress fees for this data are covered by `Amazon Web Services's +Open Data Sponsorship Program `__. From 7a5153bfdb3e517730acbb7c62a77868ee77e51a Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Thu, 15 Dec 2022 15:27:42 -0600 Subject: [PATCH 07/13] Update expected number of EPA CEMS columns to match current outputs. --- tests/integration/hourly_emissions_epacems_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 4ee0196..3352bc1 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -58,7 +58,7 @@ def expected_df() -> pd.DataFrame: storage_options={"requester_pays": True}, ) is_dataframe_like(expected_df) - assert expected_df.shape == (70_272, 19) + assert expected_df.shape == (70_272, 16) return expected_df From e789728269067291e1dc2337a589d268bc3e5e2b Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 16 Dec 2022 09:01:15 -0900 Subject: [PATCH 08/13] Change CATALOG_VERSION to point to dev This way people can get access to the dev data by installing the pudl_catalog package from git. This value will be updated for tagged releases so people can access the release data from PyPI and conda. --- src/pudl_catalog/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pudl_catalog/__init__.py b/src/pudl_catalog/__init__.py index e124a35..bf1694d 100644 --- a/src/pudl_catalog/__init__.py +++ b/src/pudl_catalog/__init__.py @@ -13,7 +13,7 @@ INTAKE_BUCKET = "intake.catalyst.coop" # Ideally we would set this version automatically using setuptools_scm...
-CATALOG_VERSION = "v2022.11.30" +CATALOG_VERSION = "dev" BASE_URLS = { "gs": f"gs://{INTAKE_BUCKET}/{CATALOG_VERSION}", From c012fabb57c7ffcde54e77d177afc4634f740ca4 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Fri, 16 Dec 2022 11:41:20 -0900 Subject: [PATCH 09/13] Add anonymous option to intake so users don't need to authenticate with aws --- src/pudl_catalog/pudl_catalog.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pudl_catalog/pudl_catalog.yaml b/src/pudl_catalog/pudl_catalog.yaml index d6752cb..af60917 100644 --- a/src/pudl_catalog/pudl_catalog.yaml +++ b/src/pudl_catalog/pudl_catalog.yaml @@ -45,6 +45,8 @@ sources: requester_pays: true gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" @@ -80,6 +82,8 @@ sources: requester_pays: true gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" @@ -93,6 +97,8 @@ sources: storage_options: gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" metadata: @@ -117,6 +123,8 @@ sources: storage_options: gs: requester_pays: true + s3: + anon: true simplecache: cache_storage: "{{ env(PUDL_INTAKE_CACHE) }}" metadata: @@ -136,6 +144,8 @@ sources: args: urlpath: "{{ env(PUDL_INTAKE_PATH) }}/censusdp1tract.sqlite" storage_options: + s3: + anon: true gs: requester_pays: true simplecache: From 088b929c2d66cae6086d5549c2b4809cd1bc3c42 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 15:57:34 -0800 Subject: [PATCH 10/13] Add s3 integration tests and removed cache disabling I removed caching disabling because fsspec starting throwing unexpected keyword argument errors when making requests to s3 with caching disabled. See https://github.com/intake/intake-parquet/issues/26 for the full explanation. --- setup.py | 3 +- src/pudl_catalog/pudl_catalog.yaml | 16 +------- tests/integration/file_existence_test.py | 38 ++++++++++++++++++- .../hourly_emissions_epacems_test.py | 35 +++++++++++++---- 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index d2ad80c..2d74e25 100644 --- a/setup.py +++ b/setup.py @@ -46,6 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ + "boto3>=1.24,<1.27", "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", @@ -72,7 +73,7 @@ "doc8>=0.9,<1.1", # Ensures clean documentation formatting "flake8>=4,<7", # A framework for linting & static analysis "flake8-builtins>=1.5,<3", # Avoid shadowing Python built-in names - "flake8-colors>=0.1,<0.2", # Produce colorful error / warning output + "flake8-colors>=0.1.9,<0.2", # Produce colorful error / warning output "flake8-docstrings>=1.5,<2", # Ensure docstrings are formatted well "flake8-rst-docstrings>=0.2,<0.4", # Allow use of ReST in docstrings "flake8-use-fstring>=1,<2", # Highlight use of old-style string formatting diff --git a/src/pudl_catalog/pudl_catalog.yaml b/src/pudl_catalog/pudl_catalog.yaml index af60917..9ad21e4 100644 --- a/src/pudl_catalog/pudl_catalog.yaml +++ b/src/pudl_catalog/pudl_catalog.yaml @@ -21,12 +21,6 @@ sources: gross power output. Hourly values reported by US EIA ORISPL code and emissions unit (smokestack) ID. driver: parquet - parameters: - cache_method: - description: "Whether to cache data locally; empty string to disable caching." 
- type: str - default: "simplecache::" - allowed: ["simplecache::", ""] metadata: title: Continuous Emissions Monitoring System (CEMS) Hourly Data type: application/parquet @@ -40,7 +34,7 @@ sources: engine: "pyarrow" split_row_groups: true index: false - urlpath: "{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet" + urlpath: "simplecache::{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems.parquet" storage_options: requester_pays: true gs: @@ -58,12 +52,6 @@ sources: gross power output. Hourly values reported by US EIA ORISPL code and emissions unit (smokestack) ID. driver: parquet - parameters: - cache_method: - description: "Whether to cache data locally; empty string to disable caching." - type: str - default: "simplecache::" - allowed: ["simplecache::", ""] metadata: title: Continuous Emissions Monitoring System (CEMS) Hourly Data type: application/parquet @@ -77,7 +65,7 @@ sources: engine: "pyarrow" split_row_groups: true index: false - urlpath: "{{ cache_method }}{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet" + urlpath: "simplecache::{{ env(PUDL_INTAKE_PATH) }}/hourly_emissions_epacems/*.parquet" storage_options: requester_pays: true gs: diff --git a/tests/integration/file_existence_test.py b/tests/integration/file_existence_test.py index 591be77..1a093cf 100644 --- a/tests/integration/file_existence_test.py +++ b/tests/integration/file_existence_test.py @@ -1,8 +1,11 @@ """Verify that expected catalog files are available in Google Cloud Storage.""" import logging +import boto3 import pytest from google.cloud import storage +from botocore import UNSIGNED +from botocore.config import Config from pudl_catalog import CATALOG_VERSION, INTAKE_BUCKET @@ -25,10 +28,43 @@ "hourly_emissions_epacems/epacems-2020-FL.parquet", ], ) -def test_file_exists(filename: str) -> None: +def test_gcs_file_exists(filename: str) -> None: """Test that files expected in the catalog are available in GCS.""" storage_client = storage.Client() bucket = storage_client.bucket(INTAKE_BUCKET, user_project=BILLING_PROJECT) fullname = CATALOG_VERSION + "/" + filename blob = storage.blob.Blob(name=fullname, bucket=bucket) assert blob.exists() + +@pytest.mark.parametrize( + "filename", + [ + "censusdp1tract.sqlite", + "ferc1.sqlite", + "pudl.sqlite", + "hourly_emissions_epacems.parquet", + "hourly_emissions_epacems/epacems-2020-NY.parquet", + "hourly_emissions_epacems/epacems-2020-CA.parquet", + "hourly_emissions_epacems/epacems-2020-TX.parquet", + "hourly_emissions_epacems/epacems-2020-FL.parquet", + ], +) +def test_s3_file_exists(filename: str) -> None: + """Test that files expected in the catalog are available in s3.""" + # Disable signing so we don't need to load credentials. + # From https://github.com/boto/boto3/issues/1200#issuecomment-319141394 + # Have to request all of the files because we get an authentication + # error when using Client().get_object_attributes(). + s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + + paginator = s3.get_paginator('list_objects_v2') + page_iterator = paginator.paginate(Bucket=INTAKE_BUCKET) + files = [] + for page in page_iterator: + files += page['Contents'] + + filenames = [file["Key"] for file in files] + fullname = CATALOG_VERSION + "/" + filename + + assert fullname in filenames, f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." 
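The same anonymous access this test relies on can also be reproduced interactively with ``s3fs``, the filesystem fsspec uses when the catalog's ``s3: {anon: true}`` storage option is in effect. A rough sketch, assuming the ``dev`` prefix exists in the public bucket:

.. code:: python

   import s3fs

   # List the catalog files without any AWS credentials.
   fs = s3fs.S3FileSystem(anon=True)
   for path in fs.ls("intake.catalyst.coop/dev"):
       print(path)
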
+ diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 3352bc1..0851e3d 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,7 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os +from pathlib import Path import time +import typing from typing import Literal, Optional import dask.dataframe as dd @@ -24,9 +26,11 @@ os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["gs"] +InternetProtocol = Literal["gs", "https", "s3"] + def parquet_url( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, table_name: str, partition_suffix: Optional[str] = None, ) -> str: @@ -34,8 +38,9 @@ def parquet_url( try: url = BASE_URLS[protocol] except KeyError: + valid_protocols = typing.get_args(InternetProtocol) raise ValueError( - f"Received invalid protocol: {protocol}. Must be one of 'gs' or 'https'." + f"Received invalid protocol: {protocol}. Must be one of {' or '.join(valid_protocols)}." ) url = url + "/" + table_name if partition_suffix is None: @@ -48,14 +53,14 @@ def expected_df() -> pd.DataFrame: """Read parquet data directly for comparison with Intake outputs.""" logger.debug("Reading remote test data for comparison using pd.read_parquet().") epacems_url = parquet_url( - protocol="gs", + protocol="s3", table_name="hourly_emissions_epacems", partition_suffix=None, ) expected_df = pd.read_parquet( epacems_url, filters=TEST_FILTERS, - storage_options={"requester_pays": True}, + storage_options={"anon": True}, ) is_dataframe_like(expected_df) assert expected_df.shape == (70_272, 16) @@ -67,10 +72,12 @@ def expected_df() -> pd.DataFrame: [ ("gs", None), ("gs", "_partitioned"), + ("s3", None), + ("s3", "_partitioned"), ], ) def test_read_parquet( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, ) -> None: @@ -82,7 +89,14 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - actual_dd = dd.read_parquet(epacems_url, storage_options={"requester_pays": True}) + + storage_options = {} + if protocol == "gcs": + storage_options["requester_pays"] = True + elif protocol == "s3": + storage_options["anon"] = True + actual_dd = dd.read_parquet(epacems_url, storage_options=storage_options) + elapsed_time = time.time() - start_time logger.debug(f" elapsed time: {elapsed_time:.2f}s") is_dataframe_like(actual_dd) @@ -94,22 +108,27 @@ def test_read_parquet( [ ("gs", None), ("gs", "_partitioned"), + ("s3", None), + ("s3", "_partitioned"), ], ) def test_intake_catalog( - protocol: Literal["gs", "https"], + protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, + tmp_path: Path ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") os.environ["PUDL_INTAKE_PATH"] = BASE_URLS[protocol] + # Save the data to a temporary directory + os.environ["PUDL_INTAKE_CACHE"] = str(tmp_path) pudl_cat = intake.cat.pudl_cat src = "hourly_emissions_epacems" if partition_suffix is not None: src += partition_suffix start_time = time.time() - actual_dd = pudl_cat[src](cache_method="").to_dask() + actual_dd = pudl_cat[src].to_dask() elapsed_time = time.time() - start_time logger.debug(f" elapsed time: {elapsed_time:.2f}s") is_dataframe_like(actual_dd) From 7a23a19cbf1e89126625bd21faa1eb447547900c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 19 Dec 2022 23:57:59 +0000 Subject: [PATCH 11/13] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- tests/integration/file_existence_test.py | 16 +++++++++------- .../integration/hourly_emissions_epacems_test.py | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/integration/file_existence_test.py b/tests/integration/file_existence_test.py index 1a093cf..6e3a3c9 100644 --- a/tests/integration/file_existence_test.py +++ b/tests/integration/file_existence_test.py @@ -3,9 +3,9 @@ import boto3 import pytest -from google.cloud import storage from botocore import UNSIGNED from botocore.config import Config +from google.cloud import storage from pudl_catalog import CATALOG_VERSION, INTAKE_BUCKET @@ -36,6 +36,7 @@ def test_gcs_file_exists(filename: str) -> None: blob = storage.blob.Blob(name=fullname, bucket=bucket) assert blob.exists() + @pytest.mark.parametrize( "filename", [ @@ -55,16 +56,17 @@ def test_s3_file_exists(filename: str) -> None: # From https://github.com/boto/boto3/issues/1200#issuecomment-319141394 # Have to request all of the files because we get an authentication # error when using Client().get_object_attributes(). - s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) - paginator = s3.get_paginator('list_objects_v2') + paginator = s3.get_paginator("list_objects_v2") page_iterator = paginator.paginate(Bucket=INTAKE_BUCKET) files = [] for page in page_iterator: - files += page['Contents'] - + files += page["Contents"] + filenames = [file["Key"] for file in files] fullname = CATALOG_VERSION + "/" + filename - assert fullname in filenames, f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." - + assert ( + fullname in filenames + ), f"{fullname} is not in the {INTAKE_BUCKET}/{CATALOG_VERSION} bucket." 
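For reference, the anonymous S3 read path these integration tests exercise reduces to a call like the following sketch, assuming ``dask`` and ``s3fs`` are installed:

.. code:: python

   import dask.dataframe as dd

   # Lazily open the EPA CEMS parquet data over S3 with anonymous credentials,
   # mirroring the {"anon": True} storage option used for the "s3" protocol.
   epacems_dd = dd.read_parquet(
       "s3://intake.catalyst.coop/dev/hourly_emissions_epacems.parquet",
       storage_options={"anon": True},
   )
   print(epacems_dd.npartitions)
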
diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 0851e3d..5e75cda 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,9 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os -from pathlib import Path import time import typing +from pathlib import Path from typing import Literal, Optional import dask.dataframe as dd @@ -89,7 +89,7 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - + storage_options = {} if protocol == "gcs": storage_options["requester_pays"] = True @@ -116,7 +116,7 @@ def test_intake_catalog( protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, - tmp_path: Path + tmp_path: Path, ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") From 9c0c57ce0ff01233e49eaa73dcbcca8511f3073c Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 16:58:01 -0800 Subject: [PATCH 12/13] Remove boto3 version constraint and fix integration test bug --- README.rst | 5 +++++ setup.py | 2 +- tests/integration/hourly_emissions_epacems_test.py | 8 ++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 7a74497..e667ae5 100644 --- a/README.rst +++ b/README.rst @@ -187,6 +187,11 @@ types: 'path': 'https://creativecommons.org/licenses/by/4.0'}, 'catalog_dir': '/home/zane/code/catalyst/pudl-catalog/src/pudl_catalog/'}} +.. note:: + + If the data has not been cached this method might take a while to finish depending + on your internet speed. The EPA CEMS parquet data is almost 5 GB. + Read some data from the catalog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/setup.py b/setup.py index 2d74e25..2f36349 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ - "boto3>=1.24,<1.27", + "boto3", # It takes almost 20 min to solve the environment when a boto3 version is specified "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3", diff --git a/tests/integration/hourly_emissions_epacems_test.py b/tests/integration/hourly_emissions_epacems_test.py index 0851e3d..f030185 100644 --- a/tests/integration/hourly_emissions_epacems_test.py +++ b/tests/integration/hourly_emissions_epacems_test.py @@ -1,9 +1,9 @@ """Test cases for the EPA CEMS data source.""" import logging import os -from pathlib import Path import time import typing +from pathlib import Path from typing import Literal, Optional import dask.dataframe as dd @@ -89,9 +89,9 @@ def test_read_parquet( partition_suffix=partition_suffix, ) start_time = time.time() - + storage_options = {} - if protocol == "gcs": + if protocol == "gs": storage_options["requester_pays"] = True elif protocol == "s3": storage_options["anon"] = True @@ -116,7 +116,7 @@ def test_intake_catalog( protocol: InternetProtocol, partition_suffix: str, expected_df: pd.DataFrame, - tmp_path: Path + tmp_path: Path, ) -> None: """Test reading data from the intake catalog.""" logger.debug(f"intake_catalog, {protocol=}, {partition_suffix=}") From d027d827a79a424bfe76ecd0ac17c93e5c0f3d28 Mon Sep 17 00:00:00 2001 From: bendnorman Date: Mon, 19 Dec 2022 23:23:23 -0800 Subject: [PATCH 13/13] Specify boto3 version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f36349..d211560 100644 --- 
a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ zip_safe=False, python_requires=">=3.8,<3.12", install_requires=[ - "boto3", # It takes almost 20 min to solve the environment when a boto3 version is specified + "boto3==1.24.59", "s3fs>=2021.7,<2022.11.1", "gcsfs>=2021.7,<2022.11.1", "intake_parquet>=0.2.3,<0.3",
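Taken together, the changes in this series let a user point the catalog at the public S3 bucket and read data without any cloud credentials. A minimal end-to-end sketch, assuming the package from this branch is installed; the cache directory shown is a hypothetical example, and the environment variable names come from ``src/pudl_catalog/__init__.py`` above:

.. code:: python

   import os

   # Point the catalog at the public S3 bucket and choose a local cache location
   # before importing intake, so the catalog picks up both settings.
   os.environ["PUDL_INTAKE_PATH"] = "s3://intake.catalyst.coop/dev"
   os.environ["PUDL_INTAKE_CACHE"] = os.path.expanduser("~/.cache/pudl-intake")

   import intake

   pudl_cat = intake.cat.pudl_cat
   epacems = pudl_cat.hourly_emissions_epacems.to_dask()
   print(epacems.head())

On the first read the parquet files are downloaded into the cache directory; subsequent reads are served from the local copies.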