Skip to content

Commit

Permalink
Make an EIA860m Changelog table (#3331)
Browse files Browse the repository at this point in the history
* first draft of all eia860m extraction

* first draft of transform process: runs through existing 860 transform does not do changelog yet

* simplify replaces in transform and add changelog dropdupes

* first pass of adding full transform for eia860 and schema

* Fix bad monthly expand_timeseries

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

* clean up settings and add alembic migration

* fix the settings grabbing in eia860 settings with new eia860m setup

* Convert 860m table into db table

* make a new 860m settings class, don't pass in report_date for 860, & use the right table name

* remove FK relationships to the changelog table and make expand_timeseries have a dec unit test

* change eia86m io manager to our cool new db + parquet manager

* add docs and fix b4by missp3lls and change tbl name

* add migration and update fast 860m month post new 860m integration

* alembic migrations

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

* Fix the working partitions in settings and helpers

* Fix settings partitions and be better about selecting 860m only columns

* Update nightly build script to distribute parquet (#3399)

* Update nightly build script to distribute parquet

* Fix logging cut-and-paste error

* Name parquet distribution success variable like all the others

* [pre-commit.ci] auto fixes from pre-commit.com hooks

For more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Zane Selvans <zane.selvans@catalyst.coop>
  • Loading branch information
3 people authored Feb 15, 2024
1 parent 9d40b68 commit 6278781
Show file tree
Hide file tree
Showing 21 changed files with 464 additions and 190 deletions.

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/pudl/etl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@
*load_assets_from_modules([epacems_assets], group_name="core_epacems"),
*load_assets_from_modules([pudl.extract.eia176], group_name="raw_eia176"),
*load_assets_from_modules([pudl.extract.phmsagas], group_name="raw_phmsagas"),
*load_assets_from_modules([pudl.extract.eia860m], group_name="raw_eia860m"),
*load_assets_from_modules([pudl.extract.eia860], group_name="raw_eia860"),
*load_assets_from_modules([pudl.transform.eia860], group_name="_core_eia860"),
*load_assets_from_modules([pudl.transform.eia860m], group_name="core_eia860m"),
*load_assets_from_modules([pudl.extract.eia861], group_name="raw_eia861"),
*load_assets_from_modules(
[pudl.transform.eia861], group_name="core_eia861"
Expand Down
5 changes: 1 addition & 4 deletions src/pudl/extract/eia860.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import pudl.logging_helpers
from pudl.extract import excel
from pudl.helpers import remove_leading_zeros_from_numeric_strings
from pudl.metadata.classes import DataSource

logger = pudl.logging_helpers.get_logger(__name__)

Expand Down Expand Up @@ -106,10 +105,8 @@ def extract_eia860(context, raw_eia860__all_dfs):
ds = context.resources.datastore

if eia_settings.eia860.eia860m:
eia860m_data_source = DataSource.from_id("eia860m")
eia860m_date = eia860m_data_source.working_partitions["year_month"]
eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(
year_month=eia860m_date
year_month=[eia_settings.eia860.eia860m_year_month]
)
raw_eia860__all_dfs = pudl.extract.eia860m.append_eia860m(
eia860_raw_dfs=raw_eia860__all_dfs, eia860m_raw_dfs=eia860m_raw_dfs
Expand Down
55 changes: 54 additions & 1 deletion src/pudl/extract/eia860m.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from datetime import datetime

import pandas as pd
from dagster import AssetOut, Output, asset, multi_asset

import pudl.logging_helpers
from pudl.extract import excel
Expand Down Expand Up @@ -43,8 +44,12 @@ def process_raw(self, df, page, **partition):
df["report_year"] = datetime.strptime(
list(partition.values())[0], "%Y-%m"
).year
df["report_date"] = pd.to_datetime(
list(partition.values())[0], format="%Y-%m", exact=False
)
df = self.add_data_maturity(df, page, **partition)
self.cols_added.append("report_year")
self.cols_added.append("report_date")
# Eventually we should probably make this a transform
for col in ["generator_id", "boiler_id"]:
if col in df.columns:
Expand Down Expand Up @@ -80,8 +85,56 @@ def append_eia860m(eia860_raw_dfs, eia860m_raw_dfs):
# page names in 860m and 860 are the same.
for page in pages_eia860m:
eia860_raw_dfs[page] = pd.concat(
[eia860_raw_dfs[page], eia860m_raw_dfs[page]],
[eia860_raw_dfs[page], eia860m_raw_dfs[page].drop(columns=["report_date"])],
ignore_index=True,
sort=True,
)
return eia860_raw_dfs


@asset(
    required_resource_keys={"datastore", "dataset_settings"},
)
def raw_eia860m__all_dfs(context):
    """Extract the raw EIA-860M excel sheets into a dict of dataframes.

    Args:
        context: dagster keyword that provides access to resources and config.

    Returns:
        The result of running the EIA-860M ``Extractor`` over every
        ``year_months`` partition listed in the EIA-860M dataset settings.
    """
    resources = context.resources
    eia_settings = resources.dataset_settings.eia
    extractor = Extractor(ds=resources.datastore)
    return extractor.extract(year_month=eia_settings.eia860m.year_months)


# Names of the raw EIA-860M generator tables (existing, proposed, retired).
raw_table_names = tuple(
    f"raw_eia860m__generator_{status}"
    for status in ("existing", "proposed", "retired")
)


@multi_asset(
    outs={table_name: AssetOut() for table_name in sorted(raw_table_names)},
    required_resource_keys={"datastore", "dataset_settings"},
)
def extract_eia860m(raw_eia860m__all_dfs):
    """Split the extracted EIA-860M pages into individually named raw assets.

    Args:
        raw_eia860m__all_dfs: mapping of EIA-860M page name to the dataframe
            extracted for that page.

    Returns:
        A generator of dagster ``Output`` objects, one per page, named
        ``raw_eia860m__<page name>`` and emitted in sorted name order so they
        line up with the sorted ``outs`` declared on the decorator.
    """
    # Prefix each page name to get the descriptive asset name, then order the
    # (name, dataframe) pairs by that name only -- dataframes are never compared.
    named_dfs = sorted(
        (("raw_eia860m__" + page, df) for page, df in raw_eia860m__all_dfs.items()),
        key=lambda name_and_df: name_and_df[0],
    )
    return (
        Output(output_name=table_name, value=df) for table_name, df in named_dfs
    )
24 changes: 10 additions & 14 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,25 +620,21 @@ def expand_timeseries(
}
)
elif fill_through_freq == "month":
end_dates.loc[:, date_col] = end_dates[
date_col
] + pd.tseries.offsets.DateOffset(months=1)
end_dates.loc[:, date_col] = pd.to_datetime(
{
"year": end_dates[date_col].dt.year,
"month": end_dates[date_col].dt.month + 1,
"month": end_dates[date_col].dt.month,
"day": 1,
}
)
elif fill_through_freq == "day":
end_dates.loc[:, date_col] = pd.to_datetime(
{
"year": end_dates[date_col].dt.year,
"month": end_dates[date_col].dt.month,
"day": end_dates[date_col].dt.day + 1,
}
)
else:
raise ValueError(
f"{fill_through_freq} is not a valid frequency to fill through."
)
end_dates.loc[:, date_col] = end_dates[
date_col
] + pd.tseries.offsets.DateOffset(days=1)

end_dates["drop_row"] = True
df = (
pd.concat([df, end_dates.reset_index()])
Expand Down Expand Up @@ -1408,9 +1404,9 @@ def get_working_dates_by_datasource(datasource: str) -> pd.DatetimeIndex:
dates = dates.append(
pd.to_datetime(working_partitions["years"], format="%Y")
)
if "year_month" in working_partitions:
if "year_months" in working_partitions:
dates = dates.append(
pd.DatetimeIndex([pd.to_datetime(working_partitions["year_month"])])
pd.DatetimeIndex(pd.to_datetime(working_partitions["year_months"]))
)
return dates

Expand Down
4 changes: 4 additions & 0 deletions src/pudl/metadata/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3525,6 +3525,10 @@
"type": "string",
"description": "Freeform description of type of utility reported in one of the other three other utility_type sections in the core_ferc1__yearly_utility_plant_summary_sched200 table. This field is reported only in the DBF reporting years (1994-2020).",
},
"valid_till_date": {
"type": "date",
"description": "The record in the changelog is valid until this date. The record is valid from the report_date up until but not including the valid_till_date.",
},
"variable_peak_pricing": {
"type": "boolean",
"description": (
Expand Down
13 changes: 12 additions & 1 deletion src/pudl/metadata/resources/eia.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,12 @@
"fields": [["plant_id_eia", "generator_id"]],
# exclude core_epa__assn_eia_epacamd_subplant_ids bc there are generator ids in this
# glue table that come only from epacamd
"exclude": ["core_epa__assn_eia_epacamd_subplant_ids"],
# also exclude the 860m changelog table because that table doesn't get harvested
# and therefore there are a few straggler generators that don't end up in this table
"exclude": [
"core_epa__assn_eia_epacamd_subplant_ids",
"core_eia860m__changelog_generators",
],
},
},
"sources": ["eia860", "eia923"],
Expand Down Expand Up @@ -573,9 +578,12 @@
# violations.
# See: https://github.com/catalyst-cooperative/pudl/issues/1196
# Exclude the core_epa__assn_eia_epacamd_subplant_ids table
# also exclude the 860m changelog table because that table doesn't get harvested
# and therefore there are a few straggler generators that don't end up in this table
"exclude": [
"core_pudl__assn_eia_pudl_plants",
"core_epa__assn_eia_epacamd_subplant_ids",
"core_eia860m__changelog_generators",
],
},
},
Expand Down Expand Up @@ -672,6 +680,9 @@
# not yet harvesting owner_utility_id_eia from core_eia860__scd_ownership.
# See https://github.com/catalyst-cooperative/pudl/issues/1393
"out_eia923__yearly_generation_fuel_by_generator_energy_source_owner",
# also exclude the 860m changelog table because that table doesn't get harvested
# and therefore there are a few straggler generators that don't end up in this table
"core_eia860m__changelog_generators",
],
},
},
Expand Down
3 changes: 3 additions & 0 deletions src/pudl/metadata/resources/eia860.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@
"_out_eia__monthly_heat_rate_by_generator",
"_out_eia__monthly_derived_generator_attributes",
"out_eia__monthly_generators",
"core_eia860m__changelog_generators",
],
},
},
Expand Down Expand Up @@ -391,6 +392,7 @@
"_out_eia__monthly_capacity_factor_by_generator",
"_out_eia__monthly_derived_generator_attributes",
"out_eia__monthly_generators",
"core_eia860m__changelog_generators",
],
},
},
Expand Down Expand Up @@ -461,6 +463,7 @@
"_out_eia__monthly_fuel_cost_by_generator",
"out_eia923__monthly_generation_fuel_by_generator_energy_source",
"out_eia923__monthly_generation_fuel_by_generator",
"core_eia860m__changelog_generators",
# Utility IDs in this table are owners, not operators, and we are
# not yet harvesting owner_utility_id_eia from core_eia860__scd_ownership.
# See https://github.com/catalyst-cooperative/pudl/issues/1393
Expand Down
56 changes: 56 additions & 0 deletions src/pudl/metadata/resources/eia860m.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Definitions of data tables primarily coming from EIA-860m."""
from typing import Any

# Resource metadata for tables derived primarily from EIA-860M, keyed by table name.
RESOURCE_METADATA: dict[str, dict[str, Any]] = {
    "core_eia860m__changelog_generators": {
        "description": (
            "This table is a changelog of the monthly reported EIA-860m data. EIA-860m "
            "includes generator tables with the most up-to-date catalogue of EIA "
            "generators and their operational status and other generator characteristics. "
            "EIA-860m is reported monthly, although for the vast majority of the generators "
            "nothing changes month-to-month. This table is a changelog of that monthly "
            "reported generator data. There is a record corresponding to the first instance "
            "of a generator and associated characteristics with a report_date column and a "
            "valid_till_date column. Whenever any of the reported EIA-860m data was changed "
            "for a record, there will be a new changelog record with a new report_date."
        ),
        "schema": {
            "fields": [
                "report_date",
                "valid_till_date",
                "plant_id_eia",
                "plant_name_eia",
                "utility_id_eia",
                "utility_name_eia",
                "generator_id",
                "capacity_mw",
                "county",
                "current_planned_generator_operating_date",
                "data_maturity",
                "energy_source_code_1",
                "energy_storage_capacity_mwh",
                "fuel_type_code_pudl",
                "generator_retirement_date",
                "latitude",
                "longitude",
                "net_capacity_mwdc",
                "operational_status",
                "operational_status_code",
                "planned_derate_date",
                "planned_generator_retirement_date",
                "planned_net_summer_capacity_derate_mw",
                "planned_net_summer_capacity_uprate_mw",
                "planned_uprate_date",
                "prime_mover_code",
                "state",
                "summer_capacity_mw",
                "technology_description",
                "winter_capacity_mw",
            ],
            # One changelog record per generator per report_date on which its
            # reported data changed.
            "primary_key": ["plant_id_eia", "generator_id", "report_date"],
        },
        "field_namespace": "eia",
        "sources": ["eia860"],
        "etl_group": "eia860",
    },
}
5 changes: 4 additions & 1 deletion src/pudl/metadata/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,10 @@
"field_namespace": "eia",
"contributors": [],
"working_partitions": {
"year_month": "2023-12",
"year_months": [
str(q).lower()
for q in pd.period_range(start="2015-07", end="2023-12", freq="M")
],
},
"keywords": sorted(
set(
Expand Down
Loading

0 comments on commit 6278781

Please sign in to comment.