From 70e3d18c7078fafbd7440270c6c5219f31cdf3c7 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Thu, 16 Nov 2023 11:05:31 -0800 Subject: [PATCH 01/15] pipeline to load BlackCat API data --- .../external_table_a10.yml | 68 ++++++ .../external_table_a15.yml | 16 ++ .../external_table_a30.yml | 86 ++++++++ .../external_table_p10.yml | 74 +++++++ .../external_table_p20.yml | 56 +++++ .../external_table_p50.yml | 53 +++++ .../external_table_rr20_intercity.yml | 71 +++++++ .../external_table_rr20_rural.yml | 77 +++++++ .../external_table_rr20_urban_tribal.yml | 59 ++++++ .../external_table_ss60.yml | 16 ++ .../external_table_tam_narrative.yml | 16 ++ .../dags/ntd_report_validation/METADATA.yml | 18 ++ airflow/dags/ntd_report_validation/README.md | 9 + .../a10_submitted_for_ntd.yml | 7 + .../a15_submitted_for_ntd.yml | 7 + .../a30_submitted_for_ntd.yml | 7 + .../p10_submitted_for_ntd.yml | 7 + .../p20_submitted_for_ntd.yml | 7 + .../p50_submitted_for_ntd.yml | 7 + .../rr20_intercity_submitted_for_ntd.yml | 7 + .../rr20_rural_submitted_for_ntd.yml | 7 + .../rr20_urban_tribal_submitted_for_ntd.yml | 7 + .../ss60_submitted_for_ntd.yml | 7 + .../tam_narrative_submitted_for_ntd.yml | 7 + airflow/plugins/operators/__init__.py | 1 + airflow/plugins/operators/blackcat_to_gcs.py | 193 ++++++++++++++++++ 26 files changed, 890 insertions(+) create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml create mode 100644 
airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml create mode 100644 airflow/dags/ntd_report_validation/METADATA.yml create mode 100644 airflow/dags/ntd_report_validation/README.md create mode 100644 airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml create mode 100644 airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml create mode 100644 airflow/plugins/operators/blackcat_to_gcs.py diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml new file mode 100644 index 0000000000..bf1d4cbabf --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml @@ -0,0 +1,68 @@ +operator: 
operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.a10_ntdreportingstationsandmaintenance + LIMIT 1; +source_objects: + - "a10_NTDReportingStationsAndMaintenance/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.a10_ntdreportingstationsandmaintenance" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "a10_NTDReportingStationsAndMaintenance/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: service_mode + type: STRING + mode: NULLABLE + - name: pt_owned_by_service_provider + type: FLOAT64 + mode: NULLABLE + - name: pt_owned_by_public_agency + type: FLOAT64 + mode: NULLABLE + - name: pt_leased_by_public_agency + type: FLOAT64 + mode: NULLABLE + - name: pt_leased_by_service_provider + type: FLOAT64 + mode: NULLABLE + - name: do_owned + type: FLOAT64 + mode: NULLABLE + - name: do_leased_by_public_agency + type: FLOAT64 + mode: NULLABLE + - name: do_leased_from_private_entity + type: FLOAT64 + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE + + + diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml new file mode 100644 index 0000000000..1e327030eb --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml @@ -0,0 +1,16 @@ +operator: operators.ExternalTable +bucket: 
gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.a15_ntdtransitassetmanagement + LIMIT 1; +source_objects: + - "a15_NTDTransitAssetManagementA15/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.a15_ntdtransitassetmanagement" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "a15_NTDTransitAssetManagementA15/" diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml new file mode 100644 index 0000000000..beefa1982d --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml @@ -0,0 +1,86 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.a30_ntdassetandresourceinfo + LIMIT 1; +source_objects: + - "a30_NTDAssetAndResourceInfo/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.a30_ntdassetandresourceinfo" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "a30_NTDAssetAndResourceInfo/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: vehicle_id + type: STRING + mode: NULLABLE + - name: vehicle_status + type: STRING + mode: NULLABLE + - name: vin + type: STRING + mode: NULLABLE + - name: ntdid + type: STRING + mode: NULLABLE + - name: ada_access + type: STRING + mode: NULLABLE + 
- name: vehicle_type + type: STRING + mode: NULLABLE + - name: fuel_type + type: STRING + mode: NULLABLE + - name: average_estimated_service_years_when_new + type: INTEGER + mode: NULLABLE + - name: average_expiration_years_when_new + type: INTEGER + mode: NULLABLE + - name: vehicle_year + type: INTEGER + mode: NULLABLE + - name: useful_life_years_remaining + type: INTEGER + mode: NULLABLE + - name: vehicle_length + type: STRING + mode: NULLABLE + - name: seating_capacity + type: STRING + mode: NULLABLE + - name: ownership_type + type: STRING + mode: NULLABLE + - name: modes_operated_display_text + type: STRING + mode: NULLABLE + - name: modes_operated_full_text + type: STRING + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml new file mode 100644 index 0000000000..fdb6a5aa45 --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml @@ -0,0 +1,74 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.p10_ntdreporterbasicinfo + LIMIT 1; +source_objects: + - "p10_NTDReportingP10/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.p10_ntdreporterbasicinfo" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "p10_NTDReportingP10/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: 
report_id + type: STRING + mode: NULLABLE + - name: org_id + type: STRING + mode: NULLABLE + - name: user_id + type: STRING + mode: NULLABLE + - name: first_name + type: STRING + mode: NULLABLE + - name: last_name + type: STRING + mode: NULLABLE + - name: full_name + type: STRING + mode: NULLABLE + - name: text + type: STRING + mode: NULLABLE + - name: value + type: STRING + mode: NULLABLE + - name: group + type: STRING + mode: NULLABLE + - name: bool_value + type: BOOL + mode: NULLABLE + - name: primary_phone + type: STRING + mode: NULLABLE + - name: email + type: STRING + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml new file mode 100644 index 0000000000..6fd9370b8a --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml @@ -0,0 +1,56 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.p20_ntdreportermodes + LIMIT 1; +source_objects: + - "p20_NTDReportingP20/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.p20_ntdreportermodes" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "p20_NTDReportingP20/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: service_mode + type: STRING + mode: NULLABLE + - name: type_of_service 
+ type: STRING + mode: NULLABLE + - name: commitment_date + type: DATETIME + mode: NULLABLE + - name: start_date + type: DATETIME + mode: NULLABLE + - name: end_date + type: DATETIME + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml new file mode 100644 index 0000000000..78c6cf4dde --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml @@ -0,0 +1,53 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.p50_ntdreportergtfs + LIMIT 1; +source_objects: + - "p50_NTDReportingP50/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.p50_ntdreportergtfs" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "p50_NTDReportingP50/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: mode + type: STRING + mode: NULLABLE + - name: type + type: STRING + mode: NULLABLE + - name: web_link + type: STRING + mode: NULLABLE + - name: file_path + type: STRING + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml new file mode 
100644 index 0000000000..e52cbeabba --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml @@ -0,0 +1,71 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.rr20_intercity + LIMIT 1; +source_objects: + - "rr20_NTDReportingRR20_Intercity/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.rr20_intercity" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "rr20_NTDReportingRR20_Intercity/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: item_id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: item + type: STRING + mode: NULLABLE + - name: type + type: STRING + mode: NULLABLE + - name: operations_expended + type: STRING + mode: NULLABLE + - name: capital_expended + type: STRING + mode: NULLABLE + - name: description + type: STRING + mode: NULLABLE + - name: annual_vehicle_rev_miles + type: FLOAT64 + mode: NULLABLE + - name: regular_unlinked_passenger_trips + type: INTEGER + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE + + + + + + diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml new file mode 100644 index 0000000000..bf85034c82 --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml @@ -0,0 +1,77 @@ 
+operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.rr20_rural + LIMIT 1; +source_objects: + - "rr20_NTDReportingRR20_Rural/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.rr20_rural" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "rr20_NTDReportingRR20_Rural/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: item + type: STRING + mode: NULLABLE + - name: REVENUE + type: FLOAT64 + mode: NULLABLE + - name: css_class + type: STRING + mode: NULLABLE + - name: operations_expended + type: STRING + mode: NULLABLE + - name: capital_expended + type: STRING + mode: NULLABLE + - name: description + type: STRING + mode: NULLABLE + - name: annual_vehicle_rev_miles + type: FLOAT64 + mode: NULLABLE + - name: annual_vehicle_rev_hours + type: INTEGER + mode: NULLABLE + - name: annual_unlinked_pass_trips + type: FLOAT64 + mode: NULLABLE + - name: annual_vehicle_max_service + type: INTEGER + mode: NULLABLE + - name: sponsored_service_upt + type: INTEGER + mode: NULLABLE + - name: quantity + type: INTEGER + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE \ No newline at end of file diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml new file mode 100644 index 0000000000..1522c542ab --- /dev/null +++ 
b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml @@ -0,0 +1,59 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.rr20_urban_tribal + LIMIT 1; +source_objects: + - "rr20_NTDReportingRR20_Urban_Tribal/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.rr20_urban_tribal" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "rr20_NTDReportingRR20_Urban_Tribal/" +schema_fields: + - name: api_report_id + type: STRING + mode: NULLABLE + - name: api_organization + type: STRING + mode: NULLABLE + - name: api_report_period + type: INTEGER + mode: NULLABLE + - name: api_report_status + type: STRING + mode: NULLABLE + - name: api_last_modified + type: TIMESTAMP + mode: NULLABLE + - name: id + type: STRING + mode: NULLABLE + - name: item_id + type: STRING + mode: NULLABLE + - name: report_id + type: STRING + mode: NULLABLE + - name: item + type: STRING + mode: NULLABLE + - name: type + type: STRING + mode: NULLABLE + - name: operations_expended + type: STRING + mode: NULLABLE + - name: capital_expended + type: STRING + mode: NULLABLE + - name: description + type: STRING + mode: NULLABLE + - name: last_modified_date + type: DATETIME + mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml new file mode 100644 index 0000000000..576a286c71 --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml @@ -0,0 +1,16 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.ss60_safety + LIMIT 1; +source_objects: + 
- "ss60_SS60/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.ss60_safety" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "ss60_SS60/" diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml new file mode 100644 index 0000000000..27e869a3f6 --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml @@ -0,0 +1,16 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.ntd_tamnarrative + LIMIT 1; +source_objects: + - "ntd_NTDReportingTAMNarrative/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.ntd_tamnarrative" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "ntd_NTDReportingTAMNarrative/" diff --git a/airflow/dags/ntd_report_validation/METADATA.yml b/airflow/dags/ntd_report_validation/METADATA.yml new file mode 100644 index 0000000000..d8a59f4b6e --- /dev/null +++ b/airflow/dags/ntd_report_validation/METADATA.yml @@ -0,0 +1,19 @@ +description: "Process raw files from a GCS bucket (NTD reports from BlackCat API) directly into BigQuery" +schedule_interval: "0 10 * * 1" #10 am every Monday +tags: + - ntd + - blackcat +default_args: + owner: airflow + depends_on_past: False + start_date: "2023-10-02" + catchup: False + email: + - "kim.engie@slalom.com" + - "christian.suyat@dot.ca.gov" + - "katrina.kaiser@dot.ca.gov" + email_on_failure: True + pool: default_pool + concurrency: 50 +wait_for_defaults: + timeout: 3600 \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/README.md 
b/airflow/dags/ntd_report_validation/README.md new file mode 100644 index 0000000000..652eb0e2a6 --- /dev/null +++ b/airflow/dags/ntd_report_validation/README.md @@ -0,0 +1,9 @@ +# `ntd_report_validation` + +Type: [Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html) + +This DAG orchestrates the publishing and storing of data, in the form of NTD report submissions, first pushing API data into Google Cloud Storage in the bucket `calitp-ntd-report-validation`. + +Another DAG (part of the `create_external_tables` existing DAG) reads the GCS data into BigQuery in the Cal-ITP data warehouse. The job will take the most recent file of each report type (which has all submitted reports by Caltrans 5311 subrecipients) and publish it into BigQuery `external` tables, if it is not yet there. This job uses the Cal-ITP existing infrastructure for creating external tables, outlined [here](https://docs.calitp.org/data-infra/architecture/data.html). + +In the event of failure, the job can be rerun without backfilling. 
\ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml new file mode 100644 index 0000000000..c33a1120e7 --- /dev/null +++ b/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingStationsAndMaintenance" +form: "A-10" +bq_table_name: "ntdreportingstationsandmaintenance" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml new file mode 100644 index 0000000000..ff58fc5b31 --- /dev/null +++ b/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDTransitAssetManagementA15" +form: "A-15" +bq_table_name: "ntdtransitassetmanagement" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml new file mode 100644 index 0000000000..2c6176942b --- /dev/null +++ b/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDAssetAndResourceInfo" +form: "A-30" +bq_table_name: "ntdassetandresourceinfo" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml new file 
mode 100644 index 0000000000..75694c6f3f --- /dev/null +++ b/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingP10" +form: "P-10" +bq_table_name: "ntdreporterbasicinfo" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml new file mode 100644 index 0000000000..de197b31c9 --- /dev/null +++ b/airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingP20" +form: "P-20" +bq_table_name: "ntdreportermodes" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml new file mode 100644 index 0000000000..46778e2ac2 --- /dev/null +++ b/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingP50" +form: "P-50" +bq_table_name: "ntdreportergtfs" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml new file mode 100644 index 0000000000..34656c2aaf --- /dev/null +++ b/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: 
"gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingRR20_Intercity" +form: "RR-20" +bq_table_name: "intercity" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml new file mode 100644 index 0000000000..3bdb2d2137 --- /dev/null +++ b/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingRR20_Rural" +form: "RR-20" +bq_table_name: "rural" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml new file mode 100644 index 0000000000..3c698de609 --- /dev/null +++ b/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingRR20_Urban_Tribal" +form: "RR-20" +bq_table_name: "urban_tribal" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml new file mode 100644 index 0000000000..13d7d04ddd --- /dev/null +++ b/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "SS60" +form: 
"SS-60" +bq_table_name: "safety" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml new file mode 100644 index 0000000000..fcdb3fbcac --- /dev/null +++ b/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml @@ -0,0 +1,7 @@ +operator: operators.BlackCatApiToGCSOperator + +bucket: "gs://calitp-ntd-report-validation" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" +api_tablename: "NTDReportingTAMNarrative" +form: "NTD" +bq_table_name: "tamnarrative" \ No newline at end of file diff --git a/airflow/plugins/operators/__init__.py b/airflow/plugins/operators/__init__.py index 24fd5fd7ce..39271c95c8 100644 --- a/airflow/plugins/operators/__init__.py +++ b/airflow/plugins/operators/__init__.py @@ -6,3 +6,4 @@ from operators.littlepay_raw_sync import LittlepayRawSync from operators.littlepay_to_jsonl import LittlepayToJSONL from operators.pod_operator import PodOperator +from operators.blackcat_to_gcs import BlackCatApiToGCSOperator diff --git a/airflow/plugins/operators/blackcat_to_gcs.py b/airflow/plugins/operators/blackcat_to_gcs.py new file mode 100644 index 0000000000..b6927aceea --- /dev/null +++ b/airflow/plugins/operators/blackcat_to_gcs.py @@ -0,0 +1,193 @@ +from calitp_data_infra.storage import get_fs, make_name_bq_safe +from airflow.models import BaseOperator +from pydantic import BaseModel +from typing import Optional +import pandas as pd +import pendulum +import requests +import logging +import gzip +import os +import re + +def write_to_log(logfilename): + ''' + Creates a logger object that outputs to a log file, to the filename specified, + and also streams to console. 
+ ''' + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', + datefmt='%y-%m-%d %H:%M:%S') + file_handler = logging.FileHandler(logfilename) + file_handler.setFormatter(formatter) + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + + if not logger.hasHandlers(): + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + return logger + + +def camel_to_snake(name): + '''Converts Snake case to underscore separation for renaming columns; + VehicleStatus becomes vehicle_status and + can handle acroynms like ADAAccess, which becomes ada_access''' + name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() + + +class BlackCatApiExtract(BaseModel): + api_url: str + form: str + api_tablename: str + bq_table_name: str + data: Optional[pd.DataFrame] + logger: Optional[logging.Logger] + extract_time: Optional[pendulum.DateTime] + + logger = write_to_log('load_bc_apidata_output.log') + extract_time = pendulum.now() + + # pydantic doesn't know dataframe type + # see https://stackoverflow.com/a/69200069 + class Config: + arbitrary_types_allowed = True + + def fetch_from_bc_api(self): + """Download a BlackCat table as a DataFrame. + + Note that BlackCat API reports have rows structured as follows: + [{'ReportId': , + 'Organization': , + 'ReportPeriod': , + 'ReportStatus': , + 'ReportLastModifiedDate': , + '': {'Data': [{colname: value, ...}, {colname: value, ...} ...]}}, + {'ReportId': , ...etc. to the next organization}] + + This function applies renames in the following order. + 1. rename column names from snakecase to names utilizing underscores + 2. rename fields + 3. apply column prefix (to columns not renamed by 1 or 2) + """ + + self.logger.info( + f"Downloading BlackCat data for {self.extract_time.format('YYYY')}_{self.bq_table_name}." 
+ ) + response = requests.get(self.api_url, verify=False) + blob = response.json() + + org_data = [] + + # Cycling through and processing each org's data + for x in blob: + report_id = x.get('ReportId') + org = x.get('Organization') + period = x.get('ReportPeriod') + status = x.get('ReportStatus') + last_mod = (pendulum.from_format(x.get('ReportLastModifiedDate'), 'MM/DD/YYYY HH:mm:ss A') + .in_tz('America/Los_Angeles') + .set(tz='UTC')) + iso = last_mod.to_iso8601_string() + + org_info_values = {'api_report_id': report_id, 'api_organization': org, + 'api_report_period': period, 'api_report_status': status, + 'api_last_modified': iso} + org_info_df = pd.DataFrame([org_info_values]) + + table_json = x[self.api_tablename]['Data'] + # checks for nested json entries, replaces any with only the 'Text' value from nested json. + for x in table_json: + for k,v in x.items(): + if type(v) is dict: + x[k] = x[k]['Text'] + raw_df = pd.DataFrame.from_dict(table_json) + raw_df.rename(columns=lambda c: camel_to_snake(c), inplace=True) + whole_df = pd.concat([org_info_df, raw_df], axis=1).sort_values(by='api_organization') + + # Only the 1st row of data in org_info_df is filled, other rows have NAs. + # Here we fill in the rest with the values + whole_df = whole_df.fillna(value=org_info_values) + org_data.append(whole_df) + + raw_df = pd.concat(org_data) + raw_df.rename(columns=lambda c: camel_to_snake(c), inplace=True) + + self.data = raw_df.rename(make_name_bq_safe, axis="columns") + self.logger.info( + f"Downloaded {self.extract_time.format('YYYY')}_{self.bq_table_name} data with {len(self.data)} rows!" + ) + + def make_hive_path(self, form: str, bucket: str): + if not self.extract_time: + raise ValueError( + "An extract time must be set before a hive path can be generated." 
+ ) + bq_form_name = ( + str.lower(form).replace("-", "") + ) + return os.path.join( + bucket, + f"{bq_form_name}_{self.api_tablename}", + f"dt={self.extract_time.to_date_string()}", + f"ts={self.extract_time.to_iso8601_string()}", + f"{bq_form_name}_{self.bq_table_name}.jsonl.gz", + ) + + def save_to_gcs(self, fs, bucket): + hive_path = self.make_hive_path(self.form, bucket) + self.logger.info(f"Uploading to GCS at {hive_path}") + if len(self.data) == 0: + self.logger.info(f"There is no data for {self.api_tablename}, not saving anything. Pipeline exiting.") + pass + else: + fs.pipe( + hive_path, + gzip.compress(self.data.to_json(orient="records", lines=True).encode()), + ) + return hive_path + + +class BlackCatApiToGCSOperator(BaseOperator): + template_fields = ("bucket",) + + def __init__( + self, + bucket, + api_url, + form, + api_tablename, + bq_table_name, + **kwargs, + ): + """An operator that downloads data from a BlackCat API + and saves it as a JSON file hive-partitioned by date in Google Cloud + Storage (GCS). + + Args: + bucket (str): GCS bucket where the scraped BlackCat report will be saved. + api_url (str): The URL to hit that gets the data. + api_tablename (str): The table that should be extracted from the BlackCat API. + MUST MATCH THE API JSON EXACTLY + bq_table_name (str): The table name that will be given in BigQuery. Appears in the GCS bucket path and the filename. + form: the NTD form that this report belongs to. E.g., RR-20, A-10, etc. 
+ """ + self.bucket = bucket + # Instantiating an instance of the BlackCatApiExtract() + self.extract = BlackCatApiExtract( + api_url=api_url, + form=form, + api_tablename=api_tablename, + bq_table_name=bq_table_name, + ) + + super().__init__(**kwargs) + + def execute(self, **kwargs): + fs = get_fs() + self.extract.fetch_from_bc_api() + # inserts into xcoms + return self.extract.save_to_gcs(fs, self.bucket) From c0c8ae86af30c5ff66926ec5929a18b48aebdb3e Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Wed, 22 Nov 2023 10:27:23 -0800 Subject: [PATCH 02/15] pull API data into 1 table, first dbt models --- .../external_table_all_ntdreports.yml | 19 ++ .../a10_submitted_for_ntd.yml | 7 - .../a15_submitted_for_ntd.yml | 7 - .../a30_submitted_for_ntd.yml | 7 - ...ntd.yml => all_2023_submitted_for_ntd.yml} | 8 +- .../p10_submitted_for_ntd.yml | 7 - .../p50_submitted_for_ntd.yml | 7 - .../rr20_intercity_submitted_for_ntd.yml | 7 - .../rr20_rural_submitted_for_ntd.yml | 7 - .../rr20_urban_tribal_submitted_for_ntd.yml | 7 - .../ss60_submitted_for_ntd.yml | 7 - .../tam_narrative_submitted_for_ntd.yml | 7 - .../int_ntd_rr20_service_alldata.sql | 77 ++++++ .../int_ntd_rr20_service_ratios.py | 71 ++++++ .../ntd_validation/int_ntd_validation.yml | 15 ++ .../ntd_validation/_mart_ntd_validation.yml | 4 + .../fct_ntd_rr20_service_checks.py | 223 ++++++++++++++++++ .../ntd_validation/_src_api_externaltable.yml | 21 ++ .../stg_2022_rr20_exp_by_mode.sql | 5 + .../stg_2022_rr20_financial.sql | 5 + .../ntd_validation/stg_2022_rr20_service.sql | 5 + .../staging/ntd_validation/stg_2023_a10.sql | 18 ++ .../ntd_validation/stg_2023_rr20_rural.sql | 23 ++ .../stg_2023_rr20_urban_tribal.sql | 15 ++ .../ntd_validation/stg_ntd_subrecipients.sql | 3 + 25 files changed, 508 insertions(+), 74 deletions(-) create mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml delete mode 100644 airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml delete 
mode 100644 airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml rename airflow/dags/ntd_report_validation/{p20_submitted_for_ntd.yml => all_2023_submitted_for_ntd.yml} (52%) delete mode 100644 airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml delete mode 100644 airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml create mode 100644 warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml create mode 100644 warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py create mode 100644 warehouse/models/staging/ntd_validation/_src_api_externaltable.yml create mode 100644 warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_2023_a10.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql diff 
--git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml new file mode 100644 index 0000000000..aa98e6e59a --- /dev/null +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml @@ -0,0 +1,19 @@ +operator: operators.ExternalTable +bucket: gs://calitp-ntd-report-validation +prefix_bucket: true +post_hook: | + SELECT * + FROM `{{ get_project_id() }}`.external_blackcat.all_2023_ntdreports + LIMIT 1; +source_objects: + - "all_2023_NTDReporting/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.all_2023_ntdreports" +source_format: NEWLINE_DELIMITED_JSON +use_bq_client: true +hive_options: + mode: AUTO + require_partition_filter: false + source_uri_prefix: "all_2023_NTDReporting/" + + + diff --git a/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml deleted file mode 100644 index c33a1120e7..0000000000 --- a/airflow/dags/ntd_report_validation/a10_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingStationsAndMaintenance" -form: "A-10" -bq_table_name: "ntdreportingstationsandmaintenance" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml deleted file mode 100644 index ff58fc5b31..0000000000 --- a/airflow/dags/ntd_report_validation/a15_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" 
-api_tablename: "NTDTransitAssetManagementA15" -form: "A-15" -bq_table_name: "ntdtransitassetmanagement" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml deleted file mode 100644 index 2c6176942b..0000000000 --- a/airflow/dags/ntd_report_validation/a30_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDAssetAndResourceInfo" -form: "A-30" -bq_table_name: "ntdassetandresourceinfo" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml similarity index 52% rename from airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml rename to airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml index de197b31c9..c01b9a0515 100644 --- a/airflow/dags/ntd_report_validation/p20_submitted_for_ntd.yml +++ b/airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml @@ -1,7 +1,7 @@ operator: operators.BlackCatApiToGCSOperator -bucket: "gs://calitp-ntd-report-validation" +bucket: "gs://test-calitp-ntd-report-validation" api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingP20" -form: "P-20" -bq_table_name: "ntdreportermodes" \ No newline at end of file +api_tablename: "2023_NTDReporting" +form: "all" +bq_table_name: "2023_ntdreports" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml deleted file mode 100644 index 75694c6f3f..0000000000 --- a/airflow/dags/ntd_report_validation/p10_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: 
operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingP10" -form: "P-10" -bq_table_name: "ntdreporterbasicinfo" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml deleted file mode 100644 index 46778e2ac2..0000000000 --- a/airflow/dags/ntd_report_validation/p50_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingP50" -form: "P-50" -bq_table_name: "ntdreportergtfs" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml deleted file mode 100644 index 34656c2aaf..0000000000 --- a/airflow/dags/ntd_report_validation/rr20_intercity_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingRR20_Intercity" -form: "RR-20" -bq_table_name: "intercity" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml deleted file mode 100644 index 3bdb2d2137..0000000000 --- a/airflow/dags/ntd_report_validation/rr20_rural_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: 
"https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingRR20_Rural" -form: "RR-20" -bq_table_name: "rural" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml deleted file mode 100644 index 3c698de609..0000000000 --- a/airflow/dags/ntd_report_validation/rr20_urban_tribal_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingRR20_Urban_Tribal" -form: "RR-20" -bq_table_name: "urban_tribal" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml deleted file mode 100644 index 13d7d04ddd..0000000000 --- a/airflow/dags/ntd_report_validation/ss60_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "SS60" -form: "SS-60" -bq_table_name: "safety" \ No newline at end of file diff --git a/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml b/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml deleted file mode 100644 index fcdb3fbcac..0000000000 --- a/airflow/dags/ntd_report_validation/tam_narrative_submitted_for_ntd.yml +++ /dev/null @@ -1,7 +0,0 @@ -operator: operators.BlackCatApiToGCSOperator - -bucket: "gs://calitp-ntd-report-validation" -api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "NTDReportingTAMNarrative" -form: "NTD" -bq_table_name: "tamnarrative" 
\ No newline at end of file diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql new file mode 100644 index 0000000000..ff65dcd991 --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql @@ -0,0 +1,77 @@ +------ +--- Compiles data for RR-20 Service checks from all years into one table for future computation +------ + +--- The 2022 data was *not* from the API and so formatted differently +--- We are *assuming* that data in 2024 and onwards will be the same format as 2023 +--- If you get errors in 2024, check which columns may differ and read errors carefully. + +---TO DO: insert parameter for loop, for each year, do what 2023 is doing, +--- and at the end, add another union statement +with data_2023 as ( + select + organization, + api_report_period as fiscal_year, + item as mode, + description as operating_capital, + CASE + WHEN description = "Operating Expenses" THEN operations_expended + WHEN description = "Capital Expenses" THEN capital_expended + ELSE Null + END as Total_Annual_Expenses_By_Mode, + annual_vehicle_rev_miles as Annual_VRM, + annual_vehicle_rev_hours as Annual_VRH, + annual_unlinked_pass_trips as Annual_UPT, + sponsored_service_upt as Sponsored_UPT, + annual_vehicle_max_service as VOMX + from {{ ref('stg_2023_rr20_rural') }} + WHERE type = "Expenses by Mode" +), + +service2022 as ( + select + Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + Mode as mode, + Annual_VRM, + Annual_VRH, + Annual_UPT, + Sponsored_UPT, + VOMX + from {{ ref('stg_2022_rr20_service') }} +), + +expenses2022 as ( + select + Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + Operating_Capital as operating_capital, + Mode as mode, + Total_Annual_Expenses_By_Mode + FROM {{ ref('stg_2022_rr20_exp_by_mode') }} +), + +all_2022 as ( + select service2022.organization, + 
service2022.fiscal_year, + service2022.mode, + expenses2022.operating_capital, + expenses2022.Total_Annual_Expenses_By_Mode, + service2022.Annual_VRM, + service2022.Annual_VRH, + service2022.Annual_UPT, + service2022.Sponsored_UPT, + service2022.VOMX +from service2022 +FULL OUTER JOIN expenses2022 + ON service2022.organization = expenses2022.organization + AND service2022.fiscal_year = expenses2022.fiscal_year + AND service2022.mode = expenses2022.mode +) + +select * FROM all_2022 + +UNION ALL + +select * from data_2023 + diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py new file mode 100644 index 0000000000..24f89bc8b2 --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py @@ -0,0 +1,71 @@ + +import pyspark.sql.functions as F +import pandas as pd +import logging +import pyspark + + +def write_to_log(logfilename): + ''' + Creates a logger object that outputs to a log file, to the filename specified, + and also streams to console. 
+ ''' + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', + datefmt='%y-%m-%d %H:%M:%S') + file_handler = logging.FileHandler(logfilename) + file_handler.setFormatter(formatter) + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + + if not logger.hasHandlers(): + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + return logger + +def make_ratio_cols(df, numerator, denominator, col_name, logger, operation="sum"): + if col_name is not None: + # If a user specify a column name, use it + # Raise error if the column already exists + if col_name in df.columns: + logger.info(f"Dataframe already has column '{col_name}'") + raise ValueError(f"Dataframe already has column '{col_name}'") + + else: + _col_name = col_name + + if operation == "sum": + df = (df.groupby(['organization','mode', 'fiscal_year']) + .apply(lambda x: x.assign(**{_col_name: + lambda x: x[numerator].sum() / x[denominator]})) + ) + # else do not sum the numerator columns + else: + df = (df.groupby(['organization','mode', 'fiscal_year']) + .apply(lambda x: x.assign(**{_col_name: + lambda x: x[numerator] / x[denominator]})) + ) + return df + + +def model(dbt, session): + # Set up the logger object + logger = write_to_log('rr20_servicechecks_log.log') + + #Load data from BigQuery - pass in the dbt model that we draw from. 
+ allyears = dbt.ref("int_ntd_rr20_service_alldata") + allyears = allyears.toPandas() + + # Calculate needed ratios, added as new columns + numeric_columns = allyears.select_dtypes(include=['number']).columns + allyears[numeric_columns] = allyears[numeric_columns].fillna(0) + + allyears = make_ratio_cols(allyears, 'Total_Annual_Expenses_By_Mode', 'Annual_VRH', 'cost_per_hr', logger) + allyears = make_ratio_cols(allyears, 'Annual_VRM', 'VOMX', 'miles_per_veh', logger) + allyears = make_ratio_cols(allyears, 'Total_Annual_Expenses_By_Mode', 'Annual_UPT', 'fare_rev_per_trip', logger) + allyears = make_ratio_cols(allyears, 'Annual_VRM', 'Annual_VRH', 'rev_speed', logger, operation = "mean") + allyears = make_ratio_cols(allyears, 'Annual_UPT', 'Annual_VRH', 'trips_per_hr', logger, operation = "mean") + + return allyears \ No newline at end of file diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml new file mode 100644 index 0000000000..d5affd54f9 --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml @@ -0,0 +1,15 @@ +version: 2 + +models: + - name: int_rr20_financial + description: | + the RR-20 data that pertains to financial reporting. 
+ # tests: + # - dbt_utils.expression_is_true: + # expression: 'status != {{ guidelines_to_be_assessed_status() }}' + # columns: + - name: int_ntd_rr20_service_ratios + description: | + makes ratios for validation checks + config: + materialized: table \ No newline at end of file diff --git a/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml b/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml new file mode 100644 index 0000000000..43ae3000c4 --- /dev/null +++ b/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml @@ -0,0 +1,4 @@ +version: 2 + +models: + - name: fct_ntd_rr20_service_checks \ No newline at end of file diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py new file mode 100644 index 0000000000..832a8961db --- /dev/null +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py @@ -0,0 +1,223 @@ +import pandas as pd +import datetime +import logging + +##### TO_DO: see if the missing data check can still work or did we already fill it with zeros + +def write_to_log(logfilename): + ''' + Creates a logger object that outputs to a log file, to the filename specified, + and also streams to console. 
+ ''' + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', + datefmt='%y-%m-%d %H:%M:%S') + file_handler = logging.FileHandler(logfilename) + file_handler.setFormatter(formatter) + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + + if not logger.hasHandlers(): + logger.addHandler(file_handler) + logger.addHandler(stream_handler) + + return logger + + +def check_rr20_ratios(df, variable, threshold, this_year, last_year, logger): + '''Validation checks where a ratio must be within a certain threshold limit + compared to the previous year.''' + agencies = df['organization'].unique() + output = [] + for agency in agencies: + agency_df = df[df['organization']==agency] + logger.info(f"Checking {agency} for {variable} info.") + if len(agency_df) > 0: + + # Check whether data for both years is present + if (len(agency_df[agency_df['fiscal_year']==this_year]) > 0) \ + & (len(agency_df[agency_df['fiscal_year']==last_year]) > 0): + + for mode in agency_df[(agency_df['fiscal_year']==this_year)]['mode'].unique(): + value_thisyr = (round(agency_df[(agency_df['mode']==mode) + & (agency_df['fiscal_year'] == this_year)] + [variable].unique()[0], 2)) + if len(agency_df[(agency_df['mode']==mode) & (agency_df['fiscal_year'] == last_year)][variable]) == 0: + value_lastyr = 0 + else: + value_lastyr = (round(agency_df[(agency_df['mode']==mode) + & (agency_df['fiscal_year'] == last_year)] + [variable].unique()[0], 2)) + + if (value_lastyr == 0) and (abs(value_thisyr - value_lastyr) >= threshold): + result = "fail" + check_name = f"{variable}" + mode = mode + description = (f"The {variable} for {mode} has changed from last year by > = {threshold*100}%, please provide a narrative justification.") + elif (value_lastyr != 0) and abs((value_lastyr - value_thisyr)/value_lastyr) >= threshold: + result = "fail" + check_name = f"{variable}" + mode = mode + description = 
(f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%, please provide a narrative justification.") + else: + result = "pass" + check_name = f"{variable}" + mode = mode + description = "" + + output_line = {"Organization": agency, + "name_of_check" : check_name, + "mode": mode, + "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", + "check_status": result, + "Description": description} + output.append(output_line) + else: + logger.info(f"There is no data for {agency}") + checks = pd.DataFrame(output).sort_values(by="Organization") + return checks + + +def check_single_number(df, variable, this_year, last_year, logger, threshold=None,): + '''Validation checks where a single number must be within a certain threshold limit + compared to the previous year.''' + agencies = df['organization'].unique() + output = [] + for agency in agencies: + + if len(df[df['organization']==agency]) > 0: + logger.info(f"Checking {agency} for {variable} info.") + # Check whether data for both years is present, if so perform prior yr comparison. 
+ if (len(df[(df['organization']==agency) & (df['fiscal_year']==this_year)]) > 0) \ + & (len(df[(df['organization']==agency) & (df['fiscal_year']==last_year)]) > 0): + + for mode in df[(df['organization'] == agency) & (df['fiscal_year']==this_year)]['mode'].unique(): + value_thisyr = (round(df[(df['organization'] == agency) + & (df['mode']==mode) + & (df['fiscal_year'] == this_year)] + [variable].unique()[0], 2)) + # If there's no data for last yr: + if len(df[(df['organization'] == agency) + & (df['mode']==mode) + & (df['fiscal_year'] == last_year)][variable]) == 0: + value_lastyr = 0 + else: + value_lastyr = (round(df[(df['organization'] == agency) + & (df['mode']==mode) + & (df['fiscal_year'] == last_year)] + [variable].unique()[0], 2)) + + if (round(value_thisyr)==0 and round(value_lastyr) != 0) | (round(value_thisyr)!=0 and round(value_lastyr) == 0): + result = "fail" + check_name = f"{variable}" + mode = mode + description = (f"The {variable} for {mode} has changed either from or to zero compared to last year. 
Please provide a narrative justification.") + # run only the above check on whether something changed from zero to non-zero, if no threshold is given + elif threshold==None: + result = "pass" + check_name = f"{variable}" + mode = mode + description = "" + pass + # also check for pct change, if a threshold parameter is passed into function + elif (value_lastyr == 0) and (abs(value_thisyr - value_lastyr) >= threshold): + result = "fail" + check_name = f"{variable}" + mode = mode + description = (f"The {variable} for {mode} was 0 last year and has changed by > = {threshold*100}%, please provide a narrative justification.") + elif (value_lastyr != 0) and abs((value_lastyr - value_thisyr)/value_lastyr) >= threshold: + result = "fail" + check_name = f"{variable}" + mode = mode + description = (f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%; please provide a narrative justification.") + else: + result = "pass" + check_name = f"{variable}" + mode = mode + description = "" + + output_line = {"Organization": agency, + "name_of_check" : check_name, + "mode": mode, + "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", + "check_status": result, + "Description": description} + output.append(output_line) + else: + logger.info(f"There is no data for {agency}") + checks = pd.DataFrame(output).sort_values(by="Organization") + return checks + + +def model(dbt, session): + # Set up the logger object + logger = write_to_log('rr20_ftc_servicechecks_log.log') + + this_year=datetime.datetime.now().year + last_year = this_year-1 + this_date=datetime.datetime.now().date().strftime('%Y-%m-%d') #for suffix on Excel files + + #Load data from BigQuery - pass in the dbt model that we draw from. 
+ allyears = dbt.ref("int_ntd_rr20_service_ratios") + allyears = allyears.toPandas() + + # Run validation checks + cph_checks = check_rr20_ratios(allyears, 'cost_per_hr', .30, this_year, last_year, logger) + mpv_checks = check_rr20_ratios(allyears, 'miles_per_veh', .20, this_year, last_year, logger) + vrm_checks = check_single_number(allyears, 'Annual_VRM', this_year, last_year, logger, threshold=.30) + frpt_checks = check_rr20_ratios(allyears, 'fare_rev_per_trip', .25, this_year, last_year, logger) + rev_speed_checks = check_rr20_ratios(allyears, 'rev_speed', .15, this_year, last_year, logger) + tph_checks = check_rr20_ratios(allyears, 'trips_per_hr', .30, this_year, last_year, logger) + voms0_check = check_single_number(allyears, 'VOMX', this_year, last_year, logger) + + # Combine checks into one table + rr20_checks = pd.concat([cph_checks, mpv_checks, vrm_checks, + frpt_checks, rev_speed_checks, + tph_checks, voms0_check], + ignore_index=True).sort_values(by="Organization") + + ## Part 1: save Excel file to GCS + GCS_FILE_PATH_VALIDATED = f"gs://calitp-ntd-report-validation/validation_reports_{this_year}" + with pd.ExcelWriter(f"{GCS_FILE_PATH_VALIDATED}/rr20_service_check_report_{this_date}.xlsx") as writer: + rr20_checks.to_excel(writer, sheet_name="rr20_checks_full", index=False, startrow=2) + + workbook = writer.book + worksheet = writer.sheets["rr20_checks_full"] + cell_highlight = workbook.add_format({ + 'fg_color': 'yellow', + 'bold': True, + 'border': 1 + }) + report_title = "NTD Data Validation Report" + title_format = workbook.add_format({ + 'bold': True, + 'valign': 'vcenter', + 'align': 'left', + 'font_color': '#1c639e', + 'font_size': 15 + }) + subtitle = "Reduced Reporting RR-20: Validation Warnings" + subtitle_format = workbook.add_format({ + 'bold': True, + 'align': 'left', + 'font_color': 'black', + 'font_size': 19 + }) + + worksheet.write('A1', report_title, title_format) + worksheet.merge_range('A2:C2', subtitle, subtitle_format) + 
worksheet.write('G3', 'Agency Response', cell_highlight) + worksheet.write('H3', 'Response Date', cell_highlight) + worksheet.set_column(0, 0, 35) #col A width + worksheet.set_column(1, 3, 22) #cols B-D width + worksheet.set_column(4, 4, 11) #col D width + worksheet.set_column(5, 6, 53) #col E-G width + worksheet.freeze_panes('B4') + + logger.info(f"RR-20 service data checks conducted on {this_date} is complete!") + + ## Part 2: send table to BigQuery + return rr20_checks + + diff --git a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml new file mode 100644 index 0000000000..345982b566 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml @@ -0,0 +1,21 @@ +version: 2 + +sources: + - name: ntd_report_validation + description: | + Data from BlackCat API. + database: "{{ env_var('DBT_SOURCE_DATABASE', var('SOURCE_DATABASE')) }}" + schema: external_blackcat + tables: + - name: all_2023_ntdreports + - name: a10_ntdreportingstationsandmaintenance + - name: a15_ntdtransitassetmanagement + - name: a30_ntdassetandresourceinfo + - name: ntd_tamnarrative + - name: p10_ntdreporterbasicinfo + - name: p20_ntdreportermodes + - name: p50_ntdreportergtfs + - name: rr20_intercity + - name: rr20_rural + - name: rr20_urban_tribal + - name: ss60_safety diff --git a/warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql b/warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql new file mode 100644 index 0000000000..ea678124c7 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql @@ -0,0 +1,5 @@ +--- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future +--- We pull these tables in to use them in later int and fct models +SELECT + * +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_expenses_by_mode` \ No newline at end of file diff --git 
a/warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql b/warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql new file mode 100644 index 0000000000..0409fbc451 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql @@ -0,0 +1,5 @@ +--- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future +--- We pull these tables in to use them in later int and fct models +SELECT + * +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_financials__2` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql b/warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql new file mode 100644 index 0000000000..919ce31487 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql @@ -0,0 +1,5 @@ +--- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future +--- We pull these tables in to use them in later int and fct models +SELECT + * +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_service_data` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_2023_a10.sql b/warehouse/models/staging/ntd_validation/stg_2023_a10.sql new file mode 100644 index 0000000000..85db82f56f --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2023_a10.sql @@ -0,0 +1,18 @@ +SELECT + organization, + reportstatus as api_report_status, + TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, + reportperiod as api_report_period, + a10.id as id, + a10.ReportId as report_id, + a10.ServiceMode as service_mode, + a10.PTOwnedByServiceProvider as pt_owned_by_service_provider, + a10.PTOwnedByPublicAgency as pt_owned_by_public_agency, + a10.PTLeasedByPublicAgency as pt_leased_by_public_agency, + a10.PTLeasedByServiceProvider as pt_leased_by_service_provider, + a10.DOOwned as do_owned, + a10.DOLeasedByPublicAgency as do_leased_by_public_agency, + 
a10.DOLeasedFromPrivateEntity as do_leased_from_private_entity, + a10.LastModifiedDate as last_modified_date +FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +, UNNEST (`ntdreportingstationsandmaintenance_data`) as `a10` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql b/warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql new file mode 100644 index 0000000000..b753a23584 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql @@ -0,0 +1,23 @@ +SELECT + organization, + reportstatus as api_report_status, + TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, + reportperiod as api_report_period, + ntdreportingrr20_rural_data.id as id, + ntdreportingrr20_rural_data.ReportId as report_id, + ntdreportingrr20_rural_data.Item as item, + ntdreportingrr20_rural_data.Revenue as revenue, + ntdreportingrr20_rural_data.Type as type, + ntdreportingrr20_rural_data.CSSClass as css_class, + ntdreportingrr20_rural_data.OperationsExpended as operations_expended, + ntdreportingrr20_rural_data.CapitalExpended as capital_expended, + ntdreportingrr20_rural_data.Description as description, + ntdreportingrr20_rural_data.AnnualVehicleRevMiles as annual_vehicle_rev_miles, + ntdreportingrr20_rural_data.AnnualVehicleRevHours as annual_vehicle_rev_hours, + ntdreportingrr20_rural_data.AnnualUnlinkedPassTrips as annual_unlinked_pass_trips, + ntdreportingrr20_rural_data.AnnualVehicleMaxService as annual_vehicle_max_service, + ntdreportingrr20_rural_data.SponsoredServiceUPT as sponsored_service_upt, + ntdreportingrr20_rural_data.Quantity as quantity, + ntdreportingrr20_rural_data.LastModifiedDate as last_modified_date +FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +, UNNEST (`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` \ No newline at end of file diff --git 
a/warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql b/warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql new file mode 100644 index 0000000000..0b871e0ff8 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql @@ -0,0 +1,15 @@ +SELECT + organization, + reportstatus as api_report_status, + TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, + reportperiod as api_report_period, + ntdreportingrr20_urban_tribal_data.id as id, + ntdreportingrr20_urban_tribal_data.ReportId as report_id, + ntdreportingrr20_urban_tribal_data.ItemId as item_id, + ntdreportingrr20_urban_tribal_data.Item as item, + ntdreportingrr20_urban_tribal_data.OperationsExpended as operations_expended, + ntdreportingrr20_urban_tribal_data.CapitalExpended as capital_expended, + ntdreportingrr20_urban_tribal_data.Description as description, + ntdreportingrr20_urban_tribal_data.LastModifiedDate as last_modified_date +FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +, UNNEST (`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql new file mode 100644 index 0000000000..50c5208254 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql @@ -0,0 +1,3 @@ +SELECT + Organization as organization +FROM blackcat_raw.2023_organizations \ No newline at end of file From 9ef1361551d3e197d831044edda4974b58ebcfce Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Wed, 22 Nov 2023 14:19:02 -0800 Subject: [PATCH 03/15] simplify external tables dag --- .../external_table_a10.yml | 68 --------------- .../external_table_a15.yml | 16 ---- .../external_table_a30.yml | 86 ------------------- .../external_table_p10.yml | 74 ---------------- .../external_table_p20.yml | 56 ------------ 
.../external_table_p50.yml | 53 ------------ .../external_table_rr20_intercity.yml | 71 --------------- .../external_table_rr20_rural.yml | 77 ----------------- .../external_table_rr20_urban_tribal.yml | 59 ------------- .../external_table_ss60.yml | 16 ---- .../external_table_tam_narrative.yml | 16 ---- airflow/plugins/operators/blackcat_to_gcs.py | 58 ++----------- 12 files changed, 9 insertions(+), 641 deletions(-) delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml delete mode 100644 airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml deleted file mode 100644 index bf1d4cbabf..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a10.yml +++ /dev/null @@ -1,68 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation 
-prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.a10_ntdreportingstationsandmaintenance - LIMIT 1; -source_objects: - - "a10_NTDReportingStationsAndMaintenance/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.a10_ntdreportingstationsandmaintenance" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "a10_NTDReportingStationsAndMaintenance/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: service_mode - type: STRING - mode: NULLABLE - - name: pt_owned_by_service_provider - type: FLOAT64 - mode: NULLABLE - - name: pt_owned_by_public_agency - type: FLOAT64 - mode: NULLABLE - - name: pt_leased_by_public_agency - type: FLOAT64 - mode: NULLABLE - - name: pt_leased_by_service_provider - type: FLOAT64 - mode: NULLABLE - - name: do_owned - type: FLOAT64 - mode: NULLABLE - - name: do_leased_by_public_agency - type: FLOAT64 - mode: NULLABLE - - name: do_leased_from_private_entity - type: FLOAT64 - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE - - - diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml deleted file mode 100644 index 1e327030eb..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a15.yml +++ /dev/null @@ -1,16 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ 
get_project_id() }}`.external_blackcat.a15_ntdtransitassetmanagement - LIMIT 1; -source_objects: - - "a15_NTDTransitAssetManagementA15/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.a15_ntdtransitassetmanagement" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "a15_NTDTransitAssetManagementA15/" diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml deleted file mode 100644 index beefa1982d..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_a30.yml +++ /dev/null @@ -1,86 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.a30_ntdassetandresourceinfo - LIMIT 1; -source_objects: - - "a30_NTDAssetAndResourceInfo/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.a30_ntdassetandresourceinfo" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "a30_NTDAssetAndResourceInfo/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: vehicle_id - type: STRING - mode: NULLABLE - - name: vehicle_status - type: STRING - mode: NULLABLE - - name: vin - type: STRING - mode: NULLABLE - - name: ntdid - type: STRING - mode: NULLABLE - - name: ada_access - type: STRING - mode: NULLABLE - - name: vehicle_type - type: STRING - mode: NULLABLE - - name: fuel_type - type: 
STRING - mode: NULLABLE - - name: average_estimated_service_years_when_new - type: INTEGER - mode: NULLABLE - - name: average_expiration_years_when_new - type: INTEGER - mode: NULLABLE - - name: vehicle_year - type: INTEGER - mode: NULLABLE - - name: useful_life_years_remaining - type: INTEGER - mode: NULLABLE - - name: vehicle_length - type: STRING - mode: NULLABLE - - name: seating_capacity - type: STRING - mode: NULLABLE - - name: ownership_type - type: STRING - mode: NULLABLE - - name: modes_operated_display_text - type: STRING - mode: NULLABLE - - name: modes_operated_full_text - type: STRING - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml deleted file mode 100644 index fdb6a5aa45..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p10.yml +++ /dev/null @@ -1,74 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.p10_ntdreporterbasicinfo - LIMIT 1; -source_objects: - - "p10_NTDReportingP10/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.p10_ntdreporterbasicinfo" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "p10_NTDReportingP10/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: org_id - type: STRING - mode: 
NULLABLE - - name: user_id - type: STRING - mode: NULLABLE - - name: first_name - type: STRING - mode: NULLABLE - - name: last_name - type: STRING - mode: NULLABLE - - name: full_name - type: STRING - mode: NULLABLE - - name: text - type: STRING - mode: NULLABLE - - name: value - type: STRING - mode: NULLABLE - - name: group - type: STRING - mode: NULLABLE - - name: bool_value - type: BOOL - mode: NULLABLE - - name: primary_phone - type: STRING - mode: NULLABLE - - name: email - type: STRING - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml deleted file mode 100644 index 6fd9370b8a..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p20.yml +++ /dev/null @@ -1,56 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.p20_ntdreportermodes - LIMIT 1; -source_objects: - - "p20_NTDReportingP20/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.p20_ntdreportermodes" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "p20_NTDReportingP20/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: service_mode - type: STRING - mode: NULLABLE - - name: type_of_service - type: STRING - mode: NULLABLE - - name: commitment_date - type: DATETIME - 
mode: NULLABLE - - name: start_date - type: DATETIME - mode: NULLABLE - - name: end_date - type: DATETIME - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml deleted file mode 100644 index 78c6cf4dde..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_p50.yml +++ /dev/null @@ -1,53 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.p50_ntdreportergtfs - LIMIT 1; -source_objects: - - "p50_NTDReportingP50/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.p50_ntdreportergtfs" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "p50_NTDReportingP50/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: mode - type: STRING - mode: NULLABLE - - name: type - type: STRING - mode: NULLABLE - - name: web_link - type: STRING - mode: NULLABLE - - name: file_path - type: STRING - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml deleted file mode 100644 index e52cbeabba..0000000000 --- 
a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_intercity.yml +++ /dev/null @@ -1,71 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.rr20_intercity - LIMIT 1; -source_objects: - - "rr20_NTDReportingRR20_Intercity/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.rr20_intercity" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "rr20_NTDReportingRR20_Intercity/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: item_id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: item - type: STRING - mode: NULLABLE - - name: type - type: STRING - mode: NULLABLE - - name: operations_expended - type: STRING - mode: NULLABLE - - name: capital_expended - type: STRING - mode: NULLABLE - - name: description - type: STRING - mode: NULLABLE - - name: annual_vehicle_rev_miles - type: FLOAT64 - mode: NULLABLE - - name: regular_unlinked_passenger_trips - type: INTEGER - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE - - - - - - diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml deleted file mode 100644 index bf85034c82..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_rural.yml +++ /dev/null @@ -1,77 +0,0 @@ -operator: operators.ExternalTable 
-bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.rr20_rural - LIMIT 1; -source_objects: - - "rr20_NTDReportingRR20_Rural/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.rr20_rural" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "rr20_NTDReportingRR20_Rural/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: item - type: STRING - mode: NULLABLE - - name: REVENUE - type: FLOAT64 - mode: NULLABLE - - name: css_class - type: STRING - mode: NULLABLE - - name: operations_expended - type: STRING - mode: NULLABLE - - name: capital_expended - type: STRING - mode: NULLABLE - - name: description - type: STRING - mode: NULLABLE - - name: annual_vehicle_rev_miles - type: FLOAT64 - mode: NULLABLE - - name: annual_vehicle_rev_hours - type: INTEGER - mode: NULLABLE - - name: annual_unlinked_pass_trips - type: FLOAT64 - mode: NULLABLE - - name: annual_vehicle_max_service - type: INTEGER - mode: NULLABLE - - name: sponsored_service_upt - type: INTEGER - mode: NULLABLE - - name: quantity - type: INTEGER - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE \ No newline at end of file diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml deleted file mode 100644 index 1522c542ab..0000000000 --- 
a/airflow/dags/create_external_tables/ntd_report_validation/external_table_rr20_urban_tribal.yml +++ /dev/null @@ -1,59 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.rr20_urban_tribal - LIMIT 1; -source_objects: - - "rr20_NTDReportingRR20_Urban_Tribal/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.rr20_urban_tribal" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "rr20_NTDReportingRR20_Urban_Tribal/" -schema_fields: - - name: api_report_id - type: STRING - mode: NULLABLE - - name: api_organization - type: STRING - mode: NULLABLE - - name: api_report_period - type: INTEGER - mode: NULLABLE - - name: api_report_status - type: STRING - mode: NULLABLE - - name: api_last_modified - type: TIMESTAMP - mode: NULLABLE - - name: id - type: STRING - mode: NULLABLE - - name: item_id - type: STRING - mode: NULLABLE - - name: report_id - type: STRING - mode: NULLABLE - - name: item - type: STRING - mode: NULLABLE - - name: type - type: STRING - mode: NULLABLE - - name: operations_expended - type: STRING - mode: NULLABLE - - name: capital_expended - type: STRING - mode: NULLABLE - - name: description - type: STRING - mode: NULLABLE - - name: last_modified_date - type: DATETIME - mode: NULLABLE diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml deleted file mode 100644 index 576a286c71..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_ss60.yml +++ /dev/null @@ -1,16 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.ss60_safety - LIMIT 1; 
-source_objects: - - "ss60_SS60/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.ss60_safety" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "ss60_SS60/" diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml deleted file mode 100644 index 27e869a3f6..0000000000 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_tam_narrative.yml +++ /dev/null @@ -1,16 +0,0 @@ -operator: operators.ExternalTable -bucket: gs://calitp-ntd-report-validation -prefix_bucket: true -post_hook: | - SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.ntd_tamnarrative - LIMIT 1; -source_objects: - - "ntd_NTDReportingTAMNarrative/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.ntd_tamnarrative" -source_format: NEWLINE_DELIMITED_JSON -use_bq_client: true -hive_options: - mode: AUTO - require_partition_filter: false - source_uri_prefix: "ntd_NTDReportingTAMNarrative/" diff --git a/airflow/plugins/operators/blackcat_to_gcs.py b/airflow/plugins/operators/blackcat_to_gcs.py index b6927aceea..96a68f865c 100644 --- a/airflow/plugins/operators/blackcat_to_gcs.py +++ b/airflow/plugins/operators/blackcat_to_gcs.py @@ -31,14 +31,6 @@ def write_to_log(logfilename): return logger -def camel_to_snake(name): - '''Converts Snake case to underscore separation for renaming columns; - VehicleStatus becomes vehicle_status and - can handle acroynms like ADAAccess, which becomes ada_access''' - name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() - - class BlackCatApiExtract(BaseModel): api_url: str form: str @@ -78,43 +70,10 @@ def fetch_from_bc_api(self): f"Downloading BlackCat data for {self.extract_time.format('YYYY')}_{self.bq_table_name}." 
) response = requests.get(self.api_url, verify=False) - blob = response.json() - - org_data = [] - - # Cyling through and processing each org's data - for x in blob: - report_id = x.get('ReportId') - org = x.get('Organization') - period = x.get('ReportPeriod') - status = x.get('ReportStatus') - last_mod = (pendulum.from_format(x.get('ReportLastModifiedDate'), 'MM/DD/YYYY HH:mm:ss A') - .in_tz('America/Los_Angeles') - .set(tz='UTC')) - iso = last_mod.to_iso8601_string() - - org_info_values = {'api_report_id': report_id, 'api_organization': org, - 'api_report_period': period, 'api_report_status': status, - 'api_last_modified': iso} - org_info_df = pd.DataFrame([org_info_values]) - - table_json = x[self.api_tablename]['Data'] - # checks for nested json entries, replaces any with only the 'Text' value from nested json. - for x in table_json: - for k,v in x.items(): - if type(v) is dict: - x[k] = x[k]['Text'] - raw_df = pd.DataFrame.from_dict(table_json) - raw_df.rename(columns=lambda c: camel_to_snake(c), inplace=True) - whole_df = pd.concat([org_info_df, raw_df], axis=1).sort_values(by='api_organization') - - # Only the 1st row of data in org_info_df is filled, other rows have NAs. 
- # Here we fill in the rest with the values - whole_df = whole_df.fillna(value=org_info_values) - org_data.append(whole_df) - - raw_df = pd.concat(org_data) - raw_df.rename(columns=lambda c: camel_to_snake(c), inplace=True) + blob = response.json() + + raw_df = pd.json_normalize(blob) + raw_df['ReportLastModifiedDate'] = raw_df['ReportLastModifiedDate'].astype('datetime64[ns]') self.data = raw_df.rename(make_name_bq_safe, axis="columns") self.logger.info( @@ -151,7 +110,7 @@ def save_to_gcs(self, fs, bucket): return hive_path -class BlackCatApiToGCSOperator(BaseOperator): +class BlackCatApiToGCSOperator2(BaseOperator): template_fields = ("bucket",) def __init__( @@ -163,9 +122,10 @@ def __init__( bq_table_name, **kwargs, ): - """An operator that downloads data from a BlackCat API - and saves it as a JSON file hive-partitioned by date in Google Cloud - Storage (GCS). + """An operator that downloads all data from a BlackCat API + and saves it as one JSON file hive-partitioned by date in Google Cloud + Storage (GCS). Each org's data will be in 1 row, and for each separate table in the API, + a nested column will hold all of it's data. Args: bucket (str): GCS bucket where the scraped BlackCat report will be saved. 
From 3da73864d8f93a3b53b0ec5dbb0bee52b231d0e5 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Tue, 28 Nov 2023 10:38:40 -0800 Subject: [PATCH 04/15] start dag for excel publishing, dbt cleaning for rr20 service --- .../external_table_all_ntdreports.yml | 2 +- .../METADATA.yml | 0 .../README.md | 2 +- .../all_2023_submitted_for_ntd.yml | 2 +- .../METADATA.yml | 18 +++++ .../ntd_report_publish_validation/README.md | 7 ++ .../publish_validation_report.py | 75 +++++++++++++++++++ airflow/plugins/operators/blackcat_to_gcs.py | 2 +- .../int_ntd_rr20_service_alldata.sql | 6 +- .../ntd_validation/_src_api_externaltable.yml | 14 +--- ....sql => stg_ntd_2022_rr20_exp_by_mode.sql} | 0 ...al.sql => stg_ntd_2022_rr20_financial.sql} | 0 ...vice.sql => stg_ntd_2022_rr20_service.sql} | 0 ...{stg_2023_a10.sql => stg_ntd_2023_a10.sql} | 2 +- ..._rural.sql => stg_ntd_2023_rr20_rural.sql} | 2 +- ...sql => stg_ntd_2023_rr20_urban_tribal.sql} | 3 +- 16 files changed, 113 insertions(+), 22 deletions(-) rename airflow/dags/{ntd_report_validation => ntd_report_from_blackcat}/METADATA.yml (100%) rename airflow/dags/{ntd_report_validation => ntd_report_from_blackcat}/README.md (96%) rename airflow/dags/{ntd_report_validation => ntd_report_from_blackcat}/all_2023_submitted_for_ntd.yml (81%) create mode 100644 airflow/dags/ntd_report_publish_validation/METADATA.yml create mode 100644 airflow/dags/ntd_report_publish_validation/README.md create mode 100644 airflow/dags/ntd_report_publish_validation/publish_validation_report.py rename warehouse/models/staging/ntd_validation/{stg_2022_rr20_exp_by_mode.sql => stg_ntd_2022_rr20_exp_by_mode.sql} (100%) rename warehouse/models/staging/ntd_validation/{stg_2022_rr20_financial.sql => stg_ntd_2022_rr20_financial.sql} (100%) rename warehouse/models/staging/ntd_validation/{stg_2022_rr20_service.sql => stg_ntd_2022_rr20_service.sql} (100%) rename warehouse/models/staging/ntd_validation/{stg_2023_a10.sql => stg_ntd_2023_a10.sql} (91%) rename 
warehouse/models/staging/ntd_validation/{stg_2023_rr20_rural.sql => stg_ntd_2023_rr20_rural.sql} (94%) rename warehouse/models/staging/ntd_validation/{stg_2023_rr20_urban_tribal.sql => stg_ntd_2023_rr20_urban_tribal.sql} (85%) diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml index aa98e6e59a..ad5fe5517c 100644 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml @@ -1,6 +1,6 @@ operator: operators.ExternalTable bucket: gs://calitp-ntd-report-validation -prefix_bucket: true +prefix_bucket: false post_hook: | SELECT * FROM `{{ get_project_id() }}`.external_blackcat.all_2023_ntdreports diff --git a/airflow/dags/ntd_report_validation/METADATA.yml b/airflow/dags/ntd_report_from_blackcat/METADATA.yml similarity index 100% rename from airflow/dags/ntd_report_validation/METADATA.yml rename to airflow/dags/ntd_report_from_blackcat/METADATA.yml diff --git a/airflow/dags/ntd_report_validation/README.md b/airflow/dags/ntd_report_from_blackcat/README.md similarity index 96% rename from airflow/dags/ntd_report_validation/README.md rename to airflow/dags/ntd_report_from_blackcat/README.md index 652eb0e2a6..4189a8a187 100644 --- a/airflow/dags/ntd_report_validation/README.md +++ b/airflow/dags/ntd_report_from_blackcat/README.md @@ -1,4 +1,4 @@ -# `ntd_report_validation` +# `ntd_report_from_blackcat` Type: [Now|Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html) diff --git a/airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml b/airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml similarity index 81% rename from airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml rename to 
airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml index c01b9a0515..ce0acd3528 100644 --- a/airflow/dags/ntd_report_validation/all_2023_submitted_for_ntd.yml +++ b/airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml @@ -1,6 +1,6 @@ operator: operators.BlackCatApiToGCSOperator -bucket: "gs://test-calitp-ntd-report-validation" +bucket: "gs://calitp-ntd-report-validation" api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" api_tablename: "2023_NTDReporting" form: "all" diff --git a/airflow/dags/ntd_report_publish_validation/METADATA.yml b/airflow/dags/ntd_report_publish_validation/METADATA.yml new file mode 100644 index 0000000000..0f97d22bff --- /dev/null +++ b/airflow/dags/ntd_report_publish_validation/METADATA.yml @@ -0,0 +1,18 @@ +description: "Process BigQuery tables into Excel files, save in GCS" +schedule_interval: "0 20 * * 1" #8 pm every Monday +tags: + - ntd, blackcat +default_args: + owner: airflow + depends_on_past: False + start_date: "2023-10-02" + catchup: False + email: + - "kim.engie@slalom.com" + - "christian.suyat@dot.ca.gov" + - "katrina.kaiser@dot.ca.gov" + email_on_failure: True + pool: default_pool + concurrency: 50 +wait_for_defaults: + timeout: 3600 \ No newline at end of file diff --git a/airflow/dags/ntd_report_publish_validation/README.md b/airflow/dags/ntd_report_publish_validation/README.md new file mode 100644 index 0000000000..28977df52d --- /dev/null +++ b/airflow/dags/ntd_report_publish_validation/README.md @@ -0,0 +1,7 @@ +# `ntd_report_publish_validation` + +Type: [Now|Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html) + +This DAG orchestrates the publishing of NTD Report validation checks in the form of Excel files, that it saves into Google Cloud Storage. Checks conducted on submitted NTD report submissions, previously stored into BigQuery with dbt models. 
They are then converted to Excel files and saves in the Google Cloud Storage bucket `calitp-ntd-report-validation`. + +In the event of failure, the job can be rerun without backfilling. \ No newline at end of file diff --git a/airflow/dags/ntd_report_publish_validation/publish_validation_report.py b/airflow/dags/ntd_report_publish_validation/publish_validation_report.py new file mode 100644 index 0000000000..7b5b4e9e82 --- /dev/null +++ b/airflow/dags/ntd_report_publish_validation/publish_validation_report.py @@ -0,0 +1,75 @@ +# --- +# python_callable: publish_report +# provide_context: true +# --- +from google.cloud import bigquery +import pandas as pd +import datetime +import re + +import google.auth +import google.auth.transport.requests + +import pendulum +from calitp_data_infra.storage import ( + fetch_all_in_partition, + get_fs, +) + + +def publish_report(): + client = bigquery.Client() + print("Got BG client!") + project = "cal-itp-data-infra-staging" + dataset_id = "staging_staging" + + dataset_ref = bigquery.DatasetReference(project, dataset_id) + table_ref = dataset_ref.table("fct_ntd_rr20_service_checks") + table = client.get_table(table_ref) + print("Got table!") + + df = client.list_rows(table).to_dataframe() + print("Got df from BQ!") + print(df.head()) + + + # this_year=datetime.datetime.now().year + # ## Part 1: save Excel file to GCS (for emailing to subrecipients) + # GCS_FILE_PATH_VALIDATED = f"gs://calitp-ntd-report-validation/validation_reports_{this_year}" + # with pd.ExcelWriter(f"{GCS_FILE_PATH_VALIDATED}/rr20_service_check_report_{this_date}.xlsx") as writer: + # rr20_checks.to_excel(writer, sheet_name="rr20_checks_full", index=False, startrow=2) + + # workbook = writer.book + # worksheet = writer.sheets["rr20_checks_full"] + # cell_highlight = workbook.add_format({ + # 'fg_color': 'yellow', + # 'bold': True, + # 'border': 1 + # }) + # report_title = "NTD Data Validation Report" + # title_format = workbook.add_format({ + # 'bold': True, + # 
'valign': 'center', + # 'align': 'left', + # 'font_color': '#1c639e', + # 'font_size': 15 + # }) + # subtitle = "Reduced Reporting RR-20: Validation Warnings" + # subtitle_format = workbook.add_format({ + # 'bold': True, + # 'align': 'left', + # 'font_color': 'black', + # 'font_size': 19 + # }) + + # worksheet.write('A1', report_title, title_format) + # worksheet.merge_range('A2:C2', subtitle, subtitle_format) + # worksheet.write('G3', 'Agency Response', cell_highlight) + # worksheet.write('H3', 'Response Date', cell_highlight) + # worksheet.set_column(0, 0, 35) #col A width + # worksheet.set_column(1, 3, 22) #cols B-D width + # worksheet.set_column(4, 4, 11) #col D width + # worksheet.set_column(5, 6, 53) #col E-G width + # worksheet.freeze_panes('B4') + + diff --git a/airflow/plugins/operators/blackcat_to_gcs.py b/airflow/plugins/operators/blackcat_to_gcs.py index 96a68f865c..bf9d85d8e2 100644 --- a/airflow/plugins/operators/blackcat_to_gcs.py +++ b/airflow/plugins/operators/blackcat_to_gcs.py @@ -110,7 +110,7 @@ def save_to_gcs(self, fs, bucket): return hive_path -class BlackCatApiToGCSOperator2(BaseOperator): +class BlackCatApiToGCSOperator(BaseOperator): template_fields = ("bucket",) def __init__( diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql index ff65dcd991..8c21daf559 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql @@ -24,7 +24,7 @@ with data_2023 as ( annual_unlinked_pass_trips as Annual_UPT, sponsored_service_upt as Sponsored_UPT, annual_vehicle_max_service as VOMX - from {{ ref('stg_2023_rr20_rural') }} + from {{ ref('stg_ntd_2023_rr20_rural') }} WHERE type = "Expenses by Mode" ), @@ -38,7 +38,7 @@ service2022 as ( Annual_UPT, Sponsored_UPT, VOMX - from {{ ref('stg_2022_rr20_service') }} + from {{ 
ref('stg_ntd_2022_rr20_service') }} ), expenses2022 as ( @@ -48,7 +48,7 @@ expenses2022 as ( Operating_Capital as operating_capital, Mode as mode, Total_Annual_Expenses_By_Mode - FROM {{ ref('stg_2022_rr20_exp_by_mode') }} + FROM {{ ref('stg_ntd_2022_rr20_exp_by_mode') }} ), all_2022 as ( diff --git a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml index 345982b566..1dc10d2e19 100644 --- a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml +++ b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml @@ -3,19 +3,9 @@ version: 2 sources: - name: ntd_report_validation description: | - Data from BlackCat API. + Data from BlackCat API. Each org's data is in 1 row, and for each separate table in the API, + a nested column holds all of its data. database: "{{ env_var('DBT_SOURCE_DATABASE', var('SOURCE_DATABASE')) }}" schema: external_blackcat tables: - name: all_2023_ntdreports - - name: a10_ntdreportingstationsandmaintenance - - name: a15_ntdtransitassetmanagement - - name: a30_ntdassetandresourceinfo - - name: ntd_tamnarrative - - name: p10_ntdreporterbasicinfo - - name: p20_ntdreportermodes - - name: p50_ntdreportergtfs - - name: rr20_intercity - - name: rr20_rural - - name: rr20_urban_tribal - - name: ss60_safety diff --git a/warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql similarity index 100% rename from warehouse/models/staging/ntd_validation/stg_2022_rr20_exp_by_mode.sql rename to warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql diff --git a/warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql similarity index 100% rename from warehouse/models/staging/ntd_validation/stg_2022_rr20_financial.sql rename to
warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql diff --git a/warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql similarity index 100% rename from warehouse/models/staging/ntd_validation/stg_2022_rr20_service.sql rename to warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql diff --git a/warehouse/models/staging/ntd_validation/stg_2023_a10.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql similarity index 91% rename from warehouse/models/staging/ntd_validation/stg_2023_a10.sql rename to warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql index 85db82f56f..2cda0460f6 100644 --- a/warehouse/models/staging/ntd_validation/stg_2023_a10.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql @@ -14,5 +14,5 @@ SELECT a10.DOLeasedByPublicAgency as do_leased_by_public_agency, a10.DOLeasedFromPrivateEntity as do_leased_from_private_entity, a10.LastModifiedDate as last_modified_date -FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} , UNNEST (`ntdreportingstationsandmaintenance_data`) as `a10` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql similarity index 94% rename from warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql rename to warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql index b753a23584..f68ef04afb 100644 --- a/warehouse/models/staging/ntd_validation/stg_2023_rr20_rural.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql @@ -19,5 +19,5 @@ SELECT ntdreportingrr20_rural_data.SponsoredServiceUPT as sponsored_service_upt, ntdreportingrr20_rural_data.Quantity as quantity, ntdreportingrr20_rural_data.LastModifiedDate as 
last_modified_date -FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} , UNNEST (`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` \ No newline at end of file diff --git a/warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql similarity index 85% rename from warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql rename to warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql index 0b871e0ff8..7b21fa5f34 100644 --- a/warehouse/models/staging/ntd_validation/stg_2023_rr20_urban_tribal.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql @@ -11,5 +11,6 @@ SELECT ntdreportingrr20_urban_tribal_data.CapitalExpended as capital_expended, ntdreportingrr20_urban_tribal_data.Description as description, ntdreportingrr20_urban_tribal_data.LastModifiedDate as last_modified_date -FROM `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} +-- `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` , UNNEST (`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` \ No newline at end of file From ebe996e07bd352a3a2af135986351db6e428f459 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Tue, 28 Nov 2023 11:11:35 -0800 Subject: [PATCH 05/15] remove excel conversion from dbt model --- .../fct_ntd_rr20_service_checks.py | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py index 832a8961db..750fffb6f9 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py @@ -176,44 
+176,6 @@ def model(dbt, session): frpt_checks, rev_speed_checks, tph_checks, voms0_check], ignore_index=True).sort_values(by="Organization") - - ## Part 1: save Excel file to GCS - GCS_FILE_PATH_VALIDATED = f"gs://calitp-ntd-report-validation/validation_reports_{this_year}" - with pd.ExcelWriter(f"{GCS_FILE_PATH_VALIDATED}/rr20_service_check_report_{this_date}.xlsx") as writer: - rr20_checks.to_excel(writer, sheet_name="rr20_checks_full", index=False, startrow=2) - - workbook = writer.book - worksheet = writer.sheets["rr20_checks_full"] - cell_highlight = workbook.add_format({ - 'fg_color': 'yellow', - 'bold': True, - 'border': 1 - }) - report_title = "NTD Data Validation Report" - title_format = workbook.add_format({ - 'bold': True, - 'valign': 'center', - 'align': 'left', - 'font_color': '#1c639e', - 'font_size': 15 - }) - subtitle = "Reduced Reporting RR-20: Validation Warnings" - subtitle_format = workbook.add_format({ - 'bold': True, - 'align': 'left', - 'font_color': 'black', - 'font_size': 19 - }) - - worksheet.write('A1', report_title, title_format) - worksheet.merge_range('A2:C2', subtitle, subtitle_format) - worksheet.write('G3', 'Agency Response', cell_highlight) - worksheet.write('H3', 'Response Date', cell_highlight) - worksheet.set_column(0, 0, 35) #col A width - worksheet.set_column(1, 3, 22) #cols B-D width - worksheet.set_column(4, 4, 11) #col D width - worksheet.set_column(5, 6, 53) #col E-G width - worksheet.freeze_panes('B4') logger.info(f"RR-20 service data checks conducted on {this_date} is complete!") From 9d1a834a73e8a4fdf95b73e0368b8f7d6af526f3 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Thu, 30 Nov 2023 16:04:47 -0800 Subject: [PATCH 06/15] adding rr20 financial checks --- .../int_ntd_rr20_financial_fare_revenues.sql | 55 ++++++++ .../int_ntd_rr20_financial_specific_funds.sql | 50 +++++++ .../int_ntd_rr20_financial_total_exp.sql | 54 ++++++++ .../fct_ntd_rr20_equal_totals_check.sql | 39 ++++++ .../fct_ntd_rr20_funds_checks.sql | 130 
++++++++++++++++++ 5 files changed, 328 insertions(+) create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql create mode 100644 warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql create mode 100644 warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql new file mode 100644 index 0000000000..810c14a8dc --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql @@ -0,0 +1,55 @@ +-- need fare rev and upt for each year. didn't write check correctly the first time + +WITH fare_rev_2023 as ( + select + organization, + api_report_period as fiscal_year, + item as mode, + operations_expended + capital_expended as Fare_Revenues, + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE type = "Fare Revenues" +), +upt_2023 as ( + select + organization, + api_report_period as fiscal_year, + item as mode, + annual_unlinked_pass_trips as Annual_UPT + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE type = "Service Data" +), +all_2023 as ( + select fare_rev_2023.*, upt_2023.Annual_UPT + FROM fare_rev_2023 + FULL OUTER JOIN upt_2023 + ON fare_rev_2023.organization = upt_2023.organization + AND fare_rev_2023.mode = upt_2023.mode +), +fare_rev_2022 as ( + SELECT Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + sum(Fare_Revenues) as Fare_Revenues + FROM {{ ref('stg_ntd_2022_rr20_financial') }} + GROUP BY organization, fiscal_year +), +upt_2022 as ( + select + Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + Mode as mode, + 
Annual_UPT +from {{ ref('stg_ntd_2022_rr20_service') }} +), +all_2022 as ( + select fare_rev_2022.organization, fare_rev_2022.fiscal_year, + upt_2022.Mode, fare_rev_2022.Fare_Revenues, upt_2022.Annual_UPT + FROM fare_rev_2022 + FULL OUTER JOIN upt_2022 + ON fare_rev_2022.organization = upt_2022.organization +) + +SELECT * from all_2023 + +UNION ALL + +SELECT * from all_2022 diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql new file mode 100644 index 0000000000..c3af6aa0c9 --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql @@ -0,0 +1,50 @@ +------- +-- NTD validation errors about these specific funding sources. +--- ID #s RR20F-070, RR20F-065, RR20F-068, RR20F-066, RR20F-013. Sums the capital expenses across all funding sources +--- In 2022 the data is in a different format than 2023 **and onwards**. +--- Only needed for the 2023 error checking (to compare to "last year"). In 2024 you don't need 2022 data.
+------- + +WITH longform_2023 AS ( + SELECT + organization, + api_report_period AS fiscal_year, + operations_expended + capital_expended AS total_expended, + REPLACE( + REPLACE( + REPLACE(item, 'FTA Formula Grants for Rural Areas (ยง5311)', 'FTA_Formula_Grants_for_Rural_Areas_5311'), + 'Other Directly Generated Funds', 'Other_Directly_Generated_Funds'), + 'Local Funds', 'Local_Funds') as item + FROM {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE item LIKE "%Directly Generated Funds%" OR + item LIKE "%Formula Grants for Rural Areas%" OR + item LIKE "Local Funds" +), +wide_2023 AS ( + SELECT * FROM + (SELECT * FROM longform_2023) + PIVOT(AVG(total_expended) FOR item IN ('FTA_Formula_Grants_for_Rural_Areas_5311', 'Other_Directly_Generated_Funds', 'Local_Funds')) + ORDER BY organization +), +data_2022 AS ( + SELECT Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + SUM(Other_Directly_Generated_Funds) as Other_Directly_Generated_Funds_2022, + SUM(FTA_Formula_Grants_for_Rural_Areas_5311) as FTA_Formula_Grants_for_Rural_Areas_5311_2022, + Null as Local_Funds_2022 + FROM {{ ref('stg_ntd_2022_rr20_financial') }} + GROUP BY 1,2 + ORDER BY organization +) + +select wide_2023.organization, + wide_2023.FTA_Formula_Grants_for_Rural_Areas_5311 as FTA_Formula_Grants_for_Rural_Areas_5311_2023, + wide_2023.Other_Directly_Generated_Funds as Other_Directly_Generated_Funds_2023, + wide_2023.Local_Funds as Local_Funds_2023, + data_2022.FTA_Formula_Grants_for_Rural_Areas_5311_2022, + data_2022.Other_Directly_Generated_Funds_2022, + data_2022.Local_Funds_2022 +from wide_2023 +FULL OUTER JOIN data_2022 + ON wide_2023.organization = data_2022.organization +ORDER BY organization diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql new file mode 100644 index 0000000000..06211cc877 --- /dev/null +++ 
b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql @@ -0,0 +1,54 @@ +------ +--- Compiles data for RR-20 Financial checks on total amounts (operating and capital) +--- into one table for downstream validation checks. "Prior year" data not needed +--- NTD error ID #s RR20F-001OA, RR20F-001C, RR20F-182 +------ + +WITH total_operations_exp_2023 as( + select organization, + api_report_period as fiscal_year, + sum(operations_expended) as Total_Annual_Op_Expenses_by_Mode + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE css_class = "expense" + group by organization, api_report_period +), +total_capital_exp_bymode_2023 as ( + select organization, + api_report_period as fiscal_year, + sum(capital_expended) as Total_Annual_Cap_Expenses_byMode + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE css_class = "expense" + group by organization, api_report_period +), +total_operations_rev_2023 as ( + select organization, + api_report_period as fiscal_year, + sum(operations_expended) as Total_Annual_Op_Revenues_Expended + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE css_class = "revenue" + group by organization, api_report_period +), +total_cap_exp_byfunds_2023 as ( + select organization, + api_report_period as fiscal_year, + sum(capital_expended) as Total_Annual_Cap_Expenses_byFunds + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE css_class = "revenue" + group by organization, api_report_period +) + +SELECT + total_operations_exp_2023.*, + total_capital_exp_bymode_2023.Total_Annual_Cap_Expenses_byMode, + total_operations_rev_2023.Total_Annual_Op_Revenues_Expended, + total_cap_exp_byfunds_2023.Total_Annual_Cap_Expenses_byFunds +FROM total_operations_exp_2023 +FULL OUTER JOIN total_capital_exp_bymode_2023 + ON total_operations_exp_2023.organization = total_capital_exp_bymode_2023.organization + AND total_operations_exp_2023.fiscal_year = total_capital_exp_bymode_2023.fiscal_year +FULL OUTER JOIN total_operations_rev_2023 + ON 
total_operations_exp_2023.organization = total_operations_rev_2023.organization + AND total_operations_exp_2023.fiscal_year = total_operations_rev_2023.fiscal_year +FULL OUTER JOIN total_cap_exp_byfunds_2023 + ON total_operations_exp_2023.organization = total_cap_exp_byfunds_2023.organization + AND total_operations_exp_2023.fiscal_year = total_cap_exp_byfunds_2023.fiscal_year diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql new file mode 100644 index 0000000000..5b523ea685 --- /dev/null +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql @@ -0,0 +1,39 @@ + +WITH rr20f_0010a as ( + select + organization, + "RR20F-001OA: equal totalsfor operating expenses" as name_of_check, + CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_byMode,0)) THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN check_status = "Fail" THEN "Total_Annual_Revenues_Expended should, but does not, equal Total_Annual_Expenses_by_Mode. Please provide a narrative justification." 
+ WHEN check_status = "Pass" THEN "" + ELSE NULL + END as description, + COALESCE("Total_Annual_Revenues_Expended = $", ROUND(Total_Annual_Op_Revenues_Expended,0), + ",Total_Annual_Expenses_by_Mode = $", ROUND(Total_Annual_Op_Expenses_byMode,0)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + FROM {{ ref('int_ntd_rr20_financial_total_exp') }} +), +rr20f_001c as( + select + organization, + "RR20F-001C: equal totals for capital expenses by mode and funding source expenditures" as name_of_check, + CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN check_status = "Fail" THEN "The sum of Total Expenses for all modes for Uses of Capital does not equal the sum of all values entered for Directly Generated, Non-Federal and Federal Government Funds for Uses of Capital. Please revise or explain." + WHEN check_status = "Pass" THEN "" + ELSE NULL + END as description, + COALESCE("Total_Annual_Cap_Expenses_byMode = $", ROUND(Total_Annual_Cap_Expenses_byMode,0), + ",Total_Annual_Cap_Expenses_byFunds = $", ROUND(Total_Annual_Cap_Expenses_byFunds,0)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + FROM {{ ref('int_ntd_rr20_financial_total_exp') }} +) + +SELECT * FROM rr20f_0010a + +UNION ALL + +SELECT * FROM rr20f_001c \ No newline at end of file diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql new file mode 100644 index 0000000000..1e4c9ed6ca --- /dev/null +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql @@ -0,0 +1,130 @@ +--- We do identical CASE WHEN clauses in each CTE. 
The results determine 2 different column values but one can only specify 1 col/statement + +WITH rr20f_070 as ( + select + organization, + "RR20F-070: 5311 Funds not reported" as name_of_check, + CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL + THEN "The ยง5311 program is not listed as a revenue source in your report, Please double check and provide a narrative justification." + ELSE "" + END AS description, + CONCAT("2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +), +rr20f_066 as ( + select + organization, + "RR20F-066: change from zero" as name_of_check, + CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) + OR + ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) + OR + ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) + THEN 
"FTA_Formula_Grants_for_Rural_Areas_5311 funding changed either from or to zero compared to last year. Please provide a narrative justification." + ELSE "" + END AS description, + CONCAT("2022 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2022,0) AS STRING), + "2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +), +rr20f_065 as ( + select + organization, + "RR20F-065: 5311 Funds same value" as name_of_check, + CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) AND + (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) + THEN "You have identical values for FTA_Formula_Grants_for_Rural_Areas_5311 funding in 2022 and 2023, which is unusual. Please provide a narrative justification." 
+ ELSE "" + END AS description, + CONCAT("2022 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2022,0) AS STRING), + "2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +), +rr20f_013 as ( + select + organization, + "RR20F-013: Other Directly Generated Funds same value" as name_of_check, + CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) AND + (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) AND + (Other_Directly_Generated_Funds_2023 = Other_Directly_Generated_Funds_2022) + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) AND + (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) AND + (Other_Directly_Generated_Funds_2023 = Other_Directly_Generated_Funds_2022) + THEN "You have identical values for Other_Directly_Generated_Funds funding in 2022 and 2023, which is unusual. Please provide a narrative justification." 
+ ELSE "" + END AS description, + CONCAT("2022 = ", CAST(ROUND(Other_Directly_Generated_Funds_2022,0) AS STRING), + "2023 = ", CAST(ROUND(Other_Directly_Generated_Funds_2023,0) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +), +rr20f_068 as ( + select + organization, + "RR20F-068: 5311 Funds rounded to thousand" as name_of_check, + CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL + THEN "FTA_Formula_Grants_for_Rural_Areas_5311 are rounded to the nearest thousand, but should be reported as exact values. Please double check and provide a narrative justification." + ELSE "" + END AS description, + CONCAT("2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +), +rr20f_024 as ( + select + organization, + "RR20F-024: Local Funds rounded to thousand" as name_of_check, + CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL + THEN "Local Funds are rounded to the nearest thousand, but should be reported as exact values. Please double check and provide a narrative justification." 
+ ELSE "" + END AS description, + CONCAT("2023 = ", CAST(ROUND(Local_Funds_2023) AS STRING)) as value_checked, + CURRENT_TIMESTAMP() AS date_checked + from {{ ref('int_ntd_rr20_financial_specific_funds') }} +) + +SELECT * FROM rr20f_070 +UNION ALL +SELECT * FROM rr20f_066 +UNION ALL +SELECT * FROM rr20f_065 +UNION ALL +SELECT * FROM rr20f_013 +UNION ALL +SELECT * FROM rr20f_068 +UNION ALL +SELECT * FROM rr20f_024 +ORDER BY organization From e5c28f7e3fdb8dc50624f50a836daf9d713d6dd1 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Fri, 1 Dec 2023 14:45:17 -0800 Subject: [PATCH 07/15] add VOMS check --- .../int_ntd_a30_voms_vins_totals.sql | 26 ++++++++++++++++++ .../ntd_validation/fct_ntd_a30_vomscheck.sql | 23 ++++++++++++++++ .../fct_ntd_rr20_equal_totals_check.sql | 27 ++++++++++--------- .../stg_ntd_2023_a30_assetandresourceinfo.sql | 27 +++++++++++++++++++ 4 files changed, 91 insertions(+), 12 deletions(-) create mode 100644 warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql create mode 100644 warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql create mode 100644 warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql new file mode 100644 index 0000000000..c0a960859e --- /dev/null +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql @@ -0,0 +1,26 @@ +--- get the # of active VINS in the inventory - DON'T HAVE +--- get the # of VOMS in the rr-20 +-- get the # of vins in the A30 + +with voms_rr20 as ( + select organization, + fiscal_year, + AVG(VOMX) as rr20_voms + FROM {{ ref('int_ntd_rr20_service_alldata') }} + GROUP BY organization, fiscal_year +), + +vins_a30 as ( + SELECT organization, + api_report_period as fiscal_year, + COUNT (DISTINCT VIN) as a30_vin_n + FROM {{ 
ref('stg_ntd_2023_a30_assetandresourceinfo') }} + GROUP BY organization, fiscal_year +) + +select voms_rr20.*, vins_a30.a30_vin_n +FROM voms_rr20 +FULL OUTER JOIN vins_a30 + ON voms_rr20.organization = vins_a30.organization + AND voms_rr20.fiscal_year = vins_a30.fiscal_year +ORDER BY organization, fiscal_year \ No newline at end of file diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql new file mode 100644 index 0000000000..91d6cb20e8 --- /dev/null +++ b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql @@ -0,0 +1,23 @@ +--- Since we don't have the Revenue Inventory table from Black Cat yet, we cannot do the other checks in the file voms_inventory_check.py. +--- When we do get that data, we can add them here in their own CTEs, one per check, and then UNION ALL the CTEs together. + +WITH rr20f_180 as ( + SELECT organization, + "RR20F-180: VOMS across forms" as name_of_check, + CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) + THEN "Fail" + ELSE "Pass" + END as check_status, + CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) + THEN "Total VOMS is greater than total A-30 vehicles reported. Please clarify." + ELSE "VOMS & A-30 vehicles reported are equal to and/or lower than active inventory." 
+ END as description, + CONCAT("RR-20 VOMS = ", CAST(ROUND(rr20_voms, 1) AS STRING), + "# A-30 VINs = ", CAST(ROUND(a30_vin_n, 1) AS STRING)) AS value_checked, + CURRENT_TIMESTAMP() AS date_checked + FROM {{ ref('int_ntd_a30_voms_vins_totals') }} +) + +SELECT * from rr20f_180 + + diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql index 5b523ea685..06a00c18d7 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql @@ -1,17 +1,19 @@ +--- We do identical CASE WHEN clauses in each CTE. The results determine 2 different column values but one can only specify 1 col/statement WITH rr20f_0010a as ( select organization, "RR20F-001OA: equal totalsfor operating expenses" as name_of_check, - CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_byMode,0)) THEN "Fail" + CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) + THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN check_status = "Fail" THEN "Total_Annual_Revenues_Expended should, but does not, equal Total_Annual_Expenses_by_Mode. Please provide a narrative justification." - WHEN check_status = "Pass" THEN "" - ELSE NULL + CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) + THEN "Total_Annual_Revenues_Expended should, but does not, equal Total_Annual_Expenses_by_Mode. Please provide a narrative justification." 
+ ELSE "" END as description, - COALESCE("Total_Annual_Revenues_Expended = $", ROUND(Total_Annual_Op_Revenues_Expended,0), - ",Total_Annual_Expenses_by_Mode = $", ROUND(Total_Annual_Op_Expenses_byMode,0)) as value_checked, + CONCAT("Total_Annual_Revenues_Expended = $", CAST(ROUND(Total_Annual_Op_Revenues_Expended,0) AS STRING), + ",Total_Annual_Expenses_by_Mode = $", CAST(ROUND(Total_Annual_Op_Expenses_by_Mode,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked FROM {{ ref('int_ntd_rr20_financial_total_exp') }} ), @@ -19,15 +21,16 @@ rr20f_001c as( select organization, "RR20F-001C: equal totals for capital expenses by mode and funding source expenditures" as name_of_check, - CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) THEN "Fail" + CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) + THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN check_status = "Fail" THEN "The sum of Total Expenses for all modes for Uses of Capital does not equal the sum of all values entered for Directly Generated, Non-Federal and Federal Government Funds for Uses of Capital. Please revise or explain." - WHEN check_status = "Pass" THEN "" - ELSE NULL + CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) + THEN "The sum of Total Expenses for all modes for Uses of Capital does not equal the sum of all values entered for Directly Generated, Non-Federal and Federal Government Funds for Uses of Capital. Please revise or explain." 
+ ELSE "" END as description, - COALESCE("Total_Annual_Cap_Expenses_byMode = $", ROUND(Total_Annual_Cap_Expenses_byMode,0), - ",Total_Annual_Cap_Expenses_byFunds = $", ROUND(Total_Annual_Cap_Expenses_byFunds,0)) as value_checked, + CONCAT("Total_Annual_Cap_Expenses_byMode = $", CAST(ROUND(Total_Annual_Cap_Expenses_byMode,0) AS STRING), + ",Total_Annual_Cap_Expenses_byFunds = $", CAST(ROUND(Total_Annual_Cap_Expenses_byFunds,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked FROM {{ ref('int_ntd_rr20_financial_total_exp') }} ) diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql new file mode 100644 index 0000000000..d6e9965e23 --- /dev/null +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql @@ -0,0 +1,27 @@ +SELECT + organization, + reportstatus as api_report_status, + TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, + reportperiod as api_report_period, + a30.id as id, + a30.VehicleId as vehicle_id, + a30.NTDID as ntd_id, + a30.VehicleLength as vehicle_length, + a30.FuelType as fuel_type, + a30.FundSource as fund_source, + a30.ReportId as report_id, + a30.AverageEstimatedServiceYearsWhenNew as average_estimated_service_years_when_new, + a30.VehicleStatus as vehicle_status, + a30.Vin as vin, + a30.ADAAccess as ada_access, + a30.VehicleType as vehicle_type, + a30.AverageExpirationYearsWhenNew as average_expiration_years_when_new, + a30.VehicleYear as vehicle_year, + a30.UsefulLifeYearsRemaining as useful_life_years_remaining, + a30.SeatingCapacity as seating_capacity, + a30.OwnershipType as ownership_type, + a30.ModesOperatedDisplayText as modes_operated_display_text, + a30.ModesOperatedFullText as modes_operated_full_text, + a30.LastModifiedDate as last_modified_date +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} +, UNNEST 
(`ntdassetandresourceinfo_data`) as `a30` \ No newline at end of file From b3b2557830ac792e17e373441eba92d4ff6d8470 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Fri, 1 Dec 2023 15:24:33 -0800 Subject: [PATCH 08/15] formatting --- .../ntd_validation/int_ntd_a30_voms_vins_totals.sql | 2 +- .../ntd_validation/int_ntd_rr20_service_alldata.sql | 1 - .../intermediate/ntd_validation/int_ntd_rr20_service_ratios.py | 2 +- warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql | 2 -- .../mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql | 2 +- .../staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql | 2 +- .../staging/ntd_validation/stg_ntd_2022_rr20_financial.sql | 2 +- .../models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql | 2 +- warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql | 2 +- .../ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql | 2 +- .../models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql | 2 +- .../staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql | 2 +- .../models/staging/ntd_validation/stg_ntd_subrecipients.sql | 2 +- 13 files changed, 11 insertions(+), 14 deletions(-) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql index c0a960859e..350fd6315c 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql @@ -23,4 +23,4 @@ FROM voms_rr20 FULL OUTER JOIN vins_a30 ON voms_rr20.organization = vins_a30.organization AND voms_rr20.fiscal_year = vins_a30.fiscal_year -ORDER BY organization, fiscal_year \ No newline at end of file +ORDER BY organization, fiscal_year diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql index 8c21daf559..cd44a46c2f 100644 --- 
a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql @@ -74,4 +74,3 @@ select * FROM all_2022 UNION ALL select * from data_2023 - diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py index 24f89bc8b2..ae44deb762 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py @@ -68,4 +68,4 @@ def model(dbt, session): allyears = make_ratio_cols(allyears, 'Annual_VRM', 'Annual_VRH', 'rev_speed', logger, operation = "mean") allyears = make_ratio_cols(allyears, 'Annual_UPT', 'Annual_VRH', 'trips_per_hr', logger, operation = "mean") - return allyears \ No newline at end of file + return allyears diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql index 91d6cb20e8..583a340914 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql @@ -19,5 +19,3 @@ WITH rr20f_180 as ( ) SELECT * from rr20f_180 - - diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql index 06a00c18d7..7bc169dd25 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql @@ -39,4 +39,4 @@ SELECT * FROM rr20f_0010a UNION ALL -SELECT * FROM rr20f_001c \ No newline at end of file +SELECT * FROM rr20f_001c diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql index ea678124c7..c2fbca6ae8 
100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql @@ -2,4 +2,4 @@ --- We pull these tables in to use them in later int and fct models SELECT * -FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_expenses_by_mode` \ No newline at end of file +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_expenses_by_mode` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql index 0409fbc451..5f465071db 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql @@ -2,4 +2,4 @@ --- We pull these tables in to use them in later int and fct models SELECT * -FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_financials__2` \ No newline at end of file +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_financials__2` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql index 919ce31487..64c17c9b43 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql @@ -2,4 +2,4 @@ --- We pull these tables in to use them in later int and fct models SELECT * -FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_service_data` \ No newline at end of file +FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_service_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql index 2cda0460f6..6ecc277392 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql @@ -15,4 +15,4 @@ SELECT a10.DOLeasedFromPrivateEntity as 
do_leased_from_private_entity, a10.LastModifiedDate as last_modified_date FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdreportingstationsandmaintenance_data`) as `a10` \ No newline at end of file +, UNNEST (`ntdreportingstationsandmaintenance_data`) as `a10` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql index d6e9965e23..223c2e104a 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql @@ -24,4 +24,4 @@ SELECT a30.ModesOperatedFullText as modes_operated_full_text, a30.LastModifiedDate as last_modified_date FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdassetandresourceinfo_data`) as `a30` \ No newline at end of file +, UNNEST (`ntdassetandresourceinfo_data`) as `a30` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql index f68ef04afb..aeaec900d7 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql @@ -20,4 +20,4 @@ SELECT ntdreportingrr20_rural_data.Quantity as quantity, ntdreportingrr20_rural_data.LastModifiedDate as last_modified_date FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` \ No newline at end of file +, UNNEST (`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql index 7b21fa5f34..003c00b758 100644 --- 
a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql @@ -13,4 +13,4 @@ SELECT ntdreportingrr20_urban_tribal_data.LastModifiedDate as last_modified_date FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -- `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` -, UNNEST (`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` \ No newline at end of file +, UNNEST (`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql index 50c5208254..b935ffbd64 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql @@ -1,3 +1,3 @@ SELECT Organization as organization -FROM blackcat_raw.2023_organizations \ No newline at end of file +FROM blackcat_raw.2023_organizations From 7b3c7f6327a1dd0bc29edd78b8a5d23743ac6f0b Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Mon, 4 Dec 2023 11:09:44 -0800 Subject: [PATCH 09/15] metadata for intermediate models --- .../int_ntd_rr20_financial_fare_revenues.sql | 2 +- .../ntd_validation/int_ntd_validation.yml | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql index 810c14a8dc..d29fc24b6d 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql @@ -1,4 +1,4 @@ --- need fare rev and upt for each year. didn't write check correctly the first time +-- need fare rev and upt for each year. 
WITH fare_rev_2023 as ( select diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml index d5affd54f9..7ce803848c 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml @@ -1,13 +1,27 @@ version: 2 models: - - name: int_rr20_financial + - name: int_ntd_rr20_financial_fare_revenues description: | - the RR-20 data that pertains to financial reporting. + Setting up the RR-20 data for comparing fare revenues to previous year # tests: # - dbt_utils.expression_is_true: # expression: 'status != {{ guidelines_to_be_assessed_status() }}' # columns: + - name: int_ntd_rr20_financial_specific_funds + description: | + Setting up the RR-20 data for comparing specific funding sources - the 5311 funds, and Other directly generated funds + For NTD validation error ID #s RR20F-070, RR20F-065, RR20F-068, RR20F-066, RR20F-013 + - name: int_ntd_rr20_financial_total_exp + description: | + Setting up the RR-20 data for comparing totals, for operating and capital expenses, reported in different ares of the RR-20 + For NTD validation error ID #s RR20F-001OA, RR20F-001C, RR20F-182 + - name: int_ntd_rr20_service_alldata + description: | + Combines 2023 and 2022 data in preparation for doing NTD validation checks. + The 2022 data was *not* from the API and so formatted differently + We are *assuming* that data in 2024 and onwards will be the same format as 2023 + If you get errors in 2024, check which columns may differ and read errors carefully. 
- name: int_ntd_rr20_service_ratios description: | makes ratios for validation checks From bc3cf8a13604993c0c85733e7dd718a7819bde38 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Mon, 4 Dec 2023 11:48:30 -0800 Subject: [PATCH 10/15] formatting --- .../ntd_validation/_mart_ntd_validation.yml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml b/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml index 43ae3000c4..3a782c0322 100644 --- a/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml +++ b/warehouse/models/mart/ntd_validation/_mart_ntd_validation.yml @@ -1,4 +1,20 @@ version: 2 models: - - name: fct_ntd_rr20_service_checks \ No newline at end of file + - name: fct_ntd_rr20_service_checks + description: | + Runs validation checks on the RR-20 service data. Source data is int_ntd_rr20_service_ratios. + This model is still in python but should be converted to SQL as time allows. + - name: fct_ntd_a30_vomscheck + description: | + Runs various checks on VOMS data submitted to NTD, that are also in the file voms_inventory_check.py. + Since we don't have the Revenue Inventory table from Black Cat yet, we cannot do all of the checks. + TO DO: add when vehicle inventory becomes available. + - name: fct_ntd_rr20_equal_totals_check + description: | + Runs various validation checks that compare total funding amounts reported in different places. + For NTD validation error ID #s rr20f_0010a, rr20f_001c + - name: fct_ntd_rr20_funds_checks + description: | + Runs various validation checks on specific RR-20 funding source data. 
+ For NTD validation error ID #s rr20f_070, rr20f_066, rr20f_065, rr20f_013, rr20f_068, rr20f_024 From 0abdfbf401502ebd6d188b8b40508ce740f52f35 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Wed, 6 Dec 2023 15:39:03 -0800 Subject: [PATCH 11/15] formatting --- .../external_table_all_ntdreports.yml | 397 +++++++++++++++++- ..._for_ntd.yml => all_submitted_for_ntd.yml} | 6 +- .../publish_validation_report.py | 75 ---- airflow/plugins/operators/blackcat_to_gcs.py | 87 ++-- 4 files changed, 440 insertions(+), 125 deletions(-) rename airflow/dags/ntd_report_from_blackcat/{all_2023_submitted_for_ntd.yml => all_submitted_for_ntd.yml} (62%) delete mode 100644 airflow/dags/ntd_report_publish_validation/publish_validation_report.py diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml index ad5fe5517c..a6f9c41dd1 100644 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml @@ -3,17 +3,398 @@ bucket: gs://calitp-ntd-report-validation prefix_bucket: false post_hook: | SELECT * - FROM `{{ get_project_id() }}`.external_blackcat.all_2023_ntdreports + FROM `{{ get_project_id() }}`.external_blackcat.all_ntdreports LIMIT 1; source_objects: - - "all_2023_NTDReporting/*.jsonl.gz" -destination_project_dataset_table: "external_blackcat.all_2023_ntdreports" + - "all_NTDReporting/*.jsonl.gz" +destination_project_dataset_table: "external_blackcat.all_ntdreports" source_format: NEWLINE_DELIMITED_JSON use_bq_client: true hive_options: - mode: AUTO + mode: CUSTOM require_partition_filter: false - source_uri_prefix: "all_2023_NTDReporting/" - - - + source_uri_prefix: "all_NTDReporting/{year:STRING}/{dt:DATE}/{ts:TIMESTAMP}/" +schema_fields: + - name: reportid + type: INTEGER + - name: organization + type: STRING 
+ - name: reportperiod + type: INTEGER + - name: reportstatus + type: STRING + - name: reportlastmodifieddate + type: TIMESTAMP + - name: ntdreportingstationsandmaintenance_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ServiceMode + type: STRING + - name: ReportId + type: INTEGER + - name: PTOwnedByServiceProvider + type: FLOAT + - name: PTOwnedByPublicAgency + type: FLOAT + - name: PTLeasedByServiceProvider + type: FLOAT + - name: PTLeasedByPublicAgency + type: FLOAT + - name: DOOwned + type: FLOAT + - name: DOLeasedByPublicAgency + type: FLOAT + - name: DOLeasedFromPrivateEntity + type: FLOAT + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdtransitassetmanagementa15_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: FacilityId + type: INTEGER + - name: ReportId + type: INTEGER + - name: FacilityName + type: STRING + - name: PrimaryMode + type: STRING + - name: FacilityClass + type: STRING + - name: FacilityType + type: STRING + - name: YearBuilt + type: INTEGER + - name: Size + type: STRING + - name: DOTCapitalResponsibility + type: FLOAT + - name: OrganizationCapitalResponsibility + type: FLOAT + - name: ConditionAssessment + type: FLOAT + - name: ConditionAssessment + type: TIMESTAMP + - name: SectionOfLargerFacility + type: BOOLEAN + - name: Latitude + type: FLOAT + - name: LatitudeDirection + type: STRING + - name: Longitude + type: FLOAT + - name: LongitudeDirection + type: STRING + - name: SecondaryMode + type: STRING + - name: PrivateMode + type: STRING + - name: ntdassetandresourceinfo_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: VehicleId + type: INTEGER + - name: ReportId + type: INTEGER + - name: VehicleStatus + type: STRING + - name: Vin + type: INTEGER + - name: NTDID + type: STRING + - name: ADAAccess + type: BOOLEAN + - name: VehicleType + type: STRING + - name: FuelType + type: STRING + - name: FundSource + type: STRING + - name: AverageEstimatedServiceYearsWhenNew + type: 
INTEGER + - name: AverageExpirationYearsWhenNew + type: INTEGER + - name: VehicleYear + type: INTEGER + - name: UsefulLifeYearsRemaining + type: INTEGER + - name: VehicleLength + type: FLOAT + - name: SeatingCapacity + type: INTEGER + - name: OwnershipType + type: STRING + - name: ModesOperatedDisplayText + type: STRING + - name: ModesOperatedFullText + type: STRING + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingp10_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: OrgId + type: INTEGER + - name: UserId + type: STRING + - name: FirstName + type: STRING + - name: LastName + type: STRING + - name: FullName + type: RECORD + fields: + - name: id + type: INTEGER + - name: Text + type: STRING + - name: Value + type: STRING + - name: Group + type: STRING + - name: BoolValue + type: BOOLEAN + - name: PrimaryPhone + type: STRING + - name: Email + type: STRING + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingp20_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: ServiceMode + type: STRING + - name: TypeOfService + type: STRING + - name: CommitmentDate + type: STRING + - name: StartDate + type: STRING + - name: EndDate + type: STRING + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingp50_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: Mode + type: RECORD + fields: + - name: id + type: STRING + - name: Text + type: STRING + - name: Value + type: STRING + - name: Group + type: STRING + - name: BoolValue + type: BOOLEAN + - name: Type + type: RECORD + fields: + - name: id + type: STRING + - name: Text + type: STRING + - name: Value + type: STRING + - name: Group + type: STRING + - name: BoolValue + type: BOOLEAN + - name: WebLink + type: STRING + - name: FilePath + type: STRING + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportinga35_data + type: 
RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: EquipmentName + type: STRING + - name: EquipmentId + type: INTEGER + - name: VehicleType + type: STRING + - name: PrimaryMode + type: STRING + - name: SecondaryMode + type: STRING + - name: TotalVehicles + type: STRING + - name: UsefulLifeBenchmark + type: INTEGER + - name: YearOfManufacture + type: INTEGER + - name: TransitAgencyCapitalResponsibility + type: FLOAT + - name: EstimatedCost + type: FLOAT + - name: YearDollarsEstimatedCost + type: INTEGER + - name: UsefulLifeYearsBenchMark + type: INTEGER + - name: UsefulLifeYearsRemaining + type: INTEGER + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingrr20_intercity_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ItemId + type: INTEGER + - name: ReportId + type: INTEGER + - name: Item + type: STRING + - name: Type + type: STRING + - name: OperationsExpended + type: FLOAT + - name: CapitalExpended + type: FLOAT + - name: Description + type: STRING + - name: AnnualVehicleRevMiles + type: INTEGER + - name: RegularUnlinkedPassengerTrips + type: INTEGER + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingrr20_rural_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: Item + type: STRING + - name: Revenue + type: STRING + - name: Type + type: STRING + - name: CssClass + type: STRING + - name: OperationsExpended + type: FLOAT + - name: CapitalExpended + type: FLOAT + - name: Description + type: STRING + - name: AnnualVehicleRevMiles + type: INTEGER + - name: AnnualVehicleRevHours + type: INTEGER + - name: AnnualUnlinkedPassTrips + type: INTEGER + - name: AnnualVehicleMaxService + type: INTEGER + - name: SponsoredServiceUPT + type: INTEGER + - name: Quantity + type: INTEGER + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingrr20_urban_tribal_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ItemId 
+ type: INTEGER + - name: ReportId + type: INTEGER + - name: Item + type: STRING + - name: OperationsExpended + type: FLOAT + - name: CapitalExpended + type: FLOAT + - name: Description + type: STRING + - name: LastModifiedDate + type: TIMESTAMP + - name: ntdreportingtamnarrative_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ReportId + type: INTEGER + - name: Type + type: STRING + - name: Category + type: STRING + - name: VehiclesInAssetClass + type: INTEGER + - name: VehiclesExceededULBTAMPlan + type: INTEGER + - name: TAMPlanGoalsDescription + type: STRING + - name: VehiclesToBeRetiredBeyondULB + type: INTEGER + - name: VehiclesPastULBInTAM + type: INTEGER + - name: LastModifiedDate + type: TIMESTAMP + - name: ss60_data + type: RECORD + fields: + - name: Id + type: INTEGER + - name: ItemId + type: INTEGER + - name: ReportId + type: INTEGER + - name: Item + type: STRING + - name: Type + type: STRING + - name: CssClass + type: STRING + - name: TransitVehicleAssualts + type: INTEGER + - name: RevenueFacilityAssualts + type: INTEGER + - name: NonRevenueFacilityAssualts + type: INTEGER + - name: OtherLocationAssualts + type: INTEGER + - name: MajorEvents + type: INTEGER + - name: Fatalities + type: INTEGER + - name: Injuries + type: INTEGER + - name: Quantity + type: STRING + - name: LastModifiedDate + type: TIMESTAMP diff --git a/airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml b/airflow/dags/ntd_report_from_blackcat/all_submitted_for_ntd.yml similarity index 62% rename from airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml rename to airflow/dags/ntd_report_from_blackcat/all_submitted_for_ntd.yml index ce0acd3528..53263c8d33 100644 --- a/airflow/dags/ntd_report_from_blackcat/all_2023_submitted_for_ntd.yml +++ b/airflow/dags/ntd_report_from_blackcat/all_submitted_for_ntd.yml @@ -1,7 +1,7 @@ operator: operators.BlackCatApiToGCSOperator bucket: "gs://calitp-ntd-report-validation" -api_url: 
"https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/2023" -api_tablename: "2023_NTDReporting" +api_url: "https://services.blackcattransit.com/api/APIModules/GetNTDReportsByYear/BCG_CA/" +api_tablename_suffix: "NTDReporting" form: "all" -bq_table_name: "2023_ntdreports" \ No newline at end of file +bq_table_name_suffix: "ntdreports" diff --git a/airflow/dags/ntd_report_publish_validation/publish_validation_report.py b/airflow/dags/ntd_report_publish_validation/publish_validation_report.py deleted file mode 100644 index 7b5b4e9e82..0000000000 --- a/airflow/dags/ntd_report_publish_validation/publish_validation_report.py +++ /dev/null @@ -1,75 +0,0 @@ -# --- -# python_callable: publish_report -# provide_context: true -# --- -from google.cloud import bigquery -import pandas as pd -import datetime -import re - -import google.auth -import google.auth.transport.requests - -import pendulum -from calitp_data_infra.storage import ( - fetch_all_in_partition, - get_fs, -) - - -def publish_report(): - client = bigquery.Client() - print("Got BG client!") - project = "cal-itp-data-infra-staging" - dataset_id = "staging_staging" - - dataset_ref = bigquery.DatasetReference(project, dataset_id) - table_ref = dataset_ref.table("fct_ntd_rr20_service_checks") - table = client.get_table(table_ref) - print("Got table!") - - df = client.list_rows(table).to_dataframe() - print("Got df from BQ!") - print(df.head()) - - - # this_year=datetime.datetime.now().year - # ## Part 1: save Excel file to GCS (for emailing to subrecipients) - # GCS_FILE_PATH_VALIDATED = f"gs://calitp-ntd-report-validation/validation_reports_{this_year}" - # with pd.ExcelWriter(f"{GCS_FILE_PATH_VALIDATED}/rr20_service_check_report_{this_date}.xlsx") as writer: - # rr20_checks.to_excel(writer, sheet_name="rr20_checks_full", index=False, startrow=2) - - # workbook = writer.book - # worksheet = writer.sheets["rr20_checks_full"] - # cell_highlight = workbook.add_format({ - # 'fg_color': 'yellow', 
- # 'bold': True, - # 'border': 1 - # }) - # report_title = "NTD Data Validation Report" - # title_format = workbook.add_format({ - # 'bold': True, - # 'valign': 'center', - # 'align': 'left', - # 'font_color': '#1c639e', - # 'font_size': 15 - # }) - # subtitle = "Reduced Reporting RR-20: Validation Warnings" - # subtitle_format = workbook.add_format({ - # 'bold': True, - # 'align': 'left', - # 'font_color': 'black', - # 'font_size': 19 - # }) - - # worksheet.write('A1', report_title, title_format) - # worksheet.merge_range('A2:C2', subtitle, subtitle_format) - # worksheet.write('G3', 'Agency Response', cell_highlight) - # worksheet.write('H3', 'Response Date', cell_highlight) - # worksheet.set_column(0, 0, 35) #col A width - # worksheet.set_column(1, 3, 22) #cols B-D width - # worksheet.set_column(4, 4, 11) #col D width - # worksheet.set_column(5, 6, 53) #col E-G width - # worksheet.freeze_panes('B4') - - diff --git a/airflow/plugins/operators/blackcat_to_gcs.py b/airflow/plugins/operators/blackcat_to_gcs.py index bf9d85d8e2..dcbbc43458 100644 --- a/airflow/plugins/operators/blackcat_to_gcs.py +++ b/airflow/plugins/operators/blackcat_to_gcs.py @@ -1,24 +1,27 @@ -from calitp_data_infra.storage import get_fs, make_name_bq_safe -from airflow.models import BaseOperator -from pydantic import BaseModel +import gzip +import logging +import os from typing import Optional + import pandas as pd import pendulum import requests -import logging -import gzip -import os -import re +from calitp_data_infra.storage import get_fs, make_name_bq_safe +from pydantic import BaseModel + +from airflow.models import BaseOperator + def write_to_log(logfilename): - ''' + """ Creates a logger object that outputs to a log file, to the filename specified, and also streams to console. 
- ''' + """ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) - formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', - datefmt='%y-%m-%d %H:%M:%S') + formatter = logging.Formatter( + "%(asctime)s:%(levelname)s: %(message)s", datefmt="%y-%m-%d %H:%M:%S" + ) file_handler = logging.FileHandler(logfilename) file_handler.setFormatter(formatter) stream_handler = logging.StreamHandler() @@ -34,20 +37,20 @@ def write_to_log(logfilename): class BlackCatApiExtract(BaseModel): api_url: str form: str - api_tablename: str - bq_table_name: str + api_tablename_suffix: str + bq_table_name_suffix: str data: Optional[pd.DataFrame] logger: Optional[logging.Logger] extract_time: Optional[pendulum.DateTime] - logger = write_to_log('load_bc_apidata_output.log') + logger = write_to_log("load_bc_apidata_output.log") extract_time = pendulum.now() - + # pydantic doesn't know dataframe type # see https://stackoverflow.com/a/69200069 class Config: arbitrary_types_allowed = True - + def fetch_from_bc_api(self): """Download a BlackCat table as a DataFrame. @@ -65,19 +68,23 @@ def fetch_from_bc_api(self): 2. rename fields 3. apply column prefix (to columns not renamed by 1 or 2) """ - + self.logger.info( - f"Downloading BlackCat data for {self.extract_time.format('YYYY')}_{self.bq_table_name}." + f"Downloading BlackCat data for {self.extract_time.format('YYYY')}_{self.bq_table_name_suffix}." ) - response = requests.get(self.api_url, verify=False) - blob = response.json() - + # will automatically add the current year to the API url so that it ends with "/YYYY". 
+ url = self.api_url + self.extract_time.format("YYYY") + response = requests.get(url) + blob = response.json() + raw_df = pd.json_normalize(blob) - raw_df['ReportLastModifiedDate'] = raw_df['ReportLastModifiedDate'].astype('datetime64[ns]') + raw_df["ReportLastModifiedDate"] = raw_df["ReportLastModifiedDate"].astype( + "datetime64[ns]" + ) self.data = raw_df.rename(make_name_bq_safe, axis="columns") self.logger.info( - f"Downloaded {self.extract_time.format('YYYY')}_{self.bq_table_name} data with {len(self.data)} rows!" + f"Downloaded {self.bq_table_name_suffix} data for {self.extract_time.format('YYYY')} with {len(self.data)} rows!" ) def make_hive_path(self, form: str, bucket: str): @@ -85,22 +92,23 @@ def make_hive_path(self, form: str, bucket: str): raise ValueError( "An extract time must be set before a hive path can be generated." ) - bq_form_name = ( - str.lower(form).replace("-", "") - ) + bq_form_name = str.lower(form).replace("-", "") return os.path.join( bucket, - f"{bq_form_name}_{self.api_tablename}", + f"{bq_form_name}_{self.api_tablename_suffix}", + f"year={self.extract_time.format('YYYY')}", f"dt={self.extract_time.to_date_string()}", f"ts={self.extract_time.to_iso8601_string()}", - f"{bq_form_name}_{self.bq_table_name}.jsonl.gz", + f"{bq_form_name}_{self.bq_table_name_suffix}.jsonl.gz", ) def save_to_gcs(self, fs, bucket): hive_path = self.make_hive_path(self.form, bucket) self.logger.info(f"Uploading to GCS at {hive_path}") if len(self.data) == 0: - self.logger.info(f"There is no data for {self.api_tablename}, not saving anything. Pipeline exiting.") + self.logger.info( + f"There is no data for {self.api_tablename_suffix} for {self.extract_time.format('YYYY')}, not saving anything. Pipeline exiting." 
+ ) pass else: fs.pipe( @@ -118,30 +126,31 @@ def __init__( bucket, api_url, form, - api_tablename, - bq_table_name, + api_tablename_suffix, + bq_table_name_suffix, **kwargs, ): """An operator that downloads all data from a BlackCat API and saves it as one JSON file hive-partitioned by date in Google Cloud - Storage (GCS). Each org's data will be in 1 row, and for each separate table in the API, - a nested column will hold all of it's data. + Storage (GCS). Each org's data will be in 1 row, and for each separate table in the API, + a nested column will hold all of it's data. Args: bucket (str): GCS bucket where the scraped BlackCat report will be saved. - api_url (str): The URL to hit that gets the data. - api_tablename (str): The table that should be extracted from the BlackCat API. + api_url (str): The URL to hit that gets the data. This is dynamically appended with the current year, so that + ... in 2023 it will pull data from the ".../2023" url and in 2024, ".../2024" etc. + api_tablename_suffix (str): The table that should be extracted from the BlackCat API. MUST MATCH THE API JSON EXACTLY - bq_table_name (str): The table name that will be given in BigQuery. Appears in the GCS bucket path and the filename. - form: the NTD form that this report belongs to. E.g., RR-20, A-10, etc. + bq_table_name_suffix (str): The table name that will be given in BigQuery. Appears in the GCS bucket path and the filename. + form: the NTD form that this report belongs to. E.g., RR-20, A-10, etc. 
Since it's all forms, here it's "all" """ self.bucket = bucket # Instantiating an instance of the BlackCatApiExtract() self.extract = BlackCatApiExtract( api_url=api_url, form=form, - api_tablename=api_tablename, - bq_table_name=bq_table_name, + api_tablename_suffix=api_tablename_suffix, + bq_table_name_suffix=bq_table_name_suffix, ) super().__init__(**kwargs) From ac912393df87732ace8d24e7cfe09224ee063390 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Wed, 6 Dec 2023 22:02:12 -0800 Subject: [PATCH 12/15] schema fixes --- .../external_table_all_ntdreports.yml | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml index a6f9c41dd1..748cbe1279 100644 --- a/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml +++ b/airflow/dags/create_external_tables/ntd_report_validation/external_table_all_ntdreports.yml @@ -24,9 +24,10 @@ schema_fields: - name: reportstatus type: STRING - name: reportlastmodifieddate - type: TIMESTAMP + type: INTEGER - name: ntdreportingstationsandmaintenance_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -52,6 +53,7 @@ schema_fields: type: TIMESTAMP - name: ntdtransitassetmanagementa15_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -77,16 +79,16 @@ schema_fields: type: FLOAT - name: ConditionAssessment type: FLOAT - - name: ConditionAssessment + - name: ConditionAssessmentDate type: TIMESTAMP - name: SectionOfLargerFacility type: BOOLEAN - name: Latitude - type: FLOAT + type: STRING - name: LatitudeDirection type: STRING - name: Longitude - type: FLOAT + type: STRING - name: LongitudeDirection type: STRING - name: SecondaryMode @@ -95,6 +97,7 @@ schema_fields: type: STRING - name: ntdassetandresourceinfo_data type: RECORD + mode: REPEATED fields: - 
name: Id type: INTEGER @@ -138,6 +141,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingp10_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -172,6 +176,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingp20_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -191,6 +196,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingp50_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -230,6 +236,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportinga35_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -248,7 +255,7 @@ schema_fields: - name: TotalVehicles type: STRING - name: UsefulLifeBenchmark - type: INTEGER + type: BOOLEAN - name: YearOfManufacture type: INTEGER - name: TransitAgencyCapitalResponsibility @@ -265,6 +272,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingrr20_intercity_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -290,6 +298,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingrr20_rural_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -325,6 +334,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingrr20_urban_tribal_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -344,6 +354,7 @@ schema_fields: type: TIMESTAMP - name: ntdreportingtamnarrative_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER @@ -367,6 +378,7 @@ schema_fields: type: TIMESTAMP - name: ss60_data type: RECORD + mode: REPEATED fields: - name: Id type: INTEGER From ac46ff1b07a4bd50b3bb419d2a0c76cd0e254903 Mon Sep 17 00:00:00 2001 From: Laurie Merrell Date: Thu, 7 Dec 2023 09:48:58 -0600 Subject: [PATCH 13/15] address linter issues --- .../ntd_report_from_blackcat/METADATA.yml | 2 +- .../dags/ntd_report_from_blackcat/README.md | 8 +- .../METADATA.yml | 2 +- .../ntd_report_publish_validation/README.md | 4 +- airflow/plugins/operators/__init__.py | 2 +- .../int_ntd_a30_voms_vins_totals.sql | 4 +- 
.../int_ntd_rr20_financial_fare_revenues.sql | 18 +- .../int_ntd_rr20_financial_specific_funds.sql | 14 +- .../int_ntd_rr20_financial_total_exp.sql | 2 +- .../int_ntd_rr20_service_alldata.sql | 11 +- .../int_ntd_rr20_service_ratios.py | 73 +++-- .../ntd_validation/int_ntd_validation.yml | 10 +- .../ntd_validation/fct_ntd_a30_vomscheck.sql | 6 +- .../fct_ntd_rr20_equal_totals_check.sql | 18 +- .../fct_ntd_rr20_funds_checks.sql | 74 ++--- .../fct_ntd_rr20_service_checks.py | 266 ++++++++++++------ .../ntd_validation/_src_api_externaltable.yml | 4 +- .../stg_ntd_2022_rr20_exp_by_mode.sql | 5 +- .../stg_ntd_2022_rr20_financial.sql | 5 +- .../stg_ntd_2022_rr20_service.sql | 5 +- .../ntd_validation/stg_ntd_2023_a10.sql | 6 +- .../stg_ntd_2023_a30_assetandresourceinfo.sql | 6 +- .../stg_ntd_2023_rr20_rural.sql | 6 +- .../stg_ntd_2023_rr20_urban_tribal.sql | 6 +- .../ntd_validation/stg_ntd_subrecipients.sql | 2 +- 25 files changed, 336 insertions(+), 223 deletions(-) diff --git a/airflow/dags/ntd_report_from_blackcat/METADATA.yml b/airflow/dags/ntd_report_from_blackcat/METADATA.yml index d8a59f4b6e..a6ee5e2843 100644 --- a/airflow/dags/ntd_report_from_blackcat/METADATA.yml +++ b/airflow/dags/ntd_report_from_blackcat/METADATA.yml @@ -15,4 +15,4 @@ default_args: pool: default_pool concurrency: 50 wait_for_defaults: - timeout: 3600 \ No newline at end of file + timeout: 3600 diff --git a/airflow/dags/ntd_report_from_blackcat/README.md b/airflow/dags/ntd_report_from_blackcat/README.md index 4189a8a187..18cd55f96d 100644 --- a/airflow/dags/ntd_report_from_blackcat/README.md +++ b/airflow/dags/ntd_report_from_blackcat/README.md @@ -2,8 +2,8 @@ Type: [Now|Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html) -This DAG orchestrates the publishing and storing of data, in the form of NTD report submissions, first pushing API data into Google Cloud Storage in the bucket `calitp-ntd-report-validation`. 
- -Another DAG (part of the `create_external_tables` existing DAG) reads the GCS data in BigQuery in the Cal-ITP data warehouse. The job will take the most recent file of each report type (which has all submitted reports by Caltrans 5311 subrecipients) and publish it into BigQuery `external` tables, if it is not yet there. This job uses the Cal-ITP existing infrastructure for creating external tables, outlined [here](https://docs.calitp.org/data-infra/architecture/data.html). +This DAG orchestrates the publishing and storing of data, in the form of NTD report submissions, first pushing API data into Google Cloud Storage in the bucket `calitp-ntd-report-validation`. -In the event of failure, the job can be rerun without backfilling. \ No newline at end of file +Another DAG (part of the `create_external_tables` existing DAG) reads the GCS data in BigQuery in the Cal-ITP data warehouse. The job will take the most recent file of each report type (which has all submitted reports by Caltrans 5311 subrecipients) and publish it into BigQuery `external` tables, if it is not yet there. This job uses the Cal-ITP existing infrastructure for creating external tables, outlined [here](https://docs.calitp.org/data-infra/architecture/data.html). + +In the event of failure, the job can be rerun without backfilling. 
diff --git a/airflow/dags/ntd_report_publish_validation/METADATA.yml b/airflow/dags/ntd_report_publish_validation/METADATA.yml index 0f97d22bff..f41c5cfe59 100644 --- a/airflow/dags/ntd_report_publish_validation/METADATA.yml +++ b/airflow/dags/ntd_report_publish_validation/METADATA.yml @@ -15,4 +15,4 @@ default_args: pool: default_pool concurrency: 50 wait_for_defaults: - timeout: 3600 \ No newline at end of file + timeout: 3600 diff --git a/airflow/dags/ntd_report_publish_validation/README.md b/airflow/dags/ntd_report_publish_validation/README.md index 28977df52d..33fda0cd15 100644 --- a/airflow/dags/ntd_report_publish_validation/README.md +++ b/airflow/dags/ntd_report_publish_validation/README.md @@ -2,6 +2,6 @@ Type: [Now|Scheduled](https://docs.calitp.org/data-infra/airflow/dags-maintenance.html) -This DAG orchestrates the publishing of NTD Report validation checks in the form of Excel files, that it saves into Google Cloud Storage. Checks conducted on submitted NTD report submissions, previously stored into BigQuery with dbt models. They are then converted to Excel files and saves in the Google Cloud Storage bucket `calitp-ntd-report-validation`. +This DAG orchestrates the publishing of NTD Report validation checks in the form of Excel files, that it saves into Google Cloud Storage. Checks conducted on submitted NTD report submissions, previously stored into BigQuery with dbt models. They are then converted to Excel files and saves in the Google Cloud Storage bucket `calitp-ntd-report-validation`. -In the event of failure, the job can be rerun without backfilling. \ No newline at end of file +In the event of failure, the job can be rerun without backfilling. 
diff --git a/airflow/plugins/operators/__init__.py b/airflow/plugins/operators/__init__.py index 39271c95c8..9ffc42acf9 100644 --- a/airflow/plugins/operators/__init__.py +++ b/airflow/plugins/operators/__init__.py @@ -1,9 +1,9 @@ # flake8: noqa from operators.airtable_to_gcs import AirtableToGCSOperator +from operators.blackcat_to_gcs import BlackCatApiToGCSOperator from operators.external_table import ExternalTable from operators.gtfs_csv_to_jsonl import GtfsGcsToJsonlOperator from operators.gtfs_csv_to_jsonl_hourly import GtfsGcsToJsonlOperatorHourly from operators.littlepay_raw_sync import LittlepayRawSync from operators.littlepay_to_jsonl import LittlepayToJSONL from operators.pod_operator import PodOperator -from operators.blackcat_to_gcs import BlackCatApiToGCSOperator diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql index 350fd6315c..a5df5e51a3 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_a30_voms_vins_totals.sql @@ -6,14 +6,14 @@ with voms_rr20 as ( select organization, fiscal_year, AVG(VOMX) as rr20_voms - FROM {{ ref('int_ntd_rr20_service_alldata') }} + FROM {{ ref('int_ntd_rr20_service_alldata') }} GROUP BY organization, fiscal_year ), vins_a30 as ( SELECT organization, api_report_period as fiscal_year, - COUNT (DISTINCT VIN) as a30_vin_n + COUNT(DISTINCT VIN) as a30_vin_n FROM {{ ref('stg_ntd_2023_a30_assetandresourceinfo') }} GROUP BY organization, fiscal_year ) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql index d29fc24b6d..cece952604 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql +++ 
b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_fare_revenues.sql @@ -1,7 +1,7 @@ --- need fare rev and upt for each year. +-- need fare rev and upt for each year. WITH fare_rev_2023 as ( - select + select organization, api_report_period as fiscal_year, item as mode, @@ -10,7 +10,7 @@ WITH fare_rev_2023 as ( WHERE type = "Fare Revenues" ), upt_2023 as ( - select + select organization, api_report_period as fiscal_year, item as mode, @@ -19,8 +19,12 @@ upt_2023 as ( WHERE type = "Service Data" ), all_2023 as ( - select fare_rev_2023.*, upt_2023.Annual_UPT - FROM fare_rev_2023 + select fare_rev_2023.organization, + fare_rev_2023.fiscal_year, + fare_rev_2023.mode, + fare_rev_2023.Fare_Revenues, + upt_2023.Annual_UPT + FROM fare_rev_2023 FULL OUTER JOIN upt_2023 ON fare_rev_2023.organization = upt_2023.organization AND fare_rev_2023.mode = upt_2023.mode @@ -33,7 +37,7 @@ fare_rev_2022 as ( GROUP BY organization, fiscal_year ), upt_2022 as ( - select + select Organization_Legal_Name as organization, Fiscal_Year as fiscal_year, Mode as mode, @@ -43,7 +47,7 @@ from {{ ref('stg_ntd_2022_rr20_service') }} all_2022 as ( select fare_rev_2022.organization, fare_rev_2022.fiscal_year, upt_2022.Mode, fare_rev_2022.Fare_Revenues, upt_2022.Annual_UPT - FROM fare_rev_2022 + FROM fare_rev_2022 FULL OUTER JOIN upt_2022 ON fare_rev_2022.organization = upt_2022.organization ) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql index c3af6aa0c9..768ca69c44 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_specific_funds.sql @@ -1,7 +1,7 @@ ------- --- NTD validation errors about these 1 specific funding sources. +-- NTD validation errors about these 1 specific funding sources. 
--- ID #s RR20F-070, RR20F-065, RR20F-068, RR20F-066, RR20F-013. Sums the capital expenses across all funding sources ---- In 2022 the data is a different format than 2023 **and onwards**. +--- In 2022 the data is a different format than 2023 **and onwards**. --- Only needed for the 2023 error checking (to compare to "last year"). In 2024 you don't need 2022 data. ------- @@ -16,9 +16,9 @@ WITH longform_2023 AS ( 'Other Directly Generated Funds', 'Other_Directly_Generated_Funds'), 'Local Funds', 'Local_Funds') as item FROM {{ ref('stg_ntd_2023_rr20_rural') }} - WHERE item LIKE "%Directly Generated Funds%" OR - item LIKE "%Formula Grants for Rural Areas%" OR - item LIKE "Local Funds" + WHERE item LIKE "%Directly Generated Funds%" + OR item LIKE "%Formula Grants for Rural Areas%" + OR item LIKE "Local Funds" ), wide_2023 AS ( SELECT * FROM @@ -33,11 +33,11 @@ data_2022 AS ( SUM(FTA_Formula_Grants_for_Rural_Areas_5311) as FTA_Formula_Grants_for_Rural_Areas_5311_2022, Null as Local_Funds_2022 FROM {{ ref('stg_ntd_2022_rr20_financial') }} - GROUP BY 1,2 + GROUP BY 1,2 -- noqa: L054 ORDER BY organization ) -select wide_2023.organization, +select wide_2023.organization, wide_2023.FTA_Formula_Grants_for_Rural_Areas_5311 as FTA_Formula_Grants_for_Rural_Areas_5311_2023, wide_2023.Other_Directly_Generated_Funds as Other_Directly_Generated_Funds_2023, wide_2023.Local_Funds as Local_Funds_2023, diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql index 06211cc877..164a3c8403 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_financial_total_exp.sql @@ -37,7 +37,7 @@ total_cap_exp_byfunds_2023 as ( group by organization, api_report_period ) -SELECT +SELECT total_operations_exp_2023.*, total_capital_exp_bymode_2023.Total_Annual_Cap_Expenses_byMode, 
total_operations_rev_2023.Total_Annual_Op_Revenues_Expended, diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql index cd44a46c2f..20a14257ad 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql @@ -6,10 +6,10 @@ --- We are *assuming* that data in 2024 and onwards will be the same format as 2023 --- If you get errors in 2024, check which columns may differ and read errors carefully. ----TO DO: insert parameter for loop, for each year, do what 2023 is doing, +---TO DO: insert parameter for loop, for each year, do what 2023 is doing, --- and at the end, add another union statement with data_2023 as ( - select + select organization, api_report_period as fiscal_year, item as mode, @@ -17,7 +17,6 @@ with data_2023 as ( CASE WHEN description = "Operating Expenses" THEN operations_expended WHEN description = "Capital Expenses" THEN capital_expended - ELSE Null END as Total_Annual_Expenses_By_Mode, annual_vehicle_rev_miles as Annual_VRM, annual_vehicle_rev_hours as Annual_VRH, @@ -29,7 +28,7 @@ with data_2023 as ( ), service2022 as ( - select + select Organization_Legal_Name as organization, Fiscal_Year as fiscal_year, Mode as mode, @@ -42,7 +41,7 @@ service2022 as ( ), expenses2022 as ( - select + select Organization_Legal_Name as organization, Fiscal_Year as fiscal_year, Operating_Capital as operating_capital, @@ -63,7 +62,7 @@ all_2022 as ( service2022.Sponsored_UPT, service2022.VOMX from service2022 -FULL OUTER JOIN expenses2022 +FULL OUTER JOIN expenses2022 ON service2022.organization = expenses2022.organization AND service2022.fiscal_year = expenses2022.fiscal_year AND service2022.mode = expenses2022.mode diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py 
b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py index ae44deb762..a20e7e16d1 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py @@ -1,19 +1,21 @@ - -import pyspark.sql.functions as F -import pandas as pd import logging -import pyspark + +import pandas as pd # noqa: F401 +import pyspark # noqa: F401 +import pyspark.sql.functions as F # noqa: F401 def write_to_log(logfilename): - ''' + """ Creates a logger object that outputs to a log file, to the filename specified, and also streams to console. - ''' + """ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) - formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', - datefmt='%y-%m-%d %H:%M:%S') + formatter = logging.Formatter( + f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541 + datefmt="%y-%m-%d %H:%M:%S", # noqa: F541 + ) file_handler = logging.FileHandler(logfilename) file_handler.setFormatter(formatter) stream_handler = logging.StreamHandler() @@ -25,6 +27,7 @@ def write_to_log(logfilename): return logger + def make_ratio_cols(df, numerator, denominator, col_name, logger, operation="sum"): if col_name is not None: # If a user specify a column name, use it @@ -32,40 +35,52 @@ def make_ratio_cols(df, numerator, denominator, col_name, logger, operation="sum if col_name in df.columns: logger.info(f"Dataframe already has column '{col_name}'") raise ValueError(f"Dataframe already has column '{col_name}'") - + else: _col_name = col_name - - if operation == "sum": - df = (df.groupby(['organization','mode', 'fiscal_year']) - .apply(lambda x: x.assign(**{_col_name: - lambda x: x[numerator].sum() / x[denominator]})) - ) + + if operation == "sum": + df = df.groupby(["organization", "mode", "fiscal_year"]).apply( + lambda x: x.assign( + **{_col_name: lambda x: x[numerator].sum() / x[denominator]} + ) + ) # else do not sum the numerator 
columns else: - df = (df.groupby(['organization','mode', 'fiscal_year']) - .apply(lambda x: x.assign(**{_col_name: - lambda x: x[numerator] / x[denominator]})) - ) + df = df.groupby(["organization", "mode", "fiscal_year"]).apply( + lambda x: x.assign(**{_col_name: lambda x: x[numerator] / x[denominator]}) + ) return df def model(dbt, session): # Set up the logger object - logger = write_to_log('rr20_servicechecks_log.log') - - #Load data from BigQuery - pass in the dbt model that we draw from. + logger = write_to_log("rr20_servicechecks_log.log") + + # Load data from BigQuery - pass in the dbt model that we draw from. allyears = dbt.ref("int_ntd_rr20_service_alldata") allyears = allyears.toPandas() # Calculate needed ratios, added as new columns - numeric_columns = allyears.select_dtypes(include=['number']).columns + numeric_columns = allyears.select_dtypes(include=["number"]).columns allyears[numeric_columns] = allyears[numeric_columns].fillna(0) - - allyears = make_ratio_cols(allyears, 'Total_Annual_Expenses_By_Mode', 'Annual_VRH', 'cost_per_hr', logger) - allyears = make_ratio_cols(allyears, 'Annual_VRM', 'VOMX', 'miles_per_veh', logger) - allyears = make_ratio_cols(allyears, 'Total_Annual_Expenses_By_Mode', 'Annual_UPT', 'fare_rev_per_trip', logger) - allyears = make_ratio_cols(allyears, 'Annual_VRM', 'Annual_VRH', 'rev_speed', logger, operation = "mean") - allyears = make_ratio_cols(allyears, 'Annual_UPT', 'Annual_VRH', 'trips_per_hr', logger, operation = "mean") + + allyears = make_ratio_cols( + allyears, "Total_Annual_Expenses_By_Mode", "Annual_VRH", "cost_per_hr", logger + ) + allyears = make_ratio_cols(allyears, "Annual_VRM", "VOMX", "miles_per_veh", logger) + allyears = make_ratio_cols( + allyears, + "Total_Annual_Expenses_By_Mode", + "Annual_UPT", + "fare_rev_per_trip", + logger, + ) + allyears = make_ratio_cols( + allyears, "Annual_VRM", "Annual_VRH", "rev_speed", logger, operation="mean" + ) + allyears = make_ratio_cols( + allyears, "Annual_UPT", 
"Annual_VRH", "trips_per_hr", logger, operation="mean" + ) return allyears diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml index 7ce803848c..aa8006667d 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_validation.yml @@ -2,18 +2,18 @@ version: 2 models: - name: int_ntd_rr20_financial_fare_revenues - description: | + description: | Setting up the RR-20 data for comparing fare revenues to previous year # tests: # - dbt_utils.expression_is_true: # expression: 'status != {{ guidelines_to_be_assessed_status() }}' # columns: - name: int_ntd_rr20_financial_specific_funds - description: | + description: | Setting up the RR-20 data for comparing specific funding sources - the 5311 funds, and Other directly generated funds For NTD validation error ID #s RR20F-070, RR20F-065, RR20F-068, RR20F-066, RR20F-013 - name: int_ntd_rr20_financial_total_exp - description: | + description: | Setting up the RR-20 data for comparing totals, for operating and capital expenses, reported in different ares of the RR-20 For NTD validation error ID #s RR20F-001OA, RR20F-001C, RR20F-182 - name: int_ntd_rr20_service_alldata @@ -21,9 +21,9 @@ models: Combines 2023 and 2022 data in preparation for doing NTD validation checks. The 2022 data was *not* from the API and so formatted differently We are *assuming* that data in 2024 and onwards will be the same format as 2023 - If you get errors in 2024, check which columns may differ and read errors carefully. + If you get errors in 2024, check which columns may differ and read errors carefully. 
- name: int_ntd_rr20_service_ratios description: | makes ratios for validation checks config: - materialized: table \ No newline at end of file + materialized: table diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql index 583a340914..8e26482b98 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_a30_vomscheck.sql @@ -4,18 +4,18 @@ WITH rr20f_180 as ( SELECT organization, "RR20F-180: VOMS across forms" as name_of_check, - CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) + CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) + CASE WHEN ROUND(rr20_voms, 1) > ROUND(a30_vin_n, 1) THEN "Total VOMS is greater than total A-30 vehicles reported. Please clarify." ELSE "VOMS & A-30 vehicles reported are equal to and/or lower than active inventory." END as description, CONCAT("RR-20 VOMS = ", CAST(ROUND(rr20_voms, 1) AS STRING), "# A-30 VINs = ", CAST(ROUND(a30_vin_n, 1) AS STRING)) AS value_checked, CURRENT_TIMESTAMP() AS date_checked - FROM {{ ref('int_ntd_a30_voms_vins_totals') }} + FROM {{ ref('int_ntd_a30_voms_vins_totals') }} ) SELECT * from rr20f_180 diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql index 7bc169dd25..ca9a74df74 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_equal_totals_check.sql @@ -1,35 +1,35 @@ --- We do identical CASE WHEN clauses in each CTE. 
The results determine 2 different column values but one can only specify 1 col/statement WITH rr20f_0010a as ( - select + select organization, "RR20F-001OA: equal totalsfor operating expenses" as name_of_check, - CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) + CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) + CASE WHEN (ROUND(Total_Annual_Op_Revenues_Expended,0) != ROUND(Total_Annual_Op_Expenses_by_Mode,0)) THEN "Total_Annual_Revenues_Expended should, but does not, equal Total_Annual_Expenses_by_Mode. Please provide a narrative justification." ELSE "" END as description, - CONCAT("Total_Annual_Revenues_Expended = $", CAST(ROUND(Total_Annual_Op_Revenues_Expended,0) AS STRING), + CONCAT("Total_Annual_Revenues_Expended = $", CAST(ROUND(Total_Annual_Op_Revenues_Expended,0) AS STRING), ",Total_Annual_Expenses_by_Mode = $", CAST(ROUND(Total_Annual_Op_Expenses_by_Mode,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked FROM {{ ref('int_ntd_rr20_financial_total_exp') }} -), +), rr20f_001c as( - select + select organization, "RR20F-001C: equal totals for capital expenses by mode and funding source expenditures" as name_of_check, - CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) + CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) + CASE WHEN (ROUND(Total_Annual_Cap_Expenses_byMode,0) != ROUND(Total_Annual_Cap_Expenses_byFunds,0)) THEN "The sum of Total Expenses for all modes for Uses of Capital does not equal the sum of all values entered for Directly Generated, 
Non-Federal and Federal Government Funds for Uses of Capital. Please revise or explain." ELSE "" END as description, - CONCAT("Total_Annual_Cap_Expenses_byMode = $", CAST(ROUND(Total_Annual_Cap_Expenses_byMode,0) AS STRING), + CONCAT("Total_Annual_Cap_Expenses_byMode = $", CAST(ROUND(Total_Annual_Cap_Expenses_byMode,0) AS STRING), ",Total_Annual_Cap_Expenses_byFunds = $", CAST(ROUND(Total_Annual_Cap_Expenses_byFunds,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked FROM {{ ref('int_ntd_rr20_financial_total_exp') }} diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql index 1e4c9ed6ca..72aac52e79 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_funds_checks.sql @@ -1,38 +1,38 @@ --- We do identical CASE WHEN clauses in each CTE. The results determine 2 different column values but one can only specify 1 col/statement WITH rr20f_070 as ( - select + select organization, "RR20F-070: 5311 Funds not reported" as name_of_check, - CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL + CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL + CASE WHEN ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023) = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL THEN "The ยง5311 program is not listed as a revenue source in your report, Please double check and provide a narrative justification." 
ELSE "" END AS description, CONCAT("2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked - from {{ ref('int_ntd_rr20_financial_specific_funds') }} + from {{ ref('int_ntd_rr20_financial_specific_funds') }} ), rr20f_066 as ( - select + select organization, "RR20F-066: change from zero" as name_of_check, - CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) + CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) OR - ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) + ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) + CASE WHEN ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL)) OR - ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND - 
(FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) + ((FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 = 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NULL)) THEN "FTA_Formula_Grants_for_Rural_Areas_5311 funding changed either from or to zero compared to last year. Please provide a narrative justification." ELSE "" END AS description, @@ -42,78 +42,78 @@ rr20f_066 as ( from {{ ref('int_ntd_rr20_financial_specific_funds') }} ), rr20f_065 as ( - select + select organization, "RR20F-065: 5311 Funds same value" as name_of_check, - CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) + CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) AND - (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) + CASE WHEN (FTA_Formula_Grants_for_Rural_Areas_5311_2023 != 0 OR FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2022 != 0 OR 
FTA_Formula_Grants_for_Rural_Areas_5311_2022 IS NOT NULL) + AND (FTA_Formula_Grants_for_Rural_Areas_5311_2023 = FTA_Formula_Grants_for_Rural_Areas_5311_2022) THEN "You have identical values for FTA_Formula_Grants_for_Rural_Areas_5311 funding in 2022 and 2023, which is unusual. Please provide a narrative justification." ELSE "" END AS description, CONCAT("2022 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2022,0) AS STRING), "2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked - from {{ ref('int_ntd_rr20_financial_specific_funds') }} + from {{ ref('int_ntd_rr20_financial_specific_funds') }} ), rr20f_013 as ( - select + select organization, "RR20F-013: Other Directly Generated Funds same value" as name_of_check, - CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) AND - (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) AND - (Other_Directly_Generated_Funds_2023 = Other_Directly_Generated_Funds_2022) + CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) + AND (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) + AND (Other_Directly_Generated_Funds_2023 = Other_Directly_Generated_Funds_2022) THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) AND - (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) AND - (Other_Directly_Generated_Funds_2023 = Other_Directly_Generated_Funds_2022) + CASE WHEN (Other_Directly_Generated_Funds_2023 != 0 OR Other_Directly_Generated_Funds_2023 IS NOT NULL) + AND (Other_Directly_Generated_Funds_2022 != 0 OR Other_Directly_Generated_Funds_2022 IS NOT NULL) + AND (Other_Directly_Generated_Funds_2023 = 
Other_Directly_Generated_Funds_2022) THEN "You have identical values for Other_Directly_Generated_Funds funding in 2022 and 2023, which is unusual. Please provide a narrative justification." ELSE "" END AS description, CONCAT("2022 = ", CAST(ROUND(Other_Directly_Generated_Funds_2022,0) AS STRING), "2023 = ", CAST(ROUND(Other_Directly_Generated_Funds_2023,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked - from {{ ref('int_ntd_rr20_financial_specific_funds') }} + from {{ ref('int_ntd_rr20_financial_specific_funds') }} ), rr20f_068 as ( - select + select organization, "RR20F-068: 5311 Funds rounded to thousand" as name_of_check, - CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL + CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL + CASE WHEN MOD(CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS INT),1000) = 0 AND FTA_Formula_Grants_for_Rural_Areas_5311_2023 IS NOT NULL THEN "FTA_Formula_Grants_for_Rural_Areas_5311 are rounded to the nearest thousand, but should be reported as exact values. Please double check and provide a narrative justification." 
ELSE "" END AS description, CONCAT("2023 = ", CAST(ROUND(FTA_Formula_Grants_for_Rural_Areas_5311_2023,0) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked - from {{ ref('int_ntd_rr20_financial_specific_funds') }} + from {{ ref('int_ntd_rr20_financial_specific_funds') }} ), rr20f_024 as ( - select + select organization, "RR20F-024: Local Funds rounded to thousand" as name_of_check, - CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL + CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL THEN "Fail" ELSE "Pass" END as check_status, - CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL + CASE WHEN MOD(CAST(ROUND(Local_Funds_2023) AS INT),1000) = 0 AND Local_Funds_2023 IS NOT NULL THEN "Local Funds are rounded to the nearest thousand, but should be reported as exact values. Please double check and provide a narrative justification." ELSE "" END AS description, CONCAT("2023 = ", CAST(ROUND(Local_Funds_2023) AS STRING)) as value_checked, CURRENT_TIMESTAMP() AS date_checked - from {{ ref('int_ntd_rr20_financial_specific_funds') }} + from {{ ref('int_ntd_rr20_financial_specific_funds') }} ) SELECT * FROM rr20f_070 diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py index 750fffb6f9..13d47fe797 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py @@ -1,18 +1,22 @@ -import pandas as pd import datetime import logging -##### TO_DO: see if the missing data check can still work or did we already fill it with zeros +import pandas as pd + +# TO_DO: see if the missing data check can still work or did we already fill it with zeros + def write_to_log(logfilename): - ''' + """ Creates a logger object that outputs to a log file, to the filename 
specified, and also streams to console. - ''' + """ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) - formatter = logging.Formatter(f'%(asctime)s:%(levelname)s: %(message)s', - datefmt='%y-%m-%d %H:%M:%S') + formatter = logging.Formatter( + f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541 + datefmt="%y-%m-%d %H:%M:%S", # noqa: F541 + ) file_handler = logging.FileHandler(logfilename) file_handler.setFormatter(formatter) stream_handler = logging.StreamHandler() @@ -26,52 +30,75 @@ def write_to_log(logfilename): def check_rr20_ratios(df, variable, threshold, this_year, last_year, logger): - '''Validation checks where a ratio must be within a certain threshold limit - compared to the previous year.''' - agencies = df['organization'].unique() + """Validation checks where a ratio must be within a certain threshold limit + compared to the previous year.""" + agencies = df["organization"].unique() output = [] for agency in agencies: - agency_df = df[df['organization']==agency] + agency_df = df[df["organization"] == agency] logger.info(f"Checking {agency} for {variable} info.") if len(agency_df) > 0: - # Check whether data for both years is present - if (len(agency_df[agency_df['fiscal_year']==this_year]) > 0) \ - & (len(agency_df[agency_df['fiscal_year']==last_year]) > 0): - - for mode in agency_df[(agency_df['fiscal_year']==this_year)]['mode'].unique(): - value_thisyr = (round(agency_df[(agency_df['mode']==mode) - & (agency_df['fiscal_year'] == this_year)] - [variable].unique()[0], 2)) - if len(agency_df[(agency_df['mode']==mode) & (agency_df['fiscal_year'] == last_year)][variable]) == 0: + if (len(agency_df[agency_df["fiscal_year"] == this_year]) > 0) & ( + len(agency_df[agency_df["fiscal_year"] == last_year]) > 0 + ): + for mode in agency_df[(agency_df["fiscal_year"] == this_year)][ + "mode" + ].unique(): + value_thisyr = round( + agency_df[ + (agency_df["mode"] == mode) + & (agency_df["fiscal_year"] == this_year) + ][variable].unique()[0], + 
2, + ) + if ( + len( + agency_df[ + (agency_df["mode"] == mode) + & (agency_df["fiscal_year"] == last_year) + ][variable] + ) + == 0 + ): value_lastyr = 0 else: - value_lastyr = (round(agency_df[(agency_df['mode']==mode) - & (agency_df['fiscal_year'] == last_year)] - [variable].unique()[0], 2)) - - if (value_lastyr == 0) and (abs(value_thisyr - value_lastyr) >= threshold): + value_lastyr = round( + agency_df[ + (agency_df["mode"] == mode) + & (agency_df["fiscal_year"] == last_year) + ][variable].unique()[0], + 2, + ) + + if (value_lastyr == 0) and ( + abs(value_thisyr - value_lastyr) >= threshold + ): result = "fail" check_name = f"{variable}" mode = mode - description = (f"The {variable} for {mode} has changed from last year by > = {threshold*100}%, please provide a narrative justification.") - elif (value_lastyr != 0) and abs((value_lastyr - value_thisyr)/value_lastyr) >= threshold: + description = f"The {variable} for {mode} has changed from last year by > = {threshold*100}%, please provide a narrative justification." + elif (value_lastyr != 0) and abs( + (value_lastyr - value_thisyr) / value_lastyr + ) >= threshold: result = "fail" check_name = f"{variable}" mode = mode - description = (f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%, please provide a narrative justification.") + description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%, please provide a narrative justification." 
else: result = "pass" check_name = f"{variable}" mode = mode description = "" - output_line = {"Organization": agency, - "name_of_check" : check_name, - "mode": mode, - "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", - "check_status": result, - "Description": description} + output_line = { + "Organization": agency, + "name_of_check": check_name, + "mode": mode, + "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", + "check_status": result, + "Description": description, + } output.append(output_line) else: logger.info(f"There is no data for {agency}") @@ -79,70 +106,115 @@ def check_rr20_ratios(df, variable, threshold, this_year, last_year, logger): return checks -def check_single_number(df, variable, this_year, last_year, logger, threshold=None,): - '''Validation checks where a single number must be within a certain threshold limit - compared to the previous year.''' - agencies = df['organization'].unique() +def check_single_number( + df, + variable, + this_year, + last_year, + logger, + threshold=None, +): + """Validation checks where a single number must be within a certain threshold limit + compared to the previous year.""" + agencies = df["organization"].unique() output = [] for agency in agencies: - - if len(df[df['organization']==agency]) > 0: + if len(df[df["organization"] == agency]) > 0: logger.info(f"Checking {agency} for {variable} info.") # Check whether data for both years is present, if so perform prior yr comparison. 
- if (len(df[(df['organization']==agency) & (df['fiscal_year']==this_year)]) > 0) \ - & (len(df[(df['organization']==agency) & (df['fiscal_year']==last_year)]) > 0): - - for mode in df[(df['organization'] == agency) & (df['fiscal_year']==this_year)]['mode'].unique(): - value_thisyr = (round(df[(df['organization'] == agency) - & (df['mode']==mode) - & (df['fiscal_year'] == this_year)] - [variable].unique()[0], 2)) + if ( + len( + df[ + (df["organization"] == agency) + & (df["fiscal_year"] == this_year) + ] + ) + > 0 + ) & ( + len( + df[ + (df["organization"] == agency) + & (df["fiscal_year"] == last_year) + ] + ) + > 0 + ): + for mode in df[ + (df["organization"] == agency) & (df["fiscal_year"] == this_year) + ]["mode"].unique(): + value_thisyr = round( + df[ + (df["organization"] == agency) + & (df["mode"] == mode) + & (df["fiscal_year"] == this_year) + ][variable].unique()[0], + 2, + ) # If there's no data for last yr: - if len(df[(df['organization'] == agency) - & (df['mode']==mode) - & (df['fiscal_year'] == last_year)][variable]) == 0: + if ( + len( + df[ + (df["organization"] == agency) + & (df["mode"] == mode) + & (df["fiscal_year"] == last_year) + ][variable] + ) + == 0 + ): value_lastyr = 0 else: - value_lastyr = (round(df[(df['organization'] == agency) - & (df['mode']==mode) - & (df['fiscal_year'] == last_year)] - [variable].unique()[0], 2)) - - if (round(value_thisyr)==0 and round(value_lastyr) != 0) | (round(value_thisyr)!=0 and round(value_lastyr) == 0): + value_lastyr = round( + df[ + (df["organization"] == agency) + & (df["mode"] == mode) + & (df["fiscal_year"] == last_year) + ][variable].unique()[0], + 2, + ) + + if (round(value_thisyr) == 0 and round(value_lastyr) != 0) | ( + round(value_thisyr) != 0 and round(value_lastyr) == 0 + ): result = "fail" check_name = f"{variable}" mode = mode - description = (f"The {variable} for {mode} has changed either from or to zero compared to last year. 
Please provide a narrative justification.") + description = f"The {variable} for {mode} has changed either from or to zero compared to last year. Please provide a narrative justification." # run only the above check on whether something changed from zero to non-zero, if no threshold is given - elif threshold==None: + elif threshold is None: result = "pass" check_name = f"{variable}" mode = mode description = "" pass # also check for pct change, if a threshold parameter is passed into function - elif (value_lastyr == 0) and (abs(value_thisyr - value_lastyr) >= threshold): + elif (value_lastyr == 0) and ( + abs(value_thisyr - value_lastyr) >= threshold + ): result = "fail" check_name = f"{variable}" mode = mode - description = (f"The {variable} for {mode} was 0 last year and has changed by > = {threshold*100}%, please provide a narrative justification.") - elif (value_lastyr != 0) and abs((value_lastyr - value_thisyr)/value_lastyr) >= threshold: + description = f"The {variable} for {mode} was 0 last year and has changed by > = {threshold*100}%, please provide a narrative justification." + elif (value_lastyr != 0) and abs( + (value_lastyr - value_thisyr) / value_lastyr + ) >= threshold: result = "fail" check_name = f"{variable}" mode = mode - description = (f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%; please provide a narrative justification.") + description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%; please provide a narrative justification." 
else: result = "pass" check_name = f"{variable}" mode = mode description = "" - output_line = {"Organization": agency, - "name_of_check" : check_name, - "mode": mode, - "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", - "check_status": result, - "Description": description} + output_line = { + "Organization": agency, + "name_of_check": check_name, + "mode": mode, + "value_checked": f"{this_year} = {value_thisyr}, {last_year} = {value_lastyr}", + "check_status": result, + "Description": description, + } output.append(output_line) else: logger.info(f"There is no data for {agency}") @@ -152,34 +224,54 @@ def check_single_number(df, variable, this_year, last_year, logger, threshold=No def model(dbt, session): # Set up the logger object - logger = write_to_log('rr20_ftc_servicechecks_log.log') + logger = write_to_log("rr20_ftc_servicechecks_log.log") - this_year=datetime.datetime.now().year - last_year = this_year-1 - this_date=datetime.datetime.now().date().strftime('%Y-%m-%d') #for suffix on Excel files + this_year = datetime.datetime.now().year + last_year = this_year - 1 + this_date = ( + datetime.datetime.now().date().strftime("%Y-%m-%d") + ) # for suffix on Excel files - #Load data from BigQuery - pass in the dbt model that we draw from. + # Load data from BigQuery - pass in the dbt model that we draw from. 
allyears = dbt.ref("int_ntd_rr20_service_ratios") allyears = allyears.toPandas() # Run validation checks - cph_checks = check_rr20_ratios(allyears, 'cost_per_hr', .30, this_year, last_year, logger) - mpv_checks = check_rr20_ratios(allyears, 'miles_per_veh', .20, this_year, last_year, logger) - vrm_checks = check_single_number(allyears, 'Annual_VRM', this_year, last_year, logger, threshold=.30) - frpt_checks = check_rr20_ratios(allyears, 'fare_rev_per_trip', .25, this_year, last_year, logger) - rev_speed_checks = check_rr20_ratios(allyears, 'rev_speed', .15, this_year, last_year, logger) - tph_checks = check_rr20_ratios(allyears, 'trips_per_hr', .30, this_year, last_year, logger) - voms0_check = check_single_number(allyears, 'VOMX', this_year, last_year, logger) + cph_checks = check_rr20_ratios( + allyears, "cost_per_hr", 0.30, this_year, last_year, logger + ) + mpv_checks = check_rr20_ratios( + allyears, "miles_per_veh", 0.20, this_year, last_year, logger + ) + vrm_checks = check_single_number( + allyears, "Annual_VRM", this_year, last_year, logger, threshold=0.30 + ) + frpt_checks = check_rr20_ratios( + allyears, "fare_rev_per_trip", 0.25, this_year, last_year, logger + ) + rev_speed_checks = check_rr20_ratios( + allyears, "rev_speed", 0.15, this_year, last_year, logger + ) + tph_checks = check_rr20_ratios( + allyears, "trips_per_hr", 0.30, this_year, last_year, logger + ) + voms0_check = check_single_number(allyears, "VOMX", this_year, last_year, logger) # Combine checks into one table - rr20_checks = pd.concat([cph_checks, mpv_checks, vrm_checks, - frpt_checks, rev_speed_checks, - tph_checks, voms0_check], - ignore_index=True).sort_values(by="Organization") + rr20_checks = pd.concat( + [ + cph_checks, + mpv_checks, + vrm_checks, + frpt_checks, + rev_speed_checks, + tph_checks, + voms0_check, + ], + ignore_index=True, + ).sort_values(by="Organization") logger.info(f"RR-20 service data checks conducted on {this_date} is complete!") - ## Part 2: send table to 
BigQuery + # Part 2: send table to BigQuery return rr20_checks - - diff --git a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml index 1dc10d2e19..346169b267 100644 --- a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml +++ b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml @@ -3,8 +3,8 @@ version: 2 sources: - name: ntd_report_validation description: | - Data from BlackCat API. Each org's data is be in 1 row, and for each separate table in the API, - a nested column holds all of it's data. + Data from BlackCat API. Each org's data is be in 1 row, and for each separate table in the API, + a nested column holds all of it's data. database: "{{ env_var('DBT_SOURCE_DATABASE', var('SOURCE_DATABASE')) }}" schema: external_blackcat tables: diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql index c2fbca6ae8..2c6bcd5a38 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_exp_by_mode.sql @@ -1,5 +1,6 @@ --- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future --- We pull these tables in to use them in later int and fct models -SELECT - * +-- TODO: enumerate columns +SELECT -- noqa: AM04 + * FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_expenses_by_mode` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql index 5f465071db..6afc02d872 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_financial.sql @@ -1,5 +1,6 @@ --- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future --- 
We pull these tables in to use them in later int and fct models -SELECT - * +-- TODO: enumerate columns +SELECT -- noqa: AM04 + * FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_financials__2` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql index 64c17c9b43..770028f71c 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2022_rr20_service.sql @@ -1,5 +1,6 @@ --- One-time data ingest of 2022 data, whose pattern which will not be repeated in the future --- We pull these tables in to use them in later int and fct models -SELECT - * +-- TODO: enumerate columns +SELECT -- noqa: AM04 + * FROM `cal-itp-data-infra.blackcat_raw.2022_rr20_service_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql index 6ecc277392..7f84f3af60 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql @@ -1,4 +1,4 @@ -SELECT +SELECT organization, reportstatus as api_report_status, TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, @@ -14,5 +14,5 @@ SELECT a10.DOLeasedByPublicAgency as do_leased_by_public_agency, a10.DOLeasedFromPrivateEntity as do_leased_from_private_entity, a10.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdreportingstationsandmaintenance_data`) as `a10` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, + UNNEST(`ntdreportingstationsandmaintenance_data`) as `a10` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql index 223c2e104a..5925b966e9 100644 --- 
a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql @@ -1,4 +1,4 @@ -SELECT +SELECT organization, reportstatus as api_report_status, TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, @@ -23,5 +23,5 @@ SELECT a30.ModesOperatedDisplayText as modes_operated_display_text, a30.ModesOperatedFullText as modes_operated_full_text, a30.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdassetandresourceinfo_data`) as `a30` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, + UNNEST(`ntdassetandresourceinfo_data`) as `a30` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql index aeaec900d7..1a49ff9b3a 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql @@ -1,4 +1,4 @@ -SELECT +SELECT organization, reportstatus as api_report_status, TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, @@ -19,5 +19,5 @@ SELECT ntdreportingrr20_rural_data.SponsoredServiceUPT as sponsored_service_upt, ntdreportingrr20_rural_data.Quantity as quantity, ntdreportingrr20_rural_data.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} -, UNNEST (`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, + UNNEST(`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql index 003c00b758..224860c383 100644 --- 
a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql @@ -1,4 +1,4 @@ -SELECT +SELECT organization, reportstatus as api_report_status, TIMESTAMP_MILLIS(reportlastmodifieddate) as api_report_last_modified_date, @@ -11,6 +11,6 @@ SELECT ntdreportingrr20_urban_tribal_data.CapitalExpended as capital_expended, ntdreportingrr20_urban_tribal_data.Description as description, ntdreportingrr20_urban_tribal_data.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }} +FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, -- `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` -, UNNEST (`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` + UNNEST(`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql index b935ffbd64..adde256f4d 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql @@ -1,3 +1,3 @@ SELECT - Organization as organization + Organization as organization FROM blackcat_raw.2023_organizations From 7eed7353b04ada31f4c0322585f9033dbf975541 Mon Sep 17 00:00:00 2001 From: Laurie Merrell Date: Thu, 7 Dec 2023 09:57:51 -0600 Subject: [PATCH 14/15] more linter --- .../ntd_validation/int_ntd_rr20_service_ratios.py | 4 ++-- .../ntd_validation/fct_ntd_rr20_service_checks.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py index a20e7e16d1..b4265d74d6 100644 --- 
a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py @@ -13,8 +13,8 @@ def write_to_log(logfilename): logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) formatter = logging.Formatter( - f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541 - datefmt="%y-%m-%d %H:%M:%S", # noqa: F541 + f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541, E231 + datefmt="%y-%m-%d %H:%M:%S", # noqa: F541, E231 ) file_handler = logging.FileHandler(logfilename) file_handler.setFormatter(formatter) diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py index 13d47fe797..3a92a04e4c 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py @@ -14,8 +14,8 @@ def write_to_log(logfilename): logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) formatter = logging.Formatter( - f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541 - datefmt="%y-%m-%d %H:%M:%S", # noqa: F541 + f"%(asctime)s:%(levelname)s: %(message)s", # noqa: F541, E231 + datefmt="%y-%m-%d %H:%M:%S", # noqa: F541, E231 ) file_handler = logging.FileHandler(logfilename) file_handler.setFormatter(formatter) @@ -77,14 +77,14 @@ def check_rr20_ratios(df, variable, threshold, this_year, last_year, logger): result = "fail" check_name = f"{variable}" mode = mode - description = f"The {variable} for {mode} has changed from last year by > = {threshold*100}%, please provide a narrative justification." + description = f"The {variable} for {mode} has changed from last year by > = {threshold * 100}%, please provide a narrative justification." 
elif (value_lastyr != 0) and abs( (value_lastyr - value_thisyr) / value_lastyr ) >= threshold: result = "fail" check_name = f"{variable}" mode = mode - description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%, please provide a narrative justification." + description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr) / value_lastyr) * 100, 1)}%, please provide a narrative justification." else: result = "pass" check_name = f"{variable}" @@ -193,14 +193,14 @@ def check_single_number( result = "fail" check_name = f"{variable}" mode = mode - description = f"The {variable} for {mode} was 0 last year and has changed by > = {threshold*100}%, please provide a narrative justification." + description = f"The {variable} for {mode} was 0 last year and has changed by > = {threshold * 100}%, please provide a narrative justification." elif (value_lastyr != 0) and abs( (value_lastyr - value_thisyr) / value_lastyr ) >= threshold: result = "fail" check_name = f"{variable}" mode = mode - description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr)/value_lastyr)*100, 1)}%; please provide a narrative justification." + description = f"The {variable} for {mode} has changed from last year by {round(abs((value_lastyr - value_thisyr) / value_lastyr) * 100, 1)}%; please provide a narrative justification." 
# noqa: E702 else: result = "pass" check_name = f"{variable}" From 91e7fad7b0242e1b8121df030d48ef3005f4fde5 Mon Sep 17 00:00:00 2001 From: Kim Engie Date: Thu, 7 Dec 2023 08:51:27 -0800 Subject: [PATCH 15/15] fix conflicts, final ratios corrections --- .../int_ntd_rr20_service_alldata.sql | 72 ++++++++++++--- .../int_ntd_rr20_service_ratios.py | 91 ++++++++++--------- .../fct_ntd_rr20_service_checks.py | 2 +- .../ntd_validation/_src_api_externaltable.yml | 2 +- .../ntd_validation/stg_ntd_2023_a10.sql | 4 +- .../stg_ntd_2023_a30_assetandresourceinfo.sql | 4 +- .../stg_ntd_2023_rr20_rural.sql | 4 +- .../stg_ntd_2023_rr20_urban_tribal.sql | 4 +- .../ntd_validation/stg_ntd_subrecipients.sql | 2 +- 9 files changed, 120 insertions(+), 65 deletions(-) diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql index 20a14257ad..0b40f6487c 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_alldata.sql @@ -8,25 +8,62 @@ ---TO DO: insert parameter for loop, for each year, do what 2023 is doing, --- and at the end, add another union statement -with data_2023 as ( +with service_2023 as ( select organization, api_report_period as fiscal_year, item as mode, - description as operating_capital, - CASE - WHEN description = "Operating Expenses" THEN operations_expended - WHEN description = "Capital Expenses" THEN capital_expended - END as Total_Annual_Expenses_By_Mode, annual_vehicle_rev_miles as Annual_VRM, annual_vehicle_rev_hours as Annual_VRH, annual_unlinked_pass_trips as Annual_UPT, sponsored_service_upt as Sponsored_UPT, annual_vehicle_max_service as VOMX from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE type = "Service Data" +), + +expenses_2023 as ( + select + organization, + api_report_period as fiscal_year, + item as mode, + operations_expended as 
Total_Annual_Expenses_By_Mode + from {{ ref('stg_ntd_2023_rr20_rural') }} WHERE type = "Expenses by Mode" ), +fare_rev_2023 as ( + select + organization, + api_report_period as fiscal_year, + sum(operations_expended) as Fare_Revenues + from {{ ref('stg_ntd_2023_rr20_rural') }} + WHERE type = "Fare Revenues" + GROUP BY organization, fiscal_year +), + +all_2023 as ( + SELECT DISTINCT + service_2023.organization, + service_2023.fiscal_year, + service_2023.mode, + expenses_2023.Total_Annual_Expenses_By_Mode, + service_2023.Annual_VRM, + service_2023.Annual_VRH, + service_2023.Annual_UPT, + service_2023.Sponsored_UPT, + service_2023.VOMX, + fare_rev_2023.Fare_Revenues + FROM service_2023 + FULL OUTER JOIN expenses_2023 + ON service_2023.organization = expenses_2023.organization + AND service_2023.fiscal_year = expenses_2023.fiscal_year + AND service_2023.mode = expenses_2023.mode + FULL OUTER JOIN fare_rev_2023 + ON service_2023.organization = fare_rev_2023.organization + AND service_2023.fiscal_year = fare_rev_2023.fiscal_year +), + service2022 as ( select Organization_Legal_Name as organization, @@ -44,32 +81,45 @@ expenses2022 as ( select Organization_Legal_Name as organization, Fiscal_Year as fiscal_year, - Operating_Capital as operating_capital, Mode as mode, Total_Annual_Expenses_By_Mode FROM {{ ref('stg_ntd_2022_rr20_exp_by_mode') }} + WHERE Operating_Capital = "Operating" +), + +fare_rev_2022 as ( + select + Organization_Legal_Name as organization, + Fiscal_Year as fiscal_year, + Fare_Revenues + FROM {{ ref('stg_ntd_2022_rr20_financial') }} + WHERE Operating_Capital = "Operating" ), all_2022 as ( - select service2022.organization, + SELECT DISTINCT + service2022.organization, service2022.fiscal_year, service2022.mode, - expenses2022.operating_capital, expenses2022.Total_Annual_Expenses_By_Mode, service2022.Annual_VRM, service2022.Annual_VRH, service2022.Annual_UPT, service2022.Sponsored_UPT, - service2022.VOMX + service2022.VOMX, + fare_rev_2022.Fare_Revenues 
from service2022 FULL OUTER JOIN expenses2022 ON service2022.organization = expenses2022.organization AND service2022.fiscal_year = expenses2022.fiscal_year AND service2022.mode = expenses2022.mode +INNER JOIN fare_rev_2022 + ON service2022.organization = fare_rev_2022.organization + AND service2022.fiscal_year = fare_rev_2022.fiscal_year ) select * FROM all_2022 UNION ALL -select * from data_2023 +select * from all_2023 diff --git a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py index b4265d74d6..c8e3227b2f 100644 --- a/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py +++ b/warehouse/models/intermediate/ntd_validation/int_ntd_rr20_service_ratios.py @@ -1,9 +1,5 @@ import logging -import pandas as pd # noqa: F401 -import pyspark # noqa: F401 -import pyspark.sql.functions as F # noqa: F401 - def write_to_log(logfilename): """ @@ -28,31 +24,6 @@ def write_to_log(logfilename): return logger -def make_ratio_cols(df, numerator, denominator, col_name, logger, operation="sum"): - if col_name is not None: - # If a user specify a column name, use it - # Raise error if the column already exists - if col_name in df.columns: - logger.info(f"Dataframe already has column '{col_name}'") - raise ValueError(f"Dataframe already has column '{col_name}'") - - else: - _col_name = col_name - - if operation == "sum": - df = df.groupby(["organization", "mode", "fiscal_year"]).apply( - lambda x: x.assign( - **{_col_name: lambda x: x[numerator].sum() / x[denominator]} - ) - ) - # else do not sum the numerator columns - else: - df = df.groupby(["organization", "mode", "fiscal_year"]).apply( - lambda x: x.assign(**{_col_name: lambda x: x[numerator] / x[denominator]}) - ) - return df - - def model(dbt, session): # Set up the logger object logger = write_to_log("rr20_servicechecks_log.log") @@ -60,27 +31,61 @@ def model(dbt, session): # Load data from BigQuery - 
pass in the dbt model that we draw from. allyears = dbt.ref("int_ntd_rr20_service_alldata") allyears = allyears.toPandas() + logger.info("Service data loaded!") # Calculate needed ratios, added as new columns numeric_columns = allyears.select_dtypes(include=["number"]).columns - allyears[numeric_columns] = allyears[numeric_columns].fillna(0) + allyears[numeric_columns] = allyears[numeric_columns].fillna( + value=0, inplace=False, axis=1 + ) - allyears = make_ratio_cols( - allyears, "Total_Annual_Expenses_By_Mode", "Annual_VRH", "cost_per_hr", logger + # Cost per hr + allyears2 = ( + allyears.groupby(["organization", "mode", "fiscal_year"], dropna=False) + .apply( + lambda x: x.assign( + cost_per_hr=x["Total_Annual_Expenses_By_Mode"] / x["Annual_VRH"] + ) + ) + .reset_index(drop=True) ) - allyears = make_ratio_cols(allyears, "Annual_VRM", "VOMX", "miles_per_veh", logger) - allyears = make_ratio_cols( - allyears, - "Total_Annual_Expenses_By_Mode", - "Annual_UPT", - "fare_rev_per_trip", - logger, + # Miles per vehicle + allyears2 = ( + allyears2.groupby(["organization", "mode", "fiscal_year"], dropna=False) + .apply( + lambda x: x.assign( + miles_per_veh=lambda x: x["Annual_VRM"].sum() / x["VOMX"] + ) + ) + .reset_index(drop=True) ) - allyears = make_ratio_cols( - allyears, "Annual_VRM", "Annual_VRH", "rev_speed", logger, operation="mean" + # Fare revenues + allyears2 = ( + allyears2.groupby(["organization", "mode", "fiscal_year"], dropna=False) + .apply( + lambda x: x.assign( + fare_rev_per_trip=lambda x: x["Fare_Revenues"].sum() / x["Annual_UPT"] + ) + ) + .reset_index(drop=True) ) - allyears = make_ratio_cols( - allyears, "Annual_UPT", "Annual_VRH", "trips_per_hr", logger, operation="mean" + # Revenue Speed + allyears2 = ( + allyears2.groupby(["organization", "mode", "fiscal_year"], dropna=False) + .apply( + lambda x: x.assign(rev_speed=lambda x: x["Annual_VRM"] / x["Annual_VRH"]) + ) + .reset_index(drop=True) ) + # Trips per hr + allyears2 = ( + 
allyears2.groupby(["organization", "mode", "fiscal_year"], dropna=False) + .apply( + lambda x: x.assign(trips_per_hr=lambda x: x["Annual_UPT"] / x["Annual_VRH"]) + ) + .reset_index(drop=True) + ) + + logger.info("Ratios calculated!") return allyears diff --git a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py index 3a92a04e4c..adc4edf1c6 100644 --- a/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py +++ b/warehouse/models/mart/ntd_validation/fct_ntd_rr20_service_checks.py @@ -273,5 +273,5 @@ def model(dbt, session): logger.info(f"RR-20 service data checks conducted on {this_date} is complete!") - # Part 2: send table to BigQuery + # Send table to BigQuery return rr20_checks diff --git a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml index 346169b267..6d7ef7d398 100644 --- a/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml +++ b/warehouse/models/staging/ntd_validation/_src_api_externaltable.yml @@ -8,4 +8,4 @@ sources: database: "{{ env_var('DBT_SOURCE_DATABASE', var('SOURCE_DATABASE')) }}" schema: external_blackcat tables: - - name: all_2023_ntdreports + - name: all_ntdreports diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql index 7f84f3af60..f6e2c33054 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a10.sql @@ -14,5 +14,5 @@ SELECT a10.DOLeasedByPublicAgency as do_leased_by_public_agency, a10.DOLeasedFromPrivateEntity as do_leased_from_private_entity, a10.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, - UNNEST(`ntdreportingstationsandmaintenance_data`) as `a10` +FROM {{ source('ntd_report_validation', 'all_ntdreports') }}, + 
UNNEST(`ntdreportingstationsandmaintenance_data`) as `a10` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql index 5925b966e9..bf685f194a 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_a30_assetandresourceinfo.sql @@ -23,5 +23,5 @@ SELECT a30.ModesOperatedDisplayText as modes_operated_display_text, a30.ModesOperatedFullText as modes_operated_full_text, a30.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, - UNNEST(`ntdassetandresourceinfo_data`) as `a30` +FROM {{ source('ntd_report_validation', 'all_ntdreports') }}, + UNNEST(`ntdassetandresourceinfo_data`) as `a30` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql index 1a49ff9b3a..f40457e2f0 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_rural.sql @@ -19,5 +19,5 @@ SELECT ntdreportingrr20_rural_data.SponsoredServiceUPT as sponsored_service_upt, ntdreportingrr20_rural_data.Quantity as quantity, ntdreportingrr20_rural_data.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, - UNNEST(`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` +FROM {{ source('ntd_report_validation', 'all_ntdreports') }}, + UNNEST(`ntdreportingrr20_rural_data`) as `ntdreportingrr20_rural_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql index 224860c383..02b61c3728 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql +++ 
b/warehouse/models/staging/ntd_validation/stg_ntd_2023_rr20_urban_tribal.sql @@ -11,6 +11,6 @@ SELECT ntdreportingrr20_urban_tribal_data.CapitalExpended as capital_expended, ntdreportingrr20_urban_tribal_data.Description as description, ntdreportingrr20_urban_tribal_data.LastModifiedDate as last_modified_date -FROM {{ source('ntd_report_validation', 'all_2023_ntdreports') }}, +FROM {{ source('ntd_report_validation', 'all_ntdreports') }}, -- `cal-itp-data-infra-staging.external_blackcat.all_2023_ntdreports` - UNNEST(`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` + UNNEST(`ntdreportingrr20_urban_tribal_data`) as `ntdreportingrr20_urban_tribal_data` diff --git a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql index adde256f4d..c0282b37b4 100644 --- a/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql +++ b/warehouse/models/staging/ntd_validation/stg_ntd_subrecipients.sql @@ -1,3 +1,3 @@ SELECT Organization as organization -FROM blackcat_raw.2023_organizations +FROM `cal-itp-data-infra.blackcat_raw.2023_organizations`