From ed034bb5ddd76cc43011b89d045e950307eb30b5 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 9 Oct 2024 16:02:26 -0300 Subject: [PATCH 01/56] adding gypscie preprocessing on flow --- .../precipitacao_alertario/flows.py | 123 +++- .../precipitacao_alertario/tasks_gypscie.py | 605 ++++++++++++++++++ .../precipitacao_alertario/util_gypscie.py | 162 +++++ 3 files changed, 887 insertions(+), 3 deletions(-) create mode 100644 pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py create mode 100644 pipelines/meteorologia/precipitacao_alertario/util_gypscie.py diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index bf92102f..6c727787 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# pylint: disable=C0103 +# pylint: disable=C0103, line-too-long """ Flows for precipitacao_alertario. """ @@ -13,6 +13,7 @@ from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 create_table_and_upload_to_gcs, + get_now_datetime, task_run_dbt_model_task, ) @@ -31,11 +32,27 @@ from pipelines.rj_escritorio.rain_dashboard.constants import ( constants as rain_dashboard_constants, ) -from pipelines.utils.constants import constants as utils_constants +# from pipelines.utils.constants import constants as utils_constants from pipelines.utils.custom import wait_for_flow_run_with_timeout -from pipelines.utils.dump_db.constants import constants as dump_db_constants +# from pipelines.utils.dump_db.constants import constants as dump_db_constants from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants +# preprocessing imports +from pipelines.precipitation_model.rionowcast.tasks import ( # pylint: disable=E0611, E0401 + access_api, + add_columns_on_dfr, + download_datasets_from_gypscie, + execute_dataset_processor, + get_dataset_info, + get_dataset_processor_info, + path_to_dfr, + register_dataset_on_gypscie, + task_wait_run, +) +from pipelines.tasks import ( # pylint: disable=E0611, E0401 + task_create_partitions, +) + wait_for_flow_run_with_5min_timeout = wait_for_flow_run_with_timeout(timeout=timedelta(minutes=5)) with Flow( @@ -73,6 +90,38 @@ default=dump_to_gcs_constants.MAX_BYTES_PROCESSED_PER_TABLE.value, ) + # Gypscie parameters + environment_id = Parameter("environment_id", default=1, required=False) + domain_id = Parameter("domain_id", default=1, required=False) + project_id = Parameter("project_id", default=1, required=False) + project_name = Parameter("project_name", default="rionowcast_precipitation", required=False) + + # Gypscie processor parameters + processor_name = Parameter("processor_name", default="etl_alertario22", required=True) + dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar + + # Parameters for saving data on GCP + materialize_after_dump = Parameter("materialize_after_dump", default=False, required=False) + dump_mode = Parameter("dump_mode", default=False, required=False) + dataset_id_previsao_chuva = Parameter( + "dataset_id", default="clima_previsao_chuva", required=False + ) + table_id_previsao_chuva = Parameter( + "table_id", default="preprocessamento_pluviometro_alertario", required=False + ) + + # Dataset parameters + station_type = Parameter("station_type", default="pluviometro", required=False) + 
source = Parameter("source", default="alertario", required=False) + + # Dataset path, if it was saved on ETL flow or it will be None + dataset_path = Parameter("dataset_path", default=None, required=False) # dataset_path + model_version = Parameter("model_version", default=1, required=False) + + ######################### + # Start alertario flow # + ######################### + dfr_pluviometric, dfr_meteorological = download_data() (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, @@ -366,6 +415,74 @@ # raise_final_state=True, # ) + #################################### + # Start preprocessing flow # + #################################### + + api = access_api() + + dataset_info = get_dataset_info(station_type, source) + + # Get processor information on gypscie + with case(dataset_processor_id, None): + dataset_processor_response, dataset_processor_id = get_dataset_processor_info( + api, processor_name + ) + + dataset_response = register_dataset_on_gypscie(api, filepath=dataset_path, domain_id=domain_id) + # TODO: verifcar no codigo do augustp se são esses os parametros corretos + processor_parameters = { + "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], + "station_type": station_type, + } + + dataset_processor_task_id = execute_dataset_processor( + api, + processor_id=dataset_processor_id, + dataset_id=[dataset_response["id"]], + environment_id=environment_id, + project_id=project_id, + parameters=processor_parameters, + ) + wait_run = task_wait_run(api, dataset_processor_task_id, flow_type="processor") + dataset_path = download_datasets_from_gypscie( + api, dataset_names=[dataset_response["id"]], wait=wait_run + ) + dfr_ = path_to_dfr(dataset_path) + # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) + dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + + # Save pre-treated data on local file with partitions + now_datetime = get_now_datetime() + prediction_data_path = task_create_partitions( + dfr, + partition_date_column=dataset_info["partition_date_column"], + savepath="model_prediction", + suffix=now_datetime, + ) + ################################ + # Save preprocessing on GCP # + ################################ + + # Upload data to BigQuery + create_table = create_table_and_upload_to_gcs( + data_path=prediction_data_path, + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + dump_mode=dump_mode, + biglake_table=False, + ) + + # Trigger DBT flow run + with case(materialize_after_dump, True): + run_dbt = task_run_dbt_model_task( + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + # mode=materialization_mode, + # materialize_to_datario=materialize_to_datario, + ) + run_dbt.set_upstream(create_table) + # para rodar na cloud cor_meteorologia_precipitacao_alertario.storage = GCS(constants.GCS_FLOWS_BUCKET.value) cor_meteorologia_precipitacao_alertario.run_config = KubernetesRun( diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py b/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py new file mode 100644 index 00000000..840540f0 --- /dev/null +++ b/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py @@ -0,0 +1,605 @@ +# -*- coding: utf-8 -*- +""" +Tasks +""" +import datetime +import os +from pathlib import Path +from time import sleep +from typing import Dict, List + +import numpy as np +import pandas as pd + +from basedosdados.upload.base import Base +from google.cloud 
import bigquery +from prefect import task +from prefect.engine.signals import ENDRUN +from prefect.engine.state import Failed +from prefeitura_rio.pipelines_utils.infisical import get_secret +from prefeitura_rio.pipelines_utils.logging import log +from requests.exceptions import HTTPError + +from pipelines.constants import constants # pylint: disable=E0611, E0401 +from pipelines.precipitation_model.rionowcast.utils import ( # pylint: disable=E0611, E0401 + GypscieApi, + wait_run, +) + + +# noqa E302, E303 +@task() +def access_api(): + """# noqa E303 + Acess api and return it to be used in other requests + """ + infisical_username = constants.INFISICAL_USERNAME.value + infisical_password = constants.INFISICAL_PASSWORD.value + + # username = get_secret(secret_name="USERNAME", path="/gypscie", environment="prod") + # password = get_secret(secret_name="PASSWORD", path="/gypscie", environment="prod") + + username = get_secret(infisical_username, path="/gypscie")[infisical_username] + password = get_secret(infisical_password, path="/gypscie")[infisical_password] + api = GypscieApi(username=username, password=password) + + return api + + +@task() +def get_billing_project_id( + bd_project_mode: str = "prod", + billing_project_id: str = None, +) -> str: + """ + Get billing project id + """ + if not billing_project_id: + log("Billing project ID was not provided, trying to get it from environment variable") + try: + bd_base = Base() + billing_project_id = bd_base.config["gcloud-projects"][bd_project_mode]["name"] + log(f"Billing project ID was inferred from environment variables: {billing_project_id}") + except KeyError: + pass + if not billing_project_id: + raise ValueError( + "billing_project_id must be either provided or inferred from environment variables" + ) + log(f"Billing project ID: {billing_project_id}") + return billing_project_id + + +def download_data_from_bigquery(query: str, billing_project_id: str) -> pd.DataFrame: + """ADD""" + # pylint: disable=E1124, protected-access + # client = google_client(billing_project_id, from_file=True, reauth=False) + # job_config = bigquery.QueryJobConfig() + # # job_config.dry_run = True + + # # Get data + # log("Querying data from BigQuery") + # job = client["bigquery"].query(query, job_config=job_config) + # https://github.com/prefeitura-rio/pipelines_rj_iplanrio/blob/ecd21c727b6f99346ef84575608e560e5825dd38/pipelines/painel_obras/dump_data/tasks.py#L39 + bq_client = bigquery.Client( + credentials=Base(bucket_name="rj-cor")._load_credentials(mode="prod"), + project=billing_project_id, + ) + job = bq_client.query(query) + while not job.done(): + sleep(1) + + # Get data + # log("Querying data from BigQuery") + # job = client["bigquery"].query(query) + # while not job.done(): + # sleep(1) + log("Getting result from query") + results = job.result() + log("Converting result to pandas dataframe") + dfr = results.to_dataframe() + log("End download data from bigquery") + return dfr + + +@task() +def register_dataset_on_gypscie(api, filepath: Path, domain_id: int = 1) -> Dict: + """ + Register dataset on gypscie and return its informations like id. + Obs: dataset name must be unique. 
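+    (To keep names unique, the code below appends a %Y%m%d%H%M%S timestamp to the dataset name before registering it.)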
+ Return: + { + 'domain': + { + 'description': 'This project has the objective to create nowcasting models.', + 'id': 1, + 'name': 'rionowcast_precipitation' + }, + 'file_type': 'csv', + 'id': 18, + 'name': 'rain_gauge_to_model', + 'register': '2024-07-02T19:20:32.507744', + 'uri': 'http://gypscie.dados.rio/api/download/datasets/rain_gauge_to_model.zip' + } + """ + log(f"\nStart registring dataset by sending {filepath} Data to Gypscie") + + data = { + "domain_id": domain_id, + "name": str(filepath).split("/")[-1].split(".")[0] + + "_" + + datetime.datetime.now().strftime("%Y%m%d%H%M%S"), # pylint: disable=use-maxsplit-arg + } + log(type(data), data) + files = { + "files": open(file=filepath, mode="rb"), # pylint: disable=consider-using-with + } + + response = api.post(path="datasets", data=data, files=files) + + log(f"register_dataset_on_gypscie response: {response} and response.json(): {response.json()}") + return response.json() + + +@task(nout=2) +def get_dataset_processor_info(api, processor_name: str): + """ + Geting dataset processor information + """ + log(f"Getting dataset processor info for {processor_name}") + dataset_processors_response = api.get( + path="dataset_processors", + ) + + # log(dataset_processors_response) + dataset_processor_id = None + for response in dataset_processors_response: + if response.get("name") == processor_name: + dataset_processor_id = response["id"] + # log(response) + # log(response["id"]) + return dataset_processors_response, dataset_processor_id + + # if not dataset_processor_id: + # log(f"{processor_name} not found. Try adding it.") + + +@task() +# pylint: disable=too-many-arguments +def execute_dataset_processor( + api, + processor_id: int, + dataset_id: list, # como pegar os vários datasets + environment_id: int, + project_id: int, + parameters: dict + # adicionar campos do dataset_processor +) -> List: + """ + Requisição de execução de um DatasetProcessor + """ + log("\nStarting executing dataset processing") + + task_response = api.post( + path="processor_run", + json={ + "dataset_id": dataset_id, + "environment_id": environment_id, + "parameters": parameters, + "processor_id": processor_id, + "project_id": project_id, + }, + ) + # task_response = {'task_id': '227e74bc-0057-4e63-a30f-8374604e442b'} + + # response = wait_run(api, task_response.json()) + + # if response["state"] != "SUCCESS": + # failed_message = "Error processing this dataset. Stop flow or restart this task" + # log(failed_message) + # task_state = Failed(failed_message) + # raise ENDRUN(state=task_state) + + # output_datasets = response["result"]["output_datasets"] # returns a list with datasets + # log(f"\nFinish executing dataset processing, we have {len(output_datasets)} datasets") + # return output_datasets + return task_response.json(["task_id"]) + + +@task() +def predict(api, model_id: int, dataset_id: int, project_id: int) -> dict: + """ + Requisição de execução de um processo de Predição + """ + print("Starting prediction") + response = api.post( + path="predict", + data={ + "model_id": model_id, + "dataset_id": dataset_id, + "project_id": project_id, + }, + ) + print(f"Prediction ended. 
Response: {response}, {response.json()}") + return response.json() + + +def calculate_start_and_end_date( + hours_from_past: int, +) -> tuple[datetime.datetime, datetime.datetime]: + """ + Calculates the start and end date based on the hours from past + """ + end_date = datetime.datetime.now() + start_date = end_date - datetime.timedelta(hours=hours_from_past) + return start_date, end_date + + +@task() +def query_data_from_gcp( # pylint: disable=too-many-arguments + dataset_id: str, + table_id: str, + billing_project_id: str, + start_date: str = None, + end_date: str = None, + save_format: str = "csv", +) -> Path: + """ + Download historical data from source. + format: csv or parquet + """ + log(f"Start downloading {dataset_id}.{table_id} data") + + directory_path = Path("data/input/") + if not os.path.exists(directory_path): + os.makedirs(directory_path) + + savepath = directory_path / f"{dataset_id}_{table_id}" # TODO: + + # pylint: disable=consider-using-f-string + # noqa E262 + query = """ + SELECT + * + FROM rj-cor.{}.{} + """.format( + dataset_id, + table_id, + ) + + # pylint: disable=consider-using-f-string + if start_date: + filter_query = """ + WHERE data_particao BETWEEN '{}' AND '{}' + """.format( + start_date, end_date + ) + query += filter_query + + log(f"Query used to download data:\n{query}") + + dfr = download_data_from_bigquery(query=query, billing_project_id=billing_project_id) + if save_format == "csv": + dfr.to_csv(f"{savepath}.csv", index=False) + elif save_format == "parquet": + dfr.to_parquet(f"{savepath}.parquet", index=False) + # bd.download(savepath=savepath, query=query, billing_project_id=billing_project_id) + + log(f"{table_id} data saved on {savepath}") + return savepath + + +@task() +def execute_prediction_on_gypscie( + api, + model_params: dict, + # hours_to_predict, +) -> str: + """ + Requisição de execução de um processo de Predição + Return task_id + """ + log("Starting prediction") + task_response = api.post( + path="workflow_run", + json=model_params, + ) + # data={ + # "model_id": model_id, + # "dataset_id": dataset_id, + # "project_id": project_id, + # }, + response = wait_run(api, task_response.json()) + + if response["state"] != "SUCCESS": + failed_message = "Error processing this dataset. Stop flow or restart this task" + log(failed_message) + task_state = Failed(failed_message) + raise ENDRUN(state=task_state) + + print(f"Prediction ended. Response: {response}, {response.json()}") + # TODO: retorna a predição? o id da do dataset? 
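+    # By this point wait_run() has returned a SUCCESS state; any other state
+    # raises ENDRUN above, so only successful workflow runs reach the return below.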
+ + return response.json().get("task_id") # response.json().get('task_id') + + +@task +def task_wait_run(api, task_response, flow_type: str = "dataflow") -> Dict: + """ + Force flow wait for the end of data processing + flow_type: dataflow or processor + """ + return wait_run(api, task_response, flow_type) + + +@task +def get_dataflow_params( # pylint: disable=too-many-arguments + workflow_id, + environment_id, + project_id, + load_data_funtion_id, + pre_processing_function_id, + model_function_id, + radar_data_id, + rain_gauge_data_id, + grid_data_id, + model_data_id, +) -> List: + """ + Return parameters for the model + + { + "workflow_id": 36, + "environment_id": 1, + "parameters": [ + { + "function_id":42, + "params": {"radar_data_path":178, "rain_gauge_data_path":179, "grid_data_path":177} + }, + { + "function_id":43 + }, + { + "function_id":45, + "params": {"model_path":191} # model was registered on Gypscie as a dataset + } + ], + "project_id": 1 + } + """ + return { + "workflow_id": workflow_id, + "environment_id": environment_id, + "parameters": [ + { + "function_id": load_data_funtion_id, + "params": { + "radar_data_path": radar_data_id, + "rain_gauge_data_path": rain_gauge_data_id, + "grid_data_path": grid_data_id, + }, + }, + { + "function_id": pre_processing_function_id, + }, + {"function_id": model_function_id, "params": {"model_path": model_data_id}}, + ], + "project_id": project_id, + } + + +@task() +def get_output_dataset_ids_on_gypscie( + api, + task_id, +) -> List: + """ + Get output files id with predictions + """ + try: + response = api.get(path="status_workflow_run/" + task_id) + response = response.json() + except HTTPError as err: + if err.response.status_code == 404: + print(f"Task {task_id} not found") + return [] + + return response.get("output_datasets") + + +@task() +def download_datasets_from_gypscie( + api, + dataset_names: List, + wait=None, +) -> List: + """ + Get output files with predictions + """ + for file_name in dataset_names: + response = api.get(path=f"download/datasets/{file_name}.zip") + if response.status_code == 200: + log(f"Dataset {file_name} downloaded") + else: + log(f"Dataset {file_name} not found on Gypscie") + # TODO: verificar se o arquivo é .zip mesmo + return [dataset_name + ".zip" for dataset_name in dataset_names] + + +@task +def desnormalize_data(array: np.ndarray): + """ + Desnormalize data + + Inputs: + array: numpy array + Returns: + a numpy array with the values desnormalized + """ + return array + + +@task +def geolocalize_data(prediction_datasets: np.ndarray, now_datetime: str) -> pd.DataFrame: + """ + Geolocalize data using grid and add timestamp + + Inputs: + prediction_datasets: numpy array + now_datetime: string in format YYYY_MM_DD__H_M_S + Returns: + a pandas dataframe to be saved on GCP + Expected columns: latitude, longitude, janela_predicao, + valor_predicao, data_predicao (timestamp em que foi realizada a previsão) + """ + return prediction_datasets + + +@task +def create_image(data) -> List: + """ + Create image using Geolocalized data or the numpy array from desnormalized_data function + Exemplo de código que usei pra gerar uma imagem vindo de um xarray: + + def create_and_save_image(data: xr.xarray, variable) -> Path: + plt.figure(figsize=(10, 10)) + + # Use the Geostationary projection in cartopy + axis = plt.axes(projection=ccrs.PlateCarree()) + + lat_max, lon_max = ( + -21.708288842894145, + -42.36573106186053, + ) # canto superior direito + lat_min, lon_min = ( + -23.793855217170343, + -45.04488171189226, + ) 
# canto inferior esquerdo + + extent = [lon_min, lat_min, lon_max, lat_max] + img_extent = [extent[0], extent[2], extent[1], extent[3]] + + # Define the color scale based on the channel + colormap = "jet" # White to black for IR channels + + # Plot the image + img = axis.imshow(data, origin="upper", extent=img_extent, cmap=colormap, alpha=0.8) + + # Add coastlines, borders and gridlines + axis.coastlines(resolution='10m', color='black', linewidth=0.8) + axis.add_feature(cartopy.feature.BORDERS, edgecolor='white', linewidth=0.5) + + + grdln = axis.gridlines( + crs=ccrs.PlateCarree(), + color="gray", + alpha=0.7, + linestyle="--", + linewidth=0.7, + xlocs=np.arange(-180, 180, 1), + ylocs=np.arange(-90, 90, 1), + draw_labels=True, + ) + grdln.top_labels = False + grdln.right_labels = False + + plt.colorbar( + img, + label=variable.upper(), + extend="both", + orientation="horizontal", + pad=0.05, + fraction=0.05, + ) + + output_image_path = Path(os.getcwd()) / "output" / "images" + + save_image_path = output_image_path / (f"{variable}.png") + + if not output_image_path.exists(): + output_image_path.mkdir(parents=True, exist_ok=True) + + plt.savefig(save_image_path, bbox_inches="tight", pad_inches=0, dpi=300) + plt.show() + return save_image_path + """ + save_image_path = "image.png" + + return save_image_path + + +@task +def get_dataset_info(station_type: str, source: str) -> Dict: + """ + Inputs: + station_type: str ["rain_gauge", "weather_station", "radar"] + source: str ["alertario", "inmet", "mendanha"] + """ + + if station_type == "rain_gauge": + dataset_info = { + "dataset_id": "clima_pluviometro", + "filename": "gauge_station_bq", + "partition_date_column": "datetime", + } + if source == "alertario": + dataset_info["table_id"] = "taxa_precipitacao_alertario" + dataset_info["destination_table_id"] = "preprocessamento_pluviometro_alertario" + elif station_type == "weather_station": + dataset_info = { + "dataset_id": "clima_pluviometro", + "filename": "weather_station_bq", + "partition_date_column": "datetime", + } + if source == "alertario": + dataset_info["table_id"] = "meteorologia_alertario" + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" + elif source == "inmet": + dataset_info["table_id"] = "meteorologia_inmet" + dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" + else: + dataset_info = { + "dataset_id": "clima_radar", + "partition_date_column": "datetime", + } + if source == "mendanha": + dataset_info["storage_path"] = "" + dataset_info["destination_table_id"] = "preprocessamento_radar_mendanha" + elif source == "guaratiba": + dataset_info["storage_path"] = "" + dataset_info["destination_table_id"] = "preprocessamento_radar_guaratiba" + elif source == "macae": + dataset_info["storage_path"] = "" + dataset_info["destination_table_id"] = "preprocessamento_radar_macae" + + return dataset_info + + +def path_to_dfr(path: str) -> pd.DataFrame: + + """ + Reads a csv or parquet file from the given path and returns a dataframe + """ + if path.endswith(".csv"): + dfr = pd.read_csv(path) + elif path.endswith(".parquet"): + dfr = pd.read_parquet(path) + else: + raise ValueError("File extension not supported") + return dfr + + +def add_columns_on_dfr( + dfr: pd.DataFrame, model_version: int, update_time: bool = False +) -> pd.DataFrame: + """ + Reads a csv or parquet file from the given path and adds a column + with the update time based on Brazil timezone + """ + if update_time: + dfr["update_time"] = 
pd.Timestamp.now(tz="America/Sao_Paulo") + if model_version is not None: + dfr["model_version"] = model_version + return dfr diff --git a/pipelines/meteorologia/precipitacao_alertario/util_gypscie.py b/pipelines/meteorologia/precipitacao_alertario/util_gypscie.py new file mode 100644 index 00000000..f5873dc6 --- /dev/null +++ b/pipelines/meteorologia/precipitacao_alertario/util_gypscie.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +""" +Utils file +""" + +# from concurrent.futures import ThreadPoolExecutor, as_completed, wait +from datetime import datetime, timedelta +from time import sleep +from typing import Callable, Dict, Tuple # , List + +import basedosdados as bd +import requests +from prefeitura_rio.pipelines_utils.logging import log + + +class GypscieApi: + """ + GypscieApi + """ + + def __init__( + self, + username: str = None, + password: str = None, + base_url: str = None, + token_callback: Callable[[str, datetime], None] = lambda *_: None, + ) -> None: + if username is None or password is None: + raise ValueError("Must be set refresh token or username with password") + + self._base_url = base_url or "https://gypscie.dados.rio/api/" + self._username = username + self._password = password + self._token_callback = token_callback + self._headers, self._token, self._expires_at = self._get_headers() + + def _get_headers(self) -> Tuple[Dict[str, str], str, datetime]: + + response = requests.post( + f"{self._base_url}login", + headers={"accept": "application/json", "Content-Type": "application/json"}, + json={ + # 'grant_type': 'password', + # 'scope': 'openid profile', + "username": self._username, + "password": self._password, + }, + ) + if response.status_code == 200: + token = response.json()["token"] + # now + expires_in_seconds - 10 minutes + expires_at = datetime.now() + timedelta(seconds=30 * 60) + else: + log(f"Status code: {response.status_code}\nResponse:{response.content}") + raise Exception() + + return {"Authorization": f"Bearer {token}"}, token, expires_at + + def _refresh_token_if_needed(self) -> None: + if self._expires_at <= datetime.now(): + self._headers, self._token, self._expires_at = self._get_headers() + self._token_callback(self.get_token(), self.expires_at()) + + def refresh_token(self): + """ + refresh + """ + self._refresh_token_if_needed() + + def get_token(self): + """ + get token + """ + self._refresh_token_if_needed() + + return self._headers["Authorization"].split(" ")[1] + + def expires_at(self): + """ + expire + """ + return self._expires_at + + def get(self, path: str, timeout: int = 120) -> Dict: + """ + get + """ + self._refresh_token_if_needed() + response = requests.get(f"{self._base_url}{path}", headers=self._headers, timeout=timeout) + response.raise_for_status() + return response.json() + + def put(self, path, json=None): + """ + put + """ + self._refresh_token_if_needed() + response = requests.put(f"{self._base_url}{path}", headers=self._headers, json=json) + return response + + def post(self, path, data: dict = None, json: dict = None, files: dict = None): + """ + post + """ + self._refresh_token_if_needed() + response = requests.post( + url=f"{self._base_url}{path}", + headers=self._headers, + data=data, + json=json, + files=files, + ) + # response = requests.post(f"{self._base_url}{path}", headers=self._headers, json=json) + return response + + +def bq_project(kind: str = "bigquery_prod"): + """Get the set BigQuery project_id + + Args: + kind (str, optional): Which client to get the project name from. 
+ Options are 'bigquery_staging', 'bigquery_prod' and 'storage_staging' + Defaults to 'bigquery_prod'. + + Returns: + str: the requested project_id + """ + return bd.upload.base.Base().client[kind].project + + +def wait_run(api, task_response, flow_type: str = "dataflow") -> Dict: + """ + Force flow wait for the end of data processing + flow_type: dataflow or processor + Return: + { + "result": {}, + "state": "string", + "status": "string" + } + """ + if "task_id" in task_response.keys(): + _id = task_response.get("task_id") + else: + log(f"Error processing: task_id not found on response:{task_response}") + # TODO: stop flow here + + # Request to get the execution status + path_flow_type = "status_workflow_run" if flow_type == "dataflow" else "status_processor_run" + response = api.get( + path=f"{path_flow_type}/" + _id, + ) + + log(f"Execution status: {response}.") + while response["state"] == "STARTED": + sleep(5) + response = wait_run(api, task_response) + + if response["state"] != "SUCCESS": + log("Error processing this dataset. Stop flow or restart this task") + + return response From c93fbf693b81a538778d1e445c8c71515b236450 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 9 Oct 2024 16:13:08 -0300 Subject: [PATCH 02/56] adding parameters on scheduler --- .../meteorologia/precipitacao_alertario/flows.py | 14 +++++++------- .../precipitacao_alertario/schedules.py | 12 ++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 6c727787..8f37d560 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -32,8 +32,10 @@ from pipelines.rj_escritorio.rain_dashboard.constants import ( constants as rain_dashboard_constants, ) + # from pipelines.utils.constants import constants as utils_constants from pipelines.utils.custom import wait_for_flow_run_with_timeout + # from pipelines.utils.dump_db.constants import constants as dump_db_constants from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants @@ -100,14 +102,12 @@ processor_name = Parameter("processor_name", default="etl_alertario22", required=True) dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar - # Parameters for saving data on GCP - materialize_after_dump = Parameter("materialize_after_dump", default=False, required=False) - dump_mode = Parameter("dump_mode", default=False, required=False) + # Parameters for saving data preprocessed on GCP dataset_id_previsao_chuva = Parameter( - "dataset_id", default="clima_previsao_chuva", required=False + "dataset_id_previsao_chuva", default="clima_previsao_chuva", required=False ) table_id_previsao_chuva = Parameter( - "table_id", default="preprocessamento_pluviometro_alertario", required=False + "table_id_previsao_chuva", default="preprocessamento_pluviometro_alertario", required=False ) # Dataset parameters @@ -469,12 +469,12 @@ data_path=prediction_data_path, dataset_id=dataset_id_previsao_chuva, table_id=table_id_previsao_chuva, - dump_mode=dump_mode, + dump_mode=DUMP_MODE, biglake_table=False, ) # Trigger DBT flow run - with case(materialize_after_dump, True): + with case(MATERIALIZE_AFTER_DUMP, True): run_dbt = task_run_dbt_model_task( dataset_id=dataset_id_previsao_chuva, table_id=table_id_previsao_chuva, diff --git a/pipelines/meteorologia/precipitacao_alertario/schedules.py 
b/pipelines/meteorologia/precipitacao_alertario/schedules.py index ad90fbd7..2f1a74da 100644 --- a/pipelines/meteorologia/precipitacao_alertario/schedules.py +++ b/pipelines/meteorologia/precipitacao_alertario/schedules.py @@ -27,6 +27,18 @@ "materialize_to_datario": False, "mode": "prod", "dump_to_gcs": False, + "environment_id": 1, + "domain_id": 1, + "project_id": 1, + "project_name": "rionowcast_precipitation", + "processor_name": "etl_alertario22", + "dataset_processor_id": 43, + "dataset_id_previsao_chuva": "clima_previsao_chuva", + "table_id_previsao_chuva": "preprocessamento_pluviometro_alertario", + "station_type": "pluviometro", + "source": "alertario", + "maximum_bytes_processed": None, + "model_version": 1, }, ), ] From d41481661c7d487e34dfd8590fe1ef188a648cce Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 19:13:35 +0000 Subject: [PATCH 03/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../precipitacao_alertario/flows.py | 20 +++++++++---------- .../precipitacao_alertario/tasks_gypscie.py | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 8f37d560..e12eb573 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -29,15 +29,6 @@ save_last_dbt_update, treat_pluviometer_and_meteorological_data, ) -from pipelines.rj_escritorio.rain_dashboard.constants import ( - constants as rain_dashboard_constants, -) - -# from pipelines.utils.constants import constants as utils_constants -from pipelines.utils.custom import wait_for_flow_run_with_timeout - -# from pipelines.utils.dump_db.constants import constants as dump_db_constants -from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants # preprocessing imports from pipelines.precipitation_model.rionowcast.tasks import ( # pylint: disable=E0611, E0401 @@ -51,9 +42,16 @@ register_dataset_on_gypscie, task_wait_run, ) -from pipelines.tasks import ( # pylint: disable=E0611, E0401 - task_create_partitions, +from pipelines.rj_escritorio.rain_dashboard.constants import ( + constants as rain_dashboard_constants, ) +from pipelines.tasks import task_create_partitions # pylint: disable=E0611, E0401 + +# from pipelines.utils.constants import constants as utils_constants +from pipelines.utils.custom import wait_for_flow_run_with_timeout + +# from pipelines.utils.dump_db.constants import constants as dump_db_constants +from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants wait_for_flow_run_with_5min_timeout = wait_for_flow_run_with_timeout(timeout=timedelta(minutes=5)) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py b/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py index 840540f0..63cfafcc 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd - from basedosdados.upload.base import Base from google.cloud import bigquery from prefect import task From 1d7b4cb6267634be3229b1f84003b3ba0c9f33f1 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 9 Oct 2024 16:41:36 -0300 Subject: [PATCH 04/56] adding preprocessing on mendanha flow --- 
.../precipitacao_alertario/flows.py | 23 +-- .../meteorologia/radar/mendanha/flows.py | 143 ++++++++++++++++-- .../gypscie/tasks.py} | 2 +- .../gypscie/utils.py} | 0 4 files changed, 142 insertions(+), 26 deletions(-) rename pipelines/{meteorologia/precipitacao_alertario/tasks_gypscie.py => utils/gypscie/tasks.py} (99%) rename pipelines/{meteorologia/precipitacao_alertario/util_gypscie.py => utils/gypscie/utils.py} (100%) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index e12eb573..9b40cc21 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -30,8 +30,19 @@ treat_pluviometer_and_meteorological_data, ) +from pipelines.rj_escritorio.rain_dashboard.constants import ( + constants as rain_dashboard_constants, +) +from pipelines.tasks import task_create_partitions # pylint: disable=E0611, E0401 + +# from pipelines.utils.constants import constants as utils_constants +from pipelines.utils.custom import wait_for_flow_run_with_timeout + +# from pipelines.utils.dump_db.constants import constants as dump_db_constants +from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants + # preprocessing imports -from pipelines.precipitation_model.rionowcast.tasks import ( # pylint: disable=E0611, E0401 +from pipelines.utils.gypscie.tasks import ( # pylint: disable=E0611, E0401 access_api, add_columns_on_dfr, download_datasets_from_gypscie, @@ -42,16 +53,6 @@ register_dataset_on_gypscie, task_wait_run, ) -from pipelines.rj_escritorio.rain_dashboard.constants import ( - constants as rain_dashboard_constants, -) -from pipelines.tasks import task_create_partitions # pylint: disable=E0611, E0401 - -# from pipelines.utils.constants import constants as utils_constants -from pipelines.utils.custom import wait_for_flow_run_with_timeout - -# from pipelines.utils.dump_db.constants import constants as dump_db_constants -from pipelines.utils.dump_to_gcs.constants import constants as dump_to_gcs_constants wait_for_flow_run_with_5min_timeout = wait_for_flow_run_with_timeout(timeout=timedelta(minutes=5)) diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index 289aa967..bb2deab7 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -4,15 +4,20 @@ """ Flows for setting rain dashboard using radar data. 
""" -from prefect import Parameter, case -from prefect.run_configs import KubernetesRun -from prefect.storage import GCS -from prefeitura_rio.pipelines_utils.custom import Flow +from prefect import Parameter, case # pylint: disable=E0611, E0401 +from prefect.run_configs import KubernetesRun # pylint: disable=E0611, E0401 +from prefect.storage import GCS # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials +from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 + create_table_and_upload_to_gcs, + get_now_datetime, + task_run_dbt_model_task, +) -from pipelines.constants import constants -from pipelines.meteorologia.radar.mendanha.constants import ( - constants as radar_constants, # pylint: disable=E0611, E0401 +from pipelines.constants import constants # pylint: disable=E0611, E0401 +from pipelines.meteorologia.radar.mendanha.constants import ( # pylint: disable=E0611, E0401 + constants as radar_constants, ) # from pipelines.tasks import task_get_redis_client @@ -51,6 +56,23 @@ task_save_on_redis, ) +# preprocessing imports +from pipelines.utils.gypscie.tasks import ( # pylint: disable=E0611, E0401 + access_api as access_api_gypscie, + add_columns_on_dfr, + download_datasets_from_gypscie, + execute_dataset_processor, + get_dataset_info, + get_dataset_processor_info, + path_to_dfr, + register_dataset_on_gypscie, + task_wait_run, +) + +from pipelines.tasks import ( # pylint: disable=E0611, E0401 + task_create_partitions, +) + # create_visualization_with_background, prefix_to_restore, save_data, # from pipelines.utils_rj_cor import build_redis_hash # pylint: disable=E0611, E0401 @@ -81,13 +103,38 @@ # BASE_PATH = "pipelines/rj_cor/meteorologia/radar/precipitacao/" BUCKET_NAME = "rj-escritorio-scp" - # redis_data_key = Parameter("redis_data_key", default="data_last_15min_rain") - # redis_update_key = Parameter( - # "redis_update_key", default="data_last_15min_rain_update" - # ) - # redis_host = Parameter("redis_host", default="redis.redis.svc.cluster.local") - # redis_port = Parameter("redis_port", default=6379) - # redis_db = Parameter("redis_db", default=1) + # Preprocessing gypscie parameters + # Gypscie parameters + environment_id = Parameter("environment_id", default=1, required=False) + domain_id = Parameter("domain_id", default=1, required=False) + project_id = Parameter("project_id", default=1, required=False) + project_name = Parameter("project_name", default="rionowcast_precipitation", required=False) + + # Gypscie processor parameters + processor_name = Parameter("processor_name", default="etl_alertario22", required=True) + dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar + + # Parameters for saving data on GCP + materialize_after_dump = Parameter("materialize_after_dump", default=False, required=False) + dump_mode = Parameter("dump_mode", default=False, required=False) + dataset_id_previsao_chuva = Parameter( + "dataset_id_previsao_chuva", default="clima_previsao_chuva", required=False + ) + table_id_previsao_chuva = Parameter( + "table_id_previsao_chuva", default="preprocessamento_radar_mendanha", required=False + ) + + # Dataset parameters + station_type = Parameter("station_type", default="radar", required=False) + source = Parameter("source", default="mendanha", required=False) + + # Dataset path, if it was saved on ETL flow or it will be None + dataset_path = 
Parameter("dataset_path", default=None, required=False) # dataset_path + model_version = Parameter("model_version", default=1, required=False) + + ############################ + # Start radar flow # + ############################ redis_client = task_get_redis_client(infisical_secrets_path="/redis") redis_hash = task_build_redis_hash(DATASET_ID, TABLE_ID, name="images", mode=MODE) @@ -220,6 +267,74 @@ ) # save_last_update_redis.set_upstream(upload_table) + #################################### + # Start preprocessing flow # + #################################### + + api_gypscie = access_api_gypscie() + + dataset_info = get_dataset_info(station_type, source) + + # Get processor information on gypscie + with case(dataset_processor_id, None): + dataset_processor_response, dataset_processor_id = get_dataset_processor_info( + api_gypscie, processor_name + ) + + dataset_response = register_dataset_on_gypscie(api_gypscie, filepath=dataset_path, domain_id=domain_id) + # TODO: verifcar no codigo do augustp se são esses os parametros corretos + processor_parameters = { + "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], + "station_type": station_type, + } + + dataset_processor_task_id = execute_dataset_processor( + api_gypscie, + processor_id=dataset_processor_id, + dataset_id=[dataset_response["id"]], + environment_id=environment_id, + project_id=project_id, + parameters=processor_parameters, + ) + wait_run = task_wait_run(api_gypscie, dataset_processor_task_id, flow_type="processor") + dataset_path = download_datasets_from_gypscie( + api_gypscie, dataset_names=[dataset_response["id"]], wait=wait_run + ) + dfr_ = path_to_dfr(dataset_path) + # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) + dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + + # Save pre-treated data on local file with partitions + now_datetime = get_now_datetime() + prediction_data_path = task_create_partitions( + dfr, + partition_date_column=dataset_info["partition_date_column"], + savepath="model_prediction", + suffix=now_datetime, + ) + ################################ + # Save preprocessing on GCP # + ################################ + + # Upload data to BigQuery + create_table = create_table_and_upload_to_gcs( + data_path=prediction_data_path, + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + dump_mode=dump_mode, + biglake_table=False, + ) + + # Trigger DBT flow run + with case(materialize_after_dump, True): + run_dbt = task_run_dbt_model_task( + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + # mode=materialization_mode, + # materialize_to_datario=materialize_to_datario, + ) + run_dbt.set_upstream(create_table) + cor_meteorologia_refletividade_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) cor_meteorologia_refletividade_radar_flow.run_config = KubernetesRun( diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py b/pipelines/utils/gypscie/tasks.py similarity index 99% rename from pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py rename to pipelines/utils/gypscie/tasks.py index 63cfafcc..b633c80d 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks_gypscie.py +++ b/pipelines/utils/gypscie/tasks.py @@ -20,7 +20,7 @@ from requests.exceptions import HTTPError from pipelines.constants import constants # pylint: disable=E0611, E0401 -from pipelines.precipitation_model.rionowcast.utils import ( # pylint: disable=E0611, E0401 +from pipelines.utils.gypscie.utils 
import ( # pylint: disable=E0611, E0401 GypscieApi, wait_run, ) diff --git a/pipelines/meteorologia/precipitacao_alertario/util_gypscie.py b/pipelines/utils/gypscie/utils.py similarity index 100% rename from pipelines/meteorologia/precipitacao_alertario/util_gypscie.py rename to pipelines/utils/gypscie/utils.py From 6e63aa7b1d25506b2c966c51da427af44c7cbccb Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 9 Oct 2024 16:57:56 -0300 Subject: [PATCH 05/56] changin path where dfr was saved --- .../precipitacao_alertario/flows.py | 134 +++++++++--------- .../precipitacao_alertario/tasks.py | 18 +-- .../meteorologia/radar/mendanha/flows.py | 116 +++++++-------- 3 files changed, 139 insertions(+), 129 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 9b40cc21..181c830a 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -5,10 +5,10 @@ """ from datetime import timedelta -from prefect import Parameter, case -from prefect.run_configs import KubernetesRun -from prefect.storage import GCS -from prefect.tasks.prefect import create_flow_run, wait_for_flow_run +from prefect import Parameter, case # pylint: disable=E0611, E0401 +from prefect.run_configs import KubernetesRun # pylint: disable=E0611, E0401 +from prefect.storage import GCS # pylint: disable=E0611, E0401 +from prefect.tasks.prefect import create_flow_run, wait_for_flow_run # pylint: disable=E0611,E0401 from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 @@ -91,6 +91,8 @@ default=dump_to_gcs_constants.MAX_BYTES_PROCESSED_PER_TABLE.value, ) + # Preprocessing gypscie parameters + preprocessing_gypscie = Parameter("preprocessing_gypscie", default=False, required=False) # Gypscie parameters environment_id = Parameter("environment_id", default=1, required=False) domain_id = Parameter("domain_id", default=1, required=False) @@ -136,7 +138,7 @@ ) with case(empty_data_pluviometric, False): - path_pluviometric = save_data( + path_pluviometric, full_path_pluviometric = save_data( dfr_pluviometric, "pluviometric", wait=empty_data_pluviometric ) # Create table in BigQuery @@ -418,69 +420,73 @@ # Start preprocessing flow # #################################### - api = access_api() + with case(empty_data_pluviometric, False): + with case(preprocessing_gypscie, True): + api = access_api() - dataset_info = get_dataset_info(station_type, source) + dataset_info = get_dataset_info(station_type, source) - # Get processor information on gypscie - with case(dataset_processor_id, None): - dataset_processor_response, dataset_processor_id = get_dataset_processor_info( - api, processor_name - ) + # Get processor information on gypscie + with case(dataset_processor_id, None): + dataset_processor_response, dataset_processor_id = get_dataset_processor_info( + api, processor_name + ) - dataset_response = register_dataset_on_gypscie(api, filepath=dataset_path, domain_id=domain_id) - # TODO: verifcar no codigo do augustp se são esses os parametros corretos - processor_parameters = { - "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], - "station_type": station_type, - } - - dataset_processor_task_id = execute_dataset_processor( - api, - processor_id=dataset_processor_id, - 
dataset_id=[dataset_response["id"]], - environment_id=environment_id, - project_id=project_id, - parameters=processor_parameters, - ) - wait_run = task_wait_run(api, dataset_processor_task_id, flow_type="processor") - dataset_path = download_datasets_from_gypscie( - api, dataset_names=[dataset_response["id"]], wait=wait_run - ) - dfr_ = path_to_dfr(dataset_path) - # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) - - # Save pre-treated data on local file with partitions - now_datetime = get_now_datetime() - prediction_data_path = task_create_partitions( - dfr, - partition_date_column=dataset_info["partition_date_column"], - savepath="model_prediction", - suffix=now_datetime, - ) - ################################ - # Save preprocessing on GCP # - ################################ - - # Upload data to BigQuery - create_table = create_table_and_upload_to_gcs( - data_path=prediction_data_path, - dataset_id=dataset_id_previsao_chuva, - table_id=table_id_previsao_chuva, - dump_mode=DUMP_MODE, - biglake_table=False, - ) + dataset_response = register_dataset_on_gypscie( + api, filepath=full_path_pluviometric, domain_id=domain_id + ) + # TODO: verifcar no codigo do augustp se são esses os parametros corretos + processor_parameters = { + "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], + "station_type": station_type, + } + + dataset_processor_task_id = execute_dataset_processor( + api, + processor_id=dataset_processor_id, + dataset_id=[dataset_response["id"]], + environment_id=environment_id, + project_id=project_id, + parameters=processor_parameters, + ) + wait_run = task_wait_run(api, dataset_processor_task_id, flow_type="processor") + dataset_path = download_datasets_from_gypscie( + api, dataset_names=[dataset_response["id"]], wait=wait_run + ) + dfr_ = path_to_dfr(dataset_path) + # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) + dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + + # Save pre-treated data on local file with partitions + now_datetime = get_now_datetime() + prediction_data_path = task_create_partitions( + dfr, + partition_date_column=dataset_info["partition_date_column"], + savepath="model_prediction", + suffix=now_datetime, + ) + ################################ + # Save preprocessing on GCP # + ################################ + + # Upload data to BigQuery + create_table = create_table_and_upload_to_gcs( + data_path=prediction_data_path, + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + dump_mode=DUMP_MODE, + biglake_table=False, + ) - # Trigger DBT flow run - with case(MATERIALIZE_AFTER_DUMP, True): - run_dbt = task_run_dbt_model_task( - dataset_id=dataset_id_previsao_chuva, - table_id=table_id_previsao_chuva, - # mode=materialization_mode, - # materialize_to_datario=materialize_to_datario, - ) - run_dbt.set_upstream(create_table) + # Trigger DBT flow run + with case(MATERIALIZE_AFTER_DUMP, True): + run_dbt = task_run_dbt_model_task( + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + # mode=materialization_mode, + # materialize_to_datario=materialize_to_datario, + ) + run_dbt.set_upstream(create_table) # para rodar na cloud cor_meteorologia_precipitacao_alertario.storage = GCS(constants.GCS_FLOWS_BUCKET.value) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index d64c0751..703f0ac6 
100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -9,11 +9,11 @@ import numpy as np import pandas as pd -import pendulum +import pendulum # pylint: disable=E0401 import requests -from bs4 import BeautifulSoup -from prefect import task -from prefeitura_rio.pipelines_utils.infisical import get_secret +from bs4 import BeautifulSoup # pylint: disable=E0401 +from prefect import task # pylint: disable=E0401 +from prefeitura_rio.pipelines_utils.infisical import get_secret # pylint: disable=E0401 from pipelines.constants import constants from pipelines.meteorologia.precipitacao_alertario.utils import ( @@ -178,12 +178,12 @@ def treat_pluviometer_and_meteorological_data( return dfr, empty_data -@task +@task(nout=2) def save_data( dfr: pd.DataFrame, data_name: str = "temp", wait=None, # pylint: disable=unused-argument -) -> Union[str, Path]: +) -> Tuple[Union[str, Path], Union[str, Path]]: """ Salvar dfr tratados em csv para conseguir subir pro GCP """ @@ -199,15 +199,15 @@ def save_data( log(f"Dataframe for {data_name} after partitions {dataframe.iloc[0]}") log(f"Dataframe for {data_name} after partitions {dataframe.dtypes}") - to_partitions( + full_path = to_partitions( data=dataframe, partition_columns=partitions, savepath=prepath, data_type="csv", suffix=current_time, ) - log(f"Files saved on {prepath}") - return prepath + log(f"Files saved on {prepath}, full path is {full_path}") + return prepath, full_path @task diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index bb2deab7..38524f61 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -104,6 +104,7 @@ BUCKET_NAME = "rj-escritorio-scp" # Preprocessing gypscie parameters + preprocessing_gypscie = Parameter("preprocessing_gypscie", default=False, required=False) # Gypscie parameters environment_id = Parameter("environment_id", default=1, required=False) domain_id = Parameter("domain_id", default=1, required=False) @@ -271,69 +272,72 @@ # Start preprocessing flow # #################################### - api_gypscie = access_api_gypscie() + with case(preprocessing_gypscie, True): + api_gypscie = access_api_gypscie() - dataset_info = get_dataset_info(station_type, source) + dataset_info = get_dataset_info(station_type, source) - # Get processor information on gypscie - with case(dataset_processor_id, None): - dataset_processor_response, dataset_processor_id = get_dataset_processor_info( - api_gypscie, processor_name + # Get processor information on gypscie + with case(dataset_processor_id, None): + dataset_processor_response, dataset_processor_id = get_dataset_processor_info( + api_gypscie, processor_name + ) + # TODO: e se o radar_files tiver mais de um arquivo? 
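+        # Note: register_dataset_on_gypscie receives a single filepath, so radar_files
+        # is assumed to contain exactly one file here (see the TODO above).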
+ dataset_response = register_dataset_on_gypscie( + api_gypscie, filepath=radar_files, domain_id=domain_id ) + # TODO: verifcar no codigo do augustp se são esses os parametros corretos + processor_parameters = { + "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], + "station_type": station_type, + } + + dataset_processor_task_id = execute_dataset_processor( + api_gypscie, + processor_id=dataset_processor_id, + dataset_id=[dataset_response["id"]], + environment_id=environment_id, + project_id=project_id, + parameters=processor_parameters, + ) + wait_run = task_wait_run(api_gypscie, dataset_processor_task_id, flow_type="processor") + dataset_path = download_datasets_from_gypscie( + api_gypscie, dataset_names=[dataset_response["id"]], wait=wait_run + ) + dfr_ = path_to_dfr(dataset_path) + # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) + dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + + # Save pre-treated data on local file with partitions + now_datetime = get_now_datetime() + prediction_data_path = task_create_partitions( + dfr, + partition_date_column=dataset_info["partition_date_column"], + savepath="model_prediction", + suffix=now_datetime, + ) + ################################ + # Save preprocessing on GCP # + ################################ - dataset_response = register_dataset_on_gypscie(api_gypscie, filepath=dataset_path, domain_id=domain_id) - # TODO: verifcar no codigo do augustp se são esses os parametros corretos - processor_parameters = { - "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], - "station_type": station_type, - } - - dataset_processor_task_id = execute_dataset_processor( - api_gypscie, - processor_id=dataset_processor_id, - dataset_id=[dataset_response["id"]], - environment_id=environment_id, - project_id=project_id, - parameters=processor_parameters, - ) - wait_run = task_wait_run(api_gypscie, dataset_processor_task_id, flow_type="processor") - dataset_path = download_datasets_from_gypscie( - api_gypscie, dataset_names=[dataset_response["id"]], wait=wait_run - ) - dfr_ = path_to_dfr(dataset_path) - # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) - - # Save pre-treated data on local file with partitions - now_datetime = get_now_datetime() - prediction_data_path = task_create_partitions( - dfr, - partition_date_column=dataset_info["partition_date_column"], - savepath="model_prediction", - suffix=now_datetime, - ) - ################################ - # Save preprocessing on GCP # - ################################ - - # Upload data to BigQuery - create_table = create_table_and_upload_to_gcs( - data_path=prediction_data_path, - dataset_id=dataset_id_previsao_chuva, - table_id=table_id_previsao_chuva, - dump_mode=dump_mode, - biglake_table=False, - ) - - # Trigger DBT flow run - with case(materialize_after_dump, True): - run_dbt = task_run_dbt_model_task( + # Upload data to BigQuery + create_table = create_table_and_upload_to_gcs( + data_path=prediction_data_path, dataset_id=dataset_id_previsao_chuva, table_id=table_id_previsao_chuva, - # mode=materialization_mode, - # materialize_to_datario=materialize_to_datario, + dump_mode=dump_mode, + biglake_table=False, ) - run_dbt.set_upstream(create_table) + + # Trigger DBT flow run + with case(materialize_after_dump, True): + run_dbt = task_run_dbt_model_task( + dataset_id=dataset_id_previsao_chuva, + table_id=table_id_previsao_chuva, + # 
mode=materialization_mode, + # materialize_to_datario=materialize_to_datario, + ) + run_dbt.set_upstream(create_table) cor_meteorologia_refletividade_radar_flow.storage = GCS(constants.GCS_FLOWS_BUCKET.value) From 8f37f36e3343ae4f28aba6eb8011ebc557bdc43b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 19:58:29 +0000 Subject: [PATCH 06/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../meteorologia/precipitacao_alertario/flows.py | 6 ++++-- pipelines/meteorologia/radar/mendanha/flows.py | 15 +++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 181c830a..092d1ae6 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -8,7 +8,10 @@ from prefect import Parameter, case # pylint: disable=E0611, E0401 from prefect.run_configs import KubernetesRun # pylint: disable=E0611, E0401 from prefect.storage import GCS # pylint: disable=E0611, E0401 -from prefect.tasks.prefect import create_flow_run, wait_for_flow_run # pylint: disable=E0611,E0401 +from prefect.tasks.prefect import ( # pylint: disable=E0611,E0401 + create_flow_run, + wait_for_flow_run, +) from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 @@ -29,7 +32,6 @@ save_last_dbt_update, treat_pluviometer_and_meteorological_data, ) - from pipelines.rj_escritorio.rain_dashboard.constants import ( constants as rain_dashboard_constants, ) diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index 38524f61..8fb9d648 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -16,8 +16,8 @@ ) from pipelines.constants import constants # pylint: disable=E0611, E0401 -from pipelines.meteorologia.radar.mendanha.constants import ( # pylint: disable=E0611, E0401 - constants as radar_constants, +from pipelines.meteorologia.radar.mendanha.constants import ( + constants as radar_constants, # pylint: disable=E0611, E0401 ) # from pipelines.tasks import task_get_redis_client @@ -51,14 +51,17 @@ # from prefect.tasks.prefect import create_flow_run, wait_for_flow_run from pipelines.tasks import ( # pylint: disable=E0611, E0401 task_build_redis_hash, + task_create_partitions, task_get_redis_client, task_get_redis_output, task_save_on_redis, ) # preprocessing imports -from pipelines.utils.gypscie.tasks import ( # pylint: disable=E0611, E0401 - access_api as access_api_gypscie, +from pipelines.utils.gypscie.tasks import ( + access_api as access_api_gypscie, # pylint: disable=E0611, E0401 +) +from pipelines.utils.gypscie.tasks import ( add_columns_on_dfr, download_datasets_from_gypscie, execute_dataset_processor, @@ -69,10 +72,6 @@ task_wait_run, ) -from pipelines.tasks import ( # pylint: disable=E0611, E0401 - task_create_partitions, -) - # create_visualization_with_background, prefix_to_restore, save_data, # from pipelines.utils_rj_cor import build_redis_hash # pylint: disable=E0611, E0401 From af14baea8486de07622f152d18d2fde1db0b3d7c Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 17 Oct 
2024 15:01:40 -0300 Subject: [PATCH 07/56] adding treatment version on gypscie register dataset --- pipelines/meteorologia/precipitacao_alertario/flows.py | 5 +++-- pipelines/meteorologia/precipitacao_alertario/tasks.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 181c830a..d8037a09 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -98,6 +98,7 @@ domain_id = Parameter("domain_id", default=1, required=False) project_id = Parameter("project_id", default=1, required=False) project_name = Parameter("project_name", default="rionowcast_precipitation", required=False) + treatment_version = Parameter("treatment_version", default=1, required=False) # Gypscie processor parameters processor_name = Parameter("processor_name", default="etl_alertario22", required=True) @@ -139,7 +140,7 @@ with case(empty_data_pluviometric, False): path_pluviometric, full_path_pluviometric = save_data( - dfr_pluviometric, "pluviometric", wait=empty_data_pluviometric + dfr_pluviometric, treatment_version, "pluviometric", wait=empty_data_pluviometric ) # Create table in BigQuery UPLOAD_TABLE = create_table_and_upload_to_gcs( @@ -431,7 +432,7 @@ dataset_processor_response, dataset_processor_id = get_dataset_processor_info( api, processor_name ) - + # TODO: converter os horarios do alertario para UTC antes de resgistrar dataset_response = register_dataset_on_gypscie( api, filepath=full_path_pluviometric, domain_id=domain_id ) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 703f0ac6..a7979bd9 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -181,6 +181,7 @@ def treat_pluviometer_and_meteorological_data( @task(nout=2) def save_data( dfr: pd.DataFrame, + treatment_version: int, data_name: str = "temp", wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: @@ -204,7 +205,7 @@ def save_data( partition_columns=partitions, savepath=prepath, data_type="csv", - suffix=current_time, + suffix=str(treatment_version)+"_"+current_time, ) log(f"Files saved on {prepath}, full path is {full_path}") return prepath, full_path From 14723774b5620c0afc5491af9a320130ca665681 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 17 Oct 2024 15:02:11 -0300 Subject: [PATCH 08/56] adding treatment version on gypscie register dataset --- pipelines/meteorologia/radar/mendanha/flows.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index 38524f61..45f933a2 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -131,7 +131,7 @@ # Dataset path, if it was saved on ETL flow or it will be None dataset_path = Parameter("dataset_path", default=None, required=False) # dataset_path - model_version = Parameter("model_version", default=1, required=False) + treatment_version = Parameter("treatment_version", default=1, required=False) ############################ # Start radar flow # @@ -282,7 +282,8 @@ dataset_processor_response, dataset_processor_id = get_dataset_processor_info( api_gypscie, processor_name ) - # TODO: e se o radar_files tiver mais de um 
arquivo? + # TODO: ao salvar o nome do radar_files salvar com sufixo treatment_version + # pq te que ser unico no gypscie dataset_response = register_dataset_on_gypscie( api_gypscie, filepath=radar_files, domain_id=domain_id ) @@ -306,7 +307,7 @@ ) dfr_ = path_to_dfr(dataset_path) # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + dfr = add_columns_on_dfr(dfr_, treatment_version, update_time=True) # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() From e28585935fd6d337c0e7916186787004fc0c2418 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 30 Oct 2024 18:45:23 -0300 Subject: [PATCH 09/56] adding functions to treat data on gypscie --- .../precipitacao_alertario/flows.py | 89 ++++++++++++++----- .../precipitacao_alertario/tasks.py | 30 ++++++- 2 files changed, 96 insertions(+), 23 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 69bf45dc..90a2a231 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -27,6 +27,7 @@ from pipelines.meteorologia.precipitacao_alertario.schedules import minute_schedule from pipelines.meteorologia.precipitacao_alertario.tasks import ( check_to_run_dbt, + convert_sp_timezone_to_utc, download_data, save_data, save_last_dbt_update, @@ -48,12 +49,14 @@ access_api, add_columns_on_dfr, download_datasets_from_gypscie, - execute_dataset_processor, + execute_dataflow_on_gypscie, + get_dataflow_alertario_params, get_dataset_info, + get_dataset_name_on_gypscie, get_dataset_processor_info, path_to_dfr, register_dataset_on_gypscie, - task_wait_run, + unzip_files, ) wait_for_flow_run_with_5min_timeout = wait_for_flow_run_with_timeout(timeout=timedelta(minutes=5)) @@ -96,6 +99,7 @@ # Preprocessing gypscie parameters preprocessing_gypscie = Parameter("preprocessing_gypscie", default=False, required=False) # Gypscie parameters + workflow_id = Parameter("workflow_id", default=1, required=False) environment_id = Parameter("environment_id", default=1, required=False) domain_id = Parameter("domain_id", default=1, required=False) project_id = Parameter("project_id", default=1, required=False) @@ -106,6 +110,20 @@ processor_name = Parameter("processor_name", default="etl_alertario22", required=True) dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar + load_data_function_id = Parameter("load_data_function_id", default=53, required=False) + parse_date_time_function_id = Parameter( + "parse_date_time_function_id", default=54, required=False + ) + drop_duplicates_function_id = Parameter( + "drop_duplicates_function_id", default=55, required=False + ) + replace_inconsistent_values_function_id = Parameter( + "replace_inconsistent_values_function_id", default=56, required=False + ) + add_lat_lon_function_id = Parameter("add_lat_lon_function_id", default=57, required=False) + save_data_function_id = Parameter("save_data_function_id", default=58, required=False) + rain_gauge_metadata_path = Parameter("rain_gauge_metadata_path", default=227, required=False) + # Parameters for saving data preprocessed on GCP dataset_id_previsao_chuva = Parameter( "dataset_id_previsao_chuva", default="clima_previsao_chuva", required=False @@ -127,13 +145,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - 
(dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -434,29 +458,52 @@ dataset_processor_response, dataset_processor_id = get_dataset_processor_info( api, processor_name ) - # TODO: converter os horarios do alertario para UTC antes de resgistrar - dataset_response = register_dataset_on_gypscie( - api, filepath=full_path_pluviometric, domain_id=domain_id + dfr_pluviometric_gypscie = convert_sp_timezone_to_utc(dfr_pluviometric) + path_pluviometric_gypscie, full_path_pluviometric_gypscie = save_data( + dfr_pluviometric_gypscie, + columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], + data_name="gypscie", + ) + register_dataset_response = register_dataset_on_gypscie( + api, filepath=path_pluviometric_gypscie, domain_id=domain_id ) - # TODO: verifcar no codigo do augustp se são esses os parametros corretos - processor_parameters = { - "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], - "station_type": station_type, - } - dataset_processor_task_id = execute_dataset_processor( - api, - processor_id=dataset_processor_id, - dataset_id=[dataset_response["id"]], + model_params = get_dataflow_alertario_params( + workflow_id=workflow_id, environment_id=environment_id, project_id=project_id, - parameters=processor_parameters, + rain_gauge_data_id=register_dataset_response["id"], + rain_gauge_metadata_path=rain_gauge_metadata_path, + load_data_funtion_id=load_data_function_id, + parse_date_time_function_id=parse_date_time_function_id, + drop_duplicates_function_id=drop_duplicates_function_id, + replace_inconsistent_values_function_id=replace_inconsistent_values_function_id, + add_lat_lon_function_id=add_lat_lon_function_id, + save_data_function_id=save_data_function_id, ) - wait_run = task_wait_run(api, dataset_processor_task_id, flow_type="processor") - dataset_path = download_datasets_from_gypscie( - api, dataset_names=[dataset_response["id"]], wait=wait_run + + # Send dataset ids to gypscie to get predictions + output_dataset_ids = execute_dataflow_on_gypscie( + api, + model_params, ) - dfr_ = path_to_dfr(dataset_path) + + # dataset_processor_task_id = execute_dataset_processor( + # api, + # processor_id=dataset_processor_id, + # dataset_id=[dataset_response["id"]], + # environment_id=environment_id, + # project_id=project_id, + # parameters=processor_parameters, + # ) + # wait_run = task_wait_run(api, dataset_processor_task_id, flow_type="processor") + # dataset_path = download_datasets_from_gypscie( + # api, dataset_names=[dataset_response["id"]], wait=wait_run + # ) + dataset_names = get_dataset_name_on_gypscie(api, output_dataset_ids) # new + ziped_dataset_paths = download_datasets_from_gypscie(api, dataset_names=dataset_names) + dataset_paths = unzip_files(ziped_dataset_paths) + dfr_ = path_to_dfr(dataset_paths) # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py 
b/pipelines/meteorologia/precipitacao_alertario/tasks.py index a7979bd9..7a51cbb1 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -181,7 +181,8 @@ def treat_pluviometer_and_meteorological_data( @task(nout=2) def save_data( dfr: pd.DataFrame, - treatment_version: int, + columns: str = None, + treatment_version: str = "", data_name: str = "temp", wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: @@ -200,12 +201,15 @@ def save_data( log(f"Dataframe for {data_name} after partitions {dataframe.iloc[0]}") log(f"Dataframe for {data_name} after partitions {dataframe.dtypes}") + if columns: + dataframe = dataframe[columns] + full_path = to_partitions( data=dataframe, partition_columns=partitions, savepath=prepath, data_type="csv", - suffix=str(treatment_version)+"_"+current_time, + suffix=treatment_version + "_" + current_time, ) log(f"Files saved on {prepath}, full path is {full_path}") return prepath, full_path @@ -348,3 +352,25 @@ def save_data_old( ) log(f"{data_name} files saved on {prepath}") return prepath + + +@task +def convert_sp_timezone_to_utc(dfr, data_column: str = "data_medicao") -> pd.DataFrame: + """ + Convert a dataframe data_column from São Paulo (UTC-3) to UTC. + + Parameters: + dfr (pd.DataFrame): DataFrame with data_column. + + Returns: + pd.DataFrame: DataFrame with data_column converted to UTC. + """ + + if data_column not in dfr.columns: + raise ValueError(f"DataFrame must contain a column named {data_column}.") + + dfr[data_column] = pd.to_datetime(dfr[data_column]) + dfr[data_column] = dfr[data_column].dt.tz_localize("America/Sao_Paulo") + dfr[data_column] = dfr[data_column].dt.tz_convert("UTC") + + return dfr From 2ab57c9972ff9e932cd8ddb0656e6fa60ff7ba81 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 30 Oct 2024 19:00:12 -0300 Subject: [PATCH 10/56] changing gypscie tasks --- pipelines/utils/gypscie/tasks.py | 214 ++++++++++++++++++++++++++----- 1 file changed, 185 insertions(+), 29 deletions(-) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index b633c80d..48feb596 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -1,16 +1,20 @@ # -*- coding: utf-8 -*- +# pylint: disable= C0207 """ Tasks """ import datetime import os +import zipfile from pathlib import Path + from time import sleep from typing import Dict, List import numpy as np import pandas as pd -from basedosdados.upload.base import Base + +from basedosdados import Base from google.cloud import bigquery from prefect import task from prefect.engine.signals import ENDRUN @@ -38,8 +42,8 @@ def access_api(): # username = get_secret(secret_name="USERNAME", path="/gypscie", environment="prod") # password = get_secret(secret_name="PASSWORD", path="/gypscie", environment="prod") - username = get_secret(infisical_username, path="/gypscie")[infisical_username] - password = get_secret(infisical_password, path="/gypscie")[infisical_password] + username = get_secret(infisical_username, path="/gypscie_dexl")[infisical_username] + password = get_secret(infisical_password, path="/gypscie_dexl")[infisical_password] api = GypscieApi(username=username, password=password) return api @@ -171,7 +175,7 @@ def execute_dataset_processor( dataset_id: list, # como pegar os vários datasets environment_id: int, project_id: int, - parameters: dict + parameters: dict, # adicionar campos do dataset_processor ) -> List: """ @@ -289,14 +293,16 
@@ def query_data_from_gcp( # pylint: disable=too-many-arguments @task() -def execute_prediction_on_gypscie( +def execute_dataflow_on_gypscie( api, model_params: dict, # hours_to_predict, -) -> str: +) -> List: """ Requisição de execução de um processo de Predição - Return task_id + Return + {'state': 'STARTED'} + {'result': {'output_datasets': [236]}, 'state': 'SUCCESS'} """ log("Starting prediction") task_response = api.post( @@ -316,10 +322,8 @@ def execute_prediction_on_gypscie( task_state = Failed(failed_message) raise ENDRUN(state=task_state) - print(f"Prediction ended. Response: {response}, {response.json()}") - # TODO: retorna a predição? o id da do dataset? - - return response.json().get("task_id") # response.json().get('task_id') + log(f"Prediction ended. Response: {response}") + return response["result"].get("output_datasets") @task @@ -331,6 +335,86 @@ def task_wait_run(api, task_response, flow_type: str = "dataflow") -> Dict: return wait_run(api, task_response, flow_type) +@task +def get_dataflow_alertario_params( # pylint: disable=too-many-arguments + workflow_id, + environment_id, + project_id, + rain_gauge_data_id, + rain_gauge_metadata_path, + load_data_funtion_id, + parse_date_time_function_id, + drop_duplicates_function_id, + replace_inconsistent_values_function_id, + add_lat_lon_function_id, + save_data_function_id, +) -> List: + """ + Return parameters for the alertario ETL + + { + "workflow_id": 41, + "environment_id": 1, + "parameters": [ + { + "function_id":53, # load_data + "params": { + "rain_gauge_data_path":226, + "rain_gauge_metadata_path":227 + } + }, + { + "function_id":54 # parse_date_time + }, + { + "function_id":55 # drop_duplicates + }, + { + "function_id":56 # replace_inconsistent_values + }, + { + "function_id":57 # add_lat_lon + }, + { + "function_id":58, # save_data + "params": {"output_path":"dados_alertario_20230112_190000.parquet"} + } + ], + "project_id": 1 + } + """ + return { + "workflow_id": workflow_id, + "environment_id": environment_id, + "parameters": [ + { + "function_id": load_data_funtion_id, + "params": { + "rain_gauge_data_path": rain_gauge_data_id, + "rain_gauge_metadata_path": rain_gauge_metadata_path, + }, + }, + { + "function_id": parse_date_time_function_id, + }, + { + "function_id": drop_duplicates_function_id, + }, + { + "function_id": replace_inconsistent_values_function_id, + }, + { + "function_id": add_lat_lon_function_id, + }, + { + "function_id": save_data_function_id, + "params": {"output_path": "preprocessed_data_alertario.parquet"}, + }, + ], + "project_id": project_id, + } + + @task def get_dataflow_params( # pylint: disable=too-many-arguments workflow_id, @@ -343,6 +427,7 @@ def get_dataflow_params( # pylint: disable=too-many-arguments rain_gauge_data_id, grid_data_id, model_data_id, + output_function_id, ) -> List: """ Return parameters for the model @@ -382,6 +467,7 @@ def get_dataflow_params( # pylint: disable=too-many-arguments "function_id": pre_processing_function_id, }, {"function_id": model_function_id, "params": {"model_path": model_data_id}}, + {"function_id": output_function_id, "params": {"output_path": "prediction.npy"}}, ], "project_id": project_id, } @@ -402,27 +488,93 @@ def get_output_dataset_ids_on_gypscie( if err.response.status_code == 404: print(f"Task {task_id} not found") return [] + log(f"status_workflow_run response {response}") return response.get("output_datasets") +@task() +def get_dataset_name_on_gypscie( + api, + dataset_ids: list, +) -> List: + """ + Get datasets name using their dataset 
ids + """ + dataset_names = [] + log(f"All dataset_ids to get names: {dataset_ids}") + for dataset_id in dataset_ids: + log(f"Getting name for dataset id: {dataset_id}") + try: + response = api.get(path="datasets/" + str(dataset_id)) + except HTTPError as err: + if err.response.status_code == 404: + print(f"Dataset_id {dataset_id} not found") + return [] + log(f"Get dataset name response {response}") + dataset_names.append(response.get("name")) + log(f"All dataset names {dataset_names}") + return dataset_names + + @task() def download_datasets_from_gypscie( api, dataset_names: List, - wait=None, + wait=None, # pylint: disable=unused-argument ) -> List: """ Get output files with predictions """ - for file_name in dataset_names: - response = api.get(path=f"download/datasets/{file_name}.zip") + log(f"\n\nDataset names to be downloaded from Gypscie: {dataset_names}") + for dataset_name in dataset_names: + log(f"Downloading dataset {dataset_name} from Gypscie") + response = api.get(f"download/datasets/{dataset_name}.zip") + log(f"Download {dataset_name}'s response: {response}") if response.status_code == 200: - log(f"Dataset {file_name} downloaded") + dataset = response.content + with open(f"{dataset_name}.zip", "wb") as file: + file.write(dataset) + log(f"Dataset {dataset_name} downloaded") else: - log(f"Dataset {file_name} not found on Gypscie") - # TODO: verificar se o arquivo é .zip mesmo - return [dataset_name + ".zip" for dataset_name in dataset_names] + log(f"Dataset {dataset_name} not found on Gypscie") + return dataset_names + + +@task +def unzip_files(zip_files: List[str], destination_folder: str = "./") -> List[str]: + """ + Unzip files to destination folder + """ + zip_files = [ + zip_file if zip_file.endswith(".zip") else zip_file + ".zip" for zip_file in zip_files + ] + os.makedirs(destination_folder, exist_ok=True) + + unziped_files = [] + for zip_file in zip_files: + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(destination_folder) + unziped_files.extend( + [ + os.path.join(destination_folder, nome_arquivo) + for nome_arquivo in zip_ref.namelist() + ] + ) + + return unziped_files + + +@task +def read_numpy_files(file_paths: List[str]) -> List[np.ndarray]: + """ + Read numpy arrays and return a list with of them + """ + arrays = [] + for file_path in file_paths: + array = np.load(file_path) + arrays.append(array) + return arrays @task @@ -451,6 +603,7 @@ def geolocalize_data(prediction_datasets: np.ndarray, now_datetime: str) -> pd.D Expected columns: latitude, longitude, janela_predicao, valor_predicao, data_predicao (timestamp em que foi realizada a previsão) """ + now_datetime = now_datetime + 1 return prediction_datasets @@ -523,7 +676,7 @@ def create_and_save_image(data: xr.xarray, variable) -> Path: return save_image_path """ save_image_path = "image.png" - + data = data + 1 return save_image_path @@ -552,9 +705,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" @@ -577,16 +730,19 @@ def get_dataset_info(station_type: str, source: str) -> Dict: def path_to_dfr(path: str) -> pd.DataFrame: - """ Reads a csv or 
parquet file from the given path and returns a dataframe """ - if path.endswith(".csv"): - dfr = pd.read_csv(path) - elif path.endswith(".parquet"): - dfr = pd.read_parquet(path) - else: - raise ValueError("File extension not supported") + dfr = pd.DataFrame() + try: + if path.endswith(".csv"): + dfr = pd.read_csv(path) + elif path.endswith(".parquet"): + dfr = pd.read_parquet(path) + else: + raise ValueError("File extension not supported") + except AttributeError as error: + log(f"type(path) {type(path)} error {error}") return dfr From fad29543cc0643872302ab5ed03b906733065147 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 22:00:32 +0000 Subject: [PATCH 11/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/utils/gypscie/tasks.py | 8 +++----- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 90a2a231..b5c6b08c 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -145,19 +145,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 48feb596..03e014e5 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -7,13 +7,11 @@ import os import zipfile from pathlib import Path - from time import sleep from typing import Dict, List import numpy as np import pandas as pd - from basedosdados import Base from google.cloud import bigquery from prefect import task @@ -705,9 +703,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 631d375eaa025f43e1a5d4e97be3bc2bbcbfa477 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Wed, 30 Oct 2024 19:16:16 -0300 Subject: [PATCH 12/56] bugfix" --- .../meteorologia/precipitacao_alertario/flows.py | 14 ++++++++++---- pipelines/utils/gypscie/tasks.py | 13 ++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index b5c6b08c..3d594f1c 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ 
b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -47,7 +47,7 @@ # preprocessing imports from pipelines.utils.gypscie.tasks import ( # pylint: disable=E0611, E0401 access_api, - add_columns_on_dfr, + add_caracterization_columns_on_dfr, download_datasets_from_gypscie, execute_dataflow_on_gypscie, get_dataflow_alertario_params, @@ -145,13 +145,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -499,7 +505,7 @@ dataset_paths = unzip_files(ziped_dataset_paths) dfr_ = path_to_dfr(dataset_paths) # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_columns_on_dfr(dfr_, model_version, update_time=True) + dfr = add_caracterization_columns_on_dfr(dfr_, model_version, update_time=True) # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 03e014e5..341a3bdd 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -703,9 +703,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" @@ -744,12 +744,11 @@ def path_to_dfr(path: str) -> pd.DataFrame: return dfr -def add_columns_on_dfr( - dfr: pd.DataFrame, model_version: int, update_time: bool = False +def add_caracterization_columns_on_dfr( + dfr: pd.DataFrame, model_version: int = None, update_time: bool = False ) -> pd.DataFrame: """ - Reads a csv or parquet file from the given path and adds a column - with the update time based on Brazil timezone + Add a column with the update time based on Brazil timezone and model version """ if update_time: dfr["update_time"] = pd.Timestamp.now(tz="America/Sao_Paulo") From 88c5a3838ce6eb887bd693a53b7bec2389a7118c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 22:16:41 +0000 Subject: [PATCH 13/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/utils/gypscie/tasks.py | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 3d594f1c..e197cb31 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -145,19 +145,13 @@ 
######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 341a3bdd..8cb2affb 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -703,9 +703,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From a70fc67787a7bad305b20fd7f1ee9f41b2b3ef2b Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 08:01:33 -0300 Subject: [PATCH 14/56] trying to solve TypeError: object of type 'Parameter' has no len() --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 5 ++++- pipelines/utils/gypscie/tasks.py | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 7a51cbb1..6d145522 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -182,7 +182,7 @@ def treat_pluviometer_and_meteorological_data( def save_data( dfr: pd.DataFrame, columns: str = None, - treatment_version: str = "", + treatment_version: int = None, data_name: str = "temp", wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: @@ -190,6 +190,9 @@ def save_data( Salvar dfr tratados em csv para conseguir subir pro GCP """ + if not treatment_version: + treatment_version = "" + prepath = Path(f"/tmp/precipitacao_alertario/{data_name}") prepath.mkdir(parents=True, exist_ok=True) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 8cb2affb..b3421728 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -14,7 +14,7 @@ import pandas as pd from basedosdados import Base from google.cloud import bigquery -from prefect import task +from prefect import task, Parameter from prefect.engine.signals import ENDRUN from prefect.engine.state import Failed from prefeitura_rio.pipelines_utils.infisical import get_secret @@ -745,13 +745,16 @@ def path_to_dfr(path: str) -> pd.DataFrame: def add_caracterization_columns_on_dfr( - dfr: pd.DataFrame, model_version: int = None, update_time: bool = False + dfr: pd.DataFrame, model_version: None, update_time: bool = False ) -> pd.DataFrame: """ Add a column with the update time based on Brazil timezone and model version """ + if update_time: dfr["update_time"] = pd.Timestamp.now(tz="America/Sao_Paulo") if 
model_version is not None: + if isinstance(model_version, Parameter): + model_version = model_version.value dfr["model_version"] = model_version return dfr From ff0b24fc1f31c26daa0dec08ed830d9524ed9406 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 11:02:14 +0000 Subject: [PATCH 15/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/utils/gypscie/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index b3421728..33aafd8b 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -14,7 +14,7 @@ import pandas as pd from basedosdados import Base from google.cloud import bigquery -from prefect import task, Parameter +from prefect import Parameter, task from prefect.engine.signals import ENDRUN from prefect.engine.state import Failed from prefeitura_rio.pipelines_utils.infisical import get_secret From 96ed1ad4476cf72f16e21920beb8c962cc6e63fc Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 08:39:25 -0300 Subject: [PATCH 16/56] trying to solve TypeError: object of type 'Parameter' has no len() --- pipelines/utils/gypscie/tasks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 33aafd8b..804de0f1 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -754,7 +754,5 @@ def add_caracterization_columns_on_dfr( if update_time: dfr["update_time"] = pd.Timestamp.now(tz="America/Sao_Paulo") if model_version is not None: - if isinstance(model_version, Parameter): - model_version = model_version.value dfr["model_version"] = model_version return dfr From abdcc4b06c76d7b6b8cad778425fb07172a435b7 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 08:46:47 -0300 Subject: [PATCH 17/56] trying to solve TypeError: object of type 'Parameter' has no len() --- pipelines/utils/gypscie/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 804de0f1..1829ad2c 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -754,5 +754,6 @@ def add_caracterization_columns_on_dfr( if update_time: dfr["update_time"] = pd.Timestamp.now(tz="America/Sao_Paulo") if model_version is not None: - dfr["model_version"] = model_version + model_version_ = str(model_version) + dfr["model_version"] = model_version_ return dfr From be6872b6752b523173521eb9825bc4db67644b9b Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 08:54:56 -0300 Subject: [PATCH 18/56] bugfix --- pipelines/meteorologia/radar/mendanha/flows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index 8dfb47be..a28d2276 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -62,7 +62,7 @@ access_api as access_api_gypscie, # pylint: disable=E0611, E0401 ) from pipelines.utils.gypscie.tasks import ( - add_columns_on_dfr, + add_caracterization_columns_on_dfr, download_datasets_from_gypscie, execute_dataset_processor, get_dataset_info, @@ -306,7 +306,7 @@ ) dfr_ = path_to_dfr(dataset_path) # output_datasets_id = 
get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_columns_on_dfr(dfr_, treatment_version, update_time=True) + dfr = add_caracterization_columns_on_dfr(dfr_, treatment_version, update_time=True) # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() From b849697d8c1e931a84b28353d21bff5d70ce2261 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 09:47:15 -0300 Subject: [PATCH 19/56] fixing partition column name and save_data function --- .../precipitacao_alertario/flows.py | 19 ++++++++++++++----- .../precipitacao_alertario/tasks.py | 2 +- pipelines/utils/gypscie/tasks.py | 10 +++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index e197cb31..a611aeac 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -145,13 +145,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -160,7 +166,10 @@ with case(empty_data_pluviometric, False): path_pluviometric, full_path_pluviometric = save_data( - dfr_pluviometric, treatment_version, "pluviometric", wait=empty_data_pluviometric + dfr_pluviometric, + data_name="pluviometric", + treatment_version=treatment_version, + wait=empty_data_pluviometric, ) # Create table in BigQuery UPLOAD_TABLE = create_table_and_upload_to_gcs( @@ -393,7 +402,7 @@ # Save and materialize meteorological data with case(empty_data_meteorological, False): path_meteorological = save_data( - dfr_meteorological, "meteorological", wait=empty_data_meteorological + dfr_meteorological, data_name="meteorological", wait=empty_data_meteorological ) # Create table in BigQuery UPLOAD_TABLE_METEOROLOGICAL = create_table_and_upload_to_gcs( @@ -455,8 +464,8 @@ dfr_pluviometric_gypscie = convert_sp_timezone_to_utc(dfr_pluviometric) path_pluviometric_gypscie, full_path_pluviometric_gypscie = save_data( dfr_pluviometric_gypscie, - columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], data_name="gypscie", + columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], ) register_dataset_response = register_dataset_on_gypscie( api, filepath=path_pluviometric_gypscie, domain_id=domain_id diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 6d145522..62380fca 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -181,9 +181,9 @@ def treat_pluviometer_and_meteorological_data( @task(nout=2) def save_data( dfr: pd.DataFrame, + data_name: str = "temp", columns: str = None, treatment_version: int = None, - data_name: str = "temp", wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: """ 
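The fixes in this series all converge on one hand-off between the Alertario flow and Gypscie: convert the gauge timestamps to UTC, save the columns the Gypscie ETL consumes, register that file as a dataset, run the registered dataflow, then pull the output back for partitioning and upload. The following is a minimal sketch of that chain, not part of any commit, assuming the task signatures introduced in these patches; the numeric ids (workflow, function ids, metadata path, domain/project/environment) are the defaults wired into the schedules and are registrations on a specific Gypscie instance, so they are assumptions here.

# Illustrative sketch only; like the rest of flows.py this runs inside the
# `with Flow(...)` block, so each call builds a Prefect task in the DAG.
# `dfr_pluviometric` is assumed to be the treated dataframe produced earlier in the flow.
from pipelines.meteorologia.precipitacao_alertario.tasks import (
    convert_sp_timezone_to_utc,
    save_data,
)
from pipelines.utils.gypscie.tasks import (
    access_api,
    add_caracterization_columns_on_dfr,
    download_datasets_from_gypscie,
    execute_dataflow_on_gypscie,
    get_dataflow_alertario_params,
    get_dataset_name_on_gypscie,
    path_to_dfr,
    register_dataset_on_gypscie,
    unzip_files,
)

api = access_api()

# 1. Convert gauge timestamps from America/Sao_Paulo to UTC and save only the
#    columns the Gypscie workflow expects.
dfr_utc = convert_sp_timezone_to_utc(dfr_pluviometric)
_, full_path = save_data(
    dfr_utc,
    data_name="gypscie",
    columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"],
)

# 2. Register the saved file as a Gypscie dataset and assemble the dataflow
#    parameters (ids below are the schedule defaults, assumed for illustration).
dataset = register_dataset_on_gypscie(api, filepath=full_path, domain_id=1)
params = get_dataflow_alertario_params(
    workflow_id=41,
    environment_id=1,
    project_id=1,
    rain_gauge_data_id=dataset["id"],
    rain_gauge_metadata_path=227,
    load_data_funtion_id=53,
    parse_date_time_function_id=54,
    drop_duplicates_function_id=55,
    replace_inconsistent_values_function_id=56,
    add_lat_lon_function_id=57,
    save_data_function_id=58,
)

# 3. Execute the workflow, download and unzip its output datasets, and load the
#    first extracted file as a dataframe with the characterization columns added.
output_ids = execute_dataflow_on_gypscie(api, params)
dataset_names = get_dataset_name_on_gypscie(api, output_ids)
zipped = download_datasets_from_gypscie(api, dataset_names=dataset_names)
extracted = unzip_files(zipped)
dfr = add_caracterization_columns_on_dfr(
    path_to_dfr(extracted[0]), model_version=1, update_time=True
)

From there the flow partitions dfr with task_create_partitions and uploads it with create_table_and_upload_to_gcs, exactly as in the hunks above.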
diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 1829ad2c..fcd1eaf1 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -690,7 +690,7 @@ def get_dataset_info(station_type: str, source: str) -> Dict: dataset_info = { "dataset_id": "clima_pluviometro", "filename": "gauge_station_bq", - "partition_date_column": "datetime", + "partition_date_column": "data_medicao", } if source == "alertario": dataset_info["table_id"] = "taxa_precipitacao_alertario" @@ -699,13 +699,13 @@ def get_dataset_info(station_type: str, source: str) -> Dict: dataset_info = { "dataset_id": "clima_pluviometro", "filename": "weather_station_bq", - "partition_date_column": "datetime", + "partition_date_column": "data_medicao", } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 62781691baf7c6bfaed2c64e05738bab343b9a0f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 11:25:42 -0300 Subject: [PATCH 20/56] bugfix --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 62380fca..7793f53d 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -197,6 +197,8 @@ def save_data( prepath.mkdir(parents=True, exist_ok=True) partition_column = "data_medicao" + new_partition_columns = ["ano_particao", "mes_particao", "data_particao"] + dfr = dfr.drop(columns=[col for col in new_partition_columns if col in dfr.columns]) log(f"Dataframe for {data_name} before partitions {dfr.iloc[0]}") log(f"Dataframe for {data_name} before partitions {dfr.dtypes}") dataframe, partitions = parse_date_columns(dfr, partition_column) @@ -205,7 +207,7 @@ def save_data( log(f"Dataframe for {data_name} after partitions {dataframe.dtypes}") if columns: - dataframe = dataframe[columns] + dataframe = dataframe[columns + new_partition_columns] full_path = to_partitions( data=dataframe, From a46a02331430d819de0cbc9f46832e0c1c4a31ce Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 14:26:03 +0000 Subject: [PATCH 21/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/utils/gypscie/tasks.py | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index a611aeac..88474bb4 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -145,19 +145,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( 
dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index fcd1eaf1..df0e9b1f 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -703,9 +703,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 03b1de27bd4373125ffc1bd84c871bf5e04da55a Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 11:39:20 -0300 Subject: [PATCH 22/56] bugfix --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 7793f53d..a0b34047 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -214,7 +214,7 @@ def save_data( partition_columns=partitions, savepath=prepath, data_type="csv", - suffix=treatment_version + "_" + current_time, + suffix=str(treatment_version) + "_" + current_time, ) log(f"Files saved on {prepath}, full path is {full_path}") return prepath, full_path From 382527ff3fecd393ce674b82096da1778be8820f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 12:41:56 -0300 Subject: [PATCH 23/56] fixing gypscie api --- pipelines/constants.py | 1 + pipelines/utils/gypscie/tasks.py | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pipelines/constants.py b/pipelines/constants.py index b01c7b82..2134d604 100644 --- a/pipelines/constants.py +++ b/pipelines/constants.py @@ -115,6 +115,7 @@ class constants(Enum): } # Infisical + INFISICAL_PATH = "/gypscie_dexl" INFISICAL_URL = "URL" INFISICAL_USERNAME = "USERNAME" INFISICAL_PASSWORD = "PASSWORD" diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index df0e9b1f..3b23317e 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -12,13 +12,13 @@ import numpy as np import pandas as pd -from basedosdados import Base -from google.cloud import bigquery -from prefect import Parameter, task -from prefect.engine.signals import ENDRUN -from prefect.engine.state import Failed -from prefeitura_rio.pipelines_utils.infisical import get_secret -from prefeitura_rio.pipelines_utils.logging import log +from basedosdados import Base # pylint: disable=E0611, E0401 +from google.cloud import bigquery # pylint: disable=E0611, E0401 +from prefect import task # pylint: disable=E0611, E0401 +from prefect.engine.signals import ENDRUN # pylint: disable=E0611, E0401 +from prefect.engine.state import Failed # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.infisical 
import get_secret # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.logging import log # pylint: disable=E0611, E0401 from requests.exceptions import HTTPError from pipelines.constants import constants # pylint: disable=E0611, E0401 @@ -34,15 +34,18 @@ def access_api(): """# noqa E303 Acess api and return it to be used in other requests """ + infisical_path = constants.INFISICAL_PATH.value + infisical_url = constants.INFISICAL_URL.value infisical_username = constants.INFISICAL_USERNAME.value infisical_password = constants.INFISICAL_PASSWORD.value # username = get_secret(secret_name="USERNAME", path="/gypscie", environment="prod") # password = get_secret(secret_name="PASSWORD", path="/gypscie", environment="prod") - username = get_secret(infisical_username, path="/gypscie_dexl")[infisical_username] - password = get_secret(infisical_password, path="/gypscie_dexl")[infisical_password] - api = GypscieApi(username=username, password=password) + url = get_secret(infisical_url, path=infisical_path)[infisical_url] + username = get_secret(infisical_username, path=infisical_path)[infisical_username] + password = get_secret(infisical_password, path=infisical_path)[infisical_password] + api = GypscieApi(base_url=url, username=username, password=password) return api From fb946caefc68cfba8cb3da3932d7a73673478449 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 14:08:39 -0300 Subject: [PATCH 24/56] bugfix --- .../meteorologia/precipitacao_alertario/flows.py | 15 +++++++++++---- pipelines/utils/gypscie/tasks.py | 9 ++++----- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 88474bb4..b0c36327 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -13,6 +13,7 @@ wait_for_flow_run, ) from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 +# pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 create_table_and_upload_to_gcs, @@ -133,7 +134,7 @@ ) # Dataset parameters - station_type = Parameter("station_type", default="pluviometro", required=False) + station_type = Parameter("station_type", default="rain_gauge", required=False) source = Parameter("source", default="alertario", required=False) # Dataset path, if it was saved on ETL flow or it will be None @@ -145,13 +146,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -395,7 +402,7 @@ # Save and materialize meteorological data with case(empty_data_meteorological, False): - path_meteorological = save_data( + path_meteorological, full_path_meteorological = save_data( dfr_meteorological, 
data_name="meteorological", wait=empty_data_meteorological ) # Create table in BigQuery diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 3b23317e..8c024ca2 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -134,7 +134,6 @@ def register_dataset_on_gypscie(api, filepath: Path, domain_id: int = 1) -> Dict + "_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S"), # pylint: disable=use-maxsplit-arg } - log(type(data), data) files = { "files": open(file=filepath, mode="rb"), # pylint: disable=consider-using-with } @@ -706,9 +705,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" @@ -726,7 +725,7 @@ def get_dataset_info(station_type: str, source: str) -> Dict: elif source == "macae": dataset_info["storage_path"] = "" dataset_info["destination_table_id"] = "preprocessamento_radar_macae" - + log(f"Dataset info: {dataset_info}") return dataset_info From 0aab9121fa925b6a0957b473e0cf7e3bc1c2ca50 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 17:08:58 +0000 Subject: [PATCH 25/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../meteorologia/precipitacao_alertario/flows.py | 11 +++-------- pipelines/utils/gypscie/tasks.py | 10 ++++++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index b0c36327..e0bfdca5 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -13,6 +13,7 @@ wait_for_flow_run, ) from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 + # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 @@ -146,19 +147,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 8c024ca2..1c91a97e 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -17,7 +17,9 @@ from prefect import task # pylint: disable=E0611, E0401 from prefect.engine.signals import ENDRUN # pylint: disable=E0611, E0401 from prefect.engine.state import 
Failed # pylint: disable=E0611, E0401 -from prefeitura_rio.pipelines_utils.infisical import get_secret # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.infisical import ( + get_secret, # pylint: disable=E0611, E0401 +) from prefeitura_rio.pipelines_utils.logging import log # pylint: disable=E0611, E0401 from requests.exceptions import HTTPError @@ -705,9 +707,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 1288be10a71a419ef970e43bc3e40b739d14b00f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 14:32:30 -0300 Subject: [PATCH 26/56] bugfix --- .../meteorologia/precipitacao_alertario/flows.py | 13 ++++++++++--- .../precipitacao_alertario/schedules.py | 14 ++++++++++++-- pipelines/tasks.py | 11 ++++++----- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index e0bfdca5..ed1d6a43 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -147,13 +147,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -464,7 +470,7 @@ columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], ) register_dataset_response = register_dataset_on_gypscie( - api, filepath=path_pluviometric_gypscie, domain_id=domain_id + api, filepath=full_path_pluviometric_gypscie, domain_id=domain_id ) model_params = get_dataflow_alertario_params( @@ -513,6 +519,7 @@ partition_date_column=dataset_info["partition_date_column"], savepath="model_prediction", suffix=now_datetime, + wait=dfr, ) ################################ # Save preprocessing on GCP # diff --git a/pipelines/meteorologia/precipitacao_alertario/schedules.py b/pipelines/meteorologia/precipitacao_alertario/schedules.py index 2f1a74da..f928c357 100644 --- a/pipelines/meteorologia/precipitacao_alertario/schedules.py +++ b/pipelines/meteorologia/precipitacao_alertario/schedules.py @@ -27,17 +27,27 @@ "materialize_to_datario": False, "mode": "prod", "dump_to_gcs": False, + "maximum_bytes_processed": None, + "preprocessing_gypscie": True, + "workflow_id": 1, "environment_id": 1, "domain_id": 1, "project_id": 1, "project_name": "rionowcast_precipitation", + "treatment_version": 1, "processor_name": "etl_alertario22", "dataset_processor_id": 43, + "load_data_function_id": 53, + "parse_date_time_function_id": 54, + "drop_duplicates_function_id": 55, + 
"replace_inconsistent_values_function_id": 56, + "add_lat_lon_function_id": 57, + "save_data_function_id": 58, + "rain_gauge_metadata_path": 227, "dataset_id_previsao_chuva": "clima_previsao_chuva", "table_id_previsao_chuva": "preprocessamento_pluviometro_alertario", - "station_type": "pluviometro", + "station_type": "rain_gauge", "source": "alertario", - "maximum_bytes_processed": None, "model_version": 1, }, ), diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 80942dd9..557c808b 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -9,11 +9,11 @@ from typing import List, Union import pandas as pd -import pendulum -from google.cloud import storage -from prefect import task -from prefect.triggers import all_successful -from prefeitura_rio.pipelines_utils.infisical import get_secret +import pendulum # pylint: disable=E0611, E0401 +from google.cloud import storage # pylint: disable=E0611, E0401 +from prefect import task # pylint: disable=E0611, E0401 +from prefect.triggers import all_successful # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.infisical import get_secret # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.pandas import ( # pylint: disable=E0611, E0401 parse_date_columns, to_partitions, @@ -309,6 +309,7 @@ def task_create_partitions( suffix: str = None, build_json_dataframe: bool = False, dataframe_key_column: str = None, + wait=None, # pylint: disable=unused-argument ) -> Path: # sourcery skip: raise-specific-error """ Create task for to_partitions From 0601c6d67f220d93cea2046186e55dc909230342 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 15:29:39 -0300 Subject: [PATCH 27/56] bugfix --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 2 +- pipelines/tasks.py | 1 + pipelines/utils/gypscie/tasks.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index a0b34047..72e44c40 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -217,7 +217,7 @@ def save_data( suffix=str(treatment_version) + "_" + current_time, ) log(f"Files saved on {prepath}, full path is {full_path}") - return prepath, full_path + return prepath, full_path[0] @task diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 557c808b..0ac567b5 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -314,6 +314,7 @@ def task_create_partitions( """ Create task for to_partitions """ + log(f"Data before partition columns creation {data.iloc[0]}") data, partition_columns = parse_date_columns(data, partition_date_column) log(f"Created partition columns {partition_columns} and data first row now is {data.iloc[0]}") saved_files = to_partitions( diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 1c91a97e..9744e8b5 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -694,7 +694,7 @@ def get_dataset_info(station_type: str, source: str) -> Dict: dataset_info = { "dataset_id": "clima_pluviometro", "filename": "gauge_station_bq", - "partition_date_column": "data_medicao", + "partition_date_column": "datetime", } if source == "alertario": dataset_info["table_id"] = "taxa_precipitacao_alertario" @@ -703,7 +703,7 @@ def get_dataset_info(station_type: str, source: str) -> Dict: dataset_info = { "dataset_id": "clima_pluviometro", "filename": "weather_station_bq", - 
"partition_date_column": "data_medicao", + "partition_date_column": "datetime", } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" From 8d913cc9fc27de33d207e2e20749f37399a8c5ef Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 16:19:44 -0300 Subject: [PATCH 28/56] bugfix --- pipelines/meteorologia/precipitacao_alertario/flows.py | 3 ++- pipelines/meteorologia/precipitacao_alertario/schedules.py | 2 +- pipelines/meteorologia/precipitacao_alertario/tasks.py | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index ed1d6a43..6277c953 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -101,7 +101,7 @@ # Preprocessing gypscie parameters preprocessing_gypscie = Parameter("preprocessing_gypscie", default=False, required=False) # Gypscie parameters - workflow_id = Parameter("workflow_id", default=1, required=False) + workflow_id = Parameter("workflow_id", default=41, required=False) environment_id = Parameter("environment_id", default=1, required=False) domain_id = Parameter("domain_id", default=1, required=False) project_id = Parameter("project_id", default=1, required=False) @@ -468,6 +468,7 @@ dfr_pluviometric_gypscie, data_name="gypscie", columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], + data_type="parquet", ) register_dataset_response = register_dataset_on_gypscie( api, filepath=full_path_pluviometric_gypscie, domain_id=domain_id diff --git a/pipelines/meteorologia/precipitacao_alertario/schedules.py b/pipelines/meteorologia/precipitacao_alertario/schedules.py index f928c357..1fecfbb8 100644 --- a/pipelines/meteorologia/precipitacao_alertario/schedules.py +++ b/pipelines/meteorologia/precipitacao_alertario/schedules.py @@ -29,7 +29,7 @@ "dump_to_gcs": False, "maximum_bytes_processed": None, "preprocessing_gypscie": True, - "workflow_id": 1, + "workflow_id": 41, "environment_id": 1, "domain_id": 1, "project_id": 1, diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 72e44c40..22edcde3 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -184,6 +184,7 @@ def save_data( data_name: str = "temp", columns: str = None, treatment_version: int = None, + data_type: str = "csv", wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: """ @@ -213,7 +214,7 @@ def save_data( data=dataframe, partition_columns=partitions, savepath=prepath, - data_type="csv", + data_type=data_type, suffix=str(treatment_version) + "_" + current_time, ) log(f"Files saved on {prepath}, full path is {full_path}") From f0094ef081f3f195b5061fe81d25b86fe93a6af9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 19:20:06 +0000 Subject: [PATCH 29/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/tasks.py | 4 +++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 6277c953..6a6fd133 100644 --- 
a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -147,19 +147,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 0ac567b5..8318a46d 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -13,7 +13,9 @@ from google.cloud import storage # pylint: disable=E0611, E0401 from prefect import task # pylint: disable=E0611, E0401 from prefect.triggers import all_successful # pylint: disable=E0611, E0401 -from prefeitura_rio.pipelines_utils.infisical import get_secret # pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.infisical import ( + get_secret, # pylint: disable=E0611, E0401 +) from prefeitura_rio.pipelines_utils.pandas import ( # pylint: disable=E0611, E0401 parse_date_columns, to_partitions, From 9506ed07f75c348f5cd4e14d9c8b075cc84eda05 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 16:52:52 -0300 Subject: [PATCH 30/56] converting utc date to specific format --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 22edcde3..7ade0aed 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -378,5 +378,6 @@ def convert_sp_timezone_to_utc(dfr, data_column: str = "data_medicao") -> pd.Dat dfr[data_column] = pd.to_datetime(dfr[data_column]) dfr[data_column] = dfr[data_column].dt.tz_localize("America/Sao_Paulo") dfr[data_column] = dfr[data_column].dt.tz_convert("UTC") + dfr[data_column] = dfr[data_column].dt.strftime("%Y-%m-%d %H:%M:%S") return dfr From 26acf665a5c0aad0bd1a0bb93a6ec61c7a30a658 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 18:08:07 -0300 Subject: [PATCH 31/56] changing task_create_partitions --- .../precipitacao_alertario/flows.py | 22 +++++++++++---- pipelines/tasks.py | 28 ++++++++++++++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 6a6fd133..a5d686c5 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -147,13 +147,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = 
treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -503,19 +509,23 @@ dataset_names = get_dataset_name_on_gypscie(api, output_dataset_ids) # new ziped_dataset_paths = download_datasets_from_gypscie(api, dataset_names=dataset_names) dataset_paths = unzip_files(ziped_dataset_paths) - dfr_ = path_to_dfr(dataset_paths) + dfr_gypscie_ = path_to_dfr(dataset_paths) # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_caracterization_columns_on_dfr(dfr_, model_version, update_time=True) + dfr_gypscie = add_caracterization_columns_on_dfr( + dfr_gypscie_, model_version, update_time=True + ) # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() prediction_data_path = task_create_partitions( - dfr, + data=dfr_gypscie, partition_date_column=dataset_info["partition_date_column"], savepath="model_prediction", + preffix="dados_alertario", suffix=now_datetime, - wait=dfr, + wait=dfr_gypscie, ) + prediction_data_path.set_upstream(dfr_gypscie) ################################ # Save preprocessing on GCP # ################################ diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 8318a46d..4f31e3cf 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -13,8 +13,9 @@ from google.cloud import storage # pylint: disable=E0611, E0401 from prefect import task # pylint: disable=E0611, E0401 from prefect.triggers import all_successful # pylint: disable=E0611, E0401 + # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.infisical import ( - get_secret, # pylint: disable=E0611, E0401 + get_secret, ) from prefeitura_rio.pipelines_utils.pandas import ( # pylint: disable=E0611, E0401 parse_date_columns, @@ -308,26 +309,39 @@ def task_create_partitions( # partition_columns: List[str], savepath: str = "temp", data_type: str = "csv", + preffix: str = None, suffix: str = None, build_json_dataframe: bool = False, dataframe_key_column: str = None, wait=None, # pylint: disable=unused-argument -) -> Path: # sourcery skip: raise-specific-error +) -> List[Path]: # sourcery skip: raise-specific-error """ Create task for to_partitions """ + prepath = Path(f"/tmp/{savepath}") + prepath.mkdir(parents=True, exist_ok=True) + log(f"Data before partition columns creation {data.iloc[0]}") data, partition_columns = parse_date_columns(data, partition_date_column) log(f"Created partition columns {partition_columns} and data first row now is {data.iloc[0]}") - saved_files = to_partitions( + full_paths = to_partitions( data=data, partition_columns=partition_columns, - savepath=savepath, + savepath=prepath, data_type=data_type, suffix=suffix, build_json_dataframe=build_json_dataframe, dataframe_key_column=dataframe_key_column, ) - log(f"Partition saved files {saved_files}") - log(f"Returned path {savepath}, {type(savepath)}") - return Path(savepath) + if preffix: + new_paths = [] + for full_path in full_paths: + new_filename = full_path.name.replace("data_", f"{preffix}_data_") + savepath = full_path.with_name(new_filename) + + # Renomear o arquivo + full_path.rename(savepath) + new_paths.append(savepath) + full_paths = new_paths + log(f"Returned path {full_paths}, {type(full_paths)}") + return full_paths From 987ab9fe45e0442908114be28863e837024bf23a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 21:08:35 +0000 Subject: [PATCH 32/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/tasks.py | 7 +++---- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index a5d686c5..1a8c282a 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -147,19 +147,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 4f31e3cf..f2932a5c 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -13,10 +13,9 @@ from google.cloud import storage # pylint: disable=E0611, E0401 from prefect import task # pylint: disable=E0611, E0401 from prefect.triggers import all_successful # pylint: disable=E0611, E0401 - # pylint: disable=E0611, E0401 -from prefeitura_rio.pipelines_utils.infisical import ( - get_secret, -) + +# pylint: disable=E0611, E0401 +from prefeitura_rio.pipelines_utils.infisical import get_secret from prefeitura_rio.pipelines_utils.pandas import ( # pylint: disable=E0611, E0401 parse_date_columns, to_partitions, From 14e3b8713fc95426e53344f8d98ae43b82d159da Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 18:33:49 -0300 Subject: [PATCH 33/56] bugfix --- .../precipitacao_alertario/tasks.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 7ade0aed..45fbf897 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# pylint: disable=C0103,R0914 +# pylint: disable=C0103,R0914,R0913 """ Tasks for precipitacao_alertario """ @@ -185,6 +185,7 @@ def save_data( columns: str = None, treatment_version: int = None, data_type: str = "csv", + preffix: str = None, wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: """ @@ -210,15 +211,26 @@ def save_data( if columns: dataframe = dataframe[columns + new_partition_columns] - full_path = to_partitions( + full_paths = to_partitions( data=dataframe, partition_columns=partitions, savepath=prepath, data_type=data_type, suffix=str(treatment_version) + "_" + current_time, ) - log(f"Files saved on {prepath}, full path is {full_path}") - return prepath, full_path[0] + if preffix: + log(f"Adding preffix {preffix} on {full_paths}") + new_paths = [] + for full_path in full_paths: + new_filename = full_path.name.replace("data_", f"{preffix}_data_") + savepath 
= full_path.with_name(new_filename) + + # Renomear o arquivo + full_path.rename(savepath) + new_paths.append(savepath) + full_paths = new_paths + log(f"Files saved on {prepath}, full paths are {full_paths}") + return prepath, full_paths[0] @task From e66fce73b25838ea8c395bc02b6848929e70c336 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Thu, 31 Oct 2024 18:47:30 -0300 Subject: [PATCH 34/56] bugfix --- pipelines/meteorologia/precipitacao_alertario/flows.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 1a8c282a..87d61c11 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -463,6 +463,7 @@ data_name="gypscie", columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], data_type="parquet", + preffix="dados_alertario", ) register_dataset_response = register_dataset_on_gypscie( api, filepath=full_path_pluviometric_gypscie, domain_id=domain_id @@ -515,7 +516,6 @@ data=dfr_gypscie, partition_date_column=dataset_info["partition_date_column"], savepath="model_prediction", - preffix="dados_alertario", suffix=now_datetime, wait=dfr_gypscie, ) From bda27b735164a7571bd2b3924b9dc8fda07d27eb Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 10:39:29 -0300 Subject: [PATCH 35/56] renaming file to dados_alertario_raw --- .../meteorologia/precipitacao_alertario/flows.py | 3 ++- .../meteorologia/precipitacao_alertario/tasks.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 87d61c11..6f7610f0 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -463,7 +463,8 @@ data_name="gypscie", columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], data_type="parquet", - preffix="dados_alertario", + rename="dados_alertario_raw", + suffix=False, ) register_dataset_response = register_dataset_on_gypscie( api, filepath=full_path_pluviometric_gypscie, domain_id=domain_id diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 45fbf897..497765f3 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -186,6 +186,8 @@ def save_data( treatment_version: int = None, data_type: str = "csv", preffix: str = None, + suffix: bool = True, + rename: str = None, wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: """ @@ -194,6 +196,8 @@ def save_data( if not treatment_version: treatment_version = "" + else: + treatment_version = str(treatment_version) + "_" prepath = Path(f"/tmp/precipitacao_alertario/{data_name}") prepath.mkdir(parents=True, exist_ok=True) @@ -204,10 +208,11 @@ def save_data( log(f"Dataframe for {data_name} before partitions {dfr.iloc[0]}") log(f"Dataframe for {data_name} before partitions {dfr.dtypes}") dataframe, partitions = parse_date_columns(dfr, partition_column) - current_time = pendulum.now("America/Sao_Paulo").strftime("%Y%m%d%H%M") log(f"Dataframe for {data_name} after partitions {dataframe.iloc[0]}") log(f"Dataframe for {data_name} after partitions {dataframe.dtypes}") + if suffix: + suffix = 
pendulum.now("America/Sao_Paulo").strftime("%Y%m%d%H%M") if columns: dataframe = dataframe[columns + new_partition_columns] @@ -216,13 +221,14 @@ def save_data( partition_columns=partitions, savepath=prepath, data_type=data_type, - suffix=str(treatment_version) + "_" + current_time, + suffix=suffix, ) - if preffix: + if preffix or rename: log(f"Adding preffix {preffix} on {full_paths}") new_paths = [] for full_path in full_paths: - new_filename = full_path.name.replace("data_", f"{preffix}_data_") + change_filename = f"{preffix}_data_" if preffix else rename + new_filename = full_path.name.replace("data_", change_filename) savepath = full_path.with_name(new_filename) # Renomear o arquivo @@ -230,6 +236,7 @@ def save_data( new_paths.append(savepath) full_paths = new_paths log(f"Files saved on {prepath}, full paths are {full_paths}") + # TODO alterar funções seguintes para receberem uma lista em vez de ter o full_paths[0] return prepath, full_paths[0] From d7c8e29dfb2ca90a64b078f74bacc650fbb677a0 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 10:57:18 -0300 Subject: [PATCH 36/56] bugfix" --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index 497765f3..c93b0456 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -194,10 +194,7 @@ def save_data( Salvar dfr tratados em csv para conseguir subir pro GCP """ - if not treatment_version: - treatment_version = "" - else: - treatment_version = str(treatment_version) + "_" + treatment_version = str(treatment_version) + "_" if treatment_version else "" prepath = Path(f"/tmp/precipitacao_alertario/{data_name}") prepath.mkdir(parents=True, exist_ok=True) @@ -206,13 +203,10 @@ def save_data( new_partition_columns = ["ano_particao", "mes_particao", "data_particao"] dfr = dfr.drop(columns=[col for col in new_partition_columns if col in dfr.columns]) log(f"Dataframe for {data_name} before partitions {dfr.iloc[0]}") - log(f"Dataframe for {data_name} before partitions {dfr.dtypes}") dataframe, partitions = parse_date_columns(dfr, partition_column) log(f"Dataframe for {data_name} after partitions {dataframe.iloc[0]}") - log(f"Dataframe for {data_name} after partitions {dataframe.dtypes}") - if suffix: - suffix = pendulum.now("America/Sao_Paulo").strftime("%Y%m%d%H%M") + suffix = pendulum.now("America/Sao_Paulo").strftime("%Y%m%d%H%M") if suffix else None if columns: dataframe = dataframe[columns + new_partition_columns] From 0b2ad3d803fc2193935da00e3ce1e1384c870f75 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 11:13:54 -0300 Subject: [PATCH 37/56] bugfix" --- pipelines/meteorologia/precipitacao_alertario/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index c93b0456..cdbe5fc1 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -221,8 +221,8 @@ def save_data( log(f"Adding preffix {preffix} on {full_paths}") new_paths = [] for full_path in full_paths: - change_filename = f"{preffix}_data_" if preffix else rename - new_filename = full_path.name.replace("data_", change_filename) + change_filename = f"{preffix}_data" if preffix 
else rename + new_filename = full_path.name.replace("data", change_filename) savepath = full_path.with_name(new_filename) # Renomear o arquivo From 3856421c1c2dee0f3585b5bfcd66f325d665157f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 11:57:03 -0300 Subject: [PATCH 38/56] changing column type before registring dataset on gypscie --- .../precipitacao_alertario/flows.py | 16 +++++++-- pipelines/utils/gypscie/tasks.py | 34 ++++++++++++++++--- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 6f7610f0..3fbdcb28 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -50,6 +50,7 @@ from pipelines.utils.gypscie.tasks import ( # pylint: disable=E0611, E0401 access_api, add_caracterization_columns_on_dfr, + convert_columns_type, download_datasets_from_gypscie, execute_dataflow_on_gypscie, get_dataflow_alertario_params, @@ -147,13 +148,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -457,7 +464,10 @@ dataset_processor_response, dataset_processor_id = get_dataset_processor_info( api, processor_name ) - dfr_pluviometric_gypscie = convert_sp_timezone_to_utc(dfr_pluviometric) + dfr_pluviometric_converted = convert_columns_type( + dfr_pluviometric, columns=["id_estacao"], new_types=[int] + ) + dfr_pluviometric_gypscie = convert_sp_timezone_to_utc(dfr_pluviometric_converted) path_pluviometric_gypscie, full_path_pluviometric_gypscie = save_data( dfr_pluviometric_gypscie, data_name="gypscie", diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 9744e8b5..2b3e79a3 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -17,8 +17,9 @@ from prefect import task # pylint: disable=E0611, E0401 from prefect.engine.signals import ENDRUN # pylint: disable=E0611, E0401 from prefect.engine.state import Failed # pylint: disable=E0611, E0401 +# pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.infisical import ( - get_secret, # pylint: disable=E0611, E0401 + get_secret, ) from prefeitura_rio.pipelines_utils.logging import log # pylint: disable=E0611, E0401 from requests.exceptions import HTTPError @@ -707,9 +708,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" @@ -761,3 +762,28 @@ def add_caracterization_columns_on_dfr( 
model_version_ = str(model_version) dfr["model_version"] = model_version_ return dfr + + +@task +def convert_columns_type( + dfr: pd.DataFrame, columns: list = None, new_types: list = None +) -> pd.DataFrame: + """ + Converts specified columns in a DataFrame to the provided data types. + + Parameters: + dfr (pd.DataFrame): The input DataFrame to modify. + columns (list): List of column names to be converted. + new_types (list): List of target data types for each column, in the same order as `columns`. + + Returns: + pd.DataFrame: The modified DataFrame with columns converted to specified types. + """ + if len(columns) != len(new_types): + raise ValueError("The lists `columns` and `new_types` must be of the same length.") + + for col, new_type in zip(columns, new_types): + if col in dfr.columns: + dfr[col] = dfr[col].astype(new_type) + + return dfr From f55b2b5791955d74ffd95ee7d11785e80297a2f3 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 13:15:15 -0300 Subject: [PATCH 39/56] changin return of get function --- pipelines/utils/gypscie/utils.py | 6 +++++- pipelines/utils_api.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pipelines/utils/gypscie/utils.py b/pipelines/utils/gypscie/utils.py index f5873dc6..f4a0627a 100644 --- a/pipelines/utils/gypscie/utils.py +++ b/pipelines/utils/gypscie/utils.py @@ -10,6 +10,7 @@ import basedosdados as bd import requests +import simplejson from prefeitura_rio.pipelines_utils.logging import log @@ -88,7 +89,10 @@ def get(self, path: str, timeout: int = 120) -> Dict: self._refresh_token_if_needed() response = requests.get(f"{self._base_url}{path}", headers=self._headers, timeout=timeout) response.raise_for_status() - return response.json() + try: + return response.json() + except simplejson.JSONDecodeError: + return response def put(self, path, json=None): """ diff --git a/pipelines/utils_api.py b/pipelines/utils_api.py index 2ba46b0b..f600d901 100644 --- a/pipelines/utils_api.py +++ b/pipelines/utils_api.py @@ -8,6 +8,7 @@ from typing import Callable, Dict, Tuple # , List import requests +import simplejson from prefeitura_rio.pipelines_utils.logging import log @@ -101,7 +102,10 @@ def get(self, path: str, timeout: int = 120) -> Dict: self._refresh_token_if_needed() response = requests.get(f"{self._base_url}{path}", headers=self._headers, timeout=timeout) response.raise_for_status() - return response.json() + try: + return response.json() + except simplejson.JSONDecodeError: + return response def put(self, path, json_data=None): """ From 1d714c98dd36a5b90579b848eea04264313b161a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 16:15:38 +0000 Subject: [PATCH 40/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/utils/gypscie/tasks.py | 11 +++++------ 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 3fbdcb28..b4f3c769 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -148,19 +148,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, 
empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 2b3e79a3..fa63eef0 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -17,10 +17,9 @@ from prefect import task # pylint: disable=E0611, E0401 from prefect.engine.signals import ENDRUN # pylint: disable=E0611, E0401 from prefect.engine.state import Failed # pylint: disable=E0611, E0401 + # pylint: disable=E0611, E0401 -from prefeitura_rio.pipelines_utils.infisical import ( - get_secret, -) +from prefeitura_rio.pipelines_utils.infisical import get_secret from prefeitura_rio.pipelines_utils.logging import log # pylint: disable=E0611, E0401 from requests.exceptions import HTTPError @@ -708,9 +707,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 879e3600ceac08bbbed55b478decad71fb0d60e2 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 13:26:21 -0300 Subject: [PATCH 41/56] adding task to functions --- pipelines/utils/gypscie/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index fa63eef0..fed2f6bb 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -731,6 +731,7 @@ def get_dataset_info(station_type: str, source: str) -> Dict: return dataset_info +@task def path_to_dfr(path: str) -> pd.DataFrame: """ Reads a csv or parquet file from the given path and returns a dataframe @@ -748,6 +749,7 @@ def path_to_dfr(path: str) -> pd.DataFrame: return dfr +@task def add_caracterization_columns_on_dfr( dfr: pd.DataFrame, model_version: None, update_time: bool = False ) -> pd.DataFrame: From 6929b4645f16f055505d4499b3e13d12fc0a982f Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 13:53:31 -0300 Subject: [PATCH 42/56] bugfix --- pipelines/utils/gypscie/tasks.py | 42 ++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index fed2f6bb..2b7f2417 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -707,9 +707,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info[ - "destination_table_id" - ] = "preprocessamento_estacao_meteorologica_alertario" + dataset_info["destination_table_id"] = ( + "preprocessamento_estacao_meteorologica_alertario" + ) elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = 
"preprocessamento_estacao_meteorologica_inmet" @@ -732,20 +732,31 @@ def get_dataset_info(station_type: str, source: str) -> Dict: @task -def path_to_dfr(path: str) -> pd.DataFrame: +def path_to_dfr(paths: List[str]) -> pd.DataFrame: """ - Reads a csv or parquet file from the given path and returns a dataframe + Reads csvs or parquets filess from the given paths and returns a concatenated dataframe. """ - dfr = pd.DataFrame() - try: - if path.endswith(".csv"): - dfr = pd.read_csv(path) - elif path.endswith(".parquet"): - dfr = pd.read_parquet(path) - else: - raise ValueError("File extension not supported") - except AttributeError as error: - log(f"type(path) {type(path)} error {error}") + log(f"Start converting files from {paths} to a df.") + dataframes = [] + + for path in paths: + try: + if path.endswith(".csv"): + dfr_ = pd.read_csv(path) + elif path.endswith(".parquet"): + dfr_ = pd.read_parquet(path) + else: + raise ValueError(f"File extension not supported for file: {path}") + dataframes.append(dfr_) + + except AttributeError as error: + log(f"type(path) {type(path)} error {error}") + + if dataframes: + dfr = pd.concat(dataframes, ignore_index=True) + else: + dfr = pd.DataFrame() + log(f"Dataframe : {dfr.iloc[0]}") return dfr @@ -762,6 +773,7 @@ def add_caracterization_columns_on_dfr( if model_version is not None: model_version_ = str(model_version) dfr["model_version"] = model_version_ + log(f"Dataframe with new columns {dfr.iloc[0]}") return dfr From 0f73172ec76841cc5c1da1d2ff3ded01354ff73c Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 15:16:15 -0300 Subject: [PATCH 43/56] returning a path instead a list on task_create_partitions --- pipelines/meteorologia/precipitacao_alertario/flows.py | 2 +- pipelines/tasks.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index b4f3c769..c32aeb83 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -524,7 +524,7 @@ suffix=now_datetime, wait=dfr_gypscie, ) - prediction_data_path.set_upstream(dfr_gypscie) + ################################ # Save preprocessing on GCP # ################################ diff --git a/pipelines/tasks.py b/pipelines/tasks.py index f2932a5c..c0bd97bc 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -313,7 +313,7 @@ def task_create_partitions( build_json_dataframe: bool = False, dataframe_key_column: str = None, wait=None, # pylint: disable=unused-argument -) -> List[Path]: # sourcery skip: raise-specific-error +) -> Path: # sourcery skip: raise-specific-error """ Create task for to_partitions """ @@ -343,4 +343,4 @@ def task_create_partitions( new_paths.append(savepath) full_paths = new_paths log(f"Returned path {full_paths}, {type(full_paths)}") - return full_paths + return full_paths[0] From a43d0444b272186dd648809eb4174ebb0e2965f5 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Fri, 1 Nov 2024 15:39:41 -0300 Subject: [PATCH 44/56] changing path where to save table --- .../meteorologia/precipitacao_alertario/flows.py | 12 +++++++++--- pipelines/tasks.py | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index c32aeb83..85a08f93 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ 
b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -148,13 +148,19 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( + ( + dfr_pluviometric, + empty_data_pluviometric, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( + ( + dfr_meteorological, + empty_data_meteorological, + ) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, @@ -517,7 +523,7 @@ # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() - prediction_data_path = task_create_partitions( + prediction_data_path, prediction_data_full_path = task_create_partitions( data=dfr_gypscie, partition_date_column=dataset_info["partition_date_column"], savepath="model_prediction", diff --git a/pipelines/tasks.py b/pipelines/tasks.py index c0bd97bc..801c43bc 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -6,7 +6,7 @@ import json from pathlib import Path -from typing import List, Union +from typing import List, Union, Tuple import pandas as pd import pendulum # pylint: disable=E0611, E0401 @@ -301,7 +301,7 @@ def save_dataframe( return prepath -@task +@task(nout=2) def task_create_partitions( data: pd.DataFrame, partition_date_column: str, @@ -313,7 +313,7 @@ def task_create_partitions( build_json_dataframe: bool = False, dataframe_key_column: str = None, wait=None, # pylint: disable=unused-argument -) -> Path: # sourcery skip: raise-specific-error +) -> Tuple[Union[str, Path], Union[str, Path]]: # sourcery skip: raise-specific-error """ Create task for to_partitions """ @@ -343,4 +343,4 @@ def task_create_partitions( new_paths.append(savepath) full_paths = new_paths log(f"Returned path {full_paths}, {type(full_paths)}") - return full_paths[0] + return prepath, full_paths[0] From 0649aa087e44911341676163a67bef44a9b94a39 Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 4 Nov 2024 10:38:16 -0300 Subject: [PATCH 45/56] adding rename function and adapting code to treat radar data on gypscie --- .../meteorologia/radar/mendanha/flows.py | 83 ++++++++++++------- .../meteorologia/radar/mendanha/schedules.py | 18 ++++ .../meteorologia/radar/mendanha/utils.py | 16 ++-- 3 files changed, 78 insertions(+), 39 deletions(-) diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index a28d2276..e36433ff 100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -8,6 +8,7 @@ from prefect.run_configs import KubernetesRun # pylint: disable=E0611, E0401 from prefect.storage import GCS # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 + # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 create_table_and_upload_to_gcs, @@ -64,12 +65,15 @@ from pipelines.utils.gypscie.tasks import ( add_caracterization_columns_on_dfr, download_datasets_from_gypscie, - execute_dataset_processor, + execute_dataflow_on_gypscie, + get_dataflow_mendanha_params, get_dataset_info, + 
get_dataset_name_on_gypscie, get_dataset_processor_info, path_to_dfr, register_dataset_on_gypscie, - task_wait_run, + rename_files, + unzip_files, ) # create_visualization_with_background, prefix_to_restore, save_data, @@ -104,16 +108,25 @@ # Preprocessing gypscie parameters preprocessing_gypscie = Parameter("preprocessing_gypscie", default=False, required=False) + processor_name = Parameter("processor_name", default="etl_alertario22", required=True) + dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar + workflow_id = Parameter("workflow_id", default=40, required=False) + + load_data_function_id = Parameter("load_data_function_id", default=46, required=False) + filter_data_function_id = Parameter("filter_data_function_id", default=47, required=False) + parse_date_time_function_id = Parameter( + "parse_date_time_function_id", default=48, required=False + ) + aggregate_data_function_id = Parameter("aggregate_data_function_id", default=49, required=False) + save_data_function_id = Parameter("save_data_function_id", default=50, required=False) + model_version = Parameter("model_version", default=1, required=False) + # Gypscie parameters environment_id = Parameter("environment_id", default=1, required=False) domain_id = Parameter("domain_id", default=1, required=False) project_id = Parameter("project_id", default=1, required=False) project_name = Parameter("project_name", default="rionowcast_precipitation", required=False) - # Gypscie processor parameters - processor_name = Parameter("processor_name", default="etl_alertario22", required=True) - dataset_processor_id = Parameter("dataset_processor_id", default=43, required=False) # mudar - # Parameters for saving data on GCP materialize_after_dump = Parameter("materialize_after_dump", default=False, required=False) dump_mode = Parameter("dump_mode", default=False, required=False) @@ -152,7 +165,8 @@ files_to_download=files_on_storage_list, destination_path="temp/", ) - radar = task_open_radar_file(radar_files[0]) + uncompressed_files = unzip_files(radar_files) + radar = task_open_radar_file(uncompressed_files[0]) grid_shape, grid_limits = get_radar_parameters(radar) radar_2d = remap_data(radar, RADAR_PRODUCT_LIST, grid_shape, grid_limits) @@ -267,9 +281,9 @@ ) # save_last_update_redis.set_upstream(upload_table) - #################################### - # Start preprocessing flow # - #################################### + ###################################### + # Start gypscie preprocessing flow # + ###################################### with case(preprocessing_gypscie, True): api_gypscie = access_api_gypscie() @@ -283,35 +297,44 @@ ) # TODO: ao salvar o nome do radar_files salvar com sufixo treatment_version # pq te que ser unico no gypscie - dataset_response = register_dataset_on_gypscie( - api_gypscie, filepath=radar_files, domain_id=domain_id + # for now, all files to be processe has to have the name defined on default_value + # when the workflow was saved on gypscie. In this case default_value = "9921GUA_PPIVol.hdf" + # Gypscie will give a different name for zip file, but the inside file will have the name for all. 
+ renamed_files = rename_files( + uncompressed_files, original_name=uncompressed_files[0], rename="9921GUA_PPIVol.hdf" ) - # TODO: verifcar no codigo do augustp se são esses os parametros corretos - processor_parameters = { - "dataset1": str(dataset_path).rsplit("/", maxsplit=1)[-1], - "station_type": station_type, - } - - dataset_processor_task_id = execute_dataset_processor( - api_gypscie, - processor_id=dataset_processor_id, - dataset_id=[dataset_response["id"]], + register_dataset_response = register_dataset_on_gypscie( + api_gypscie, filepath=renamed_files[0], domain_id=domain_id + ) + model_params = get_dataflow_mendanha_params( + workflow_id=workflow_id, environment_id=environment_id, project_id=project_id, - parameters=processor_parameters, + radar_data_id=register_dataset_response["id"], + load_data_function_id=load_data_function_id, + filter_data_function_id=filter_data_function_id, + parse_date_time_function_id=parse_date_time_function_id, + agregate_data_function_id=aggregate_data_function_id, + save_data_function_id=save_data_function_id, ) - wait_run = task_wait_run(api_gypscie, dataset_processor_task_id, flow_type="processor") - dataset_path = download_datasets_from_gypscie( - api_gypscie, dataset_names=[dataset_response["id"]], wait=wait_run + + output_dataset_ids = execute_dataflow_on_gypscie( + api_gypscie, + model_params, ) - dfr_ = path_to_dfr(dataset_path) + dataset_names = get_dataset_name_on_gypscie(api, output_dataset_ids) # new + ziped_dataset_paths = download_datasets_from_gypscie(api, dataset_names=dataset_names) + dataset_paths = unzip_files(ziped_dataset_paths) + dfr_gypscie_ = path_to_dfr(dataset_paths) # output_datasets_id = get_output_dataset_ids_on_gypscie(api, dataset_processor_task_id) - dfr = add_caracterization_columns_on_dfr(dfr_, treatment_version, update_time=True) + dfr_gypscie = add_caracterization_columns_on_dfr( + dfr_gypscie_, model_version, update_time=True + ) # Save pre-treated data on local file with partitions now_datetime = get_now_datetime() - prediction_data_path = task_create_partitions( - dfr, + prediction_data_path, prediction_data_full_path = task_create_partitions( + dfr_gypscie, partition_date_column=dataset_info["partition_date_column"], savepath="model_prediction", suffix=now_datetime, diff --git a/pipelines/meteorologia/radar/mendanha/schedules.py b/pipelines/meteorologia/radar/mendanha/schedules.py index d7202218..df218b59 100644 --- a/pipelines/meteorologia/radar/mendanha/schedules.py +++ b/pipelines/meteorologia/radar/mendanha/schedules.py @@ -28,6 +28,24 @@ "save_image_with_background": False, "save_image_without_colorbar": True, "save_image_with_colorbar": True, + "preprocessing_gypscie": True, + "workflow_id": 40, + "environment_id": 1, + "domain_id": 1, + "project_id": 1, + "project_name": "rionowcast_precipitation", + "processor_name": "etl_inea_radar", + "dataset_processor_id": 43, + "load_data_function_id": 46, + "filter_data_function_id": 47, + "parse_date_time_function_id": 48, + "aggregate_data_function_id": 49, + "save_data_function_id": 50, + "dataset_id_previsao_chuva": "clima_previsao_chuva", + "table_id_previsao_chuva": "preprocessamento_radar_mendanha", + "station_type": "radar", + "source": "mendanha", + "model_version": 1, }, ), ] diff --git a/pipelines/meteorologia/radar/mendanha/utils.py b/pipelines/meteorologia/radar/mendanha/utils.py index 588cc953..ea8b2ad1 100644 --- a/pipelines/meteorologia/radar/mendanha/utils.py +++ b/pipelines/meteorologia/radar/mendanha/utils.py @@ -4,18 +4,16 @@ General utils for 
setting rain dashboard using radar data. """ import base64 -import gzip import io import os import re -import shutil from datetime import datetime from pathlib import Path from typing import Union import matplotlib.colors as mcolors import numpy as np -import pyart +import pyart # pylint: disable=E0611, E0401 def extract_timestamp(filename) -> datetime: @@ -48,12 +46,12 @@ def open_radar_file(file_path: Union[str, Path]) -> Union[pyart.core.Radar, None Radar object. """ file_path = str(file_path) - if file_path.endswith(".gz"): - uncompressed_file_path = file_path[:-3] - with gzip.open(file_path, "rb") as f_in: - with open(uncompressed_file_path, "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - file_path = uncompressed_file_path + # if file_path.endswith(".gz"): + # uncompressed_file_path = file_path[:-3] + # with gzip.open(file_path, "rb") as f_in: + # with open(uncompressed_file_path, "wb") as f_out: + # shutil.copyfileobj(f_in, f_out) + # file_path = uncompressed_file_path try: opened_file = pyart.aux_io.read_odim_h5(file_path) From b4ecf9ec682b0184d9905fb8e096b4d65caca3bd Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 4 Nov 2024 10:48:22 -0300 Subject: [PATCH 46/56] forgot to add other changes --- .../precipitacao_alertario/flows.py | 7 +- .../precipitacao_alertario/tasks.py | 30 ++-- pipelines/utils/gypscie/tasks.py | 132 +++++++++++++++--- 3 files changed, 135 insertions(+), 34 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 85a08f93..3ffe774d 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -59,6 +59,7 @@ get_dataset_processor_info, path_to_dfr, register_dataset_on_gypscie, + rename_files, unzip_files, ) @@ -473,11 +474,13 @@ data_name="gypscie", columns=["id_estacao", "data_medicao", "acumulado_chuva_5min"], data_type="parquet", - rename="dados_alertario_raw", suffix=False, ) + full_path_pluviometric_gypscie_ = rename_files( + full_path_pluviometric_gypscie, rename="dados_alertario_raw" + ) register_dataset_response = register_dataset_on_gypscie( - api, filepath=full_path_pluviometric_gypscie, domain_id=domain_id + api, filepath=full_path_pluviometric_gypscie_[0], domain_id=domain_id ) model_params = get_dataflow_alertario_params( diff --git a/pipelines/meteorologia/precipitacao_alertario/tasks.py b/pipelines/meteorologia/precipitacao_alertario/tasks.py index cdbe5fc1..e29f4865 100644 --- a/pipelines/meteorologia/precipitacao_alertario/tasks.py +++ b/pipelines/meteorologia/precipitacao_alertario/tasks.py @@ -185,9 +185,9 @@ def save_data( columns: str = None, treatment_version: int = None, data_type: str = "csv", - preffix: str = None, + # preffix: str = None, suffix: bool = True, - rename: str = None, + # rename: str = None, wait=None, # pylint: disable=unused-argument ) -> Tuple[Union[str, Path], Union[str, Path]]: """ @@ -217,21 +217,21 @@ def save_data( data_type=data_type, suffix=suffix, ) - if preffix or rename: - log(f"Adding preffix {preffix} on {full_paths}") - new_paths = [] - for full_path in full_paths: - change_filename = f"{preffix}_data" if preffix else rename - new_filename = full_path.name.replace("data", change_filename) - savepath = full_path.with_name(new_filename) - - # Renomear o arquivo - full_path.rename(savepath) - new_paths.append(savepath) - full_paths = new_paths + # if preffix or rename: + # log(f"Adding preffix {preffix} on {full_paths}") + # new_paths = [] + # 
for full_path in full_paths: + # change_filename = f"{preffix}_data" if preffix else rename + # new_filename = full_path.name.replace("data", change_filename) + # savepath = full_path.with_name(new_filename) + + # # Renomear o arquivo + # full_path.rename(savepath) + # new_paths.append(savepath) + # full_paths = new_paths log(f"Files saved on {prepath}, full paths are {full_paths}") # TODO alterar funções seguintes para receberem uma lista em vez de ter o full_paths[0] - return prepath, full_paths[0] + return prepath, full_paths @task diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index 2b7f2417..f08c0724 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -4,11 +4,13 @@ Tasks """ import datetime +import gzip import os +import shutil import zipfile from pathlib import Path from time import sleep -from typing import Dict, List +from typing import Dict, List, Union import numpy as np import pandas as pd @@ -417,6 +419,74 @@ def get_dataflow_alertario_params( # pylint: disable=too-many-arguments } +@task +def get_dataflow_mendanha_params( # pylint: disable=too-many-arguments + workflow_id, + environment_id, + project_id, + radar_data_id, + load_data_function_id, + filter_data_function_id, + parse_date_time_function_id, + agregate_data_function_id, + save_data_function_id, +) -> List: + """ + Return parameters for the Mendanha radar's ETL + + data = { + "workflow_id": 40, + "environment_id": 1, + "parameters": [ + { + "function_id":46, # load_data + "params": {"radar_data_path":213} + }, + { + "function_id":47 # filter_data + }, + { + "function_id":48 # parse_date_time + }, + { + "function_id":49 # aggregate_data + }, + { + "function_id":50, # save_data + "params": {"output_path":"9921GUA_PPIVol_20230112_190010_0000.parquet"} + } + ], + "project_id": 1 + } + """ + return { + "workflow_id": workflow_id, + "environment_id": environment_id, + "parameters": [ + { + "function_id": load_data_function_id, + "params": { + "radar_data_path": radar_data_id, + }, + }, + { + "function_id": filter_data_function_id, + }, + { + "function_id": parse_date_time_function_id, + }, + { + "function_id": agregate_data_function_id, + }, + { + "function_id": save_data_function_id, + "params": {"output_path": "preprocessed_data_radar_mendanha.parquet"}, + }, + ], + "project_id": project_id, + } + + @task def get_dataflow_params( # pylint: disable=too-many-arguments workflow_id, @@ -544,27 +614,28 @@ def download_datasets_from_gypscie( @task -def unzip_files(zip_files: List[str], destination_folder: str = "./") -> List[str]: +def unzip_files(compressed_files: List[str], destination_folder: str = "./") -> List[str]: """ - Unzip files to destination folder + Unzip .zip and .gz files to destination folder. 
""" - zip_files = [ - zip_file if zip_file.endswith(".zip") else zip_file + ".zip" for zip_file in zip_files - ] os.makedirs(destination_folder, exist_ok=True) - unziped_files = [] - for zip_file in zip_files: - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(destination_folder) - unziped_files.extend( - [ - os.path.join(destination_folder, nome_arquivo) - for nome_arquivo in zip_ref.namelist() - ] - ) + extracted_files = [] + for file in compressed_files: + if file.endswith(".zip"): + with zipfile.ZipFile(file, "r") as zip_ref: + zip_ref.extractall(destination_folder) + extracted_files.extend( + [os.path.join(destination_folder, f) for f in zip_ref.namelist()] + ) + elif file.endswith(".gz"): + output_file = os.path.join(destination_folder, os.path.basename(file)[:-3]) + with gzip.open(file, "rb") as gz_file: + with open(output_file, "wb") as out_file: + shutil.copyfileobj(gz_file, out_file) + extracted_files.append(output_file) - return unziped_files + return extracted_files @task @@ -800,3 +871,30 @@ def convert_columns_type( dfr[col] = dfr[col].astype(new_type) return dfr + + +@task +def rename_files( + files: List[Union[Path, str]], + original_name: str = "data", + preffix: str = None, + rename: str = None, +) -> List[Path]: + """ + Renomeia os arquivos com base em um prefixo ou novo nome. + """ + new_paths = [] + for file_path in files: + file_path = Path(file_path) + print(f"Original file path: {file_path}") + + change_filename = f"{preffix}_{original_name}" if preffix else rename + print(f"Name to replace '{original_name}' with: {change_filename}") + new_filename = file_path.name.replace(original_name, change_filename) + savepath = file_path.with_name(new_filename) + + # Rename file + file_path.rename(savepath) + new_paths.append(savepath) + print(f"Renamed file paths: {new_paths}") + return new_paths From 787aaf26cf16fc9ccb72df9075052c1bbf461a3d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 13:48:47 +0000 Subject: [PATCH 47/56] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pipelines/meteorologia/precipitacao_alertario/flows.py | 10 ++-------- pipelines/meteorologia/radar/mendanha/flows.py | 3 ++- pipelines/tasks.py | 2 +- pipelines/utils/gypscie/tasks.py | 6 +++--- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/pipelines/meteorologia/precipitacao_alertario/flows.py b/pipelines/meteorologia/precipitacao_alertario/flows.py index 3ffe774d..7cd17eb5 100644 --- a/pipelines/meteorologia/precipitacao_alertario/flows.py +++ b/pipelines/meteorologia/precipitacao_alertario/flows.py @@ -149,19 +149,13 @@ ######################### dfr_pluviometric, dfr_meteorological = download_data() - ( - dfr_pluviometric, - empty_data_pluviometric, - ) = treat_pluviometer_and_meteorological_data( + (dfr_pluviometric, empty_data_pluviometric,) = treat_pluviometer_and_meteorological_data( dfr=dfr_pluviometric, dataset_id=DATASET_ID_PLUVIOMETRIC, table_id=TABLE_ID_PLUVIOMETRIC, mode=MATERIALIZATION_MODE, ) - ( - dfr_meteorological, - empty_data_meteorological, - ) = treat_pluviometer_and_meteorological_data( + (dfr_meteorological, empty_data_meteorological,) = treat_pluviometer_and_meteorological_data( dfr=dfr_meteorological, dataset_id=DATASET_ID_METEOROLOGICAL, table_id=TABLE_ID_METEOROLOGICAL, diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py index e36433ff..39051b08 
100644 --- a/pipelines/meteorologia/radar/mendanha/flows.py +++ b/pipelines/meteorologia/radar/mendanha/flows.py @@ -8,7 +8,8 @@ from prefect.run_configs import KubernetesRun # pylint: disable=E0611, E0401 from prefect.storage import GCS # pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.custom import Flow # pylint: disable=E0611, E0401 - # pylint: disable=E0611, E0401 + +# pylint: disable=E0611, E0401 from prefeitura_rio.pipelines_utils.state_handlers import handler_inject_bd_credentials from prefeitura_rio.pipelines_utils.tasks import ( # pylint: disable=E0611, E0401 create_table_and_upload_to_gcs, diff --git a/pipelines/tasks.py b/pipelines/tasks.py index 801c43bc..cdaaad2a 100644 --- a/pipelines/tasks.py +++ b/pipelines/tasks.py @@ -6,7 +6,7 @@ import json from pathlib import Path -from typing import List, Union, Tuple +from typing import List, Tuple, Union import pandas as pd import pendulum # pylint: disable=E0611, E0401 diff --git a/pipelines/utils/gypscie/tasks.py b/pipelines/utils/gypscie/tasks.py index f08c0724..3b6d6b97 100644 --- a/pipelines/utils/gypscie/tasks.py +++ b/pipelines/utils/gypscie/tasks.py @@ -778,9 +778,9 @@ def get_dataset_info(station_type: str, source: str) -> Dict: } if source == "alertario": dataset_info["table_id"] = "meteorologia_alertario" - dataset_info["destination_table_id"] = ( - "preprocessamento_estacao_meteorologica_alertario" - ) + dataset_info[ + "destination_table_id" + ] = "preprocessamento_estacao_meteorologica_alertario" elif source == "inmet": dataset_info["table_id"] = "meteorologia_inmet" dataset_info["destination_table_id"] = "preprocessamento_estacao_meteorologica_inmet" From 3c7329160f2e91fae802834d9306c154f14510cf Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 4 Nov 2024 11:42:37 -0300 Subject: [PATCH 48/56] trying to fix fail on flow registration --- pipelines/meteorologia/radar/__init__.py | 2 ++ pipelines/meteorologia/radar/mendanha/__init__.py | 7 +++++++ pipelines/utils/dump_db/__init__.py | 2 ++ 3 files changed, 11 insertions(+) diff --git a/pipelines/meteorologia/radar/__init__.py b/pipelines/meteorologia/radar/__init__.py index e69de29b..6917c4e7 100644 --- a/pipelines/meteorologia/radar/__init__.py +++ b/pipelines/meteorologia/radar/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +from pipelines.meteorologia.radar.mendanha.flows import * # noqa \ No newline at end of file diff --git a/pipelines/meteorologia/radar/mendanha/__init__.py b/pipelines/meteorologia/radar/mendanha/__init__.py index e69de29b..faf414fd 100644 --- a/pipelines/meteorologia/radar/mendanha/__init__.py +++ b/pipelines/meteorologia/radar/mendanha/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- +""" +Prefect flows for mendanha project +""" +############################################################################### +# Automatically managed, please do not touch +############################################################################### diff --git a/pipelines/utils/dump_db/__init__.py b/pipelines/utils/dump_db/__init__.py index e69de29b..ebb2e059 100644 --- a/pipelines/utils/dump_db/__init__.py +++ b/pipelines/utils/dump_db/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +"""" Init file for dump_dbt module """ From ef7f92c071427e9d0901c042df54354b0c66b0db Mon Sep 17 00:00:00 2001 From: patriciacatandi Date: Mon, 4 Nov 2024 11:55:43 -0300 Subject: [PATCH 49/56] testing init --- pipelines/meteorologia/radar/__init__.py | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 pipelines/meteorologia/radar/__init__.py 
diff --git a/pipelines/meteorologia/radar/__init__.py b/pipelines/meteorologia/radar/__init__.py
deleted file mode 100644
index 6917c4e7..00000000
--- a/pipelines/meteorologia/radar/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-# -*- coding: utf-8 -*-
-from pipelines.meteorologia.radar.mendanha.flows import * # noqa
\ No newline at end of file

From cf69f936c6511a5a1f612e641dc60c05ce916a92 Mon Sep 17 00:00:00 2001
From: patriciacatandi
Date: Mon, 4 Nov 2024 12:08:11 -0300
Subject: [PATCH 50/56] chore: force register

---
 .github/workflows/cd.yaml                | 2 +-
 pipelines/meteorologia/radar/__init__.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 pipelines/meteorologia/radar/__init__.py

diff --git a/.github/workflows/cd.yaml b/.github/workflows/cd.yaml
index 53d4b5fb..68a0a29b 100644
--- a/.github/workflows/cd.yaml
+++ b/.github/workflows/cd.yaml
@@ -71,4 +71,4 @@ jobs:
 
       - name: Register Prefect flows
        run: |-
-          python .github/workflows/scripts/register_flows.py --project $PREFECT__SERVER__PROJECT --path pipelines/ --schedule --filter-affected-flows
+          python .github/workflows/scripts/register_flows.py --project $PREFECT__SERVER__PROJECT --path pipelines/ --schedule --no-filter-affected-flows
diff --git a/pipelines/meteorologia/radar/__init__.py b/pipelines/meteorologia/radar/__init__.py
new file mode 100644
index 00000000..e69de29b

From 8be7b2ff853c2101e9b7792cf25fff8362a6d508 Mon Sep 17 00:00:00 2001
From: patriciacatandi
Date: Mon, 4 Nov 2024 12:31:44 -0300
Subject: [PATCH 51/56] adding utils on init

---
 pipelines/meteorologia/radar/__init__.py | 1 +
 pipelines/utils/__init__.py              | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/pipelines/meteorologia/radar/__init__.py b/pipelines/meteorologia/radar/__init__.py
index e69de29b..be4d8fae 100644
--- a/pipelines/meteorologia/radar/__init__.py
+++ b/pipelines/meteorologia/radar/__init__.py
@@ -0,0 +1 @@
+from pipelines.meteorologia.radar.mendanha.flows import * # noqa
diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py
index 3aeae37e..842b94e8 100644
--- a/pipelines/utils/__init__.py
+++ b/pipelines/utils/__init__.py
@@ -4,3 +4,5 @@
 """
 
 from pipelines.utils.execute_dbt_model.flows import *
+from pipelines.utils.dump_db.flows import *
+from pipelines.utils.dump_to_gcs.flows import *

From 49b39f36a59a45313c37e3268a1cf3009142bbb6 Mon Sep 17 00:00:00 2001
From: patriciacatandi
Date: Mon, 4 Nov 2024 12:41:48 -0300
Subject: [PATCH 52/56] removing parallelism from flow

---
 pipelines/meteorologia/radar/mendanha/flows.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py
index 39051b08..10f4d31f 100644
--- a/pipelines/meteorologia/radar/mendanha/flows.py
+++ b/pipelines/meteorologia/radar/mendanha/flows.py
@@ -87,8 +87,6 @@
 with Flow(
     name="COR: Meteorologia - Mapa de Refletividade Radar do Mendanha",
     state_handlers=[handler_inject_bd_credentials],
-    skip_if_running=False,
-    parallelism=100,
     # skip_if_running=True,
 ) as cor_meteorologia_refletividade_radar_men_flow:
 

From 11356e601cfd6c541126d3f470d3676287aae385 Mon Sep 17 00:00:00 2001
From: patriciacatandi
Date: Mon, 4 Nov 2024 13:04:08 -0300
Subject: [PATCH 53/56] changing flow run config

---
 pipelines/meteorologia/radar/mendanha/flows.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py
index 10f4d31f..d40a8453 100644
--- a/pipelines/meteorologia/radar/mendanha/flows.py
+++ b/pipelines/meteorologia/radar/mendanha/flows.py
@@ -366,10 +366,6 @@
 cor_meteorologia_refletividade_radar_men_flow.run_config = KubernetesRun(
     image=constants.DOCKER_IMAGE.value,
     labels=[constants.RJ_COR_AGENT_LABEL.value],
-    cpu_request=1,
-    cpu_limit=1,
-    memory_request="2Gi",
-    memory_limit="3Gi",
 )
 cor_meteorologia_refletividade_radar_men_flow.schedule = TIME_SCHEDULE
 

From 6508243769a1a32f38e5908ccdf779ef41c029b8 Mon Sep 17 00:00:00 2001
From: patriciacatandi
Date: Mon, 4 Nov 2024 13:57:25 -0300
Subject: [PATCH 54/56] rolling back modification on init

---
 pipelines/meteorologia/radar/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pipelines/meteorologia/radar/__init__.py b/pipelines/meteorologia/radar/__init__.py
index be4d8fae..e69de29b 100644
--- a/pipelines/meteorologia/radar/__init__.py
+++ b/pipelines/meteorologia/radar/__init__.py
@@ -1 +0,0 @@
-from pipelines.meteorologia.radar.mendanha.flows import * # noqa

From e80364a27172c5981f4cddcff23cab4894c9ba1c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:57:45 +0000
Subject: [PATCH 55/56] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 pipelines/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelines/utils/__init__.py b/pipelines/utils/__init__.py
index 842b94e8..7ca8bff5 100644
--- a/pipelines/utils/__init__.py
+++ b/pipelines/utils/__init__.py
@@ -3,6 +3,6 @@
 Helper flows that could fit any pipeline.
 """
 
-from pipelines.utils.execute_dbt_model.flows import *
 from pipelines.utils.dump_db.flows import *
 from pipelines.utils.dump_to_gcs.flows import *
+from pipelines.utils.execute_dbt_model.flows import *

From a3cea13cdb0d6656313beeb23878d74c075e509f Mon Sep 17 00:00:00 2001
From: Gabriel Gazola Milan
Date: Mon, 4 Nov 2024 15:20:10 -0300
Subject: [PATCH 56/56] fix: variable name

---
 pipelines/meteorologia/radar/mendanha/flows.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pipelines/meteorologia/radar/mendanha/flows.py b/pipelines/meteorologia/radar/mendanha/flows.py
index d40a8453..74946a4d 100644
--- a/pipelines/meteorologia/radar/mendanha/flows.py
+++ b/pipelines/meteorologia/radar/mendanha/flows.py
@@ -165,12 +165,12 @@
         destination_path="temp/",
     )
     uncompressed_files = unzip_files(radar_files)
-    radar = task_open_radar_file(uncompressed_files[0])
-    grid_shape, grid_limits = get_radar_parameters(radar)
-    radar_2d = remap_data(radar, RADAR_PRODUCT_LIST, grid_shape, grid_limits)
+    radar_file = task_open_radar_file(uncompressed_files[0])
+    grid_shape, grid_limits = get_radar_parameters(radar_file)
+    radar_2d = remap_data(radar_file, RADAR_PRODUCT_LIST, grid_shape, grid_limits)
 
     # Create visualizations
-    formatted_time, filename_time = get_and_format_time(radar)
+    formatted_time, filename_time = get_and_format_time(radar_file)
     cbar_title = get_colorbar_title(RADAR_PRODUCT_LIST[0])
     fig = create_visualization_no_background(
         radar_2d, radar_product=RADAR_PRODUCT_LIST[0], cbar_title=cbar_title, title=formatted_time