diff --git a/examples/yinglong/README.md b/examples/yinglong/README.md new file mode 100644 index 0000000000..0f8272d7e9 --- /dev/null +++ b/examples/yinglong/README.md @@ -0,0 +1,54 @@ +# Skillful High Resolution Regional Short Term Forecasting with Boundary Smoothing + +YingLong, a high-resolution, short-term regional weather forecasting, artificial-intelligence-based model, which is capable of hourly predicting weather fields including wind speed, temperature, and specific humidity at a 3km resolution. YingLong utilizes a parallel structure of global and local blocks to capture multiscale meteorological features and is trained on analysis dataset. Additionally, the necessary information around the regional boundary is introduced to YingLong through the boundary smoothing strategy, which significantly improves the regional forecasting results. By comparing forecast results with those from WRF-ARW, one of the best numerical prediction models, YingLong demonstrates superior forecasting performances in most cases, especially on surface variables. + +This code is the implementation of YingLong. We select the southeastern region of the United States, which is around the range of 110-130E, 15-35N, with 440 × 408 grid points in Lambert projection. + +## Installation + +### 1. Install PaddlePaddle + +Please install the 2.5.2 version of PaddlePaddle according to your environment on the official website of [PaddlePaddle](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html). + +For example, if your environment is linux and CUDA 11.2, you can install PaddlePaddle by the following command. + +``` shell +python -m pip install paddlepaddle-gpu==2.5.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html +``` + +After installation, run the following command to verify if PaddlePaddle has been successfully installed. + +``` shell +python -c "import paddle; paddle.utils.run_check()" +``` + +If `"PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now."` appears, to verify that the installation was successful. + +### 2. Install PaddleScience + +Clone the code of PaddleScience from [here](https://github.com/PaddlePaddle/PaddleScience.git). + +``` shell +git clone -b develop https://github.com/PaddlePaddle/PaddleScience.git +``` + +## Example Usage + +### 1. Download the data and model weights + +``` shell +cd PaddleScience +wget https://paddle-org.bj.bcebos.com/paddlescience/datasets/yinglong/hrrr_example_24vars.tar +tar -xvf hrrr_example_24vars.tar +wget https://paddle-org.bj.bcebos.com/paddlescience/models/yinglong/yinglong_models.tar +tar -xvf yinglong_models.tar +``` + +### 2. Run the code + +The following code runs the Yinglong model, and the model output will be saved in 'output/result.npy'. + +``` shell +export PYTHONPATH=$PWD +python examples/yinglong/predict_12layers.py +``` diff --git a/examples/yinglong/predict_12layers.py b/examples/yinglong/predict_12layers.py new file mode 100644 index 0000000000..2f5f71a686 --- /dev/null +++ b/examples/yinglong/predict_12layers.py @@ -0,0 +1,189 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from os import path as osp + +import h5py +import numpy as np +import paddle +import paddle.inference as paddle_infer +import pandas as pd +from packaging import version + +from examples.yinglong.timefeatures import time_features +from ppsci.utils import logger + + +class YingLong: + def __init__( + self, model_file: str, params_file: str, mean_path: str, std_path: str + ): + self.model_file = model_file + self.params_file = params_file + + config = paddle_infer.Config(model_file, params_file) + config.switch_ir_optim(False) + config.enable_use_gpu(100, 0) + config.enable_memory_optim() + + self.predictor = paddle_infer.create_predictor(config) + + # get input names and data handles + self.input_names = self.predictor.get_input_names() + self.input_data_handle = self.predictor.get_input_handle(self.input_names[0]) + self.time_stamps_handle = self.predictor.get_input_handle(self.input_names[1]) + self.nwp_data_handle = self.predictor.get_input_handle(self.input_names[2]) + + # get output names and data handles + self.output_names = self.predictor.get_output_names() + self.output_handle = self.predictor.get_output_handle(self.output_names[0]) + + # load mean and std data + self.mean = np.load(mean_path).reshape(-1, 1, 1).astype(np.float32) + self.std = np.load(std_path).reshape(-1, 1, 1).astype(np.float32) + + def _preprocess_data(self, input_data, time_stamps, nwp_data): + # normalize data + input_data = (input_data - self.mean) / self.std + nwp_data = (nwp_data - self.mean) / self.std + + # process time stamps + for i in range(len(time_stamps)): + time_stamps[i] = pd.DataFrame({"date": time_stamps[i]}) + time_stamps[i] = time_features(time_stamps[i], timeenc=1, freq="h").astype( + np.float32 + ) + time_stamps = np.asarray(time_stamps) + return input_data, time_stamps, nwp_data + + def _postprocess_data(self, data): + # denormalize data + data = data * self.std + self.mean + return data + + def __call__(self, input_data, time_stamp, nwp_data): + # preprocess data + input_data, time_stamps, nwp_data = self._preprocess_data( + input_data, time_stamp, nwp_data + ) + + # set input data + self.input_data_handle.copy_from_cpu(input_data) + self.time_stamps_handle.copy_from_cpu(time_stamps) + self.nwp_data_handle.copy_from_cpu(nwp_data) + + # run predictor + self.predictor.run() + + # get output data + output_data = self.output_handle.copy_to_cpu() + + # postprocess data + output_data = self._postprocess_data(output_data) + return output_data + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_file", + type=str, + default="./yinglong_models/yinglong_12.pdmodel", + help="Model filename", + ) + parser.add_argument( + "--params_file", + type=str, + default="./yinglong_models/yinglong_12.pdiparams", + help="Parameter filename", + ) + parser.add_argument( + "--mean_path", + type=str, + default="./hrrr_example_24vars/stat/mean_crop.npy", + help="Mean filename", + ) + parser.add_argument( + "--std_path", + type=str, + default="./hrrr_example_24vars/stat/std_crop.npy", + help="Standard deviation filename", + ) + parser.add_argument( + "--input_file", + type=str, + default="./hrrr_example_24vars/valid/2022/01/01.h5", + help="Input filename", + ) + parser.add_argument( + "--init_time", type=str, default="2022/01/01/00", help="Init time" + ) + parser.add_argument( + "--nwp_file", + type=str, + default="./hrrr_example_24vars/nwp_convert/2022/01/01/00.h5", + help="NWP filename", + ) + parser.add_argument( + "--num_timestamps", type=int, default=22, help="Number of timestamps" + ) + parser.add_argument( + "--output_path", type=str, default="output", help="Output file path" + ) + + return parser.parse_args() + + +def main(): + args = parse_args() + logger.init_logger("ppsci", osp.join(args.output_path, "predict.log"), "info") + if version.Version(paddle.__version__) != version.Version("2.5.2"): + logger.error( + f"Your Paddle version is {paddle.__version__}, but this code currently " + "only supports PaddlePaddle 2.5.2. The latest version of Paddle will be " + "supported as soon as possible." + ) + exit() + + num_timestamps = args.num_timestamps + + # create predictor + predictor = YingLong( + args.model_file, args.params_file, args.mean_path, args.std_path + ) + + # load data + input_file = h5py.File(args.input_file, "r")["fields"] + nwp_file = h5py.File(args.nwp_file, "r")["fields"] + input_data = input_file[0:1] + nwp_data = nwp_file[0:num_timestamps] + + # create time stamps + cur_time = pd.to_datetime(args.init_time, format="%Y/%m/%d/%H") + time_stamps = [[cur_time]] + for _ in range(num_timestamps): + cur_time += pd.Timedelta(hours=1) + time_stamps.append([cur_time]) + + # run predictor + output_data = predictor(input_data, time_stamps, nwp_data) + + save_path = osp.join(args.output_path, "result.npy") + logger.info(f"Save output to {save_path}") + np.save(save_path, output_data) + + +if __name__ == "__main__": + main() diff --git a/examples/yinglong/timefeatures.py b/examples/yinglong/timefeatures.py new file mode 100644 index 0000000000..6f0fef6dae --- /dev/null +++ b/examples/yinglong/timefeatures.py @@ -0,0 +1,177 @@ +# This code is reference from https://github.com/zhouhaoyi/Informer2020 +from typing import List + +import numpy as np +import pandas as pd +from pandas.tseries import offsets +from pandas.tseries.frequencies import to_offset + + +class TimeFeature: + def __init__(self): + pass + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + pass + + def __repr__(self): + return self.__class__.__name__ + "()" + + +class SecondOfMinute(TimeFeature): + """Minute of hour encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return index.second / 59.0 - 0.5 + + +class MinuteOfHour(TimeFeature): + """Minute of hour encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return index.minute / 59.0 - 0.5 + + +class HourOfDay(TimeFeature): + """Hour of day encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return index.hour / 23.0 - 0.5 + + +class DayOfWeek(TimeFeature): + """Hour of day encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return index.dayofweek / 6.0 - 0.5 + + +class DayOfMonth(TimeFeature): + """Day of month encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return (index.day - 1) / 30.0 - 0.5 + + +class DayOfYear(TimeFeature): + """Day of year encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return (index.dayofyear - 1) / 365.0 - 0.5 + + +class MonthOfYear(TimeFeature): + """Month of year encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return (index.month - 1) / 11.0 - 0.5 + + +class WeekOfYear(TimeFeature): + """Week of year encoded as value between [-0.5, 0.5]""" + + def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: + return (index.week - 1) / 52.0 - 0.5 + + +def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]: + """ + Returns a list of time features that will be appropriate for the given frequency string. + Parameters + ---------- + freq_str + Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc. + """ + + features_by_offsets = { + offsets.YearEnd: [], + offsets.QuarterEnd: [MonthOfYear], + offsets.MonthEnd: [MonthOfYear], + offsets.Week: [DayOfMonth, WeekOfYear], + offsets.Day: [DayOfWeek, DayOfMonth, DayOfYear], + offsets.BusinessDay: [DayOfWeek, DayOfMonth, DayOfYear], + offsets.Hour: [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear], + offsets.Minute: [ + MinuteOfHour, + HourOfDay, + DayOfWeek, + DayOfMonth, + DayOfYear, + ], + offsets.Second: [ + SecondOfMinute, + MinuteOfHour, + HourOfDay, + DayOfWeek, + DayOfMonth, + DayOfYear, + ], + } + + offset = to_offset(freq_str) + + for offset_type, feature_classes in features_by_offsets.items(): + if isinstance(offset, offset_type): + return [cls() for cls in feature_classes] + + supported_freq_msg = f""" + Unsupported frequency {freq_str} + The following frequencies are supported: + Y - yearly + alias: A + M - monthly + W - weekly + D - daily + B - business days + H - hourly + T - minutely + alias: min + S - secondly + """ + raise RuntimeError(supported_freq_msg) + + +def time_features(dates, timeenc=1, freq="h"): + """ + > `time_features` takes in a `dates` dataframe with a 'dates' column and extracts the date down to `freq` where freq can be any of the following if `timeenc` is 0: + > * m - [month] + > * w - [month] + > * d - [month, day, weekday] + > * b - [month, day, weekday] + > * h - [month, day, weekday, hour] + > * t - [month, day, weekday, hour, *minute] + > + > If `timeenc` is 1, a similar, but different list of `freq` values are supported (all encoded between [-0.5 and 0.5]): + > * Q - [month] + > * M - [month] + > * W - [Day of month, week of year] + > * D - [Day of week, day of month, day of year] + > * B - [Day of week, day of month, day of year] + > * H - [Hour of day, day of week, day of month, day of year] + > * T - [Minute of hour*, hour of day, day of week, day of month, day of year] + > * S - [Second of minute, minute of hour, hour of day, day of week, day of month, day of year] + + *minute returns a number from 0-3 corresponding to the 15 minute period it falls into. + """ + if timeenc == 0: + dates["month"] = dates.date.apply(lambda row: row.month, 1) + dates["day"] = dates.date.apply(lambda row: row.day, 1) + dates["weekday"] = dates.date.apply(lambda row: row.weekday(), 1) + dates["hour"] = dates.date.apply(lambda row: row.hour, 1) + dates["minute"] = dates.date.apply(lambda row: row.minute, 1) + dates["minute"] = dates.minute.map(lambda x: x // 15) + freq_map = { + "y": [], + "m": ["month"], + "w": ["month"], + "d": ["month", "day", "weekday"], + "b": ["month", "day", "weekday"], + "h": ["month", "day", "weekday", "hour"], + "t": ["month", "day", "weekday", "hour", "minute"], + } + return dates[freq_map[freq.lower()]].values + if timeenc == 1: + dates = pd.to_datetime(dates.date.values) + return np.vstack( + [feat(dates) for feat in time_features_from_frequency_str(freq)] + ).transpose(1, 0)