diff --git a/README.md b/README.md
index 9c7cac65..2edb54d8 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,7 @@ This functionality is implemented with the [Microsoft NNI](https://github.com/mi
 imputation survey repo [Awesome_Imputation](https://github.com/WenjieDu/Awesome_Imputation) to see how to config and
 tune the hyperparameters.
 
-🔥 Note that Transformer, iTransformer, FreTS, Crossformer, PatchTST, DLinear, ETSformer, FiLM, FEDformer, Informer, Autoformer
+🔥 Note that Transformer, iTransformer, FreTS, Crossformer, PatchTST, DLinear, ETSformer, Pyraformer, Nonstationary Transformer, FiLM, FEDformer, Informer, Autoformer
 are not proposed as imputation methods in their original papers, and they cannot accept POTS as input.
 **To make them applicable on POTS data, we apply the embedding strategy and training approach (ORT+MIT)
 the same as we did in [SAITS paper](https://arxiv.org/pdf/2202.08516).**
@@ -110,14 +110,16 @@ The paper references and links are all listed at the bottom of this file.
 | **Type**      | **Algo**                           | **IMPU** | **FORE** | **CLAS** | **CLUS** | **ANOD** | **Year - Venue** |
 |:--------------|:-----------------------------------|:--------:|:--------:|:--------:|:--------:|:--------:|:-----------------|
 | Neural Net    | iTransformer[^24]                  |    ✅     |          |          |          |          | 2024 - ICLR      |
-| Neural Net    | FreTS[^23]                         |    ✅     |          |          |          |          | 2024 - NeurIPS   |
 | Neural Net    | SAITS[^1]                          |    ✅     |          |          |          |          | 2023 - ESWA      |
+| Neural Net    | FreTS[^23]                         |    ✅     |          |          |          |          | 2023 - NeurIPS   |
 | Neural Net    | Crossformer[^16]                   |    ✅     |          |          |          |          | 2023 - ICLR      |
 | Neural Net    | TimesNet[^14]                      |    ✅     |          |          |          |          | 2023 - ICLR      |
 | Neural Net    | PatchTST[^18]                      |    ✅     |          |          |          |          | 2023 - ICLR      |
 | Neural Net    | ETSformer[^19]                     |    ✅     |          |          |          |          | 2023 - ICLR      |
 | Neural Net    | DLinear[^17]                       |    ✅     |          |          |          |          | 2023 - AAAI      |
+| Neural Net    | Nonstationary <br>Transformer[^25] |    ✅     |          |          |          |          | 2022 - NeurIPS   |
 | Neural Net    | FiLM[^22]                          |    ✅     |          |          |          |          | 2022 - NeurIPS   |
+| Neural Net    | Pyraformer[^26]                    |    ✅     |          |          |          |          | 2022 - ICLR      |
 | Neural Net    | Raindrop[^5]                       |          |          |    ✅     |          |          | 2022 - ICLR      |
 | Neural Net    | FEDformer[^20]                     |    ✅     |          |          |          |          | 2022 - ICML      |
 | Neural Net    | Autoformer[^15]                    |    ✅     |          |          |          |          | 2021 - NeurIPS   |
@@ -350,8 +352,10 @@ PyPOTS community is open, transparent, and surely friendly. Let's work together
 [^20]: Zhou, T., Ma, Z., Wen, Q., Wang, X., Sun, L., & Jin, R. (2022). [FEDformer: Frequency enhanced decomposed transformer for long-term series forecasting](https://proceedings.mlr.press/v162/zhou22g.html). *ICML 2022*.
 [^21]: Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021). [Informer: Beyond efficient transformer for long sequence time-series forecasting](https://ojs.aaai.org/index.php/AAAI/article/view/17325). *AAAI 2021*.
 [^22]: Zhou, T., Ma, Z., Wen, Q., Sun, L., Yao, T., Yin, W., & Jin, R. (2022). [FiLM: Frequency improved Legendre Memory Model for Long-term Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2022/hash/524ef58c2bd075775861234266e5e020-Abstract-Conference.html). *NeurIPS 2022*.
-[^23]: Yi, K., Zhang, Q., Fan, W., Wang, S., Wang, P., He, H., An, N., Lian, D., Cao, L., & Niu, Z. (2024). [Frequency-domain MLPs are More Effective Learners in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f1d16af76939f476b5f040fd1398c0a3-Abstract-Conference.html). *NeurIPS 2024*.
+[^23]: Yi, K., Zhang, Q., Fan, W., Wang, S., Wang, P., He, H., An, N., Lian, D., Cao, L., & Niu, Z. (2023). [Frequency-domain MLPs are More Effective Learners in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f1d16af76939f476b5f040fd1398c0a3-Abstract-Conference.html). *NeurIPS 2023*.
 [^24]: Liu, Y., Hu, T., Zhang, H., Wu, H., Wang, S., Ma, L., & Long, M. (2024). [iTransformer: Inverted Transformers Are Effective for Time Series Forecasting](https://openreview.net/forum?id=JePfAI8fah). *ICLR 2024*.
+[^25]: Liu, Y., Wu, H., Wang, J., & Long, M. (2022). [Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2022/hash/4054556fcaa934b0bf76da52cf4f92cb-Abstract-Conference.html). *NeurIPS 2022*.
+[^26]: Liu, S., Yu, H., Liao, C., Li, J., Lin, W., Liu, A. X., & Dustdar, S. (2022). [Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting](https://openreview.net/forum?id=0EXmFzUn5I). *ICLR 2022*.
 
 
 
diff --git a/README_zh.md b/README_zh.md
index b8a5845a..6724fd4a 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -89,38 +89,41 @@ PyPOTS当前支持多变量POTS数据的插补，预测，分类，聚类以及
 🌟 自**v0.2**版本开始, PyPOTS中所有神经网络模型都支持超参数调优。该功能基于[微软的NNI](https://github.com/microsoft/nni)框架实现。
 你可以通过参考我们的时间序列插补综述项目的代码[Awesome_Imputation](https://github.com/WenjieDu/Awesome_Imputation)来了解如何使用PyPOTS调优模型的超参。
 
-🔥 请注意：Transformer, iTransformer, FreTS, Crossformer, PatchTST, DLinear, ETSformer, FiLM, FEDformer, Informer, Autoformer模型
-在它们的原始论文中并未用作插补方法，因此这些模型的输入中不能带有缺失值， 所以无法接受POTS数据作为输入。
+🔥 请注意: Transformer, iTransformer, FreTS, Crossformer, PatchTST, DLinear, ETSformer, Pyraformer, Nonstationary Transformer, FiLM, FEDformer, Informer, Autoformer
+模型在它们的原始论文中并未用作插补方法，因此这些模型的输入中不能带有缺失值，所以无法接受POTS数据作为输入。
 **为了使上述模型能够适用于POTS数据，我们采用了与[SAITS论文](https://arxiv.org/pdf/2202.08516)中相同的embedding策略和训练方法（ORT+MIT）对它们进行改进**。
 
-| **类型**      | **算法**                           | **插补**   | **预测** |  **分类** | **聚类** | **异常检测** | **年份 - 刊物** |
-|:--------------|:-----------------------------------|:--------:|:--------:|:---------:|:--------:|:------------:|:-----------------|
-| Neural Net    | iTransformer[^24]                  |    ✅     |        |         |        |          | 2024 - ICLR      |
-| Neural Net    | FreTS[^23]                         |    ✅     |        |         |        |          | 2024 - NeurIPS   |
-| Neural Net    | SAITS[^1]                          |    ✅     |        |         |        |          | 2023 - ESWA      |
-| Neural Net    | Crossformer[^16]                   |    ✅     |        |         |        |          | 2023 - ICLR      |
-| Neural Net    | TimesNet[^14]                      |    ✅     |        |         |        |          | 2023 - ICLR      |
-| Neural Net    | PatchTST[^18]                      |    ✅     |        |         |        |          | 2023 - ICLR      |
-| Neural Net    | ETSformer[^19]                     |    ✅     |        |         |        |          | 2023 - ICLR      |
-| Neural Net    | DLinear[^17]                       |    ✅     |        |         |        |          | 2023 - AAAI      |
-| Neural Net    | FiLM[^22]                          |    ✅     |        |         |        |          | 2022 - NeurIPS   |
-| Neural Net    | Raindrop[^5]                       |          |        |    ✅    |        |          | 2022 - ICLR      |
-| Neural Net    | FEDformer[^20]                     |    ✅     |        |         |        |          | 2022 - ICML      |
-| Neural Net    | Autoformer[^15]                    |    ✅     |        |         |        |          | 2021 - NeurIPS   |
-| Neural Net    | CSDI[^12]                          |    ✅     |   ✅    |         |        |          | 2021 - NeurIPS   |
-| Neural Net    | Informer[^21]                      |    ✅     |        |         |        |          | 2021 - AAAI      |
-| Neural Net    | US-GAN[^10]                        |    ✅     |        |         |        |          | 2021 - AAAI      |
-| Neural Net    | CRLI[^6]                           |          |        |         |   ✅    |          | 2021 - AAAI      |
-| Probabilistic | BTTF[^8]                           |          |   ✅    |         |        |          | 2021 - TPAMI     |
-| Neural Net    | GP-VAE[^11]                        |    ✅     |        |         |        |          | 2020 - AISTATS   |
-| Neural Net    | VaDER[^7]                          |          |        |         |   ✅    |          | 2019 - GigaSci.  |
-| Neural Net    | M-RNN[^9]                          |    ✅     |        |         |        |          | 2019 - TBME      |
-| Neural Net    | BRITS[^3]                          |    ✅     |        |    ✅    |        |          | 2018 - NeurIPS   |
-| Neural Net    | GRU-D[^4]                          |    ✅     |        |    ✅    |        |          | 2018 - Sci. Rep. |
-| Neural Net    | Transformer[^2]                    |    ✅     |        |         |        |          | 2017 - NeurIPS   |
-| Naive         | LOCF/NOCB                          |    ✅     |        |         |        |          |                  |
-| Naive         | Mean                               |    ✅     |        |         |        |          |                  |
-| Naive         | Median                             |    ✅     |        |         |        |          |                  |
+| **类型**      | **算法**                           | **插补** | **预测** | **分类** | **聚类** | **异常检测** | **年份 - 刊物**      |
+|:--------------|:-----------------------------------|:------:|:------:|:------:|:------:|:--------:|:-----------------|
+| Neural Net    | iTransformer[^24]                  |   ✅    |        |        |        |          | 2024 - ICLR      |
+| Neural Net    | SAITS[^1]                          |   ✅    |        |        |        |          | 2023 - ESWA      |
+| Neural Net    | FreTS[^23]                         |   ✅    |        |        |        |          | 2023 - NeurIPS   |
+| Neural Net    | Koopa[^27]                         |   ✅    |        |        |        |          | 2023 - NeurIPS   |
+| Neural Net    | Crossformer[^16]                   |   ✅    |        |        |        |          | 2023 - ICLR      |
+| Neural Net    | TimesNet[^14]                      |   ✅    |        |        |        |          | 2023 - ICLR      |
+| Neural Net    | PatchTST[^18]                      |   ✅    |        |        |        |          | 2023 - ICLR      |
+| Neural Net    | ETSformer[^19]                     |   ✅    |        |        |        |          | 2023 - ICLR      |
+| Neural Net    | DLinear[^17]                       |   ✅    |        |        |        |          | 2023 - AAAI      |
+| Neural Net    | Nonstationary <br>Transformer[^25] |   ✅    |        |        |        |          | 2022 - NeurIPS   |
+| Neural Net    | FiLM[^22]                          |   ✅    |        |        |        |          | 2022 - NeurIPS   |
+| Neural Net    | Pyraformer[^26]                    |   ✅    |        |        |        |          | 2022 - ICLR      |
+| Neural Net    | Raindrop[^5]                       |        |        |   ✅    |        |          | 2022 - ICLR      |
+| Neural Net    | FEDformer[^20]                     |   ✅    |        |        |        |          | 2022 - ICML      |
+| Neural Net    | Autoformer[^15]                    |   ✅    |        |        |        |          | 2021 - NeurIPS   |
+| Neural Net    | CSDI[^12]                          |   ✅    |   ✅    |        |        |          | 2021 - NeurIPS   |
+| Neural Net    | Informer[^21]                      |   ✅    |        |        |        |          | 2021 - AAAI      |
+| Neural Net    | US-GAN[^10]                        |   ✅    |        |        |        |          | 2021 - AAAI      |
+| Neural Net    | CRLI[^6]                           |        |        |        |   ✅    |          | 2021 - AAAI      |
+| Probabilistic | BTTF[^8]                           |        |   ✅    |        |        |          | 2021 - TPAMI     |
+| Neural Net    | GP-VAE[^11]                        |   ✅    |        |        |        |          | 2020 - AISTATS   |
+| Neural Net    | VaDER[^7]                          |        |        |        |   ✅    |          | 2019 - GigaSci.  |
+| Neural Net    | M-RNN[^9]                          |   ✅    |        |        |        |          | 2019 - TBME      |
+| Neural Net    | BRITS[^3]                          |   ✅    |        |   ✅    |        |          | 2018 - NeurIPS   |
+| Neural Net    | GRU-D[^4]                          |   ✅    |        |   ✅    |        |          | 2018 - Sci. Rep. |
+| Neural Net    | Transformer[^2]                    |   ✅    |        |        |        |          | 2017 - NeurIPS   |
+| Naive         | LOCF/NOCB                          |   ✅    |        |        |        |          |                  |
+| Naive         | Mean                               |   ✅    |        |        |        |          |                  |
+| Naive         | Median                             |   ✅    |        |        |        |          |                  |
 
 
 ## ❖ PyPOTS生态系统
@@ -325,8 +328,10 @@ PyPOTS社区是一个开放、透明、友好的社区，让我们共同努力
 [^20]: Zhou, T., Ma, Z., Wen, Q., Wang, X., Sun, L., & Jin, R. (2022). [FEDformer: Frequency enhanced decomposed transformer for long-term series forecasting](https://proceedings.mlr.press/v162/zhou22g.html). *ICML 2022*.
 [^21]: Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021). [Informer: Beyond efficient transformer for long sequence time-series forecasting](https://ojs.aaai.org/index.php/AAAI/article/view/17325). *AAAI 2021*.
 [^22]: Zhou, T., Ma, Z., Wen, Q., Sun, L., Yao, T., Yin, W., & Jin, R. (2022). [FiLM: Frequency improved Legendre Memory Model for Long-term Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2022/hash/524ef58c2bd075775861234266e5e020-Abstract-Conference.html). *NeurIPS 2022*.
-[^23]: Yi, K., Zhang, Q., Fan, W., Wang, S., Wang, P., He, H., An, N., Lian, D., Cao, L., & Niu, Z. (2024). [Frequency-domain MLPs are More Effective Learners in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f1d16af76939f476b5f040fd1398c0a3-Abstract-Conference.html). *NeurIPS 2024*.
+[^23]: Yi, K., Zhang, Q., Fan, W., Wang, S., Wang, P., He, H., An, N., Lian, D., Cao, L., & Niu, Z. (2023). [Frequency-domain MLPs are More Effective Learners in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2023/hash/f1d16af76939f476b5f040fd1398c0a3-Abstract-Conference.html). *NeurIPS 2023*.
 [^24]: Liu, Y., Hu, T., Zhang, H., Wu, H., Wang, S., Ma, L., & Long, M. (2024). [iTransformer: Inverted Transformers Are Effective for Time Series Forecasting](https://openreview.net/forum?id=JePfAI8fah). *ICLR 2024*.
+[^25]: Liu, Y., Wu, H., Wang, J., & Long, M. (2022). [Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting](https://proceedings.neurips.cc/paper_files/paper/2022/hash/4054556fcaa934b0bf76da52cf4f92cb-Abstract-Conference.html). *NeurIPS 2022*.
+[^26]: Liu, S., Yu, H., Liao, C., Li, J., Lin, W., Liu, A. X., & Dustdar, S. (2022). [Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting](https://openreview.net/forum?id=0EXmFzUn5I). *ICLR 2022*.
 
 
 <details>
diff --git a/docs/index.rst b/docs/index.rst
index 7f2149ef..d4b23253 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -72,6 +72,10 @@ Welcome to PyPOTS docs!
    :alt: arXiv DOI
    :target: https://arxiv.org/abs/2305.18811
 
+.. image:: https://img.shields.io/badge/README-🇨🇳中文版-FCEFE8
+   :alt: README in Chinese
+   :target: https://github.com/WenjieDu/PyPOTS/blob/main/README_zh.md
+
 .. image:: https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FPyPOTS%2FPyPOTS&count_bg=%23009A0A&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=Visits%20since%20May%202022&edge_flat=false
    :alt: Visiting number
 
@@ -107,68 +111,80 @@ PyPOTS supports imputation, classification, clustering, forecasting, and anomaly
 time series with missing values. The table below shows the availability of each algorithm in PyPOTS for different tasks.
 The symbol ✅ indicates the algorithm is available for the corresponding task (note that models will be continuously updated
 in the future to handle tasks that are not currently supported. Stay tuned❗️).
-The task types are abbreviated as follows: **IMPU**: Imputation; **FORE**: Forecasting;
-**CLAS**: Classification; **CLUS**: Clustering; **ANOD**: Anomaly Detection.
-The paper references are all listed at the bottom of this readme file.
 
 🌟 Since **v0.2**, all neural-network models in PyPOTS has got hyperparameter-optimization support.
 This functionality is implemented with the `Microsoft NNI <https://github.com/microsoft/nni>`_ framework. You may want to refer to our time-series
 imputation survey repo `Awesome_Imputation <https://github.com/WenjieDu/Awesome_Imputation>`_ to see how to config and
 tune the hyperparameters.
 
-🔥 Note that Transformer, Crossformer, PatchTST, DLinear, ETSformer, FEDformer, Informer, Autoformer are not proposed as imputation methods in their original papers,
-and they cannot accept POTS as input. To make them applicable on POTS data, we apply the embedding strategy and training approach (ORT+MIT)
+🔥 Note that Transformer, iTransformer, FreTS, Crossformer, PatchTST, DLinear, ETSformer, Pyraformer, Nonstationary Transformer, FiLM, FEDformer, Informer, Autoformer
+are not proposed as imputation methods in their original papers, and they cannot accept POTS as input.
+To make them applicable on POTS data, we apply the embedding strategy and training approach (ORT+MIT)
 the same as we did in `SAITS paper <https://arxiv.org/pdf/2202.08516)>`_.
 
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Type           | Algorithm                                            | IMPU | FORE | CLAS | CLUS | ANOD | Year |
-+================+======================================================+======+======+======+======+======+======+
-| Neural Net     | SAITS :cite:`du2023SAITS`                            |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | Crossformer :cite:`nie2023patchtst`                  |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | TimesNet :cite:`wu2023timesnet`                      |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | PatchTST :cite:`nie2023patchtst`                     |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | DLinear :cite:`zeng2023dlinear`                      |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | ETSformer :cite:`woo2023etsformer`                   |  ✅  |      |      |      |      | 2023 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | FEDformer :cite:`zhou2022fedformer`                  |  ✅  |      |      |      |      | 2022 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | Raindrop :cite:`zhang2022Raindrop`                   |      |      |  ✅  |      |      | 2022 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | Informer :cite:`zhou2021informer`                    |  ✅  |      |      |      |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | Autoformer :cite:`wu2021autoformer`                  |  ✅  |      |      |      |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | CSDI :cite:`tashiro2021csdi`                         |  ✅  |  ✅  |      |      |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | US-GAN :cite:`miao2021SSGAN`                         |  ✅  |      |      |      |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | CRLI :cite:`ma2021CRLI`                              |      |      |      |  ✅  |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Probabilistic  | BTTF :cite:`chen2021BTMF`                            |      |  ✅  |      |      |      | 2021 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | GP-VAE :cite:`fortuin2020gpvae`                      |  ✅  |      |      |      |      | 2020 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | VaDER :cite:`dejong2019VaDER`                        |      |      |      |  ✅  |      | 2019 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | M-RNN :cite:`yoon2019MRNN`                           |  ✅  |      |      |      |      | 2019 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | BRITS :cite:`cao2018BRITS`                           |  ✅  |      |  ✅  |      |      | 2018 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | GRU-D :cite:`che2018GRUD`                            |      |      |  ✅  |      |      | 2018 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Neural Net     | Transformer :cite:`vaswani2017Transformer`           |  ✅  |      |      |      |      | 2017 |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Naive          | LOCF/NOCB                                            |  ✅  |      |      |      |      |  /   |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Naive          | Median                                               |  ✅  |      |      |      |      |  /   |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
-| Naive          | Mean                                                 |  ✅  |      |      |      |      |  /   |
-+----------------+------------------------------------------------------+------+------+------+------+------+------+
+The task types are abbreviated as follows: **IMPU**: Imputation; **FORE**: Forecasting;
+**CLAS**: Classification; **CLUS**: Clustering; **ANOD**: Anomaly Detection.
+The paper references are all listed at the bottom of this readme file.
+
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Type           | Algorithm                                                 | IMPU | FORE | CLAS | CLUS | ANOD | Year - Venue      |
++================+===========================================================+======+======+======+======+======+===================+
+| Neural Net     | iTransformer :cite:`liu2024itransformer`                  |  ✅  |      |      |      |      | 2024 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | SAITS :cite:`du2023SAITS`                                 |  ✅  |      |      |      |      | 2023 - ESWA       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | FreTS :cite:`yi2023frets`                                 |  ✅  |      |      |      |      | 2023 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Crossformer :cite:`nie2023patchtst`                       |  ✅  |      |      |      |      | 2023 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | TimesNet :cite:`wu2023timesnet`                           |  ✅  |      |      |      |      | 2023 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | PatchTST :cite:`nie2023patchtst`                          |  ✅  |      |      |      |      | 2023 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | ETSformer :cite:`woo2023etsformer`                        |  ✅  |      |      |      |      | 2023 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | DLinear :cite:`zeng2023dlinear`                           |  ✅  |      |      |      |      | 2023 - AAAI       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Nonstationary Trans. :cite:`liu2022nonstationary`         |  ✅  |      |      |      |      | 2022 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | FiLM :cite:`zhou2022film`                                 |  ✅  |      |      |      |      | 2022 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Pyraformer :cite:`liu2022pyraformer`                      |  ✅  |      |      |      |      | 2022 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Raindrop :cite:`zhang2022Raindrop`                        |      |      |  ✅  |      |      | 2022 - ICLR       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | FEDformer :cite:`zhou2022fedformer`                       |  ✅  |      |      |      |      | 2022 - ICML       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Autoformer :cite:`wu2021autoformer`                       |  ✅  |      |      |      |      | 2021 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | CSDI :cite:`tashiro2021csdi`                              |  ✅  |  ✅  |      |      |      | 2021 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Informer :cite:`zhou2021informer`                         |  ✅  |      |      |      |      | 2021 - AAAI       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | US-GAN :cite:`miao2021SSGAN`                              |  ✅  |      |      |      |      | 2021 - AAAI       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | CRLI :cite:`ma2021CRLI`                                   |      |      |      |  ✅  |      | 2021 - AAAI       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Probabilistic  | BTTF :cite:`chen2021BTMF`                                 |      |  ✅  |      |      |      | 2021 - TPAMI      |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | GP-VAE :cite:`fortuin2020gpvae`                           |  ✅  |      |      |      |      | 2020 - AISTATS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | VaDER :cite:`dejong2019VaDER`                             |      |      |      |  ✅  |      | 2019 - GigaSci.   |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | M-RNN :cite:`yoon2019MRNN`                                |  ✅  |      |      |      |      | 2019 - TBME       |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | BRITS :cite:`cao2018BRITS`                                |  ✅  |      |  ✅  |      |      | 2018 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | GRU-D :cite:`che2018GRUD`                                 |  ✅  |      |  ✅  |      |      | 2018 - Sci. Rep.  |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Neural Net     | Transformer :cite:`vaswani2017Transformer`                |  ✅  |      |      |      |      | 2017 - NeurIPS    |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Naive          | LOCF/NOCB                                                 |  ✅  |      |      |      |      |  /                |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Naive          | Median                                                    |  ✅  |      |      |      |      |  /                |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
+| Naive          | Mean                                                      |  ✅  |      |      |      |      |  /                |
++----------------+-----------------------------------------------------------+------+------+------+------+------+-------------------+
 
 
 ❖ PyPOTS Ecosystem
diff --git a/docs/references.bib b/docs/references.bib
index f45fe429..2c07158a 100644
--- a/docs/references.bib
+++ b/docs/references.bib
@@ -23,7 +23,6 @@ @ARTICLE{yoon2019MRNN
 doi={10.1109/TBME.2018.2874712}
 }
 
-
 @article{che2018GRUD,
 title = {Recurrent {{Neural Networks}} for {{Multivariate Time Series}} with {{Missing Values}}},
 author = {Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},
@@ -395,35 +394,35 @@ @inproceedings{reddi2018OnTheConvergence
 }
 
 @article{hubert1985,
-  title={Comparing partitions},
-  author={Hubert, Lawrence and Arabie, Phipps},
-  journal={Journal of classification},
-  volume={2},
-  pages={193--218},
-  year={1985},
-  publisher={Springer}
+title={Comparing partitions},
+author={Hubert, Lawrence and Arabie, Phipps},
+journal={Journal of classification},
+volume={2},
+pages={193--218},
+year={1985},
+publisher={Springer}
 }
 
 @article{steinley2004,
-  title={Properties of the hubert-arable adjusted rand index},
-  author={Steinley, Douglas},
-  journal={Psychological methods},
-  volume={9},
-  number={3},
-  pages={386},
-  year={2004},
-  publisher={American Psychological Association}
+title={Properties of the {H}ubert-{A}rabie adjusted {R}and index},
+author={Steinley, Douglas},
+journal={Psychological methods},
+volume={9},
+number={3},
+pages={386},
+year={2004},
+publisher={American Psychological Association}
 }
 
 @article{calinski1974,
-  title={A dendrite method for cluster analysis},
-  author={Cali{\'n}ski, Tadeusz and Harabasz, Jerzy},
-  journal={Communications in Statistics-theory and Methods},
-  volume={3},
-  number={1},
-  pages={1--27},
-  year={1974},
-  publisher={Taylor \& Francis}
+title={A dendrite method for cluster analysis},
+author={Cali{\'n}ski, Tadeusz and Harabasz, Jerzy},
+journal={Communications in Statistics-theory and Methods},
+volume={3},
+number={1},
+pages={1--27},
+year={1974},
+publisher={Taylor \& Francis}
 }
 
 @inproceedings{tashiro2021csdi,
@@ -464,13 +463,6 @@ @inproceedings{wu2023timesnet
 url={https://openreview.net/forum?id=ju_Uqw384Oq}
 }
 
-@inproceedings{liu2022nonstationary,
-title={Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting},
-author={Liu, Yong and Wu, Haixu and Wang, Jianmin and Long, Mingsheng},
-booktitle={Advances in Neural Information Processing Systems},
-year={2022}
-}
-
 @inproceedings{wu2021autoformer,
 author = {Wu, Haixu and Xu, Jiehui and Wang, Jianmin and Long, Mingsheng},
 booktitle = {Advances in Neural Information Processing Systems},
@@ -574,4 +566,53 @@ @inproceedings{liu2024itransformer
 booktitle={The Twelfth International Conference on Learning Representations},
 year={2024},
 url={https://openreview.net/forum?id=JePfAI8fah}
-}
\ No newline at end of file
+}
+
+@inproceedings{liu2022nonstationary,
+author = {Liu, Yong and Wu, Haixu and Wang, Jianmin and Long, Mingsheng},
+booktitle = {Advances in Neural Information Processing Systems},
+editor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},
+pages = {9881--9893},
+publisher = {Curran Associates, Inc.},
+title = {Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting},
+url = {https://proceedings.neurips.cc/paper_files/paper/2022/file/4054556fcaa934b0bf76da52cf4f92cb-Paper-Conference.pdf},
+volume = {35},
+year = {2022}
+}
+
+@inproceedings{liu2023koopa,
+author = {Liu, Yong and Li, Chenyu and Wang, Jianmin and Long, Mingsheng},
+booktitle = {Advances in Neural Information Processing Systems},
+editor = {A. Oh and T. Neumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
+pages = {12271--12290},
+publisher = {Curran Associates, Inc.},
+title = {Koopa: Learning Non-stationary Time Series Dynamics with Koopman Predictors},
+url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/28b3dc0970fa4624a63278a4268de997-Paper-Conference.pdf},
+volume = {36},
+year = {2023}
+}
+
+@inproceedings{liu2022pyraformer,
+title={Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting},
+author={Shizhan Liu and Hang Yu and Cong Liao and Jianguo Li and Weiyao Lin and Alex X. Liu and Schahram Dustdar},
+booktitle={International Conference on Learning Representations},
+year={2022},
+url={https://openreview.net/forum?id=0EXmFzUn5I}
+}
+
+@inproceedings{kitaev2020reformer,
+title={Reformer: The Efficient Transformer},
+author={Nikita Kitaev and Lukasz Kaiser and Anselm Levskaya},
+booktitle={International Conference on Learning Representations},
+year={2020},
+url={https://openreview.net/forum?id=rkgNKkHtvB}
+}
+
+@article{das2023tide,
+title={Long-term Forecasting with Ti{DE}: Time-series Dense Encoder},
+author={Abhimanyu Das and Weihao Kong and Andrew Leach and Shaan K Mathur and Rajat Sen and Rose Yu},
+journal={Transactions on Machine Learning Research},
+issn={2835-8856},
+year={2023},
+url={https://openreview.net/forum?id=pCbC3aQB5W},
+}
diff --git a/pypots/imputation/__init__.py b/pypots/imputation/__init__.py
index 07d93a45..b6cc4c3f 100644
--- a/pypots/imputation/__init__.py
+++ b/pypots/imputation/__init__.py
@@ -13,6 +13,8 @@
 from .saits import SAITS
 from .transformer import Transformer
 from .itransformer import iTransformer
+from .nonstationary_transformer import NonstationaryTransformer
+from .pyraformer import Pyraformer
 from .timesnet import TimesNet
 from .etsformer import ETSformer
 from .fedformer import FEDformer
@@ -45,6 +47,8 @@
     "DLinear",
     "Informer",
     "Autoformer",
+    "NonstationaryTransformer",
+    "Pyraformer",
     "BRITS",
     "MRNN",
     "GPVAE",
diff --git a/pypots/imputation/nonstationary_transformer/__init__.py b/pypots/imputation/nonstationary_transformer/__init__.py
new file mode 100644
index 00000000..e3b18f8c
--- /dev/null
+++ b/pypots/imputation/nonstationary_transformer/__init__.py
@@ -0,0 +1,24 @@
+"""
+The package of the partially-observed time-series imputation model Nonstationary-Transformer.
+
+Refer to the paper
+`Yong Liu, Haixu Wu, Jianmin Wang, Mingsheng Long.
+Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting.
+Advances in Neural Information Processing Systems 35 (2022): 9881-9893.
+<https://proceedings.neurips.cc/paper_files/paper/2022/file/4054556fcaa934b0bf76da52cf4f92cb-Paper-Conference.pdf>`_
+
+Notes
+-----
+This implementation is inspired by the official one https://github.com/thuml/Nonstationary_Transformers
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from .model import NonstationaryTransformer
+
+__all__ = [
+    "NonstationaryTransformer",
+]
diff --git a/pypots/imputation/nonstationary_transformer/core.py b/pypots/imputation/nonstationary_transformer/core.py
new file mode 100644
index 00000000..9ca21e1d
--- /dev/null
+++ b/pypots/imputation/nonstationary_transformer/core.py
@@ -0,0 +1,111 @@
+"""
+The core wrapper assembles the submodules of NonstationaryTransformer imputation model
+and takes over the forward process of the algorithm.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+import torch.nn as nn
+
+from ...nn.modules.nonstationary_transformer import (
+    NonstationaryTransformerEncoder,
+    Projector,
+)
+from ...nn.modules.saits import SaitsLoss, SaitsEmbedding
+from ...nn.functional.normalization import nonstationary_norm, nonstationary_denorm
+
+
+class _NonstationaryTransformer(nn.Module):
+    """Core network behind the NonstationaryTransformer imputer.
+
+    It chains a SAITS embedding, a NonstationaryTransformerEncoder steered by two MLP projectors (``tau`` and
+    ``delta``), a linear output projection back to ``n_features``, and the SAITS loss weighted by
+    ``ORT_weight``/``MIT_weight``. ``d_projector_hidden`` is the list of hidden sizes given to both projectors.
+    """
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_ffn: int,
+        d_projector_hidden: list,
+        n_projector_hidden_layers: int,
+        dropout: float,
+        attn_dropout: float,
+        ORT_weight: float = 1,
+        MIT_weight: float = 1,
+    ):
+        super().__init__()
+
+        d_k = d_v = d_model // n_heads
+        self.n_steps = n_steps
+
+        self.saits_embedding = SaitsEmbedding(
+            n_features * 2, d_model, with_pos=False, dropout=dropout
+        )
+        self.encoder = NonstationaryTransformerEncoder(
+            n_layers, d_model, n_heads, d_k, d_v, d_ffn, dropout, attn_dropout
+        )
+        self.tau_learner = Projector(
+            d_in=n_features,
+            n_steps=n_steps,
+            d_hidden=d_projector_hidden,
+            n_hidden_layers=n_projector_hidden_layers,
+            d_output=1,
+        )
+        self.delta_learner = Projector(
+            d_in=n_features,
+            n_steps=n_steps,
+            d_hidden=d_projector_hidden,
+            n_hidden_layers=n_projector_hidden_layers,
+            d_output=n_steps,
+        )
+
+        # for the imputation task, the output dim is the same as input dim
+        self.output_projection = nn.Linear(d_model, n_features)
+        self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight)
+
+    def forward(self, inputs: dict, training: bool = True) -> dict:
+        """Impute ``inputs["X"]``; with ``training=True`` the result also holds ORT_loss, MIT_loss and loss."""
+        X, missing_mask = inputs["X"], inputs["missing_mask"]
+        X_enc, means, stdev = nonstationary_norm(X, missing_mask)
+
+        tau = self.tau_learner(X, stdev).exp()
+        delta = self.delta_learner(X, means)
+
+        # WDU: the original Nonstationary Transformer paper isn't proposed for imputation task. Hence the model doesn't
+        # take the missing mask into account, which means, in the process, the model doesn't know which part of
+        # the input data is missing, and this may hurt the model's imputation performance. Therefore, I apply the
+        # SAITS embedding method to project the concatenation of features and masks into a hidden space, as well as
+        # the output layers to project back from the hidden space to the original space.
+        # Feed the embedding the normalized series X_enc rather than the raw X: the reconstruction is de-normalized
+        # with the same means/stdev below, so the encoder has to work in the normalized space.
+        enc_out = self.saits_embedding(X_enc, missing_mask)
+
+        # NonstationaryTransformer encoder processing
+        enc_out, attns = self.encoder(enc_out, tau=tau, delta=delta)
+        # project back the original data space
+        reconstruction = self.output_projection(enc_out)
+        reconstruction = nonstationary_denorm(reconstruction, means, stdev)
+
+        imputed_data = missing_mask * X + (1 - missing_mask) * reconstruction
+        results = {
+            "imputed_data": imputed_data,
+        }
+
+        # if in training mode, return results with losses
+        if training:
+            X_ori, indicating_mask = inputs["X_ori"], inputs["indicating_mask"]
+            loss, ORT_loss, MIT_loss = self.saits_loss_func(
+                reconstruction, X_ori, missing_mask, indicating_mask
+            )
+            results["ORT_loss"] = ORT_loss
+            results["MIT_loss"] = MIT_loss
+            # `loss` is always the item for backward propagating to update the model
+            results["loss"] = loss
+
+        return results
diff --git a/pypots/imputation/nonstationary_transformer/data.py b/pypots/imputation/nonstationary_transformer/data.py
new file mode 100644
index 00000000..3b703cb5
--- /dev/null
+++ b/pypots/imputation/nonstationary_transformer/data.py
@@ -0,0 +1,24 @@
+"""
+Dataset class for NonstationaryTransformer.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union
+
+from ..saits.data import DatasetForSAITS
+
+
+class DatasetForNonstationaryTransformer(DatasetForSAITS):
+    """Dataset for NonstationaryTransformer: same data strategy as SAITS, because training also needs MIT."""
+
+    def __init__(
+        self,
+        data: Union[dict, str],  # either an in-memory dict or a string
+        return_X_ori: bool,
+        return_y: bool,
+        file_type: str = "hdf5",
+        rate: float = 0.2,  # NOTE(review): presumably the MIT masking rate — confirm in DatasetForSAITS
+    ):
+        super().__init__(data, return_X_ori, return_y, file_type, rate)  # forwarded unchanged to the parent
diff --git a/pypots/imputation/nonstationary_transformer/model.py b/pypots/imputation/nonstationary_transformer/model.py
new file mode 100644
index 00000000..9786ccd7
--- /dev/null
+++ b/pypots/imputation/nonstationary_transformer/model.py
@@ -0,0 +1,333 @@
+"""
+The implementation of Nonstationary-Transformer for the partially-observed time-series imputation task.
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union, Optional
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+from .core import _NonstationaryTransformer
+from .data import DatasetForNonstationaryTransformer
+from ..base import BaseNNImputer
+from ...data.checking import key_in_data_set
+from ...data.dataset import BaseDataset
+from ...optim.adam import Adam
+from ...optim.base import Optimizer
+from ...utils.logging import logger
+
+
+class NonstationaryTransformer(BaseNNImputer):
+    """The PyTorch implementation of the Nonstationary-Transformer model.
+    NonstationaryTransformer is originally proposed by Liu et al. in :cite:`liu2022nonstationary`.
+
+    Parameters
+    ----------
+    n_steps :
+        The number of time steps in the time-series data sample.
+
+    n_features :
+        The number of features in the time-series data sample.
+
+    n_layers :
+        The number of layers in the NonstationaryTransformer model.
+
+    d_model :
+        The dimension of the model.
+
+    n_heads :
+        The number of heads in each layer of NonstationaryTransformer.
+
+    d_ffn :
+        The dimension of the feed-forward network.
+
+    d_projector_hidden :
+        The dimensions of hidden layers in MLP projectors.
+        It should be a list of integers and the length of the list should be equal to n_projector_hidden_layers.
+
+    n_projector_hidden_layers :
+        The number of hidden layers in MLP projectors.
+
+    dropout :
+        The dropout rate for the model.
+
+    ORT_weight :
+        The weight for the ORT loss, the same as SAITS.
+
+    MIT_weight :
+        The weight for the MIT loss, the same as SAITS.
+
+    batch_size :
+        The batch size for training and evaluating the model.
+
+    epochs :
+        The number of epochs for training the model.
+
+    patience :
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    optimizer :
+        The optimizer for model training.
+        If not given, will use a default Adam optimizer.
+
+    num_workers :
+        The number of subprocesses to use for data loading.
+        `0` means data loading will be in the main process, i.e. there won't be subprocesses.
+
+    device :
+        The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them.
+        If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple),
+        then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
+        If given a list of devices, e.g. ['cuda:0', 'cuda:1'], or [torch.device('cuda:0'), torch.device('cuda:1')] , the
+        model will be trained in parallel on the multiple devices (so far only support parallel training on CUDA).
+        Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
+
+    saving_path :
+        The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during
+        training into a tensorboard file). Will not save if not given.
+
+    model_saving_strategy :
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
+        No model will be saved when it is set as None.
+        The "best" strategy will only automatically save the best model after the training finished.
+        The "better" strategy will automatically save the model during training whenever the model performs
+        better than in previous epochs.
+        The "all" strategy will save every model after each epoch training.
+
+    attn_dropout :
+        The dropout rate for the attention mechanism. It is the last argument so positional calls keep working.
+
+    """
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_ffn: int,
+        d_projector_hidden: list,
+        n_projector_hidden_layers: int,
+        dropout: float = 0,
+        ORT_weight: float = 1,
+        MIT_weight: float = 1,
+        batch_size: int = 32,
+        epochs: int = 100,
+        patience: int = None,
+        optimizer: Optional[Optimizer] = Adam(),
+        num_workers: int = 0,
+        device: Optional[Union[str, torch.device, list]] = None,
+        saving_path: str = None,
+        model_saving_strategy: Optional[str] = "best",
+        attn_dropout: float = 0,
+    ):
+        super().__init__(
+            batch_size,
+            epochs,
+            patience,
+            num_workers,
+            device,
+            saving_path,
+            model_saving_strategy,
+        )
+        assert len(d_projector_hidden) == n_projector_hidden_layers, (
+            "The length of d_projector_hidden should be equal to n_projector_hidden_layers, "
+            f"but got {len(d_projector_hidden)} and {n_projector_hidden_layers}."
+        )
+
+        self.n_steps = n_steps
+        self.n_features = n_features
+        # model hyper-parameters
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.d_model = d_model
+        self.d_ffn = d_ffn
+        self.d_projector_hidden = d_projector_hidden
+        self.n_projector_hidden_layers = n_projector_hidden_layers
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.ORT_weight = ORT_weight
+        self.MIT_weight = MIT_weight
+
+        # set up the model; keyword arguments keep ORT_weight/MIT_weight from sliding into attn_dropout/ORT_weight
+        self.model = _NonstationaryTransformer(
+            n_steps=self.n_steps,
+            n_features=self.n_features,
+            n_layers=self.n_layers,
+            d_model=self.d_model,
+            n_heads=self.n_heads,
+            d_ffn=self.d_ffn,
+            d_projector_hidden=self.d_projector_hidden,
+            n_projector_hidden_layers=self.n_projector_hidden_layers,
+            dropout=self.dropout,
+            attn_dropout=self.attn_dropout,
+            ORT_weight=self.ORT_weight,
+            MIT_weight=self.MIT_weight,
+        )
+        self._send_model_to_given_device()
+        self._print_model_size()
+
+        # set up the optimizer
+        self.optimizer = optimizer
+        self.optimizer.init_optimizer(self.model.parameters())
+
+    def _assemble_input_for_training(self, data: list) -> dict:
+        """Pass a training batch through ``_send_data_to_given_device`` and pack it for ``forward``."""
+        (
+            indices,
+            X,
+            missing_mask,
+            X_ori,
+            indicating_mask,
+        ) = self._send_data_to_given_device(data)
+
+        return {
+            "X": X,
+            "missing_mask": missing_mask,
+            "X_ori": X_ori,
+            "indicating_mask": indicating_mask,
+        }
+
+    def _assemble_input_for_validating(self, data: list) -> dict:
+        """Validation batches are packed exactly like training batches."""
+        return self._assemble_input_for_training(data)
+
+    def _assemble_input_for_testing(self, data: list) -> dict:
+        """Pack a test batch, which carries only ``X`` and ``missing_mask``, for ``forward``."""
+        indices, X, missing_mask = self._send_data_to_given_device(data)
+
+        return {
+            "X": X,
+            "missing_mask": missing_mask,
+        }
+
+    def fit(
+        self,
+        train_set: Union[dict, str],
+        val_set: Optional[Union[dict, str]] = None,
+        file_type: str = "hdf5",
+    ) -> None:
+        """Train the model on train_set and, if val_set is given, validate on it (it must contain 'X_ori').
+
+        Each set is a dict or a path string to a file of type file_type. The best weights are loaded afterwards.
+        """
+        # Step 1: wrap the input data with classes Dataset and DataLoader
+        training_set = DatasetForNonstationaryTransformer(
+            train_set, return_X_ori=False, return_y=False, file_type=file_type
+        )
+        training_loader = DataLoader(
+            training_set,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
+        )
+        val_loader = None
+        if val_set is not None:
+            if not key_in_data_set("X_ori", val_set):
+                raise ValueError("val_set must contain 'X_ori' for model validation.")
+            val_set = DatasetForNonstationaryTransformer(
+                val_set, return_X_ori=True, return_y=False, file_type=file_type
+            )
+            val_loader = DataLoader(
+                val_set,
+                batch_size=self.batch_size,
+                shuffle=False,
+                num_workers=self.num_workers,
+            )
+
+        # Step 2: train the model and freeze it
+        self._train_model(training_loader, val_loader)
+        self.model.load_state_dict(self.best_model_dict)
+        self.model.eval()  # set the model as eval status to freeze it.
+
+        # Step 3: save the model if necessary
+        self._auto_save_model_if_necessary(confirm_saving=True)
+
+    def predict(
+        self,
+        test_set: Union[dict, str],
+        file_type: str = "hdf5",
+    ) -> dict:
+        """Make predictions for the input data with the trained model.
+
+        Parameters
+        ----------
+        test_set :
+            The data to impute: a dictionary including the key 'X', or a path string locating a data file supported
+            by PyPOTS (e.g. h5 file). X is array-like of shape [n_samples, sequence length (time steps), n_features].
+
+        file_type :
+            The type of the given file if test_set is a path string.
+
+        Returns
+        -------
+        result_dict :
+            The dictionary whose key 'imputation' holds the imputed data as a numpy array.
+
+        """
+        # Step 1: wrap the input data with classes Dataset and DataLoader
+        self.model.eval()  # set the model as eval status to freeze it.
+        test_set = BaseDataset(
+            test_set,
+            return_X_ori=False,
+            return_X_pred=False,
+            return_y=False,
+            file_type=file_type,
+        )
+        test_loader = DataLoader(
+            test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+        )
+        imputation_collector = []
+
+        # Step 2: process the data with the model
+        with torch.no_grad():
+            for data in test_loader:
+                inputs = self._assemble_input_for_testing(data)
+                results = self.model.forward(inputs, training=False)
+                imputation_collector.append(results["imputed_data"])
+
+        # Step 3: output collection and return
+        imputation = torch.cat(imputation_collector).cpu().detach().numpy()
+        return {"imputation": imputation}
+
+    def impute(
+        self,
+        X: Union[dict, str],
+        file_type: str = "hdf5",
+    ) -> np.ndarray:
+        """Impute missing values in the given data with the trained model.
+
+        Warnings
+        --------
+        The method impute is deprecated. Please use `predict()` instead.
+
+        Parameters
+        ----------
+        X :
+            The data samples for testing, a dict whose key 'X' is array-like of shape [n_samples, sequence length
+            (time steps), n_features], or a path string locating a data file, e.g. h5 file.
+
+        file_type :
+            The type of the given file if X is a path string.
+
+        Returns
+        -------
+        array-like, shape [n_samples, sequence length (time steps), n_features],
+            Imputed data.
+        """
+        logger.warning(
+            "🚨DeprecationWarning: The method impute is deprecated. Please use `predict` instead."
+        )
+
+        return self.predict(X, file_type=file_type)["imputation"]
diff --git a/pypots/imputation/pyraformer/__init__.py b/pypots/imputation/pyraformer/__init__.py
new file mode 100644
index 00000000..56a3bcac
--- /dev/null
+++ b/pypots/imputation/pyraformer/__init__.py
@@ -0,0 +1,24 @@
+"""
+The package of the partially-observed time-series imputation model Pyraformer.
+
+Refer to the paper
+`Shizhan Liu, Hang Yu, Cong Liao, Jianguo Li, Weiyao Lin, Alex X. Liu, and Schahram Dustdar.
+"Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting".
+International Conference on Learning Representations. 2022.
+<https://openreview.net/pdf?id=0EXmFzUn5I>`_
+
+Notes
+-----
+This implementation is inspired by the official one https://github.com/ant-research/Pyraformer
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from .model import Pyraformer
+
+__all__ = [
+    "Pyraformer",
+]
diff --git a/pypots/imputation/pyraformer/core.py b/pypots/imputation/pyraformer/core.py
new file mode 100644
index 00000000..3087d90a
--- /dev/null
+++ b/pypots/imputation/pyraformer/core.py
@@ -0,0 +1,86 @@
+"""
+The core wrapper assembles the submodules of Pyraformer imputation model
+and takes over the forward process of the algorithm.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+import torch.nn as nn
+
+from ...nn.modules.pyraformer import PyraformerEncoder
+from ...nn.modules.saits import SaitsLoss, SaitsEmbedding
+
+
+class _Pyraformer(nn.Module):
+    """Imputation network built around a Pyraformer encoder, with SAITS embedding and SAITS loss."""
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_ffn: int,
+        dropout: float,
+        attn_dropout: float,
+        window_size: list,
+        inner_size: int,
+        ORT_weight: float = 1,
+        MIT_weight: float = 1,
+    ):
+        super().__init__()
+
+        self.saits_embedding = SaitsEmbedding(
+            n_features * 2, d_model, with_pos=False, dropout=dropout
+        )
+        self.encoder = PyraformerEncoder(
+            n_steps,
+            n_layers,
+            d_model,
+            n_heads,
+            d_ffn,
+            dropout,
+            attn_dropout,
+            window_size,
+            inner_size,
+        )
+
+        # imputation maps back to n_features; the projection reads (len(window_size) + 1) * d_model features
+        encoder_out_dim = (len(window_size) + 1) * d_model
+        self.output_projection = nn.Linear(encoder_out_dim, n_features)
+        self.saits_loss_func = SaitsLoss(ORT_weight, MIT_weight)
+
+    def forward(self, inputs: dict, training: bool = True) -> dict:
+        """Return ``imputed_data``, plus ``ORT_loss``, ``MIT_loss`` and ``loss`` when ``training`` is true."""
+        X = inputs["X"]
+        missing_mask = inputs["missing_mask"]
+
+        # WDU: Pyraformer was not proposed for the imputation task in its original paper, so by itself it does
+        # not take the missing mask into account and cannot know which part of the input is missing, which
+        # may hurt the imputation performance. Therefore the SAITS embedding is applied to project the
+        # concatenation of features and masks into a hidden space, and an output layer projects the encoder
+        # output back from the hidden space to the original data space.
+        hidden = self.saits_embedding(X, missing_mask)
+        hidden, _ = self.encoder(hidden)  # the second return value is not used here
+        reconstruction = self.output_projection(hidden)
+
+        # where missing_mask is 1 the input value is kept, elsewhere the reconstruction is used
+        imputed_data = missing_mask * X + (1 - missing_mask) * reconstruction
+        results = {"imputed_data": imputed_data}
+        if not training:
+            return results
+
+        # training mode: attach the losses to the results
+        loss, ORT_loss, MIT_loss = self.saits_loss_func(
+            reconstruction,
+            inputs["X_ori"],
+            missing_mask,
+            inputs["indicating_mask"],
+        )
+        results["ORT_loss"] = ORT_loss
+        results["MIT_loss"] = MIT_loss
+        # `loss` is always the item for backward propagating to update the model
+        results["loss"] = loss
+        return results
diff --git a/pypots/imputation/pyraformer/data.py b/pypots/imputation/pyraformer/data.py
new file mode 100644
index 00000000..0ed0b31b
--- /dev/null
+++ b/pypots/imputation/pyraformer/data.py
@@ -0,0 +1,24 @@
+"""
+Dataset class for Pyraformer.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union
+
+from ..saits.data import DatasetForSAITS
+
+
+class DatasetForPyraformer(DatasetForSAITS):
+    """Dataset for Pyraformer: same data strategy as SAITS, because training also needs MIT."""
+
+    def __init__(
+        self,
+        data: Union[dict, str],  # either an in-memory dict or a string
+        return_X_ori: bool,
+        return_y: bool,
+        file_type: str = "hdf5",
+        rate: float = 0.2,  # NOTE(review): presumably the MIT masking rate — confirm in DatasetForSAITS
+    ):
+        super().__init__(data, return_X_ori, return_y, file_type, rate)  # forwarded unchanged to the parent
diff --git a/pypots/imputation/pyraformer/model.py b/pypots/imputation/pyraformer/model.py
new file mode 100644
index 00000000..757e96f3
--- /dev/null
+++ b/pypots/imputation/pyraformer/model.py
@@ -0,0 +1,326 @@
+"""
+The implementation of Pyraformer for the partially-observed time-series imputation task.
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Union, Optional
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+from .core import _Pyraformer
+from .data import DatasetForPyraformer
+from ..base import BaseNNImputer
+from ...data.checking import key_in_data_set
+from ...data.dataset import BaseDataset
+from ...optim.adam import Adam
+from ...optim.base import Optimizer
+
+
+class Pyraformer(BaseNNImputer):
+    """The PyTorch implementation of the Pyraformer model.
+    Pyraformer is originally proposed by Liu et al. in :cite:`liu2022pyraformer`.
+
+    Parameters
+    ----------
+    n_steps :
+        The number of time steps in the time-series data sample.
+
+    n_features :
+        The number of features in the time-series data sample.
+
+    n_layers :
+        The number of layers in the Pyraformer model.
+
+    d_model :
+        The dimension of the model.
+
+    n_heads :
+        The number of heads in each layer of Pyraformer.
+
+    d_ffn :
+        The dimension of the feed-forward network.
+
+    window_size :
+        The list of downsampling window sizes in pyramidal attention, one per pyramid scale.
+
+    inner_size :
+        The size of the intra-scale neighbour attention window, i.e. how many adjacent nodes a node attends to.
+
+    dropout :
+        The dropout rate for the model.
+
+    attn_dropout :
+        The dropout rate for the attention mechanism.
+
+    ORT_weight :
+        The weight for the ORT loss, the same as SAITS.
+
+    MIT_weight :
+        The weight for the MIT loss, the same as SAITS.
+
+    batch_size :
+        The batch size for training and evaluating the model.
+
+    epochs :
+        The number of epochs for training the model.
+
+    patience :
+        The patience for the early-stopping mechanism. Given a positive integer, the training process will be
+        stopped when the model does not perform better after that number of epochs.
+        Leaving it default as None will disable the early-stopping.
+
+    optimizer :
+        The optimizer for model training.
+        If not given, will use a default Adam optimizer.
+
+    num_workers :
+        The number of subprocesses to use for data loading.
+        `0` means data loading will be in the main process, i.e. there won't be subprocesses.
+
+    device :
+        The device for the model to run on. It can be a string, a :class:`torch.device` object, or a list of them.
+        If not given, will try to use CUDA devices first (will use the default CUDA device if there are multiple),
+        then CPUs, considering CUDA and CPU are so far the main devices for people to train ML models.
+        If given a list of devices, e.g. ['cuda:0', 'cuda:1'], or [torch.device('cuda:0'), torch.device('cuda:1')] , the
+        model will be parallely trained on the multiple devices (so far only support parallel training on CUDA devices).
+        Other devices like Google TPU and Apple Silicon accelerator MPS may be added in the future.
+
+    saving_path :
+        The path for automatically saving model checkpoints and tensorboard files (i.e. loss values recorded during
+        training into a tensorboard file). Will not save if not given.
+
+    model_saving_strategy :
+        The strategy to save model checkpoints. It has to be one of [None, "best", "better", "all"].
+        No model will be saved when it is set as None.
+        The "best" strategy will only automatically save the best model after the training finished.
+        The "better" strategy will automatically save the model during training whenever the model performs
+        better than in previous epochs.
+        The "all" strategy will save every model after each epoch training.
+
+    """
+
+    def __init__(
+        self,
+        n_steps: int,
+        n_features: int,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_ffn: int,
+        window_size: list,
+        inner_size: int,
+        dropout: float = 0,
+        attn_dropout: float = 0,
+        ORT_weight: float = 1,
+        MIT_weight: float = 1,
+        batch_size: int = 32,
+        epochs: int = 100,
+        patience: int = None,
+        optimizer: Optional[Optimizer] = None,  # None -> a fresh Adam() is created for this instance
+        num_workers: int = 0,
+        device: Optional[Union[str, torch.device, list]] = None,
+        saving_path: str = None,
+        model_saving_strategy: Optional[str] = "best",
+    ):
+        super().__init__(
+            batch_size,
+            epochs,
+            patience,
+            num_workers,
+            device,
+            saving_path,
+            model_saving_strategy,
+        )
+
+        self.n_steps = n_steps
+        self.n_features = n_features
+        # model hyper-parameters
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.d_model = d_model
+        self.d_ffn = d_ffn
+        self.dropout = dropout
+        self.attn_dropout = attn_dropout
+        self.window_size = window_size
+        self.inner_size = inner_size
+        self.ORT_weight = ORT_weight
+        self.MIT_weight = MIT_weight
+
+        # set up the model
+        self.model = _Pyraformer(
+            self.n_steps,
+            self.n_features,
+            self.n_layers,
+            self.d_model,
+            self.n_heads,
+            self.d_ffn,
+            self.dropout,
+            self.attn_dropout,
+            self.window_size,
+            self.inner_size,
+            self.ORT_weight,
+            self.MIT_weight,
+        )
+        self._send_model_to_given_device()
+        self._print_model_size()
+
+        # set up the optimizer; the default Adam is built per instance so models never share one optimizer object
+        self.optimizer = Adam() if optimizer is None else optimizer
+        self.optimizer.init_optimizer(self.model.parameters())
+
+    def _assemble_input_for_training(self, data: list) -> dict:
+        """Pack one training batch into the input dict expected by the model.
+
+        ``data`` is passed through ``_send_data_to_given_device`` and must then unpack into exactly five
+        items, in this order: sample indices, X, missing_mask, X_ori and indicating_mask.
+        The indices are dropped; the four remaining items are returned under those names.
+        """
+        on_device = self._send_data_to_given_device(data)
+        # five-name unpacking keeps the implicit length check of the batch
+        _sample_indices, X, missing_mask, X_ori, indicating_mask = on_device
+
+        return {
+            "X": X,
+            "missing_mask": missing_mask,
+            "X_ori": X_ori,
+            "indicating_mask": indicating_mask,
+        }
+
+    def _assemble_input_for_validating(self, data: list) -> dict:
+        return self._assemble_input_for_training(data)  # validation batches are assembled exactly like training ones
+
+    def _assemble_input_for_testing(self, data: list) -> dict:
+        """Pack one test batch into the input dict expected by the model.
+
+        ``data`` is passed through ``_send_data_to_given_device`` and must then unpack into exactly three
+        items, in this order: sample indices (dropped), X and missing_mask.
+        """
+        # three-name unpacking keeps the implicit length check of the batch
+        _sample_indices, X, missing_mask = self._send_data_to_given_device(data)
+        return {"X": X, "missing_mask": missing_mask}
+
+    def fit(
+        self,
+        train_set: Union[dict, str],
+        val_set: Optional[Union[dict, str]] = None,
+        file_type: str = "hdf5",
+    ) -> None:
+        """Train the model on ``train_set``, load ``self.best_model_dict`` back and put the network in eval mode.
+
+        ``val_set`` is optional but, when given, must contain 'X_ori' (otherwise ValueError is raised).
+        ``file_type`` is forwarded to the dataset class for both sets.
+        """
+        loader_options = {"batch_size": self.batch_size, "num_workers": self.num_workers}
+
+        # training batches are shuffled and built with return_X_ori=False
+        train_dataset = DatasetForPyraformer(
+            train_set, return_X_ori=False, return_y=False, file_type=file_type
+        )
+        training_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
+
+        # validation is optional; when present it keeps its order and is built with return_X_ori=True
+        if val_set is None:
+            val_loader = None
+        elif not key_in_data_set("X_ori", val_set):
+            raise ValueError("val_set must contain 'X_ori' for model validation.")
+        else:
+            val_dataset = DatasetForPyraformer(
+                val_set, return_X_ori=True, return_y=False, file_type=file_type
+            )
+            val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
+
+        # train, then load the best weights back and freeze the network
+        self._train_model(training_loader, val_loader)
+        self.model.load_state_dict(self.best_model_dict)
+        self.model.eval()
+
+        # final save, delegated to the auto-saving helper with the saving confirmed
+        self._auto_save_model_if_necessary(confirm_saving=True)
+
+    def predict(
+        self,
+        test_set: Union[dict, str],
+        file_type: str = "hdf5",
+    ) -> dict:
+        """Make predictions for the input data with the trained model.
+
+        Parameters
+        ----------
+        test_set : dict or str
+            The dataset to impute, should be a dictionary including the key 'X',
+            or a path string locating a data file supported by PyPOTS (e.g. h5 file).
+            If it is a dict, X should be array-like of shape [n_samples, sequence length (n_steps), n_features],
+            which is the time-series data to impute and can contain missing values.
+            If it is a path string, the path should point to a data file, e.g. a h5 file, which contains
+            key-value pairs like a dict, and it has to include the key 'X'.
+            Labels are not used here: the dataset is built with return_y=False.
+
+        file_type :
+            The type of the given file if test_set is a path string.
+
+        Returns
+        -------
+        result_dict :
+            A dictionary with the key 'imputation' holding the imputed data as a numpy array.
+
+        """
+        # Step 1: wrap the input data with classes Dataset and DataLoader
+        self.model.eval()  # set the model as eval status to freeze it.
+        test_set = BaseDataset(
+            test_set,
+            return_X_ori=False,
+            return_X_pred=False,
+            return_y=False,
+            file_type=file_type,
+        )
+        test_loader = DataLoader(
+            test_set,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.num_workers,
+        )
+        imputation_collector = []
+
+        # Step 2: process the data with the model
+        with torch.no_grad():
+            for idx, data in enumerate(test_loader):
+                inputs = self._assemble_input_for_testing(data)
+                results = self.model.forward(inputs, training=False)
+                imputation_collector.append(results["imputed_data"])
+
+        # Step 3: output collection and return
+        imputation = torch.cat(imputation_collector).cpu().detach().numpy()
+        result_dict = {
+            "imputation": imputation,
+        }
+        return result_dict
+
+    def impute(
+        self,
+        test_set: Union[dict, str],
+        file_type: str = "hdf5",
+    ) -> np.ndarray:
+        """Impute missing values in the given data with the trained model.
+
+        Parameters
+        ----------
+        test_set :
+            The data to impute, should be a dictionary including the key 'X' (array-like of shape [n_samples,
+            sequence length (n_steps), n_features]), or a path string locating a data file, e.g. h5 file.
+
+        file_type :
+            The type of the given file if test_set is a path string.
+
+        Returns
+        -------
+        array-like, shape [n_samples, sequence length (n_steps), n_features],
+            Imputed data.
+        """
+        # thin wrapper: predict() does the work and only its 'imputation' entry is returned
+        result_dict = self.predict(test_set, file_type=file_type)
+        return result_dict["imputation"]
diff --git a/pypots/imputation/transformer/model.py b/pypots/imputation/transformer/model.py
index 76d04fac..d7d59097 100644
--- a/pypots/imputation/transformer/model.py
+++ b/pypots/imputation/transformer/model.py
@@ -63,7 +63,7 @@ class Transformer(BaseNNImputer):
         The dropout rate for all fully-connected layers in the model.
 
     attn_dropout :
-        The dropout rate for DMSA.
+        The dropout rate for the attention mechanism.
 
     ORT_weight :
         The weight for the ORT loss.
diff --git a/pypots/nn/modules/nonstationary_transformer/__init__.py b/pypots/nn/modules/nonstationary_transformer/__init__.py
new file mode 100644
index 00000000..d92de271
--- /dev/null
+++ b/pypots/nn/modules/nonstationary_transformer/__init__.py
@@ -0,0 +1,26 @@
+"""
+The package including the modules of Non-stationary Transformer.
+
+Refer to the paper
+`Yong Liu, Haixu Wu, Jianmin Wang, Mingsheng Long.
+Non-stationary Transformers: Exploring the Stationarity in Time Series Forecasting.
+Advances in Neural Information Processing Systems 35 (2022): 9881-9893.
+<https://proceedings.neurips.cc/paper_files/paper/2022/file/4054556fcaa934b0bf76da52cf4f92cb-Paper-Conference.pdf>`_
+
+Notes
+-----
+This implementation is inspired by the official one https://github.com/thuml/Nonstationary_Transformers
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from .autoencoder import NonstationaryTransformerEncoder
+from .layers import DeStationaryAttention, Projector
+
+__all__ = [
+    "NonstationaryTransformerEncoder",
+    "DeStationaryAttention",
+    "Projector",
+]
diff --git a/pypots/nn/modules/nonstationary_transformer/autoencoder.py b/pypots/nn/modules/nonstationary_transformer/autoencoder.py
new file mode 100644
index 00000000..fcd7863f
--- /dev/null
+++ b/pypots/nn/modules/nonstationary_transformer/autoencoder.py
@@ -0,0 +1,120 @@
+"""
+The encoder of the Non-stationary Transformer.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from .layers import DeStationaryAttention
+from ..transformer.layers import TransformerEncoderLayer
+
+
+class NonstationaryTransformerEncoder(nn.Module):
+    """NonstationaryTransformer encoder.
+    Its architecture is the same as the original Transformer encoder,
+    but the attention operator is replaced by the DeStationaryAttention.
+
+    Parameters
+    ----------
+    n_layers:
+        The number of layers in the encoder.
+
+    d_model:
+        The dimension of the module manipulation space.
+        The input tensor will be projected to a space with d_model dimensions.
+
+    n_heads:
+        The number of heads in multi-head attention.
+
+    d_k:
+        The dimension of the key and query tensor. Attention scores are scaled by 1 / sqrt(d_k).
+
+    d_v:
+        The dimension of the value tensor.
+
+    d_ffn:
+        The dimension of the hidden layer in the feed-forward network.
+
+    dropout:
+        The dropout rate.
+
+    attn_dropout:
+        The dropout rate for the attention map.
+
+    """
+
+    def __init__(
+        self,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_k: int,
+        d_v: int,
+        d_ffn: int,
+        dropout: float,
+        attn_dropout: float,
+    ):
+        super().__init__()
+        # DeStationaryAttention MULTIPLIES the scores by its first argument, so pass 1/sqrt(d_k), not sqrt(d_k)
+        self.enc_layer_stack = nn.ModuleList(
+            [
+                TransformerEncoderLayer(
+                    DeStationaryAttention(d_k**-0.5, attn_dropout),
+                    d_model,
+                    n_heads,
+                    d_k,
+                    d_v,
+                    d_ffn,
+                    dropout,
+                )
+                for _ in range(n_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, list]]:
+        """Forward processing of the encoder.
+
+        Parameters
+        ----------
+        x:
+            Input tensor; read as three-dimensional (batch size, n_steps, features) when the default mask is built.
+
+        src_mask:
+            Masking tensor for the attention map. The shape should be [batch_size, n_heads, n_steps, n_steps].
+            If None, a triangular causal mask of shape [batch_size, n_steps, n_steps] is built instead.
+
+        kwargs:
+            Forwarded unchanged to every layer (presumably reaching DeStationaryAttention: ``tau``, ``delta``).
+
+        Returns
+        -------
+        enc_output, attn_weights_collector:
+            The output tensor, and a list containing the attention map from each encoder layer.
+        """
+        attn_weights_collector = []
+        enc_output = x
+
+        if src_mask is None:
+            # triangular causal mask
+            bz, n_steps, _ = x.shape
+            mask_shape = [bz, n_steps, n_steps]
+            src_mask = torch.triu(
+                torch.ones(mask_shape, dtype=torch.bool), diagonal=1
+            ).to(x.device)
+
+        for layer in self.enc_layer_stack:
+            enc_output, attn_weights = layer(enc_output, src_mask, **kwargs)
+            attn_weights_collector.append(attn_weights)
+
+        return enc_output, attn_weights_collector
diff --git a/pypots/nn/modules/nonstationary_transformer/layers.py b/pypots/nn/modules/nonstationary_transformer/layers.py
new file mode 100644
index 00000000..8464bc9e
--- /dev/null
+++ b/pypots/nn/modules/nonstationary_transformer/layers.py
@@ -0,0 +1,107 @@
+"""
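+The layers of the Non-stationary Transformer: the de-stationary attention operator and the projector MLP.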
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+import math
+from typing import Optional, Tuple
+
+import torch
+import torch.fft
+import torch.nn as nn
+
+from ..transformer.attention import AttentionOperator
+
+
+class DeStationaryAttention(AttentionOperator):
+    """De-stationary Attention: softmax(temperature * (q.k * tau + delta)), with dropout, applied to v."""
+
+    def __init__(self, temperature: float, attn_dropout: float = 0.1):
+        super().__init__()
+        # NOTE: the scores are MULTIPLIED by `temperature`; a falsy value falls back to 1 / sqrt(last dim of q)
+        self.temperature = temperature
+        self.dropout = nn.Dropout(attn_dropout)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        v: torch.Tensor,  # NOTE(review): declared before `k` -- confirm positional callers really pass (q, v, k)
+        k: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # q, k, v all have 4 dimensions [batch_size, n_steps, n_heads, d_tensor]
+        # d_tensor could be d_q, d_k, d_v
+        # Returns (output, attn): output follows the "blhd" layout, attn is the post-dropout "bhls" map.
+        temperature = self.temperature or 1.0 / math.sqrt(q.shape[-1])
+
+        # `tau` / `delta` are optional keyword arguments: a missing key is treated like an explicit None
+        tau, delta = kwargs.get("tau"), kwargs.get("delta")
+        tau = 1.0 if tau is None else tau.unsqueeze(1).unsqueeze(1)  # B x 1 x 1 x 1
+        delta = (
+            0.0 if delta is None else delta.unsqueeze(1).unsqueeze(1)
+        )  # B x 1 x 1 x S
+
+        # De-stationary Attention, rescaling pre-softmax score with learned de-stationary factors
+        scores = torch.einsum("blhe,bshe->bhls", q, k) * tau + delta
+
+        if attn_mask is not None:
+            # True entries are excluded from the softmax; float("-inf") does not rely on the torch.inf alias
+            scores.masked_fill_(attn_mask, float("-inf"))
+
+        attn = self.dropout(torch.softmax(temperature * scores, dim=-1))
+        output = torch.einsum("bhls,bshd->blhd", attn, v).contiguous()
+
+        return output, attn
+
+
+class Projector(nn.Module):
+    """
+    MLP to learn the De-stationary factors: a Conv1d collapses the n_steps axis, then an MLP maps [conv, stats].
+    """
+
+    def __init__(
+        self,
+        d_in: int,
+        n_steps: int,
+        d_hidden: list,  # one width per hidden layer; its length must equal n_hidden_layers
+        n_hidden_layers: int,
+        d_output: int,
+        kernel_size: int = 3,
+    ):
+        super().__init__()
+
+        assert (
+            len(d_hidden) == n_hidden_layers
+        ), f"The length of d_hidden should be equal to n_hidden_layers, but got {len(d_hidden)} and {n_hidden_layers}."
+        # NOTE(review): padding of 1 keeps the length only for kernel_size=3 -- confirm before using other sizes
+        padding = 1 if torch.__version__ >= "1.5.0" else 2
+        self.series_conv = nn.Conv1d(
+            in_channels=n_steps,
+            out_channels=1,
+            kernel_size=kernel_size,
+            padding=padding,
+            padding_mode="circular",
+            bias=False,
+        )
+
+        layers = [nn.Linear(2 * d_in, d_hidden[0]), nn.ReLU()]  # 2 * d_in: conv output and stats, flattened together
+        for i in range(n_hidden_layers - 1):
+            layers += [nn.Linear(d_hidden[i], d_hidden[i + 1]), nn.ReLU()]
+
+        layers += [nn.Linear(d_hidden[-1], d_output, bias=False)]  # final projection: no bias, no activation
+        self.backbone = nn.Sequential(*layers)
+
+    def forward(self, x, stats):
+        # x:     B x S x E
+        # stats: B x 1 x E
+        # y:     B x O
+        batch_size = x.shape[0]
+        x = self.series_conv(x)  # B x 1 x E
+        x = torch.cat([x, stats], dim=1)  # B x 2 x E
+        x = x.view(batch_size, -1)  # B x 2E
+        y = self.backbone(x)  # B x O
+
+        return y
diff --git a/pypots/nn/modules/pyraformer/__init__.py b/pypots/nn/modules/pyraformer/__init__.py
new file mode 100644
index 00000000..f20b9cc9
--- /dev/null
+++ b/pypots/nn/modules/pyraformer/__init__.py
@@ -0,0 +1,24 @@
+"""
+The package including the modules of Pyraformer.
+
+Refer to the paper
+`Shizhan Liu, Hang Yu, Cong Liao, Jianguo Li, Weiyao Lin, Alex X. Liu, and Schahram Dustdar.
+"Pyraformer: Low-Complexity Pyramidal Attention for Long-Range Time Series Modeling and Forecasting".
+International Conference on Learning Representations. 2022.
+<https://openreview.net/pdf?id=0EXmFzUn5I>`_
+
+Notes
+-----
+This implementation is inspired by the official one https://github.com/ant-research/Pyraformer
+
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+from .autoencoder import PyraformerEncoder
+
+__all__ = [
+    "PyraformerEncoder",
+]
diff --git a/pypots/nn/modules/pyraformer/autoencoder.py b/pypots/nn/modules/pyraformer/autoencoder.py
new file mode 100644
index 00000000..df2d455a
--- /dev/null
+++ b/pypots/nn/modules/pyraformer/autoencoder.py
@@ -0,0 +1,73 @@
+"""
+The encoder of Pyraformer.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from .layers import get_mask, refer_points, Bottleneck_Construct
+from ..transformer.attention import ScaledDotProductAttention
+from ..transformer.layers import TransformerEncoderLayer
+
+
+class PyraformerEncoder(nn.Module):  # Pyraformer encoder: bottleneck pyramid + masked Transformer layers
+    def __init__(
+        self,
+        n_steps: int,
+        n_layers: int,
+        d_model: int,
+        n_heads: int,
+        d_ffn: int,
+        dropout: float,
+        attn_dropout: float,
+        window_size: list,
+        inner_size: int,
+    ):
+        super().__init__()
+        # the bottleneck works at a quarter of d_model; d_model is split evenly across the heads
+        d_bottleneck = d_model // 4
+        d_k = d_v = d_model // n_heads
+
+        self.mask, self.all_size = get_mask(n_steps, window_size, inner_size)  # plain attributes, not buffers
+        self.indexes = refer_points(self.all_size, window_size)
+        self.layer_stack = nn.ModuleList(
+            [
+                TransformerEncoderLayer(
+                    ScaledDotProductAttention(d_k**0.5, attn_dropout),
+                    d_model,
+                    n_heads,
+                    d_k,
+                    d_v,
+                    d_ffn,
+                    dropout=dropout,
+                )
+                for _ in range(n_layers)
+            ]
+        )  # in the official code, they only use the naive pyramid attention
+        self.conv_layers = Bottleneck_Construct(d_model, window_size, d_bottleneck)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        src_mask: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, list]]:
+        """Encode x and return (enc_output, attn_weights_collector); `src_mask` is accepted but never used."""
+        mask = self.mask.repeat(len(x), 1, 1).to(x.device)  # one copy of the fixed pyramid mask per sample
+        x = self.conv_layers(x)
+
+        attn_weights_collector = []
+        for layer in self.layer_stack:
+            x, attn_weights = layer(x, mask)
+            attn_weights_collector.append(attn_weights)
+
+        indexes = self.indexes.repeat(x.size(0), 1, 1, x.size(2)).to(x.device)
+        indexes = indexes.view(x.size(0), -1, x.size(2))
+        all_enc = torch.gather(x, 1, indexes)  # pick, for every input step, its node at each scale
+        enc_output = all_enc.view(x.size(0), self.all_size[0], -1)  # last dim = len(self.all_size) * x.size(2)
+
+        return enc_output, attn_weights_collector
diff --git a/pypots/nn/modules/pyraformer/layers.py b/pypots/nn/modules/pyraformer/layers.py
new file mode 100644
index 00000000..0fc61e90
--- /dev/null
+++ b/pypots/nn/modules/pyraformer/layers.py
@@ -0,0 +1,130 @@
+"""
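+The layers of Pyraformer: pyramidal attention mask construction and the bottleneck convolution (CSCM).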
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+import math
+
+import torch
+import torch.fft
+import torch.nn as nn
+
+
+def get_mask(input_size, window_size, inner_size):
+    """Get the attention mask of PAM-Naive, together with the size of every pyramid scale.
+
+    Returns ``(mask, all_size)``. ``all_size[0]`` is ``input_size`` and each following entry is the previous
+    one divided by the matching window (floored). ``mask`` is a square bool tensor over the nodes of all
+    scales laid end to end: False where attention is allowed, True where it is blocked.
+    """
+    all_size = [input_size]
+    for window in window_size:
+        all_size.append(math.floor(all_size[-1] / window))
+
+    total_nodes = sum(all_size)
+    allowed = torch.zeros(total_nodes, total_nodes)
+    # index of the first node of every scale in the end-to-end layout
+    offsets = [sum(all_size[:scale]) for scale in range(len(all_size))]
+    half_span = inner_size // 2
+
+    # intra-scale: a node sees its neighbours within half_span, clipped to its own scale
+    for first, size in zip(offsets, all_size):
+        last = first + size
+        for node in range(first, last):
+            lo = max(node - half_span, first)
+            hi = min(node + half_span + 1, last)
+            allowed[node, lo:hi] = 1
+
+    # inter-scale: a node and its children in the next finer scale see each other
+    for scale in range(1, len(all_size)):
+        first = offsets[scale]
+        child_first = offsets[scale - 1]
+        window = window_size[scale - 1]
+        for node in range(first, first + all_size[scale]):
+            lo = child_first + (node - first) * window
+            # the last node of a scale also takes every leftover child up to the end of the finer scale
+            is_last = node == first + all_size[scale] - 1
+            hi = first if is_last else child_first + (node - first + 1) * window
+            allowed[node, lo:hi] = 1
+            allowed[lo:hi, node] = 1
+
+    return (1 - allowed).bool(), all_size
+
+
+def refer_points(all_sizes, window_size):
+    """Gather features from PAM's pyramid sequences: index each input step's covering node at every scale.
+
+    Returns a long tensor of shape (1, all_sizes[0], len(all_sizes), 1). Entry [0, i, j, 0] is the position,
+    in the scales laid end to end, of the scale-j node reached from input step i (column 0 is i itself).
+    """
+    n_scales = len(all_sizes)
+    offsets = [sum(all_sizes[:scale]) for scale in range(n_scales)]
+    table = torch.zeros(all_sizes[0], n_scales)
+    for step in range(all_sizes[0]):
+        table[step][0] = step
+        node = step
+        for scale in range(1, n_scales):
+            # position inside the finer scale -> parent position, clamped to the last node of this scale
+            parent = (node - offsets[scale - 1]) // window_size[scale - 1]
+            node = offsets[scale] + min(parent, all_sizes[scale] - 1)
+            table[step][scale] = node
+
+    return table.unsqueeze(0).unsqueeze(3).long()
+
+
+class ConvLayer(nn.Module):
+    """Downsampling block: strided Conv1d, BatchNorm1d, then ELU.
+
+    The convolution uses kernel_size == stride == window_size and keeps the channel count at c_in.
+    """
+
+    def __init__(self, c_in, window_size):
+        super().__init__()
+        self.downConv = nn.Conv1d(c_in, c_in, kernel_size=window_size, stride=window_size)
+        self.norm = nn.BatchNorm1d(c_in)
+        self.activation = nn.ELU()
+
+    def forward(self, x):
+        """Return activation(norm(downConv(x)))."""
+        downsampled = self.downConv(x)
+        normalised = self.norm(downsampled)
+        return self.activation(normalised)
+
+
+class Bottleneck_Construct(nn.Module):
+    """Bottleneck convolution CSCM.
+
+    Projects the input down to d_inner, derives one coarser sequence per window with stacked ConvLayers,
+    projects them back up to d_model and appends them after the input along dim 1, then applies LayerNorm.
+    """
+
+    def __init__(self, d_model, window_size, d_inner):
+        super().__init__()
+        # a list OR tuple gives one window per scale; any other value is one window reused for three scales
+        if isinstance(window_size, (list, tuple)):
+            per_scale_windows = list(window_size)
+        else:
+            per_scale_windows = [window_size] * 3
+        self.conv_layers = nn.ModuleList(
+            [ConvLayer(d_inner, window) for window in per_scale_windows]
+        )
+        # creation order (convs, up, down, norm) is unchanged, so parameter order and seeded init are too
+        self.up = nn.Linear(d_inner, d_model)
+        self.down = nn.Linear(d_model, d_inner)
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(self, enc_input):
+        """Return the input sequence followed by all coarser scales, layer-normalised."""
+        coarse = self.down(enc_input).permute(0, 2, 1)  # features moved to dim 1 for the Conv1d layers
+        scales = []
+        for conv in self.conv_layers:
+            coarse = conv(coarse)  # every scale is computed from the previous scale's output
+            scales.append(coarse)
+
+        # join the scales along the sequence axis and restore the (batch, sequence, feature) layout
+        pyramid = torch.cat(scales, dim=2).transpose(1, 2)
+        pyramid = self.up(pyramid)
+        pyramid = torch.cat([enc_input, pyramid], dim=1)
+        return self.norm(pyramid)
diff --git a/tests/imputation/nonstationary_transformer.py b/tests/imputation/nonstationary_transformer.py
new file mode 100644
index 00000000..57dcf537
--- /dev/null
+++ b/tests/imputation/nonstationary_transformer.py
@@ -0,0 +1,139 @@
+"""
+Test cases for NonstationaryTransformer imputation model.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+import os.path
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import NonstationaryTransformer
+from pypots.optim import Adam
+from pypots.utils.logging import logger
+from pypots.utils.metrics import calc_mse
+from tests.global_test_config import (
+    DATA,
+    EPOCHS,
+    DEVICE,
+    TRAIN_SET,
+    VAL_SET,
+    TEST_SET,
+    GENERAL_H5_TRAIN_SET_PATH,
+    GENERAL_H5_VAL_SET_PATH,
+    GENERAL_H5_TEST_SET_PATH,
+    RESULT_SAVING_DIR_FOR_IMPUTATION,
+    check_tb_and_model_checkpoints_existence,
+)
+
+
+class TestNonstationaryTransformer(unittest.TestCase):
+    logger.info("Running tests for an imputation model NonstationaryTransformer...")  # runs when the class body executes
+
+    # set the log and model saving path
+    saving_path = os.path.join(
+        RESULT_SAVING_DIR_FOR_IMPUTATION, "NonstationaryTransformer"
+    )
+    model_save_name = "saved_nonstationary_transformer_model.pypots"
+
+    # initialize an Adam optimizer
+    optimizer = Adam(lr=0.001, weight_decay=1e-5)
+
+    # initialize a NonstationaryTransformer model (a class attribute, so every test method shares this instance)
+    nonstationary_transformer = NonstationaryTransformer(
+        DATA["n_steps"],
+        DATA["n_features"],
+        n_layers=2,
+        d_model=32,
+        n_heads=2,
+        d_ffn=32,
+        d_projector_hidden=[64, 64],
+        n_projector_hidden_layers=2,
+        dropout=0,
+        epochs=EPOCHS,
+        saving_path=saving_path,
+        optimizer=optimizer,
+        device=DEVICE,
+    )
+
+    @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
+    def test_0_fit(self):
+        self.nonstationary_transformer.fit(TRAIN_SET, VAL_SET)
+
+    @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
+    def test_1_impute(self):
+        imputation_results = self.nonstationary_transformer.predict(TEST_SET)
+        assert not np.isnan(
+            imputation_results["imputation"]
+        ).any(), "Output still has missing values after running impute()."
+
+        test_MSE = calc_mse(
+            imputation_results["imputation"],
+            DATA["test_X_ori"],
+            DATA["test_X_indicating_mask"],
+        )
+        logger.info(f"NonstationaryTransformer test_MSE: {test_MSE}")  # logged only, no threshold is asserted
+
+    @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
+    def test_2_parameters(self):
+        assert (
+            hasattr(self.nonstationary_transformer, "model")
+            and self.nonstationary_transformer.model is not None
+        )
+
+        assert (
+            hasattr(self.nonstationary_transformer, "optimizer")
+            and self.nonstationary_transformer.optimizer is not None
+        )
+
+        assert hasattr(self.nonstationary_transformer, "best_loss")
+        self.assertNotEqual(self.nonstationary_transformer.best_loss, float("inf"))
+
+        assert (
+            hasattr(self.nonstationary_transformer, "best_model_dict")
+            and self.nonstationary_transformer.best_model_dict is not None
+        )
+
+    @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
+    def test_3_saving_path(self):
+        # whether the root saving dir exists, which should be created by save_log_into_tb_file
+        assert os.path.exists(
+            self.saving_path
+        ), f"file {self.saving_path} does not exist"
+
+        # check if the tensorboard file and model checkpoints exist
+        check_tb_and_model_checkpoints_existence(self.nonstationary_transformer)
+
+        # save the trained model into file, and check if the path exists
+        saved_model_path = os.path.join(self.saving_path, self.model_save_name)
+        self.nonstationary_transformer.save(saved_model_path)
+
+        # test loading the saved model, not necessary, but need to test
+        self.nonstationary_transformer.load(saved_model_path)
+
+    @pytest.mark.xdist_group(name="imputation-nonstationary_transformer")
+    def test_4_lazy_loading(self):
+        self.nonstationary_transformer.fit(
+            GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH
+        )
+        imputation_results = self.nonstationary_transformer.predict(
+            GENERAL_H5_TEST_SET_PATH
+        )
+        assert not np.isnan(
+            imputation_results["imputation"]
+        ).any(), "Output still has missing values after running impute()."
+
+        test_MSE = calc_mse(
+            imputation_results["imputation"],
+            DATA["test_X_ori"],
+            DATA["test_X_indicating_mask"],
+        )
+        logger.info(f"Lazy-loading NonstationaryTransformer test_MSE: {test_MSE}")  # logged only, not asserted
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/imputation/pyraformer.py b/tests/imputation/pyraformer.py
new file mode 100644
index 00000000..64629647
--- /dev/null
+++ b/tests/imputation/pyraformer.py
@@ -0,0 +1,131 @@
+"""
+Test cases for Pyraformer imputation model.
+"""
+
+# Created by Wenjie Du <wenjay.du@gmail.com>
+# License: BSD-3-Clause
+
+
+import os.path
+import unittest
+
+import numpy as np
+import pytest
+
+from pypots.imputation import Pyraformer
+from pypots.optim import Adam
+from pypots.utils.logging import logger
+from pypots.utils.metrics import calc_mse
+from tests.global_test_config import (
+    DATA,
+    EPOCHS,
+    DEVICE,
+    TRAIN_SET,
+    VAL_SET,
+    TEST_SET,
+    GENERAL_H5_TRAIN_SET_PATH,
+    GENERAL_H5_VAL_SET_PATH,
+    GENERAL_H5_TEST_SET_PATH,
+    RESULT_SAVING_DIR_FOR_IMPUTATION,
+    check_tb_and_model_checkpoints_existence,
+)
+
+
+class TestPyraformer(unittest.TestCase):
+    # NOTE: everything at class level runs once, when this class body is executed.
+    logger.info("Running tests for an imputation model Pyraformer...")
+
+    # root dir handed to the model as saving_path, and the file name used by test_3
+    saving_path = os.path.join(RESULT_SAVING_DIR_FOR_IMPUTATION, "Pyraformer")
+    model_save_name = "saved_pyraformer_model.pypots"
+
+    # one Adam optimizer and one Pyraformer model, stored as class attributes so
+    # that every test method below works on the same model object
+    optimizer = Adam(lr=0.001, weight_decay=1e-5)
+    pyraformer = Pyraformer(
+        DATA["n_steps"],
+        DATA["n_features"],
+        n_layers=2,
+        n_heads=2,
+        d_model=32,
+        d_ffn=32,
+        inner_size=3,
+        window_size=[2, 2],
+        dropout=0,
+        attn_dropout=0,
+        optimizer=optimizer,
+        epochs=EPOCHS,
+        device=DEVICE,
+        saving_path=saving_path,
+    )
+
+    @pytest.mark.xdist_group(name="imputation-pyraformer")
+    def test_0_fit(self):
+        self.pyraformer.fit(TRAIN_SET, VAL_SET)  # must complete without raising
+
+    @pytest.mark.xdist_group(name="imputation-pyraformer")
+    def test_1_impute(self):
+        # run predict() on TEST_SET; its "imputation" entry must contain no NaN
+        imputed = self.pyraformer.predict(TEST_SET)["imputation"]
+        has_nan = np.isnan(imputed).any()
+        assert not has_nan, "Output still has missing values after running impute()."
+
+        # log what calc_mse returns for the imputation against DATA["test_X_ori"],
+        # with DATA["test_X_indicating_mask"] passed as its third argument
+        reference = DATA["test_X_ori"]
+        mask = DATA["test_X_indicating_mask"]
+        test_MSE = calc_mse(imputed, reference, mask)
+        logger.info(f"Pyraformer test_MSE: {test_MSE}")
+
+    @pytest.mark.xdist_group(name="imputation-pyraformer")
+    def test_2_parameters(self):
+        model = self.pyraformer
+
+        # attributes that must exist and must not be None
+        for attr in ("model", "optimizer"):
+            assert hasattr(model, attr) and getattr(model, attr) is not None
+
+        # best_loss must exist and must differ from float("inf")
+        # (presumably its value before any training happened - TODO confirm)
+        assert hasattr(model, "best_loss")
+        self.assertNotEqual(model.best_loss, float("inf"))
+
+        # best_model_dict must exist and must not be None
+        assert hasattr(model, "best_model_dict")
+        assert model.best_model_dict is not None
+
+    @pytest.mark.xdist_group(name="imputation-pyraformer")
+    def test_3_saving_path(self):
+        # root saving dir must exist; it should be created by save_log_into_tb_file
+        root_dir = self.saving_path
+        assert os.path.exists(root_dir), f"file {self.saving_path} does not exist"
+
+        # tensorboard file and model checkpoints must exist for this model
+        model = self.pyraformer
+        check_tb_and_model_checkpoints_existence(model)
+
+        # save the model into a file located under the root saving dir
+        target_file = os.path.join(root_dir, self.model_save_name)
+        model.save(target_file)
+
+        # then load it back from the same path, so that load() is exercised too
+        model.load(target_file)
+
+    @pytest.mark.xdist_group(name="imputation-pyraformer")
+    def test_4_lazy_loading(self):
+        # same checks as test_1, but fit/predict are given the GENERAL_H5_* paths
+        model = self.pyraformer
+        model.fit(GENERAL_H5_TRAIN_SET_PATH, GENERAL_H5_VAL_SET_PATH)
+        imputed = model.predict(GENERAL_H5_TEST_SET_PATH)["imputation"]
+        has_nan = np.isnan(imputed).any()
+        assert not has_nan, "Output still has missing values after running impute()."
+
+        # log what calc_mse returns, with the same reference and mask as test_1
+        reference = DATA["test_X_ori"]
+        mask = DATA["test_X_indicating_mask"]
+        test_MSE = calc_mse(imputed, reference, mask)
+        logger.info(f"Lazy-loading Pyraformer test_MSE: {test_MSE}")
+
+
+if __name__ == "__main__":  # run this module's tests when executed as a script
+    unittest.main()