NatLabRockies · nbodini · Jan 12, 2022 · Sep 7, 2021 · Sep 7, 2021 · Oct 4, 2021
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pycqa/isort
-    rev: 5.9.3
+    rev: 5.10.1
     hooks:
       - id: isort
         additional_dependencies: [toml]
@@ -16,15 +16,15 @@ repos:
 
 
   - repo: https://github.com/psf/black
-    rev: 21.8b0
+    rev: 21.12b0
     hooks:
         - id: black
           name: black
           stages: [commit]
           language_version: python3.7
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v2.3.0
+    rev: v4.1.0
     hooks:
     -   id: trailing-whitespace
     -   id: end-of-file-fixer
@@ -36,6 +36,8 @@ repos:
     -   id: mixed-line-ending
     -   id: pretty-format-json
         args: [--autofix]
-    -   id: flake8
-        additional_dependencies: [flake8-docstrings]
-        exclude: ^tests/
+
+  -   repo: https://github.com/pycqa/flake8
+      rev: '4.0.1'
+      hooks:
+      -   id: flake8
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ All notable changes to this project will be documented in this file. If you make
 - Added hourly resolution to AEP calculation
 - Added wind farm plotting function to pandas_plotting toolkit using the Bokeh library
 - Split the QC methods into a more generic `WindToolKitQualityControlDiagnosticSuite` class and WTK-specific subclass: `WindToolKitQualityControlDiagnosticSuite`.
+- Updated filter algorithms in AEP calculation, now with a proper outlier filter
 
 ## [2.2 - 2021-05-28]
 - IAV incorporation in AEP calculation

diff --git a/examples/02_plant_aep_analysis.ipynb b/examples/02_plant_aep_analysis.ipynb
diff --git a/examples/02b_augmented_plant_aep_analysis.ipynb b/examples/02b_augmented_plant_aep_analysis.ipynb
diff --git a/examples/03_turbine_ideal_energy.ipynb b/examples/03_turbine_ideal_energy.ipynb
diff --git a/examples/05_eya_gap_analysis.ipynb b/examples/05_eya_gap_analysis.ipynb
@@ -111,10 +111,10 @@
    "outputs": [],
    "source": [
     "# Calculate TIE\n",
-    "ta = turbine_long_term_gross_energy.TurbineLongTermGrossEnergy(project)\n",
+    "ta = turbine_long_term_gross_energy.TurbineLongTermGrossEnergy(project, UQ=False)\n",
     "ta.run(reanal_subset = ['era5', 'merra2'], \n",
     "       max_power_filter = 0.85, \n",
-    "       wind_bin_thresh = 2.0, \n",
+    "       wind_bin_thresh = 2.0,\n",
     "       correction_threshold = 0.90, \n",
     "       enable_plotting = False,\n",
     "       plot_dir = None)"
@@ -234,7 +234,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.8.10"
   },
   "toc": {
    "base_numbering": 1,

diff --git a/examples/daily_power_curve_R80711.png b/examples/daily_power_curve_R80711.png
diff --git a/examples/daily_power_curve_R80721.png b/examples/daily_power_curve_R80721.png
diff --git a/examples/daily_power_curve_R80736.png b/examples/daily_power_curve_R80736.png
diff --git a/examples/daily_power_curve_R80790.png b/examples/daily_power_curve_R80790.png
diff --git a/examples/filtered_power_curve_R80711.png b/examples/filtered_power_curve_R80711.png
diff --git a/examples/filtered_power_curve_R80721.png b/examples/filtered_power_curve_R80721.png
diff --git a/examples/filtered_power_curve_R80736.png b/examples/filtered_power_curve_R80736.png
diff --git a/examples/filtered_power_curve_R80790.png b/examples/filtered_power_curve_R80790.png
diff --git a/operational_analysis/methods/plant_analysis.py b/operational_analysis/methods/plant_analysis.py
@@ -81,6 +81,8 @@ def __init__(
         uncertainty_losses=0.05,
         uncertainty_windiness=(10, 20),
         uncertainty_loss_max=(10, 20),
+        outlier_detection=False,
+        uncertainty_outlier=(1, 3),
         uncertainty_nan_energy=0.01,
         time_resolution="M",
         reg_model="lin",
@@ -98,6 +100,8 @@ def __init__(
          uncertainty_losses(:obj:`float`): uncertainty on long-term losses
          uncertainty_windiness(:obj:`tuple`): number of years to use for the windiness correction
          uncertainty_loss_max(:obj:`tuple`): threshold for the combined availabilty and curtailment monthly loss threshold
+         outlier_detection(:obj:`bool`): whether to perform (True) or not (False - default) outlier detection filtering
+         uncertainty_outlier(:obj:`tuple`): min and max thresholds (Monte-Carlo sampled) for the outlier detection filter. At monthly resolution, this is the tuning constant for Huber’s t function for a robust linear regression. At daily/hourly resolution, this is the number of stdev of wind speed used as threshold for the bin filter.
          uncertainty_nan_energy(:obj:`float`): threshold to flag days/months based on NaNs
          time_resolution(:obj:`string`): whether to perform the AEP calculation at monthly ('M'), daily ('D') or hourly ('H') time resolution
          reg_model(:obj:`string`): which model to use for the regression ('lin' for linear, 'gam', 'gbm', 'etr'). At monthly time resolution only linear regression is allowed because of the reduced number of data points.
@@ -120,8 +124,10 @@ def __init__(
         self.uncertainty_meter = np.float64(uncertainty_meter)
         self.uncertainty_losses = np.float64(uncertainty_losses)
         self.uncertainty_windiness = np.array(uncertainty_windiness, dtype=np.float64)
+        self.uncertainty_outlier = np.array(uncertainty_outlier, dtype=np.float64)
         self.uncertainty_loss_max = np.array(uncertainty_loss_max, dtype=np.float64)
         self.uncertainty_nan_energy = np.float64(uncertainty_nan_energy)
+        self.outlier_detection = outlier_detection
 
         # Check that selected time resolution is allowed
         if time_resolution not in ["M", "D", "H"]:
@@ -279,7 +285,8 @@ def plot_reanalysis_normalized_rolling_monthly_windspeed(self):
 
     def plot_reanalysis_gross_energy_data(self, outlier_thres):
         """
-        Make a plot of normalized 30-day gross energy vs wind speed for each reanalysis product, include R2 measure
+        Make a plot of gross energy vs wind speed for each reanalysis product,
+        with outliers highlighted
 
         Args:
             outlier_thres (float): outlier threshold (typical range of 1 to 4) which adjusts outlier sensitivity detection
@@ -295,50 +302,86 @@ def plot_reanalysis_gross_energy_data(self, outlier_thres):
         # Loop through each reanalysis product and make a scatterplot of monthly wind speed vs plant energy
         for p in np.arange(0, len(list(self._reanal_products))):
             col_name = self._reanal_products[p]  # Reanalysis column in monthly data frame
+            # Plot
+            plt.subplot(2, 2, p + 1)
 
-            x = sm.add_constant(
-                valid_aggregate[col_name]
-            )  # Define 'x'-values (constant needed for regression function)
-            if self.time_resolution == "M":
+            if (
+                self.time_resolution == "M"
+            ):  # Monthly case: apply robust linear regression for outliers detection
+                x = sm.add_constant(
+                    valid_aggregate[col_name]
+                )  # Define 'x'-values (constant needed for regression function)
                 y = (
                     valid_aggregate["gross_energy_gwh"] * 30 / valid_aggregate["num_days_expected"]
                 )  # Normalize energy data to 30-days
-            else:
+
+                rlm = sm.RLM(
+                    y, x, M=sm.robust.norms.HuberT(t=outlier_thres)
+                )  # Robust linear regression with HuberT algorithm (threshold equal to outlier_thres)
+                rlm_results = rlm.fit()
+
+                r2 = np.corrcoef(
+                    x.loc[rlm_results.weights == 1, col_name], y[rlm_results.weights == 1]
+                )[
+                    0, 1
+                ]  # Get R2 from valid data
+
+                # Continue plotting
+                plt.plot(
+                    x.loc[rlm_results.weights != 1, col_name],
+                    y[rlm_results.weights != 1],
+                    "rx",
+                    label="Outlier",
+                )
+                plt.plot(
+                    x.loc[rlm_results.weights == 1, col_name],
+                    y[rlm_results.weights == 1],
+                    ".",
+                    label="Valid data",
+                )
+                plt.title(col_name + ", R2=" + str(np.round(r2, 3)))
+                plt.ylabel("30-day normalized gross energy (GWh)")
+
+            else:  # Daily/hourly case: apply bin filter for outliers detection
+                x = valid_aggregate[col_name]
                 y = valid_aggregate["gross_energy_gwh"]
+                plant_capac = self._plant._plant_capacity / 1000.0 * self._hours_in_res
+
+                # Apply bin filter
+                flag = filters.bin_filter(
+                    bin_col=y,
+                    value_col=x,
+                    bin_width=0.06 * plant_capac,
+                    threshold=outlier_thres,  # wind bin threshold (stdev outside the median)
+                    center_type="median",
+                    bin_min=0.01 * plant_capac,
+                    bin_max=0.85 * plant_capac,
+                    threshold_type="std",
+                    direction="all",  # both left and right (from the median)
+                )
 
-            rlm = sm.RLM(
-                y, x, M=sm.robust.norms.HuberT(t=outlier_thres)
-            )  # Robust linear regression with HuberT algorithm (threshold equal to 2)
-            rlm_results = rlm.fit()
+                # Continue plotting
+                plt.plot(
+                    x.loc[flag],
+                    y[flag],
+                    "rx",
+                    label="Outlier",
+                )
+                plt.plot(
+                    x.loc[~flag],
+                    y[~flag],
+                    ".",
+                    label="Valid data",
+                )
 
-            r2 = np.corrcoef(
-                x.loc[rlm_results.weights == 1, col_name], y[rlm_results.weights == 1]
-            )[
-                0, 1
-            ]  # Get R2 from valid data
+                if self.time_resolution == "D":
+                    plt.ylabel("Daily gross energy (GWh)")
+                elif self.time_resolution == "H":
+                    plt.ylabel("Hourly gross energy (GWh)")
+                plt.title(col_name)
 
-            # Plot results
-            plt.subplot(2, 2, p + 1)
-            plt.plot(
-                x.loc[rlm_results.weights != 1, col_name],
-                y[rlm_results.weights != 1],
-                "rx",
-                label="Outlier",
-            )
-            plt.plot(
-                x.loc[rlm_results.weights == 1, col_name],
-                y[rlm_results.weights == 1],
-                ".",
-                label="Valid data",
-            )
-            plt.title(col_name + ", R2=" + str(np.round(r2, 3)))
             plt.xlabel("Wind speed (m/s)")
-            if self.time_resolution == "M":
-                plt.ylabel("30-day normalized gross energy (GWh)")
-            elif self.time_resolution == "D":
-                plt.ylabel("Daily gross energy (GWh)")
-            elif self.time_resolution == "H":
-                plt.ylabel("Hourly gross energy (GWh)")
+
         plt.tight_layout()
         return plt
 
@@ -750,6 +793,15 @@ def setup_monte_carlo_inputs(self):
             )
             / 100.0,
         }
+        if self.outlier_detection:
+            inputs["outlier_threshold"] = (
+                np.random.randint(
+                    self.uncertainty_outlier[0] * 10,
+                    (self.uncertainty_outlier[1] + 0.1) * 10,
+                    self.num_sim,
+                )
+                / 10.0
+            )
 
         self._inputs = pd.DataFrame(inputs)
 
@@ -791,8 +843,13 @@ def filter_outliers(self, n):
 
         # Apply range filter to wind speed
         df_sub = df_sub.assign(flag_range=filters.range_flag(df_sub[reanal], below=0, above=40))
-        # Apply frozen/unresponsive sensor filter
-        df_sub.loc[:, "flag_frozen"] = filters.unresponsive_flag(df_sub[reanal], threshold=3)
+        if self.reg_temperature:
+            # Apply range filter to temperatre
+            df_sub = df_sub.assign(
+                flag_range_T=filters.range_flag(
+                    df_sub[reanal + "_temperature_K"], below=200, above=320
+                )
+            )
         # Apply window range filter
         df_sub.loc[:, "flag_window"] = filters.window_range_flag(
             window_col=df_sub[reanal],
@@ -803,12 +860,50 @@ def filter_outliers(self, n):
             value_max=1.2 * plant_capac,
         )
 
+        if self.outlier_detection:
+            if self.time_resolution == "M":
+                # Monthly linear regression (i.e., few data points):
+                # filter outliers based on robust linear regression
+                # using Huber algorithm to flag outliers
+                X = sm.add_constant(df_sub[reanal])  # Reanalysis data with constant column
+                y = (
+                    df_sub["gross_energy_gwh"] * 30 / df_sub["num_days_expected"]
+                )  # Energy data (normalized to 30-days)
+
+                # Perform robust linear regression
+                rlm = sm.RLM(y, X, M=sm.robust.norms.HuberT(self._run.outlier_threshold))
+                rlm_results = rlm.fit()
+
+                # Define valid data as points in which the Huber algorithm returned a value of 1
+                df_sub.loc[:, "flag_outliers"] = rlm_results.weights != 1
+
+            else:
+                # Daily regressions (i.e., higher number of data points):
+                # Apply bin filter to catch outliers
+                df_sub.loc[:, "flag_outliers"] = filters.bin_filter(
+                    bin_col=df_sub["gross_energy_gwh"],
+                    value_col=df_sub[reanal],
+                    bin_width=0.06 * plant_capac,
+                    threshold=self._run.outlier_threshold,  # wind bin threshold (multiplicative factor of std of <value_col> in bin)
+                    center_type="median",
+                    bin_min=0.01 * plant_capac,
+                    bin_max=0.85 * plant_capac,
+                    threshold_type="std",
+                    direction="all",  # both left and right (from the median)
+                )
+        else:
+            df_sub.loc[:, "flag_outliers"] = False
+
         # Create a 'final' flag which is true if any of the previous flags are true
         df_sub.loc[:, "flag_final"] = (
             (df_sub.loc[:, "flag_range"])
-            | (df_sub.loc[:, "flag_frozen"])
             | (df_sub.loc[:, "flag_window"])
+            | (df_sub.loc[:, "flag_outliers"])
         )
+        if self.reg_temperature:
+            df_sub.loc[:, "flag_final"] = (df_sub.loc[:, "flag_final"]) | (
+                df_sub.loc[:, "flag_range_T"]
+            )
 
         # Define valid data
         valid_data = df_sub.loc[
@@ -817,7 +912,8 @@ def filter_outliers(self, n):
         ]
         if self.reg_winddirection:
             valid_data_to_add = df_sub.loc[
-                ~df_sub.loc[:, "flag_final"], [reanal + "_wd", reanal + "_u_ms", reanal + "_v_ms"],
+                ~df_sub.loc[:, "flag_final"],
+                [reanal + "_wd", reanal + "_u_ms", reanal + "_v_ms"],
             ]
             valid_data = pd.concat([valid_data, valid_data_to_add], axis=1)