Reference suitability (#11)

* stop saving parquet files to output * add check_for_ops_curve_shift * add plot_ops_curve_timelines_one_wtg * enable ruff on tests * Update main_analysis.py fix bug * add new pp_analysis_plots like Cp * Update combine_results_plots.py use confidence interval for error bars * update smarteole_example.ipynb show latest plots * Update main_analysis.py add required cols check to check_for_ops_curve_shift * Update pp_analysis_plots.py handle column missing case in plot_pre_post_condition_histogram, relevant for ambient temperature when using non turbine reference
resgroup · Jun 26, 2024 · de1b1ab · de1b1ab
1 parent ec5b242
commit de1b1ab
Show file tree

Hide file tree

Showing 13 changed files with 530 additions and 107 deletions.
diff --git a/examples/smarteole_example.ipynb b/examples/smarteole_example.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -52,7 +52,6 @@ include = ["wind_up*"]
 line-length = 120
 target-version = "py310"
 show-fixes = true
-extend-exclude = ["tests"]
 
 [tool.ruff.lint]
 select = ["ALL"] # https://beta.ruff.rs/docs/rules/

diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -8,15 +8,18 @@
 SAMPLE_OTHER = pd.date_range(pd.Timestamp("2000-01-01T05:00+05:00"), periods=2)
 
 
-@pytest.mark.parametrize("t,exp", [
-    (pd.DatetimeIndex(SAMPLE_NAIVE), pd.DatetimeIndex(SAMPLE_UTC)),
-    (pd.DatetimeIndex(SAMPLE_UTC), pd.DatetimeIndex(SAMPLE_UTC)),
-    (pd.DatetimeIndex(SAMPLE_OTHER), pd.DatetimeIndex(SAMPLE_UTC)),
-    (pd.Timestamp("2000-01-01T00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
-    (pd.Timestamp("2000-01-01T00:00+00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
-    (pd.Timestamp("2000-01-01T05:00+05:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
-])
-def test_ensure_utc(t, exp):
+@pytest.mark.parametrize(
+    ("t", "exp"),
+    [
+        (pd.DatetimeIndex(SAMPLE_NAIVE), pd.DatetimeIndex(SAMPLE_UTC)),
+        (pd.DatetimeIndex(SAMPLE_UTC), pd.DatetimeIndex(SAMPLE_UTC)),
+        (pd.DatetimeIndex(SAMPLE_OTHER), pd.DatetimeIndex(SAMPLE_UTC)),
+        (pd.Timestamp("2000-01-01T00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
+        (pd.Timestamp("2000-01-01T00:00+00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
+        (pd.Timestamp("2000-01-01T05:00+05:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
+    ],
+)
+def test_ensure_utc(t: pd.Timestamp, exp: pd.Timestamp) -> None:
     actual = ensure_utc(t)
 
     if isinstance(t, pd.Timestamp):

diff --git a/tests/test_helpers.py b/tests/test_helpers.py
@@ -6,7 +6,7 @@
 
 
 class TestWithParquetCache:
-    def test_creates_and_fetches_cache(self, tmp_path: Path):
+    def test_creates_and_fetches_cache(self, tmp_path: Path) -> None:
         fp = tmp_path / "test.parquet"
         sample_df = pd.DataFrame({"a": [1, 2, 3]})
 
@@ -22,7 +22,7 @@ def myfunc() -> pd.DataFrame:
         df2 = myfunc()
         pd.testing.assert_frame_equal(df2, sample_df)
 
-    def test_doesnt_run_the_func_if_file_exists(self, tmp_path: Path):
+    def test_doesnt_run_the_func_if_file_exists(self, tmp_path: Path) -> None:
         fp = tmp_path / "test.parquet"
         sample_df = pd.DataFrame({"a": [1, 2, 3]})
         sample_df.to_parquet(fp)

diff --git a/tests/test_northing.py b/tests/test_northing.py
@@ -32,8 +32,12 @@ def test_apply_northing_corrections(test_lsa_t13_config: WindUpConfig) -> None:
     assert median_yaw_after_northing["LSA_T13"] == pytest.approx(235.22855377197266)
     assert median_yaw_after_northing["LSA_T14"] == pytest.approx(224.92881774902344)
 
-    abs_north_errs_before_northing = calc_max_abs_north_errs(wf_df, north_ref_wd_col=REANALYSIS_WD_COL,timebase_s=cfg.timebase_s)
-    abs_north_errs_after_northing = calc_max_abs_north_errs(wf_df_after_northing, north_ref_wd_col=REANALYSIS_WD_COL,timebase_s=cfg.timebase_s)
+    abs_north_errs_before_northing = calc_max_abs_north_errs(
+        wf_df, north_ref_wd_col=REANALYSIS_WD_COL, timebase_s=cfg.timebase_s
+    )
+    abs_north_errs_after_northing = calc_max_abs_north_errs(
+        wf_df_after_northing, north_ref_wd_col=REANALYSIS_WD_COL, timebase_s=cfg.timebase_s
+    )
     assert abs_north_errs_before_northing.min() == pytest.approx(7.88920288085938)
     assert abs_north_errs_before_northing.max() == pytest.approx(7.88920288085938)
     assert abs_north_errs_after_northing["LSA_T07"] == pytest.approx(18.400006103515604)

diff --git a/tests/test_pp_analysis.py b/tests/test_pp_analysis.py
@@ -11,7 +11,7 @@
 def test_pre_post_pp_analysis_with_reversal(test_lsa_t13_config: WindUpConfig) -> None:
     cfg = test_lsa_t13_config
 
-    test_wtg = next(x for x in cfg.asset.wtgs if x.name=="LSA_T13")
+    test_wtg = next(x for x in cfg.asset.wtgs if x.name == "LSA_T13")
     ref_name = "LSA_T12"
     detrend_ws_col = "ref_ws_detrended"
     test_pw_col = "test_pw_clipped"
@@ -39,9 +39,25 @@ def test_pre_post_pp_analysis_with_reversal(test_lsa_t13_config: WindUpConfig) -
     # minor changes to make actual_df compatible with old expected_df
     expected_df["hours_for_mwh_calc"] = expected_df["hours_per_year"]
     expected_df["hours_per_year"] = actual_df["hours_per_year"]
-    cols_with_new_calc = ["uplift_se", "uplift_p5_kw", "uplift_p95_kw"]
+    cols_with_new_calc = ["uplift_kw_se", "uplift_p5_kw", "uplift_p95_kw"]
     expected_df[cols_with_new_calc] = actual_df[cols_with_new_calc]
-    new_cols = ["pre_valid", "post_valid", "hours_pre_raw", "hours_post_raw", "is_invalid_bin",'pw_at_mid_expected', 'pw_sem_at_mid_expected']
+    new_cols = [
+        "pre_valid",
+        "post_valid",
+        "hours_pre_raw",
+        "hours_post_raw",
+        "is_invalid_bin",
+        "pw_at_mid_expected",
+        "pw_sem_at_mid_expected",
+        "relative_cp_baseline",
+        "relative_cp_post",
+        "relative_cp_sem_at_mid_expected",
+        "relative_cp_sem_at_mid_post",
+        "uplift_relative_cp",
+        "uplift_relative_cp_se",
+        "uplift_relative_cp_p5",
+        "uplift_relative_cp_p95",
+    ]
     expected_df[new_cols] = actual_df[new_cols]
     expected_df = expected_df[actual_df.columns]
 

diff --git a/tests/test_reanalysis_data.py b/tests/test_reanalysis_data.py
@@ -11,6 +11,7 @@
     get_dsid_and_dates_from_filename,
 )
 
+
 def test_get_dsid_and_dates_from_filename() -> None:
     assert get_dsid_and_dates_from_filename("ERA5T_47.50N_-3.25E_100m_1hr_19900101_20231031.parquet") == (
         "ERA5T_47.50N_-3.25E_100m_1hr",

diff --git a/tests/test_smart_data.py b/tests/test_smart_data.py
@@ -10,12 +10,13 @@
     add_smart_lat_long_to_cfg,
     calc_last_xmin_datetime_in_month,
     calc_month_list_and_time_info,
-    check_and_convert_scada_raw,
     load_smart_md_from_file,
     load_smart_scada_and_md_from_file,
 )
+
 TIMEBASE_PD_TIMEDELTA = pd.Timedelta("10min")
 
+
 def test_calc_last_xmin_datetime_in_month() -> None:
     inputs = [
         dt.datetime(2020, 1, 13),
@@ -34,6 +35,7 @@ def test_calc_last_xmin_datetime_in_month() -> None:
     for i, e in zip(inputs, expected, strict=True):
         assert calc_last_xmin_datetime_in_month(i, TIMEBASE_PD_TIMEDELTA) == pd.Timestamp(e)
 
+
 def test_add_smart_lat_long_to_cfg(test_marge_config: WindUpConfig) -> None:
     cfg = test_marge_config
     md_df = load_smart_md_from_file(asset_name="Marge Wind Farm", test_mode=True)
@@ -62,6 +64,7 @@ def test_calc_month_list_and_time_info() -> None:
     assert smart_tz == "UTC"
     assert smart_tf == "End"
 
+
 def test_load_smart_scada_and_md_from_file() -> None:
     test_data_dir = TEST_DATA_FLD / "smart_data" / "Marge Wind Farm"
     first_datetime_utc_start = pd.Timestamp("2020-02-26 23:50:00", tz="UTC")

diff --git a/wind_up/main_analysis.py b/wind_up/main_analysis.py
@@ -12,6 +12,7 @@
     REANALYSIS_WD_COL,
     REANALYSIS_WS_COL,
     WINDFARM_YAWDIR_COL,
+    DataColumns,
 )
 from wind_up.detrend import apply_wsratio_v_wd_scen, calc_wsratio_v_wd_scen, check_applied_detrend
 from wind_up.interface import AssessmentInputs, add_toggle_signals
@@ -362,6 +363,59 @@ def yaw_offset_results(
     return results
 
 
+def check_for_ops_curve_shift(
+    pre_df: pd.DataFrame,
+    post_df: pd.DataFrame,
+    *,
+    wtg_name: str,
+    ws_col: str,
+    pw_col: str,
+    rpm_col: str,
+    pt_col: str,
+) -> dict[str, float]:
+    results_dict = {
+        "powercurve_shift": np.nan,
+        "rpm_shift": np.nan,
+        "pitch_shift": np.nan,
+    }
+    # check if all required columns are present
+    required_cols = [ws_col, pw_col, pt_col, rpm_col]
+    for req_col in required_cols:
+        if req_col not in pre_df.columns:
+            msg = f"check_for_ops_curve_shift {wtg_name} pre_df missing required column {req_col}"
+            result_manager.warning(msg)
+            return results_dict
+        if req_col not in post_df.columns:
+            msg = f"check_for_ops_curve_shift {wtg_name} post_df missing required column {req_col}"
+            result_manager.warning(msg)
+            return results_dict
+    pre_dropna_df = pre_df.dropna(subset=[ws_col, pw_col, pt_col, rpm_col]).copy()
+    post_dropna_df = post_df.dropna(subset=[ws_col, pw_col, pt_col, rpm_col]).copy()
+
+    for descr, x_var, y_var, x_bin_width, warn_thresh in [
+        ("powercurve_shift", ws_col, pw_col, 1, 0.01),
+        ("rpm_shift", pw_col, rpm_col, 0, 0.005),
+        ("pitch_shift", ws_col, pt_col, 1, 0.1),
+    ]:
+        bins = np.arange(0, pre_dropna_df[x_var].max() + x_bin_width, x_bin_width) if x_bin_width > 0 else 10
+        mean_curve = pre_dropna_df.groupby(pd.cut(pre_dropna_df[x_var], bins=bins, retbins=False), observed=True).agg(
+            x_mean=pd.NamedAgg(column=x_var, aggfunc="mean"),
+            y_mean=pd.NamedAgg(column=y_var, aggfunc="mean"),
+        )
+        post_dropna_df["expected_y"] = np.interp(post_dropna_df[x_var], mean_curve["x_mean"], mean_curve["y_mean"])
+        mean_df = post_dropna_df.mean()
+        if y_var == pt_col:
+            results_dict[descr] = mean_df[y_var] - mean_df["expected_y"]
+        else:
+            results_dict[descr] = (mean_df[y_var] / mean_df["expected_y"] - 1).clip(-1, 1)
+        if abs(results_dict[descr]) > warn_thresh:
+            result_manager.warning(
+                f"{wtg_name} check_for_ops_curve_shift abs {descr} > {warn_thresh}: {results_dict[descr]}"
+            )
+
+    return results_dict
+
+
 def calc_test_ref_results(
     *,
     test_df: pd.DataFrame,
@@ -505,6 +559,16 @@ def calc_test_ref_results(
     pre_df = pre_df.merge(ref_df, how="left", left_index=True, right_index=True)
     post_df = post_df.merge(ref_df, how="left", left_index=True, right_index=True)
 
+    ref_ops_curve_shift_dict = check_for_ops_curve_shift(
+        pre_df,
+        post_df,
+        wtg_name=ref_name,
+        ws_col=ref_ws_col,
+        pw_col=ref_pw_col,
+        rpm_col=f"ref_{DataColumns.gen_rpm_mean}",
+        pt_col=f"ref_{DataColumns.pitch_angle_mean}",
+    )
+
     pre_df = add_waking_scen(
         test_ref_df=pre_df,
         test_name=test_name,
@@ -608,9 +672,6 @@ def calc_test_ref_results(
             toggle_name=cfg.toggle.name if cfg.toggle else None,
         )
 
-    pre_df.to_parquet(cfg.out_dir / f"{test_wtg.name}_{ref_name}_pre_df.parquet")
-    post_df.to_parquet(cfg.out_dir / f"{test_wtg.name}_{ref_name}_post_df.parquet")
-
     pp_results, pp_df = pre_post_pp_analysis_with_reversal_and_bootstrapping(
         cfg=cfg,
         test_wtg=test_wtg,
@@ -634,6 +695,9 @@ def calc_test_ref_results(
         "ref_max_northing_error_v_wf": ref_max_northing_error_v_wf,
         "ref_max_ws_drift": ref_max_ws_drift,
         "ref_max_ws_drift_pp_period": ref_max_ws_drift_pp_period,
+        "ref_powercurve_shift": ref_ops_curve_shift_dict["powercurve_shift"],
+        "ref_rpm_shift": ref_ops_curve_shift_dict["rpm_shift"],
+        "ref_pitch_shift": ref_ops_curve_shift_dict["pitch_shift"],
         "detrend_pre_r2_improvement": detrend_pre_r2_improvement,
         "detrend_post_r2_improvement": detrend_post_r2_improvement,
         "mean_power_pre": pre_df.dropna(subset=[detrend_ws_col, test_pw_col, ref_wd_col])[test_pw_col].mean(),
@@ -730,6 +794,16 @@ def run_wind_up_analysis(
 
         test_df, pre_df, post_df = pre_post_splitter.split(test_df, test_wtg_name=test_name)
 
+        test_ops_curve_shift_dict = check_for_ops_curve_shift(
+            pre_df,
+            post_df,
+            wtg_name=test_name,
+            ws_col=test_ws_col,
+            pw_col=test_pw_col,
+            rpm_col=f"test_{DataColumns.gen_rpm_mean}",
+            pt_col=f"test_{DataColumns.pitch_angle_mean}",
+        )
+
         # compare ops curves of pre_df and post_df
         compare_ops_curves_pre_post(
             pre_df=pre_df,
@@ -747,6 +821,9 @@ def run_wind_up_analysis(
             "lt_wtg_hours_filt": lt_wtg_df_filt["observed_hours"].sum() if lt_wtg_df_filt is not None else 0,
             "test_max_ws_drift": test_max_ws_drift,
             "test_max_ws_drift_pp_period": test_max_ws_drift_pp_period,
+            "test_powercurve_shift": test_ops_curve_shift_dict["powercurve_shift"],
+            "test_rpm_shift": test_ops_curve_shift_dict["rpm_shift"],
+            "test_pitch_shift": test_ops_curve_shift_dict["pitch_shift"],
             "preprocess_warning_counts": preprocess_warning_counts,
             "test_warning_counts": len(result_manager.stored_warnings),
         }

diff --git a/wind_up/plots/combine_results_plots.py b/wind_up/plots/combine_results_plots.py
@@ -1,12 +1,14 @@
 import pandas as pd
 from matplotlib import pyplot as plt
+from scipy.stats import norm
 
 from wind_up.models import PlotConfig
 
 
-def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotConfig) -> None:
+def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotConfig, confidence: float = 0.9) -> None:
     show_plots = plot_cfg.show_plots
     save_plots = plot_cfg.save_plots
+    z_score = norm.ppf((1 + confidence) / 2)
 
     plt.figure(figsize=(10, 6))
     labels = [
@@ -18,14 +20,14 @@ def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotCo
         )
     ]
     values = trdf["uplift_frc"] * 100
-    yerrs = trdf["unc_one_sigma_frc"] * 100
+    yerrs = trdf["unc_one_sigma_frc"] * 100 * z_score
     plt.bar(labels, values, yerr=yerrs, capsize=3)
     plt.xlabel("turbine")
-    plt.ylabel("uplift [%] and 1-sigma uncertainty")
-    plot_title = "uplift by test-ref pair"
+    plt.ylabel("uplift [%]")
+    plot_title = f"uplift and {confidence*100:.0f}% CI by test-ref pair"
     plt.title("uplift by test-ref pair")
     plt.grid(axis="y")
-    plt.xticks(rotation=45, ha="right")
+    plt.xticks(rotation=90, ha="right")
     plt.tight_layout()
     if show_plots:
         plt.show()
@@ -35,14 +37,14 @@ def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotCo
     plt.figure()
     labels = tdf["test_wtg"]
     values = tdf["p50_uplift"] * 100
-    yerrs = tdf["sigma"] * 100
+    yerrs = tdf["sigma"] * 100 * z_score
     plt.bar(labels, values, yerr=yerrs, capsize=3)
     plt.xlabel("turbine")
-    plt.ylabel("uplift [%] and 1-sigma uncertainty")
-    plot_title = "combined uplift"
+    plt.ylabel("uplift [%]")
+    plot_title = f"combined uplift and {confidence*100:.0f}% CI"
     plt.title(plot_title)
     plt.grid(axis="y")
-    plt.xticks(rotation=45, ha="right")
+    plt.xticks(rotation=90, ha="right")
     plt.tight_layout()
     if show_plots:
         plt.show()