Skip to content

Commit

Permalink
add wind farm LT distribution option (#14)
Browse files Browse the repository at this point in the history
* make ref subfolder

* order columns in interim results

* increase hrs_per_mps

* bump version

* more cols in combine_results

* Update pre_post_pp_analysis_expected_df.parquet

* increase ws_diff_ul

* add use_test_wtg_lt_distribution setting

* Update caching.py

only warn if cache is not fresh

* combine check_for_ops_curve_shift warnings

* add plot_pre_post_uplift_pct

* fix check_for_ops_curve_shift warning

* rename smart_missing_data_fields

* bug fix

fix case where no northing corrections are defined

* remove uplift_relative_cp

* Update smarteole_example.ipynb

* add test_calc_windfarm_lt_dfs_raw_filt

* Update northing.py

ensure wf_df is always set the same way even if a turbine has no northing corrections

* Update northing.py

fix format

* Update smarteole_example.ipynb
  • Loading branch information
aclerc authored Jul 29, 2024
1 parent ca6f29d commit d05d2de
Show file tree
Hide file tree
Showing 20 changed files with 557 additions and 436 deletions.
380 changes: 220 additions & 160 deletions examples/smarteole_example.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "res-wind-up"
version = "0.1.7"
version = "0.1.8"
authors = [
{ name = "Alex Clerc", email = "alex.clerc@res-group.com" }
]
Expand Down
3 changes: 3 additions & 0 deletions tests/test_combine_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,15 @@ def test_brt_t16_pitch() -> None:
trdf = pd.read_csv(Path(__file__).parents[0] / "test_data/trdf_BRT_T16_pitch_Sep23.csv", index_col=0)
edf = pd.read_csv(Path(__file__).parents[0] / "test_data/tdf_BRT_T16_pitch.csv", index_col=0)
tdf = combine_results(trdf=trdf)
tdf = tdf[edf.columns.tolist()]
assert_frame_equal(edf, tdf)


def test_brt_t16_pitch_no_auto_choose() -> None:
    """Check combine_results output against the stored expectation when ref auto-choice is off."""
    test_data_dir = Path(__file__).parents[0] / "test_data"
    trdf = pd.read_csv(test_data_dir / "trdf_BRT_T16_pitch_Sep23.csv", index_col=0)
    edf = pd.read_csv(test_data_dir / "tdf_BRT_T16_pitch_no_auto_choose.csv", index_col=0)
    # align column order with the expected frame before comparing
    actual_tdf = combine_results(trdf=trdf, auto_choose_refs=False)[edf.columns.tolist()]
    assert_frame_equal(edf, actual_tdf)


Expand All @@ -86,4 +88,5 @@ def test_brt_t16_pitch_exclude_refs() -> None:
edf = pd.read_csv(Path(__file__).parents[0] / "test_data/tdf_BRT_T16_pitch_exclude_refs.csv", index_col=0)
# all but one ref excluded
tdf = combine_results(trdf=trdf, auto_choose_refs=False, exclude_refs=["BRT_T02", "BRT_T03", "BRT_T04", "BRT_T14"])
tdf = tdf[edf.columns.tolist()]
assert_frame_equal(edf, tdf)
Binary file modified tests/test_data/pre_post_pp_analysis_expected_df.parquet
Binary file not shown.
44 changes: 40 additions & 4 deletions tests/test_long_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from wind_up.long_term import calc_turbine_lt_dfs_raw_filt
from wind_up.long_term import calc_lt_dfs_raw_filt
from wind_up.models import WindUpConfig


Expand All @@ -12,16 +12,52 @@ def test_calc_turbine_lt_dfs_raw_filt(test_lsa_t13_config: WindUpConfig) -> None
test_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_test_df.parquet")
test_df.columns = test_df.columns.str.replace("test_", "")
test_name = "LSA_T13"
lt_wtg_df_raw, lt_wtg_df_filt = calc_turbine_lt_dfs_raw_filt(
wtg_name=test_name,
lt_wtg_df_raw, lt_wtg_df_filt = calc_lt_dfs_raw_filt(
wtg_or_wf_name=test_name,
cfg=cfg,
wtg_df=test_df,
wtg_or_wf_df=test_df,
ws_col="WindSpeedMean",
pw_col="ActivePowerMean",
one_turbine=True,
plot_cfg=None,
)

expected_raw_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_lt_wtg_df_raw.parquet")
expected_filt_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_lt_wtg_df_filt.parquet")
assert_frame_equal(lt_wtg_df_raw, expected_raw_df)
assert_frame_equal(lt_wtg_df_filt, expected_filt_df)


def test_calc_windfarm_lt_dfs_raw_filt(test_lsa_t13_config: WindUpConfig) -> None:
    """Run calc_lt_dfs_raw_filt on a synthetic 3-turbine wind farm built from LSA_T13 data.

    The farm is three identical copies of the single-turbine data, so the expected
    wind-farm result is the single-turbine expectation with hours and energy tripled.
    """
    cfg = test_lsa_t13_config
    test_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_test_df.parquet")
    test_df.columns = test_df.columns.str.replace("test_", "")

    # build a fake wf_df: the same timeseries replicated under three turbine names,
    # indexed by (TurbineName, timestamp)
    fake_wtg_names = ["LSA_T13", "LSA_T14", "LSA_T15"]
    per_wtg_dfs = []
    for wtg_name in fake_wtg_names:
        wtg_df = test_df.copy()
        wtg_df["TurbineName"] = wtg_name
        per_wtg_dfs.append(wtg_df.set_index(["TurbineName"], append=True).swaplevel())
    wf_df = pd.concat(per_wtg_dfs)

    # restrict the config to the three turbines present in the fake farm
    cfg.asset.wtgs = [x for x in cfg.asset.wtgs if x.name in set(fake_wtg_names)]

    lt_wtg_df_raw, lt_wtg_df_filt = calc_lt_dfs_raw_filt(
        wtg_or_wf_name=cfg.asset.name,
        cfg=cfg,
        wtg_or_wf_df=wf_df,
        ws_col="WindSpeedMean",
        pw_col="ActivePowerMean",
        one_turbine=False,
        plot_cfg=None,
    )

    expected_raw_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_lt_wtg_df_raw.parquet")
    expected_filt_df = pd.read_parquet(Path(__file__).parents[0] / "test_data/LSA_T13_lt_wtg_df_filt.parquet")
    # three identical turbines => observed hours and energy scale by 3
    for expected_df in (expected_raw_df, expected_filt_df):
        expected_df["observed_hours"] *= 3
        expected_df["observed_mwh"] *= 3

    assert_frame_equal(lt_wtg_df_raw, expected_raw_df)
    assert_frame_equal(lt_wtg_df_filt, expected_filt_df)
51 changes: 13 additions & 38 deletions tests/test_pp_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,43 +36,18 @@ def test_pre_post_pp_analysis_with_reversal(test_lsa_t13_config: WindUpConfig) -
test_df=test_df,
)

# minor changes to make actual_df compatible with old expected_df
expected_df["hours_for_mwh_calc"] = expected_df["hours_per_year"]
expected_df["hours_per_year"] = actual_df["hours_per_year"]
cols_with_new_calc = ["uplift_kw_se", "uplift_p5_kw", "uplift_p95_kw"]
expected_df[cols_with_new_calc] = actual_df[cols_with_new_calc]
new_cols = [
"pre_valid",
"post_valid",
"hours_pre_raw",
"hours_post_raw",
"is_invalid_bin",
"pw_at_mid_expected",
"pw_sem_at_mid_expected",
"relative_cp_baseline",
"relative_cp_post",
"relative_cp_sem_at_mid_expected",
"relative_cp_sem_at_mid_post",
"uplift_relative_cp",
"uplift_relative_cp_se",
"uplift_relative_cp_p5",
"uplift_relative_cp_p95",
]
expected_df[new_cols] = actual_df[new_cols]
expected_df = expected_df[actual_df.columns]

assert_frame_equal(actual_df, expected_df)
assert pp_results["pp_valid_hours"] == pytest.approx(10748.5)
assert pp_results["pp_valid_hours_pre"] == pytest.approx(5807.333333333333)
assert pp_results["pp_valid_hours"] == pytest.approx(10745.83333333333)
assert pp_results["pp_valid_hours_pre"] == pytest.approx(5804.666666666666)
assert pp_results["pp_valid_hours_post"] == pytest.approx(4941.166666666667)
assert pp_results["pp_invalid_bin_count"] == 3
assert pp_results["pp_data_coverage"] == pytest.approx(0.6793388952092024)
assert pp_results["reversal_error"] == pytest.approx(-0.008786551768533796)
assert pp_results["uplift_noadj_frc"] == pytest.approx(0.04523448345231426)
assert pp_results["poweronly_uplift_frc"] == pytest.approx(0.04560411838169785)
assert pp_results["reversed_uplift_frc"] == pytest.approx(0.03681756661316406)
assert pp_results["uplift_frc"] == pytest.approx(0.040841207568047364)
assert pp_results["missing_bins_unc_scale_factor"] == pytest.approx(1.0000000006930523)
assert pp_results["t_value_one_sigma"] == pytest.approx(1.0000168636907854)
assert pp_results["unc_one_sigma_lowerbound_frc"] == pytest.approx(0.004393275884266898)
assert pp_results["unc_one_sigma_frc"] == pytest.approx(0.004393275884266898)
assert pp_results["pp_invalid_bin_count"] == 4
assert pp_results["pp_data_coverage"] == pytest.approx(0.679170353516201)
assert pp_results["reversal_error"] == pytest.approx(-0.008804504211352544)
assert pp_results["uplift_noadj_frc"] == pytest.approx(0.04523639558619659)
assert pp_results["poweronly_uplift_frc"] == pytest.approx(0.045608171004022265)
assert pp_results["reversed_uplift_frc"] == pytest.approx(0.03680366679266972)
assert pp_results["uplift_frc"] == pytest.approx(0.04083414348052032)
assert pp_results["missing_bins_unc_scale_factor"] == pytest.approx(1.0000004657300203)
assert pp_results["t_value_one_sigma"] == pytest.approx(1.0000168659661646)
assert pp_results["unc_one_sigma_lowerbound_frc"] == pytest.approx(0.004402252105676272)
assert pp_results["unc_one_sigma_frc"] == pytest.approx(0.004402252105676272)
5 changes: 4 additions & 1 deletion wind_up/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@ def wrapped_f(*a: Any, **kw: Any) -> pd.DataFrame: # noqa
def with_pickle_cache(fp: Path, *, use_cache: bool = True) -> Callable:
def wrap(func: Callable[..., Any]) -> Callable[..., Any]:
def wrapped_f(*a: Any, **kw: Any) -> Any: # noqa
fresh_cache = False
if not Path(fp).is_file() or not use_cache or Path(fp).stat().st_size == 0:
with Path.open(fp, "wb") as f:
pickle.dump(func(*a, **kw), f)
with Path.open(fp, "rb") as f:
fresh_cache = True
if not fresh_cache:
result_manager.warning(f"loading cached pickle {fp}")
with Path.open(fp, "rb") as f:
return pickle.load(f)

return wrapped_f
Expand Down
2 changes: 2 additions & 0 deletions wind_up/combine_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ def calc_tdf(trdf: pd.DataFrame, ref_list: list[str], weight_col: str = "unc_wei
aggfunc=lambda x: (x * trdf.loc[x.index, weight_col]).sum() / trdf.loc[x.index, weight_col].sum(),
),
ref_count=pd.NamedAgg(column="uplift_frc", aggfunc=len),
ref_list=pd.NamedAgg(column="ref", aggfunc=lambda x: ", ".join(sorted(x))),
is_ref=pd.NamedAgg(column="test_wtg", aggfunc=lambda x: x.isin(ref_list).any()),
)
tdf["sigma_test"] = (tdf["sigma_uncorr"] + tdf["sigma_corr"]) / 2
tdf = tdf.sort_values(by=["ref_count", "test_wtg"], ascending=[False, True])
tdf = tdf.reset_index()
sigma_ref = calc_sigma_ref(tdf, ref_list)
tdf["sigma_ref"] = sigma_ref
tdf["sigma"] = tdf["sigma_test"].clip(lower=sigma_ref)
tdf["p95_uplift"] = tdf["p50_uplift"] + norm.ppf(0.05) * tdf["sigma"]
tdf["p5_uplift"] = tdf["p50_uplift"] + norm.ppf(0.95) * tdf["sigma"]
Expand Down
132 changes: 41 additions & 91 deletions wind_up/long_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,10 @@ def calc_lt_df(
ws_bin_edges = np.arange(0, df_for_lt[ws_col].max() + ws_bin_width, ws_bin_width)

rows_per_hour = 3600 / timebase_s
df_for_groupby = df_for_lt.reset_index()
lt_df = (
df_for_lt.dropna(subset=[ws_col, pw_col])
.groupby(by=pd.cut(df_for_lt[ws_col], bins=ws_bin_edges, retbins=False), observed=False)
df_for_groupby.dropna(subset=[ws_col, pw_col])
.groupby(by=pd.cut(df_for_groupby[ws_col], bins=ws_bin_edges, retbins=False), observed=False)
.agg(
ws_mean=pd.NamedAgg(column=ws_col, aggfunc=lambda x: x.mean()),
observed_hours=pd.NamedAgg(column=ws_col, aggfunc=lambda x: len(x) / rows_per_hour),
Expand Down Expand Up @@ -88,139 +89,88 @@ def calc_lt_df(
return lt_df


def calc_turbine_lt_df(
wtg_name: str,
def filter_and_calc_lt_df(
wtg_or_wf_name: str,
cfg: WindUpConfig,
wtg_df: pd.DataFrame,
wtg_or_wf_df: pd.DataFrame,
*,
ws_col: str,
pw_col: str,
title_end: str = "",
one_turbine: bool,
plot_cfg: PlotConfig | None = None,
) -> pd.DataFrame:
workings_df = wtg_df.copy()
workings_df = wtg_or_wf_df.copy()
if not isinstance(workings_df.index, pd.DatetimeIndex):
msg = f"wtg_df must have a DatetimeIndex, got {type(workings_df.index)}"
raise TypeError(msg)
if "TimeStamp_StartFormat" in workings_df.index.names:
workings_df = workings_df.reset_index().set_index("TimeStamp_StartFormat", drop=True)
else:
msg = (
f"workings_df must have a DatetimeIndex or index level called TimeStamp_StartFormat. "
f"{workings_df.index.names=}"
)
raise ValueError(msg)

ok_for_lt = (workings_df.index >= cfg.lt_first_dt_utc_start) & (workings_df.index <= cfg.lt_last_dt_utc_start)

lt_df = calc_lt_df(
df_for_lt=workings_df[ok_for_lt],
num_turbines=1,
num_turbines=1 if one_turbine else len(cfg.asset.wtgs),
years_for_lt_distribution=cfg.years_for_lt_distribution,
ws_col=ws_col,
ws_bin_width=cfg.ws_bin_width,
pw_col=pw_col,
timebase_s=cfg.timebase_s,
)
if plot_cfg is not None:
plot_lt_ws(lt_df=lt_df, turbine_or_wf_name=wtg_name, title_end=title_end, plot_cfg=plot_cfg, one_turbine=True)

return lt_df


def calc_turbine_lt_dfs_raw_filt(
wtg_name: str,
cfg: WindUpConfig,
wtg_df: pd.DataFrame,
*,
ws_col: str,
pw_col: str,
plot_cfg: PlotConfig | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
lt_wf_df_raw = calc_turbine_lt_df(
wtg_name=wtg_name,
cfg=cfg,
wtg_df=wtg_df,
ws_col=RAW_WINDSPEED_COL,
pw_col=RAW_POWER_COL,
title_end="before filter",
plot_cfg=plot_cfg,
)
lt_wf_df_filt = calc_turbine_lt_df(
wtg_name=wtg_name,
cfg=cfg,
wtg_df=wtg_df,
ws_col=ws_col,
pw_col=pw_col,
title_end="after filter",
plot_cfg=plot_cfg,
)
if plot_cfg is not None:
plot_lt_ws_raw_filt(
lt_df_raw=lt_wf_df_raw,
lt_df_filt=lt_wf_df_filt,
turbine_or_wf_name=wtg_name,
plot_lt_ws(
lt_df=lt_df,
turbine_or_wf_name=wtg_or_wf_name,
title_end=title_end,
plot_cfg=plot_cfg,
one_turbine=True,
one_turbine=one_turbine,
)

return lt_wf_df_raw, lt_wf_df_filt


def calc_windfarm_lt_df(
cfg: WindUpConfig,
wf_df: pd.DataFrame,
*,
ws_col: str,
pw_col: str,
title_end: str = "",
plot_cfg: PlotConfig | None = None,
) -> pd.DataFrame:
workings_df = wf_df.copy()
if len(workings_df.index.levels) == 2: # noqa PLR2004
workings_df.index = workings_df.index.droplevel("TurbineName")

ok_for_lt = (workings_df.index >= cfg.lt_first_dt_utc_start) & (workings_df.index <= cfg.lt_last_dt_utc_start)

lt_df = calc_lt_df(
df_for_lt=workings_df[ok_for_lt],
num_turbines=len(cfg.asset.wtgs),
years_for_lt_distribution=cfg.years_for_lt_distribution,
ws_col=ws_col,
ws_bin_width=cfg.ws_bin_width,
pw_col=pw_col,
timebase_s=cfg.timebase_s,
)

if plot_cfg is not None:
plot_lt_ws(lt_df=lt_df, turbine_or_wf_name=cfg.asset.name, title_end=title_end, plot_cfg=plot_cfg)

return lt_df


def calc_windfarm_lt_dfs_raw_filt(
def calc_lt_dfs_raw_filt(
wtg_or_wf_name: str,
cfg: WindUpConfig,
wf_df_raw: pd.DataFrame,
wf_df_filt: pd.DataFrame,
wtg_or_wf_df: pd.DataFrame,
*,
ws_col: str,
pw_col: str,
one_turbine: bool,
plot_cfg: PlotConfig | None = None,
) -> tuple[pd.DataFrame, pd.DataFrame]:
lt_wf_df_raw = calc_windfarm_lt_df(
lt_df_raw = filter_and_calc_lt_df(
wtg_or_wf_name=wtg_or_wf_name,
cfg=cfg,
wf_df=wf_df_raw,
ws_col=ws_col,
pw_col=pw_col,
wtg_or_wf_df=wtg_or_wf_df,
ws_col=RAW_WINDSPEED_COL,
pw_col=RAW_POWER_COL,
title_end="before filter",
one_turbine=one_turbine,
plot_cfg=plot_cfg,
)
lt_wf_df_filt = calc_windfarm_lt_df(
lt_df_filt = filter_and_calc_lt_df(
wtg_or_wf_name=wtg_or_wf_name,
cfg=cfg,
wf_df=wf_df_filt,
wtg_or_wf_df=wtg_or_wf_df,
ws_col=ws_col,
pw_col=pw_col,
title_end="after filter",
one_turbine=one_turbine,
plot_cfg=plot_cfg,
)
if plot_cfg is not None:
plot_lt_ws_raw_filt(
lt_df_raw=lt_wf_df_raw,
lt_df_filt=lt_wf_df_filt,
turbine_or_wf_name=cfg.asset.name,
lt_df_raw=lt_df_raw,
lt_df_filt=lt_df_filt,
wtg_or_wf_name=wtg_or_wf_name,
plot_cfg=plot_cfg,
one_turbine=one_turbine,
)

return lt_wf_df_raw, lt_wf_df_filt
return lt_df_raw, lt_df_filt
Loading

0 comments on commit d05d2de

Please sign in to comment.