Skip to content

Commit

Permalink
Reference suitability (#11)
Browse files Browse the repository at this point in the history
* stop saving parquet files to output

* add check_for_ops_curve_shift

* add plot_ops_curve_timelines_one_wtg

* enable ruff on tests

* Update main_analysis.py

fix bug

* add new pp_analysis_plots like Cp

* Update combine_results_plots.py

use confidence interval for error bars

* update smarteole_example.ipynb

show latest plots

* Update main_analysis.py

add required cols check to check_for_ops_curve_shift

* Update pp_analysis_plots.py

handle column missing case in plot_pre_post_condition_histogram, relevant for ambient temperature when using non turbine reference
  • Loading branch information
aclerc authored Jun 26, 2024
1 parent ec5b242 commit de1b1ab
Show file tree
Hide file tree
Showing 13 changed files with 530 additions and 107 deletions.
194 changes: 139 additions & 55 deletions examples/smarteole_example.ipynb

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ include = ["wind_up*"]
line-length = 120
target-version = "py310"
show-fixes = true
extend-exclude = ["tests"]

[tool.ruff.lint]
select = ["ALL"] # https://beta.ruff.rs/docs/rules/
Expand Down
21 changes: 12 additions & 9 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,18 @@
SAMPLE_OTHER = pd.date_range(pd.Timestamp("2000-01-01T05:00+05:00"), periods=2)


@pytest.mark.parametrize("t,exp", [
(pd.DatetimeIndex(SAMPLE_NAIVE), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.DatetimeIndex(SAMPLE_UTC), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.DatetimeIndex(SAMPLE_OTHER), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.Timestamp("2000-01-01T00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
(pd.Timestamp("2000-01-01T00:00+00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
(pd.Timestamp("2000-01-01T05:00+05:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
])
def test_ensure_utc(t, exp):
@pytest.mark.parametrize(
("t", "exp"),
[
(pd.DatetimeIndex(SAMPLE_NAIVE), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.DatetimeIndex(SAMPLE_UTC), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.DatetimeIndex(SAMPLE_OTHER), pd.DatetimeIndex(SAMPLE_UTC)),
(pd.Timestamp("2000-01-01T00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
(pd.Timestamp("2000-01-01T00:00+00:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
(pd.Timestamp("2000-01-01T05:00+05:00"), pd.Timestamp("2000-01-01T00:00+00:00")),
],
)
def test_ensure_utc(t: pd.Timestamp, exp: pd.Timestamp) -> None:
actual = ensure_utc(t)

if isinstance(t, pd.Timestamp):
Expand Down
4 changes: 2 additions & 2 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


class TestWithParquetCache:
def test_creates_and_fetches_cache(self, tmp_path: Path):
def test_creates_and_fetches_cache(self, tmp_path: Path) -> None:
fp = tmp_path / "test.parquet"
sample_df = pd.DataFrame({"a": [1, 2, 3]})

Expand All @@ -22,7 +22,7 @@ def myfunc() -> pd.DataFrame:
df2 = myfunc()
pd.testing.assert_frame_equal(df2, sample_df)

def test_doesnt_run_the_func_if_file_exists(self, tmp_path: Path):
def test_doesnt_run_the_func_if_file_exists(self, tmp_path: Path) -> None:
fp = tmp_path / "test.parquet"
sample_df = pd.DataFrame({"a": [1, 2, 3]})
sample_df.to_parquet(fp)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_northing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,12 @@ def test_apply_northing_corrections(test_lsa_t13_config: WindUpConfig) -> None:
assert median_yaw_after_northing["LSA_T13"] == pytest.approx(235.22855377197266)
assert median_yaw_after_northing["LSA_T14"] == pytest.approx(224.92881774902344)

abs_north_errs_before_northing = calc_max_abs_north_errs(wf_df, north_ref_wd_col=REANALYSIS_WD_COL,timebase_s=cfg.timebase_s)
abs_north_errs_after_northing = calc_max_abs_north_errs(wf_df_after_northing, north_ref_wd_col=REANALYSIS_WD_COL,timebase_s=cfg.timebase_s)
abs_north_errs_before_northing = calc_max_abs_north_errs(
wf_df, north_ref_wd_col=REANALYSIS_WD_COL, timebase_s=cfg.timebase_s
)
abs_north_errs_after_northing = calc_max_abs_north_errs(
wf_df_after_northing, north_ref_wd_col=REANALYSIS_WD_COL, timebase_s=cfg.timebase_s
)
assert abs_north_errs_before_northing.min() == pytest.approx(7.88920288085938)
assert abs_north_errs_before_northing.max() == pytest.approx(7.88920288085938)
assert abs_north_errs_after_northing["LSA_T07"] == pytest.approx(18.400006103515604)
Expand Down
22 changes: 19 additions & 3 deletions tests/test_pp_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def test_pre_post_pp_analysis_with_reversal(test_lsa_t13_config: WindUpConfig) -> None:
cfg = test_lsa_t13_config

test_wtg = next(x for x in cfg.asset.wtgs if x.name=="LSA_T13")
test_wtg = next(x for x in cfg.asset.wtgs if x.name == "LSA_T13")
ref_name = "LSA_T12"
detrend_ws_col = "ref_ws_detrended"
test_pw_col = "test_pw_clipped"
Expand Down Expand Up @@ -39,9 +39,25 @@ def test_pre_post_pp_analysis_with_reversal(test_lsa_t13_config: WindUpConfig) -
# minor changes to make actual_df compatible with old expected_df
expected_df["hours_for_mwh_calc"] = expected_df["hours_per_year"]
expected_df["hours_per_year"] = actual_df["hours_per_year"]
cols_with_new_calc = ["uplift_se", "uplift_p5_kw", "uplift_p95_kw"]
cols_with_new_calc = ["uplift_kw_se", "uplift_p5_kw", "uplift_p95_kw"]
expected_df[cols_with_new_calc] = actual_df[cols_with_new_calc]
new_cols = ["pre_valid", "post_valid", "hours_pre_raw", "hours_post_raw", "is_invalid_bin",'pw_at_mid_expected', 'pw_sem_at_mid_expected']
new_cols = [
"pre_valid",
"post_valid",
"hours_pre_raw",
"hours_post_raw",
"is_invalid_bin",
"pw_at_mid_expected",
"pw_sem_at_mid_expected",
"relative_cp_baseline",
"relative_cp_post",
"relative_cp_sem_at_mid_expected",
"relative_cp_sem_at_mid_post",
"uplift_relative_cp",
"uplift_relative_cp_se",
"uplift_relative_cp_p5",
"uplift_relative_cp_p95",
]
expected_df[new_cols] = actual_df[new_cols]
expected_df = expected_df[actual_df.columns]

Expand Down
1 change: 1 addition & 0 deletions tests/test_reanalysis_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
get_dsid_and_dates_from_filename,
)


def test_get_dsid_and_dates_from_filename() -> None:
assert get_dsid_and_dates_from_filename("ERA5T_47.50N_-3.25E_100m_1hr_19900101_20231031.parquet") == (
"ERA5T_47.50N_-3.25E_100m_1hr",
Expand Down
5 changes: 4 additions & 1 deletion tests/test_smart_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,13 @@
add_smart_lat_long_to_cfg,
calc_last_xmin_datetime_in_month,
calc_month_list_and_time_info,
check_and_convert_scada_raw,
load_smart_md_from_file,
load_smart_scada_and_md_from_file,
)

TIMEBASE_PD_TIMEDELTA = pd.Timedelta("10min")


def test_calc_last_xmin_datetime_in_month() -> None:
inputs = [
dt.datetime(2020, 1, 13),
Expand All @@ -34,6 +35,7 @@ def test_calc_last_xmin_datetime_in_month() -> None:
for i, e in zip(inputs, expected, strict=True):
assert calc_last_xmin_datetime_in_month(i, TIMEBASE_PD_TIMEDELTA) == pd.Timestamp(e)


def test_add_smart_lat_long_to_cfg(test_marge_config: WindUpConfig) -> None:
cfg = test_marge_config
md_df = load_smart_md_from_file(asset_name="Marge Wind Farm", test_mode=True)
Expand Down Expand Up @@ -62,6 +64,7 @@ def test_calc_month_list_and_time_info() -> None:
assert smart_tz == "UTC"
assert smart_tf == "End"


def test_load_smart_scada_and_md_from_file() -> None:
test_data_dir = TEST_DATA_FLD / "smart_data" / "Marge Wind Farm"
first_datetime_utc_start = pd.Timestamp("2020-02-26 23:50:00", tz="UTC")
Expand Down
83 changes: 80 additions & 3 deletions wind_up/main_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
REANALYSIS_WD_COL,
REANALYSIS_WS_COL,
WINDFARM_YAWDIR_COL,
DataColumns,
)
from wind_up.detrend import apply_wsratio_v_wd_scen, calc_wsratio_v_wd_scen, check_applied_detrend
from wind_up.interface import AssessmentInputs, add_toggle_signals
Expand Down Expand Up @@ -362,6 +363,59 @@ def yaw_offset_results(
return results


def check_for_ops_curve_shift(
pre_df: pd.DataFrame,
post_df: pd.DataFrame,
*,
wtg_name: str,
ws_col: str,
pw_col: str,
rpm_col: str,
pt_col: str,
) -> dict[str, float]:
results_dict = {
"powercurve_shift": np.nan,
"rpm_shift": np.nan,
"pitch_shift": np.nan,
}
# check if all required columns are present
required_cols = [ws_col, pw_col, pt_col, rpm_col]
for req_col in required_cols:
if req_col not in pre_df.columns:
msg = f"check_for_ops_curve_shift {wtg_name} pre_df missing required column {req_col}"
result_manager.warning(msg)
return results_dict
if req_col not in post_df.columns:
msg = f"check_for_ops_curve_shift {wtg_name} post_df missing required column {req_col}"
result_manager.warning(msg)
return results_dict
pre_dropna_df = pre_df.dropna(subset=[ws_col, pw_col, pt_col, rpm_col]).copy()
post_dropna_df = post_df.dropna(subset=[ws_col, pw_col, pt_col, rpm_col]).copy()

for descr, x_var, y_var, x_bin_width, warn_thresh in [
("powercurve_shift", ws_col, pw_col, 1, 0.01),
("rpm_shift", pw_col, rpm_col, 0, 0.005),
("pitch_shift", ws_col, pt_col, 1, 0.1),
]:
bins = np.arange(0, pre_dropna_df[x_var].max() + x_bin_width, x_bin_width) if x_bin_width > 0 else 10
mean_curve = pre_dropna_df.groupby(pd.cut(pre_dropna_df[x_var], bins=bins, retbins=False), observed=True).agg(
x_mean=pd.NamedAgg(column=x_var, aggfunc="mean"),
y_mean=pd.NamedAgg(column=y_var, aggfunc="mean"),
)
post_dropna_df["expected_y"] = np.interp(post_dropna_df[x_var], mean_curve["x_mean"], mean_curve["y_mean"])
mean_df = post_dropna_df.mean()
if y_var == pt_col:
results_dict[descr] = mean_df[y_var] - mean_df["expected_y"]
else:
results_dict[descr] = (mean_df[y_var] / mean_df["expected_y"] - 1).clip(-1, 1)
if abs(results_dict[descr]) > warn_thresh:
result_manager.warning(
f"{wtg_name} check_for_ops_curve_shift abs {descr} > {warn_thresh}: {results_dict[descr]}"
)

return results_dict


def calc_test_ref_results(
*,
test_df: pd.DataFrame,
Expand Down Expand Up @@ -505,6 +559,16 @@ def calc_test_ref_results(
pre_df = pre_df.merge(ref_df, how="left", left_index=True, right_index=True)
post_df = post_df.merge(ref_df, how="left", left_index=True, right_index=True)

ref_ops_curve_shift_dict = check_for_ops_curve_shift(
pre_df,
post_df,
wtg_name=ref_name,
ws_col=ref_ws_col,
pw_col=ref_pw_col,
rpm_col=f"ref_{DataColumns.gen_rpm_mean}",
pt_col=f"ref_{DataColumns.pitch_angle_mean}",
)

pre_df = add_waking_scen(
test_ref_df=pre_df,
test_name=test_name,
Expand Down Expand Up @@ -608,9 +672,6 @@ def calc_test_ref_results(
toggle_name=cfg.toggle.name if cfg.toggle else None,
)

pre_df.to_parquet(cfg.out_dir / f"{test_wtg.name}_{ref_name}_pre_df.parquet")
post_df.to_parquet(cfg.out_dir / f"{test_wtg.name}_{ref_name}_post_df.parquet")

pp_results, pp_df = pre_post_pp_analysis_with_reversal_and_bootstrapping(
cfg=cfg,
test_wtg=test_wtg,
Expand All @@ -634,6 +695,9 @@ def calc_test_ref_results(
"ref_max_northing_error_v_wf": ref_max_northing_error_v_wf,
"ref_max_ws_drift": ref_max_ws_drift,
"ref_max_ws_drift_pp_period": ref_max_ws_drift_pp_period,
"ref_powercurve_shift": ref_ops_curve_shift_dict["powercurve_shift"],
"ref_rpm_shift": ref_ops_curve_shift_dict["rpm_shift"],
"ref_pitch_shift": ref_ops_curve_shift_dict["pitch_shift"],
"detrend_pre_r2_improvement": detrend_pre_r2_improvement,
"detrend_post_r2_improvement": detrend_post_r2_improvement,
"mean_power_pre": pre_df.dropna(subset=[detrend_ws_col, test_pw_col, ref_wd_col])[test_pw_col].mean(),
Expand Down Expand Up @@ -730,6 +794,16 @@ def run_wind_up_analysis(

test_df, pre_df, post_df = pre_post_splitter.split(test_df, test_wtg_name=test_name)

test_ops_curve_shift_dict = check_for_ops_curve_shift(
pre_df,
post_df,
wtg_name=test_name,
ws_col=test_ws_col,
pw_col=test_pw_col,
rpm_col=f"test_{DataColumns.gen_rpm_mean}",
pt_col=f"test_{DataColumns.pitch_angle_mean}",
)

# compare ops curves of pre_df and post_df
compare_ops_curves_pre_post(
pre_df=pre_df,
Expand All @@ -747,6 +821,9 @@ def run_wind_up_analysis(
"lt_wtg_hours_filt": lt_wtg_df_filt["observed_hours"].sum() if lt_wtg_df_filt is not None else 0,
"test_max_ws_drift": test_max_ws_drift,
"test_max_ws_drift_pp_period": test_max_ws_drift_pp_period,
"test_powercurve_shift": test_ops_curve_shift_dict["powercurve_shift"],
"test_rpm_shift": test_ops_curve_shift_dict["rpm_shift"],
"test_pitch_shift": test_ops_curve_shift_dict["pitch_shift"],
"preprocess_warning_counts": preprocess_warning_counts,
"test_warning_counts": len(result_manager.stored_warnings),
}
Expand Down
20 changes: 11 additions & 9 deletions wind_up/plots/combine_results_plots.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import norm

from wind_up.models import PlotConfig


def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotConfig) -> None:
def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotConfig, confidence: float = 0.9) -> None:
show_plots = plot_cfg.show_plots
save_plots = plot_cfg.save_plots
z_score = norm.ppf((1 + confidence) / 2)

plt.figure(figsize=(10, 6))
labels = [
Expand All @@ -18,14 +20,14 @@ def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotCo
)
]
values = trdf["uplift_frc"] * 100
yerrs = trdf["unc_one_sigma_frc"] * 100
yerrs = trdf["unc_one_sigma_frc"] * 100 * z_score
plt.bar(labels, values, yerr=yerrs, capsize=3)
plt.xlabel("turbine")
plt.ylabel("uplift [%] and 1-sigma uncertainty")
plot_title = "uplift by test-ref pair"
plt.ylabel("uplift [%]")
plot_title = f"uplift and {confidence*100:.0f}% CI by test-ref pair"
plt.title("uplift by test-ref pair")
plt.grid(axis="y")
plt.xticks(rotation=45, ha="right")
plt.xticks(rotation=90, ha="right")
plt.tight_layout()
if show_plots:
plt.show()
Expand All @@ -35,14 +37,14 @@ def plot_combine_results(trdf: pd.DataFrame, tdf: pd.DataFrame, plot_cfg: PlotCo
plt.figure()
labels = tdf["test_wtg"]
values = tdf["p50_uplift"] * 100
yerrs = tdf["sigma"] * 100
yerrs = tdf["sigma"] * 100 * z_score
plt.bar(labels, values, yerr=yerrs, capsize=3)
plt.xlabel("turbine")
plt.ylabel("uplift [%] and 1-sigma uncertainty")
plot_title = "combined uplift"
plt.ylabel("uplift [%]")
plot_title = f"combined uplift and {confidence*100:.0f}% CI"
plt.title(plot_title)
plt.grid(axis="y")
plt.xticks(rotation=45, ha="right")
plt.xticks(rotation=90, ha="right")
plt.tight_layout()
if show_plots:
plt.show()
Expand Down
Loading

0 comments on commit de1b1ab

Please sign in to comment.