diff --git a/solardatatools/algorithms/time_shifts.py b/solardatatools/algorithms/time_shifts.py
index 9fa9f94f..0697a764 100644
--- a/solardatatools/algorithms/time_shifts.py
+++ b/solardatatools/algorithms/time_shifts.py
@@ -17,6 +17,7 @@
 import numpy as np
 from scipy.stats import mode
+from sklearn.model_selection import train_test_split
 import matplotlib.pyplot as plt
 from solardatatools.solar_noon import energy_com, avg_sunrise_sunset
 from solardatatools.signal_decompositions import l2_l1d1_l2d2p365
 
@@ -143,21 +144,26 @@ def run(
         self.__recursion_depth = 0
 
     def optimize_c1(self, metric, c1s, use_ixs, c2, periodic_detector, solver=None):
-        n = np.sum(use_ixs)
-        select = np.random.uniform(size=n) <= 0.75  # random holdout selection
-        train = np.copy(use_ixs)
-        test = np.copy(use_ixs)
-        train[use_ixs] = select
-        test[use_ixs] = ~select
+        # set up train/test split with sklearn (75% train / 25% holdout)
+        ixs = np.arange(len(metric))
+        ixs = ixs[use_ixs]
+        train_ixs, test_ixs = train_test_split(ixs, train_size=0.75)
+        train = np.zeros(len(metric), dtype=bool)
+        test = np.zeros(len(metric), dtype=bool)
+        train[train_ixs] = True
+        test[test_ixs] = True
+        # initialize results objects
         train_r = np.zeros_like(c1s)
         test_r = np.zeros_like(c1s)
         tv_metric = np.zeros_like(c1s)
         jpy = np.zeros_like(c1s)
+        # iterate over possible values of c1 parameter
         for i, v in enumerate(c1s):
             s1, s2 = self.estimate_components(
                 metric, v, c2, train, periodic_detector, n_iter=5, solver=solver
             )
             y = metric
+            # collect results
             train_r[i] = np.average(np.power((y - s1 - s2)[train], 2))
             test_r[i] = np.average(np.power((y - s1 - s2)[test], 2))
             tv_metric[i] = np.average(np.abs(np.diff(s1, n=1)))
@@ -165,7 +171,7 @@ def optimize_c1(self, metric, c1s, use_ixs, c2, periodic_detector, solver=None):
             jumps_per_year = count_jumps / (len(metric) / 365)
             jpy[i] = jumps_per_year
         zero_one_scale = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
-        hn = zero_one_scale(test_r)
+        hn = zero_one_scale(test_r)  # holdout error metric
         rn = zero_one_scale(train_r)
         ixs = np.arange(len(c1s))
         # Detecting more than 5 time shifts per year is extremely uncommon,
@@ -203,39 +209,29 @@ def estimate_components(
         w = 1 / (eps + np.abs(np.diff(s1, n=1)))
         return s1, s2
 
-    def plot_optimization(self):
+    def plot_optimization(self, figsize=None):
         if self.best_ix is not None:
             c1s = self.c1_vals
             hn = self.normalized_holdout_error
             rn = self.normalized_train_error
             best_c1 = self.best_c1
             import matplotlib.pyplot as plt
-
-            plt.plot(c1s, hn, marker=".")
-            plt.axvline(best_c1, ls="--", color="red")
-            plt.xscale("log")
-            plt.title("holdout validation")
-            plt.show()
-            plt.plot(c1s, self.jumps_per_year, marker=".")
-            plt.axvline(best_c1, ls="--", color="red")
-            plt.xscale("log")
-            plt.title("jumps per year")
-            plt.show()
-            plt.plot(c1s, rn, marker=".")
-            plt.axvline(best_c1, ls="--", color="red")
-            plt.xscale("log")
-            plt.title("training residuals")
-            plt.show()
-            # plt.plot(c1s, hn * rn, marker='.')
-            # plt.axvline(best_c1, ls='--', color='red')
-            # plt.xscale('log')
-            # plt.title('holdout error times training error')
-            # plt.show()
-            plt.plot(c1s, self.tv_metric, marker=".")
-            plt.axvline(best_c1, ls="--", color="red")
-            plt.xscale("log")
-            plt.title("Total variation metric")
-            plt.show()
+            fig, ax = plt.subplots(nrows=4, sharex=True, figsize=figsize)
+            ax[0].plot(c1s, hn, marker=".")
+            ax[0].axvline(best_c1, ls="--", color="red")
+            ax[0].set_title("holdout validation")
+            ax[1].plot(c1s, self.jumps_per_year, marker=".")
+            ax[1].axvline(best_c1, ls="--", color="red")
+            ax[1].set_title("jumps per year")
+            ax[2].plot(c1s, rn, marker=".")
+            ax[2].axvline(best_c1, ls="--", color="red")
+            ax[2].set_title("training residuals")
+            ax[3].plot(c1s, self.tv_metric, marker=".")
+            ax[3].axvline(best_c1, ls="--", color="red")
+            ax[3].set_xscale("log")
+            ax[3].set_title("Total variation metric")
+            plt.tight_layout()
+            return fig
 
     def apply_corrections(self, data):
         roll_by_index = self.roll_by_index
diff --git a/solardatatools/clear_day_detection.py b/solardatatools/clear_day_detection.py
index 64b8a33c..c2f65b9b 100644
--- a/solardatatools/clear_day_detection.py
+++ b/solardatatools/clear_day_detection.py
@@ -41,7 +41,7 @@ def find_clear_days(
     # Shift this metric so the median is at zero
     # tc = np.percentile(tc, 50) - tc
     # Normalize such that the maximum value is equal to one
-    tc /= np.max(tc)
+    tc /= np.nanmax(tc)
     tc = 1 - tc
     # Seasonal renormalization: estimate a "baseline smoothness" based on local
     # 90th percentile of smoothness signal. This has the effect of increasing
diff --git a/solardatatools/signal_decompositions.py b/solardatatools/signal_decompositions.py
index 81ee7cf3..297d56af 100644
--- a/solardatatools/signal_decompositions.py
+++ b/solardatatools/signal_decompositions.py
@@ -161,7 +161,7 @@ def tl1_l2d2p365(
    :return: median fit with seasonal baseline removed
    """
    if use_ixs is None:
-        use_ixs = np.arange(len(signal))
+        use_ixs = ~np.isnan(signal)
    x = cvx.Variable(len(signal))
    r = signal[use_ixs] - x[use_ixs]
    objective = cvx.Minimize(