From 0db4f42f5c2d61cef732ffb557676f467fd538b0 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:06:12 +1100 Subject: [PATCH 1/4] Add eva main function and add eva.fit_gev fitstart options --- ci/environment.yml | 1 + .../worked_example-HadGEM3-GC31-MM.ipynb | 8 +- docs/user_guide/worked_example_stationary.rst | 4 +- setup.py | 1 + unseen/eva.py | 1022 +++++++++++------ unseen/moments.py | 4 +- unseen/stability.py | 2 +- unseen/tests/conftest.py | 59 +- unseen/tests/test_eva.py | 405 +++---- 9 files changed, 910 insertions(+), 596 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index bd6f119..133d6dc 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,6 +9,7 @@ dependencies: - dask-jobqueue - geopandas - gitpython + - lmoments3 - netcdf4 - numpy - pip diff --git a/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb b/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb index 58984d3..5ee6550 100644 --- a/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb +++ b/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb @@ -1220,9 +1220,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'generate_estimates=True'.\n", + "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'fitstart='scipy_subet''.\n", " warnings.warn(\n", - "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'generate_estimates=True'.\n", + "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'fitstart='scipy_subet''.\n", " warnings.warn(\n" ] }, @@ -2609,12 +2609,12 @@ ], "source": [ "model_da_indep.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:blue')\n", - "model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, generate_estimates=True)\n", + "model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, fitstart='scipy_subet')\n", "model_raw_pdf = gev.pdf(xvals, model_raw_shape, model_raw_loc, model_raw_scale)\n", "plt.plot(xvals, model_raw_pdf, color='tab:blue', linewidth=4.0, label='model')\n", "\n", "model_da_bc.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:orange')\n", - "model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, generate_estimates=True)\n", + "model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, fitstart='scipy_subet')\n", "model_bc_pdf = gev.pdf(xvals, model_bc_shape, model_bc_loc, model_bc_scale)\n", "plt.plot(xvals, model_bc_pdf, color='tab:orange', linewidth=4.0, label='model (corrected)')\n", "\n", diff --git a/docs/user_guide/worked_example_stationary.rst b/docs/user_guide/worked_example_stationary.rst index 8f59a4e..8b6f241 100644 --- a/docs/user_guide/worked_example_stationary.rst +++ b/docs/user_guide/worked_example_stationary.rst @@ -438,12 +438,12 @@ to see the effect of the bias correction. 
model_da_bc_stacked = model_da_bc.dropna('lead_time').stack({'sample': ['ensemble', 'init_date', 'lead_time']}) model_da_indep.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:blue') - model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, generate_estimates=True) + model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, fitstart='scipy_subet') model_raw_pdf = gev.pdf(xvals, model_raw_shape, model_raw_loc, model_raw_scale) plt.plot(xvals, model_raw_pdf, color='tab:blue', linewidth=4.0, label='model') model_da_bc.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:orange') - model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, generate_estimates=True) + model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, fitstart='scipy_subet') model_bc_pdf = gev.pdf(xvals, model_bc_shape, model_bc_loc, model_bc_scale) plt.plot(xvals, model_bc_pdf, color='tab:orange', linewidth=4.0, label='model (corrected)') diff --git a/setup.py b/setup.py index b217d55..96d229f 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "bias_correction = unseen.bias_correction:_main", "stability = unseen.stability:_main", "moments = unseen.moments:_main", + "eva = unseen.eva:_main", ] }, ) diff --git a/unseen/eva.py b/unseen/eva.py index 18f6481..5a745bd 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -1,5 +1,8 @@ """Extreme value analysis functions.""" +import argparse +from lmoments3 import distr +import matplotlib.pyplot as plt from matplotlib import colormaps from matplotlib.dates import date2num from matplotlib.ticker import AutoMinorLocator @@ -9,6 +12,11 @@ from scipy.stats.distributions import chi2 import warnings from xarray import apply_ufunc, DataArray +import xclim.indices.stats as xcstats + +from . import fileio +from . import general_utils +from . import time_utils def event_in_context(data, threshold, direction): @@ -48,35 +56,231 @@ def event_in_context(data, threshold, direction): return n_events, n_population, return_period, percentile -def fit_stationary_gev(x, user_estimates=[], generate_estimates=False): - """Estimate stationary shape, location and scale parameters. +def fit_gev( + data, + core_dim="time", + stationary=True, + covariate=None, + fitstart="LMM", + loc1=0, + scale1=0, + assert_good_fit=False, + pick_best_model=False, + alpha=0.05, + method="Nelder-Mead", +): + """Estimate stationary or nonstationary GEV distribution parameters. Parameters ---------- - x : array_like - Data to use in estimating the distribution parameters - user_estimates : list, optional - Initial guess of the shape, loc and scale parameters - generate_estimates : bool, optional - Generate initial parameter guesses using a data subset + data : array_like + Data to use in estimating the distribution parameters + core_dim : str, default "time" + Name of time/sample dimension in `data` and `covariate` + stationary : bool, default True + Fit as a stationary GEV using `fit_stationary_gev` + covariate : array_like, optional + A nonstationary covariate array with the same `core_dim` as `data` + fitstart : {array-like, 'LMM', 'MM', 'scipy', 'scipy_fitstart', + 'scipy_subset', 'xclim_fitstart', 'xclim'}, default 'scipy_fitstart' + Initial guess method/estimate of the shape, loc and scale parameters + loc1, scale1 : float or None, default 0 + Initial guess of trend parameters. 
If None, the trend is fixed at zero + assert_good_fit : bool, default False + Stationary parameters must pass goodness of fit test at `alpha` level. + Attempt a retry and return NaNs if the test fails again. + pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False + Method to test relative fit of stationary and nonstationary models. + Do not use if you don't want nonstationary parameters. The output will + have GEV 5 parameters even if stationary is True. + alpha : float, default 0.05 + Fit test p-value threshold for stationary fit (relative/goodness of fit) + method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', + 'trust-constr', 'COBYLA'}, default 'Nelder-Mead' + Optimization method for nonstationary fit Returns ------- - shape, loc, scale : float - GEV parameters + dparams : xarray.DataArray + The GEV distribution parameters with the same dimensions as `data` + (excluding `core_dim`) and a new dimension `dparams`: + If stationary, dparams = (c, loc, scale). + If nonstationary, dparams = (c, loc0, loc1, scale0, scale1). + + Notes + ----- + - Use `unpack_gev_params` to get the shape, location and scale parameters + as a separate array. If nonstationary, the output will still be three + parameters that have an extra covariate dimension. + - For stationary data the parameters are estimated using + `scipy.stats.genextreme.fit`. + - For nonstationary data, the parameters (including the linear location and + scale trend parameters are estimated by minimising + a penalised negative log-likelihood function. + - The `assert_good_fit` option ensures that the distribution fit is + accepted if the goodness of fit test `p-value > alpha` (i.e., accept + the null hypothesis). It will retry the fit using data[::2] to generate + an initial guess. + - The `covariate` must be numeric and have dimensions aligned with `data`. + - If `pick_best_model` is a method, the relative goodness of fit method is + used to determine if stationary or nonstationary parameters are returned. + """ + kwargs = {k: v for k, v in locals().items() if k not in ["data", "covariate"]} + + def _assert_good_fit_1d(data, dparams, alpha, fit_kwargs): + """Test goodness of stationary GEV fit and retry if failed.""" + pvalue = check_gev_fit(data, dparams) + + if np.all(pvalue < alpha): + # Retry fit using alternative fitstart methods + warnings.warn("GEV fit failed. 
Retrying fitstart with data subset.") + _kwargs = fit_kwargs.copy() + _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) + _kwargs["stationary"] = True + dparams = _fit_1d(data, covariate, **_kwargs) + pvalue = check_gev_fit(data, dparams) + + # Return NaNs if the test still fails + if np.all(pvalue < alpha): + # Return NaNs + dparams = dparams * np.nan + warnings.warn("Data fit failed.") + return dparams + + def _fit_1d( + data, + covariate, + stationary, + fitstart, + core_dim, + loc1, + scale1, + assert_good_fit, + pick_best_model, + alpha, + method, + ): + """Estimate distribution parameters.""" + if np.all(~np.isfinite(data)): + # Return NaNs if all input data is infinite + n = 3 if stationary else 5 + return np.array([np.nan] * n) + + if np.isnan(data).any(): + # Mask NaNs in data + mask = np.isfinite(data) + data = data[mask] + if not stationary: + covariate = covariate[mask] + + # Initial estimates of distribution parameters for MLE + if isinstance(fitstart, str): + dparams_i = _fitstart_1d(data, fitstart) + else: + # User provided initial estimates + dparams_i = fitstart + + # Use genextreme to get stationary distribution parameters + if stationary or pick_best_model: + if dparams_i is None: + dparams_i = genextreme.fit(data) + else: + dparams = genextreme.fit( + data, dparams_i[0], loc=dparams_i[1], scale=dparams_i[2] + ) + dparams = np.array([i for i in dparams], dtype="float64") + + if assert_good_fit: + dparams = _assert_good_fit_1d(data, dparams, alpha, kwargs) + + if not stationary or pick_best_model: + # Temporarily reverse shape sign (scipy uses different sign convention) + dparams_ns_i = [-dparams_i[0], dparams_i[1], loc1, dparams_i[2], scale1] + + # Optimisation bounds (scale parameter must be non-negative) + bounds = [(None, None)] * 5 + bounds[3] = (0, None) # Positive scale parameter + if loc1 is None: + dparams_ns_i[2] = 0 + bounds[2] = (0, 0) # Only allow trend in scale + if scale1 is None: + dparams_ns_i[4] = 0 + bounds[4] = (0, 0) # Only allow trend in location + + # Minimise the negative log-likelihood function to get optimal dparams + res = minimize( + nllf, + dparams_ns_i, + args=(data, covariate), + method=method, + bounds=bounds, + ) + dparams_ns = np.array([i for i in res.x], dtype="float64") + # Reverse shape sign for consistency with scipy.stats results + dparams_ns[0] *= -1 - if any(user_estimates): - shape, loc, scale = user_estimates - shape, loc, scale = genextreme.fit(x, shape, loc=loc, scale=scale) + # Stationary and nonstationary model relative goodness of fit + if pick_best_model: + dparams = get_best_GEV_model_1d( + data, dparams, dparams_ns, covariate, alpha, test=pick_best_model + ) + else: + dparams = dparams_ns + + return dparams - elif generate_estimates: - # Generate initial estimates using a data subset (useful for large datasets) - shape, loc, scale = genextreme.fit(x[::2]) - shape, loc, scale = genextreme.fit(x, shape, loc=loc, scale=scale) + if covariate is not None: + covariate = _format_covariate(data, covariate, core_dim) else: - shape, loc, scale = genextreme.fit(x) - return shape, loc, scale + covariate = 0 # Dummy covariate for apply_ufunc + + # Input core dimensions + if core_dim is not None and hasattr(covariate, core_dim): + # Covariate has the same core dimension as data + input_core_dims = [[core_dim], [core_dim]] + else: + # Covariate is a 1D array + input_core_dims = [[core_dim], []] + + n_params = 5 if (not stationary or pick_best_model) else 3 + # Fit data to distribution parameters + dparams = apply_ufunc( + _fit_1d, 
+ data, + covariate, + input_core_dims=input_core_dims, + output_core_dims=[["dparams"]], + vectorize=True, + dask="parallelized", + kwargs=kwargs, + output_dtypes=["float64"], + dask_gufunc_kwargs={"output_sizes": {"dparams": n_params}}, + ) + if isinstance(data, DataArray): + # Format output (consistent with xclim) + if n_params == 3: + dparams.coords["dparams"] = ["c", "loc", "scale"] + else: + dparams.coords["dparams"] = ["c", "loc0", "loc1", "scale0", "scale1"] + + # Add coordinates for the distribution parameters + dist_name = "genextreme" if stationary else "nonstationary genextreme" + if isinstance(fitstart, str): + estimator = fitstart.upper() + else: + estimator = f"User estimates = {fitstart}" + + dparams.attrs = dict( + long_name=f"{dist_name.capitalize()} parameters", + description=f"Parameters of the {dist_name} distribution", + method="MLE", + estimator=estimator, + scipy_dist="genextreme", + units="", + ) + + return dparams def penalised_sum(x): @@ -96,12 +300,12 @@ def penalised_sum(x): return total + penalty -def nllf(theta, x, covariate=None): +def nllf(dparams, x, covariate=None): """Penalised negative log-likelihood function. Parameters ---------- - theta : tuple of floats + dparams : tuple of floats Distribution parameters (stationary or non-stationary) x : array_like Data to use in estimating the distribution parameters @@ -118,39 +322,39 @@ def nllf(theta, x, covariate=None): This is modified version of `scipy.stats.genextreme.fit` for fitting extreme value distribution parameters, in which the location and scale parameters can vary linearly with a covariate. - The log-likelihood equations are based on Méndez et al. (2007). + The log-likelihood equations are based on Coles (2001; page 55). It is suitable for stationary or nonstationary distributions: - - theta = shape, loc, scale - - theta = shape, loc, loc1, scale, scale1 - The nonstationary parameters are returned if `theta` incudes the location + - dparams = shape, loc, scale + - dparams = shape, loc, loc1, scale, scale1 + The nonstationary parameters are returned if `dparams` incudes the location and scale trend parameters. A large finite penalty (instead of infinity) is applied for observations beyond the support of the distribution. The NLLF is not finite when the shape is nonzero and Z is negative because the PDF is zero (i.e., ``log(0)=inf)``). 
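+
+    Note that the shape parameter here uses the opposite sign convention to
+    `scipy.stats.genextreme`, so `fit_gev` negates the scipy shape estimate
+    before minimising this function, e.g.,
+    ``minimize(nllf, [-c, loc, loc1, scale, scale1], args=(x, covariate))``.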
""" - if len(theta) == 5: + if len(dparams) == 5: # Nonstationary GEV parameters - shape, loc0, loc1, scale0, scale1 = theta + shape, loc0, loc1, scale0, scale1 = dparams loc = loc0 + loc1 * covariate scale = scale0 + scale1 * covariate else: # Stationary GEV parameters - shape, loc, scale = theta + shape, loc, scale = dparams s = (x - loc) / scale # Calculate the NLLF (type 1 or types 2-3 extreme value distributions) # Type I extreme value distributions (Gumbel) - if shape == 0: + if np.fabs(shape) < 1e-6: valid = scale > 0 L = np.log(scale, where=valid) + s + np.exp(-s) # Types II & III extreme value distributions (Fréchet and Weibull) else: Z = 1 + shape * s - # The log-likelihood function is finite when the shape and Z are positive + # The log-likelihood is finite when the shape and Z are positive valid = np.isfinite(Z) & (Z > 0) & (scale > 0) L = ( np.log(scale, where=valid) @@ -160,251 +364,114 @@ def nllf(theta, x, covariate=None): L = np.where(valid, L, np.inf) - # Sum function along all axes (where finite) & add penalty for each infinite element + # Sum function (where finite) & add penalty for each infinite element total = penalised_sum(L) return total -def fit_gev( - data, - core_dim="time", - stationary=True, - covariate=None, - loc1=0, - scale1=0, - test_fit_goodness=False, - relative_fit_test=None, - alpha=0.05, - user_estimates=[], - generate_estimates=False, - method="Nelder-Mead", -): - """Estimate stationary or nonstationary GEV distribution parameters. +def _fitstart_1d(data, method): + """Generate initial parameter guesses for nonstationary fit. Parameters ---------- data : array_like Data to use in estimating the distribution parameters - core_dim : str, optional - Name of time/sample dimension in `data`. Default: "time". - stationary : bool, optional - Fit as a stationary GEV using `fit_stationary_gev`. Default: True. - covariate : array_like or str, optional - A nonstationary covariate array or coordinate name - loc1, scale1 : float or None, optional - Initial guess of trend parameters. If None, the trend is fixed at zero. - test_fit_goodness : bool, optional - Test goodness of fit and attempt retry. Default False. - relative_fit_test : {None, 'lrt', 'aic', 'bic'}, optional - Method to test relative fit of stationary and nonstationary models. - The trend parameters are set to zero if the stationary fit is better. - alpha : float, optional - Goodness of fit p-value threshold. Default 0.05. - user estimates: list, optional - Initial guess of the shape, loc and scale parameters - generate_estimates : bool, optional - Generate initial parameter guesses using a data subset - method : str, optional - Optimization method for nonstationary fit {'Nelder-Mead', 'L-BFGS-B', - 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'COBYLA'}. + method : {'LMM', 'scipy_fitstart', 'scipy', 'scipy_subset', + 'xclim_fitstart', 'xclim'} + Initial guess method of the shape, loc and scale parameters Returns ------- - theta : xr.DataArray - The GEV distribution parameters with the same dimensions as `data` - (excluding `core_dim`) and a new dimension `theta`: - If stationary, theta = (shape, loc, scale). - If nonstationary, theta = (shape, loc0, loc1, scale0, scale1). + dparams_i : list + Initial guess of the shape, loc and scale parameters Notes ----- - For stationary data the shape, location and scale parameters are - estimated using `gev_stationary_fit`. 
- For nonstationary data, the linear location and scale trend - parameters are estimated using a penalized negative log-likelihood - function with initial estimates based on the stationary fit. - The distribution fit is considered good if the p-value is above - `alpha` (i.e., accept the null hypothesis). Otherwise, it retry the fit - without `user_estimates` and with `generating_estimates`. - If data is a stacked forecast ensemble, the covariate may need to be - stacked in the same way. + - Use `scipy_fitstart` to reproduce the scipy fit in `fit_gev`. + - The LMM shape sign is reversed for consistency with scipy.stats results. """ - kwargs = locals() # Function inputs - - def _format_covariate(data, covariate, stationary, core_dim): - """Format or generate covariate .""" - if not stationary: - if isinstance(covariate, str): - # Select coordinate in data - covariate = data[covariate] - elif covariate is None: - # Guess covariate - if core_dim in data: - covariate = data[core_dim] - else: - covariate = np.arange(data.shape[0]) - - if covariate.dtype.kind not in set("buifc"): - # Convert dates to numbers - covariate = date2num(covariate) - - if not isinstance(covariate, DataArray): - # Convert to DataArray with the same core_dim as data - covariate = DataArray(covariate, dims=[core_dim]) - else: - covariate = 0 # Dummy covariate for apply_ufunc - - return covariate - - def _fit( - data, - covariate, - core_dim, - user_estimates, - generate_estimates, - loc1, - scale1, - stationary, - test_fit_goodness, - relative_fit_test, - alpha, - method, - ): - """Estimate distribution parameters.""" - if np.all(~np.isfinite(data)): - # Return NaNs if all input data is infinite - n = 3 if stationary else 5 - return np.array([np.nan] * n) - - if np.isnan(data).any(): - # Mask NaNs in data - mask = np.isfinite(data) - data = data[mask] - if not stationary: - covariate = covariate[mask] - # Use genextreme to get stationary distribution parameters - theta = fit_stationary_gev(data, user_estimates, generate_estimates) + if method == "LMM": + # L-moments method + dparams_i = distr.gev.lmom_fit(data) + dparams_i = list(dparams_i.values()) + dparams_i[0] = -dparams_i[0] + + elif method == "scipy_fitstart": + # Moments method? + dparams_i = genextreme._fitstart(data) + + elif method == "scipy": + # MLE + dparams_i = genextreme.fit(data) + + elif method == "scipy_subset": + # MLE (equivalent of fitstart='scipy_subet') + dparams_i = genextreme.fit(data[::2]) + + elif method == "xclim_fitstart": + # Approximates the probability weighted moments (PWM) method? 
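+        # NOTE: `_fit_start` is a private xclim helper, so its behaviour and
+        # signature may change between xclim versions.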
+ args, kwargs = xcstats._fit_start(data, dist="genextreme") + dparams_i = [args[0], kwargs["loc"], kwargs["scale"]] + + elif method == "xclim": + # MLE + da = DataArray(data, dims="time") + dparams_i = xcstats.fit(da, "genextreme", method="MLE") + else: + raise ValueError(f"Unknown fitstart method: {method}") - if not stationary: - # Use genextreme as initial guesses - shape, loc, scale = theta - # Temporarily reverse shape sign (scipy uses different sign convention) - theta_i = [-shape, loc, loc1, scale, scale1] + return np.array(dparams_i, dtype="float64") - # Optimisation bounds (scale parameter must be non-negative) - bounds = [(None, None)] * len(theta_i) - bounds[3] = (0, None) # Positive scale parameter - if loc1 is None: - theta_i[2] = 0 - bounds[2] = (0, 0) # Only allow trend in scale - if scale1 is None: - theta_i[4] = 0 - bounds[4] = (0, 0) # Only allow trend in location - # Minimise the negative log-likelihood function to get optimal theta - res = minimize( - nllf, - theta_i, - args=(data, covariate), - method=method, - bounds=bounds, - ) - theta = res.x +def _format_covariate(data, covariate, core_dim): + """Format or generate covariate. - if isinstance(relative_fit_test, str): - # Test relative fit of stationary and nonstationary models - # Negative log likelihood using genextreme parameters - L1 = nllf([-shape, loc, scale], data) - L2 = res.fun + Parameters + ---------- + data : xarray.DataArray + Data to use in estimating the distribution parameters + covariate : array_like or str + A nonstationary covariate array or coordinate name + core_dim : str + Name of time/sample dimension in `data` - result = check_gev_relative_fit( - data, L1, L2, test=relative_fit_test, alpha=alpha - ) - if not result: - warnings.warn( - f"{relative_fit_test} test failed. Returning stationary parameters." - ) - # Return stationary parameters (genextreme.fit output) with no trend - theta = [shape, loc, 0, scale, 0] + Returns + ------- + covariate : xarray.DataArray + Covariate with the same core_dim as data + """ - # Reverse shape sign for consistency with scipy.stats results - theta[0] *= -1 - - theta = np.array([i for i in theta], dtype="float64") - - if test_fit_goodness and stationary: - pvalue = check_gev_fit(data, theta, core_dim=core_dim) - - # Accept null distribution of the Anderson-darling test (same distribution) - if np.all(pvalue < alpha): - if any(kwargs["user_estimates"]): - warnings.warn("GEV fit failed. Retrying without user_estimates.") - kwargs["user_estimates"] = [None, None, None] - theta = _fit(data, covariate, **kwargs) - elif not kwargs["generate_estimates"]: - warnings.warn( - "GEV fit failed. Retrying with generate_estimates=True." 
- ) - kwargs["generate_estimates"] = True # Also breaks loop - theta = _fit(data, covariate, **kwargs) - else: - # Return NaNs - theta = theta * np.nan - warnings.warn("Data fit failed.") - return theta - - data = kwargs.pop("data") - covariate = kwargs.pop("covariate") - covariate = _format_covariate(data, covariate, stationary, core_dim) + if isinstance(covariate, str): + # Select coordinate in data + covariate = data[covariate] - # Input core dimensions - if core_dim is not None and hasattr(covariate, core_dim): - # Covariate has the same core dimension as data - input_core_dims = [[core_dim], [core_dim]] - else: - # Covariate is a 1D array - input_core_dims = [[core_dim], []] + elif covariate is None: + # Guess covariate + if core_dim in data: + covariate = data[core_dim] + else: + covariate = np.arange(data.shape[0]) - # Expected output of theta - n = 3 if stationary else 5 + if covariate.dtype.kind not in set("buifc"): + # Convert dates to numbers + covariate = date2num(covariate) - # Fit data to distribution parameters - theta = apply_ufunc( - _fit, - data, - covariate, - input_core_dims=input_core_dims, - output_core_dims=[["theta"]], - vectorize=True, - dask="parallelized", - kwargs=kwargs, - output_dtypes=["float64"], - dask_gufunc_kwargs=dict(output_sizes={"theta": n}), - ) + if not isinstance(covariate, DataArray): + # Convert to DataArray with the same core_dim as data + covariate = DataArray(covariate, dims=[core_dim]) - # Format output - if len(data.shape) == 1: - # Return a tuple of scalars instead of a data array - theta = np.array([i for i in theta], dtype="float64") - - if isinstance(theta, DataArray): - if stationary: - coords = ["shape", "loc", "scale"] - else: - coords = ["shape", "loc0", "loc1", "scale0", "scale1"] - theta.coords["theta"] = coords - return theta + return covariate -def check_gev_fit(data, params, core_dim="time", **kwargs): +def check_gev_fit(data, dparams, core_dim=[], **kwargs): """Test stationary GEV distribution goodness of fit. 
Parameters ---------- data: array_like Data used to estimate the distribution parameters - params : tuple of floats + dparams : tuple of floats Shape, location and scale parameters core_dim : str, optional Data dimension to test over @@ -417,10 +484,10 @@ def check_gev_fit(data, params, core_dim="time", **kwargs): Goodness of fit p-value """ - def _goodness_of_fit(data, params, **kwargs): + def _goodness_of_fit(data, dparams, **kwargs): """Test GEV goodness of fit.""" # Stationary parameters - shape, loc, scale = params + shape, loc, scale = dparams res = goodness_of_fit( genextreme, @@ -430,11 +497,14 @@ def _goodness_of_fit(data, params, **kwargs): ) return res.pvalue + if not isinstance(core_dim, list): + core_dim = [core_dim] + pvalue = apply_ufunc( _goodness_of_fit, data, - params, - input_core_dims=[[core_dim], ["theta"]], + dparams, + input_core_dims=[core_dim, ["dparams"]], vectorize=True, kwargs=kwargs, dask="parallelized", @@ -452,7 +522,7 @@ def check_gev_relative_fit(data, L1, L2, test, alpha=0.05): Data to use in estimating the distribution parameters L1, L2 : float Negative log-likelihood of the stationary and nonstationary model - test : {"aic", "bic", "lrt"} + test : {"AIC", "BIC", "LRT"} Method to test relative fit of stationary and nonstationary models Returns @@ -484,15 +554,35 @@ def check_gev_relative_fit(data, L1, L2, test, alpha=0.05): # Bayesian Information Criterion (BIC) bic = [k * np.log(len(data)) + (2 * n) for n, k in zip([L1, L2], dof)] result = bic[0] > bic[1] + else: + raise ValueError("test must be 'LRT', 'AIC' or 'BIC'", test) return result -def unpack_gev_params(params, covariate=None): - """Unpack shape, loc, scale from params. +def get_best_GEV_model_1d(data, dparams, dparams_ns, covariate, alpha, test): + """Get the best GEV model based on a relative fit test.""" + # Calculate the stationary GEV parameters + shape, loc, scale = dparams + + # Negative log-likelihood of stationary and nonstationary models + L1 = nllf([-shape, loc, scale], data) + L2 = nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) + + result = check_gev_relative_fit(data, L1, L2, test=test, alpha=alpha) + if not result: + # Return the stationary parameters with no trend + dparams = np.array([shape, loc, 0, scale, 0], dtype="float64") + else: + dparams = dparams_ns + return dparams + + +def unpack_gev_params(dparams, covariate=None): + """Unpack shape, loc, scale from dparams. Parameters ---------- - params : xarray.DataArray, list or tuple + dparams : xarray.DataArray, list or tuple Stationary or nonstationary GEV parameters covariate : xarray.DataArray, optional Covariate values for nonstationary parameters @@ -504,31 +594,37 @@ def unpack_gev_params(params, covariate=None): covariate. 
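+
+    Notes
+    -----
+    A minimal usage sketch (assuming ``data`` and a numeric ``covariate``
+    share the same core dimension)::
+
+        dparams = fit_gev(data, stationary=False, covariate=covariate)
+        shape, loc, scale = unpack_gev_params(dparams, covariate)
+        # loc and scale now vary along the covariate dimension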
""" - if hasattr(params, "theta"): + if hasattr(dparams, "dparams"): # Select the correct dimension in a DataArray - params = [params.isel(theta=i) for i in range(params.theta.size)] + dparams = [dparams.isel(dparams=i) for i in range(dparams.dparams.size)] + elif not isinstance(dparams, (list, tuple)) and dparams.ndim > 1: + warnings.warn(f"Assuming parameters on axis=-1 (shape={dparams.shape})") + dparams = np.split(dparams, dparams.shape[-1], axis=-1) # Unpack GEV parameters - if len(params) == 3: + if len(dparams) == 3: # Stationary GEV parameters - shape, loc, scale = params + shape, loc, scale = dparams - elif len(params) == 5: + elif len(dparams) == 5: # Nonstationary GEV parameters - shape, loc0, loc1, scale0, scale1 = params + shape, loc0, loc1, scale0, scale1 = dparams loc = loc0 + loc1 * covariate scale = scale0 + scale1 * covariate + else: + raise ValueError("Expected 3 or 5 GEV parameters.", dparams) + return shape, loc, scale -def get_return_period(event, params=None, covariate=None, **kwargs): +def get_return_period(event, dparams=None, covariate=None, **kwargs): """Get return periods for a given events. Parameters ---------- event : float or array_like Event value(s) for which to calculate the return period - params : array_like, optional + dparams : array_like, optional Stationary or nonstationary GEV parameters covariate : array_like, optional Covariate values for nonstationary parameters @@ -540,10 +636,11 @@ def get_return_period(event, params=None, covariate=None, **kwargs): return_period : float or array_like Return period(s) for the event(s) """ - if params is None: - params = fit_gev(**kwargs) - shape, loc, scale = unpack_gev_params(params, covariate) + if dparams is None: + dparams = fit_gev(**kwargs) + + shape, loc, scale = unpack_gev_params(dparams, covariate) probability = apply_ufunc( genextreme.sf, @@ -559,14 +656,14 @@ def get_return_period(event, params=None, covariate=None, **kwargs): return 1.0 / probability -def get_return_level(return_period, params=None, covariate=None, **kwargs): +def get_return_level(return_period, dparams=None, covariate=None, **kwargs): """Get the return levels for given return periods. Parameters ---------- return_period : float or array_like Return period(s) for which to calculate the return level - params : array_like, optional + dparams : array_like, optional Stationary or nonstationary GEV parameters covariate : array_like, optional Covariate values for nonstationary parameters @@ -577,11 +674,18 @@ def get_return_level(return_period, params=None, covariate=None, **kwargs): ------- return_level : float or array_like Return level(s) of the given return period(s) + + Notes + ----- + If `return_period` is an ndarray, make sure dimensions are aligned with + `dparams`. For example, dparams dims=(lat, lon, dparams) and return_period + dims=(lat, lon, period). """ - if params is None: - params = fit_gev(**kwargs) - shape, loc, scale = unpack_gev_params(params, covariate) + if dparams is None: + dparams = fit_gev(**kwargs) + + shape, loc, scale = unpack_gev_params(dparams, covariate) return_level = apply_ufunc( genextreme.isf, @@ -603,34 +707,31 @@ def gev_return_curve( bootstrap_method="non-parametric", n_bootstraps=1000, max_return_period=4, - user_estimates=None, max_shape_ratio=None, + **fit_kwargs, ): """Return x and y data for a GEV return period curve. 
Parameters ---------- - data : xarray DataArray + data : xarray.DataArray event_value : float Magnitude of event of interest bootstrap_method : {'parametric', 'non-parametric'}, default "non-parametric" n_bootstraps : int, default 1000 max_return_period : float, default 4 The maximum return period is 10^{max_return_period} - user_estimates: list, default None - Initial estimates of the shape, loc and scale parameters max_shape_ratio: float, optional - Maximum bootstrap shape parameter to full population shape parameter ratio (e.g. 6.0) - Useful for filtering bad fits to bootstrap samples + Maximum bootstrap shape parameter to full population shape parameter + ratio (e.g. 6.0). Useful for filtering bad fits to bootstrap samples + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` """ + rng = np.random.default_rng(seed=0) # GEV fit to data - if user_estimates: - shape, loc, scale = fit_gev( - data, user_estimates=user_estimates, stationary=True - ) - else: - shape, loc, scale = fit_gev(data, generate_estimates=True, stationary=True) + dparams = fit_gev(data, **fit_kwargs) + shape, loc, scale = unpack_gev_params(dparams) curve_return_periods = np.logspace(0, max_return_period, num=10000) curve_probabilities = 1.0 / curve_return_periods @@ -646,8 +747,8 @@ def gev_return_curve( if bootstrap_method == "parametric": boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data)) elif bootstrap_method == "non-parametric": - boot_data = np.random.choice(data, size=data.shape, replace=True) - boot_shape, boot_loc, boot_scale = fit_gev(boot_data, generate_estimates=True) + boot_data = rng.choice(data, size=data.shape, replace=True) + boot_shape, boot_loc, boot_scale = fit_gev(boot_data, fitstart="scipy_subet") if max_shape_ratio: shape_ratio = abs(boot_shape) / abs(shape) if shape_ratio > max_shape_ratio: @@ -698,15 +799,15 @@ def plot_gev_return_curve( ylabel=None, ylim=None, text=False, - user_estimates=None, max_shape_ratio=None, + **fit_kwargs, ): """Plot a single return period curve. Parameters ---------- ax : matplotlib plot axis - data : xarray DataArray + data : xarray.DataArray event_value : float Magnitude of the event of interest direction : {'exceedance', 'deceedance'}, default 'exceedance' @@ -721,11 +822,11 @@ def plot_gev_return_curve( Limits for y-axis text : bool, default False Write the return period (and 95% CI) on the plot - user_estimates: list, default None - Initial estimates of the shape, loc and scale parameters max_shape_ratio: float, optional Maximum bootstrap shape parameter to full population shape parameter ratio (e.g. 6.0) Useful for filtering bad fits to bootstrap samples + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` """ if direction == "deceedance": @@ -737,8 +838,8 @@ def plot_gev_return_curve( bootstrap_method=bootstrap_method, n_bootstraps=n_bootstraps, max_return_period=max_return_period, - user_estimates=user_estimates, max_shape_ratio=max_shape_ratio, + **fit_kwargs, ) ( curve_return_periods, @@ -813,35 +914,53 @@ def plot_gev_return_curve( ax.grid() -def plot_nonstationary_pdfs(ax, data, theta_s, theta_ns, covariate): +def plot_nonstationary_pdfs( + data, + dparams_s, + dparams_ns, + covariate, + ax=None, + title="", + units=None, + cmap="rainbow", + outfile=None, +): """Plot stationary and nonstationary GEV PDFs. 
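+
+    A new figure and axes are created if `ax` is not supplied, and the figure
+    is saved when `outfile` is given.
+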
Parameters ---------- - ax : matplotlib.Axes data : array-like Data to plot the histogram - theta_s : tuple of floats + dparams_s : tuple of floats Stationary GEV parameters (shape, loc, scale) - theta_ns : tuple or array-like + dparams_ns : tuple or array-like Nonstationary GEV parameters (shape, loc0, loc1, scale0, scale1) covariate : array-like - Covariates values in which to plot the nonstationary PDFs - + Covariate values in which to plot the nonstationary PDFs + ax : matplotlib.axes.Axes + title : str, optional + xlabel : str, optional + cmap : str, default "rainbow" + outfile : str, optional Returns ------- ax : matplotlib.Axes """ + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(10, 7)) + + ax.set_title(title, loc="left") + n = covariate.size - colors = colormaps["rainbow"](np.linspace(0, 1, n)) - shape, loc, scale = unpack_gev_params(theta_ns, covariate) + colors = colormaps[cmap](np.linspace(0, 1, n)) + shape, loc, scale = unpack_gev_params(dparams_ns, covariate) # Histogram. _, bins, _ = ax.hist(data, bins=40, density=True, alpha=0.5, label="Histogram") # Stationary GEV PDF - shape_s, loc_s, scale_s = theta_s + shape_s, loc_s, scale_s = dparams_s pdf_s = genextreme.pdf(bins, shape_s, loc=loc_s, scale=scale_s) ax.plot(bins, pdf_s, c="k", ls="--", lw=2.8, label="Stationary") @@ -850,64 +969,110 @@ def plot_nonstationary_pdfs(ax, data, theta_s, theta_ns, covariate): pdf_ns = genextreme.pdf(bins, shape, loc=loc[i], scale=scale[i]) ax.plot(bins, pdf_ns, lw=1.6, c=colors[i], zorder=0, label=t) + ax.set_xlabel(units) ax.set_ylabel("Probability") ax.xaxis.set_minor_locator(AutoMinorLocator()) ax.yaxis.set_minor_locator(AutoMinorLocator()) - ax.legend(bbox_to_anchor=(1, 1)) + ax.legend(loc="upper right", bbox_to_anchor=(1, 1), framealpha=0.3) + ax.set_xmargin(1e-3) + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") return ax def plot_nonstationary_return_curve( - ax, return_periods, theta_s, theta_ns, covariate, dim="time" + return_periods, + dparams_s, + dparams_ns, + covariate, + dim="time", + ax=None, + title="", + units=None, + cmap="rainbow", + outfile=None, ): """Plot stationary and nonstationary return period curves. 
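+
+    If `dparams_s` is None, only the nonstationary return level curves are
+    drawn. A new figure and axes are created if `ax` is not supplied, and the
+    figure is saved when `outfile` is given.
+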
Parameters ---------- - ax : matplotlib.Axes return_periods : array-like - Return periods to plot - theta_s : array-like or tuple of floats + Return periods to plot (x-axis) + dparams_s : array-like or tuple of floats Stationary GEV parameters (shape, loc, scale) - theta_ns : array-like or tuple of floats + dparams_ns : array-like or tuple of floats Nonstationary GEV parameters (shape, loc0, loc1, scale0, scale1) covariate : array-like Covariate values in which to show the nonstationary return levels dim : str, optional Covariate core dimension name, default "time" + ax : matplotlib.axes.Axes + title : str, optional + units : str, optional + cmap : str, default "rainbow" + outfile : str, optional Returns ------- - ax : matplotlib.Axes + ax : matplotlib.axes.Axes """ + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(9, 5)) + + ax.set_title(title, loc="left") + n = covariate.size - colors = colormaps["rainbow"](np.linspace(0, 1, n)) + colors = colormaps[cmap](np.linspace(0, 1, n)) # Stationary return levels - return_levels = get_return_level(return_periods, theta_s) - ax.plot( - return_periods, return_levels, label="Stationary", c="k", ls="--", zorder=n + 1 - ) + if dparams_s is not None: + return_levels = get_return_level(return_periods, dparams_s) + ax.plot( + return_periods, + return_levels, + label="Stationary", + c="k", + ls="--", + zorder=n + 1, + ) # Nonstationary return levels - return_levels = get_return_level(return_periods, theta_ns, covariate) + return_levels = get_return_level(return_periods, dparams_ns, covariate) for i, t in enumerate(covariate.values): ax.plot(return_periods, return_levels.isel({dim: i}), label=t, c=colors[i]) ax.set_xscale("log") - ax.set_xlabel("Return period [years]") + ax.set_ylabel(units) + ax.set_xlabel("Return period") ax.yaxis.set_minor_locator(AutoMinorLocator()) ax.set_xmargin(1e-2) ax.legend() + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") return ax -def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): - """Plot data binned by a covariate as a stacked histogram. +def plot_stacked_histogram( + dv1, + dv2, + bins=None, + labels=None, + ax=None, + title="", + units=None, + cmap="rainbow", + legend=True, + outfile=None, +): + """Histogram with data binned and stacked. 
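+
+    If `bins` is a scalar (e.g., a bin count), the bin edges are computed with
+    `numpy.histogram_bin_edges` from `dv2`. The axes and the bin edges used
+    are both returned.
+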
Parameters ---------- - ax : matplotlib.Axes + dv1 : xarray.DataArray Data to plot in the histogram dv2 : xarray.DataArray @@ -915,27 +1080,48 @@ def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): bins : array-like Bin edges of dv2 labels : array-like, optional - Labels for each bin, default None (uses left side of each bin) - dim : str, default: "time" - Core dimension name of dv1 and dv2, default "time" + Labels for each bin, default None uses left side of each bin + dim : str, default "time" + Core dimension name of dv1 and dv2 + ax : matplotlib.axes.Axes + title : str, optional + units : str, optional + cmap : str, default "rainbow" + legend : bool, optional + outfile : str, optional Returns ------- - ax : matplotlib.Axes + ax : matplotlib.axes.Axes """ assert dv1.size == dv2.size + if bins is None or np.ndim(bins) == 0: + bins = np.histogram_bin_edges(dv2, bins) + + # Round bins to integers if possible + if np.all(np.diff(bins) >= 1): + bins = np.ceil(bins).astype(dtype=int) + + if labels is None: + # Labels show left side of each bin + # labels = bins[:-1] + labels = [f"{bins[i]}-{bins[i+1] - 1}" for i in range(len(bins) - 1)] + # Subset dv1 by bins dx_subsets = [ - dv1.where((dv2 >= bins[a]) & (dv2 < bins[a + 1])) for a in range(len(bins) - 1) + dv1.where(((dv2 >= bins[a]) & (dv2 < bins[a + 1])).values) + for a in range(len(bins) - 1) ] + dx_subsets[-1] = dv1.where((dv2 >= bins[-2]).values) - if labels is None: - # Labels show left side of each bin - labels = bins[:-1] + colors = colormaps[cmap](np.linspace(0, 1, len(bins) - 1)) - colors = colormaps["rainbow"](np.linspace(0, 1, len(bins) - 1)) + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(10, 7)) + + ax.set_title(title, loc="left") ax.hist( dx_subsets, density=True, @@ -945,6 +1131,202 @@ def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): edgecolor="k", label=labels, ) - ax.legend() + if legend: + ax.legend() + ax.set_xlabel(units) ax.set_ylabel("Probability") - return ax + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") + return ax, bins + + +def _parse_command_line(): + """Parse the command line for input arguments""" + + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument("file", type=str, help="Forecast file") + parser.add_argument("var", type=str, help="Variable name") + parser.add_argument("outfile", type=str, help="Output file") + parser.add_argument( + "--stack_dims", + type=str, + nargs="*", + default=["ensemble", "init_date", "lead_time"], + help="Dimensions to stack", + ) + parser.add_argument("--core_dim", type=str, default="time", help="Core dimension") + parser.add_argument( + "--stationary", + type=bool, + default=True, + help="Fit nonstationary GEV distribution", + ) + parser.add_argument( + "--fitstart", + default="LMM", + choices=( + "LMM", + "scipy", + "scipy_fitstart", + "scipy_subset", + "xclim_MLE", + "xclim", + ["shape", "loc", "scale"], + ), + help="Initial guess method (or estimate) of the GEV parameters", + ) + parser.add_argument( + "--assert_good_fit", + action="store_true", + default=False, + help="Test fit goodness", + ) + parser.add_argument( + "--pick_best_model", + type=str, + default=None, + help="Relative fit test to pick stationary or nonstationary parameters", + ) + parser.add_argument( + "--reference_time_period", + type=str, + nargs=2, + default=None, + help="Reference time period (start_date, end_date)", + ) + 
parser.add_argument( + "--covariate", type=str, default="time.year", help="Covariate variable" + ) + parser.add_argument( + "--covariate_file", type=str, default=None, help="Covariate file" + ) + parser.add_argument( + "--min_lead", default=None, help="Minimum lead time (int or filename)" + ) + parser.add_argument( + "--min_lead_kwargs", + type=str, + nargs="*", + default={}, + action=general_utils.store_dict, + help="Minimum lead time file", + ) + # parser.add_argument( + # "--confidence_interval", + # type=float, + # default=0.95, + # help="Confidence interval e.g., --confidence_interval 0.95", + # ) + parser.add_argument( + "--ensemble_dim", + type=str, + default="ensemble", + help="Name of ensemble member dimension", + ) + parser.add_argument( + "--init_dim", + type=str, + default="init_date", + help="Name of initial date dimension", + ) + parser.add_argument( + "--lead_dim", + type=str, + default="lead_time", + help="Name of lead time dimension", + ) + parser.add_argument( + "--output_chunks", + type=str, + nargs="*", + action=general_utils.store_dict, + default={}, + help="Output chunks", + ) + parser.add_argument( + "--dask_config", type=str, help="YAML file specifying dask client configuration" + ) + args = parser.parse_args() + + return args + + +def _main(): + """Run the command line program.""" + + args = _parse_command_line() + + ds = fileio.open_dataset(args.file, variables=[args.var]) + + if args.covariate_file is not None: + # Add covariate to dataset (to ensure all operations are aligned) + ds_covariate = fileio.open_dataset( + args.covariate_file, variables=[args.covariate] + ) + ds[args.covariate] = ds_covariate[args.covariate] + + # Filter data by reference time period + if args.reference_time_period: + ds = time_utils.select_time_period(ds, args.reference_time_period) + + # Filter data by minimum lead time + if args.min_lead: + if isinstance(args.min_lead, str): + # Load min_lead from file + ds_min_lead = fileio.open_dataset(args.min_lead, **args.min_lead_kwargs) + min_lead = ds_min_lead["min_lead"].load() + ds = ds.groupby(f"{args.init_dim}.month").where( + ds[args.lead_dim] >= min_lead + ) + ds = ds.drop_vars("month") + else: + ds = ds.where(ds[args.lead_dim] >= args.min_lead) + + # Stack dimensions along new "sample" dimension + if all([dim in ds[args.var].dims for dim in args.stack_dims]): + ds = ds.stack(**{"sample": args.stack_dims}) + args.core_dim = "sample" + + if not args.stationary: + covariate = _format_covariate(ds[args.var], ds[args.covariate], args.core_dim) + else: + covariate = None + + dparams = fit_gev( + ds[args.var], + core_dim=args.core_dim, + stationary=args.stationary, + fitstart=args.fitstart, + covariate=covariate, + assert_good_fit=args.assert_good_fit, + pick_best_model=args.pick_best_model, + ) + + # Format outfile + dparams = dparams.to_dataset() + + # Add the covariate variable + if not args.stationary or args.pick_best_model: + dparams[args.covariate] = covariate + + infile_logs = {args.file: ds.attrs["history"]} + if isinstance(args.min_lead, str): + infile_logs[args.min_lead] = ds_min_lead.attrs["history"] + dparams.attrs["history"] = fileio.get_new_log(infile_logs=infile_logs) + + if args.output_chunks: + dparams = dparams.chunk(args.output_chunks) + + if "zarr" in args.outfile: + fileio.to_zarr(dparams, args.outfile) + else: + dparams.to_netcdf(args.outfile, compute=True) + + +if __name__ == "__main__": + _main() diff --git a/unseen/moments.py b/unseen/moments.py index ae25070..2bb3fbf 100644 --- a/unseen/moments.py +++ 
b/unseen/moments.py @@ -183,7 +183,7 @@ def create_plot( random_sample = np.random.choice(da_fcst_stacked, sample_size) sample_moments = calc_moments( random_sample, - user_estimates=[ + fitstart=[ moments_fcst["GEV shape"], moments_fcst["GEV location"], moments_fcst["GEV scale"], @@ -196,7 +196,7 @@ def create_plot( bc_random_sample = np.random.choice(da_bc_fcst_stacked, sample_size) bc_sample_moments = calc_moments( bc_random_sample, - user_estimates=[ + fitstart=[ moments_fcst["GEV shape"], moments_fcst["GEV location"], moments_fcst["GEV scale"], diff --git a/unseen/stability.py b/unseen/stability.py index dba31ac..a9e4a4c 100644 --- a/unseen/stability.py +++ b/unseen/stability.py @@ -105,7 +105,7 @@ def return_curve(data, method, params=[], **kwargs): params : list, default None shape, location and scale parameters (calculated if None) kwargs : dict, optional - kwargs passed to eva.fit_gev (N.B. used to use generate_estimates=True) + kwargs passed to eva.fit_gev """ if method == "empirical": diff --git a/unseen/tests/conftest.py b/unseen/tests/conftest.py index ce6a6cd..25aef26 100644 --- a/unseen/tests/conftest.py +++ b/unseen/tests/conftest.py @@ -2,6 +2,7 @@ import dask.array as dsa import numpy as np import pytest +from scipy.stats import genextreme import xarray as xr @@ -101,7 +102,8 @@ def example_da_forecast(request): ) ) else: - data = np.random.random( + rng = np.random.default_rng(seed=0) + data = rng.random( ( len(init), len(lead), @@ -113,3 +115,58 @@ def example_da_forecast(request): return ds.assign_coords( {pytest.TIME_DIM: ([pytest.INIT_DIM, pytest.LEAD_DIM], time)} ) + + +@pytest.fixture() +def example_da_gev(request): + """An example 1D GEV DataArray and distribution parameters.""" + rng = np.random.default_rng(seed=0) + time = xr.cftime_range(start="2000-01-01", periods=1500, freq="D") + + # Shape, location and scale parameters + shape = rng.uniform(-0.5, 0.5) + loc = rng.uniform(-10, 10) + scale = rng.uniform(0.1, 10) + dparams = shape, loc, scale + + rvs = genextreme.rvs(shape, loc=loc, scale=scale, size=(time.size), random_state=0) + data = xr.DataArray(rvs, coords=[time], dims=[pytest.TIME_DIM]) + if request.param == "dask": + data = data.chunk({pytest.TIME_DIM: -1}) + elif request.param == "numpy": + data = data.values + return data, dparams + + +@pytest.fixture() +def example_da_gev_3d(request): + """An example 3D GEV DataArray and distribution parameters.""" + rng = np.random.default_rng(seed=0) + time = xr.cftime_range(start="2000-01-01", periods=1500, freq="D") + lat = np.arange(2) + lon = np.arange(2) + + # Shape, location and scale parameters + size = (len(lat), len(lon)) + shape = rng.uniform(-0.5, 0.5, size=size) + loc = rng.uniform(-10, 10, size=size) + scale = rng.uniform(0.1, 10, size=size) + dparams = np.stack([shape, loc, scale], axis=-1) + + rvs = genextreme.rvs( + shape, + loc=loc, + scale=scale, + size=(len(time), len(lat), len(lon)), + random_state=0, + ) + data = xr.DataArray( + rvs, + coords=[time, lat, lon], + dims=[pytest.TIME_DIM, pytest.LAT_DIM, pytest.LON_DIM], + ) + if request.param == "dask": + data = data.chunk({pytest.TIME_DIM: -1, pytest.LAT_DIM: 1, pytest.LON_DIM: 1}) + elif request.param == "numpy": + data = data.values + return data, dparams diff --git a/unseen/tests/test_eva.py b/unseen/tests/test_eva.py index bba533f..d30f13e 100644 --- a/unseen/tests/test_eva.py +++ b/unseen/tests/test_eva.py @@ -1,191 +1,102 @@ """Test extreme value analysis functions.""" -from matplotlib.dates import date2num import numpy as np import 
numpy.testing as npt -from scipy.stats import genextreme -from xarray import cftime_range, DataArray +import pytest +import xarray as xr from unseen.eva import fit_gev, get_return_period, get_return_level -rtol = 0.3 # relative tolerance -alpha = 0.05 - - -def example_da_gev_1d(): - """An example 1D GEV DataArray and distribution parameters.""" - time = cftime_range(start="2000-01-01", periods=1500, freq="D") - - # Shape, location and scale parameters - np.random.seed(0) - shape = np.random.uniform() - loc = np.random.uniform(-10, 10) - scale = np.random.uniform(0.1, 10) - theta = shape, loc, scale - - rvs = genextreme.rvs(shape, loc=loc, scale=scale, size=(time.size), random_state=0) - data = DataArray(rvs, coords=[time], dims=["time"]) - return data, theta - - -def example_da_gev_1d_dask(): - """An example 1D GEV dask array and distribution parameters.""" - data, theta = example_da_gev_1d() - data = data.chunk({"time": -1}) - return data, theta - - -def example_da_gev_3d(): - """An example 3D GEV DataArray and distribution parameters.""" - time = cftime_range(start="2000-01-01", periods=1500, freq="D") - lat = np.arange(2) - lon = np.arange(2) - - # Shape, location and scale parameters - size = (len(lat), len(lon)) - np.random.seed(0) - shape = np.random.uniform(size=size) - loc = np.random.uniform(-10, 10, size=size) - scale = np.random.uniform(0.1, 10, size=size) - theta = np.stack([shape, loc, scale], axis=-1) - - rvs = genextreme.rvs( - shape, - loc=loc, - scale=scale, - size=(len(time), len(lat), len(lon)), - random_state=0, - ) - data = DataArray(rvs, coords=[time, lat, lon], dims=["time", "lat", "lon"]) - return data, theta - - -def example_da_gev_3d_dask(): - """An example 3D GEV dask array and its distribution parameters.""" - data, theta = example_da_gev_3d() - data = data.chunk({"time": -1, "lat": 1, "lon": 1}) - return data, theta +rtol = 0.3 # relative tolerance for testing close values def add_example_gev_trend(data): trend = np.arange(data.time.size) * 2.5 / data.time.size - trend = DataArray(trend, coords={"time": data.time}) + trend = xr.DataArray(trend, coords={"time": data.time}) return data + trend -def example_da_gev_forecast(): - """Create example stacked forecast dataArray.""" - ensemble = np.arange(3) - lead_time = np.arange(5) - init_date = cftime_range(start="2000-01-01", periods=24, freq="MS") - time = [ - init_date.shift(i, freq="MS")[: len(lead_time)] for i in range(len(init_date)) - ] - - # Generate shape, location and scale parameters. 
- np.random.seed(2) - shape = np.random.uniform() - loc = np.random.uniform(-10, 10) - scale = np.random.uniform(0.1, 10) - theta = shape, loc, scale - - rvs = genextreme.rvs( - shape, - loc=loc, - scale=scale, - size=(len(ensemble), len(init_date), len(lead_time)), - random_state=0, +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d(example_da_gev): + """Run stationary GEV fit using 1D array.""" + data, dparams_i = example_da_gev + dparams = fit_gev( + data, stationary=True, assert_good_fit=False, pick_best_model=False ) - data = DataArray( - rvs, - coords=[ensemble, init_date, lead_time], - dims=["ensemble", "init_date", "lead_time"], - ) - data = data.assign_coords({"time": (["init_date", "lead_time"], time)}) - data_stacked = data.stack({"sample": ["ensemble", "init_date", "lead_time"]}) - return data_stacked, theta - - -def test_fit_gev_1d(): - """Run stationary fit using 1D array & check results.""" - data, theta_i = example_da_gev_1d() - theta = fit_gev(data, stationary=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_user_estimates(): - """Run stationary fit using 1D array & user_estimates.""" - data, theta_i = example_da_gev_1d() - user_estimates = list(theta_i) - theta = fit_gev(data, stationary=True, user_estimates=user_estimates) # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_goodness(): - """Run stationary fit using 1D array & fit_goodness_test.""" - data, theta_i = example_da_gev_1d() - theta = fit_gev(data, stationary=True, test_fit_goodness=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_numpy(): - """Run stationary fit using 1D np.ndarray & check results.""" - data, theta_i = example_da_gev_1d() - data = data.values - theta = fit_gev(data, stationary=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_dask(): - """Run stationary fit using 1D dask array & check results.""" - data, theta_i = example_da_gev_1d_dask() - theta = fit_gev(data, stationary=True, core_dim="time") + npt.assert_allclose(dparams, dparams_i, rtol=rtol) + + +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +@pytest.mark.parametrize( + "fitstart", + [ + [1, -5, 1], + "LMM", + "scipy_fitstart", + "scipy", + "scipy_subset", + "xclim_fitstart", + "xclim", + ], +) +def test_fit_gev_1d_fitstart(example_da_gev, fitstart): + """Run stationary GEV fit using 1D array & fitstart method.""" + data, dparams_i = example_da_gev + dparams = fit_gev( + data, + stationary=True, + fitstart=fitstart, + assert_good_fit=False, + pick_best_model=False, + ) # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=rtol) -def test_fit_gev_3d(): - """Run stationary fit using 3D array & check results.""" - data, theta_i = example_da_gev_3d() - theta = fit_gev(data, stationary=True, core_dim="time") +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d_assert_good_fit(example_da_gev): + """Run stationary GEV fit using 1D array & fit_goodness_test.""" + data, dparams_i = example_da_gev + dparams = fit_gev(data, stationary=True, assert_good_fit=True) # 
Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=0.3) -def test_fit_gev_3d_dask(): - """Run stationary fit using 3D dask array & check results.""" - data, theta_i = example_da_gev_3d_dask() - theta = fit_gev(data, stationary=True, core_dim="time") +# todo FAILED unseen/tests/test_eva.py::test_fit_gev_3d[xarray] - AssertionError: +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_fit_gev_3d(example_da_gev_3d): + """Run stationary GEV fit using 3D array & check results.""" + data, dparams_i = example_da_gev_3d + dparams = fit_gev(data, stationary=True, fitstart="LMM", core_dim="time") # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=0.4) -def test_fit_ns_gev_1d(): - """Run non-stationary fit using 1D array & check results.""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_1d(example_da_gev): + """Run non-stationary GEV fit using 1D array & check results.""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, ) - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] > 0) # Positive trend in location -def test_fit_ns_gev_1d_loc_only(): - """Run non-stationary fit using 1D array (location parameter only).""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_1d_loc_only(example_da_gev): + """Run non-stationary GEV fit using 1D array (location parameter only).""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", @@ -193,17 +104,18 @@ def test_fit_ns_gev_1d_loc_only(): scale1=None, covariate=covariate, ) - assert np.all(theta[2] > 0) # Positive trend in location - assert np.all(theta[4] == 0) # No trend in scale + assert np.all(dparams[2] > 0) # Positive trend in location + assert np.all(dparams[4] == 0) # No trend in scale -def test_fit_ns_gev_1d_scale_only(): - """Run non-stationary fit using 1D array (scale parameter only).""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_scale_only(example_da_gev): + """Run non-stationary GEV fit using 1D array (scale parameter only).""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", @@ -211,158 +123,119 @@ def test_fit_ns_gev_1d_scale_only(): scale1=0, covariate=covariate, ) - assert np.all(theta[2] == 0) # No trend in location - assert np.all(theta[4] != 0) # Nonzero trend in scale - - -def test_fit_ns_gev_1d_dask(): - """Run non-stationary fit using 1D dask array & check results.""" - data, _ = example_da_gev_1d_dask() - # Add a positive linear trend - data = add_example_gev_trend(data) - covariate = 
np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] != 0) # Nonzero trend in scale -def test_fit_ns_gev_3d(): - """Run non-stationary fit using 3D array & check results.""" - data, _ = example_da_gev_3d() +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_3d(example_da_gev_3d): + """Run non-stationary GEV fit using 3D array & check results.""" + data, _ = example_da_gev_3d # Add a positive linear trend data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta.isel(theta=2) > 0) # Positive trend in location + covariate = xr.DataArray(np.arange(data.time.size), dims="time") + dparams = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") + assert np.all(dparams.isel(dparams=2) > 0) # Positive trend in location -def test_fit_ns_gev_1d_relative_fit_test_bic_trend(): - """Run non-stationary fit & check 'BIC' test returns nonstationary params.""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_pick_best_model_bic_trend(example_da_gev): + """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params.""" + data, _ = example_da_gev # Add a large positive linear trend data = add_example_gev_trend(data) data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, - relative_fit_test="bic", + pick_best_model="bic", ) - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] > 0) # Positive trend in location -def test_fit_ns_gev_1d_relative_fit_test_bic_no_trend(): - """Run non-stationary fit & check 'BIC' test returns stationary params.""" - data, _ = example_da_gev_1d() - covariate = np.arange(data.time.size, dtype=int) +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): + """Run non-stationary GEV fit & check 'BIC' test returns stationary params.""" + data, _ = example_da_gev + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, - relative_fit_test="bic", + pick_best_model="bic", ) - assert np.all(theta[2] == 0) # No trend in location - assert np.all(theta[4] == 0) # No trend in scale - - -def test_fit_ns_gev_3d_dask(): - """Run non-stationary fit using 3D dask array & check results.""" - data, _ = example_da_gev_3d_dask() - # Add a positive linear trend - data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta.isel(theta=2) > 0) # Positive trend in location + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] == 0) # No trend in scale -def test_fit_ns_gev_forecast(): - """Run non-stationary fit using stacked forecast dataArray.""" - data, _ = example_da_gev_forecast() - # Convert times to numerical timesteps - covariate = 
DataArray(date2num(data.time), coords={"sample": data.sample}) - # Add a positive linear trend - trend = covariate / 1e2 - data = data + trend - data = data.sortby(data.time) - covariate = covariate.sortby(data.time) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="sample") - assert np.all(theta[2] > 0) # Positive trend in location - - -def test_get_return_period(): +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_get_return_period(example_da_gev): """Run get_return_period for a single event using 1d data.""" - data, _ = example_da_gev_1d() - event = data.mean() - rp = get_return_period(event, data=data) - assert rp.size == 1 - assert np.all(np.isfinite(rp)) + data, _ = example_da_gev + event = np.mean(data) + ari = get_return_period(event, data=data) + assert ari.size == 1 + assert np.all(np.isfinite(ari)) -def test_get_return_period_1d(): - """Run get_return_period for 1d array of events using 1d data.""" - data, theta = example_da_gev_1d() - event = data.quantile([0.25, 0.5, 0.75], dim="time") - rp = get_return_period(event, theta) - assert rp.shape == event.shape - assert np.all(np.isfinite(rp)) - - -def test_get_return_period_3d(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_period_3d(example_da_gev_3d): """Run get_return_period for 3d array of events using 3d data.""" - data, theta = example_da_gev_3d() - theta = fit_gev(data, stationary=True) + data, dparams = example_da_gev_3d + dparams = fit_gev(data, stationary=True) # Multiple events unique to each lat/lon event = data.quantile([0.25, 0.5, 0.75], dim="time") - rp = get_return_period(event, theta) - assert rp.shape == event.shape - assert np.all(np.isfinite(rp)) + ari = get_return_period(event, dparams) + assert ari.shape == event.shape + assert np.all(np.isfinite(ari)) -def test_get_return_period_3d_nonstationary(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_period_3d_nonstationary(example_da_gev_3d): """Run get_return_period for 3d events using 3d nonstationary data.""" - data, _ = example_da_gev_3d() + data, _ = example_da_gev_3d data = add_example_gev_trend(data) - covariate = DataArray(np.arange(data.time.size), dims="time") + covariate = xr.DataArray(np.arange(data.time.size), dims="time") params = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") # Multiple events unique to each lat/lon event = data.quantile([0.25, 0.5, 0.75], dim="time") - covariate_subset = DataArray([0, covariate.size], dims="time") - rp = get_return_period(event, params, covariate=covariate_subset) - assert rp.shape == (*list(event.shape), covariate_subset.size) - assert np.all(np.isfinite(rp)) - - -def test_get_return_level(): - """Run get_return_level for a single return_period using 1d data.""" - _, theta = example_da_gev_1d() - rp = 100 - return_level = get_return_level(rp, theta) - assert return_level.size == 1 - assert np.all(np.isfinite(return_level)) + covariate_subset = xr.DataArray([0, covariate.size], dims="time") + ari = get_return_period(event, params, covariate=covariate_subset) + assert ari.shape == (*list(event.shape), covariate_subset.size) + assert np.all(np.isfinite(ari)) -def test_get_return_level_1d(): +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +@pytest.mark.parametrize("ari", [100, np.array([10, 100, 1000])]) +def test_get_return_level(example_da_gev, ari): """Run 
get_return_level for 1d array of periods using 1d data.""" - _, theta = example_da_gev_1d() - rp = np.array([10, 100, 1000]) - return_level = get_return_level(rp, theta) - assert return_level.shape == rp.shape + _, dparams = example_da_gev + return_level = get_return_level(ari, dparams) + if isinstance(ari, int): + assert return_level.size == 1 + else: + assert return_level.shape == ari.shape assert np.all(np.isfinite(return_level)) -def test_get_return_level_3d(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_level_3d(example_da_gev_3d): """Run get_return_level for 3d array of periods using 3d data.""" - data, theta = example_da_gev_3d() - theta = fit_gev(data, stationary=True) + data, dparams = example_da_gev_3d + dparams = fit_gev(data, stationary=True, core_dim="time") + # Multiple events unique to each lat/lon - dims = ("return_period", "lat", "lon") - rp = np.array([10, 100, 1000] * 4).T - rp = DataArray(rp.reshape((3, 2, 2)), dims=dims) - return_level = get_return_level(rp, theta) - assert return_level.shape == rp.shape + dims = ("lat", "lon", "return_period") + ari = np.array([10, 100, 1000] * 4).T + ari = xr.DataArray(ari.reshape(dparams.shape), dims=dims) + return_level = get_return_level(ari, dparams) + + assert return_level.shape == ari.shape assert np.all(np.isfinite(return_level)) From e2c6482efccb16f247ab166bf91ff93065a0022b Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:08:05 +1100 Subject: [PATCH 2/4] Update fit_gev goodness of fit and retry and add draft of gev_confidence_interval --- unseen/eva.py | 278 +++++++++++++++++++++++++++++---------- unseen/tests/test_eva.py | 61 +++++---- 2 files changed, 235 insertions(+), 104 deletions(-) diff --git a/unseen/eva.py b/unseen/eva.py index 5a745bd..3367883 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -8,7 +8,7 @@ from matplotlib.ticker import AutoMinorLocator import numpy as np from scipy.optimize import minimize -from scipy.stats import genextreme, goodness_of_fit +from scipy.stats import genextreme, ks_1samp, cramervonmises from scipy.stats.distributions import chi2 import warnings from xarray import apply_ufunc, DataArray @@ -64,12 +64,14 @@ def fit_gev( fitstart="LMM", loc1=0, scale1=0, + retry_fit=False, assert_good_fit=False, pick_best_model=False, alpha=0.05, method="Nelder-Mead", + goodness_of_fit_kwargs=dict(test="ks"), ): - """Estimate stationary or nonstationary GEV distribution parameters. + """Estimate stationary or nonstationary GEV distributionß parameters. Parameters ---------- @@ -86,9 +88,11 @@ def fit_gev( Initial guess method/estimate of the shape, loc and scale parameters loc1, scale1 : float or None, default 0 Initial guess of trend parameters. If None, the trend is fixed at zero + retry_fit : bool, default True + Retry fit using a fitstart(data[::2]) if the fit does not pass the + goodness of fit test (p-value > alpha). assert_good_fit : bool, default False - Stationary parameters must pass goodness of fit test at `alpha` level. - Attempt a retry and return NaNs if the test fails again. + Return NaNs if data fails a GEV goodness of fit test at `alpha` level. pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False Method to test relative fit of stationary and nonstationary models. Do not use if you don't want nonstationary parameters. 
The output will @@ -98,6 +102,8 @@ def fit_gev( method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'COBYLA'}, default 'Nelder-Mead' Optimization method for nonstationary fit + goodness_of_fit_kwargs : dict, optional + Additional keyword arguments to pass to `check_gev_fit` Returns ------- @@ -115,51 +121,35 @@ def fit_gev( - For stationary data the parameters are estimated using `scipy.stats.genextreme.fit`. - For nonstationary data, the parameters (including the linear location and - scale trend parameters are estimated by minimising - a penalised negative log-likelihood function. - - The `assert_good_fit` option ensures that the distribution fit is - accepted if the goodness of fit test `p-value > alpha` (i.e., accept - the null hypothesis). It will retry the fit using data[::2] to generate - an initial guess. + scale trend parameters are estimated by minimising a penalised negative + log-likelihood function. - The `covariate` must be numeric and have dimensions aligned with `data`. - If `pick_best_model` is a method, the relative goodness of fit method is used to determine if stationary or nonstationary parameters are returned. + - `assert_good_fit`: Return NaNs if the goodness of fit null hypothesis is + rejected (i.e., `p-value <= alpha`). + - `retry_fit`: retry the fit using data[::2] to generate an initial + guess (same fitstart method). + """ - kwargs = {k: v for k, v in locals().items() if k not in ["data", "covariate"]} - - def _assert_good_fit_1d(data, dparams, alpha, fit_kwargs): - """Test goodness of stationary GEV fit and retry if failed.""" - pvalue = check_gev_fit(data, dparams) - - if np.all(pvalue < alpha): - # Retry fit using alternative fitstart methods - warnings.warn("GEV fit failed. Retrying fitstart with data subset.") - _kwargs = fit_kwargs.copy() - _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) - _kwargs["stationary"] = True - dparams = _fit_1d(data, covariate, **_kwargs) - pvalue = check_gev_fit(data, dparams) - - # Return NaNs if the test still fails - if np.all(pvalue < alpha): - # Return NaNs - dparams = dparams * np.nan - warnings.warn("Data fit failed.") - return dparams + kwargs = { + k: v for k, v in locals().items() if k not in ["data", "covariate", "core_dim"] + } def _fit_1d( data, covariate, stationary, fitstart, - core_dim, loc1, scale1, + retry_fit, assert_good_fit, pick_best_model, alpha, method, + goodness_of_fit_kwargs, ): """Estimate distribution parameters.""" if np.all(~np.isfinite(data)): @@ -183,16 +173,38 @@ def _fit_1d( # Use genextreme to get stationary distribution parameters if stationary or pick_best_model: - if dparams_i is None: - dparams_i = genextreme.fit(data) + if fitstart is None: + dparams = genextreme.fit(data) else: dparams = genextreme.fit( data, dparams_i[0], loc=dparams_i[1], scale=dparams_i[2] ) dparams = np.array([i for i in dparams], dtype="float64") - if assert_good_fit: - dparams = _assert_good_fit_1d(data, dparams, alpha, kwargs) + if retry_fit or assert_good_fit: + pvalue = check_gev_fit(data, dparams, **goodness_of_fit_kwargs) + + if retry_fit and np.all(pvalue <= alpha): + # Retry fit using alternative fitstart methods + _kwargs = kwargs.copy() + _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) + _kwargs["stationary"] = True + for k in ["retry_fit", "assert_good_fit", "pick_best_model"]: + _kwargs[k] = False # Avoids recursion + dparams_alt = _fit_1d(data, covariate, **_kwargs) + + # Test if the alternative fit is better + L1 = _gev_nllf([-dparams[0], *dparams[1:]], data) 
+ L2 = _gev_nllf([-dparams_alt[0], *dparams_alt[1:]], data) + if L2 < L1: + dparams = dparams_alt + pvalue = check_gev_fit(data, dparams, **goodness_of_fit_kwargs) + warnings.warn("Better fit estimate using data[::2].") + + if assert_good_fit and pvalue <= alpha: + # Return NaNs + dparams = dparams * np.nan + warnings.warn("Data fit failed.") if not stationary or pick_best_model: # Temporarily reverse shape sign (scipy uses different sign convention) @@ -210,7 +222,7 @@ def _fit_1d( # Minimise the negative log-likelihood function to get optimal dparams res = minimize( - nllf, + _gev_nllf, dparams_ns_i, args=(data, covariate), method=method, @@ -300,8 +312,8 @@ def penalised_sum(x): return total + penalty -def nllf(dparams, x, covariate=None): - """Penalised negative log-likelihood function. +def _gev_nllf(dparams, x, covariate=None): + """GEV penalised negative log-likelihood function. Parameters ---------- @@ -389,6 +401,16 @@ def _fitstart_1d(data, method): ----- - Use `scipy_fitstart` to reproduce the scipy fit in `fit_gev`. - The LMM shape sign is reversed for consistency with scipy.stats results. + scipy_fitstart: + >>> shape = skew(data) / 2 + >>> scale = np.std(data) / np.sqrt(6) + >>> location = np.mean(data) - (scale * (0.5772 + np.log(2))) + >>> dparams_i = (-shape, location, scale) # + or - shape? + + xclim_fitstart: + >>> scale = np.sqrt(6 * np.var(data)) / np.pi + >>> location = np.mean(data) - 0.57722 * scale + >>> dparams_i = [-0.1, location, scale] """ if method == "LMM": @@ -464,8 +486,8 @@ def _format_covariate(data, covariate, core_dim): return covariate -def check_gev_fit(data, dparams, core_dim=[], **kwargs): - """Test stationary GEV distribution goodness of fit. +def check_gev_fit(data, dparams, core_dim=[], test="ks", **kwargs): + """Perform a goodness of fit of GEV distribution with the given parameters. Parameters ---------- @@ -475,24 +497,35 @@ def check_gev_fit(data, dparams, core_dim=[], **kwargs): Shape, location and scale parameters core_dim : str, optional Data dimension to test over + test : {'ks', 'cvm'}, default 'ks' + Test to use for goodness of fit kwargs : dict, optional - Additional keyword arguments to pass to `goodness_of_fit`. + Additional keyword arguments to pass to the stats function. Returns ------- - pvalue : scipy.stats._fit.GoodnessOfFitResult.pvalue + pvalue : float Goodness of fit p-value + + Notes + ----- + - CvM is more likely to detect small discrepancies that may not matter + practically in large datasets. 
""" + stats_func = { + "ks": ks_1samp, + "cvm": cramervonmises, + } - def _goodness_of_fit(data, dparams, **kwargs): + def _fit_test_genextreme(data, dparams, **kwargs): """Test GEV goodness of fit.""" # Stationary parameters - shape, loc, scale = dparams + c, loc, scale = dparams - res = goodness_of_fit( - genextreme, + res = stats_func[test]( data, - known_params=dict(c=shape, loc=loc, scale=scale), + genextreme.cdf, + args=(c, loc, scale), **kwargs, ) return res.pvalue @@ -501,7 +534,7 @@ def _goodness_of_fit(data, dparams, **kwargs): core_dim = [core_dim] pvalue = apply_ufunc( - _goodness_of_fit, + _fit_test_genextreme, data, dparams, input_core_dims=[core_dim, ["dparams"]], @@ -565,8 +598,8 @@ def get_best_GEV_model_1d(data, dparams, dparams_ns, covariate, alpha, test): shape, loc, scale = dparams # Negative log-likelihood of stationary and nonstationary models - L1 = nllf([-shape, loc, scale], data) - L2 = nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) + L1 = _gev_nllf([-shape, loc, scale], data) + L2 = _gev_nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) result = check_gev_relative_fit(data, L1, L2, test=test, alpha=alpha) if not result: @@ -701,6 +734,104 @@ def get_return_level(return_period, dparams=None, covariate=None, **kwargs): return return_level +def gev_confidence_interval( + data, + dparams=None, + return_period=None, + return_level=None, + bootstrap_method="non-parametric", + n_resamples=1000, + ci=0.95, + core_dim="time", + fit_kwargs={}, +): + """ + Bootstrapped confidence intervals for return periods or return levels. + + Parameters: + ----------- + + data : xarray.DataArray + Input data to fit GEV distribution + dparams : xarray.DataArray, optional + GEV distribution parameters. If None, the parameters are estimated. + return_period : float or xarray.DataArray, default None + Return period(s). Mutually exclusive with `return_level`. + return_level : float or xarray.DataArray, default None + Return level(s) Mutually exclusive with `return_period`. + bootstrap_method : {'parametric', 'non-parametric'}, default 'non-parametric' + Bootstrap method to use for resampling + n_resamples : int, optional + Number of bootstrap resamples to perform (default: 1000) + ci : float, optional + Confidence level (e.g., 0.95 for 95% confidence interval, default: 0.95) + core_dim : str, optional + The core dimension along which to apply GEV fitting (default: None, will auto-detect) + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` + + Returns: + -------- + ci_bounds : xarray.DataArray + Confidence intervals with lower and upper bounds along dim 'quantile' + """ + # todo: max_shape_ratio + # Replace core dim with the one from the fit_kwargs if it exists + core_dim = fit_kwargs.pop("core_dim", core_dim) + + rng = np.random.default_rng(seed=0) + if dparams is None: + dparams = fit_gev(data, core_dim=core_dim, **fit_kwargs) + shape, loc, scale = unpack_gev_params(dparams) + + # Generate random indices for resampling + if bootstrap_method == "parametric": + boot_data = apply_ufunc( + genextreme.rvs, + shape, + loc, + scale, + input_core_dims=[[], [], []], + output_core_dims=[["k", core_dim]], + kwargs=dict(size=(n_resamples, data[core_dim].size)), + vectorize=True, + dask="parallelized", + ) + boot_data = boot_data.transpose("k", core_dim, ...) 
+
+    elif bootstrap_method == "non-parametric":
+        resample_indices = rng.integers(
+            0, data[core_dim].size, (n_resamples, data[core_dim].size)
+        )
+        indexer = DataArray(resample_indices, dims=("k", core_dim))
+        boot_data = data.isel({core_dim: indexer})
+
+    # Fit GEV parameters to resampled data
+    gev_params_resampled = fit_gev(boot_data, core_dim=core_dim, **fit_kwargs)
+
+    if return_period is not None:
+        result = get_return_level(
+            return_period, gev_params_resampled, core_dim=core_dim
+        )
+    elif return_level is not None:
+        result = get_return_period(
+            return_level, gev_params_resampled, core_dim=core_dim
+        )
+
+    # Bounds of confidence intervals
+    ci = ci * 100  # Avoid rounding errors
+    q = (100 - ci) * 0.5 / 100
+
+    # Calculate confidence intervals from resampled percentiles
+    ci_bounds = result.quantile([q, 1 - q], dim="k")
+
+    ci_bounds.attrs = {
+        "long_name": "Confidence interval",
+        "description": f"{ci:g}% confidence interval ({n_resamples} resamples)",
+    }
+    return ci_bounds
+
+
 def gev_return_curve(
     data,
     event_value,
@@ -708,6 +839,7 @@ def gev_return_curve(
     n_bootstraps=1000,
     max_return_period=4,
     max_shape_ratio=None,
+    ci=0.95,
     **fit_kwargs,
 ):
     """Return x and y data for a GEV return period curve.
@@ -733,7 +865,9 @@ def gev_return_curve(
     dparams = fit_gev(data, **fit_kwargs)
     shape, loc, scale = unpack_gev_params(dparams)
 
-    curve_return_periods = np.logspace(0, max_return_period, num=10000)
+    curve_return_periods = DataArray(
+        np.logspace(0, max_return_period, num=10000), dims="ari"
+    )
     curve_probabilities = 1.0 / curve_return_periods
     curve_values = genextreme.isf(curve_probabilities, shape, loc, scale)
 
@@ -748,24 +882,21 @@ def gev_return_curve(
             boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data))
         elif bootstrap_method == "non-parametric":
             boot_data = rng.choice(data, size=data.shape, replace=True)
-        boot_shape, boot_loc, boot_scale = fit_gev(boot_data, fitstart="scipy_subet")
+
+        boot_dparams = fit_gev(boot_data, **fit_kwargs)
         if max_shape_ratio:
-            shape_ratio = abs(boot_shape) / abs(shape)
+            shape_ratio = abs(boot_dparams[0]) / abs(shape)
             if shape_ratio > max_shape_ratio:
                 continue
-        boot_value = genextreme.isf(
-            curve_probabilities, boot_shape, boot_loc, boot_scale
-        )
-        boot_values = np.vstack((boot_values, boot_value))
 
-        boot_event_probability = genextreme.sf(
-            event_value, boot_shape, loc=boot_loc, scale=boot_scale
-        )
-        boot_event_return_period = 1.0 / boot_event_probability
+        boot_value = get_return_level(curve_return_periods, boot_dparams)
+        boot_values = np.vstack((boot_values, boot_value))
+        boot_event_return_period = get_return_period(event_value, boot_dparams)
         boot_event_return_periods.append(boot_event_return_period)
 
-    curve_values_lower_ci = np.quantile(boot_values, 0.025, axis=0)
-    curve_values_upper_ci = np.quantile(boot_values, 0.975, axis=0)
+    q = (100 - ci * 100) * 0.5 / 100  # Quantile for lower and upper bounds
+    curve_values_lower_ci = np.quantile(boot_values, q, axis=0)
+    curve_values_upper_ci = np.quantile(boot_values, 1 - q, axis=0)
     curve_data = (
         curve_return_periods,
         curve_values,
@@ -777,8 +908,8 @@ def gev_return_curve(
     boot_event_return_periods = boot_event_return_periods[
         np.isfinite(boot_event_return_periods)
     ]
-    event_return_period_lower_ci = np.quantile(boot_event_return_periods, 0.025)
-    event_return_period_upper_ci = np.quantile(boot_event_return_periods, 0.975)
+    event_return_period_lower_ci = np.quantile(boot_event_return_periods, q)
+    event_return_period_upper_ci = np.quantile(boot_event_return_periods, 1 - q)
     event_data = (
         event_return_period,
         event_return_period_lower_ci,
         event_return_period_upper_ci,
     )
 
     return curve_data, event_data
 
@@ -1180,11 +1311,17 @@ def _parse_command_line():
         ),
         help="Initial guess method (or estimate) of the GEV parameters",
     )
+    parser.add_argument(
+        "--retry_fit",
+        action="store_true",
+        default=False,
+        help="Return NaNs if fit doesn't pass the goodness of fit test",
+    )
     parser.add_argument(
         "--assert_good_fit",
         action="store_true",
         default=False,
-        help="Test fit goodness",
+        help="Return NaNs if fit doesn't pass the goodness of fit test",
     )
     parser.add_argument(
         "--pick_best_model",
@@ -1216,12 +1353,6 @@ def _parse_command_line():
         action=general_utils.store_dict,
         help="Minimum lead time file",
     )
-    # parser.add_argument(
-    #     "--confidence_interval",
-    #     type=float,
-    #     default=0.95,
-    #     help="Confidence interval e.g., --confidence_interval 0.95",
-    # )
     parser.add_argument(
         "--ensemble_dim",
         type=str,
@@ -1303,6 +1434,7 @@ def _main():
         stationary=args.stationary,
         fitstart=args.fitstart,
         covariate=covariate,
+        retry_fit=args.retry_fit,
         assert_good_fit=args.assert_good_fit,
         pick_best_model=args.pick_best_model,
     )
diff --git a/unseen/tests/test_eva.py b/unseen/tests/test_eva.py
index d30f13e..e436f3c 100644
--- a/unseen/tests/test_eva.py
+++ b/unseen/tests/test_eva.py
@@ -32,6 +32,7 @@
     "fitstart",
     [
         [1, -5, 1],
+        None,
         "LMM",
         "scipy_fitstart",
         "scipy",
@@ -54,23 +55,13 @@ def test_fit_gev_1d_fitstart(example_da_gev, fitstart):
     npt.assert_allclose(dparams, dparams_i, rtol=rtol)
 
 
-@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True)
-def test_fit_gev_1d_assert_good_fit(example_da_gev):
-    """Run stationary GEV fit using 1D array & fit_goodness_test."""
-    data, dparams_i = example_da_gev
-    dparams = fit_gev(data, stationary=True, assert_good_fit=True)
-    # Check fitted params match params used to create data
-    npt.assert_allclose(dparams, dparams_i, rtol=0.3)
-
-
-# todo FAILED unseen/tests/test_eva.py::test_fit_gev_3d[xarray] - AssertionError:
 @pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True)
 def test_fit_gev_3d(example_da_gev_3d):
     """Run stationary GEV fit using 3D array & check results."""
     data, dparams_i = example_da_gev_3d
     dparams = fit_gev(data, stationary=True, fitstart="LMM", core_dim="time")
     # Check fitted params match params used to create data
-    npt.assert_allclose(dparams, dparams_i, rtol=0.4)
+    npt.assert_allclose(dparams, dparams_i, rtol=rtol)
 
 
 @pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True)
@@ -138,29 +129,34 @@ def test_fit_ns_gev_3d(example_da_gev_3d):
     assert np.all(dparams.isel(dparams=2) > 0)  # Positive trend in location
 
 
-@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True)
-def test_fit_ns_gev_1d_pick_best_model_bic_trend(example_da_gev):
-    """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params."""
-    data, _ = example_da_gev
-    # Add a large positive linear trend
-    data = add_example_gev_trend(data)
-    data = add_example_gev_trend(data)
-    covariate = xr.DataArray(np.arange(data.time.size), dims="time")
+@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True)
+def test_fit_gev_1d_retry_fit(example_da_gev):
+    """Run stationary GEV fit using 1D array & retry_fit."""
+    data, dparams_i = example_da_gev
+    # Set large alpha to force any fit considered bad
+    dparams = fit_gev(data, stationary=True, retry_fit=True, alpha=1)
+    # Check fitted params match params used to create data
+    npt.assert_allclose(dparams,
dparams_i, rtol=rtol) - dparams = fit_gev( - data, - stationary=False, - core_dim="time", - covariate=covariate, - pick_best_model="bic", - ) - assert np.all(dparams[2] > 0) # Positive trend in location + +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d_assert_good_fit(example_da_gev): + """Run stationary GEV fit using 1D array & assert_good_fit.""" + data, _ = example_da_gev + # Set large alpha to force any fit considered bad + dparams = fit_gev(data, stationary=True, assert_good_fit=True, alpha=1) + assert all(np.isnan(dparams)) @pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) -def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): - """Run non-stationary GEV fit & check 'BIC' test returns stationary params.""" +@pytest.mark.parametrize("trend", [False, True]) +def test_fit_ns_gev_1d_pick_best_model_bic(example_da_gev, trend): + """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params.""" data, _ = example_da_gev + if trend: + # Add a large positive linear trend + data = add_example_gev_trend(data) + data = add_example_gev_trend(data) covariate = xr.DataArray(np.arange(data.time.size), dims="time") dparams = fit_gev( @@ -170,8 +166,11 @@ def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): covariate=covariate, pick_best_model="bic", ) - assert np.all(dparams[2] == 0) # No trend in location - assert np.all(dparams[4] == 0) # No trend in scale + if trend: + assert np.all(dparams[2] > 0) # Positive trend in location + else: + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] == 0) # No trend in scale @pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) From 123ac98a4d1c89b0add4501637672f2b20091c36 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:33:57 +1100 Subject: [PATCH 3/4] Update eva.py and timeseries plot --- unseen/eva.py | 127 +++++++++++++++++++++++++--------------- unseen/general_utils.py | 7 ++- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/unseen/eva.py b/unseen/eva.py index 3367883..18d6295 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -64,7 +64,7 @@ def fit_gev( fitstart="LMM", loc1=0, scale1=0, - retry_fit=False, + retry_fit=True, assert_good_fit=False, pick_best_model=False, alpha=0.05, @@ -83,20 +83,21 @@ def fit_gev( Fit as a stationary GEV using `fit_stationary_gev` covariate : array_like, optional A nonstationary covariate array with the same `core_dim` as `data` - fitstart : {array-like, 'LMM', 'MM', 'scipy', 'scipy_fitstart', + fitstart : {array-like, 'LMM', 'scipy_fitstart', 'scipy', 'scipy_subset', 'xclim_fitstart', 'xclim'}, default 'scipy_fitstart' Initial guess method/estimate of the shape, loc and scale parameters loc1, scale1 : float or None, default 0 Initial guess of trend parameters. If None, the trend is fixed at zero retry_fit : bool, default True - Retry fit using a fitstart(data[::2]) if the fit does not pass the - goodness of fit test (p-value > alpha). + Retry fit with different initial estimate if the fit does not pass the + goodness of fit test at the `alpha` level. assert_good_fit : bool, default False Return NaNs if data fails a GEV goodness of fit test at `alpha` level. + Mutually exclusive with `stationary=False` and `pick_best_model`. pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False Method to test relative fit of stationary and nonstationary models. 
- Do not use if you don't want nonstationary parameters. The output will - have GEV 5 parameters even if stationary is True. + The output will have GEV 5 parameters even if stationary is True. + Mutually exclusive with `stationary` and/or `assert_good_fit`. alpha : float, default 0.05 Fit test p-value threshold for stationary fit (relative/goodness of fit) method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', @@ -116,23 +117,23 @@ def fit_gev( Notes ----- - Use `unpack_gev_params` to get the shape, location and scale parameters - as a separate array. If nonstationary, the output will still be three + as a separate array. If nonstationary, the output will also have three parameters that have an extra covariate dimension. - For stationary data the parameters are estimated using - `scipy.stats.genextreme.fit`. + `scipy.stats.genextreme.fit` with initial guess based on `fitstart` (use + fitstart 'scipy_fitstart' or None to use scipy defaults). - For nonstationary data, the parameters (including the linear location and - scale trend parameters are estimated by minimising a penalised negative + scale trend parameters) are estimated by minimising a penalised negative log-likelihood function. - The `covariate` must be numeric and have dimensions aligned with `data`. - - If `pick_best_model` is a method, the relative goodness of fit method is - used to determine if stationary or nonstationary parameters are returned. - - `assert_good_fit`: Return NaNs if the goodness of fit null hypothesis is - rejected (i.e., `p-value <= alpha`). - `retry_fit`: retry the fit using data[::2] to generate an initial guess (same fitstart method). - - + - If `pick_best_model` is a method, the relative goodness of fit method is + used to determine if stationary or nonstationary parameters are returned. + If the stationary fit is better, the nonstationary parameters are returned + with zero trends (see `check_gev_relative_fit` and `get_best_GEV_model_1d`). """ + kwargs = { k: v for k, v in locals().items() if k not in ["data", "covariate", "core_dim"] } @@ -206,7 +207,7 @@ def _fit_1d( dparams = dparams * np.nan warnings.warn("Data fit failed.") - if not stationary or pick_best_model: + if not stationary: # Temporarily reverse shape sign (scipy uses different sign convention) dparams_ns_i = [-dparams_i[0], dparams_i[1], loc1, dparams_i[2], scale1] @@ -242,6 +243,13 @@ def _fit_1d( return dparams + if stationary and pick_best_model: + raise ValueError( + f"Stationary must be false if pick_best_model={pick_best_model}." 
+ ) + if assert_good_fit and pick_best_model: + raise ValueError("pick_best_model and assert_good_fit are mutually exclusive.") + if covariate is not None: covariate = _format_covariate(data, covariate, core_dim) else: @@ -255,7 +263,7 @@ def _fit_1d( # Covariate is a 1D array input_core_dims = [[core_dim], []] - n_params = 5 if (not stationary or pick_best_model) else 3 + n_params = 3 if stationary else 5 # Fit data to distribution parameters dparams = apply_ufunc( _fit_1d, @@ -269,8 +277,9 @@ def _fit_1d( output_dtypes=["float64"], dask_gufunc_kwargs={"output_sizes": {"dparams": n_params}}, ) + + # Format output (consistent with xclim) if isinstance(data, DataArray): - # Format output (consistent with xclim) if n_params == 3: dparams.coords["dparams"] = ["c", "loc", "scale"] else: @@ -417,7 +426,6 @@ def _fitstart_1d(data, method): # L-moments method dparams_i = distr.gev.lmom_fit(data) dparams_i = list(dparams_i.values()) - dparams_i[0] = -dparams_i[0] elif method == "scipy_fitstart": # Moments method? @@ -734,6 +742,28 @@ def get_return_level(return_period, dparams=None, covariate=None, **kwargs): return return_level +def aep_to_ari(aep): + """Convert from aep (%) to ari (years) + + Details: http://www.bom.gov.au/water/designRainfalls/ifd-arr87/glossary.shtml + Stolen from https://github.com/climate-innovation-hub/frequency-analysis/blob/master/eva.py + """ + + assert aep < 100, "aep to be expressed as a percentage (must be < 100)" + aep = aep / 100 + + return 1 / (-np.log(1 - aep)) + + +def ari_to_aep(ari): + """Convert from ari (years) to aep (%) + + Details: http://www.bom.gov.au/water/designRainfalls/ifd-arr87/glossary.shtml + Stolen from https://github.com/climate-innovation-hub/frequency-analysis/blob/master/eva.py + """ + return ((np.exp(1 / ari) - 1) / np.exp(1 / ari)) * 100 + + def gev_confidence_interval( data, dparams=None, @@ -775,7 +805,7 @@ def gev_confidence_interval( ci_bounds : xarray.DataArray Confidence intervals with lower and upper bounds along dim 'quantile' """ - # todo: max_shape_ratio + # todo: add max_shape_ratio # Replace core dim with the one from the fit_kwargs if it exists core_dim = fit_kwargs.pop("core_dim", core_dim) @@ -800,6 +830,7 @@ def gev_confidence_interval( boot_data = boot_data.transpose("k", core_dim, ...) 
elif bootstrap_method == "non-parametric": + # todo: replace with rng.choice resample_indices = rng.integers( 0, data[core_dim].size, (n_resamples, data[core_dim].size) ) @@ -1288,14 +1319,20 @@ def _parse_command_line(): type=str, nargs="*", default=["ensemble", "init_date", "lead_time"], - help="Dimensions to stack", + help="Dimensions to stack", # todo: test this ) parser.add_argument("--core_dim", type=str, default="time", help="Core dimension") parser.add_argument( "--stationary", - type=bool, + action="store_true", default=True, - help="Fit nonstationary GEV distribution", + help="Fit stationary GEV distribution", + ) + parser.add_argument( + "--nonstationary", + action="store_true", + default=False, + help="Fit non-stationary GEV distribution", ) parser.add_argument( "--fitstart", @@ -1307,7 +1344,7 @@ def _parse_command_line(): "scipy_subset", "xclim_MLE", "xclim", - ["shape", "loc", "scale"], + ["shape", "loc", "scale"], # todo: test this ), help="Initial guess method (or estimate) of the GEV parameters", ) @@ -1315,7 +1352,7 @@ def _parse_command_line(): "--retry_fit", action="store_true", default=False, - help="Return NaNs if fit doesn't pass the goodness of fit test", + help="Retry fit if it doesn't pass the goodness of fit test", ) parser.add_argument( "--assert_good_fit", @@ -1325,8 +1362,7 @@ def _parse_command_line(): ) parser.add_argument( "--pick_best_model", - type=str, - default=None, + default=False, help="Relative fit test to pick stationary or nonstationary parameters", ) parser.add_argument( @@ -1341,7 +1377,7 @@ def _parse_command_line(): ) parser.add_argument( "--covariate_file", type=str, default=None, help="Covariate file" - ) + ) # todo: test this parser.add_argument( "--min_lead", default=None, help="Minimum lead time (int or filename)" ) @@ -1351,7 +1387,7 @@ def _parse_command_line(): nargs="*", default={}, action=general_utils.store_dict, - help="Minimum lead time file", + help="Keyword arguments for opening min_lead file", ) parser.add_argument( "--ensemble_dim", @@ -1372,33 +1408,30 @@ def _parse_command_line(): help="Name of lead time dimension", ) parser.add_argument( - "--output_chunks", + "--file_kwargs", type=str, nargs="*", - action=general_utils.store_dict, default={}, - help="Output chunks", - ) - parser.add_argument( - "--dask_config", type=str, help="YAML file specifying dask client configuration" + action=general_utils.store_dict, + help="Keyword arguments for opening the data file", ) args = parser.parse_args() - return args def _main(): - """Run the command line program.""" + """Run the command line program to save GEV distribution parameters.""" args = _parse_command_line() + args.stationary = False if args.nonstationary else True - ds = fileio.open_dataset(args.file, variables=[args.var]) + ds = fileio.open_dataset(args.file, **args.file_kwargs) if args.covariate_file is not None: - # Add covariate to dataset (to ensure all operations are aligned) ds_covariate = fileio.open_dataset( args.covariate_file, variables=[args.covariate] ) + # Add covariate to dataset (to ensure all operations are aligned) ds[args.covariate] = ds_covariate[args.covariate] # Filter data by reference time period @@ -1420,10 +1453,10 @@ def _main(): # Stack dimensions along new "sample" dimension if all([dim in ds[args.var].dims for dim in args.stack_dims]): - ds = ds.stack(**{"sample": args.stack_dims}) + ds = ds.stack(**{"sample": args.stack_dims}, create_index=False) args.core_dim = "sample" - if not args.stationary: + if args.nonstationary: covariate = 
_format_covariate(ds[args.var], ds[args.covariate], args.core_dim) else: covariate = None @@ -1440,20 +1473,20 @@ def _main(): ) # Format outfile - dparams = dparams.to_dataset() + dparams = dparams.to_dataset(name=args.var) # Add the covariate variable - if not args.stationary or args.pick_best_model: - dparams[args.covariate] = covariate + if args.nonstationary: + dparams["covariate"] = covariate + # Add metadata + dparams.attrs = ds.attrs infile_logs = {args.file: ds.attrs["history"]} if isinstance(args.min_lead, str): infile_logs[args.min_lead] = ds_min_lead.attrs["history"] dparams.attrs["history"] = fileio.get_new_log(infile_logs=infile_logs) - if args.output_chunks: - dparams = dparams.chunk(args.output_chunks) - + # Save to file if "zarr" in args.outfile: fileio.to_zarr(dparams, args.outfile) else: diff --git a/unseen/general_utils.py b/unseen/general_utils.py index a6f42db..6c84895 100644 --- a/unseen/general_utils.py +++ b/unseen/general_utils.py @@ -164,12 +164,10 @@ def plot_timeseries_scatter( units = "" if ax is None: - fig, ax = plt.subplots(1, 1, figsize=(10, 4)) + fig, ax = plt.subplots(1, 1, figsize=(12, 5)) if title is not None: ax.set_title(title, loc="left") - # Plot ensemble data - ax.scatter(da[time_dim], da, s=3, c="lightskyblue", label=label) # Plot observed data if da_obs is not None: ax.scatter( @@ -179,7 +177,10 @@ def plot_timeseries_scatter( c="k", marker="x", label=obs_label, + zorder=10, ) + # Plot ensemble data + ax.scatter(da[time_dim], da, s=5, c="deepskyblue", label=label) ax.set_ylabel(units) ax.set_xmargin(1e-2) From 27f7e092d3693b6c81db2b41ebd76b055ff0f639 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:35:44 +1100 Subject: [PATCH 4/4] Update unseen/eva.py Co-authored-by: Damien Irving --- unseen/eva.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unseen/eva.py b/unseen/eva.py index 18d6295..940f9c0 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -71,7 +71,7 @@ def fit_gev( method="Nelder-Mead", goodness_of_fit_kwargs=dict(test="ks"), ): - """Estimate stationary or nonstationary GEV distributionß parameters. + """Estimate stationary or nonstationary GEV distribution parameters. Parameters ----------
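
For reference, a minimal usage sketch of the `eva` API as modified by this patch series (assumes a build of `unseen` with these patches applied; the synthetic GEV parameters, sample size and return periods below are arbitrary and for illustration only):

    import xarray as xr
    from scipy.stats import genextreme
    from unseen import eva

    # Synthetic annual-maximum style data drawn from a known GEV distribution
    data = xr.DataArray(
        genextreme.rvs(0.2, loc=5, scale=2, size=500, random_state=0), dims="time"
    )

    # Stationary fit with an L-moments initial guess (fitstart="LMM" is the default)
    dparams = eva.fit_gev(data, stationary=True, fitstart="LMM", core_dim="time")

    # Return level of a 1-in-100-year event and return period of the sample maximum
    level_100 = eva.get_return_level(100, dparams)
    ari_max = eva.get_return_period(data.max(), dparams)

    # Bootstrapped 95% confidence interval for the 100-year return level
    ci_bounds = eva.gev_confidence_interval(
        data, dparams=dparams, return_period=100, n_resamples=1000, core_dim="time"
    )

    # AEP/ARI conversion helpers added in patch 3: a 63.21% AEP corresponds to a
    # ~1-year ARI, since 1 / -ln(1 - 0.6321) is approximately 1.0
    print(eva.aep_to_ari(63.21), eva.ari_to_aep(1.0))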