From 0db4f42f5c2d61cef732ffb557676f467fd538b0 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:06:12 +1100 Subject: [PATCH 1/4] Add eva main function and add eva.fit_gev fitstart options --- ci/environment.yml | 1 + .../worked_example-HadGEM3-GC31-MM.ipynb | 8 +- docs/user_guide/worked_example_stationary.rst | 4 +- setup.py | 1 + unseen/eva.py | 1022 +++++++++++------ unseen/moments.py | 4 +- unseen/stability.py | 2 +- unseen/tests/conftest.py | 59 +- unseen/tests/test_eva.py | 405 +++---- 9 files changed, 910 insertions(+), 596 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index bd6f119..133d6dc 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,6 +9,7 @@ dependencies: - dask-jobqueue - geopandas - gitpython + - lmoments3 - netcdf4 - numpy - pip diff --git a/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb b/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb index 58984d3..5ee6550 100644 --- a/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb +++ b/docs/user_guide/worked_example-HadGEM3-GC31-MM.ipynb @@ -1220,9 +1220,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'generate_estimates=True'.\n", + "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'fitstart='scipy_subet''.\n", " warnings.warn(\n", - "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'generate_estimates=True'.\n", + "/home/599/dbi599/unseen/unseen/eva.py:253: UserWarning: Data fit failed. Retrying with 'fitstart='scipy_subet''.\n", " warnings.warn(\n" ] }, @@ -2609,12 +2609,12 @@ ], "source": [ "model_da_indep.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:blue')\n", - "model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, generate_estimates=True)\n", + "model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, fitstart='scipy_subet')\n", "model_raw_pdf = gev.pdf(xvals, model_raw_shape, model_raw_loc, model_raw_scale)\n", "plt.plot(xvals, model_raw_pdf, color='tab:blue', linewidth=4.0, label='model')\n", "\n", "model_da_bc.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:orange')\n", - "model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, generate_estimates=True)\n", + "model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, fitstart='scipy_subet')\n", "model_bc_pdf = gev.pdf(xvals, model_bc_shape, model_bc_loc, model_bc_scale)\n", "plt.plot(xvals, model_bc_pdf, color='tab:orange', linewidth=4.0, label='model (corrected)')\n", "\n", diff --git a/docs/user_guide/worked_example_stationary.rst b/docs/user_guide/worked_example_stationary.rst index 8f59a4e..8b6f241 100644 --- a/docs/user_guide/worked_example_stationary.rst +++ b/docs/user_guide/worked_example_stationary.rst @@ -438,12 +438,12 @@ to see the effect of the bias correction. 
model_da_bc_stacked = model_da_bc.dropna('lead_time').stack({'sample': ['ensemble', 'init_date', 'lead_time']}) model_da_indep.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:blue') - model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, generate_estimates=True) + model_raw_shape, model_raw_loc, model_raw_scale = eva.fit_gev(model_da_indep_stacked.values, fitstart='scipy_subet') model_raw_pdf = gev.pdf(xvals, model_raw_shape, model_raw_loc, model_raw_scale) plt.plot(xvals, model_raw_pdf, color='tab:blue', linewidth=4.0, label='model') model_da_bc.plot.hist(bins=50, density=True, alpha=0.7, facecolor='tab:orange') - model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, generate_estimates=True) + model_bc_shape, model_bc_loc, model_bc_scale = eva.fit_gev(model_da_bc_stacked.values, fitstart='scipy_subet') model_bc_pdf = gev.pdf(xvals, model_bc_shape, model_bc_loc, model_bc_scale) plt.plot(xvals, model_bc_pdf, color='tab:orange', linewidth=4.0, label='model (corrected)') diff --git a/setup.py b/setup.py index b217d55..96d229f 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "bias_correction = unseen.bias_correction:_main", "stability = unseen.stability:_main", "moments = unseen.moments:_main", + "eva = unseen.eva:_main", ] }, ) diff --git a/unseen/eva.py b/unseen/eva.py index 18f6481..5a745bd 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -1,5 +1,8 @@ """Extreme value analysis functions.""" +import argparse +from lmoments3 import distr +import matplotlib.pyplot as plt from matplotlib import colormaps from matplotlib.dates import date2num from matplotlib.ticker import AutoMinorLocator @@ -9,6 +12,11 @@ from scipy.stats.distributions import chi2 import warnings from xarray import apply_ufunc, DataArray +import xclim.indices.stats as xcstats + +from . import fileio +from . import general_utils +from . import time_utils def event_in_context(data, threshold, direction): @@ -48,35 +56,231 @@ def event_in_context(data, threshold, direction): return n_events, n_population, return_period, percentile -def fit_stationary_gev(x, user_estimates=[], generate_estimates=False): - """Estimate stationary shape, location and scale parameters. +def fit_gev( + data, + core_dim="time", + stationary=True, + covariate=None, + fitstart="LMM", + loc1=0, + scale1=0, + assert_good_fit=False, + pick_best_model=False, + alpha=0.05, + method="Nelder-Mead", +): + """Estimate stationary or nonstationary GEV distribution parameters. Parameters ---------- - x : array_like - Data to use in estimating the distribution parameters - user_estimates : list, optional - Initial guess of the shape, loc and scale parameters - generate_estimates : bool, optional - Generate initial parameter guesses using a data subset + data : array_like + Data to use in estimating the distribution parameters + core_dim : str, default "time" + Name of time/sample dimension in `data` and `covariate` + stationary : bool, default True + Fit as a stationary GEV using `fit_stationary_gev` + covariate : array_like, optional + A nonstationary covariate array with the same `core_dim` as `data` + fitstart : {array-like, 'LMM', 'MM', 'scipy', 'scipy_fitstart', + 'scipy_subset', 'xclim_fitstart', 'xclim'}, default 'scipy_fitstart' + Initial guess method/estimate of the shape, loc and scale parameters + loc1, scale1 : float or None, default 0 + Initial guess of trend parameters. 
If None, the trend is fixed at zero + assert_good_fit : bool, default False + Stationary parameters must pass goodness of fit test at `alpha` level. + Attempt a retry and return NaNs if the test fails again. + pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False + Method to test relative fit of stationary and nonstationary models. + Do not use if you don't want nonstationary parameters. The output will + have GEV 5 parameters even if stationary is True. + alpha : float, default 0.05 + Fit test p-value threshold for stationary fit (relative/goodness of fit) + method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', + 'trust-constr', 'COBYLA'}, default 'Nelder-Mead' + Optimization method for nonstationary fit Returns ------- - shape, loc, scale : float - GEV parameters + dparams : xarray.DataArray + The GEV distribution parameters with the same dimensions as `data` + (excluding `core_dim`) and a new dimension `dparams`: + If stationary, dparams = (c, loc, scale). + If nonstationary, dparams = (c, loc0, loc1, scale0, scale1). + + Notes + ----- + - Use `unpack_gev_params` to get the shape, location and scale parameters + as a separate array. If nonstationary, the output will still be three + parameters that have an extra covariate dimension. + - For stationary data the parameters are estimated using + `scipy.stats.genextreme.fit`. + - For nonstationary data, the parameters (including the linear location and + scale trend parameters are estimated by minimising + a penalised negative log-likelihood function. + - The `assert_good_fit` option ensures that the distribution fit is + accepted if the goodness of fit test `p-value > alpha` (i.e., accept + the null hypothesis). It will retry the fit using data[::2] to generate + an initial guess. + - The `covariate` must be numeric and have dimensions aligned with `data`. + - If `pick_best_model` is a method, the relative goodness of fit method is + used to determine if stationary or nonstationary parameters are returned. + """ + kwargs = {k: v for k, v in locals().items() if k not in ["data", "covariate"]} + + def _assert_good_fit_1d(data, dparams, alpha, fit_kwargs): + """Test goodness of stationary GEV fit and retry if failed.""" + pvalue = check_gev_fit(data, dparams) + + if np.all(pvalue < alpha): + # Retry fit using alternative fitstart methods + warnings.warn("GEV fit failed. 
Retrying fitstart with data subset.") + _kwargs = fit_kwargs.copy() + _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) + _kwargs["stationary"] = True + dparams = _fit_1d(data, covariate, **_kwargs) + pvalue = check_gev_fit(data, dparams) + + # Return NaNs if the test still fails + if np.all(pvalue < alpha): + # Return NaNs + dparams = dparams * np.nan + warnings.warn("Data fit failed.") + return dparams + + def _fit_1d( + data, + covariate, + stationary, + fitstart, + core_dim, + loc1, + scale1, + assert_good_fit, + pick_best_model, + alpha, + method, + ): + """Estimate distribution parameters.""" + if np.all(~np.isfinite(data)): + # Return NaNs if all input data is infinite + n = 3 if stationary else 5 + return np.array([np.nan] * n) + + if np.isnan(data).any(): + # Mask NaNs in data + mask = np.isfinite(data) + data = data[mask] + if not stationary: + covariate = covariate[mask] + + # Initial estimates of distribution parameters for MLE + if isinstance(fitstart, str): + dparams_i = _fitstart_1d(data, fitstart) + else: + # User provided initial estimates + dparams_i = fitstart + + # Use genextreme to get stationary distribution parameters + if stationary or pick_best_model: + if dparams_i is None: + dparams_i = genextreme.fit(data) + else: + dparams = genextreme.fit( + data, dparams_i[0], loc=dparams_i[1], scale=dparams_i[2] + ) + dparams = np.array([i for i in dparams], dtype="float64") + + if assert_good_fit: + dparams = _assert_good_fit_1d(data, dparams, alpha, kwargs) + + if not stationary or pick_best_model: + # Temporarily reverse shape sign (scipy uses different sign convention) + dparams_ns_i = [-dparams_i[0], dparams_i[1], loc1, dparams_i[2], scale1] + + # Optimisation bounds (scale parameter must be non-negative) + bounds = [(None, None)] * 5 + bounds[3] = (0, None) # Positive scale parameter + if loc1 is None: + dparams_ns_i[2] = 0 + bounds[2] = (0, 0) # Only allow trend in scale + if scale1 is None: + dparams_ns_i[4] = 0 + bounds[4] = (0, 0) # Only allow trend in location + + # Minimise the negative log-likelihood function to get optimal dparams + res = minimize( + nllf, + dparams_ns_i, + args=(data, covariate), + method=method, + bounds=bounds, + ) + dparams_ns = np.array([i for i in res.x], dtype="float64") + # Reverse shape sign for consistency with scipy.stats results + dparams_ns[0] *= -1 - if any(user_estimates): - shape, loc, scale = user_estimates - shape, loc, scale = genextreme.fit(x, shape, loc=loc, scale=scale) + # Stationary and nonstationary model relative goodness of fit + if pick_best_model: + dparams = get_best_GEV_model_1d( + data, dparams, dparams_ns, covariate, alpha, test=pick_best_model + ) + else: + dparams = dparams_ns + + return dparams - elif generate_estimates: - # Generate initial estimates using a data subset (useful for large datasets) - shape, loc, scale = genextreme.fit(x[::2]) - shape, loc, scale = genextreme.fit(x, shape, loc=loc, scale=scale) + if covariate is not None: + covariate = _format_covariate(data, covariate, core_dim) else: - shape, loc, scale = genextreme.fit(x) - return shape, loc, scale + covariate = 0 # Dummy covariate for apply_ufunc + + # Input core dimensions + if core_dim is not None and hasattr(covariate, core_dim): + # Covariate has the same core dimension as data + input_core_dims = [[core_dim], [core_dim]] + else: + # Covariate is a 1D array + input_core_dims = [[core_dim], []] + + n_params = 5 if (not stationary or pick_best_model) else 3 + # Fit data to distribution parameters + dparams = apply_ufunc( + _fit_1d, 
+ data, + covariate, + input_core_dims=input_core_dims, + output_core_dims=[["dparams"]], + vectorize=True, + dask="parallelized", + kwargs=kwargs, + output_dtypes=["float64"], + dask_gufunc_kwargs={"output_sizes": {"dparams": n_params}}, + ) + if isinstance(data, DataArray): + # Format output (consistent with xclim) + if n_params == 3: + dparams.coords["dparams"] = ["c", "loc", "scale"] + else: + dparams.coords["dparams"] = ["c", "loc0", "loc1", "scale0", "scale1"] + + # Add coordinates for the distribution parameters + dist_name = "genextreme" if stationary else "nonstationary genextreme" + if isinstance(fitstart, str): + estimator = fitstart.upper() + else: + estimator = f"User estimates = {fitstart}" + + dparams.attrs = dict( + long_name=f"{dist_name.capitalize()} parameters", + description=f"Parameters of the {dist_name} distribution", + method="MLE", + estimator=estimator, + scipy_dist="genextreme", + units="", + ) + + return dparams def penalised_sum(x): @@ -96,12 +300,12 @@ def penalised_sum(x): return total + penalty -def nllf(theta, x, covariate=None): +def nllf(dparams, x, covariate=None): """Penalised negative log-likelihood function. Parameters ---------- - theta : tuple of floats + dparams : tuple of floats Distribution parameters (stationary or non-stationary) x : array_like Data to use in estimating the distribution parameters @@ -118,39 +322,39 @@ def nllf(theta, x, covariate=None): This is modified version of `scipy.stats.genextreme.fit` for fitting extreme value distribution parameters, in which the location and scale parameters can vary linearly with a covariate. - The log-likelihood equations are based on Méndez et al. (2007). + The log-likelihood equations are based on Coles (2001; page 55). It is suitable for stationary or nonstationary distributions: - - theta = shape, loc, scale - - theta = shape, loc, loc1, scale, scale1 - The nonstationary parameters are returned if `theta` incudes the location + - dparams = shape, loc, scale + - dparams = shape, loc, loc1, scale, scale1 + The nonstationary parameters are returned if `dparams` incudes the location and scale trend parameters. A large finite penalty (instead of infinity) is applied for observations beyond the support of the distribution. The NLLF is not finite when the shape is nonzero and Z is negative because the PDF is zero (i.e., ``log(0)=inf)``). 
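+
+    Note that the shape parameter here uses the opposite sign convention to
+    `scipy.stats.genextreme`, so `fit_gev` negates the scipy shape estimate
+    before minimising this function, e.g.,
+    ``minimize(nllf, [-c, loc, loc1, scale, scale1], args=(x, covariate))``.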
""" - if len(theta) == 5: + if len(dparams) == 5: # Nonstationary GEV parameters - shape, loc0, loc1, scale0, scale1 = theta + shape, loc0, loc1, scale0, scale1 = dparams loc = loc0 + loc1 * covariate scale = scale0 + scale1 * covariate else: # Stationary GEV parameters - shape, loc, scale = theta + shape, loc, scale = dparams s = (x - loc) / scale # Calculate the NLLF (type 1 or types 2-3 extreme value distributions) # Type I extreme value distributions (Gumbel) - if shape == 0: + if np.fabs(shape) < 1e-6: valid = scale > 0 L = np.log(scale, where=valid) + s + np.exp(-s) # Types II & III extreme value distributions (Fréchet and Weibull) else: Z = 1 + shape * s - # The log-likelihood function is finite when the shape and Z are positive + # The log-likelihood is finite when the shape and Z are positive valid = np.isfinite(Z) & (Z > 0) & (scale > 0) L = ( np.log(scale, where=valid) @@ -160,251 +364,114 @@ def nllf(theta, x, covariate=None): L = np.where(valid, L, np.inf) - # Sum function along all axes (where finite) & add penalty for each infinite element + # Sum function (where finite) & add penalty for each infinite element total = penalised_sum(L) return total -def fit_gev( - data, - core_dim="time", - stationary=True, - covariate=None, - loc1=0, - scale1=0, - test_fit_goodness=False, - relative_fit_test=None, - alpha=0.05, - user_estimates=[], - generate_estimates=False, - method="Nelder-Mead", -): - """Estimate stationary or nonstationary GEV distribution parameters. +def _fitstart_1d(data, method): + """Generate initial parameter guesses for nonstationary fit. Parameters ---------- data : array_like Data to use in estimating the distribution parameters - core_dim : str, optional - Name of time/sample dimension in `data`. Default: "time". - stationary : bool, optional - Fit as a stationary GEV using `fit_stationary_gev`. Default: True. - covariate : array_like or str, optional - A nonstationary covariate array or coordinate name - loc1, scale1 : float or None, optional - Initial guess of trend parameters. If None, the trend is fixed at zero. - test_fit_goodness : bool, optional - Test goodness of fit and attempt retry. Default False. - relative_fit_test : {None, 'lrt', 'aic', 'bic'}, optional - Method to test relative fit of stationary and nonstationary models. - The trend parameters are set to zero if the stationary fit is better. - alpha : float, optional - Goodness of fit p-value threshold. Default 0.05. - user estimates: list, optional - Initial guess of the shape, loc and scale parameters - generate_estimates : bool, optional - Generate initial parameter guesses using a data subset - method : str, optional - Optimization method for nonstationary fit {'Nelder-Mead', 'L-BFGS-B', - 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'COBYLA'}. + method : {'LMM', 'scipy_fitstart', 'scipy', 'scipy_subset', + 'xclim_fitstart', 'xclim'} + Initial guess method of the shape, loc and scale parameters Returns ------- - theta : xr.DataArray - The GEV distribution parameters with the same dimensions as `data` - (excluding `core_dim`) and a new dimension `theta`: - If stationary, theta = (shape, loc, scale). - If nonstationary, theta = (shape, loc0, loc1, scale0, scale1). + dparams_i : list + Initial guess of the shape, loc and scale parameters Notes ----- - For stationary data the shape, location and scale parameters are - estimated using `gev_stationary_fit`. 
- For nonstationary data, the linear location and scale trend - parameters are estimated using a penalized negative log-likelihood - function with initial estimates based on the stationary fit. - The distribution fit is considered good if the p-value is above - `alpha` (i.e., accept the null hypothesis). Otherwise, it retry the fit - without `user_estimates` and with `generating_estimates`. - If data is a stacked forecast ensemble, the covariate may need to be - stacked in the same way. + - Use `scipy_fitstart` to reproduce the scipy fit in `fit_gev`. + - The LMM shape sign is reversed for consistency with scipy.stats results. """ - kwargs = locals() # Function inputs - - def _format_covariate(data, covariate, stationary, core_dim): - """Format or generate covariate .""" - if not stationary: - if isinstance(covariate, str): - # Select coordinate in data - covariate = data[covariate] - elif covariate is None: - # Guess covariate - if core_dim in data: - covariate = data[core_dim] - else: - covariate = np.arange(data.shape[0]) - - if covariate.dtype.kind not in set("buifc"): - # Convert dates to numbers - covariate = date2num(covariate) - - if not isinstance(covariate, DataArray): - # Convert to DataArray with the same core_dim as data - covariate = DataArray(covariate, dims=[core_dim]) - else: - covariate = 0 # Dummy covariate for apply_ufunc - - return covariate - - def _fit( - data, - covariate, - core_dim, - user_estimates, - generate_estimates, - loc1, - scale1, - stationary, - test_fit_goodness, - relative_fit_test, - alpha, - method, - ): - """Estimate distribution parameters.""" - if np.all(~np.isfinite(data)): - # Return NaNs if all input data is infinite - n = 3 if stationary else 5 - return np.array([np.nan] * n) - - if np.isnan(data).any(): - # Mask NaNs in data - mask = np.isfinite(data) - data = data[mask] - if not stationary: - covariate = covariate[mask] - # Use genextreme to get stationary distribution parameters - theta = fit_stationary_gev(data, user_estimates, generate_estimates) + if method == "LMM": + # L-moments method + dparams_i = distr.gev.lmom_fit(data) + dparams_i = list(dparams_i.values()) + dparams_i[0] = -dparams_i[0] + + elif method == "scipy_fitstart": + # Moments method? + dparams_i = genextreme._fitstart(data) + + elif method == "scipy": + # MLE + dparams_i = genextreme.fit(data) + + elif method == "scipy_subset": + # MLE (equivalent of fitstart='scipy_subet') + dparams_i = genextreme.fit(data[::2]) + + elif method == "xclim_fitstart": + # Approximates the probability weighted moments (PWM) method? 
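+        # NOTE: `_fit_start` is a private xclim helper, so its behaviour and
+        # signature may change between xclim versions.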
+ args, kwargs = xcstats._fit_start(data, dist="genextreme") + dparams_i = [args[0], kwargs["loc"], kwargs["scale"]] + + elif method == "xclim": + # MLE + da = DataArray(data, dims="time") + dparams_i = xcstats.fit(da, "genextreme", method="MLE") + else: + raise ValueError(f"Unknown fitstart method: {method}") - if not stationary: - # Use genextreme as initial guesses - shape, loc, scale = theta - # Temporarily reverse shape sign (scipy uses different sign convention) - theta_i = [-shape, loc, loc1, scale, scale1] + return np.array(dparams_i, dtype="float64") - # Optimisation bounds (scale parameter must be non-negative) - bounds = [(None, None)] * len(theta_i) - bounds[3] = (0, None) # Positive scale parameter - if loc1 is None: - theta_i[2] = 0 - bounds[2] = (0, 0) # Only allow trend in scale - if scale1 is None: - theta_i[4] = 0 - bounds[4] = (0, 0) # Only allow trend in location - # Minimise the negative log-likelihood function to get optimal theta - res = minimize( - nllf, - theta_i, - args=(data, covariate), - method=method, - bounds=bounds, - ) - theta = res.x +def _format_covariate(data, covariate, core_dim): + """Format or generate covariate. - if isinstance(relative_fit_test, str): - # Test relative fit of stationary and nonstationary models - # Negative log likelihood using genextreme parameters - L1 = nllf([-shape, loc, scale], data) - L2 = res.fun + Parameters + ---------- + data : xarray.DataArray + Data to use in estimating the distribution parameters + covariate : array_like or str + A nonstationary covariate array or coordinate name + core_dim : str + Name of time/sample dimension in `data` - result = check_gev_relative_fit( - data, L1, L2, test=relative_fit_test, alpha=alpha - ) - if not result: - warnings.warn( - f"{relative_fit_test} test failed. Returning stationary parameters." - ) - # Return stationary parameters (genextreme.fit output) with no trend - theta = [shape, loc, 0, scale, 0] + Returns + ------- + covariate : xarray.DataArray + Covariate with the same core_dim as data + """ - # Reverse shape sign for consistency with scipy.stats results - theta[0] *= -1 - - theta = np.array([i for i in theta], dtype="float64") - - if test_fit_goodness and stationary: - pvalue = check_gev_fit(data, theta, core_dim=core_dim) - - # Accept null distribution of the Anderson-darling test (same distribution) - if np.all(pvalue < alpha): - if any(kwargs["user_estimates"]): - warnings.warn("GEV fit failed. Retrying without user_estimates.") - kwargs["user_estimates"] = [None, None, None] - theta = _fit(data, covariate, **kwargs) - elif not kwargs["generate_estimates"]: - warnings.warn( - "GEV fit failed. Retrying with generate_estimates=True." 
- ) - kwargs["generate_estimates"] = True # Also breaks loop - theta = _fit(data, covariate, **kwargs) - else: - # Return NaNs - theta = theta * np.nan - warnings.warn("Data fit failed.") - return theta - - data = kwargs.pop("data") - covariate = kwargs.pop("covariate") - covariate = _format_covariate(data, covariate, stationary, core_dim) + if isinstance(covariate, str): + # Select coordinate in data + covariate = data[covariate] - # Input core dimensions - if core_dim is not None and hasattr(covariate, core_dim): - # Covariate has the same core dimension as data - input_core_dims = [[core_dim], [core_dim]] - else: - # Covariate is a 1D array - input_core_dims = [[core_dim], []] + elif covariate is None: + # Guess covariate + if core_dim in data: + covariate = data[core_dim] + else: + covariate = np.arange(data.shape[0]) - # Expected output of theta - n = 3 if stationary else 5 + if covariate.dtype.kind not in set("buifc"): + # Convert dates to numbers + covariate = date2num(covariate) - # Fit data to distribution parameters - theta = apply_ufunc( - _fit, - data, - covariate, - input_core_dims=input_core_dims, - output_core_dims=[["theta"]], - vectorize=True, - dask="parallelized", - kwargs=kwargs, - output_dtypes=["float64"], - dask_gufunc_kwargs=dict(output_sizes={"theta": n}), - ) + if not isinstance(covariate, DataArray): + # Convert to DataArray with the same core_dim as data + covariate = DataArray(covariate, dims=[core_dim]) - # Format output - if len(data.shape) == 1: - # Return a tuple of scalars instead of a data array - theta = np.array([i for i in theta], dtype="float64") - - if isinstance(theta, DataArray): - if stationary: - coords = ["shape", "loc", "scale"] - else: - coords = ["shape", "loc0", "loc1", "scale0", "scale1"] - theta.coords["theta"] = coords - return theta + return covariate -def check_gev_fit(data, params, core_dim="time", **kwargs): +def check_gev_fit(data, dparams, core_dim=[], **kwargs): """Test stationary GEV distribution goodness of fit. 
Parameters ---------- data: array_like Data used to estimate the distribution parameters - params : tuple of floats + dparams : tuple of floats Shape, location and scale parameters core_dim : str, optional Data dimension to test over @@ -417,10 +484,10 @@ def check_gev_fit(data, params, core_dim="time", **kwargs): Goodness of fit p-value """ - def _goodness_of_fit(data, params, **kwargs): + def _goodness_of_fit(data, dparams, **kwargs): """Test GEV goodness of fit.""" # Stationary parameters - shape, loc, scale = params + shape, loc, scale = dparams res = goodness_of_fit( genextreme, @@ -430,11 +497,14 @@ def _goodness_of_fit(data, params, **kwargs): ) return res.pvalue + if not isinstance(core_dim, list): + core_dim = [core_dim] + pvalue = apply_ufunc( _goodness_of_fit, data, - params, - input_core_dims=[[core_dim], ["theta"]], + dparams, + input_core_dims=[core_dim, ["dparams"]], vectorize=True, kwargs=kwargs, dask="parallelized", @@ -452,7 +522,7 @@ def check_gev_relative_fit(data, L1, L2, test, alpha=0.05): Data to use in estimating the distribution parameters L1, L2 : float Negative log-likelihood of the stationary and nonstationary model - test : {"aic", "bic", "lrt"} + test : {"AIC", "BIC", "LRT"} Method to test relative fit of stationary and nonstationary models Returns @@ -484,15 +554,35 @@ def check_gev_relative_fit(data, L1, L2, test, alpha=0.05): # Bayesian Information Criterion (BIC) bic = [k * np.log(len(data)) + (2 * n) for n, k in zip([L1, L2], dof)] result = bic[0] > bic[1] + else: + raise ValueError("test must be 'LRT', 'AIC' or 'BIC'", test) return result -def unpack_gev_params(params, covariate=None): - """Unpack shape, loc, scale from params. +def get_best_GEV_model_1d(data, dparams, dparams_ns, covariate, alpha, test): + """Get the best GEV model based on a relative fit test.""" + # Calculate the stationary GEV parameters + shape, loc, scale = dparams + + # Negative log-likelihood of stationary and nonstationary models + L1 = nllf([-shape, loc, scale], data) + L2 = nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) + + result = check_gev_relative_fit(data, L1, L2, test=test, alpha=alpha) + if not result: + # Return the stationary parameters with no trend + dparams = np.array([shape, loc, 0, scale, 0], dtype="float64") + else: + dparams = dparams_ns + return dparams + + +def unpack_gev_params(dparams, covariate=None): + """Unpack shape, loc, scale from dparams. Parameters ---------- - params : xarray.DataArray, list or tuple + dparams : xarray.DataArray, list or tuple Stationary or nonstationary GEV parameters covariate : xarray.DataArray, optional Covariate values for nonstationary parameters @@ -504,31 +594,37 @@ def unpack_gev_params(params, covariate=None): covariate. 
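+
+    Notes
+    -----
+    A minimal usage sketch (assuming ``data`` and a numeric ``covariate``
+    share the same core dimension)::
+
+        dparams = fit_gev(data, stationary=False, covariate=covariate)
+        shape, loc, scale = unpack_gev_params(dparams, covariate)
+        # loc and scale now vary along the covariate dimension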
""" - if hasattr(params, "theta"): + if hasattr(dparams, "dparams"): # Select the correct dimension in a DataArray - params = [params.isel(theta=i) for i in range(params.theta.size)] + dparams = [dparams.isel(dparams=i) for i in range(dparams.dparams.size)] + elif not isinstance(dparams, (list, tuple)) and dparams.ndim > 1: + warnings.warn(f"Assuming parameters on axis=-1 (shape={dparams.shape})") + dparams = np.split(dparams, dparams.shape[-1], axis=-1) # Unpack GEV parameters - if len(params) == 3: + if len(dparams) == 3: # Stationary GEV parameters - shape, loc, scale = params + shape, loc, scale = dparams - elif len(params) == 5: + elif len(dparams) == 5: # Nonstationary GEV parameters - shape, loc0, loc1, scale0, scale1 = params + shape, loc0, loc1, scale0, scale1 = dparams loc = loc0 + loc1 * covariate scale = scale0 + scale1 * covariate + else: + raise ValueError("Expected 3 or 5 GEV parameters.", dparams) + return shape, loc, scale -def get_return_period(event, params=None, covariate=None, **kwargs): +def get_return_period(event, dparams=None, covariate=None, **kwargs): """Get return periods for a given events. Parameters ---------- event : float or array_like Event value(s) for which to calculate the return period - params : array_like, optional + dparams : array_like, optional Stationary or nonstationary GEV parameters covariate : array_like, optional Covariate values for nonstationary parameters @@ -540,10 +636,11 @@ def get_return_period(event, params=None, covariate=None, **kwargs): return_period : float or array_like Return period(s) for the event(s) """ - if params is None: - params = fit_gev(**kwargs) - shape, loc, scale = unpack_gev_params(params, covariate) + if dparams is None: + dparams = fit_gev(**kwargs) + + shape, loc, scale = unpack_gev_params(dparams, covariate) probability = apply_ufunc( genextreme.sf, @@ -559,14 +656,14 @@ def get_return_period(event, params=None, covariate=None, **kwargs): return 1.0 / probability -def get_return_level(return_period, params=None, covariate=None, **kwargs): +def get_return_level(return_period, dparams=None, covariate=None, **kwargs): """Get the return levels for given return periods. Parameters ---------- return_period : float or array_like Return period(s) for which to calculate the return level - params : array_like, optional + dparams : array_like, optional Stationary or nonstationary GEV parameters covariate : array_like, optional Covariate values for nonstationary parameters @@ -577,11 +674,18 @@ def get_return_level(return_period, params=None, covariate=None, **kwargs): ------- return_level : float or array_like Return level(s) of the given return period(s) + + Notes + ----- + If `return_period` is an ndarray, make sure dimensions are aligned with + `dparams`. For example, dparams dims=(lat, lon, dparams) and return_period + dims=(lat, lon, period). """ - if params is None: - params = fit_gev(**kwargs) - shape, loc, scale = unpack_gev_params(params, covariate) + if dparams is None: + dparams = fit_gev(**kwargs) + + shape, loc, scale = unpack_gev_params(dparams, covariate) return_level = apply_ufunc( genextreme.isf, @@ -603,34 +707,31 @@ def gev_return_curve( bootstrap_method="non-parametric", n_bootstraps=1000, max_return_period=4, - user_estimates=None, max_shape_ratio=None, + **fit_kwargs, ): """Return x and y data for a GEV return period curve. 
Parameters ---------- - data : xarray DataArray + data : xarray.DataArray event_value : float Magnitude of event of interest bootstrap_method : {'parametric', 'non-parametric'}, default "non-parametric" n_bootstraps : int, default 1000 max_return_period : float, default 4 The maximum return period is 10^{max_return_period} - user_estimates: list, default None - Initial estimates of the shape, loc and scale parameters max_shape_ratio: float, optional - Maximum bootstrap shape parameter to full population shape parameter ratio (e.g. 6.0) - Useful for filtering bad fits to bootstrap samples + Maximum bootstrap shape parameter to full population shape parameter + ratio (e.g. 6.0). Useful for filtering bad fits to bootstrap samples + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` """ + rng = np.random.default_rng(seed=0) # GEV fit to data - if user_estimates: - shape, loc, scale = fit_gev( - data, user_estimates=user_estimates, stationary=True - ) - else: - shape, loc, scale = fit_gev(data, generate_estimates=True, stationary=True) + dparams = fit_gev(data, **fit_kwargs) + shape, loc, scale = unpack_gev_params(dparams) curve_return_periods = np.logspace(0, max_return_period, num=10000) curve_probabilities = 1.0 / curve_return_periods @@ -646,8 +747,8 @@ def gev_return_curve( if bootstrap_method == "parametric": boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data)) elif bootstrap_method == "non-parametric": - boot_data = np.random.choice(data, size=data.shape, replace=True) - boot_shape, boot_loc, boot_scale = fit_gev(boot_data, generate_estimates=True) + boot_data = rng.choice(data, size=data.shape, replace=True) + boot_shape, boot_loc, boot_scale = fit_gev(boot_data, fitstart="scipy_subet") if max_shape_ratio: shape_ratio = abs(boot_shape) / abs(shape) if shape_ratio > max_shape_ratio: @@ -698,15 +799,15 @@ def plot_gev_return_curve( ylabel=None, ylim=None, text=False, - user_estimates=None, max_shape_ratio=None, + **fit_kwargs, ): """Plot a single return period curve. Parameters ---------- ax : matplotlib plot axis - data : xarray DataArray + data : xarray.DataArray event_value : float Magnitude of the event of interest direction : {'exceedance', 'deceedance'}, default 'exceedance' @@ -721,11 +822,11 @@ def plot_gev_return_curve( Limits for y-axis text : bool, default False Write the return period (and 95% CI) on the plot - user_estimates: list, default None - Initial estimates of the shape, loc and scale parameters max_shape_ratio: float, optional Maximum bootstrap shape parameter to full population shape parameter ratio (e.g. 6.0) Useful for filtering bad fits to bootstrap samples + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` """ if direction == "deceedance": @@ -737,8 +838,8 @@ def plot_gev_return_curve( bootstrap_method=bootstrap_method, n_bootstraps=n_bootstraps, max_return_period=max_return_period, - user_estimates=user_estimates, max_shape_ratio=max_shape_ratio, + **fit_kwargs, ) ( curve_return_periods, @@ -813,35 +914,53 @@ def plot_gev_return_curve( ax.grid() -def plot_nonstationary_pdfs(ax, data, theta_s, theta_ns, covariate): +def plot_nonstationary_pdfs( + data, + dparams_s, + dparams_ns, + covariate, + ax=None, + title="", + units=None, + cmap="rainbow", + outfile=None, +): """Plot stationary and nonstationary GEV PDFs. 
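+
+    A new figure and axes are created if `ax` is not supplied, and the figure
+    is saved when `outfile` is given.
+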
Parameters ---------- - ax : matplotlib.Axes data : array-like Data to plot the histogram - theta_s : tuple of floats + dparams_s : tuple of floats Stationary GEV parameters (shape, loc, scale) - theta_ns : tuple or array-like + dparams_ns : tuple or array-like Nonstationary GEV parameters (shape, loc0, loc1, scale0, scale1) covariate : array-like - Covariates values in which to plot the nonstationary PDFs - + Covariate values in which to plot the nonstationary PDFs + ax : matplotlib.axes.Axes + title : str, optional + xlabel : str, optional + cmap : str, default "rainbow" + outfile : str, optional Returns ------- ax : matplotlib.Axes """ + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(10, 7)) + + ax.set_title(title, loc="left") + n = covariate.size - colors = colormaps["rainbow"](np.linspace(0, 1, n)) - shape, loc, scale = unpack_gev_params(theta_ns, covariate) + colors = colormaps[cmap](np.linspace(0, 1, n)) + shape, loc, scale = unpack_gev_params(dparams_ns, covariate) # Histogram. _, bins, _ = ax.hist(data, bins=40, density=True, alpha=0.5, label="Histogram") # Stationary GEV PDF - shape_s, loc_s, scale_s = theta_s + shape_s, loc_s, scale_s = dparams_s pdf_s = genextreme.pdf(bins, shape_s, loc=loc_s, scale=scale_s) ax.plot(bins, pdf_s, c="k", ls="--", lw=2.8, label="Stationary") @@ -850,64 +969,110 @@ def plot_nonstationary_pdfs(ax, data, theta_s, theta_ns, covariate): pdf_ns = genextreme.pdf(bins, shape, loc=loc[i], scale=scale[i]) ax.plot(bins, pdf_ns, lw=1.6, c=colors[i], zorder=0, label=t) + ax.set_xlabel(units) ax.set_ylabel("Probability") ax.xaxis.set_minor_locator(AutoMinorLocator()) ax.yaxis.set_minor_locator(AutoMinorLocator()) - ax.legend(bbox_to_anchor=(1, 1)) + ax.legend(loc="upper right", bbox_to_anchor=(1, 1), framealpha=0.3) + ax.set_xmargin(1e-3) + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") return ax def plot_nonstationary_return_curve( - ax, return_periods, theta_s, theta_ns, covariate, dim="time" + return_periods, + dparams_s, + dparams_ns, + covariate, + dim="time", + ax=None, + title="", + units=None, + cmap="rainbow", + outfile=None, ): """Plot stationary and nonstationary return period curves. 
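+
+    If `dparams_s` is None, only the nonstationary return level curves are
+    drawn. A new figure and axes are created if `ax` is not supplied, and the
+    figure is saved when `outfile` is given.
+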
Parameters ---------- - ax : matplotlib.Axes return_periods : array-like - Return periods to plot - theta_s : array-like or tuple of floats + Return periods to plot (x-axis) + dparams_s : array-like or tuple of floats Stationary GEV parameters (shape, loc, scale) - theta_ns : array-like or tuple of floats + dparams_ns : array-like or tuple of floats Nonstationary GEV parameters (shape, loc0, loc1, scale0, scale1) covariate : array-like Covariate values in which to show the nonstationary return levels dim : str, optional Covariate core dimension name, default "time" + ax : matplotlib.axes.Axes + title : str, optional + units : str, optional + cmap : str, default "rainbow" + outfile : str, optional Returns ------- - ax : matplotlib.Axes + ax : matplotlib.axes.Axes """ + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(9, 5)) + + ax.set_title(title, loc="left") + n = covariate.size - colors = colormaps["rainbow"](np.linspace(0, 1, n)) + colors = colormaps[cmap](np.linspace(0, 1, n)) # Stationary return levels - return_levels = get_return_level(return_periods, theta_s) - ax.plot( - return_periods, return_levels, label="Stationary", c="k", ls="--", zorder=n + 1 - ) + if dparams_s is not None: + return_levels = get_return_level(return_periods, dparams_s) + ax.plot( + return_periods, + return_levels, + label="Stationary", + c="k", + ls="--", + zorder=n + 1, + ) # Nonstationary return levels - return_levels = get_return_level(return_periods, theta_ns, covariate) + return_levels = get_return_level(return_periods, dparams_ns, covariate) for i, t in enumerate(covariate.values): ax.plot(return_periods, return_levels.isel({dim: i}), label=t, c=colors[i]) ax.set_xscale("log") - ax.set_xlabel("Return period [years]") + ax.set_ylabel(units) + ax.set_xlabel("Return period") ax.yaxis.set_minor_locator(AutoMinorLocator()) ax.set_xmargin(1e-2) ax.legend() + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") return ax -def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): - """Plot data binned by a covariate as a stacked histogram. +def plot_stacked_histogram( + dv1, + dv2, + bins=None, + labels=None, + ax=None, + title="", + units=None, + cmap="rainbow", + legend=True, + outfile=None, +): + """Histogram with data binned and stacked. 
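+
+    If `bins` is a scalar (e.g., a bin count), the bin edges are computed with
+    `numpy.histogram_bin_edges` from `dv2`. The axes and the bin edges used
+    are both returned.
+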
Parameters ---------- - ax : matplotlib.Axes + dv1 : xarray.DataArray Data to plot in the histogram dv2 : xarray.DataArray @@ -915,27 +1080,48 @@ def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): bins : array-like Bin edges of dv2 labels : array-like, optional - Labels for each bin, default None (uses left side of each bin) - dim : str, default: "time" - Core dimension name of dv1 and dv2, default "time" + Labels for each bin, default None uses left side of each bin + dim : str, default "time" + Core dimension name of dv1 and dv2 + ax : matplotlib.axes.Axes + title : str, optional + units : str, optional + cmap : str, default "rainbow" + legend : bool, optional + outfile : str, optional Returns ------- - ax : matplotlib.Axes + ax : matplotlib.axes.Axes """ assert dv1.size == dv2.size + if bins is None or np.ndim(bins) == 0: + bins = np.histogram_bin_edges(dv2, bins) + + # Round bins to integers if possible + if np.all(np.diff(bins) >= 1): + bins = np.ceil(bins).astype(dtype=int) + + if labels is None: + # Labels show left side of each bin + # labels = bins[:-1] + labels = [f"{bins[i]}-{bins[i+1] - 1}" for i in range(len(bins) - 1)] + # Subset dv1 by bins dx_subsets = [ - dv1.where((dv2 >= bins[a]) & (dv2 < bins[a + 1])) for a in range(len(bins) - 1) + dv1.where(((dv2 >= bins[a]) & (dv2 < bins[a + 1])).values) + for a in range(len(bins) - 1) ] + dx_subsets[-1] = dv1.where((dv2 >= bins[-2]).values) - if labels is None: - # Labels show left side of each bin - labels = bins[:-1] + colors = colormaps[cmap](np.linspace(0, 1, len(bins) - 1)) - colors = colormaps["rainbow"](np.linspace(0, 1, len(bins) - 1)) + if ax is None: + fig, ax = plt.subplots(1, 1, figsize=(10, 7)) + + ax.set_title(title, loc="left") ax.hist( dx_subsets, density=True, @@ -945,6 +1131,202 @@ def plot_stacked_histogram(ax, dv1, dv2, bins, labels=None, dim="time"): edgecolor="k", label=labels, ) - ax.legend() + if legend: + ax.legend() + ax.set_xlabel(units) ax.set_ylabel("Probability") - return ax + + if outfile: + plt.tight_layout() + plt.savefig(outfile, dpi=200, bbox_inches="tight") + return ax, bins + + +def _parse_command_line(): + """Parse the command line for input arguments""" + + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + + parser.add_argument("file", type=str, help="Forecast file") + parser.add_argument("var", type=str, help="Variable name") + parser.add_argument("outfile", type=str, help="Output file") + parser.add_argument( + "--stack_dims", + type=str, + nargs="*", + default=["ensemble", "init_date", "lead_time"], + help="Dimensions to stack", + ) + parser.add_argument("--core_dim", type=str, default="time", help="Core dimension") + parser.add_argument( + "--stationary", + type=bool, + default=True, + help="Fit nonstationary GEV distribution", + ) + parser.add_argument( + "--fitstart", + default="LMM", + choices=( + "LMM", + "scipy", + "scipy_fitstart", + "scipy_subset", + "xclim_MLE", + "xclim", + ["shape", "loc", "scale"], + ), + help="Initial guess method (or estimate) of the GEV parameters", + ) + parser.add_argument( + "--assert_good_fit", + action="store_true", + default=False, + help="Test fit goodness", + ) + parser.add_argument( + "--pick_best_model", + type=str, + default=None, + help="Relative fit test to pick stationary or nonstationary parameters", + ) + parser.add_argument( + "--reference_time_period", + type=str, + nargs=2, + default=None, + help="Reference time period (start_date, end_date)", + ) + 
parser.add_argument( + "--covariate", type=str, default="time.year", help="Covariate variable" + ) + parser.add_argument( + "--covariate_file", type=str, default=None, help="Covariate file" + ) + parser.add_argument( + "--min_lead", default=None, help="Minimum lead time (int or filename)" + ) + parser.add_argument( + "--min_lead_kwargs", + type=str, + nargs="*", + default={}, + action=general_utils.store_dict, + help="Minimum lead time file", + ) + # parser.add_argument( + # "--confidence_interval", + # type=float, + # default=0.95, + # help="Confidence interval e.g., --confidence_interval 0.95", + # ) + parser.add_argument( + "--ensemble_dim", + type=str, + default="ensemble", + help="Name of ensemble member dimension", + ) + parser.add_argument( + "--init_dim", + type=str, + default="init_date", + help="Name of initial date dimension", + ) + parser.add_argument( + "--lead_dim", + type=str, + default="lead_time", + help="Name of lead time dimension", + ) + parser.add_argument( + "--output_chunks", + type=str, + nargs="*", + action=general_utils.store_dict, + default={}, + help="Output chunks", + ) + parser.add_argument( + "--dask_config", type=str, help="YAML file specifying dask client configuration" + ) + args = parser.parse_args() + + return args + + +def _main(): + """Run the command line program.""" + + args = _parse_command_line() + + ds = fileio.open_dataset(args.file, variables=[args.var]) + + if args.covariate_file is not None: + # Add covariate to dataset (to ensure all operations are aligned) + ds_covariate = fileio.open_dataset( + args.covariate_file, variables=[args.covariate] + ) + ds[args.covariate] = ds_covariate[args.covariate] + + # Filter data by reference time period + if args.reference_time_period: + ds = time_utils.select_time_period(ds, args.reference_time_period) + + # Filter data by minimum lead time + if args.min_lead: + if isinstance(args.min_lead, str): + # Load min_lead from file + ds_min_lead = fileio.open_dataset(args.min_lead, **args.min_lead_kwargs) + min_lead = ds_min_lead["min_lead"].load() + ds = ds.groupby(f"{args.init_dim}.month").where( + ds[args.lead_dim] >= min_lead + ) + ds = ds.drop_vars("month") + else: + ds = ds.where(ds[args.lead_dim] >= args.min_lead) + + # Stack dimensions along new "sample" dimension + if all([dim in ds[args.var].dims for dim in args.stack_dims]): + ds = ds.stack(**{"sample": args.stack_dims}) + args.core_dim = "sample" + + if not args.stationary: + covariate = _format_covariate(ds[args.var], ds[args.covariate], args.core_dim) + else: + covariate = None + + dparams = fit_gev( + ds[args.var], + core_dim=args.core_dim, + stationary=args.stationary, + fitstart=args.fitstart, + covariate=covariate, + assert_good_fit=args.assert_good_fit, + pick_best_model=args.pick_best_model, + ) + + # Format outfile + dparams = dparams.to_dataset() + + # Add the covariate variable + if not args.stationary or args.pick_best_model: + dparams[args.covariate] = covariate + + infile_logs = {args.file: ds.attrs["history"]} + if isinstance(args.min_lead, str): + infile_logs[args.min_lead] = ds_min_lead.attrs["history"] + dparams.attrs["history"] = fileio.get_new_log(infile_logs=infile_logs) + + if args.output_chunks: + dparams = dparams.chunk(args.output_chunks) + + if "zarr" in args.outfile: + fileio.to_zarr(dparams, args.outfile) + else: + dparams.to_netcdf(args.outfile, compute=True) + + +if __name__ == "__main__": + _main() diff --git a/unseen/moments.py b/unseen/moments.py index ae25070..2bb3fbf 100644 --- a/unseen/moments.py +++ 
b/unseen/moments.py @@ -183,7 +183,7 @@ def create_plot( random_sample = np.random.choice(da_fcst_stacked, sample_size) sample_moments = calc_moments( random_sample, - user_estimates=[ + fitstart=[ moments_fcst["GEV shape"], moments_fcst["GEV location"], moments_fcst["GEV scale"], @@ -196,7 +196,7 @@ def create_plot( bc_random_sample = np.random.choice(da_bc_fcst_stacked, sample_size) bc_sample_moments = calc_moments( bc_random_sample, - user_estimates=[ + fitstart=[ moments_fcst["GEV shape"], moments_fcst["GEV location"], moments_fcst["GEV scale"], diff --git a/unseen/stability.py b/unseen/stability.py index dba31ac..a9e4a4c 100644 --- a/unseen/stability.py +++ b/unseen/stability.py @@ -105,7 +105,7 @@ def return_curve(data, method, params=[], **kwargs): params : list, default None shape, location and scale parameters (calculated if None) kwargs : dict, optional - kwargs passed to eva.fit_gev (N.B. used to use generate_estimates=True) + kwargs passed to eva.fit_gev """ if method == "empirical": diff --git a/unseen/tests/conftest.py b/unseen/tests/conftest.py index ce6a6cd..25aef26 100644 --- a/unseen/tests/conftest.py +++ b/unseen/tests/conftest.py @@ -2,6 +2,7 @@ import dask.array as dsa import numpy as np import pytest +from scipy.stats import genextreme import xarray as xr @@ -101,7 +102,8 @@ def example_da_forecast(request): ) ) else: - data = np.random.random( + rng = np.random.default_rng(seed=0) + data = rng.random( ( len(init), len(lead), @@ -113,3 +115,58 @@ def example_da_forecast(request): return ds.assign_coords( {pytest.TIME_DIM: ([pytest.INIT_DIM, pytest.LEAD_DIM], time)} ) + + +@pytest.fixture() +def example_da_gev(request): + """An example 1D GEV DataArray and distribution parameters.""" + rng = np.random.default_rng(seed=0) + time = xr.cftime_range(start="2000-01-01", periods=1500, freq="D") + + # Shape, location and scale parameters + shape = rng.uniform(-0.5, 0.5) + loc = rng.uniform(-10, 10) + scale = rng.uniform(0.1, 10) + dparams = shape, loc, scale + + rvs = genextreme.rvs(shape, loc=loc, scale=scale, size=(time.size), random_state=0) + data = xr.DataArray(rvs, coords=[time], dims=[pytest.TIME_DIM]) + if request.param == "dask": + data = data.chunk({pytest.TIME_DIM: -1}) + elif request.param == "numpy": + data = data.values + return data, dparams + + +@pytest.fixture() +def example_da_gev_3d(request): + """An example 3D GEV DataArray and distribution parameters.""" + rng = np.random.default_rng(seed=0) + time = xr.cftime_range(start="2000-01-01", periods=1500, freq="D") + lat = np.arange(2) + lon = np.arange(2) + + # Shape, location and scale parameters + size = (len(lat), len(lon)) + shape = rng.uniform(-0.5, 0.5, size=size) + loc = rng.uniform(-10, 10, size=size) + scale = rng.uniform(0.1, 10, size=size) + dparams = np.stack([shape, loc, scale], axis=-1) + + rvs = genextreme.rvs( + shape, + loc=loc, + scale=scale, + size=(len(time), len(lat), len(lon)), + random_state=0, + ) + data = xr.DataArray( + rvs, + coords=[time, lat, lon], + dims=[pytest.TIME_DIM, pytest.LAT_DIM, pytest.LON_DIM], + ) + if request.param == "dask": + data = data.chunk({pytest.TIME_DIM: -1, pytest.LAT_DIM: 1, pytest.LON_DIM: 1}) + elif request.param == "numpy": + data = data.values + return data, dparams diff --git a/unseen/tests/test_eva.py b/unseen/tests/test_eva.py index bba533f..d30f13e 100644 --- a/unseen/tests/test_eva.py +++ b/unseen/tests/test_eva.py @@ -1,191 +1,102 @@ """Test extreme value analysis functions.""" -from matplotlib.dates import date2num import numpy as np import 
numpy.testing as npt -from scipy.stats import genextreme -from xarray import cftime_range, DataArray +import pytest +import xarray as xr from unseen.eva import fit_gev, get_return_period, get_return_level -rtol = 0.3 # relative tolerance -alpha = 0.05 - - -def example_da_gev_1d(): - """An example 1D GEV DataArray and distribution parameters.""" - time = cftime_range(start="2000-01-01", periods=1500, freq="D") - - # Shape, location and scale parameters - np.random.seed(0) - shape = np.random.uniform() - loc = np.random.uniform(-10, 10) - scale = np.random.uniform(0.1, 10) - theta = shape, loc, scale - - rvs = genextreme.rvs(shape, loc=loc, scale=scale, size=(time.size), random_state=0) - data = DataArray(rvs, coords=[time], dims=["time"]) - return data, theta - - -def example_da_gev_1d_dask(): - """An example 1D GEV dask array and distribution parameters.""" - data, theta = example_da_gev_1d() - data = data.chunk({"time": -1}) - return data, theta - - -def example_da_gev_3d(): - """An example 3D GEV DataArray and distribution parameters.""" - time = cftime_range(start="2000-01-01", periods=1500, freq="D") - lat = np.arange(2) - lon = np.arange(2) - - # Shape, location and scale parameters - size = (len(lat), len(lon)) - np.random.seed(0) - shape = np.random.uniform(size=size) - loc = np.random.uniform(-10, 10, size=size) - scale = np.random.uniform(0.1, 10, size=size) - theta = np.stack([shape, loc, scale], axis=-1) - - rvs = genextreme.rvs( - shape, - loc=loc, - scale=scale, - size=(len(time), len(lat), len(lon)), - random_state=0, - ) - data = DataArray(rvs, coords=[time, lat, lon], dims=["time", "lat", "lon"]) - return data, theta - - -def example_da_gev_3d_dask(): - """An example 3D GEV dask array and its distribution parameters.""" - data, theta = example_da_gev_3d() - data = data.chunk({"time": -1, "lat": 1, "lon": 1}) - return data, theta +rtol = 0.3 # relative tolerance for testing close values def add_example_gev_trend(data): trend = np.arange(data.time.size) * 2.5 / data.time.size - trend = DataArray(trend, coords={"time": data.time}) + trend = xr.DataArray(trend, coords={"time": data.time}) return data + trend -def example_da_gev_forecast(): - """Create example stacked forecast dataArray.""" - ensemble = np.arange(3) - lead_time = np.arange(5) - init_date = cftime_range(start="2000-01-01", periods=24, freq="MS") - time = [ - init_date.shift(i, freq="MS")[: len(lead_time)] for i in range(len(init_date)) - ] - - # Generate shape, location and scale parameters. 
- np.random.seed(2) - shape = np.random.uniform() - loc = np.random.uniform(-10, 10) - scale = np.random.uniform(0.1, 10) - theta = shape, loc, scale - - rvs = genextreme.rvs( - shape, - loc=loc, - scale=scale, - size=(len(ensemble), len(init_date), len(lead_time)), - random_state=0, +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d(example_da_gev): + """Run stationary GEV fit using 1D array.""" + data, dparams_i = example_da_gev + dparams = fit_gev( + data, stationary=True, assert_good_fit=False, pick_best_model=False ) - data = DataArray( - rvs, - coords=[ensemble, init_date, lead_time], - dims=["ensemble", "init_date", "lead_time"], - ) - data = data.assign_coords({"time": (["init_date", "lead_time"], time)}) - data_stacked = data.stack({"sample": ["ensemble", "init_date", "lead_time"]}) - return data_stacked, theta - - -def test_fit_gev_1d(): - """Run stationary fit using 1D array & check results.""" - data, theta_i = example_da_gev_1d() - theta = fit_gev(data, stationary=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_user_estimates(): - """Run stationary fit using 1D array & user_estimates.""" - data, theta_i = example_da_gev_1d() - user_estimates = list(theta_i) - theta = fit_gev(data, stationary=True, user_estimates=user_estimates) # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_goodness(): - """Run stationary fit using 1D array & fit_goodness_test.""" - data, theta_i = example_da_gev_1d() - theta = fit_gev(data, stationary=True, test_fit_goodness=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_numpy(): - """Run stationary fit using 1D np.ndarray & check results.""" - data, theta_i = example_da_gev_1d() - data = data.values - theta = fit_gev(data, stationary=True) - # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) - - -def test_fit_gev_1d_dask(): - """Run stationary fit using 1D dask array & check results.""" - data, theta_i = example_da_gev_1d_dask() - theta = fit_gev(data, stationary=True, core_dim="time") + npt.assert_allclose(dparams, dparams_i, rtol=rtol) + + +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +@pytest.mark.parametrize( + "fitstart", + [ + [1, -5, 1], + "LMM", + "scipy_fitstart", + "scipy", + "scipy_subset", + "xclim_fitstart", + "xclim", + ], +) +def test_fit_gev_1d_fitstart(example_da_gev, fitstart): + """Run stationary GEV fit using 1D array & fitstart method.""" + data, dparams_i = example_da_gev + dparams = fit_gev( + data, + stationary=True, + fitstart=fitstart, + assert_good_fit=False, + pick_best_model=False, + ) # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=rtol) -def test_fit_gev_3d(): - """Run stationary fit using 3D array & check results.""" - data, theta_i = example_da_gev_3d() - theta = fit_gev(data, stationary=True, core_dim="time") +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d_assert_good_fit(example_da_gev): + """Run stationary GEV fit using 1D array & fit_goodness_test.""" + data, dparams_i = example_da_gev + dparams = fit_gev(data, stationary=True, assert_good_fit=True) # 
Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=0.3) -def test_fit_gev_3d_dask(): - """Run stationary fit using 3D dask array & check results.""" - data, theta_i = example_da_gev_3d_dask() - theta = fit_gev(data, stationary=True, core_dim="time") +# todo FAILED unseen/tests/test_eva.py::test_fit_gev_3d[xarray] - AssertionError: +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_fit_gev_3d(example_da_gev_3d): + """Run stationary GEV fit using 3D array & check results.""" + data, dparams_i = example_da_gev_3d + dparams = fit_gev(data, stationary=True, fitstart="LMM", core_dim="time") # Check fitted params match params used to create data - npt.assert_allclose(theta, theta_i, rtol=rtol) + npt.assert_allclose(dparams, dparams_i, rtol=0.4) -def test_fit_ns_gev_1d(): - """Run non-stationary fit using 1D array & check results.""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_1d(example_da_gev): + """Run non-stationary GEV fit using 1D array & check results.""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, ) - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] > 0) # Positive trend in location -def test_fit_ns_gev_1d_loc_only(): - """Run non-stationary fit using 1D array (location parameter only).""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_1d_loc_only(example_da_gev): + """Run non-stationary GEV fit using 1D array (location parameter only).""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", @@ -193,17 +104,18 @@ def test_fit_ns_gev_1d_loc_only(): scale1=None, covariate=covariate, ) - assert np.all(theta[2] > 0) # Positive trend in location - assert np.all(theta[4] == 0) # No trend in scale + assert np.all(dparams[2] > 0) # Positive trend in location + assert np.all(dparams[4] == 0) # No trend in scale -def test_fit_ns_gev_1d_scale_only(): - """Run non-stationary fit using 1D array (scale parameter only).""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_scale_only(example_da_gev): + """Run non-stationary GEV fit using 1D array (scale parameter only).""" + data, _ = example_da_gev data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", @@ -211,158 +123,119 @@ def test_fit_ns_gev_1d_scale_only(): scale1=0, covariate=covariate, ) - assert np.all(theta[2] == 0) # No trend in location - assert np.all(theta[4] != 0) # Nonzero trend in scale - - -def test_fit_ns_gev_1d_dask(): - """Run non-stationary fit using 1D dask array & check results.""" - data, _ = example_da_gev_1d_dask() - # Add a positive linear trend - data = add_example_gev_trend(data) - covariate = 
np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] != 0) # Nonzero trend in scale -def test_fit_ns_gev_3d(): - """Run non-stationary fit using 3D array & check results.""" - data, _ = example_da_gev_3d() +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_fit_ns_gev_3d(example_da_gev_3d): + """Run non-stationary GEV fit using 3D array & check results.""" + data, _ = example_da_gev_3d # Add a positive linear trend data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta.isel(theta=2) > 0) # Positive trend in location + covariate = xr.DataArray(np.arange(data.time.size), dims="time") + dparams = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") + assert np.all(dparams.isel(dparams=2) > 0) # Positive trend in location -def test_fit_ns_gev_1d_relative_fit_test_bic_trend(): - """Run non-stationary fit & check 'BIC' test returns nonstationary params.""" - data, _ = example_da_gev_1d() +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_pick_best_model_bic_trend(example_da_gev): + """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params.""" + data, _ = example_da_gev # Add a large positive linear trend data = add_example_gev_trend(data) data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, - relative_fit_test="bic", + pick_best_model="bic", ) - assert np.all(theta[2] > 0) # Positive trend in location + assert np.all(dparams[2] > 0) # Positive trend in location -def test_fit_ns_gev_1d_relative_fit_test_bic_no_trend(): - """Run non-stationary fit & check 'BIC' test returns stationary params.""" - data, _ = example_da_gev_1d() - covariate = np.arange(data.time.size, dtype=int) +@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) +def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): + """Run non-stationary GEV fit & check 'BIC' test returns stationary params.""" + data, _ = example_da_gev + covariate = xr.DataArray(np.arange(data.time.size), dims="time") - theta = fit_gev( + dparams = fit_gev( data, stationary=False, core_dim="time", covariate=covariate, - relative_fit_test="bic", + pick_best_model="bic", ) - assert np.all(theta[2] == 0) # No trend in location - assert np.all(theta[4] == 0) # No trend in scale - - -def test_fit_ns_gev_3d_dask(): - """Run non-stationary fit using 3D dask array & check results.""" - data, _ = example_da_gev_3d_dask() - # Add a positive linear trend - data = add_example_gev_trend(data) - covariate = np.arange(data.time.size, dtype=int) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") - assert np.all(theta.isel(theta=2) > 0) # Positive trend in location + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] == 0) # No trend in scale -def test_fit_ns_gev_forecast(): - """Run non-stationary fit using stacked forecast dataArray.""" - data, _ = example_da_gev_forecast() - # Convert times to numerical timesteps - covariate = 
DataArray(date2num(data.time), coords={"sample": data.sample}) - # Add a positive linear trend - trend = covariate / 1e2 - data = data + trend - data = data.sortby(data.time) - covariate = covariate.sortby(data.time) - theta = fit_gev(data, stationary=False, covariate=covariate, core_dim="sample") - assert np.all(theta[2] > 0) # Positive trend in location - - -def test_get_return_period(): +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_get_return_period(example_da_gev): """Run get_return_period for a single event using 1d data.""" - data, _ = example_da_gev_1d() - event = data.mean() - rp = get_return_period(event, data=data) - assert rp.size == 1 - assert np.all(np.isfinite(rp)) + data, _ = example_da_gev + event = np.mean(data) + ari = get_return_period(event, data=data) + assert ari.size == 1 + assert np.all(np.isfinite(ari)) -def test_get_return_period_1d(): - """Run get_return_period for 1d array of events using 1d data.""" - data, theta = example_da_gev_1d() - event = data.quantile([0.25, 0.5, 0.75], dim="time") - rp = get_return_period(event, theta) - assert rp.shape == event.shape - assert np.all(np.isfinite(rp)) - - -def test_get_return_period_3d(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_period_3d(example_da_gev_3d): """Run get_return_period for 3d array of events using 3d data.""" - data, theta = example_da_gev_3d() - theta = fit_gev(data, stationary=True) + data, dparams = example_da_gev_3d + dparams = fit_gev(data, stationary=True) # Multiple events unique to each lat/lon event = data.quantile([0.25, 0.5, 0.75], dim="time") - rp = get_return_period(event, theta) - assert rp.shape == event.shape - assert np.all(np.isfinite(rp)) + ari = get_return_period(event, dparams) + assert ari.shape == event.shape + assert np.all(np.isfinite(ari)) -def test_get_return_period_3d_nonstationary(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_period_3d_nonstationary(example_da_gev_3d): """Run get_return_period for 3d events using 3d nonstationary data.""" - data, _ = example_da_gev_3d() + data, _ = example_da_gev_3d data = add_example_gev_trend(data) - covariate = DataArray(np.arange(data.time.size), dims="time") + covariate = xr.DataArray(np.arange(data.time.size), dims="time") params = fit_gev(data, stationary=False, covariate=covariate, core_dim="time") # Multiple events unique to each lat/lon event = data.quantile([0.25, 0.5, 0.75], dim="time") - covariate_subset = DataArray([0, covariate.size], dims="time") - rp = get_return_period(event, params, covariate=covariate_subset) - assert rp.shape == (*list(event.shape), covariate_subset.size) - assert np.all(np.isfinite(rp)) - - -def test_get_return_level(): - """Run get_return_level for a single return_period using 1d data.""" - _, theta = example_da_gev_1d() - rp = 100 - return_level = get_return_level(rp, theta) - assert return_level.size == 1 - assert np.all(np.isfinite(return_level)) + covariate_subset = xr.DataArray([0, covariate.size], dims="time") + ari = get_return_period(event, params, covariate=covariate_subset) + assert ari.shape == (*list(event.shape), covariate_subset.size) + assert np.all(np.isfinite(ari)) -def test_get_return_level_1d(): +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +@pytest.mark.parametrize("ari", [100, np.array([10, 100, 1000])]) +def test_get_return_level(example_da_gev, ari): """Run 
get_return_level for 1d array of periods using 1d data.""" - _, theta = example_da_gev_1d() - rp = np.array([10, 100, 1000]) - return_level = get_return_level(rp, theta) - assert return_level.shape == rp.shape + _, dparams = example_da_gev + return_level = get_return_level(ari, dparams) + if isinstance(ari, int): + assert return_level.size == 1 + else: + assert return_level.shape == ari.shape assert np.all(np.isfinite(return_level)) -def test_get_return_level_3d(): +@pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True) +def test_get_return_level_3d(example_da_gev_3d): """Run get_return_level for 3d array of periods using 3d data.""" - data, theta = example_da_gev_3d() - theta = fit_gev(data, stationary=True) + data, dparams = example_da_gev_3d + dparams = fit_gev(data, stationary=True, core_dim="time") + # Multiple events unique to each lat/lon - dims = ("return_period", "lat", "lon") - rp = np.array([10, 100, 1000] * 4).T - rp = DataArray(rp.reshape((3, 2, 2)), dims=dims) - return_level = get_return_level(rp, theta) - assert return_level.shape == rp.shape + dims = ("lat", "lon", "return_period") + ari = np.array([10, 100, 1000] * 4).T + ari = xr.DataArray(ari.reshape(dparams.shape), dims=dims) + return_level = get_return_level(ari, dparams) + + assert return_level.shape == ari.shape assert np.all(np.isfinite(return_level)) From e2c6482efccb16f247ab166bf91ff93065a0022b Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:08:05 +1100 Subject: [PATCH 2/4] Update fit_gev goodness of fit and retry and add draft of gev_confidence_interval --- unseen/eva.py | 278 +++++++++++++++++++++++++++++---------- unseen/tests/test_eva.py | 61 +++++---- 2 files changed, 235 insertions(+), 104 deletions(-) diff --git a/unseen/eva.py b/unseen/eva.py index 5a745bd..3367883 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -8,7 +8,7 @@ from matplotlib.ticker import AutoMinorLocator import numpy as np from scipy.optimize import minimize -from scipy.stats import genextreme, goodness_of_fit +from scipy.stats import genextreme, ks_1samp, cramervonmises from scipy.stats.distributions import chi2 import warnings from xarray import apply_ufunc, DataArray @@ -64,12 +64,14 @@ def fit_gev( fitstart="LMM", loc1=0, scale1=0, + retry_fit=False, assert_good_fit=False, pick_best_model=False, alpha=0.05, method="Nelder-Mead", + goodness_of_fit_kwargs=dict(test="ks"), ): - """Estimate stationary or nonstationary GEV distribution parameters. + """Estimate stationary or nonstationary GEV distributionß parameters. Parameters ---------- @@ -86,9 +88,11 @@ def fit_gev( Initial guess method/estimate of the shape, loc and scale parameters loc1, scale1 : float or None, default 0 Initial guess of trend parameters. If None, the trend is fixed at zero + retry_fit : bool, default True + Retry fit using a fitstart(data[::2]) if the fit does not pass the + goodness of fit test (p-value > alpha). assert_good_fit : bool, default False - Stationary parameters must pass goodness of fit test at `alpha` level. - Attempt a retry and return NaNs if the test fails again. + Return NaNs if data fails a GEV goodness of fit test at `alpha` level. pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False Method to test relative fit of stationary and nonstationary models. Do not use if you don't want nonstationary parameters. 
The output will @@ -98,6 +102,8 @@ def fit_gev( method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', 'trust-constr', 'COBYLA'}, default 'Nelder-Mead' Optimization method for nonstationary fit + goodness_of_fit_kwargs : dict, optional + Additional keyword arguments to pass to `check_gev_fit` Returns ------- @@ -115,51 +121,35 @@ def fit_gev( - For stationary data the parameters are estimated using `scipy.stats.genextreme.fit`. - For nonstationary data, the parameters (including the linear location and - scale trend parameters are estimated by minimising - a penalised negative log-likelihood function. - - The `assert_good_fit` option ensures that the distribution fit is - accepted if the goodness of fit test `p-value > alpha` (i.e., accept - the null hypothesis). It will retry the fit using data[::2] to generate - an initial guess. + scale trend parameters are estimated by minimising a penalised negative + log-likelihood function. - The `covariate` must be numeric and have dimensions aligned with `data`. - If `pick_best_model` is a method, the relative goodness of fit method is used to determine if stationary or nonstationary parameters are returned. + - `assert_good_fit`: Return NaNs if the goodness of fit null hypothesis is + rejected (i.e., `p-value <= alpha`). + - `retry_fit`: retry the fit using data[::2] to generate an initial + guess (same fitstart method). + """ - kwargs = {k: v for k, v in locals().items() if k not in ["data", "covariate"]} - - def _assert_good_fit_1d(data, dparams, alpha, fit_kwargs): - """Test goodness of stationary GEV fit and retry if failed.""" - pvalue = check_gev_fit(data, dparams) - - if np.all(pvalue < alpha): - # Retry fit using alternative fitstart methods - warnings.warn("GEV fit failed. Retrying fitstart with data subset.") - _kwargs = fit_kwargs.copy() - _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) - _kwargs["stationary"] = True - dparams = _fit_1d(data, covariate, **_kwargs) - pvalue = check_gev_fit(data, dparams) - - # Return NaNs if the test still fails - if np.all(pvalue < alpha): - # Return NaNs - dparams = dparams * np.nan - warnings.warn("Data fit failed.") - return dparams + kwargs = { + k: v for k, v in locals().items() if k not in ["data", "covariate", "core_dim"] + } def _fit_1d( data, covariate, stationary, fitstart, - core_dim, loc1, scale1, + retry_fit, assert_good_fit, pick_best_model, alpha, method, + goodness_of_fit_kwargs, ): """Estimate distribution parameters.""" if np.all(~np.isfinite(data)): @@ -183,16 +173,38 @@ def _fit_1d( # Use genextreme to get stationary distribution parameters if stationary or pick_best_model: - if dparams_i is None: - dparams_i = genextreme.fit(data) + if fitstart is None: + dparams = genextreme.fit(data) else: dparams = genextreme.fit( data, dparams_i[0], loc=dparams_i[1], scale=dparams_i[2] ) dparams = np.array([i for i in dparams], dtype="float64") - if assert_good_fit: - dparams = _assert_good_fit_1d(data, dparams, alpha, kwargs) + if retry_fit or assert_good_fit: + pvalue = check_gev_fit(data, dparams, **goodness_of_fit_kwargs) + + if retry_fit and np.all(pvalue <= alpha): + # Retry fit using alternative fitstart methods + _kwargs = kwargs.copy() + _kwargs["fitstart"] = _fitstart_1d(data[::2], fitstart) + _kwargs["stationary"] = True + for k in ["retry_fit", "assert_good_fit", "pick_best_model"]: + _kwargs[k] = False # Avoids recursion + dparams_alt = _fit_1d(data, covariate, **_kwargs) + + # Test if the alternative fit is better + L1 = _gev_nllf([-dparams[0], *dparams[1:]], data) 
+ L2 = _gev_nllf([-dparams_alt[0], *dparams_alt[1:]], data) + if L2 < L1: + dparams = dparams_alt + pvalue = check_gev_fit(data, dparams, **goodness_of_fit_kwargs) + warnings.warn("Better fit estimate using data[::2].") + + if assert_good_fit and pvalue <= alpha: + # Return NaNs + dparams = dparams * np.nan + warnings.warn("Data fit failed.") if not stationary or pick_best_model: # Temporarily reverse shape sign (scipy uses different sign convention) @@ -210,7 +222,7 @@ def _fit_1d( # Minimise the negative log-likelihood function to get optimal dparams res = minimize( - nllf, + _gev_nllf, dparams_ns_i, args=(data, covariate), method=method, @@ -300,8 +312,8 @@ def penalised_sum(x): return total + penalty -def nllf(dparams, x, covariate=None): - """Penalised negative log-likelihood function. +def _gev_nllf(dparams, x, covariate=None): + """GEV penalised negative log-likelihood function. Parameters ---------- @@ -389,6 +401,16 @@ def _fitstart_1d(data, method): ----- - Use `scipy_fitstart` to reproduce the scipy fit in `fit_gev`. - The LMM shape sign is reversed for consistency with scipy.stats results. + scipy_fitstart: + >>> shape = skew(data) / 2 + >>> scale = np.std(data) / np.sqrt(6) + >>> location = np.mean(data) - (scale * (0.5772 + np.log(2))) + >>> dparams_i = (-shape, location, scale) # + or - shape? + + xclim_fitstart: + >>> scale = np.sqrt(6 * np.var(data)) / np.pi + >>> location = np.mean(data) - 0.57722 * scale + >>> dparams_i = [-0.1, location, scale] """ if method == "LMM": @@ -464,8 +486,8 @@ def _format_covariate(data, covariate, core_dim): return covariate -def check_gev_fit(data, dparams, core_dim=[], **kwargs): - """Test stationary GEV distribution goodness of fit. +def check_gev_fit(data, dparams, core_dim=[], test="ks", **kwargs): + """Perform a goodness of fit of GEV distribution with the given parameters. Parameters ---------- @@ -475,24 +497,35 @@ def check_gev_fit(data, dparams, core_dim=[], **kwargs): Shape, location and scale parameters core_dim : str, optional Data dimension to test over + test : {'ks', 'cvm'}, default 'ks' + Test to use for goodness of fit kwargs : dict, optional - Additional keyword arguments to pass to `goodness_of_fit`. + Additional keyword arguments to pass to the stats function. Returns ------- - pvalue : scipy.stats._fit.GoodnessOfFitResult.pvalue + pvalue : float Goodness of fit p-value + + Notes + ----- + - CvM is more likely to detect small discrepancies that may not matter + practically in large datasets. 
""" + stats_func = { + "ks": ks_1samp, + "cvm": cramervonmises, + } - def _goodness_of_fit(data, dparams, **kwargs): + def _fit_test_genextreme(data, dparams, **kwargs): """Test GEV goodness of fit.""" # Stationary parameters - shape, loc, scale = dparams + c, loc, scale = dparams - res = goodness_of_fit( - genextreme, + res = stats_func[test]( data, - known_params=dict(c=shape, loc=loc, scale=scale), + genextreme.cdf, + args=(c, loc, scale), **kwargs, ) return res.pvalue @@ -501,7 +534,7 @@ def _goodness_of_fit(data, dparams, **kwargs): core_dim = [core_dim] pvalue = apply_ufunc( - _goodness_of_fit, + _fit_test_genextreme, data, dparams, input_core_dims=[core_dim, ["dparams"]], @@ -565,8 +598,8 @@ def get_best_GEV_model_1d(data, dparams, dparams_ns, covariate, alpha, test): shape, loc, scale = dparams # Negative log-likelihood of stationary and nonstationary models - L1 = nllf([-shape, loc, scale], data) - L2 = nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) + L1 = _gev_nllf([-shape, loc, scale], data) + L2 = _gev_nllf([-dparams_ns[0], *dparams_ns[1:]], data, covariate) result = check_gev_relative_fit(data, L1, L2, test=test, alpha=alpha) if not result: @@ -701,6 +734,104 @@ def get_return_level(return_period, dparams=None, covariate=None, **kwargs): return return_level +def gev_confidence_interval( + data, + dparams=None, + return_period=None, + return_level=None, + bootstrap_method="non-parametric", + n_resamples=1000, + ci=0.95, + core_dim="time", + fit_kwargs={}, +): + """ + Bootstrapped confidence intervals for return periods or return levels. + + Parameters: + ----------- + + data : xarray.DataArray + Input data to fit GEV distribution + dparams : xarray.DataArray, optional + GEV distribution parameters. If None, the parameters are estimated. + return_period : float or xarray.DataArray, default None + Return period(s). Mutually exclusive with `return_level`. + return_level : float or xarray.DataArray, default None + Return level(s) Mutually exclusive with `return_period`. + bootstrap_method : {'parametric', 'non-parametric'}, default 'non-parametric' + Bootstrap method to use for resampling + n_resamples : int, optional + Number of bootstrap resamples to perform (default: 1000) + ci : float, optional + Confidence level (e.g., 0.95 for 95% confidence interval, default: 0.95) + core_dim : str, optional + The core dimension along which to apply GEV fitting (default: None, will auto-detect) + fit_kwargs : dict, optional + Additional keyword arguments to pass to `fit_gev` + + Returns: + -------- + ci_bounds : xarray.DataArray + Confidence intervals with lower and upper bounds along dim 'quantile' + """ + # todo: max_shape_ratio + # Replace core dim with the one from the fit_kwargs if it exists + core_dim = fit_kwargs.pop("core_dim", core_dim) + + rng = np.random.default_rng(seed=0) + if dparams is None: + dparams = fit_gev(data, core_dim=core_dim, **fit_kwargs) + shape, loc, scale = unpack_gev_params(dparams) + + # Generate random indices for resampling + if bootstrap_method == "parametric": + boot_data = apply_ufunc( + genextreme.rvs, + shape, + loc, + scale, + input_core_dims=[[], [], []], + output_core_dims=[["k", core_dim]], + kwargs=dict(size=(n_resamples, data[core_dim].size)), + vectorize=True, + dask="parallelized", + ) + boot_data = boot_data.transpose("k", core_dim, ...) 
+
+    elif bootstrap_method == "non-parametric":
+        resample_indices = rng.integers(
+            0, data[core_dim].size, (n_resamples, data[core_dim].size)
+        )
+        indexer = DataArray(resample_indices, dims=("k", core_dim))
+        boot_data = data.isel({core_dim: indexer})
+
+    # Fit GEV parameters to resampled data
+    gev_params_resampled = fit_gev(boot_data, core_dim=core_dim, **fit_kwargs)
+
+    if return_period is not None:
+        result = get_return_level(
+            return_period, gev_params_resampled, core_dim=core_dim
+        )
+    elif return_level is not None:
+        result = get_return_period(
+            return_level, gev_params_resampled, core_dim=core_dim
+        )
+
+    # Bounds of confidence intervals
+    ci = ci * 100  # Avoid rounding errors
+    q = (100 - ci) * 0.5 / 100
+
+    # Calculate confidence intervals from resampled percentiles
+    ci_bounds = result.quantile([q, 1 - q], dim="k")
+
+    ci_bounds.attrs = {
+        "long_name": "Confidence interval",
+        "description": f"{ci:g}% confidence interval ({n_resamples} resamples)",
+    }
+    return ci_bounds
+
+
 def gev_return_curve(
     data,
     event_value,
@@ -708,6 +839,7 @@ def gev_return_curve(
     n_bootstraps=1000,
     max_return_period=4,
     max_shape_ratio=None,
+    ci=0.95,
     **fit_kwargs,
 ):
     """Return x and y data for a GEV return period curve.
@@ -733,7 +865,9 @@ def gev_return_curve(
     dparams = fit_gev(data, **fit_kwargs)
     shape, loc, scale = unpack_gev_params(dparams)
 
-    curve_return_periods = np.logspace(0, max_return_period, num=10000)
+    curve_return_periods = DataArray(
+        np.logspace(0, max_return_period, num=10000), dims="ari"
+    )
     curve_probabilities = 1.0 / curve_return_periods
     curve_values = genextreme.isf(curve_probabilities, shape, loc, scale)
 
@@ -748,24 +882,21 @@ def gev_return_curve(
             boot_data = genextreme.rvs(shape, loc=loc, scale=scale, size=len(data))
         elif bootstrap_method == "non-parametric":
             boot_data = rng.choice(data, size=data.shape, replace=True)
-        boot_shape, boot_loc, boot_scale = fit_gev(boot_data, fitstart="scipy_subet")
+
+        boot_dparams = fit_gev(boot_data, **fit_kwargs)
         if max_shape_ratio:
-            shape_ratio = abs(boot_shape) / abs(shape)
+            shape_ratio = abs(boot_dparams[0]) / abs(shape)
             if shape_ratio > max_shape_ratio:
                 continue
-        boot_value = genextreme.isf(
-            curve_probabilities, boot_shape, boot_loc, boot_scale
-        )
-        boot_values = np.vstack((boot_values, boot_value))
 
-        boot_event_probability = genextreme.sf(
-            event_value, boot_shape, loc=boot_loc, scale=boot_scale
-        )
-        boot_event_return_period = 1.0 / boot_event_probability
+        boot_value = get_return_level(curve_return_periods, boot_dparams)
+        boot_values = np.vstack((boot_values, boot_value))
+        boot_event_return_period = get_return_period(event_value, boot_dparams)
         boot_event_return_periods.append(boot_event_return_period)
 
-    curve_values_lower_ci = np.quantile(boot_values, 0.025, axis=0)
-    curve_values_upper_ci = np.quantile(boot_values, 0.975, axis=0)
+    q = (100 - ci * 100) * 0.5 / 100  # Quantile for lower and upper bounds
+    curve_values_lower_ci = np.quantile(boot_values, q, axis=0)
+    curve_values_upper_ci = np.quantile(boot_values, 1 - q, axis=0)
     curve_data = (
         curve_return_periods,
         curve_values,
@@ -777,8 +908,8 @@ def gev_return_curve(
     boot_event_return_periods = boot_event_return_periods[
         np.isfinite(boot_event_return_periods)
     ]
-    event_return_period_lower_ci = np.quantile(boot_event_return_periods, 0.025)
-    event_return_period_upper_ci = np.quantile(boot_event_return_periods, 0.975)
+    event_return_period_lower_ci = np.quantile(boot_event_return_periods, q)
+    event_return_period_upper_ci = np.quantile(boot_event_return_periods, 1 - q)
     event_data = (
         event_return_period,
         event_return_period_lower_ci,
         event_return_period_upper_ci,
     )
 
     return curve_data, event_data
 
@@ -1180,11 +1311,17 @@ def _parse_command_line():
         ),
         help="Initial guess method (or estimate) of the GEV parameters",
     )
+    parser.add_argument(
+        "--retry_fit",
+        action="store_true",
+        default=False,
+        help="Return NaNs if fit doesn't pass the goodness of fit test",
+    )
     parser.add_argument(
         "--assert_good_fit",
         action="store_true",
         default=False,
-        help="Test fit goodness",
+        help="Return NaNs if fit doesn't pass the goodness of fit test",
     )
     parser.add_argument(
         "--pick_best_model",
@@ -1216,12 +1353,6 @@ def _parse_command_line():
         action=general_utils.store_dict,
         help="Minimum lead time file",
     )
-    # parser.add_argument(
-    #     "--confidence_interval",
-    #     type=float,
-    #     default=0.95,
-    #     help="Confidence interval e.g., --confidence_interval 0.95",
-    # )
     parser.add_argument(
         "--ensemble_dim",
         type=str,
@@ -1303,6 +1434,7 @@ def _main():
         stationary=args.stationary,
         fitstart=args.fitstart,
         covariate=covariate,
+        retry_fit=args.retry_fit,
         assert_good_fit=args.assert_good_fit,
         pick_best_model=args.pick_best_model,
     )
diff --git a/unseen/tests/test_eva.py b/unseen/tests/test_eva.py
index d30f13e..e436f3c 100644
--- a/unseen/tests/test_eva.py
+++ b/unseen/tests/test_eva.py
@@ -32,6 +32,7 @@
     "fitstart",
     [
         [1, -5, 1],
+        None,
         "LMM",
         "scipy_fitstart",
         "scipy",
@@ -54,23 +55,13 @@ def test_fit_gev_1d_fitstart(example_da_gev, fitstart):
     npt.assert_allclose(dparams, dparams_i, rtol=rtol)
 
 
-@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True)
-def test_fit_gev_1d_assert_good_fit(example_da_gev):
-    """Run stationary GEV fit using 1D array & fit_goodness_test."""
-    data, dparams_i = example_da_gev
-    dparams = fit_gev(data, stationary=True, assert_good_fit=True)
-    # Check fitted params match params used to create data
-    npt.assert_allclose(dparams, dparams_i, rtol=0.3)
-
-
-# todo FAILED unseen/tests/test_eva.py::test_fit_gev_3d[xarray] - AssertionError:
 @pytest.mark.parametrize("example_da_gev_3d", ["xarray", "dask"], indirect=True)
 def test_fit_gev_3d(example_da_gev_3d):
     """Run stationary GEV fit using 3D array & check results."""
     data, dparams_i = example_da_gev_3d
     dparams = fit_gev(data, stationary=True, fitstart="LMM", core_dim="time")
     # Check fitted params match params used to create data
-    npt.assert_allclose(dparams, dparams_i, rtol=0.4)
+    npt.assert_allclose(dparams, dparams_i, rtol=rtol)
 
 
 @pytest.mark.parametrize("example_da_gev", ["xarray", "dask"], indirect=True)
@@ -138,29 +129,34 @@ def test_fit_ns_gev_3d(example_da_gev_3d):
     assert np.all(dparams.isel(dparams=2) > 0)  # Positive trend in location
 
 
-@pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True)
-def test_fit_ns_gev_1d_pick_best_model_bic_trend(example_da_gev):
-    """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params."""
-    data, _ = example_da_gev
-    # Add a large positive linear trend
-    data = add_example_gev_trend(data)
-    data = add_example_gev_trend(data)
-    covariate = xr.DataArray(np.arange(data.time.size), dims="time")
+@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True)
+def test_fit_gev_1d_retry_fit(example_da_gev):
+    """Run stationary GEV fit using 1D array & retry_fit."""
+    data, dparams_i = example_da_gev
+    # Set large alpha to force any fit considered bad
+    dparams = fit_gev(data, stationary=True, retry_fit=True, alpha=1)
+    # Check fitted params match params used to create data
+    npt.assert_allclose(dparams,
dparams_i, rtol=rtol) - dparams = fit_gev( - data, - stationary=False, - core_dim="time", - covariate=covariate, - pick_best_model="bic", - ) - assert np.all(dparams[2] > 0) # Positive trend in location + +@pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) +def test_fit_gev_1d_assert_good_fit(example_da_gev): + """Run stationary GEV fit using 1D array & assert_good_fit.""" + data, _ = example_da_gev + # Set large alpha to force any fit considered bad + dparams = fit_gev(data, stationary=True, assert_good_fit=True, alpha=1) + assert all(np.isnan(dparams)) @pytest.mark.parametrize("example_da_gev", ["xarray"], indirect=True) -def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): - """Run non-stationary GEV fit & check 'BIC' test returns stationary params.""" +@pytest.mark.parametrize("trend", [False, True]) +def test_fit_ns_gev_1d_pick_best_model_bic(example_da_gev, trend): + """Run non-stationary GEV fit & check 'BIC' test returns nonstationary params.""" data, _ = example_da_gev + if trend: + # Add a large positive linear trend + data = add_example_gev_trend(data) + data = add_example_gev_trend(data) covariate = xr.DataArray(np.arange(data.time.size), dims="time") dparams = fit_gev( @@ -170,8 +166,11 @@ def test_fit_ns_gev_1d_pick_best_model_bic_no_trend(example_da_gev): covariate=covariate, pick_best_model="bic", ) - assert np.all(dparams[2] == 0) # No trend in location - assert np.all(dparams[4] == 0) # No trend in scale + if trend: + assert np.all(dparams[2] > 0) # Positive trend in location + else: + assert np.all(dparams[2] == 0) # No trend in location + assert np.all(dparams[4] == 0) # No trend in scale @pytest.mark.parametrize("example_da_gev", ["xarray", "numpy", "dask"], indirect=True) From 123ac98a4d1c89b0add4501637672f2b20091c36 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:33:57 +1100 Subject: [PATCH 3/4] Update eva.py and timeseries plot --- unseen/eva.py | 127 +++++++++++++++++++++++++--------------- unseen/general_utils.py | 7 ++- 2 files changed, 84 insertions(+), 50 deletions(-) diff --git a/unseen/eva.py b/unseen/eva.py index 3367883..18d6295 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -64,7 +64,7 @@ def fit_gev( fitstart="LMM", loc1=0, scale1=0, - retry_fit=False, + retry_fit=True, assert_good_fit=False, pick_best_model=False, alpha=0.05, @@ -83,20 +83,21 @@ def fit_gev( Fit as a stationary GEV using `fit_stationary_gev` covariate : array_like, optional A nonstationary covariate array with the same `core_dim` as `data` - fitstart : {array-like, 'LMM', 'MM', 'scipy', 'scipy_fitstart', + fitstart : {array-like, 'LMM', 'scipy_fitstart', 'scipy', 'scipy_subset', 'xclim_fitstart', 'xclim'}, default 'scipy_fitstart' Initial guess method/estimate of the shape, loc and scale parameters loc1, scale1 : float or None, default 0 Initial guess of trend parameters. If None, the trend is fixed at zero retry_fit : bool, default True - Retry fit using a fitstart(data[::2]) if the fit does not pass the - goodness of fit test (p-value > alpha). + Retry fit with different initial estimate if the fit does not pass the + goodness of fit test at the `alpha` level. assert_good_fit : bool, default False Return NaNs if data fails a GEV goodness of fit test at `alpha` level. + Mutually exclusive with `stationary=False` and `pick_best_model`. pick_best_model : {False, 'lrt', 'aic', 'bic'}, default False Method to test relative fit of stationary and nonstationary models. 
- Do not use if you don't want nonstationary parameters. The output will - have GEV 5 parameters even if stationary is True. + The output will have GEV 5 parameters even if stationary is True. + Mutually exclusive with `stationary` and/or `assert_good_fit`. alpha : float, default 0.05 Fit test p-value threshold for stationary fit (relative/goodness of fit) method : {'Nelder-Mead', 'L-BFGS-B', 'TNC', 'SLSQP', 'Powell', @@ -116,23 +117,23 @@ def fit_gev( Notes ----- - Use `unpack_gev_params` to get the shape, location and scale parameters - as a separate array. If nonstationary, the output will still be three + as a separate array. If nonstationary, the output will also have three parameters that have an extra covariate dimension. - For stationary data the parameters are estimated using - `scipy.stats.genextreme.fit`. + `scipy.stats.genextreme.fit` with initial guess based on `fitstart` (use + fitstart 'scipy_fitstart' or None to use scipy defaults). - For nonstationary data, the parameters (including the linear location and - scale trend parameters are estimated by minimising a penalised negative + scale trend parameters) are estimated by minimising a penalised negative log-likelihood function. - The `covariate` must be numeric and have dimensions aligned with `data`. - - If `pick_best_model` is a method, the relative goodness of fit method is - used to determine if stationary or nonstationary parameters are returned. - - `assert_good_fit`: Return NaNs if the goodness of fit null hypothesis is - rejected (i.e., `p-value <= alpha`). - `retry_fit`: retry the fit using data[::2] to generate an initial guess (same fitstart method). - - + - If `pick_best_model` is a method, the relative goodness of fit method is + used to determine if stationary or nonstationary parameters are returned. + If the stationary fit is better, the nonstationary parameters are returned + with zero trends (see `check_gev_relative_fit` and `get_best_GEV_model_1d`). """ + kwargs = { k: v for k, v in locals().items() if k not in ["data", "covariate", "core_dim"] } @@ -206,7 +207,7 @@ def _fit_1d( dparams = dparams * np.nan warnings.warn("Data fit failed.") - if not stationary or pick_best_model: + if not stationary: # Temporarily reverse shape sign (scipy uses different sign convention) dparams_ns_i = [-dparams_i[0], dparams_i[1], loc1, dparams_i[2], scale1] @@ -242,6 +243,13 @@ def _fit_1d( return dparams + if stationary and pick_best_model: + raise ValueError( + f"Stationary must be false if pick_best_model={pick_best_model}." 
+ ) + if assert_good_fit and pick_best_model: + raise ValueError("pick_best_model and assert_good_fit are mutually exclusive.") + if covariate is not None: covariate = _format_covariate(data, covariate, core_dim) else: @@ -255,7 +263,7 @@ def _fit_1d( # Covariate is a 1D array input_core_dims = [[core_dim], []] - n_params = 5 if (not stationary or pick_best_model) else 3 + n_params = 3 if stationary else 5 # Fit data to distribution parameters dparams = apply_ufunc( _fit_1d, @@ -269,8 +277,9 @@ def _fit_1d( output_dtypes=["float64"], dask_gufunc_kwargs={"output_sizes": {"dparams": n_params}}, ) + + # Format output (consistent with xclim) if isinstance(data, DataArray): - # Format output (consistent with xclim) if n_params == 3: dparams.coords["dparams"] = ["c", "loc", "scale"] else: @@ -417,7 +426,6 @@ def _fitstart_1d(data, method): # L-moments method dparams_i = distr.gev.lmom_fit(data) dparams_i = list(dparams_i.values()) - dparams_i[0] = -dparams_i[0] elif method == "scipy_fitstart": # Moments method? @@ -734,6 +742,28 @@ def get_return_level(return_period, dparams=None, covariate=None, **kwargs): return return_level +def aep_to_ari(aep): + """Convert from aep (%) to ari (years) + + Details: http://www.bom.gov.au/water/designRainfalls/ifd-arr87/glossary.shtml + Stolen from https://github.com/climate-innovation-hub/frequency-analysis/blob/master/eva.py + """ + + assert aep < 100, "aep to be expressed as a percentage (must be < 100)" + aep = aep / 100 + + return 1 / (-np.log(1 - aep)) + + +def ari_to_aep(ari): + """Convert from ari (years) to aep (%) + + Details: http://www.bom.gov.au/water/designRainfalls/ifd-arr87/glossary.shtml + Stolen from https://github.com/climate-innovation-hub/frequency-analysis/blob/master/eva.py + """ + return ((np.exp(1 / ari) - 1) / np.exp(1 / ari)) * 100 + + def gev_confidence_interval( data, dparams=None, @@ -775,7 +805,7 @@ def gev_confidence_interval( ci_bounds : xarray.DataArray Confidence intervals with lower and upper bounds along dim 'quantile' """ - # todo: max_shape_ratio + # todo: add max_shape_ratio # Replace core dim with the one from the fit_kwargs if it exists core_dim = fit_kwargs.pop("core_dim", core_dim) @@ -800,6 +830,7 @@ def gev_confidence_interval( boot_data = boot_data.transpose("k", core_dim, ...) 
elif bootstrap_method == "non-parametric": + # todo: replace with rng.choice resample_indices = rng.integers( 0, data[core_dim].size, (n_resamples, data[core_dim].size) ) @@ -1288,14 +1319,20 @@ def _parse_command_line(): type=str, nargs="*", default=["ensemble", "init_date", "lead_time"], - help="Dimensions to stack", + help="Dimensions to stack", # todo: test this ) parser.add_argument("--core_dim", type=str, default="time", help="Core dimension") parser.add_argument( "--stationary", - type=bool, + action="store_true", default=True, - help="Fit nonstationary GEV distribution", + help="Fit stationary GEV distribution", + ) + parser.add_argument( + "--nonstationary", + action="store_true", + default=False, + help="Fit non-stationary GEV distribution", ) parser.add_argument( "--fitstart", @@ -1307,7 +1344,7 @@ def _parse_command_line(): "scipy_subset", "xclim_MLE", "xclim", - ["shape", "loc", "scale"], + ["shape", "loc", "scale"], # todo: test this ), help="Initial guess method (or estimate) of the GEV parameters", ) @@ -1315,7 +1352,7 @@ def _parse_command_line(): "--retry_fit", action="store_true", default=False, - help="Return NaNs if fit doesn't pass the goodness of fit test", + help="Retry fit if it doesn't pass the goodness of fit test", ) parser.add_argument( "--assert_good_fit", @@ -1325,8 +1362,7 @@ def _parse_command_line(): ) parser.add_argument( "--pick_best_model", - type=str, - default=None, + default=False, help="Relative fit test to pick stationary or nonstationary parameters", ) parser.add_argument( @@ -1341,7 +1377,7 @@ def _parse_command_line(): ) parser.add_argument( "--covariate_file", type=str, default=None, help="Covariate file" - ) + ) # todo: test this parser.add_argument( "--min_lead", default=None, help="Minimum lead time (int or filename)" ) @@ -1351,7 +1387,7 @@ def _parse_command_line(): nargs="*", default={}, action=general_utils.store_dict, - help="Minimum lead time file", + help="Keyword arguments for opening min_lead file", ) parser.add_argument( "--ensemble_dim", @@ -1372,33 +1408,30 @@ def _parse_command_line(): help="Name of lead time dimension", ) parser.add_argument( - "--output_chunks", + "--file_kwargs", type=str, nargs="*", - action=general_utils.store_dict, default={}, - help="Output chunks", - ) - parser.add_argument( - "--dask_config", type=str, help="YAML file specifying dask client configuration" + action=general_utils.store_dict, + help="Keyword arguments for opening the data file", ) args = parser.parse_args() - return args def _main(): - """Run the command line program.""" + """Run the command line program to save GEV distribution parameters.""" args = _parse_command_line() + args.stationary = False if args.nonstationary else True - ds = fileio.open_dataset(args.file, variables=[args.var]) + ds = fileio.open_dataset(args.file, **args.file_kwargs) if args.covariate_file is not None: - # Add covariate to dataset (to ensure all operations are aligned) ds_covariate = fileio.open_dataset( args.covariate_file, variables=[args.covariate] ) + # Add covariate to dataset (to ensure all operations are aligned) ds[args.covariate] = ds_covariate[args.covariate] # Filter data by reference time period @@ -1420,10 +1453,10 @@ def _main(): # Stack dimensions along new "sample" dimension if all([dim in ds[args.var].dims for dim in args.stack_dims]): - ds = ds.stack(**{"sample": args.stack_dims}) + ds = ds.stack(**{"sample": args.stack_dims}, create_index=False) args.core_dim = "sample" - if not args.stationary: + if args.nonstationary: covariate = 
_format_covariate(ds[args.var], ds[args.covariate], args.core_dim) else: covariate = None @@ -1440,20 +1473,20 @@ def _main(): ) # Format outfile - dparams = dparams.to_dataset() + dparams = dparams.to_dataset(name=args.var) # Add the covariate variable - if not args.stationary or args.pick_best_model: - dparams[args.covariate] = covariate + if args.nonstationary: + dparams["covariate"] = covariate + # Add metadata + dparams.attrs = ds.attrs infile_logs = {args.file: ds.attrs["history"]} if isinstance(args.min_lead, str): infile_logs[args.min_lead] = ds_min_lead.attrs["history"] dparams.attrs["history"] = fileio.get_new_log(infile_logs=infile_logs) - if args.output_chunks: - dparams = dparams.chunk(args.output_chunks) - + # Save to file if "zarr" in args.outfile: fileio.to_zarr(dparams, args.outfile) else: diff --git a/unseen/general_utils.py b/unseen/general_utils.py index a6f42db..6c84895 100644 --- a/unseen/general_utils.py +++ b/unseen/general_utils.py @@ -164,12 +164,10 @@ def plot_timeseries_scatter( units = "" if ax is None: - fig, ax = plt.subplots(1, 1, figsize=(10, 4)) + fig, ax = plt.subplots(1, 1, figsize=(12, 5)) if title is not None: ax.set_title(title, loc="left") - # Plot ensemble data - ax.scatter(da[time_dim], da, s=3, c="lightskyblue", label=label) # Plot observed data if da_obs is not None: ax.scatter( @@ -179,7 +177,10 @@ def plot_timeseries_scatter( c="k", marker="x", label=obs_label, + zorder=10, ) + # Plot ensemble data + ax.scatter(da[time_dim], da, s=5, c="deepskyblue", label=label) ax.set_ylabel(units) ax.set_xmargin(1e-2) From 27f7e092d3693b6c81db2b41ebd76b055ff0f639 Mon Sep 17 00:00:00 2001 From: Annette Stellema <40450353+stellema@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:35:44 +1100 Subject: [PATCH 4/4] Update unseen/eva.py Co-authored-by: Damien Irving --- unseen/eva.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unseen/eva.py b/unseen/eva.py index 18d6295..940f9c0 100644 --- a/unseen/eva.py +++ b/unseen/eva.py @@ -71,7 +71,7 @@ def fit_gev( method="Nelder-Mead", goodness_of_fit_kwargs=dict(test="ks"), ): - """Estimate stationary or nonstationary GEV distributionß parameters. + """Estimate stationary or nonstationary GEV distribution parameters. Parameters ----------
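
For reference, a minimal usage sketch of the `eva` API as modified by this patch series (assumes a build of `unseen` with these patches applied; the synthetic GEV parameters, sample size and return periods below are arbitrary and for illustration only):

    import xarray as xr
    from scipy.stats import genextreme
    from unseen import eva

    # Synthetic annual-maximum style data drawn from a known GEV distribution
    data = xr.DataArray(
        genextreme.rvs(0.2, loc=5, scale=2, size=500, random_state=0), dims="time"
    )

    # Stationary fit with an L-moments initial guess (fitstart="LMM" is the default)
    dparams = eva.fit_gev(data, stationary=True, fitstart="LMM", core_dim="time")

    # Return level of a 1-in-100-year event and return period of the sample maximum
    level_100 = eva.get_return_level(100, dparams)
    ari_max = eva.get_return_period(data.max(), dparams)

    # Bootstrapped 95% confidence interval for the 100-year return level
    ci_bounds = eva.gev_confidence_interval(
        data, dparams=dparams, return_period=100, n_resamples=1000, core_dim="time"
    )

    # AEP/ARI conversion helpers added in patch 3: a 63.21% AEP corresponds to a
    # ~1-year ARI, since 1 / -ln(1 - 0.6321) is approximately 1.0
    print(eva.aep_to_ari(63.21), eva.ari_to_aep(1.0))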