From 4081b5b9211a36450d5d285fb1363aa5627dbc67 Mon Sep 17 00:00:00 2001 From: "Oriol (ZBook)" Date: Mon, 24 Feb 2020 21:28:26 +0100 Subject: [PATCH 01/19] start rewriting benchmarks --- asv.conf.json => asv_benckmarks/asv.conf.json | 10 +- .../benchmarks}/__init__.py | 0 asv_benckmarks/benchmarks/benchmarks.py | 164 +++++++ benchmarks/benchmarks.py | 401 ------------------ 4 files changed, 169 insertions(+), 406 deletions(-) rename asv.conf.json => asv_benckmarks/asv.conf.json (96%) rename {benchmarks => asv_benckmarks/benchmarks}/__init__.py (100%) create mode 100644 asv_benckmarks/benchmarks/benchmarks.py delete mode 100644 benchmarks/benchmarks.py diff --git a/asv.conf.json b/asv_benckmarks/asv.conf.json similarity index 96% rename from asv.conf.json rename to asv_benckmarks/asv.conf.json index 0b87e0e7c0..35eac476c4 100644 --- a/asv.conf.json +++ b/asv_benckmarks/asv.conf.json @@ -11,7 +11,7 @@ // The URL or local path of the source code repository for the // project being benchmarked - "repo": ".", + "repo": "..", // The Python project's subdirectory in your repo. If missing or // the empty string, the project is assumed to be located at the root @@ -47,7 +47,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.6","3.7","3.8"], + "pythons": ["3.8"], // The list of conda channel names to be searched for benchmark // dependency packages in the specified order @@ -64,9 +64,9 @@ // followed by the pip installed packages). // "matrix": { // test with and without six installed - "numba": ["",null], - "numpy":[""], // emcee is only available for install with pip. - }, + "numba": ["", null], + // "numpy":[""], // emcee is only available for install with pip. + }, // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional diff --git a/benchmarks/__init__.py b/asv_benckmarks/benchmarks/__init__.py similarity index 100% rename from benchmarks/__init__.py rename to asv_benckmarks/benchmarks/__init__.py diff --git a/asv_benckmarks/benchmarks/benchmarks.py b/asv_benckmarks/benchmarks/benchmarks.py new file mode 100644 index 0000000000..77330a339d --- /dev/null +++ b/asv_benckmarks/benchmarks/benchmarks.py @@ -0,0 +1,164 @@ +# Write the benchmarking functions here. +# See "Writing benchmarks" in the airspeed velocity docs for more information. +# https://asv.readthedocs.io/en/stable/ +import numpy as np +from numpy import newaxis +from scipy.stats import circstd +from scipy.sparse import coo_matrix +import scipy.signal as ss +import warnings +from arviz.plots.plot_utils import _fast_kde, _fast_kde_2d +from arviz import Numba + + +class Hist: + def time_histogram(self): + try: + data = np.random.rand(10000, 1000) + import numba + + @numba.njit(cache=True) + def _hist(data): + return np.histogram(data, bins=100) + + return _hist(data) + except ImportError: + data = np.random.rand(10000, 1000) + return np.histogram(data, bins=100) + + +class Variance: + def time_variance(self): + try: + data = np.random.randn(10000, 10000) + import numba + + @numba.njit(cache=True) + def stats_variance_1d(data, ddof=0): + a, b = 0, 0 + for i in data: + a = a + i + b = b + i * i + var = b / (len(data)) - ((a / (len(data))) ** 2) + var = var * (len(data) / (len(data) - ddof)) + return var + + def stats_variance_2d(data, ddof=0, axis=1): + a, b = data.shape + if axis == 1: + var = np.zeros(a) + for i in range(a): + var[i] = stats_variance_1d(data[i], ddof=ddof) + else: + var = np.zeros(b) + for i in range(b): + var[i] = stats_variance_1d(data[:, i], ddof=ddof) + return var + + return stats_variance_2d(data) + except ImportError: + data = np.random.randn(10000, 10000) + return np.var(data, axis=1) + + +class CircStd: + def time_circ_std(self): + try: + data = np.random.randn(10000, 1000) + import numba + + def _circfunc(samples, high, low): + samples = np.asarray(samples) + if samples.size == 0: + return np.nan, np.nan + return samples, _angle(samples, low, high, np.pi) + + @numba.vectorize(nopython=True) + def _angle(samples, low, high, pi=np.pi): + ang = (samples - low) * 2.0 * pi / (high - low) + return ang + + def _circular_standard_deviation(samples, high=2 * np.pi, low=0, axis=None): + pi = np.pi + samples, ang = _circfunc(samples, high, low) + S = np.sin(ang).mean(axis=axis) + C = np.cos(ang).mean(axis=axis) + R = np.hypot(S, C) + return ((high - low) / 2.0 / pi) * np.sqrt(-2 * np.log(R)) + + return _circular_standard_deviation(data) + except ImportError: + data = np.random.randn(10000, 1000) + return circstd(data) + + +class Fast_Kde_1d: + params = [(True, False), (10**3, 10**5, 10**6)] + param_names = ("Numba", "n") + + def setup(self, numba_flag, n): + self.x = np.random.randn(n, 10) + + def time_fast_kde_normal(self, numba_flag, n): + if numba_flag: + Numba.enable_numba() + else: + Numba.disable_numba() + + _fast_kde(self.x) + +class Fast_KDE_2d: + params = [(True, False), (10**3, 10**5)] + param_names = ("Numba", "n") + + def setup(self, numba_flag, n): + self.x = np.random.randn(n, 10) + self.y = np.random.randn(n, 10) + + def time_fast_kde_2d(self, numba_flag, n): + if numba_flag: + Numba.enable_numba() + else: + Numba.disable_numba() + + _fast_kde_2d(self.x, self.y) + +class Data: + def time_2d_custom(self): + data = np.random.randn(100000) + + def two_de(data): + if not isinstance(data, np.ndarray): + return np.atleast_2d(data) + if data.ndim == 0: + result = data.reshape(1, 1) + elif data.ndim == 1: + result = data[newaxis, :] + else: + result = data + return result + + return two_de(data) + + def time_numpy_2d(self): + data = np.random.randn(100000) + return np.atleast_2d(data) + + def time_1d_custom(self): + x = np.random.randn(100000).tolist() + + def one_de(x): + """Jitting numpy atleast_1d.""" + if not isinstance(x, np.ndarray): + return np.atleast_1d(x) + if x.ndim == 0: + result = x.reshape(1) + else: + result = x + return result + + return one_de(x) + + def time_numpy_1d(self): + x = np.random.randn(100000).tolist() + return np.atleast_1d(x) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py deleted file mode 100644 index f99c72be94..0000000000 --- a/benchmarks/benchmarks.py +++ /dev/null @@ -1,401 +0,0 @@ -# Write the benchmarking functions here. -# See "Writing benchmarks" in the airspeed velocity docs for more information. -# https://asv.readthedocs.io/en/stable/ -import numpy as np -from numpy import newaxis -from scipy.stats import circstd -from scipy.sparse import coo_matrix -import scipy.signal as ss -import warnings - - -class Hist: - def time_histogram(self): - try: - data = np.random.rand(10000, 1000) - import numba - - @numba.njit(cache=True) - def _hist(data): - return np.histogram(data, bins=100) - - return _hist(data) - except ImportError: - data = np.random.rand(10000, 1000) - return np.histogram(data, bins=100) - - -class Variance: - def time_variance(self): - try: - data = np.random.randn(10000, 10000) - import numba - - @numba.njit(cache=True) - def stats_variance_1d(data, ddof=0): - a, b = 0, 0 - for i in data: - a = a + i - b = b + i * i - var = b / (len(data)) - ((a / (len(data))) ** 2) - var = var * (len(data) / (len(data) - ddof)) - return var - - def stats_variance_2d(data, ddof=0, axis=1): - a, b = data.shape - if axis == 1: - var = np.zeros(a) - for i in range(a): - var[i] = stats_variance_1d(data[i], ddof=ddof) - else: - var = np.zeros(b) - for i in range(b): - var[i] = stats_variance_1d(data[:, i], ddof=ddof) - return var - - return stats_variance_2d(data) - except ImportError: - data = np.random.randn(10000, 10000) - return np.var(data, axis=1) - - -class CircStd: - def time_circ_std(self): - try: - data = np.random.randn(10000, 1000) - import numba - - def _circfunc(samples, high, low): - samples = np.asarray(samples) - if samples.size == 0: - return np.nan, np.nan - return samples, _angle(samples, low, high, np.pi) - - @numba.vectorize(nopython=True) - def _angle(samples, low, high, pi=np.pi): - ang = (samples - low) * 2.0 * pi / (high - low) - return ang - - def _circular_standard_deviation(samples, high=2 * np.pi, low=0, axis=None): - pi = np.pi - samples, ang = _circfunc(samples, high, low) - S = np.sin(ang).mean(axis=axis) - C = np.cos(ang).mean(axis=axis) - R = np.hypot(S, C) - return ((high - low) / 2.0 / pi) * np.sqrt(-2 * np.log(R)) - - return _circular_standard_deviation(data) - except ImportError: - data = np.random.randn(10000, 1000) - return circstd(data) - - -class Fast_Kde_1d: - def time_fast_kde(self): - - try: - x = np.random.randn(10000, 100) - import numba - - def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - if x.size == 0: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - len_x = len(x) - n_points = 200 if (xmin or xmax) is None else 500 - - if xmin is None: - xmin = np.min(x) - if xmax is None: - xmax = np.max(x) - - assert np.min(x) >= xmin - assert np.max(x) <= xmax - - log_len_x = np.log(len_x) * bw - - n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) - if n_bins < 2: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - d_x = (xmax - xmin) / (n_bins - 1) - grid = _histogram(x, n_bins, range_hist=(xmin, xmax)) - - scotts_factor = len_x ** (-0.2) - kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) - kernel = ss.gaussian(kern_nx, scotts_factor * log_len_x) - - npad = min(n_bins, 2 * kern_nx) - grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) - density = ss.convolve(grid, kernel, mode="same", method="direct")[ - npad : npad + n_bins - ] - norm_factor = len_x * d_x * (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 - - density /= norm_factor - - if cumulative: - density = density.cumsum() / density.sum() - - return density, xmin, xmax - - @numba.njit(cache=True) - def _histogram(x, n_bins, range_hist=None): - grid, _ = np.histogram(x, bins=n_bins, range=range_hist) - return grid - - return _fast_kde(x) - - except ImportError: - - x = np.random.randn(10000, 100) - - def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - if x.size == 0: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - len_x = len(x) - n_points = 200 if (xmin or xmax) is None else 500 - - if xmin is None: - xmin = np.min(x) - if xmax is None: - xmax = np.max(x) - - assert np.min(x) >= xmin - assert np.max(x) <= xmax - - log_len_x = np.log(len_x) * bw - - n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) - if n_bins < 2: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - d_x = (xmax - xmin) / (n_bins - 1) - grid = _histogram(x, n_bins, range_hist=(xmin, xmax)) - - scotts_factor = len_x ** (-0.2) - kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) - kernel = ss.gaussian(kern_nx, scotts_factor * log_len_x) - - npad = min(n_bins, 2 * kern_nx) - grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) - density = ss.convolve(grid, kernel, mode="same", method="direct")[ - npad : npad + n_bins - ] - norm_factor = len_x * d_x * (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 - - density /= norm_factor - - if cumulative: - density = density.cumsum() / density.sum() - - return density, xmin, xmax - - def _histogram(x, n_bins, range_hist=None): - grid, _ = np.histogram(x, bins=n_bins, range=range_hist) - return grid - - return _fast_kde(x) - - -class Fast_KDE_2d: - def time_fast_kde_2d(self): - try: - x = np.random.randn(10000, 100) - y = np.random.randn(10000, 100) - import numba - - def _cov_1d(x): - x = x - x.mean(axis=0) - fact = x.shape[0] - 1 - by_hand = np.dot(x.T, x.conj()) / fact - return np.array(by_hand) - - def _cov(data): - if data.ndim == 1: - return _cov_1d(data) - elif data.ndim == 2: - x = data.astype(float) - avg, _ = np.average(x, axis=1, weights=None, returned=True) - fact = x.shape[1] - 1 - if fact <= 0: - warnings.warn( - "Degrees of freedom <= 0 for slice", RuntimeWarning, stacklevel=2 - ) - fact = 0.0 - x -= avg[:, None] - x_t = x.T - c_c = _dot(x, x_t.conj()) - c_c *= np.true_divide(1, fact) - return c_c.squeeze() - else: - raise ValueError("{} dimension arrays are not supported".format(data.ndimn)) - - @numba.njit(cache=True) - def _dot(x, y): - return np.dot(x, y) - - def _stack(x, y): - return np.vstack((x, y)) - - def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - y = np.asarray(y, dtype=float) - y = y[np.isfinite(y)] - - xmin, xmax = x.min(), x.max() - ymin, ymax = y.min(), y.max() - - len_x = len(x) - weights = np.ones(len_x) - n_x, n_y = gridsize - - d_x = (xmax - xmin) / (n_x - 1) - d_y = (ymax - ymin) / (n_y - 1) - - xyi = _stack(x, y).T - xyi -= [xmin, ymin] - xyi /= [d_x, d_y] - xyi = np.floor(xyi, xyi).T - - scotts_factor = len_x ** (-1 / 6) - cov = _cov(xyi) - std_devs = np.diag(cov ** 0.5) - kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) - - inv_cov = np.linalg.inv(cov * scotts_factor ** 2) - - x_x = np.arange(kern_nx) - kern_nx / 2 - y_y = np.arange(kern_ny) - kern_ny / 2 - x_x, y_y = np.meshgrid(x_x, y_y) - - kernel = _stack(x_x.flatten(), y_y.flatten()) - kernel = _dot(inv_cov, kernel) * kernel - kernel = np.exp(-kernel.sum(axis=0) / 2) - kernel = kernel.reshape((int(kern_ny), int(kern_nx))) - - boundary = "wrap" if circular else "symm" - - grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray() - grid = ss.convolve2d(grid, kernel, mode="same", boundary=boundary) - - norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2) - norm_factor = len_x * d_x * d_y * norm_factor ** 0.5 - - grid /= norm_factor - - return grid, xmin, xmax, ymin, ymax - - return _fast_kde_2d(x, y) - - except ImportError: - x = np.random.randn(10000, 100) - y = np.random.randn(10000, 100) - - def _cov(data): - return np.cov(data) - - def _stack(x, y): - return np.vstack((x, y)) - - def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - y = np.asarray(y, dtype=float) - y = y[np.isfinite(y)] - - xmin, xmax = x.min(), x.max() - ymin, ymax = y.min(), y.max() - - len_x = len(x) - weights = np.ones(len_x) - n_x, n_y = gridsize - - d_x = (xmax - xmin) / (n_x - 1) - d_y = (ymax - ymin) / (n_y - 1) - - xyi = _stack(x, y).T - xyi -= [xmin, ymin] - xyi /= [d_x, d_y] - xyi = np.floor(xyi, xyi).T - - scotts_factor = len_x ** (-1 / 6) - cov = _cov(xyi) - std_devs = np.diag(cov ** 0.5) - kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) - - inv_cov = np.linalg.inv(cov * scotts_factor ** 2) - - x_x = np.arange(kern_nx) - kern_nx / 2 - y_y = np.arange(kern_ny) - kern_ny / 2 - x_x, y_y = np.meshgrid(x_x, y_y) - - kernel = _stack(x_x.flatten(), y_y.flatten()) - kernel = np.dot(inv_cov, kernel) * kernel - kernel = np.exp(-kernel.sum(axis=0) / 2) - kernel = kernel.reshape((int(kern_ny), int(kern_nx))) - - boundary = "wrap" if circular else "symm" - - grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray() - grid = ss.convolve2d(grid, kernel, mode="same", boundary=boundary) - - norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2) - norm_factor = len_x * d_x * d_y * norm_factor ** 0.5 - - grid /= norm_factor - - return grid, xmin, xmax, ymin, ymax - - return _fast_kde_2d(x, y) - - -class Data: - def time_2d_custom(self): - data = np.random.randn(100000) - - def two_de(data): - if not isinstance(data, np.ndarray): - return np.atleast_2d(data) - if data.ndim == 0: - result = data.reshape(1, 1) - elif data.ndim == 1: - result = data[newaxis, :] - else: - result = data - return result - - return two_de(data) - - def time_numpy_2d(self): - data = np.random.randn(100000) - return np.atleast_2d(data) - - def time_1d_custom(self): - x = np.random.randn(100000).tolist() - - def one_de(x): - """Jitting numpy atleast_1d.""" - if not isinstance(x, np.ndarray): - return np.atleast_1d(x) - if x.ndim == 0: - result = x.reshape(1) - else: - result = x - return result - - return one_de(x) - - def time_numpy_1d(self): - x = np.random.randn(100000).tolist() - return np.atleast_1d(x) From a32d31c7c1710b45ee4897a148c4c03b0cd1e3ea Mon Sep 17 00:00:00 2001 From: "Oriol (ZBook)" Date: Mon, 24 Feb 2020 22:37:51 +0100 Subject: [PATCH 02/19] azure update --- .azure-pipelines/azure-pipelines-benchmarks.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/azure-pipelines-benchmarks.yml b/.azure-pipelines/azure-pipelines-benchmarks.yml index fcd8bf3770..76897bc53c 100644 --- a/.azure-pipelines/azure-pipelines-benchmarks.yml +++ b/.azure-pipelines/azure-pipelines-benchmarks.yml @@ -41,11 +41,13 @@ jobs: cat hashestobenchmark.txt python -m asv run HASHFILE:hashestobenchmark.txt displayName: 'Run the benchmarks' + workingDirectory: asv_benchmarks/ - script: | git log --pretty=format:'%h' -n 1 | xargs python -m asv show - git log --pretty=format:'%h' -n 1 | xargs python -m asv show > benchmark_results.txt + git log --pretty=format:'%h' -n 1 | xargs python -m asv show > ../benchmark_results.txt displayName: 'Print and save results' + workingDirectory: asv_benchmarks/ - task: PublishBuildArtifacts@1 inputs: From 4b4cefafcf6c0885619fa1d03e4c01d22f5ff85f Mon Sep 17 00:00:00 2001 From: "Oriol (ZBook)" Date: Mon, 24 Feb 2020 22:40:31 +0100 Subject: [PATCH 03/19] fix typo --- {asv_benckmarks => asv_benchmarks}/asv.conf.json | 0 {asv_benckmarks => asv_benchmarks}/benchmarks/__init__.py | 0 {asv_benckmarks => asv_benchmarks}/benchmarks/benchmarks.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {asv_benckmarks => asv_benchmarks}/asv.conf.json (100%) rename {asv_benckmarks => asv_benchmarks}/benchmarks/__init__.py (100%) rename {asv_benckmarks => asv_benchmarks}/benchmarks/benchmarks.py (100%) diff --git a/asv_benckmarks/asv.conf.json b/asv_benchmarks/asv.conf.json similarity index 100% rename from asv_benckmarks/asv.conf.json rename to asv_benchmarks/asv.conf.json diff --git a/asv_benckmarks/benchmarks/__init__.py b/asv_benchmarks/benchmarks/__init__.py similarity index 100% rename from asv_benckmarks/benchmarks/__init__.py rename to asv_benchmarks/benchmarks/__init__.py diff --git a/asv_benckmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py similarity index 100% rename from asv_benckmarks/benchmarks/benchmarks.py rename to asv_benchmarks/benchmarks/benchmarks.py From 90bbe4376f4768e928ba4d610f81e16b9b829132 Mon Sep 17 00:00:00 2001 From: "Oriol (ZBook)" Date: Tue, 25 Feb 2020 01:00:36 +0100 Subject: [PATCH 04/19] git use jitted histogram again in fast_kde --- arviz/plots/plot_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 0c167ef690..74052e46e8 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -18,6 +18,7 @@ from ..utils import conditional_jit, _stack from ..rcparams import rcParams +from ..stats.stats_utils import histogram as _histogram KwargSpec = Dict[str, Any] @@ -793,7 +794,7 @@ def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): warnings.warn("kde plot failed, you may want to check your data") return np.array([np.nan]), np.nan, np.nan - hist, bin_edges = np.histogram(x, bins=n_bins, range=(xmin, xmax)) + hist, bin_edges = _histogram(x, bins=n_bins, range=(xmin, xmax)) grid = hist / (hist.sum() * np.diff(bin_edges)) # _, grid, _ = histogram(x, n_bins, range_hist=(xmin, xmax)) From d094b1ac1285e9ce2d3feefff4b8e1edb0e26ccc Mon Sep 17 00:00:00 2001 From: "Oriol (ZBook)" Date: Tue, 25 Feb 2020 01:01:11 +0100 Subject: [PATCH 05/19] continue benchmark rewrite --- asv_benchmarks/asv.conf.json | 5 +- asv_benchmarks/benchmarks/benchmarks.py | 117 +++++++++--------------- 2 files changed, 43 insertions(+), 79 deletions(-) diff --git a/asv_benchmarks/asv.conf.json b/asv_benchmarks/asv.conf.json index 35eac476c4..f7ea9552fa 100644 --- a/asv_benchmarks/asv.conf.json +++ b/asv_benchmarks/asv.conf.json @@ -63,9 +63,8 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). // - "matrix": { // test with and without six installed - "numba": ["", null], - // "numpy":[""], // emcee is only available for install with pip. + "matrix": { + "numba": [], }, // Combinations of libraries/python versions can be excluded/included diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index 77330a339d..a50fddb33f 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -7,8 +7,9 @@ from scipy.sparse import coo_matrix import scipy.signal as ss import warnings +import arviz as az +from arviz.stats.stats_utils import _circular_standard_deviation from arviz.plots.plot_utils import _fast_kde, _fast_kde_2d -from arviz import Numba class Hist: @@ -62,103 +63,67 @@ def stats_variance_2d(data, ddof=0, axis=1): class CircStd: - def time_circ_std(self): - try: - data = np.random.randn(10000, 1000) - import numba + params = (True, False) + param_names = "Numba", - def _circfunc(samples, high, low): - samples = np.asarray(samples) - if samples.size == 0: - return np.nan, np.nan - return samples, _angle(samples, low, high, np.pi) - - @numba.vectorize(nopython=True) - def _angle(samples, low, high, pi=np.pi): - ang = (samples - low) * 2.0 * pi / (high - low) - return ang - - def _circular_standard_deviation(samples, high=2 * np.pi, low=0, axis=None): - pi = np.pi - samples, ang = _circfunc(samples, high, low) - S = np.sin(ang).mean(axis=axis) - C = np.cos(ang).mean(axis=axis) - R = np.hypot(S, C) - return ((high - low) / 2.0 / pi) * np.sqrt(-2 * np.log(R)) - - return _circular_standard_deviation(data) - except ImportError: - data = np.random.randn(10000, 1000) - return circstd(data) + def setup(self, numba_flag): + self.data = np.random.randn(10000, 1000) + if numba_flag: + self.circstd = _circular_standard_deviation + else: + self.circstd = circstd + + def time_circ_std(self, numba_flag): + self.circstd(self.data) class Fast_Kde_1d: - params = [(True, False), (10**3, 10**5, 10**6)] + params = [(True, False), (10**5, 10**6, 10**7)] param_names = ("Numba", "n") def setup(self, numba_flag, n): - self.x = np.random.randn(n, 10) + self.x = np.random.randn(n//10, 10) def time_fast_kde_normal(self, numba_flag, n): if numba_flag: - Numba.enable_numba() + az.Numba.enable_numba() else: - Numba.disable_numba() + az.Numba.disable_numba() _fast_kde(self.x) class Fast_KDE_2d: - params = [(True, False), (10**3, 10**5)] + params = [(True, False), (10**5, 10**6)] param_names = ("Numba", "n") def setup(self, numba_flag, n): - self.x = np.random.randn(n, 10) - self.y = np.random.randn(n, 10) + self.x = np.random.randn(n//10, 10) + self.y = np.random.randn(n//10, 10) def time_fast_kde_2d(self, numba_flag, n): if numba_flag: - Numba.enable_numba() + az.Numba.enable_numba() else: - Numba.disable_numba() + az.Numba.disable_numba() _fast_kde_2d(self.x, self.y) -class Data: - def time_2d_custom(self): - data = np.random.randn(100000) - - def two_de(data): - if not isinstance(data, np.ndarray): - return np.atleast_2d(data) - if data.ndim == 0: - result = data.reshape(1, 1) - elif data.ndim == 1: - result = data[newaxis, :] - else: - result = data - return result - - return two_de(data) - - def time_numpy_2d(self): - data = np.random.randn(100000) - return np.atleast_2d(data) - - def time_1d_custom(self): - x = np.random.randn(100000).tolist() - - def one_de(x): - """Jitting numpy atleast_1d.""" - if not isinstance(x, np.ndarray): - return np.atleast_1d(x) - if x.ndim == 0: - result = x.reshape(1) - else: - result = x - return result - - return one_de(x) - - def time_numpy_1d(self): - x = np.random.randn(100000).tolist() - return np.atleast_1d(x) +class Atleast_Nd: + params = ("az.utils", "numpy") + param_names = ("source",) + + def setup(self, source): + self.data = np.random.randn(100000) + self.x = np.random.randn(100000).tolist() + if source == "az.utils": + self.atleast_2d = az.utils.two_de + self.atleast_1d = az.utils.one_de + else: + self.atleast_2d = np.atleast_2d + self.atleast_1d = np.atleast_1d + + def time_atleast_2d_array(self, source): + self.atleast_2d(self.data) + + def time_atleast_1d(self, source): + self.atleast_1d(self.x) From 446d9364a5beaa9cf7f54dd24f07bdef8415961b Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sat, 4 Apr 2020 11:49:25 +0530 Subject: [PATCH 06/19] move plot_kde and get_bins --- arviz/__init__.py | 2 +- arviz/plots/__init__.py | 3 +- arviz/plots/backends/bokeh/densityplot.py | 4 +- arviz/plots/backends/bokeh/distplot.py | 2 +- arviz/plots/backends/bokeh/forestplot.py | 4 +- arviz/plots/backends/bokeh/posteriorplot.py | 2 +- arviz/plots/backends/bokeh/ppcplot.py | 7 +- arviz/plots/backends/bokeh/violinplot.py | 4 +- .../plots/backends/matplotlib/densityplot.py | 3 +- arviz/plots/backends/matplotlib/distplot.py | 2 +- arviz/plots/backends/matplotlib/forestplot.py | 4 +- arviz/plots/backends/matplotlib/loopitplot.py | 2 +- .../backends/matplotlib/posteriorplot.py | 2 +- arviz/plots/backends/matplotlib/ppcplot.py | 4 +- arviz/plots/backends/matplotlib/traceplot.py | 5 + arviz/plots/backends/matplotlib/violinplot.py | 4 +- arviz/plots/distplot.py | 3 +- arviz/plots/kdeplot.py | 3 +- arviz/plots/loopitplot.py | 2 +- arviz/plots/plot_utils.py | 46 +------ arviz/stats/stats.py | 5 + arviz/stats/stats_utils.py | 115 ++++++++++++++++++ arviz/tests/base_tests/test_plot_utils.py | 8 ++ .../tests/base_tests/test_plots_matplotlib.py | 3 +- 24 files changed, 164 insertions(+), 75 deletions(-) diff --git a/arviz/__init__.py b/arviz/__init__.py index e18344a145..40f46cb917 100644 --- a/arviz/__init__.py +++ b/arviz/__init__.py @@ -25,8 +25,8 @@ from .rcparams import rcParams, rc_context from .data import * -from .plots import * from .stats import * +from .plots import * from .utils import Numba, interactive_backend from .plots import backends from .wrappers import * diff --git a/arviz/plots/__init__.py b/arviz/plots/__init__.py index ae417351c7..392cf72b02 100644 --- a/arviz/plots/__init__.py +++ b/arviz/plots/__init__.py @@ -20,7 +20,7 @@ from .rankplot import plot_rank from .traceplot import plot_trace from .violinplot import plot_violin -from .plot_utils import _fast_kde, _fast_kde_2d +from .plot_utils import _fast_kde_2d __all__ = [ @@ -35,7 +35,6 @@ "plot_hpd", "plot_joint", "plot_kde", - "_fast_kde", "_fast_kde_2d", "plot_khat", "plot_loo_pit", diff --git a/arviz/plots/backends/bokeh/densityplot.py b/arviz/plots/backends/bokeh/densityplot.py index 3bc83a76ea..868c02adb1 100644 --- a/arviz/plots/backends/bokeh/densityplot.py +++ b/arviz/plots/backends/bokeh/densityplot.py @@ -10,11 +10,9 @@ make_label, _create_axes_grid, calculate_point_estimate, - _fast_kde, - get_bins, ) from ....stats import hpd -from ....stats.stats_utils import histogram +from ....stats.stats_utils import histogram, _fast_kde, get_bins def plot_density( diff --git a/arviz/plots/backends/bokeh/distplot.py b/arviz/plots/backends/bokeh/distplot.py index 987c616169..b02f09a44d 100644 --- a/arviz/plots/backends/bokeh/distplot.py +++ b/arviz/plots/backends/bokeh/distplot.py @@ -6,7 +6,7 @@ from . import backend_kwarg_defaults from .. import show_layout from ...kdeplot import plot_kde -from ...plot_utils import get_bins +from ....stats.stats_utils import get_bins def plot_dist( diff --git a/arviz/plots/backends/bokeh/forestplot.py b/arviz/plots/backends/bokeh/forestplot.py index 0d19ef8c67..c0be489158 100644 --- a/arviz/plots/backends/bokeh/forestplot.py +++ b/arviz/plots/backends/bokeh/forestplot.py @@ -12,11 +12,11 @@ from . import backend_kwarg_defaults from .. import show_layout -from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label, get_bins, _fast_kde +from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label from ....rcparams import rcParams from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram +from ....stats.stats_utils import histogram, _fast_kde, get_bins from ....utils import conditional_jit diff --git a/arviz/plots/backends/bokeh/posteriorplot.py b/arviz/plots/backends/bokeh/posteriorplot.py index 3560aae88a..984f50b851 100644 --- a/arviz/plots/backends/bokeh/posteriorplot.py +++ b/arviz/plots/backends/bokeh/posteriorplot.py @@ -14,9 +14,9 @@ format_sig_figs, round_num, calculate_point_estimate, - get_bins, ) from ....stats import hpd +from ....stats.stats_utils import get_bins def plot_posterior( diff --git a/arviz/plots/backends/bokeh/ppcplot.py b/arviz/plots/backends/bokeh/ppcplot.py index e2f544a625..4160ed47d1 100644 --- a/arviz/plots/backends/bokeh/ppcplot.py +++ b/arviz/plots/backends/bokeh/ppcplot.py @@ -4,11 +4,8 @@ from . import backend_kwarg_defaults from .. import show_layout from ...kdeplot import plot_kde, _fast_kde -from ...plot_utils import ( - _create_axes_grid, - get_bins, -) -from ....stats.stats_utils import histogram +from ...plot_utils import _create_axes_grid +from ....stats.stats_utils import histogram, get_bins def plot_ppc( diff --git a/arviz/plots/backends/bokeh/violinplot.py b/arviz/plots/backends/bokeh/violinplot.py index fc0264be7a..ac53f73c1f 100644 --- a/arviz/plots/backends/bokeh/violinplot.py +++ b/arviz/plots/backends/bokeh/violinplot.py @@ -5,9 +5,9 @@ from . import backend_kwarg_defaults from .. import show_layout from ...kdeplot import _fast_kde -from ...plot_utils import get_bins, make_label, _create_axes_grid +from ...plot_utils import make_label, _create_axes_grid from ....stats import hpd -from ....stats.stats_utils import histogram +from ....stats.stats_utils import histogram, get_bins def plot_violin( diff --git a/arviz/plots/backends/matplotlib/densityplot.py b/arviz/plots/backends/matplotlib/densityplot.py index 4d52b2b9c3..2acd3e0ab9 100644 --- a/arviz/plots/backends/matplotlib/densityplot.py +++ b/arviz/plots/backends/matplotlib/densityplot.py @@ -8,9 +8,8 @@ make_label, _create_axes_grid, calculate_point_estimate, - _fast_kde, - get_bins, ) +from ....stats.stats_utils import _fast_kde, get_bins def plot_density( diff --git a/arviz/plots/backends/matplotlib/distplot.py b/arviz/plots/backends/matplotlib/distplot.py index 193bcc98f5..c719c0db44 100644 --- a/arviz/plots/backends/matplotlib/distplot.py +++ b/arviz/plots/backends/matplotlib/distplot.py @@ -5,7 +5,7 @@ from . import backend_show from ...kdeplot import plot_kde from ...plot_utils import matplotlib_kwarg_dealiaser -from ...plot_utils import get_bins +from ....stats.stats_utils import get_bins def plot_dist( diff --git a/arviz/plots/backends/matplotlib/forestplot.py b/arviz/plots/backends/matplotlib/forestplot.py index d68671dcf8..3364ef8dab 100644 --- a/arviz/plots/backends/matplotlib/forestplot.py +++ b/arviz/plots/backends/matplotlib/forestplot.py @@ -9,8 +9,8 @@ from . import backend_kwarg_defaults, backend_show from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram -from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label, get_bins, _fast_kde +from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label from ....utils import conditional_jit diff --git a/arviz/plots/backends/matplotlib/loopitplot.py b/arviz/plots/backends/matplotlib/loopitplot.py index e375a7a4c7..efbd799822 100644 --- a/arviz/plots/backends/matplotlib/loopitplot.py +++ b/arviz/plots/backends/matplotlib/loopitplot.py @@ -3,7 +3,7 @@ import numpy as np from . import backend_kwarg_defaults, backend_show -from ...plot_utils import _fast_kde +from ....stats.stats_utils import _fast_kde from ...hpdplot import plot_hpd diff --git a/arviz/plots/backends/matplotlib/posteriorplot.py b/arviz/plots/backends/matplotlib/posteriorplot.py index b15a7dc5e2..6ea8f3d932 100644 --- a/arviz/plots/backends/matplotlib/posteriorplot.py +++ b/arviz/plots/backends/matplotlib/posteriorplot.py @@ -13,8 +13,8 @@ format_sig_figs, round_num, calculate_point_estimate, - get_bins, ) +from ....stats.stats_utils import get_bins def plot_posterior( diff --git a/arviz/plots/backends/matplotlib/ppcplot.py b/arviz/plots/backends/matplotlib/ppcplot.py index 57827e8d55..58ea39e098 100644 --- a/arviz/plots/backends/matplotlib/ppcplot.py +++ b/arviz/plots/backends/matplotlib/ppcplot.py @@ -8,10 +8,8 @@ from ...plot_utils import ( make_label, _create_axes_grid, - get_bins, - _fast_kde, ) -from ....stats.stats_utils import histogram +from ....stats.stats_utils import histogram, _fast_kde, get_bins def plot_ppc( diff --git a/arviz/plots/backends/matplotlib/traceplot.py b/arviz/plots/backends/matplotlib/traceplot.py index fbd51316e0..7dd33e0429 100644 --- a/arviz/plots/backends/matplotlib/traceplot.py +++ b/arviz/plots/backends/matplotlib/traceplot.py @@ -8,8 +8,13 @@ from . import backend_kwarg_defaults, backend_show from ...distplot import plot_dist +<<<<<<< HEAD from ...rankplot import plot_rank from ...plot_utils import _scale_fig_size, get_bins, make_label, format_coords_as_labels +======= +from ...plot_utils import _scale_fig_size, make_label, format_coords_as_labels +from ....stats.stats_utils import get_bins +>>>>>>> move plot_kde and get_bins def plot_trace( diff --git a/arviz/plots/backends/matplotlib/violinplot.py b/arviz/plots/backends/matplotlib/violinplot.py index 9c533103c6..8080e8d56b 100644 --- a/arviz/plots/backends/matplotlib/violinplot.py +++ b/arviz/plots/backends/matplotlib/violinplot.py @@ -4,8 +4,8 @@ from . import backend_show from ....stats import hpd -from ....stats.stats_utils import histogram -from ...plot_utils import get_bins, make_label, _create_axes_grid, _fast_kde +from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ...plot_utils import make_label, _create_axes_grid def plot_violin( diff --git a/arviz/plots/distplot.py b/arviz/plots/distplot.py index af7aefc985..2d2a72cbaf 100644 --- a/arviz/plots/distplot.py +++ b/arviz/plots/distplot.py @@ -2,7 +2,8 @@ """Plot distribution as histogram or kernel density estimates.""" import xarray as xr -from .plot_utils import get_bins, get_plotting_function, matplotlib_kwarg_dealiaser +from .plot_utils import get_plotting_function, matplotlib_kwarg_dealiaser +from ..stats.stats_utils import get_bins from ..data import InferenceData from ..rcparams import rcParams diff --git a/arviz/plots/kdeplot.py b/arviz/plots/kdeplot.py index 4837c7d076..2b3ea92bed 100644 --- a/arviz/plots/kdeplot.py +++ b/arviz/plots/kdeplot.py @@ -3,7 +3,8 @@ import xarray as xr from ..data import InferenceData -from .plot_utils import get_plotting_function, _fast_kde, _fast_kde_2d +from ..stats.stats_utils import _fast_kde +from .plot_utils import get_plotting_function, _fast_kde_2d from ..rcparams import rcParams diff --git a/arviz/plots/loopitplot.py b/arviz/plots/loopitplot.py index 40cf225fca..05744bf0ad 100644 --- a/arviz/plots/loopitplot.py +++ b/arviz/plots/loopitplot.py @@ -5,10 +5,10 @@ from xarray import DataArray from ..stats import loo_pit as _loo_pit +from ..stats.stats_utils import _fast_kde from .plot_utils import ( _scale_fig_size, get_plotting_function, - _fast_kde, matplotlib_kwarg_dealiaser, ) from ..rcparams import rcParams diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 74052e46e8..30836e4f93 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -17,6 +17,7 @@ from ..utils import conditional_jit, _stack +from ..stats.stats_utils import _fast_kde from ..rcparams import rcParams from ..stats.stats_utils import histogram as _histogram @@ -100,48 +101,6 @@ def _scale_fig_size(figsize, textsize, rows=1, cols=1): return (width, height), ax_labelsize, titlesize, xt_labelsize, linewidth, markersize -def get_bins(values): - """ - Automatically compute the number of bins for discrete variables. - - Parameters - ---------- - values = numpy array - values - - Returns - ------- - array with the bins - - Notes - ----- - Computes the width of the bins by taking the maximun of the Sturges and the Freedman-Diaconis - estimators. Acording to numpy `np.histogram` this provides good all around performance. - - The Sturges is a very simplistic estimator based on the assumption of normality of the data. - This estimator has poor performance for non-normal data, which becomes especially obvious for - large data sets. The estimate depends only on size of the data. - - The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth. - It is considered a robusts version of the Scott rule as the IQR is less affected by outliers - than the standard deviation. However, the IQR depends on fewer points than the standard - deviation, so it is less accurate, especially for long tailed distributions. - """ - x_min = values.min().astype(int) - x_max = values.max().astype(int) - - # Sturges histogram bin estimator - bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1) - - # The Freedman-Diaconis histogram bin estimator. - iqr = np.subtract(*np.percentile(values, [75, 25])) # pylint: disable=assignment-from-no-return - bins_fd = 2 * iqr * values.size ** (-1 / 3) - - width = round(np.max([1, bins_sturges, bins_fd])).astype(int) - - return np.arange(x_min, x_max + width + 1, width) - - def _sturges_formula(dataset, mult=1): """Use Sturges' formula to determine number of bins. @@ -745,6 +704,7 @@ def calculate_point_estimate(point_estimate, values, bw=4.5): return point_value +<<<<<<< HEAD def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): """Fast Fourier transform-based Gaussian kernel density estimate (KDE). @@ -816,6 +776,8 @@ def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): return density, xmin, xmax +======= +>>>>>>> move plot_kde and get_bins def _cov_1d(x): x = x - x.mean(axis=0) ddof = x.shape[0] - 1 diff --git a/arviz/stats/stats.py b/arviz/stats/stats.py index 7ebfe0e6f8..c943133669 100644 --- a/arviz/stats/stats.py +++ b/arviz/stats/stats.py @@ -12,7 +12,10 @@ from scipy.optimize import minimize import xarray as xr +<<<<<<< HEAD from ..plots.plot_utils import _fast_kde, get_bins, get_coords +======= +>>>>>>> move plot_kde and get_bins from ..data import convert_to_inference_data, convert_to_dataset, InferenceData, CoordSpec, DimSpec from .diagnostics import _multichain_statistics, _mc_error, ess from .stats_utils import ( @@ -24,6 +27,8 @@ histogram, _circular_standard_deviation, get_log_likelihood as _get_log_likelihood, + _fast_kde, + get_bins, ) from ..utils import _var_names, Numba, _numba_var from ..rcparams import rcParams diff --git a/arviz/stats/stats_utils.py b/arviz/stats/stats_utils.py index e26af7b3cb..4949e9644a 100644 --- a/arviz/stats/stats_utils.py +++ b/arviz/stats/stats_utils.py @@ -7,6 +7,8 @@ import numpy as np import pandas as pd from scipy.fftpack import next_fast_len +from scipy.signal import convolve +from scipy.signal.windows import gaussian from scipy.stats.mstats import mquantiles from xarray import apply_ufunc from ..utils import conditional_jit, conditional_vect @@ -572,3 +574,116 @@ def _circular_standard_deviation(samples, high=2 * np.pi, low=0, skipna=False, a c_c = np.cos(ang).mean(axis=axis) r_r = np.hypot(s_s, c_c) return ((high - low) / 2.0 / np.pi) * np.sqrt(-2 * np.log(r_r)) + + +def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): + """Fast Fourier transform-based Gaussian kernel density estimate (KDE). + + The code was adapted from https://github.com/mfouesneau/faststats + + Parameters + ---------- + x : Numpy array or list + cumulative : bool + If true, estimate the cdf instead of the pdf + bw : float + Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the + smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule + of thumb (the default rule used by SciPy). + xmin : float + Manually set lower limit. + xmax : float + Manually set upper limit. + + Returns + ------- + density: A gridded 1D KDE of the input points (x) + xmin: minimum value of x + xmax: maximum value of x + """ + x = np.asarray(x, dtype=float) + x = x[np.isfinite(x)] + if x.size == 0: + warnings.warn("kde plot failed, you may want to check your data") + return np.array([np.nan]), np.nan, np.nan + + len_x = len(x) + n_points = 200 if (xmin or xmax) is None else 500 + + if xmin is None: + xmin = np.min(x) + if xmax is None: + xmax = np.max(x) + + assert np.min(x) >= xmin + assert np.max(x) <= xmax + + log_len_x = np.log(len_x) * bw + + n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) + if n_bins < 2: + warnings.warn("kde plot failed, you may want to check your data") + return np.array([np.nan]), np.nan, np.nan + + # hist, bin_edges = np.histogram(x, bins=n_bins, range=(xmin, xmax)) + # grid = hist / (hist.sum() * np.diff(bin_edges)) + + _, grid, _ = histogram(x, n_bins, range_hist=(xmin, xmax)) + + scotts_factor = len_x ** (-0.2) + kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) + kernel = gaussian(kern_nx, scotts_factor * log_len_x) + + npad = min(n_bins, 2 * kern_nx) + grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) + density = convolve(grid, kernel, mode="same", method="direct")[npad : npad + n_bins] + norm_factor = (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 + + density /= norm_factor + + if cumulative: + density = density.cumsum() / density.sum() + + return density, xmin, xmax + + +def get_bins(values): + """ + Automatically compute the number of bins for discrete variables. + + Parameters + ---------- + values = numpy array + values + + Returns + ------- + array with the bins + + Notes + ----- + Computes the width of the bins by taking the maximun of the Sturges and the Freedman-Diaconis + estimators. Acording to numpy `np.histogram` this provides good all around performance. + + The Sturges is a very simplistic estimator based on the assumption of normality of the data. + This estimator has poor performance for non-normal data, which becomes especially obvious for + large data sets. The estimate depends only on size of the data. + + The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth. + It is considered a robusts version of the Scott rule as the IQR is less affected by outliers + than the standard deviation. However, the IQR depends on fewer points than the standard + deviation, so it is less accurate, especially for long tailed distributions. + """ + x_min = values.min().astype(int) + x_max = values.max().astype(int) + + # Sturges histogram bin estimator + bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1) + + # The Freedman-Diaconis histogram bin estimator. + iqr = np.subtract(*np.percentile(values, [75, 25])) # pylint: disable=assignment-from-no-return + bins_fd = 2 * iqr * values.size ** (-1 / 3) + + width = round(np.max([1, bins_sturges, bins_fd])).astype(int) + + return np.arange(x_min, x_max + width + 1, width) diff --git a/arviz/tests/base_tests/test_plot_utils.py b/arviz/tests/base_tests/test_plot_utils.py index 9250f84ae5..49d895c020 100644 --- a/arviz/tests/base_tests/test_plot_utils.py +++ b/arviz/tests/base_tests/test_plot_utils.py @@ -8,6 +8,13 @@ from ...data import from_dict from ..helpers import running_on_ci from ...plots.plot_utils import ( +<<<<<<< HEAD +======= + make_2d, + xarray_to_ndarray, + xarray_var_iter, + get_coords, +>>>>>>> move plot_kde and get_bins filter_plotters_list, format_sig_figs, get_bins, @@ -19,6 +26,7 @@ xarray_var_iter, ) from ...rcparams import rc_context +from ...stats.stats_utils import get_bins @pytest.mark.parametrize( diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index 60facbe797..e0fb8d3f14 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -41,7 +41,8 @@ plot_loo_pit, plot_mcse, ) -from ...plots.plot_utils import _fast_kde, _cov +from ...plots.plot_utils import _cov +from ...stats.stats_utils import _fast_kde rcParams["data.load"] = "eager" From 0f01cf9dfb5d2286a06cf41f31470578aff96642 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sat, 4 Apr 2020 16:35:38 +0530 Subject: [PATCH 07/19] add kde_utils --- arviz/kde_utils.py | 151 +++++++++++++++ arviz/plots/__init__.py | 2 +- arviz/plots/backends/bokeh/densityplot.py | 3 +- arviz/plots/backends/bokeh/forestplot.py | 3 +- .../plots/backends/matplotlib/densityplot.py | 3 +- arviz/plots/backends/matplotlib/forestplot.py | 3 +- arviz/plots/backends/matplotlib/loopitplot.py | 2 +- arviz/plots/backends/matplotlib/ppcplot.py | 3 +- arviz/plots/backends/matplotlib/violinplot.py | 3 +- arviz/plots/kdeplot.py | 4 +- arviz/plots/loopitplot.py | 2 +- arviz/plots/plot_utils.py | 181 +----------------- arviz/stats/stats.py | 2 +- arviz/stats/stats_utils.py | 71 ------- .../tests/base_tests/test_plots_matplotlib.py | 4 +- arviz/utils.py | 31 +++ asv_benchmarks/benchmarks/benchmarks.py | 2 +- 17 files changed, 204 insertions(+), 266 deletions(-) create mode 100644 arviz/kde_utils.py diff --git a/arviz/kde_utils.py b/arviz/kde_utils.py new file mode 100644 index 0000000000..e21708ffa5 --- /dev/null +++ b/arviz/kde_utils.py @@ -0,0 +1,151 @@ +"""KDE functions for ArviZ""" +import warnings +import numpy as np +from scipy.signal import convolve, convolve2d +from scipy.signal.windows import gaussian +from scipy.sparse import coo_matrix + +from .stats.stats_utils import histogram +from .utils import _stack, _dot, _cov + + +def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): + """Fast Fourier transform-based Gaussian kernel density estimate (KDE). + + The code was adapted from https://github.com/mfouesneau/faststats + + Parameters + ---------- + x : Numpy array or list + cumulative : bool + If true, estimate the cdf instead of the pdf + bw : float + Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the + smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule + of thumb (the default rule used by SciPy). + xmin : float + Manually set lower limit. + xmax : float + Manually set upper limit. + + Returns + ------- + density: A gridded 1D KDE of the input points (x) + xmin: minimum value of x + xmax: maximum value of x + """ + x = np.asarray(x, dtype=float) + x = x[np.isfinite(x)] + if x.size == 0: + warnings.warn("kde plot failed, you may want to check your data") + return np.array([np.nan]), np.nan, np.nan + + len_x = len(x) + n_points = 200 if (xmin or xmax) is None else 500 + + if xmin is None: + xmin = np.min(x) + if xmax is None: + xmax = np.max(x) + + assert np.min(x) >= xmin + assert np.max(x) <= xmax + + log_len_x = np.log(len_x) * bw + + n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) + if n_bins < 2: + warnings.warn("kde plot failed, you may want to check your data") + return np.array([np.nan]), np.nan, np.nan + + # hist, bin_edges = np.histogram(x, bins=n_bins, range=(xmin, xmax)) + # grid = hist / (hist.sum() * np.diff(bin_edges)) + + _, grid, _ = histogram(x, n_bins, range_hist=(xmin, xmax)) + + scotts_factor = len_x ** (-0.2) + kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) + kernel = gaussian(kern_nx, scotts_factor * log_len_x) + + npad = min(n_bins, 2 * kern_nx) + grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) + density = convolve(grid, kernel, mode="same", method="direct")[npad : npad + n_bins] + norm_factor = (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 + + density /= norm_factor + + if cumulative: + density = density.cumsum() / density.sum() + + return density, xmin, xmax + + +def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): + """ + 2D fft-based Gaussian kernel density estimate (KDE). + + The code was adapted from https://github.com/mfouesneau/faststats + + Parameters + ---------- + x : Numpy array or list + y : Numpy array or list + gridsize : tuple + Number of points used to discretize data. Use powers of 2 for fft optimization + circular: bool + If True, use circular boundaries. Defaults to False + Returns + ------- + grid: A gridded 2D KDE of the input points (x, y) + xmin: minimum value of x + xmax: maximum value of x + ymin: minimum value of y + ymax: maximum value of y + """ + x = np.asarray(x, dtype=float) + x = x[np.isfinite(x)] + y = np.asarray(y, dtype=float) + y = y[np.isfinite(y)] + + xmin, xmax = x.min(), x.max() + ymin, ymax = y.min(), y.max() + + len_x = len(x) + weights = np.ones(len_x) + n_x, n_y = gridsize + + d_x = (xmax - xmin) / (n_x - 1) + d_y = (ymax - ymin) / (n_y - 1) + + xyi = _stack(x, y).T + xyi -= [xmin, ymin] + xyi /= [d_x, d_y] + xyi = np.floor(xyi, xyi).T + + scotts_factor = len_x ** (-1 / 6) + cov = _cov(xyi) + std_devs = np.diag(cov ** 0.5) + kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) + + inv_cov = np.linalg.inv(cov * scotts_factor ** 2) + + x_x = np.arange(kern_nx) - kern_nx / 2 + y_y = np.arange(kern_ny) - kern_ny / 2 + x_x, y_y = np.meshgrid(x_x, y_y) + + kernel = _stack(x_x.flatten(), y_y.flatten()) + kernel = _dot(inv_cov, kernel) * kernel + kernel = np.exp(-kernel.sum(axis=0) / 2) + kernel = kernel.reshape((int(kern_ny), int(kern_nx))) + + boundary = "wrap" if circular else "symm" + + grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray() + grid = convolve2d(grid, kernel, mode="same", boundary=boundary) + + norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2) + norm_factor = len_x * d_x * d_y * norm_factor ** 0.5 + + grid /= norm_factor + + return grid, xmin, xmax, ymin, ymax diff --git a/arviz/plots/__init__.py b/arviz/plots/__init__.py index 392cf72b02..ef2c478307 100644 --- a/arviz/plots/__init__.py +++ b/arviz/plots/__init__.py @@ -20,7 +20,7 @@ from .rankplot import plot_rank from .traceplot import plot_trace from .violinplot import plot_violin -from .plot_utils import _fast_kde_2d +from ..kde_utils import _fast_kde_2d __all__ = [ diff --git a/arviz/plots/backends/bokeh/densityplot.py b/arviz/plots/backends/bokeh/densityplot.py index 868c02adb1..a42038a6d4 100644 --- a/arviz/plots/backends/bokeh/densityplot.py +++ b/arviz/plots/backends/bokeh/densityplot.py @@ -12,7 +12,8 @@ calculate_point_estimate, ) from ....stats import hpd -from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ....stats.stats_utils import histogram, get_bins +from ....kde_utils import _fast_kde def plot_density( diff --git a/arviz/plots/backends/bokeh/forestplot.py b/arviz/plots/backends/bokeh/forestplot.py index c0be489158..ac058a60b5 100644 --- a/arviz/plots/backends/bokeh/forestplot.py +++ b/arviz/plots/backends/bokeh/forestplot.py @@ -16,7 +16,8 @@ from ....rcparams import rcParams from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ....stats.stats_utils import histogram, get_bins +from ....kde_utils import _fast_kde from ....utils import conditional_jit diff --git a/arviz/plots/backends/matplotlib/densityplot.py b/arviz/plots/backends/matplotlib/densityplot.py index 2acd3e0ab9..7f914daa14 100644 --- a/arviz/plots/backends/matplotlib/densityplot.py +++ b/arviz/plots/backends/matplotlib/densityplot.py @@ -9,7 +9,8 @@ _create_axes_grid, calculate_point_estimate, ) -from ....stats.stats_utils import _fast_kde, get_bins +from ....stats.stats_utils import get_bins +from ....kde_utils import _fast_kde def plot_density( diff --git a/arviz/plots/backends/matplotlib/forestplot.py b/arviz/plots/backends/matplotlib/forestplot.py index 3364ef8dab..8b4413c23f 100644 --- a/arviz/plots/backends/matplotlib/forestplot.py +++ b/arviz/plots/backends/matplotlib/forestplot.py @@ -9,7 +9,8 @@ from . import backend_kwarg_defaults, backend_show from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ....stats.stats_utils import histogram, get_bins +from ....kde_utils import _fast_kde from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label from ....utils import conditional_jit diff --git a/arviz/plots/backends/matplotlib/loopitplot.py b/arviz/plots/backends/matplotlib/loopitplot.py index efbd799822..8f7b59f796 100644 --- a/arviz/plots/backends/matplotlib/loopitplot.py +++ b/arviz/plots/backends/matplotlib/loopitplot.py @@ -3,7 +3,7 @@ import numpy as np from . import backend_kwarg_defaults, backend_show -from ....stats.stats_utils import _fast_kde +from ....kde_utils import _fast_kde from ...hpdplot import plot_hpd diff --git a/arviz/plots/backends/matplotlib/ppcplot.py b/arviz/plots/backends/matplotlib/ppcplot.py index 58ea39e098..2420a22bd6 100644 --- a/arviz/plots/backends/matplotlib/ppcplot.py +++ b/arviz/plots/backends/matplotlib/ppcplot.py @@ -9,7 +9,8 @@ make_label, _create_axes_grid, ) -from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ....stats.stats_utils import histogram, get_bins +from ....kde_utils import _fast_kde def plot_ppc( diff --git a/arviz/plots/backends/matplotlib/violinplot.py b/arviz/plots/backends/matplotlib/violinplot.py index 8080e8d56b..0fb4107584 100644 --- a/arviz/plots/backends/matplotlib/violinplot.py +++ b/arviz/plots/backends/matplotlib/violinplot.py @@ -4,7 +4,8 @@ from . import backend_show from ....stats import hpd -from ....stats.stats_utils import histogram, _fast_kde, get_bins +from ....stats.stats_utils import histogram, get_bins +from ....kde_utils import _fast_kde from ...plot_utils import make_label, _create_axes_grid diff --git a/arviz/plots/kdeplot.py b/arviz/plots/kdeplot.py index 2b3ea92bed..0ece8dde55 100644 --- a/arviz/plots/kdeplot.py +++ b/arviz/plots/kdeplot.py @@ -3,8 +3,8 @@ import xarray as xr from ..data import InferenceData -from ..stats.stats_utils import _fast_kde -from .plot_utils import get_plotting_function, _fast_kde_2d +from ..kde_utils import _fast_kde, _fast_kde_2d +from .plot_utils import get_plotting_function from ..rcparams import rcParams diff --git a/arviz/plots/loopitplot.py b/arviz/plots/loopitplot.py index 05744bf0ad..43e456d25d 100644 --- a/arviz/plots/loopitplot.py +++ b/arviz/plots/loopitplot.py @@ -5,7 +5,7 @@ from xarray import DataArray from ..stats import loo_pit as _loo_pit -from ..stats.stats_utils import _fast_kde +from ..kde_utils import _fast_kde from .plot_utils import ( _scale_fig_size, get_plotting_function, diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 30836e4f93..ef98e29151 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -15,11 +15,8 @@ import matplotlib.cbook as cbook import xarray as xr - -from ..utils import conditional_jit, _stack -from ..stats.stats_utils import _fast_kde +from ..kde_utils import _fast_kde from ..rcparams import rcParams -from ..stats.stats_utils import histogram as _histogram KwargSpec = Dict[str, Any] @@ -704,182 +701,6 @@ def calculate_point_estimate(point_estimate, values, bw=4.5): return point_value -<<<<<<< HEAD -def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): - """Fast Fourier transform-based Gaussian kernel density estimate (KDE). - - The code was adapted from https://github.com/mfouesneau/faststats - - Parameters - ---------- - x : Numpy array or list - cumulative : bool - If true, estimate the cdf instead of the pdf - bw : float - Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule - of thumb (the default rule used by SciPy). - xmin : float - Manually set lower limit. - xmax : float - Manually set upper limit. - - Returns - ------- - density: A gridded 1D KDE of the input points (x) - xmin: minimum value of x - xmax: maximum value of x - """ - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - if x.size == 0: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - len_x = len(x) - n_points = 200 if (xmin or xmax) is None else 500 - - if xmin is None: - xmin = np.min(x) - if xmax is None: - xmax = np.max(x) - - assert np.min(x) >= xmin - assert np.max(x) <= xmax - - log_len_x = np.log(len_x) * bw - - n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) - if n_bins < 2: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - hist, bin_edges = _histogram(x, bins=n_bins, range=(xmin, xmax)) - grid = hist / (hist.sum() * np.diff(bin_edges)) - - # _, grid, _ = histogram(x, n_bins, range_hist=(xmin, xmax)) - - scotts_factor = len_x ** (-0.2) - kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) - kernel = gaussian(kern_nx, scotts_factor * log_len_x) - - npad = min(n_bins, 2 * kern_nx) - grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) - density = convolve(grid, kernel, mode="same", method="direct")[npad : npad + n_bins] - norm_factor = (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 - - density /= norm_factor - - if cumulative: - density = density.cumsum() / density.sum() - - return density, xmin, xmax - - -======= ->>>>>>> move plot_kde and get_bins -def _cov_1d(x): - x = x - x.mean(axis=0) - ddof = x.shape[0] - 1 - return np.dot(x.T, x.conj()) / ddof - - -def _cov(data): - if data.ndim == 1: - return _cov_1d(data) - elif data.ndim == 2: - x = data.astype(float) - avg, _ = np.average(x, axis=1, weights=None, returned=True) - ddof = x.shape[1] - 1 - if ddof <= 0: - warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning, stacklevel=2) - ddof = 0.0 - x -= avg[:, None] - prod = _dot(x, x.T.conj()) - prod *= np.true_divide(1, ddof) - prod = prod.squeeze() - prod += 1e-6 * np.eye(prod.shape[0]) - return prod - else: - raise ValueError("{} dimension arrays are not supported".format(data.ndim)) - - -@conditional_jit(cache=True) -def _dot(x, y): - return np.dot(x, y) - - -def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): - """ - 2D fft-based Gaussian kernel density estimate (KDE). - - The code was adapted from https://github.com/mfouesneau/faststats - - Parameters - ---------- - x : Numpy array or list - y : Numpy array or list - gridsize : tuple - Number of points used to discretize data. Use powers of 2 for fft optimization - circular: bool - If True, use circular boundaries. Defaults to False - Returns - ------- - grid: A gridded 2D KDE of the input points (x, y) - xmin: minimum value of x - xmax: maximum value of x - ymin: minimum value of y - ymax: maximum value of y - """ - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - y = np.asarray(y, dtype=float) - y = y[np.isfinite(y)] - - xmin, xmax = x.min(), x.max() - ymin, ymax = y.min(), y.max() - - len_x = len(x) - weights = np.ones(len_x) - n_x, n_y = gridsize - - d_x = (xmax - xmin) / (n_x - 1) - d_y = (ymax - ymin) / (n_y - 1) - - xyi = _stack(x, y).T - xyi -= [xmin, ymin] - xyi /= [d_x, d_y] - xyi = np.floor(xyi, xyi).T - - scotts_factor = len_x ** (-1 / 6) - cov = _cov(xyi) - std_devs = np.diag(cov ** 0.5) - kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) - - inv_cov = np.linalg.inv(cov * scotts_factor ** 2) - - x_x = np.arange(kern_nx) - kern_nx / 2 - y_y = np.arange(kern_ny) - kern_ny / 2 - x_x, y_y = np.meshgrid(x_x, y_y) - - kernel = _stack(x_x.flatten(), y_y.flatten()) - kernel = _dot(inv_cov, kernel) * kernel - kernel = np.exp(-kernel.sum(axis=0) / 2) - kernel = kernel.reshape((int(kern_ny), int(kern_nx))) - - boundary = "wrap" if circular else "symm" - - grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray() - grid = convolve2d(grid, kernel, mode="same", boundary=boundary) - - norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2) - norm_factor = len_x * d_x * d_y * norm_factor ** 0.5 - - grid /= norm_factor - - return grid, xmin, xmax, ymin, ymax - - def matplotlib_kwarg_dealiaser(args, kind, backend="matplotlib"): """De-aliase the kwargs passed to plots.""" if args is None: diff --git a/arviz/stats/stats.py b/arviz/stats/stats.py index c943133669..8708613c54 100644 --- a/arviz/stats/stats.py +++ b/arviz/stats/stats.py @@ -27,9 +27,9 @@ histogram, _circular_standard_deviation, get_log_likelihood as _get_log_likelihood, - _fast_kde, get_bins, ) +from ..kde_utils import _fast_kde from ..utils import _var_names, Numba, _numba_var from ..rcparams import rcParams diff --git a/arviz/stats/stats_utils.py b/arviz/stats/stats_utils.py index 4949e9644a..3616eb760d 100644 --- a/arviz/stats/stats_utils.py +++ b/arviz/stats/stats_utils.py @@ -576,77 +576,6 @@ def _circular_standard_deviation(samples, high=2 * np.pi, low=0, skipna=False, a return ((high - low) / 2.0 / np.pi) * np.sqrt(-2 * np.log(r_r)) -def _fast_kde(x, cumulative=False, bw=4.5, xmin=None, xmax=None): - """Fast Fourier transform-based Gaussian kernel density estimate (KDE). - - The code was adapted from https://github.com/mfouesneau/faststats - - Parameters - ---------- - x : Numpy array or list - cumulative : bool - If true, estimate the cdf instead of the pdf - bw : float - Bandwidth scaling factor for the KDE. Should be larger than 0. The higher this number the - smoother the KDE will be. Defaults to 4.5 which is essentially the same as the Scott's rule - of thumb (the default rule used by SciPy). - xmin : float - Manually set lower limit. - xmax : float - Manually set upper limit. - - Returns - ------- - density: A gridded 1D KDE of the input points (x) - xmin: minimum value of x - xmax: maximum value of x - """ - x = np.asarray(x, dtype=float) - x = x[np.isfinite(x)] - if x.size == 0: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - len_x = len(x) - n_points = 200 if (xmin or xmax) is None else 500 - - if xmin is None: - xmin = np.min(x) - if xmax is None: - xmax = np.max(x) - - assert np.min(x) >= xmin - assert np.max(x) <= xmax - - log_len_x = np.log(len_x) * bw - - n_bins = min(int(len_x ** (1 / 3) * log_len_x * 2), n_points) - if n_bins < 2: - warnings.warn("kde plot failed, you may want to check your data") - return np.array([np.nan]), np.nan, np.nan - - # hist, bin_edges = np.histogram(x, bins=n_bins, range=(xmin, xmax)) - # grid = hist / (hist.sum() * np.diff(bin_edges)) - - _, grid, _ = histogram(x, n_bins, range_hist=(xmin, xmax)) - - scotts_factor = len_x ** (-0.2) - kern_nx = int(scotts_factor * 2 * np.pi * log_len_x) - kernel = gaussian(kern_nx, scotts_factor * log_len_x) - - npad = min(n_bins, 2 * kern_nx) - grid = np.concatenate([grid[npad:0:-1], grid, grid[n_bins : n_bins - npad : -1]]) - density = convolve(grid, kernel, mode="same", method="direct")[npad : npad + n_bins] - norm_factor = (2 * np.pi * log_len_x ** 2 * scotts_factor ** 2) ** 0.5 - - density /= norm_factor - - if cumulative: - density = density.cumsum() / density.sum() - - return density, xmin, xmax - - def get_bins(values): """ Automatically compute the number of bins for discrete variables. diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index e0fb8d3f14..cb3fd05618 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -41,8 +41,8 @@ plot_loo_pit, plot_mcse, ) -from ...plots.plot_utils import _cov -from ...stats.stats_utils import _fast_kde +from ...utils import _cov +from ...kde_utils import _fast_kde rcParams["data.load"] = "eager" diff --git a/arviz/utils.py b/arviz/utils.py index ac096a1d19..4673b3e4bf 100644 --- a/arviz/utils.py +++ b/arviz/utils.py @@ -319,6 +319,37 @@ def expand_dims(x): return x.reshape(shape[:0] + (1,) + shape[0:]) +@conditional_jit(cache=True) +def _dot(x, y): + return np.dot(x, y) + + +def _cov_1d(x): + x = x - x.mean(axis=0) + ddof = x.shape[0] - 1 + return np.dot(x.T, x.conj()) / ddof + + +def _cov(data): + if data.ndim == 1: + return _cov_1d(data) + elif data.ndim == 2: + x = data.astype(float) + avg, _ = np.average(x, axis=1, weights=None, returned=True) + ddof = x.shape[1] - 1 + if ddof <= 0: + warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning, stacklevel=2) + ddof = 0.0 + x -= avg[:, None] + prod = _dot(x, x.T.conj()) + prod *= np.true_divide(1, ddof) + prod = prod.squeeze() + prod += 1e-6 * np.eye(prod.shape[0]) + return prod + else: + raise ValueError("{} dimension arrays are not supported".format(data.ndim)) + + @conditional_jit def full(shape, x, dtype=None): """Jitting numpy full.""" diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index a50fddb33f..c66a8a26c1 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -9,7 +9,7 @@ import warnings import arviz as az from arviz.stats.stats_utils import _circular_standard_deviation -from arviz.plots.plot_utils import _fast_kde, _fast_kde_2d +from arviz.kde_utils import _fast_kde, _fast_kde_2d class Hist: From 9bd8fd9d2acaceddcf5d0394ef737d38d31590a7 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 5 Apr 2020 15:39:21 +0530 Subject: [PATCH 08/19] modify benchmarks --- asv_benchmarks/benchmarks/benchmarks.py | 108 ++++++++++++++---------- 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index c66a8a26c1..6d4dcc50a8 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -8,58 +8,77 @@ import scipy.signal as ss import warnings import arviz as az -from arviz.stats.stats_utils import _circular_standard_deviation +from arviz.stats.stats_utils import _circular_standard_deviation, histogram, stats_variance_2d from arviz.kde_utils import _fast_kde, _fast_kde_2d class Hist: - def time_histogram(self): - try: - data = np.random.rand(10000, 1000) - import numba + params = (True, False) + param_names = "Numba" - @numba.njit(cache=True) - def _hist(data): - return np.histogram(data, bins=100) + def setup(self, numba_flag): + self.data = np.random.rand(10000, 1000) + if numba_flag: + az.Numba.enable_numba() + else: + az.Numba.disable_numba() + + def time_histogram(self): + histogram(self.data, bins=100) - return _hist(data) - except ImportError: - data = np.random.rand(10000, 1000) - return np.histogram(data, bins=100) + #try: + # data = np.random.rand(10000, 1000) + # import numba + # @numba.njit(cache=True) + # def _hist(data): + # return np.histogram(data, bins=100) + # return _hist(data) + #except ImportError: + # data = np.random.rand(10000, 1000) + # return np.histogram(data, bins=100) class Variance: + params = (True, False) + param_names = "Numba" + + def setup(self, numba_flag): + self.data = np.random.rand(10000, 10000) + if numba_flag: + az.Numba.enable_numba() + else: + az.Numba.disable_numba() + def time_variance(self): - try: - data = np.random.randn(10000, 10000) - import numba - - @numba.njit(cache=True) - def stats_variance_1d(data, ddof=0): - a, b = 0, 0 - for i in data: - a = a + i - b = b + i * i - var = b / (len(data)) - ((a / (len(data))) ** 2) - var = var * (len(data) / (len(data) - ddof)) - return var - - def stats_variance_2d(data, ddof=0, axis=1): - a, b = data.shape - if axis == 1: - var = np.zeros(a) - for i in range(a): - var[i] = stats_variance_1d(data[i], ddof=ddof) - else: - var = np.zeros(b) - for i in range(b): - var[i] = stats_variance_1d(data[:, i], ddof=ddof) - return var - - return stats_variance_2d(data) - except ImportError: - data = np.random.randn(10000, 10000) - return np.var(data, axis=1) + stats_variance_2d(self.data) + + #try: + # data = np.random.randn(10000, 10000) + # import numba + # @numba.njit(cache=True) + # def stats_variance_1d(data, ddof=0): + # a, b = 0, 0 + # for i in data: + # a = a + i + # b = b + i * i + # var = b / (len(data)) - ((a / (len(data))) ** 2) + # var = var * (len(data) / (len(data) - ddof)) + # return var + # def stats_variance_2d(data, ddof=0, axis=1): + # a, b = data.shape + # if axis == 1: + # var = np.zeros(a) + # for i in range(a): + # var[i] = stats_variance_1d(data[i], ddof=ddof) + # else: + # var = np.zeros(b) + # for i in range(b): + # var[i] = stats_variance_1d(data[:, i], ddof=ddof) + # return var + # return stats_variance_2d(data) + #except ImportError: + # data = np.random.randn(10000, 10000) + # return np.var(data, axis=1) class CircStd: @@ -73,7 +92,7 @@ def setup(self, numba_flag): else: self.circstd = circstd - def time_circ_std(self, numba_flag): + def time_circ_std(self): self.circstd(self.data) @@ -83,13 +102,12 @@ class Fast_Kde_1d: def setup(self, numba_flag, n): self.x = np.random.randn(n//10, 10) - - def time_fast_kde_normal(self, numba_flag, n): if numba_flag: az.Numba.enable_numba() else: az.Numba.disable_numba() + def time_fast_kde_normal(self): _fast_kde(self.x) class Fast_KDE_2d: From b38619a38b11c4c98e53ea97d54a0efa41390e75 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 5 Apr 2020 15:46:16 +0530 Subject: [PATCH 09/19] minor changes --- asv_benchmarks/benchmarks/benchmarks.py | 8 ++++---- asv_benchmarks/hashestobenchmark.txt | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 asv_benchmarks/hashestobenchmark.txt diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index 6d4dcc50a8..c0ed3288d2 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -23,7 +23,7 @@ def setup(self, numba_flag): else: az.Numba.disable_numba() - def time_histogram(self): + def time_histogram(self, numba_flag): histogram(self.data, bins=100) #try: @@ -49,7 +49,7 @@ def setup(self, numba_flag): else: az.Numba.disable_numba() - def time_variance(self): + def time_variance(self, numba_flag): stats_variance_2d(self.data) #try: @@ -92,7 +92,7 @@ def setup(self, numba_flag): else: self.circstd = circstd - def time_circ_std(self): + def time_circ_std(self, numba_flag): self.circstd(self.data) @@ -107,7 +107,7 @@ def setup(self, numba_flag, n): else: az.Numba.disable_numba() - def time_fast_kde_normal(self): + def time_fast_kde_normal(self, numba_flag, n): _fast_kde(self.x) class Fast_KDE_2d: diff --git a/asv_benchmarks/hashestobenchmark.txt b/asv_benchmarks/hashestobenchmark.txt new file mode 100644 index 0000000000..d6be1e1507 --- /dev/null +++ b/asv_benchmarks/hashestobenchmark.txt @@ -0,0 +1 @@ +8640fac \ No newline at end of file From 9ea2f95b94209c0a8f029385149713ed262e3bae Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 5 Apr 2020 16:09:44 +0530 Subject: [PATCH 10/19] minor modifications to benchmarks --- asv_benchmarks/benchmarks/benchmarks.py | 42 +------------------------ asv_benchmarks/hashestobenchmark.txt | 1 - 2 files changed, 1 insertion(+), 42 deletions(-) delete mode 100644 asv_benchmarks/hashestobenchmark.txt diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index c0ed3288d2..6eccd2b8ca 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -26,17 +26,6 @@ def setup(self, numba_flag): def time_histogram(self, numba_flag): histogram(self.data, bins=100) - #try: - # data = np.random.rand(10000, 1000) - # import numba - # @numba.njit(cache=True) - # def _hist(data): - # return np.histogram(data, bins=100) - # return _hist(data) - #except ImportError: - # data = np.random.rand(10000, 1000) - # return np.histogram(data, bins=100) - class Variance: params = (True, False) @@ -52,34 +41,6 @@ def setup(self, numba_flag): def time_variance(self, numba_flag): stats_variance_2d(self.data) - #try: - # data = np.random.randn(10000, 10000) - # import numba - # @numba.njit(cache=True) - # def stats_variance_1d(data, ddof=0): - # a, b = 0, 0 - # for i in data: - # a = a + i - # b = b + i * i - # var = b / (len(data)) - ((a / (len(data))) ** 2) - # var = var * (len(data) / (len(data) - ddof)) - # return var - # def stats_variance_2d(data, ddof=0, axis=1): - # a, b = data.shape - # if axis == 1: - # var = np.zeros(a) - # for i in range(a): - # var[i] = stats_variance_1d(data[i], ddof=ddof) - # else: - # var = np.zeros(b) - # for i in range(b): - # var[i] = stats_variance_1d(data[:, i], ddof=ddof) - # return var - # return stats_variance_2d(data) - #except ImportError: - # data = np.random.randn(10000, 10000) - # return np.var(data, axis=1) - class CircStd: params = (True, False) @@ -117,13 +78,12 @@ class Fast_KDE_2d: def setup(self, numba_flag, n): self.x = np.random.randn(n//10, 10) self.y = np.random.randn(n//10, 10) - - def time_fast_kde_2d(self, numba_flag, n): if numba_flag: az.Numba.enable_numba() else: az.Numba.disable_numba() + def time_fast_kde_2d(self, numba_flag, n): _fast_kde_2d(self.x, self.y) class Atleast_Nd: diff --git a/asv_benchmarks/hashestobenchmark.txt b/asv_benchmarks/hashestobenchmark.txt deleted file mode 100644 index d6be1e1507..0000000000 --- a/asv_benchmarks/hashestobenchmark.txt +++ /dev/null @@ -1 +0,0 @@ -8640fac \ No newline at end of file From 40f844c8799b3d2030e8531403cd75bdc1fe0ba8 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 5 Apr 2020 16:16:58 +0530 Subject: [PATCH 11/19] remove comma --- asv_benchmarks/benchmarks/benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index 6eccd2b8ca..da2148b4a2 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -44,7 +44,7 @@ def time_variance(self, numba_flag): class CircStd: params = (True, False) - param_names = "Numba", + param_names = "Numba" def setup(self, numba_flag): self.data = np.random.randn(10000, 1000) From 5662efbff3f69643d54ce56f0b40fc48da4a77d9 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Tue, 7 Apr 2020 19:26:01 +0530 Subject: [PATCH 12/19] move get_coords to utils --- arviz/plots/backends/matplotlib/traceplot.py | 4 -- arviz/plots/elpdplot.py | 2 +- arviz/plots/essplot.py | 3 +- arviz/plots/forestplot.py | 4 +- arviz/plots/jointplot.py | 3 +- arviz/plots/khatplot.py | 2 +- arviz/plots/mcseplot.py | 3 +- arviz/plots/pairplot.py | 4 +- arviz/plots/parallelplot.py | 4 +- arviz/plots/plot_utils.py | 45 -------------------- arviz/plots/posteriorplot.py | 3 +- arviz/plots/traceplot.py | 3 +- arviz/stats/stats.py | 6 +-- arviz/tests/base_tests/test_plot_utils.py | 10 +---- arviz/utils.py | 44 +++++++++++++++++++ 15 files changed, 59 insertions(+), 81 deletions(-) diff --git a/arviz/plots/backends/matplotlib/traceplot.py b/arviz/plots/backends/matplotlib/traceplot.py index 7dd33e0429..7255d52a0d 100644 --- a/arviz/plots/backends/matplotlib/traceplot.py +++ b/arviz/plots/backends/matplotlib/traceplot.py @@ -8,13 +8,9 @@ from . import backend_kwarg_defaults, backend_show from ...distplot import plot_dist -<<<<<<< HEAD from ...rankplot import plot_rank -from ...plot_utils import _scale_fig_size, get_bins, make_label, format_coords_as_labels -======= from ...plot_utils import _scale_fig_size, make_label, format_coords_as_labels from ....stats.stats_utils import get_bins ->>>>>>> move plot_kde and get_bins def plot_trace( diff --git a/arviz/plots/elpdplot.py b/arviz/plots/elpdplot.py index cc4d1f9f31..31d80d6236 100644 --- a/arviz/plots/elpdplot.py +++ b/arviz/plots/elpdplot.py @@ -6,7 +6,6 @@ from ..data import convert_to_inference_data from .plot_utils import ( - get_coords, format_coords_as_labels, color_from_dim, get_plotting_function, @@ -14,6 +13,7 @@ ) from ..stats import waic, loo, ELPDData from ..rcparams import rcParams +from ..utils import get_coords def plot_elpd( diff --git a/arviz/plots/essplot.py b/arviz/plots/essplot.py index 7acaf992a0..a7338a6450 100644 --- a/arviz/plots/essplot.py +++ b/arviz/plots/essplot.py @@ -8,13 +8,12 @@ xarray_var_iter, _scale_fig_size, default_grid, - get_coords, filter_plotters_list, get_plotting_function, matplotlib_kwarg_dealiaser, ) from ..rcparams import rcParams -from ..utils import _var_names +from ..utils import _var_names, get_coords def plot_ess( diff --git a/arviz/plots/forestplot.py b/arviz/plots/forestplot.py index b7ec01ac94..ffc63174f9 100644 --- a/arviz/plots/forestplot.py +++ b/arviz/plots/forestplot.py @@ -1,7 +1,7 @@ """Forest plot.""" from ..data import convert_to_dataset -from .plot_utils import get_coords, get_plotting_function -from ..utils import _var_names +from .plot_utils import get_plotting_function +from ..utils import _var_names, get_coords from ..rcparams import rcParams diff --git a/arviz/plots/jointplot.py b/arviz/plots/jointplot.py index 695750c16f..768703e28d 100644 --- a/arviz/plots/jointplot.py +++ b/arviz/plots/jointplot.py @@ -5,12 +5,11 @@ from .plot_utils import ( _scale_fig_size, xarray_var_iter, - get_coords, get_plotting_function, matplotlib_kwarg_dealiaser, ) from ..rcparams import rcParams -from ..utils import _var_names +from ..utils import _var_names, get_coords def plot_joint( diff --git a/arviz/plots/khatplot.py b/arviz/plots/khatplot.py index c62cfae63d..1437a55ac2 100644 --- a/arviz/plots/khatplot.py +++ b/arviz/plots/khatplot.py @@ -7,7 +7,6 @@ from .plot_utils import ( _scale_fig_size, - get_coords, color_from_dim, format_coords_as_labels, get_plotting_function, @@ -15,6 +14,7 @@ ) from ..stats import ELPDData from ..rcparams import rcParams +from ..utils import get_coords def plot_khat( diff --git a/arviz/plots/mcseplot.py b/arviz/plots/mcseplot.py index 905b8c7f8b..b6397d25ec 100644 --- a/arviz/plots/mcseplot.py +++ b/arviz/plots/mcseplot.py @@ -8,13 +8,12 @@ xarray_var_iter, _scale_fig_size, default_grid, - get_coords, filter_plotters_list, get_plotting_function, matplotlib_kwarg_dealiaser, ) from ..rcparams import rcParams -from ..utils import _var_names +from ..utils import _var_names, get_coords def plot_mcse( diff --git a/arviz/plots/pairplot.py b/arviz/plots/pairplot.py index 7adff78a47..62de6b49d6 100644 --- a/arviz/plots/pairplot.py +++ b/arviz/plots/pairplot.py @@ -3,9 +3,9 @@ import numpy as np from ..data import convert_to_dataset, convert_to_inference_data -from .plot_utils import xarray_to_ndarray, get_coords, get_plotting_function +from .plot_utils import xarray_to_ndarray, get_plotting_function from ..rcparams import rcParams -from ..utils import _var_names +from ..utils import _var_names, get_coords def plot_pair( diff --git a/arviz/plots/parallelplot.py b/arviz/plots/parallelplot.py index dae1c019fe..21a8898dda 100644 --- a/arviz/plots/parallelplot.py +++ b/arviz/plots/parallelplot.py @@ -3,9 +3,9 @@ from scipy.stats.mstats import rankdata from ..data import convert_to_dataset -from .plot_utils import _scale_fig_size, xarray_to_ndarray, get_coords, get_plotting_function +from .plot_utils import _scale_fig_size, xarray_to_ndarray, get_plotting_function from ..rcparams import rcParams -from ..utils import _var_names, _numba_var +from ..utils import _var_names, _numba_var, get_coords from ..stats.stats_utils import stats_variance_2d as svar diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index ef98e29151..5b7f8004e2 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -486,51 +486,6 @@ def xarray_to_ndarray(data, *, var_names=None, combined=True): return unpacked_var_names, unpacked_data -def get_coords(data, coords): - """Subselects xarray DataSet or DataArray object to provided coords. Raises exception if fails. - - Raises - ------ - ValueError - If coords name are not available in data - - KeyError - If coords dims are not available in data - - Returns - ------- - data: xarray - xarray.DataSet or xarray.DataArray object, same type as input - """ - if not isinstance(data, (list, tuple)): - try: - return data.sel(**coords) - - except ValueError: - invalid_coords = set(coords.keys()) - set(data.coords.keys()) - raise ValueError("Coords {} are invalid coordinate keys".format(invalid_coords)) - - except KeyError as err: - raise KeyError( - ( - "Coords should follow mapping format {{coord_name:[dim1, dim2]}}. " - "Check that coords structure is correct and" - " dimensions are valid. {}" - ).format(err) - ) - if not isinstance(coords, (list, tuple)): - coords = [coords] * len(data) - data_subset = [] - for idx, (datum, coords_dict) in enumerate(zip(data, coords)): - try: - data_subset.append(get_coords(datum, coords_dict)) - except ValueError as err: - raise ValueError("Error in data[{}]: {}".format(idx, err)) - except KeyError as err: - raise KeyError("Error in data[{}]: {}".format(idx, err)) - return data_subset - - def color_from_dim(dataarray, dim_name): """Return colors and color mapping of a DataArray using coord values as color code. diff --git a/arviz/plots/posteriorplot.py b/arviz/plots/posteriorplot.py index e7656e9dbe..d00cdcdaf4 100644 --- a/arviz/plots/posteriorplot.py +++ b/arviz/plots/posteriorplot.py @@ -6,12 +6,11 @@ xarray_var_iter, _scale_fig_size, default_grid, - get_coords, filter_plotters_list, get_plotting_function, matplotlib_kwarg_dealiaser, ) -from ..utils import _var_names +from ..utils import _var_names, get_coords from ..rcparams import rcParams diff --git a/arviz/plots/traceplot.py b/arviz/plots/traceplot.py index 7618364fe8..1c56ba92de 100644 --- a/arviz/plots/traceplot.py +++ b/arviz/plots/traceplot.py @@ -7,13 +7,12 @@ from .plot_utils import ( get_plotting_function, - get_coords, xarray_var_iter, KwargSpec, matplotlib_kwarg_dealiaser, ) from ..data import convert_to_dataset, InferenceData, CoordSpec -from ..utils import _var_names +from ..utils import _var_names, get_coords from ..rcparams import rcParams diff --git a/arviz/stats/stats.py b/arviz/stats/stats.py index 8708613c54..183cca504a 100644 --- a/arviz/stats/stats.py +++ b/arviz/stats/stats.py @@ -12,10 +12,6 @@ from scipy.optimize import minimize import xarray as xr -<<<<<<< HEAD -from ..plots.plot_utils import _fast_kde, get_bins, get_coords -======= ->>>>>>> move plot_kde and get_bins from ..data import convert_to_inference_data, convert_to_dataset, InferenceData, CoordSpec, DimSpec from .diagnostics import _multichain_statistics, _mc_error, ess from .stats_utils import ( @@ -30,7 +26,7 @@ get_bins, ) from ..kde_utils import _fast_kde -from ..utils import _var_names, Numba, _numba_var +from ..utils import _var_names, Numba, _numba_var, get_coords from ..rcparams import rcParams _log = logging.getLogger(__name__) diff --git a/arviz/tests/base_tests/test_plot_utils.py b/arviz/tests/base_tests/test_plot_utils.py index 49d895c020..c713729a66 100644 --- a/arviz/tests/base_tests/test_plot_utils.py +++ b/arviz/tests/base_tests/test_plot_utils.py @@ -8,17 +8,8 @@ from ...data import from_dict from ..helpers import running_on_ci from ...plots.plot_utils import ( -<<<<<<< HEAD -======= - make_2d, - xarray_to_ndarray, - xarray_var_iter, - get_coords, ->>>>>>> move plot_kde and get_bins filter_plotters_list, format_sig_figs, - get_bins, - get_coords, get_plotting_function, make_2d, matplotlib_kwarg_dealiaser, @@ -27,6 +18,7 @@ ) from ...rcparams import rc_context from ...stats.stats_utils import get_bins +from ...utils import get_coords @pytest.mark.parametrize( diff --git a/arviz/utils.py b/arviz/utils.py index 4673b3e4bf..87c9011da9 100644 --- a/arviz/utils.py +++ b/arviz/utils.py @@ -529,3 +529,47 @@ def flatten_inference_data_to_dict( data_dict[var_name_dim] = var_values[loc] return data_dict + +def get_coords(data, coords): + """Subselects xarray DataSet or DataArray object to provided coords. Raises exception if fails. + + Raises + ------ + ValueError + If coords name are not available in data + + KeyError + If coords dims are not available in data + + Returns + ------- + data: xarray + xarray.DataSet or xarray.DataArray object, same type as input + """ + if not isinstance(data, (list, tuple)): + try: + return data.sel(**coords) + + except ValueError: + invalid_coords = set(coords.keys()) - set(data.coords.keys()) + raise ValueError("Coords {} are invalid coordinate keys".format(invalid_coords)) + + except KeyError as err: + raise KeyError( + ( + "Coords should follow mapping format {{coord_name:[dim1, dim2]}}. " + "Check that coords structure is correct and" + " dimensions are valid. {}" + ).format(err) + ) + if not isinstance(coords, (list, tuple)): + coords = [coords] * len(data) + data_subset = [] + for idx, (datum, coords_dict) in enumerate(zip(data, coords)): + try: + data_subset.append(get_coords(datum, coords_dict)) + except ValueError as err: + raise ValueError("Error in data[{}]: {}".format(idx, err)) + except KeyError as err: + raise KeyError("Error in data[{}]: {}".format(idx, err)) + return data_subset From 08ca77b567fc45b0fa31836674e1f3d9861bb096 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Tue, 7 Apr 2020 20:10:55 +0530 Subject: [PATCH 13/19] move _sturges_formula --- arviz/plots/plot_utils.py | 25 ------------------------- arviz/plots/rankplot.py | 2 +- arviz/stats/stats_utils.py | 24 ++++++++++++++++++++++-- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 5b7f8004e2..892172eb77 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -3,9 +3,6 @@ from typing import Dict, Any from itertools import product, tee import importlib -from scipy.signal import convolve, convolve2d -from scipy.sparse import coo_matrix -from scipy.signal.windows import gaussian from scipy.stats import mode import packaging @@ -98,28 +95,6 @@ def _scale_fig_size(figsize, textsize, rows=1, cols=1): return (width, height), ax_labelsize, titlesize, xt_labelsize, linewidth, markersize -def _sturges_formula(dataset, mult=1): - """Use Sturges' formula to determine number of bins. - - See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula - or https://doi.org/10.1080%2F01621459.1926.10502161 - - Parameters - ---------- - dataset: xarray.DataSet - Must have the `draw` dimension - - mult: float - Used to scale the number of bins up or down. Default is 1 for Sturges' formula. - - Returns - ------- - int - Number of bins to use - """ - return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) - - def default_grid(n_items, max_cols=4, min_cols=3): # noqa: D202 """Make a grid for subplots. diff --git a/arviz/plots/rankplot.py b/arviz/plots/rankplot.py index e1020be8b7..a9ad9c8400 100644 --- a/arviz/plots/rankplot.py +++ b/arviz/plots/rankplot.py @@ -8,11 +8,11 @@ xarray_var_iter, default_grid, filter_plotters_list, - _sturges_formula, get_plotting_function, ) from ..rcparams import rcParams from ..utils import _var_names +from ..stats.stats_utils import _sturges_formula def plot_rank( diff --git a/arviz/stats/stats_utils.py b/arviz/stats/stats_utils.py index 3616eb760d..87ae9ee774 100644 --- a/arviz/stats/stats_utils.py +++ b/arviz/stats/stats_utils.py @@ -7,8 +7,6 @@ import numpy as np import pandas as pd from scipy.fftpack import next_fast_len -from scipy.signal import convolve -from scipy.signal.windows import gaussian from scipy.stats.mstats import mquantiles from xarray import apply_ufunc from ..utils import conditional_jit, conditional_vect @@ -616,3 +614,25 @@ def get_bins(values): width = round(np.max([1, bins_sturges, bins_fd])).astype(int) return np.arange(x_min, x_max + width + 1, width) + + +def _sturges_formula(dataset, mult=1): + """Use Sturges' formula to determine number of bins. + + See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula + or https://doi.org/10.1080%2F01621459.1926.10502161 + + Parameters + ---------- + dataset: xarray.DataSet + Must have the `draw` dimension + + mult: float + Used to scale the number of bins up or down. Default is 1 for Sturges' formula. + + Returns + ------- + int + Number of bins to use + """ + return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) From 17de0ba9e8660e3039c8db8287d9ad04e15779d0 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Wed, 8 Apr 2020 19:39:13 +0530 Subject: [PATCH 14/19] fix pydocstyle --- arviz/kde_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arviz/kde_utils.py b/arviz/kde_utils.py index e21708ffa5..b851542e91 100644 --- a/arviz/kde_utils.py +++ b/arviz/kde_utils.py @@ -1,4 +1,4 @@ -"""KDE functions for ArviZ""" +"""KDE functions for ArviZ.""" import warnings import numpy as np from scipy.signal import convolve, convolve2d From e9adf6ca76cd31f1d9583af197fcdb73274f011d Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Wed, 8 Apr 2020 20:05:49 +0530 Subject: [PATCH 15/19] black changes --- arviz/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arviz/utils.py b/arviz/utils.py index 87c9011da9..923a37ada1 100644 --- a/arviz/utils.py +++ b/arviz/utils.py @@ -530,6 +530,7 @@ def flatten_inference_data_to_dict( data_dict[var_name_dim] = var_values[loc] return data_dict + def get_coords(data, coords): """Subselects xarray DataSet or DataArray object to provided coords. Raises exception if fails. From bc7776152b4b583509ecc3e1f52c90bd1044bd68 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Thu, 9 Apr 2020 12:50:35 +0530 Subject: [PATCH 16/19] fix benchmarks --- asv_benchmarks/benchmarks/benchmarks.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index da2148b4a2..ac0f520846 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -2,11 +2,7 @@ # See "Writing benchmarks" in the airspeed velocity docs for more information. # https://asv.readthedocs.io/en/stable/ import numpy as np -from numpy import newaxis from scipy.stats import circstd -from scipy.sparse import coo_matrix -import scipy.signal as ss -import warnings import arviz as az from arviz.stats.stats_utils import _circular_standard_deviation, histogram, stats_variance_2d from arviz.kde_utils import _fast_kde, _fast_kde_2d @@ -14,7 +10,7 @@ class Hist: params = (True, False) - param_names = "Numba" + param_names = ("Numba",) def setup(self, numba_flag): self.data = np.random.rand(10000, 1000) @@ -29,10 +25,10 @@ def time_histogram(self, numba_flag): class Variance: params = (True, False) - param_names = "Numba" + param_names = ("Numba",) def setup(self, numba_flag): - self.data = np.random.rand(10000, 10000) + self.data = np.random.rand(10000, 1000) if numba_flag: az.Numba.enable_numba() else: @@ -44,7 +40,7 @@ def time_variance(self, numba_flag): class CircStd: params = (True, False) - param_names = "Numba" + param_names = ("Numba",) def setup(self, numba_flag): self.data = np.random.randn(10000, 1000) From a1a0f27c3af86b05e962b1e5e3031057632750da Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Thu, 9 Apr 2020 19:17:04 +0530 Subject: [PATCH 17/19] create numerical_utils --- arviz/{kde_utils.py => numeric_utils.py} | 66 ++++++++++++++++++- arviz/plots/__init__.py | 2 +- arviz/plots/backends/bokeh/densityplot.py | 3 +- arviz/plots/backends/bokeh/distplot.py | 2 +- arviz/plots/backends/bokeh/forestplot.py | 3 +- arviz/plots/backends/bokeh/posteriorplot.py | 2 +- arviz/plots/backends/bokeh/ppcplot.py | 2 +- arviz/plots/backends/bokeh/violinplot.py | 3 +- .../plots/backends/matplotlib/densityplot.py | 3 +- arviz/plots/backends/matplotlib/distplot.py | 2 +- arviz/plots/backends/matplotlib/forestplot.py | 3 +- arviz/plots/backends/matplotlib/loopitplot.py | 2 +- .../backends/matplotlib/posteriorplot.py | 2 +- arviz/plots/backends/matplotlib/ppcplot.py | 3 +- arviz/plots/backends/matplotlib/traceplot.py | 2 +- arviz/plots/backends/matplotlib/violinplot.py | 3 +- arviz/plots/distplot.py | 2 +- arviz/plots/kdeplot.py | 2 +- arviz/plots/loopitplot.py | 2 +- arviz/plots/plot_utils.py | 2 +- arviz/plots/rankplot.py | 2 +- arviz/stats/stats.py | 4 +- arviz/stats/stats_utils.py | 64 ------------------ arviz/tests/base_tests/test_plot_utils.py | 2 +- .../tests/base_tests/test_plots_matplotlib.py | 2 +- asv_benchmarks/benchmarks/benchmarks.py | 4 +- 26 files changed, 90 insertions(+), 99 deletions(-) rename arviz/{kde_utils.py => numeric_utils.py} (67%) diff --git a/arviz/kde_utils.py b/arviz/numeric_utils.py similarity index 67% rename from arviz/kde_utils.py rename to arviz/numeric_utils.py index b851542e91..757284b2ec 100644 --- a/arviz/kde_utils.py +++ b/arviz/numeric_utils.py @@ -1,4 +1,4 @@ -"""KDE functions for ArviZ.""" +"""Numerical utility functions for ArviZ.""" import warnings import numpy as np from scipy.signal import convolve, convolve2d @@ -149,3 +149,67 @@ def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): grid /= norm_factor return grid, xmin, xmax, ymin, ymax + + +def get_bins(values): + """ + Automatically compute the number of bins for discrete variables. + + Parameters + ---------- + values = numpy array + values + + Returns + ------- + array with the bins + + Notes + ----- + Computes the width of the bins by taking the maximun of the Sturges and the Freedman-Diaconis + estimators. Acording to numpy `np.histogram` this provides good all around performance. + + The Sturges is a very simplistic estimator based on the assumption of normality of the data. + This estimator has poor performance for non-normal data, which becomes especially obvious for + large data sets. The estimate depends only on size of the data. + + The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth. + It is considered a robusts version of the Scott rule as the IQR is less affected by outliers + than the standard deviation. However, the IQR depends on fewer points than the standard + deviation, so it is less accurate, especially for long tailed distributions. + """ + x_min = values.min().astype(int) + x_max = values.max().astype(int) + + # Sturges histogram bin estimator + bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1) + + # The Freedman-Diaconis histogram bin estimator. + iqr = np.subtract(*np.percentile(values, [75, 25])) # pylint: disable=assignment-from-no-return + bins_fd = 2 * iqr * values.size ** (-1 / 3) + + width = round(np.max([1, bins_sturges, bins_fd])).astype(int) + + return np.arange(x_min, x_max + width + 1, width) + + +def _sturges_formula(dataset, mult=1): + """Use Sturges' formula to determine number of bins. + + See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula + or https://doi.org/10.1080%2F01621459.1926.10502161 + + Parameters + ---------- + dataset: xarray.DataSet + Must have the `draw` dimension + + mult: float + Used to scale the number of bins up or down. Default is 1 for Sturges' formula. + + Returns + ------- + int + Number of bins to use + """ + return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) diff --git a/arviz/plots/__init__.py b/arviz/plots/__init__.py index ef2c478307..5cc2a6c087 100644 --- a/arviz/plots/__init__.py +++ b/arviz/plots/__init__.py @@ -20,7 +20,7 @@ from .rankplot import plot_rank from .traceplot import plot_trace from .violinplot import plot_violin -from ..kde_utils import _fast_kde_2d +from ..numeric_utils import _fast_kde_2d __all__ = [ diff --git a/arviz/plots/backends/bokeh/densityplot.py b/arviz/plots/backends/bokeh/densityplot.py index a42038a6d4..f21ade24ea 100644 --- a/arviz/plots/backends/bokeh/densityplot.py +++ b/arviz/plots/backends/bokeh/densityplot.py @@ -12,8 +12,7 @@ calculate_point_estimate, ) from ....stats import hpd -from ....stats.stats_utils import histogram, get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins def plot_density( diff --git a/arviz/plots/backends/bokeh/distplot.py b/arviz/plots/backends/bokeh/distplot.py index b02f09a44d..e2826a0638 100644 --- a/arviz/plots/backends/bokeh/distplot.py +++ b/arviz/plots/backends/bokeh/distplot.py @@ -6,7 +6,7 @@ from . import backend_kwarg_defaults from .. import show_layout from ...kdeplot import plot_kde -from ....stats.stats_utils import get_bins +from ....numeric_utils import get_bins def plot_dist( diff --git a/arviz/plots/backends/bokeh/forestplot.py b/arviz/plots/backends/bokeh/forestplot.py index ac058a60b5..86fc541520 100644 --- a/arviz/plots/backends/bokeh/forestplot.py +++ b/arviz/plots/backends/bokeh/forestplot.py @@ -16,8 +16,7 @@ from ....rcparams import rcParams from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram, get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins from ....utils import conditional_jit diff --git a/arviz/plots/backends/bokeh/posteriorplot.py b/arviz/plots/backends/bokeh/posteriorplot.py index 984f50b851..3996ee8590 100644 --- a/arviz/plots/backends/bokeh/posteriorplot.py +++ b/arviz/plots/backends/bokeh/posteriorplot.py @@ -16,7 +16,7 @@ calculate_point_estimate, ) from ....stats import hpd -from ....stats.stats_utils import get_bins +from ....numeric_utils import get_bins def plot_posterior( diff --git a/arviz/plots/backends/bokeh/ppcplot.py b/arviz/plots/backends/bokeh/ppcplot.py index 4160ed47d1..ade5e7b832 100644 --- a/arviz/plots/backends/bokeh/ppcplot.py +++ b/arviz/plots/backends/bokeh/ppcplot.py @@ -5,7 +5,7 @@ from .. import show_layout from ...kdeplot import plot_kde, _fast_kde from ...plot_utils import _create_axes_grid -from ....stats.stats_utils import histogram, get_bins +from ....numeric_utils import histogram, get_bins def plot_ppc( diff --git a/arviz/plots/backends/bokeh/violinplot.py b/arviz/plots/backends/bokeh/violinplot.py index ac53f73c1f..e312c747d6 100644 --- a/arviz/plots/backends/bokeh/violinplot.py +++ b/arviz/plots/backends/bokeh/violinplot.py @@ -4,10 +4,9 @@ from . import backend_kwarg_defaults from .. import show_layout -from ...kdeplot import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins from ...plot_utils import make_label, _create_axes_grid from ....stats import hpd -from ....stats.stats_utils import histogram, get_bins def plot_violin( diff --git a/arviz/plots/backends/matplotlib/densityplot.py b/arviz/plots/backends/matplotlib/densityplot.py index 7f914daa14..8437bcf7c5 100644 --- a/arviz/plots/backends/matplotlib/densityplot.py +++ b/arviz/plots/backends/matplotlib/densityplot.py @@ -9,8 +9,7 @@ _create_axes_grid, calculate_point_estimate, ) -from ....stats.stats_utils import get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, get_bins def plot_density( diff --git a/arviz/plots/backends/matplotlib/distplot.py b/arviz/plots/backends/matplotlib/distplot.py index c719c0db44..f71c0110ba 100644 --- a/arviz/plots/backends/matplotlib/distplot.py +++ b/arviz/plots/backends/matplotlib/distplot.py @@ -5,7 +5,7 @@ from . import backend_show from ...kdeplot import plot_kde from ...plot_utils import matplotlib_kwarg_dealiaser -from ....stats.stats_utils import get_bins +from ....numeric_utils import get_bins def plot_dist( diff --git a/arviz/plots/backends/matplotlib/forestplot.py b/arviz/plots/backends/matplotlib/forestplot.py index 8b4413c23f..20b453492a 100644 --- a/arviz/plots/backends/matplotlib/forestplot.py +++ b/arviz/plots/backends/matplotlib/forestplot.py @@ -9,8 +9,7 @@ from . import backend_kwarg_defaults, backend_show from ....stats import hpd from ....stats.diagnostics import _ess, _rhat -from ....stats.stats_utils import histogram, get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins from ...plot_utils import _scale_fig_size, xarray_var_iter, make_label from ....utils import conditional_jit diff --git a/arviz/plots/backends/matplotlib/loopitplot.py b/arviz/plots/backends/matplotlib/loopitplot.py index 8f7b59f796..f50fc908e8 100644 --- a/arviz/plots/backends/matplotlib/loopitplot.py +++ b/arviz/plots/backends/matplotlib/loopitplot.py @@ -3,7 +3,7 @@ import numpy as np from . import backend_kwarg_defaults, backend_show -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde from ...hpdplot import plot_hpd diff --git a/arviz/plots/backends/matplotlib/posteriorplot.py b/arviz/plots/backends/matplotlib/posteriorplot.py index 6ea8f3d932..15dd3146d5 100644 --- a/arviz/plots/backends/matplotlib/posteriorplot.py +++ b/arviz/plots/backends/matplotlib/posteriorplot.py @@ -14,7 +14,7 @@ round_num, calculate_point_estimate, ) -from ....stats.stats_utils import get_bins +from ....numeric_utils import get_bins def plot_posterior( diff --git a/arviz/plots/backends/matplotlib/ppcplot.py b/arviz/plots/backends/matplotlib/ppcplot.py index 2420a22bd6..47d96c2eea 100644 --- a/arviz/plots/backends/matplotlib/ppcplot.py +++ b/arviz/plots/backends/matplotlib/ppcplot.py @@ -9,8 +9,7 @@ make_label, _create_axes_grid, ) -from ....stats.stats_utils import histogram, get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins def plot_ppc( diff --git a/arviz/plots/backends/matplotlib/traceplot.py b/arviz/plots/backends/matplotlib/traceplot.py index 7255d52a0d..c848bb9a93 100644 --- a/arviz/plots/backends/matplotlib/traceplot.py +++ b/arviz/plots/backends/matplotlib/traceplot.py @@ -10,7 +10,7 @@ from ...distplot import plot_dist from ...rankplot import plot_rank from ...plot_utils import _scale_fig_size, make_label, format_coords_as_labels -from ....stats.stats_utils import get_bins +from ....numeric_utils import get_bins def plot_trace( diff --git a/arviz/plots/backends/matplotlib/violinplot.py b/arviz/plots/backends/matplotlib/violinplot.py index 0fb4107584..147f143065 100644 --- a/arviz/plots/backends/matplotlib/violinplot.py +++ b/arviz/plots/backends/matplotlib/violinplot.py @@ -4,8 +4,7 @@ from . import backend_show from ....stats import hpd -from ....stats.stats_utils import histogram, get_bins -from ....kde_utils import _fast_kde +from ....numeric_utils import _fast_kde, histogram, get_bins from ...plot_utils import make_label, _create_axes_grid diff --git a/arviz/plots/distplot.py b/arviz/plots/distplot.py index 2d2a72cbaf..024123acd1 100644 --- a/arviz/plots/distplot.py +++ b/arviz/plots/distplot.py @@ -3,7 +3,7 @@ import xarray as xr from .plot_utils import get_plotting_function, matplotlib_kwarg_dealiaser -from ..stats.stats_utils import get_bins +from ..numeric_utils import get_bins from ..data import InferenceData from ..rcparams import rcParams diff --git a/arviz/plots/kdeplot.py b/arviz/plots/kdeplot.py index 0ece8dde55..33714c2115 100644 --- a/arviz/plots/kdeplot.py +++ b/arviz/plots/kdeplot.py @@ -3,7 +3,7 @@ import xarray as xr from ..data import InferenceData -from ..kde_utils import _fast_kde, _fast_kde_2d +from ..numeric_utils import _fast_kde, _fast_kde_2d from .plot_utils import get_plotting_function from ..rcparams import rcParams diff --git a/arviz/plots/loopitplot.py b/arviz/plots/loopitplot.py index 43e456d25d..4135dc1f23 100644 --- a/arviz/plots/loopitplot.py +++ b/arviz/plots/loopitplot.py @@ -5,7 +5,7 @@ from xarray import DataArray from ..stats import loo_pit as _loo_pit -from ..kde_utils import _fast_kde +from ..numeric_utils import _fast_kde from .plot_utils import ( _scale_fig_size, get_plotting_function, diff --git a/arviz/plots/plot_utils.py b/arviz/plots/plot_utils.py index 892172eb77..874dc6a30e 100644 --- a/arviz/plots/plot_utils.py +++ b/arviz/plots/plot_utils.py @@ -12,7 +12,7 @@ import matplotlib.cbook as cbook import xarray as xr -from ..kde_utils import _fast_kde +from ..numeric_utils import _fast_kde from ..rcparams import rcParams KwargSpec = Dict[str, Any] diff --git a/arviz/plots/rankplot.py b/arviz/plots/rankplot.py index a9ad9c8400..036b92d59f 100644 --- a/arviz/plots/rankplot.py +++ b/arviz/plots/rankplot.py @@ -12,7 +12,7 @@ ) from ..rcparams import rcParams from ..utils import _var_names -from ..stats.stats_utils import _sturges_formula +from ..numeric_utils import _sturges_formula def plot_rank( diff --git a/arviz/stats/stats.py b/arviz/stats/stats.py index 183cca504a..6f6764d2df 100644 --- a/arviz/stats/stats.py +++ b/arviz/stats/stats.py @@ -20,12 +20,10 @@ logsumexp as _logsumexp, ELPDData, stats_variance_2d as svar, - histogram, _circular_standard_deviation, get_log_likelihood as _get_log_likelihood, - get_bins, ) -from ..kde_utils import _fast_kde +from ..numeric_utils import _fast_kde, histogram, get_bins from ..utils import _var_names, Numba, _numba_var, get_coords from ..rcparams import rcParams diff --git a/arviz/stats/stats_utils.py b/arviz/stats/stats_utils.py index 87ae9ee774..e26af7b3cb 100644 --- a/arviz/stats/stats_utils.py +++ b/arviz/stats/stats_utils.py @@ -572,67 +572,3 @@ def _circular_standard_deviation(samples, high=2 * np.pi, low=0, skipna=False, a c_c = np.cos(ang).mean(axis=axis) r_r = np.hypot(s_s, c_c) return ((high - low) / 2.0 / np.pi) * np.sqrt(-2 * np.log(r_r)) - - -def get_bins(values): - """ - Automatically compute the number of bins for discrete variables. - - Parameters - ---------- - values = numpy array - values - - Returns - ------- - array with the bins - - Notes - ----- - Computes the width of the bins by taking the maximun of the Sturges and the Freedman-Diaconis - estimators. Acording to numpy `np.histogram` this provides good all around performance. - - The Sturges is a very simplistic estimator based on the assumption of normality of the data. - This estimator has poor performance for non-normal data, which becomes especially obvious for - large data sets. The estimate depends only on size of the data. - - The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth. - It is considered a robusts version of the Scott rule as the IQR is less affected by outliers - than the standard deviation. However, the IQR depends on fewer points than the standard - deviation, so it is less accurate, especially for long tailed distributions. - """ - x_min = values.min().astype(int) - x_max = values.max().astype(int) - - # Sturges histogram bin estimator - bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1) - - # The Freedman-Diaconis histogram bin estimator. - iqr = np.subtract(*np.percentile(values, [75, 25])) # pylint: disable=assignment-from-no-return - bins_fd = 2 * iqr * values.size ** (-1 / 3) - - width = round(np.max([1, bins_sturges, bins_fd])).astype(int) - - return np.arange(x_min, x_max + width + 1, width) - - -def _sturges_formula(dataset, mult=1): - """Use Sturges' formula to determine number of bins. - - See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula - or https://doi.org/10.1080%2F01621459.1926.10502161 - - Parameters - ---------- - dataset: xarray.DataSet - Must have the `draw` dimension - - mult: float - Used to scale the number of bins up or down. Default is 1 for Sturges' formula. - - Returns - ------- - int - Number of bins to use - """ - return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) diff --git a/arviz/tests/base_tests/test_plot_utils.py b/arviz/tests/base_tests/test_plot_utils.py index c713729a66..1f81610f1c 100644 --- a/arviz/tests/base_tests/test_plot_utils.py +++ b/arviz/tests/base_tests/test_plot_utils.py @@ -17,7 +17,7 @@ xarray_var_iter, ) from ...rcparams import rc_context -from ...stats.stats_utils import get_bins +from ...numeric_utils import get_bins from ...utils import get_coords diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py index cb3fd05618..8733abc268 100644 --- a/arviz/tests/base_tests/test_plots_matplotlib.py +++ b/arviz/tests/base_tests/test_plots_matplotlib.py @@ -42,7 +42,7 @@ plot_mcse, ) from ...utils import _cov -from ...kde_utils import _fast_kde +from ...numeric_utils import _fast_kde rcParams["data.load"] = "eager" diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index ac0f520846..1a6a3d01c5 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -4,8 +4,8 @@ import numpy as np from scipy.stats import circstd import arviz as az -from arviz.stats.stats_utils import _circular_standard_deviation, histogram, stats_variance_2d -from arviz.kde_utils import _fast_kde, _fast_kde_2d +from arviz.stats.stats_utils import _circular_standard_deviation, stats_variance_2d +from arviz.numeric_utils import _fast_kde, _fast_kde_2d, histogram class Hist: From e5414b5894fb6d5fd550d85facac17b776ec6eca Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 12 Apr 2020 20:51:33 +0530 Subject: [PATCH 18/19] update fast_kde_2d benchmark and changelog --- .azure-pipelines/azure-pipelines-external.yml | 2 +- CHANGELOG.md | 2 ++ CONTRIBUTING.md | 2 +- asv_benchmarks/benchmarks/benchmarks.py | 20 ++++++++++--------- scripts/lint.sh | 4 ++++ 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.azure-pipelines/azure-pipelines-external.yml b/.azure-pipelines/azure-pipelines-external.yml index 4e2508d7c0..2ac428b12d 100644 --- a/.azure-pipelines/azure-pipelines-external.yml +++ b/.azure-pipelines/azure-pipelines-external.yml @@ -119,7 +119,7 @@ jobs: displayName: 'pydocstyle' - script: | - python -m black --check arviz examples + python -m black --check arviz examples asv_benchmarks displayName: 'black' - task: PublishTestResults@2 diff --git a/CHANGELOG.md b/CHANGELOG.md index ebf6822400..7278cdd870 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,8 @@ * Fixed `TypeError` in `transform` argument of `plot_density` and `plot_forest` when `InferenceData` is a list or tuple (#1121) * Fixed overlaid pairplots issue (#1135) * Update Docker building steps (#1127) +* Updated benchmarks and moved to asv_benchmarks/benchmarks (#1142) +* Moved `_fast_kde`, `_fast_kde_2d`, `get_bins` and `_sturges_formula` to `numeric_utils` and `get_coords` to `utils` (#1142) ### Deprecation ### Documentation diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d76d1eb76a..f188148114 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -192,7 +192,7 @@ tools: ```bash $ pip install black - $ black arviz/ examples/ + $ black arviz/ examples/ asv_benchmarks/ ``` * Your code passes pylint diff --git a/asv_benchmarks/benchmarks/benchmarks.py b/asv_benchmarks/benchmarks/benchmarks.py index 1a6a3d01c5..b516a26a7c 100644 --- a/asv_benchmarks/benchmarks/benchmarks.py +++ b/asv_benchmarks/benchmarks/benchmarks.py @@ -34,7 +34,7 @@ def setup(self, numba_flag): else: az.Numba.disable_numba() - def time_variance(self, numba_flag): + def time_variance_2d(self, numba_flag): stats_variance_2d(self.data) @@ -54,11 +54,11 @@ def time_circ_std(self, numba_flag): class Fast_Kde_1d: - params = [(True, False), (10**5, 10**6, 10**7)] + params = [(True, False), (10 ** 5, 10 ** 6, 10 ** 7)] param_names = ("Numba", "n") def setup(self, numba_flag, n): - self.x = np.random.randn(n//10, 10) + self.x = np.random.randn(n // 10, 10) if numba_flag: az.Numba.enable_numba() else: @@ -67,21 +67,23 @@ def setup(self, numba_flag, n): def time_fast_kde_normal(self, numba_flag, n): _fast_kde(self.x) + class Fast_KDE_2d: - params = [(True, False), (10**5, 10**6)] - param_names = ("Numba", "n") + params = [(True, False), ((100, 10 ** 4), (10 ** 4, 100), (1000, 1000))] + param_names = ("Numba", "shape") - def setup(self, numba_flag, n): - self.x = np.random.randn(n//10, 10) - self.y = np.random.randn(n//10, 10) + def setup(self, numba_flag, shape): + self.x = np.random.randn(*shape) + self.y = np.random.randn(*shape) if numba_flag: az.Numba.enable_numba() else: az.Numba.disable_numba() - def time_fast_kde_2d(self, numba_flag, n): + def time_fast_kde_2d(self, numba_flag, shape): _fast_kde_2d(self.x, self.y) + class Atleast_Nd: params = ("az.utils", "numpy") param_names = ("source",) diff --git a/scripts/lint.sh b/scripts/lint.sh index 59bcfa2e07..cdc99ba3ef 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -9,7 +9,11 @@ python -m pydocstyle --convention=numpy ${SRC_DIR}/arviz/ echo "Success!" echo "Checking code style with black..." +<<<<<<< HEAD python -m black -l 100 --check ${SRC_DIR}/arviz/ ${SRC_DIR}/examples/ +======= +python -m black -l 100 --check ${SRC_DIR}/arviz/ ${SRC_DIR}/examples/ ${SRC_DIR}/asv_benchmarks/ +>>>>>>> update fast_kde_2d benchmark and changelog echo "Success!" echo "Checking code style with pylint..." From 7e4e6385a792174aa3b69ac6b7add0a5ab9f4a93 Mon Sep 17 00:00:00 2001 From: nitishp25 Date: Sun, 12 Apr 2020 21:18:05 +0530 Subject: [PATCH 19/19] update lint.sh --- scripts/lint.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/lint.sh b/scripts/lint.sh index cdc99ba3ef..a62334ee96 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -9,11 +9,7 @@ python -m pydocstyle --convention=numpy ${SRC_DIR}/arviz/ echo "Success!" echo "Checking code style with black..." -<<<<<<< HEAD -python -m black -l 100 --check ${SRC_DIR}/arviz/ ${SRC_DIR}/examples/ -======= python -m black -l 100 --check ${SRC_DIR}/arviz/ ${SRC_DIR}/examples/ ${SRC_DIR}/asv_benchmarks/ ->>>>>>> update fast_kde_2d benchmark and changelog echo "Success!" echo "Checking code style with pylint..."