From f197aeaeaf7d9f3e713abd845f6ee6eb47ab7ed6 Mon Sep 17 00:00:00 2001 From: Jonas Schulze Date: Sat, 10 Mar 2018 16:05:23 +0100 Subject: [PATCH 1/2] DOC: update the pandas.DataFrame.plot.kde and pandas.Series.plot.kde docstrings Unfortunately, I was not able to compute a kernel estimate of a two-dimensional random variable. Hence, the example is more of an analysis of some independent data series. --- pandas/plotting/_core.py | 94 ++++++++++++++++++++++++++++++++-------- 1 file changed, 75 insertions(+), 19 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 520c6cecce6d7..d586f06c46e94 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2618,13 +2618,16 @@ def hist(self, bins=10, **kwds): def kde(self, bw_method=None, ind=None, **kwds): """ - Kernel Density Estimate plot using Gaussian kernels. + Generate Kernel Density Estimate plot using Gaussian kernels. - In statistics, kernel density estimation (KDE) is a non-parametric way - to estimate the probability density function (PDF) of a random + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random variable. This function uses Gaussian kernels and includes automatic bandwith determination. + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation + Parameters ---------- bw_method : str, scalar or callable, optional @@ -2635,9 +2638,9 @@ def kde(self, bw_method=None, ind=None, **kwds): ind : NumPy array or integer, optional Evaluation points for the estimated PDF. If None (default), 1000 equally spaced points are used. If `ind` is a NumPy array, the - kde is evaluated at the points passed. If `ind` is an integer, + KDE is evaluated at the points passed. If `ind` is an integer, `ind` number of equally spaced points are used. - kwds : optional + **kwds : optional Additional keyword arguments are documented in :meth:`pandas.Series.plot`. 
@@ -2645,16 +2648,17 @@ def kde(self, bw_method=None, ind=None, **kwds): ------- axes : matplotlib.AxesSubplot or np.array of them - See also + See Also -------- scipy.stats.gaussian_kde : Representation of a kernel-density estimate using Gaussian kernels. This is the function used internally to estimate the PDF. + DataFrame.plot.kde : Generate a KDE plot for a DataFrame. Examples -------- Given a Series of points randomly sampled from an unknown - distribution, estimate this distribution using KDE with automatic + distribution, estimate its distribution using KDE with automatic bandwidth determination and plot the results, evaluating them at 1000 equally spaced points (default): @@ -2664,10 +2668,9 @@ def kde(self, bw_method=None, ind=None, **kwds): >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) >>> ax = s.plot.kde() - - An scalar fixed bandwidth can be specified. Using a too small bandwidth - can lead to overfitting, while a too large bandwidth can result in - underfitting: + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to overfitting, while using a large bandwidth value may result + in underfitting: .. plot:: :context: close-figs @@ -2851,27 +2854,80 @@ def hist(self, by=None, bins=10, **kwds): def kde(self, bw_method=None, ind=None, **kwds): """ - Kernel Density Estimate plot + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwith determination. + + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation Parameters ---------- - bw_method: str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. 
This can be 'scott', 'silverman', a scalar constant or a callable. If None (default), 'scott' is used. See :class:`scipy.stats.gaussian_kde` for more information. ind : NumPy array or integer, optional - Evaluation points. If None (default), 1000 equally spaced points - are used. If `ind` is a NumPy array, the kde is evaluated at the - points passed. If `ind` is an integer, `ind` number of equally - spaced points are used. - `**kwds` : optional + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + KDE is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwds : optional Additional keyword arguments are documented in :meth:`pandas.DataFrame.plot`. Returns ------- axes : matplotlib.AxesSubplot or np.array of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + Series.plot.kde : Generate a KDE plot for a Series. + + Examples + -------- + Given several Series of points randomly sampled from unknown + distributions, estimate their distribution using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'x': [1, 2, 2.5, 3, 3.5, 4, 5], + ... 'y': [4, 4, 4.5, 5, 5.5, 6, 6], + ... }) + >>> ax = df.plot.kde() + + A scalar bandwidth can be specified. Using a small bandwidth value can + lead to overfitting, while using a large bandwidth value may result + in underfitting: + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = df.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. 
plot:: + :context: close-figs + + >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) """ return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) From a95751e90de542ae8c7216139d173f405ea3d04b Mon Sep 17 00:00:00 2001 From: Jonas Schulze Date: Sun, 11 Mar 2018 01:05:08 +0100 Subject: [PATCH 2/2] DOC: extract similarities of kde docstrings The `DataFrame.plot.kde` and `Series.plot.kde` now use a common docstring, for which the differences are inserted. --- pandas/plotting/_core.py | 146 ++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 86 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index d586f06c46e94..f587cb91ab932 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1380,6 +1380,50 @@ def orientation(self): return 'vertical' +_kde_docstring = """ + Generate Kernel Density Estimate plot using Gaussian kernels. + + In statistics, `kernel density estimation`_ (KDE) is a non-parametric + way to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwidth determination. + + .. _kernel density estimation: + https://en.wikipedia.org/wiki/Kernel_density_estimation + + Parameters + ---------- + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be + 'scott', 'silverman', a scalar constant or a callable. + If None (default), 'scott' is used. + See :class:`scipy.stats.gaussian_kde` for more information. + ind : NumPy array or integer, optional + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + KDE is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + **kwds : optional + Additional keyword arguments are documented in + :meth:`pandas.%(this-datatype)s.plot`. 
+ + Returns + ------- + axes : matplotlib.AxesSubplot or np.array of them + + See Also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + %(sibling-datatype)s.plot.kde : Generate a KDE plot for a + %(sibling-datatype)s. + + Examples + -------- + %(examples)s + """ + class KdePlot(HistPlot): _kind = 'kde' orientation = 'vertical' @@ -2616,49 +2660,12 @@ def hist(self, bins=10, **kwds): """ return self(kind='hist', bins=bins, **kwds) - def kde(self, bw_method=None, ind=None, **kwds): - """ - Generate Kernel Density Estimate plot using Gaussian kernels. - - In statistics, `kernel density estimation`_ (KDE) is a non-parametric - way to estimate the probability density function (PDF) of a random - variable. This function uses Gaussian kernels and includes automatic - bandwith determination. - - .. _kernel density estimation: - https://en.wikipedia.org/wiki/Kernel_density_estimation - - Parameters - ---------- - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. - If None (default), 'scott' is used. - See :class:`scipy.stats.gaussian_kde` for more information. - ind : NumPy array or integer, optional - Evaluation points for the estimated PDF. If None (default), - 1000 equally spaced points are used. If `ind` is a NumPy array, the - KDE is evaluated at the points passed. If `ind` is an integer, - `ind` number of equally spaced points are used. - **kwds : optional - Additional keyword arguments are documented in - :meth:`pandas.Series.plot`. - - Returns - ------- - axes : matplotlib.AxesSubplot or np.array of them - - See Also - -------- - scipy.stats.gaussian_kde : Representation of a kernel-density - estimate using Gaussian kernels. This is the function used - internally to estimate the PDF. 
- DataFrame.plot.kde : Generate a KDE plot for a DataFrame. - - Examples - -------- + @Appender(_kde_docstring % { + 'this-datatype': 'Series', + 'sibling-datatype': 'DataFrame', + 'examples': """ Given a Series of points randomly sampled from an unknown - distribution, estimate its distribution using KDE with automatic + distribution, estimate its PDF using KDE with automatic bandwidth determination and plot the results, evaluating them at 1000 equally spaced points (default): @@ -2689,7 +2696,9 @@ def kde(self, bw_method=None, ind=None, **kwds): :context: close-figs >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) - """ + """.strip() + }) + def kde(self, bw_method=None, ind=None, **kwds): return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) density = kde @@ -2852,49 +2861,12 @@ def hist(self, by=None, bins=10, **kwds): """ return self(kind='hist', by=by, bins=bins, **kwds) - def kde(self, bw_method=None, ind=None, **kwds): - """ - Generate Kernel Density Estimate plot using Gaussian kernels. - - In statistics, `kernel density estimation`_ (KDE) is a non-parametric - way to estimate the probability density function (PDF) of a random - variable. This function uses Gaussian kernels and includes automatic - bandwith determination. - - .. _kernel density estimation: - https://en.wikipedia.org/wiki/Kernel_density_estimation - - Parameters - ---------- - bw_method : str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be - 'scott', 'silverman', a scalar constant or a callable. - If None (default), 'scott' is used. - See :class:`scipy.stats.gaussian_kde` for more information. - ind : NumPy array or integer, optional - Evaluation points for the estimated PDF. If None (default), - 1000 equally spaced points are used. If `ind` is a NumPy array, the - KDE is evaluated at the points passed. If `ind` is an integer, - `ind` number of equally spaced points are used. 
- **kwds : optional - Additional keyword arguments are documented in - :meth:`pandas.DataFrame.plot`. - - Returns - ------- - axes : matplotlib.AxesSubplot or np.array of them - - See Also - -------- - scipy.stats.gaussian_kde : Representation of a kernel-density - estimate using Gaussian kernels. This is the function used - internally to estimate the PDF. - Series.plot.kde : Generate a KDE plot for a Series. - - Examples - -------- + @Appender(_kde_docstring % { + 'this-datatype': 'DataFrame', + 'sibling-datatype': 'Series', + 'examples': """ Given several Series of points randomly sampled from unknown - distributions, estimate their distribution using KDE with automatic + distributions, estimate their PDFs using KDE with automatic bandwidth determination and plot the results, evaluating them at 1000 equally spaced points (default): @@ -2928,7 +2900,9 @@ def kde(self, bw_method=None, ind=None, **kwds): :context: close-figs >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6]) - """ + """.strip() + }) + def kde(self, bw_method=None, ind=None, **kwds): return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) density = kde