diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..188a2df --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,13 @@ +version: 2 + +build: + os: "ubuntu-22.04" + tools: + python: "3.10" + +python: + install: + - requirements: docs/requirements.txt + +sphinx: + configuration: docs/source/conf.py \ No newline at end of file diff --git a/README.md b/README.md index 119c3cf..29812cd 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,10 @@ MIT license   - build   + + build   here. ## Table of contents @@ -101,6 +108,7 @@ Further examples can be found here. +- We are continuing to expand our library, and are open to suggestions for new models to implement. If you have a model you would like to see implemented, please open an issue on our GitHub page. ## Testing All tests are written using pytest and cover all user accessible code. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..6247f7e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..53fc1f3 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx==7.1.2 +sphinx-rtd-theme==1.3.0rc1 diff --git a/docs/source/Copulas.rst b/docs/source/Copulas.rst new file mode 100644 index 0000000..30f740d --- /dev/null +++ b/docs/source/Copulas.rst @@ -0,0 +1,342 @@ +.. _copulas: + +############## +Copula Models +############## + +This SklarPy package contains many different copula models. +Unlike univariate distributions, these are not wrappers of scipy objects. + +All implemented copula models are able to be fitted to both multivariate numpy and pandas data and contain easy saving and plotting methods. + +An important concept to remember when using these models that they are composed of 2 overall parts: + +1. The marginal distributions. These are univariate distributions of each random variable. +2. The copula distribution. 
This multivariate model captures the dependence structure between the variables. + +The overall multivariate joint distribution is created by combining these two parts, +which is handled by SklarPy's copula models. + +Which copula models are implemented? +------------------------------------ +Currently, the following copula models are implemented: + +.. csv-table:: Copula Models + :file: copula_table.csv + :header-rows: 1 + +All Normal-Mixture models use the parameterization specified by McNeil, Frey and Embrechts (2005). + +MarginalFitter +-------------- +This class is used to fit multiple univariate distributions to data easily and evaluate their methods. +It implements the following methods and attributes: + +- marginal_logpdf (log of the probability density functions of the marginal distributions) +- marginal_pdfs (the probability density functions of the fitted marginal distributions) +- marginal_cdfs (the cumulative distribution functions of the fitted marginal distributions) +- marginal_ppfs (the percent point functions / inverse cdfs of the fitted marginal distributions) +- marginal_rvs (random variate generators / samplers of the fitted marginal distributions) +- pairplot (pairplot of the fitted marginal distributions) +- marginals (the fitted marginal distributions as a dictionary) +- summary (a summary of the fitted marginal distributions) +- num_variables (the number of variables present in the original dataset) +- fitted (whether the marginal distributions have been fitted to data) +- fit (fitting the marginal distributions to data) + +PreFitCopula +------------- +This is the base class for all copula models. 
It implements the following methods and attributes: + +- logpdf (log of the probability density function of the overall joint distribution) +- pdf (probability density function of the overall joint distribution) +- cdf (cumulative distribution function of the overall joint distribution) +- mc_cdf (Monte Carlo approximation of the cumulative distribution function of the overall joint distribution) +- rvs (random variate generator / sampler of the overall joint distribution) +- copula_logpdf (log of the probability density function of the copula distribution) +- copula_pdf (probability density function of the copula distribution) +- copula_cdf (cumulative distribution function of the copula distribution) +- copula_mc_cdf (Monte Carlo approximation of the cumulative distribution function of the copula distribution) +- copula_rvs (random variate generator / sampler of the copula distribution) +- num_marginal_params (number of parameters in the marginal distributions) +- num_copula_params (number of parameters in the copula distribution) +- num_scalar_params (number of scalar parameters in the overall joint distribution) +- num_params (number of parameters in the overall joint distribution) +- likelihood (likelihood of the overall joint distribution) +- loglikelihood (log of the likelihood of the overall joint distribution) +- aic (Akaike Information Criterion of the overall joint distribution) +- bic (Bayesian Information Criterion of the overall joint distribution) +- marginal_pairplot (pairplot of the marginal distributions) +- pdf_plot (plot of the probability density function of the overall joint distribution) +- cdf_plot (plot of the cumulative distribution function of the overall joint distribution) +- mc_cdf_plot (plot of the Monte Carlo approximation of the cumulative distribution function of the overall joint distribution) +- copula_pdf_plot (plot of the probability density function of the copula distribution) +- copula_cdf_plot (plot of the cumulative 
distribution function of the copula distribution) +- copula_mc_cdf_plot (plot of the Monte Carlo approximation of the cumulative distribution function of the copula distribution) +- fit (fitting the overall joint distribution to data) + +mc_cdf and copula_mc_cdf are numerical approximations of their respective cumulative distribution functions. +These are usually necessary as the analytical forms of these functions are often not available and numerical integration is computationally expensive. + +Also note that pdf and cdf plots are only implemented for 2-dimensional distributions. + +FittedCopula +------------ +This class is the fitted version of PreFitCopula's subclasses. +It implements the same methods as PreFitCopula, but does not require copula_params or mdists as arguments. +It also implements the following additional methods and attributes: + +- copula_params (the fitted parameters of the copula distribution) +- mdists (the fitted univariate marginal distributions) +- num_variables (the number of variables the distribution is fitted too) +- fitted_num_data_points (the number of observations used to fit the distribution) +- converged (whether the fitting algorithm converged) +- summary (a summary of the overall fitted distribution) +- save (save the overall fitted distribution object) + +MarginalFitter Example +----------------------- +Generating data and fitting marginal distributions:: + + import numpy as np + import pandas as pd + + # specifying the parameters of the multivariate normal distribution we are + # sampling from + num_generate: int = 1000 + my_mu: np.ndarray = np.array([33, 44], dtype=float) + my_corr: np.ndarray = np.array([[1, 0.7], [0.7, 1]], dtype=float) + my_sig: np.ndarray = np.array([1.3, 2.5]) + my_cov: np.ndarray = np.diag(my_sig) @ my_corr @ np.diag(my_sig) + my_mvn_params: tuple = (my_mu, my_cov) + + # generating multivariate random normal variables + from sklarpy.multivariate import mvt_normal + + rvs: np.ndarray = 
mvt_normal.rvs(num_generate, my_mvn_params) + rvs_df: pd.DataFrame = pd.DataFrame(rvs, columns=['Wife Age', 'Husband Age' + ], dtype=float) + + # applying MarginalFitter to our random variables + from sklarpy.copulas import MarginalFitter + + mfitter: MarginalFitter = MarginalFitter(rvs_df) + mfitter.fit({'pvalue': 0.01}) + + # printing out a summary of our fits + from sklarpy import print_full + print_full() + + print(mfitter.summary) + + +.. code-block:: text + + Wife Age Husband Age + Parametric/Non-Parametric Parametric Parametric + Discrete/Continuous continuous continuous + Distribution lognorm lognorm + #Params 3 3 + param0 0.000005 0.000001 + param1 -262115.561308 -2097116.799667 + param2 262148.497841 2097160.700641 + Support (-262115.56130758836, inf) (-2097116.7996667635, inf) + Fitted Domain (28.438692411392555, 36.673753788627785) (35.20033323448715, 51.735336956575935) + Cramér-von Mises statistic 0.124954 0.102395 + Cramér-von Mises p-value 0.475847 0.573349 + Cramér-von Mises @ 10% True True + Cramér-von Mises @ 5% True True + Cramér-von Mises @ 1% True True + Kolmogorov-Smirnov statistic 0.032827 0.024709 + Kolmogorov-Smirnov p-value 0.226385 0.56612 + Kolmogorov-Smirnov @ 10% True True + Kolmogorov-Smirnov @ 5% True True + Kolmogorov-Smirnov @ 1% True True + Likelihood 0.0 0.0 + Log-Likelihood -1666.824453 -2382.153726 + AIC 3339.648906 4770.307452 + BIC 3354.372172 4785.030718 + Sum of Squared Error 16.819752 6.322994 + #Fitted Data Points 1000 1000 + +Printing Marginals:: + + print(mfitter.marginals) + +.. code-block:: text + + {0: lognorm(0.0, -262115.56, 262148.5), 1: lognorm(0.0, -2097116.8, 2097160.7)} + +Calculating marginal cdf values:: + + mcdf_values: pd.DataFrame = mfitter.marginal_cdfs() + print(mcdf_values) + +.. code-block:: text + + Wife Age Husband Age + 0 0.446886 0.676438 + 1 0.162115 0.107338 + 2 0.631869 0.461236 + 3 0.182751 0.589056 + 4 0.827908 0.870150 + .. ... ... 
+ 995 0.732827 0.523818 + 996 0.457342 0.372388 + 997 0.319827 0.598163 + 998 0.476477 0.350149 + 999 0.353060 0.323429 + +Producing a pairplot of the marginals:: + + data: np.ndarray = np.full((num_generate, 10), np.NaN) + data[:, :2] = np.random.poisson(4, (num_generate, 2)) + data[:, 2] = np.random.randint(-5, 5, (num_generate,)) + data[:, 3] = data[:, :2].sum(axis=1) + data[:, 4] = data[:, 0] + data[:, 3] + data[:, 5] = np.random.normal(4, 2, (num_generate,)) + data[:, 6] = np.random.gamma(2, 1, (num_generate,)) + data[:, 7:9] = np.random.standard_t(3, (num_generate, 2)) + data[:, 9] = np.random.uniform(0, 1, (num_generate,)) + + mfitter2: MarginalFitter = MarginalFitter(data).fit() + + mfitter2.pairplot() + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/mfitter_pairplot.png?raw=true + :alt: MarginalFitter Pair-Plot + :scale: 60% + :align: center + +Copula Example +-------------- +Here we use the generalized hyperbolic copula, though all methods and attributes are generalized:: + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + + # specifying the parameters of the multivariate hyperbolic distribution we are + # generating from + my_loc = np.array([1, -3], dtype=float) + my_shape = np.array([[1, 0.7], [0.7, 1]], dtype=float) + my_chi = 1.7 + my_psi = 4.5 + my_gamma = np.array([2.3, -4.3], dtype=float) + my_params = (my_chi, my_psi, my_loc, my_shape, my_gamma) + + # generating multivariate hyperbolic random variables + from sklarpy.multivariate import mvt_hyperbolic + + num_generate: int = 1000 + rvs: np.ndarray = mvt_hyperbolic.rvs(num_generate, my_params) + rvs_df: pd.DataFrame = pd.DataFrame(rvs, columns=['Process A', 'Process B'], + dtype=float) + + # fitting a generalized hyperbolic copula to our generated data using + # Maximum Likelihood Estimation + from sklarpy.copulas import gh_copula + + fitted_copula = gh_copula.fit( + data=rvs_df, method='mle', + univariate_fitter_options={'significant': 
False}, show_progress=True) + + # prining our fitted parameters + from sklarpy import print_full + print_full() + + print(fitted_copula.copula_params.to_dict) + +.. code-block:: text + + {'lamb': -10.0, 'chi': 8.460830761870396, 'psi': 10.0, + 'loc': array([[0.], [0.]]), + 'shape': array([[ 1. , -0.5214283], + [-0.5214283, 1. ]]), + 'gamma': array([[0.99848424], [0.94696141]])} + +Printing marginal distributions:: + + print(fitted_copula.mdists) + +.. code-block:: text + + {0: lognorm(0.38, -0.78, 4.02), 1: lognorm(0.0, -1276.15, 1268.45)} + +Printing covariance parameters:: + + print(fitted_copula.copula_params.cov) + +.. code-block:: text + + [[ 0.39404386 -0.18821382] + [-0.18821382 0.3928638 ]] + +Printing a summary of our joint fit:: + + print(fitted_copula.summary) + +.. code-block:: text + + Joint Distribution gh summary summary + Distribution Joint Distribution mvt_gh lognorm cauchy + #Variables 2 2 NaN NaN + #Params 11 6 3 2 + #Scalar Params 11 6 NaN NaN + Converged True True NaN NaN + Likelihood 0.0 0.0 0.0 0.0 + Log-Likelihood -4298.311941 -1032.490682 -1880.434874 -2561.765741 + AIC 8618.623881 2076.981365 3766.869748 5127.531482 + BIC 8672.609189 2106.427896 3781.593014 5137.346993 + #Fitted Data Points 1000 1000 1000 1000 + Parametric/Non-Parametric NaN NaN Parametric Parametric + Discrete/Continuous NaN NaN continuous continuous + param0 NaN NaN 0.328725 -6.937913 + param1 NaN NaN -1.596967 1.485756 + param2 NaN NaN 4.826054 NaN + Support NaN NaN (-1.5969673012994325, inf) (-inf, inf) + Fitted Domain NaN NaN (0.030085402918948567, 10.416203209871883) (-28.483718062724616, -2.8836636097027206) + Cramér-von Mises statistic NaN NaN 0.055878 3.834238 + Cramér-von Mises p-value NaN NaN 0.840024 0.0 + Cramér-von Mises @ 10% NaN NaN True False + Cramér-von Mises @ 5% NaN NaN True False + Cramér-von Mises @ 1% NaN NaN True False + Kolmogorov-Smirnov statistic NaN NaN 0.018599 0.128949 + Kolmogorov-Smirnov p-value NaN NaN 0.872994 0.0 + Kolmogorov-Smirnov @ 
10% NaN NaN True False + Kolmogorov-Smirnov @ 5% NaN NaN True False + Kolmogorov-Smirnov @ 1% NaN NaN True False + Sum of Squared Error NaN NaN 11.475127 8.464622 + +Plotting our fit:: + + fitted_copula.copula_pdf_plot(show=False) + fitted_copula.pdf_plot(show=False) + fitted_copula.mc_cdf_plot(show=False) + plt.show() + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/PDF_Gh_PDF_Plot_Plot2.png?raw=true + :alt: Generalized Hyperbolic PDF + :scale: 60% + :align: center + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/Copula_PDF_Gh_Copula_PDF_Plot_Plot2.png?raw=true + :alt: Generalized Hyperbolic Copula PDF + :scale: 60% + :align: center + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/MC_CDF_Gh_MC_CDF_Plot_Plot2.png?raw=true + :alt: Generalized Hyperbolic CDF + :scale: 60% + :align: center + +Saving our fitted copula:: + + fitted_copula.save() + +We can then easily reload this object later:: + + from sklarpy import load + + loaded_copula = load('gh.pickle') + print(loaded_copula.summary) \ No newline at end of file diff --git a/docs/source/Installation.rst b/docs/source/Installation.rst new file mode 100644 index 0000000..2d551e9 --- /dev/null +++ b/docs/source/Installation.rst @@ -0,0 +1,19 @@ +.. _installation: + +Installation +------------ + +To use SklarPy, first install it using pip: + +.. code-block:: text + + pip install sklarpy + +For Developers +-------------- + +If you wish to add your own significant modifications to SklarPy, you can can clone the current repository using: + +.. code-block:: text + + git clone https://github.com/tfm000/sklarpy diff --git a/docs/source/Misc.rst b/docs/source/Misc.rst new file mode 100644 index 0000000..07e8b0d --- /dev/null +++ b/docs/source/Misc.rst @@ -0,0 +1,93 @@ +.. 
_misc: + +#################### +Miscellaneous Tools +#################### + +This SklarPy package contains functions / objects which are both implemented across SklarPy and also intended for user use. + +CorrelationMatrix +------------------ +CorrelationMatrix is a SklarPy class which allows the user to estimate correlation and covariance matrices using a number of different estimators. + +This code is inspired by the methods described by Xu, Brin (2016) and implements the following estimators: + +- pearson +- spearman +- kendall +- pp-kendall +- rm-pearson +- rm-spearman +- rm-kendall +- rm-pp-kendall +- laloux-pearson +- laloux-spearman +- laloux-kendall +- laloux-pp-kendall + +rm stands for the technique described by Rousseeuw and Molenberghs (1993) and laloux for that by Laloux et al. (2000). + +The corr method allows you to calculate correlation matrices, whilst cov allows you to calculate covariance matrices. + +debye +----- +This function allows the user to easily evaluate any member of the Debye function family. + +gradient_1d +------------ +This function allows the user to calculate the numerical first derivative / gradient of a given 1-d function. + +kv +--- +This class allows the user to easily evaluate the Modified Bessel function of the 2nd kind, in addition to its log-values. +Limiting cases of the family parameter, v, and value, z, are also implemented. 
+ +CorrelationMatrix Example +-------------------------- + +Here we calculate both the covariance and correlation matrix estimators:: + + import numpy as np + import pandas as pd + + # specifying the parameters of the multivariate hyperbolic distribution we are + # generating from + my_loc = np.array([1, -3], dtype=float) + my_shape = np.array([[1, 0.7], [0.7, 1]], dtype=float) + my_chi = 1.7 + my_psi = 4.5 + my_gamma = np.array([2.3, -4.3], dtype=float) + my_params = (my_chi, my_psi, my_loc, my_shape, my_gamma) + + # generating multivariate hyperbolic random variables + from sklarpy.multivariate import mvt_hyperbolic + + num_generate: int = 1000 + rvs: np.ndarray = mvt_hyperbolic.rvs(num_generate, my_params) + rvs_df: pd.DataFrame = pd.DataFrame(rvs, columns=['Process A', 'Process B'], + dtype=float) + + # calculating covariance matrix and correlation matrix estimators + from sklarpy.misc import CorrelationMatrix + + cmatrix: CorrelationMatrix = CorrelationMatrix(rvs_df) + +Calculating PP-Kendall Correlation Matrix with Laloux's adjustments:: + + corr_estimator: np.ndarray = cmatrix.corr(method='laloux_pp_kendall') + print(corr_estimator) + +.. code-block:: text + + [[ 1. -0.53750912] + [-0.53750912 1. ]] + +Calculating Spearman's Covariance Matrix:: + + cov_estimator: np.ndarray = cmatrix.cov(method='spearman') + print(cov_estimator) + +.. code-block:: text + + [[ 3.02797258 -2.68535942] + [-2.68535942 8.68778502]] \ No newline at end of file diff --git a/docs/source/Multivariate.rst b/docs/source/Multivariate.rst new file mode 100644 index 0000000..e091343 --- /dev/null +++ b/docs/source/Multivariate.rst @@ -0,0 +1,158 @@ +.. _multivariate: + +############################ +Multivariate Distributions +############################ + +This SklarPy package contains many different multivariate distributions. +Unlike univariate distributions, these are not wrappers of scipy objects (with the exceptions of mvt_normal and mvt_student_t). 
+ +All implemented multivariate distributions are able to be fitted to both multivariate numpy and pandas data and contain easy saving and plotting methods. + +Which multivariate distributions are implemented? +------------------------------------------------ +Currently, the following multivariate distributions are implemented: + +.. csv-table:: Multivariate Distributions + :file: mvt_table.csv + :header-rows: 1 + +All Normal-Mixture models use the parameterization specified by McNeil, Frey and Embrechts (2005). + +PreFitContinuousMultivariate +---------------------------- +This is the base class for all multivariate distributions. It implements the following methods and attributes: + +- logpdf (log of the probability density function) +- pdf (probability density function) +- cdf (cumulative distribution function) +- mc_cdf (Monte Carlo approximation of the cumulative distribution function) +- rvs (random variate generator / sampler) +- likelihood (likelihood function) +- loglikelihood (log of the likelihood function) +- aic (Akaike Information Criterion) +- bic (Bayesian Information Criterion) +- marginal_pairplot (pairplot of the marginal distributions) +- pdf_plot (plot of the probability density function) +- cdf_plot (plot of the cumulative distribution function) +- mc_cdf_plot (plot of the Monte Carlo approximation of the cumulative distribution function) +- num_params (The number of parameters in the distribution) +- num_scalar_params (The number of scalar values across all parameters in the distribution) +- fit (fitting the distribution to data) + +mc_cdf is a numerical approximation of the cumulative distribution function. +This is usually necessary for distributions that do not have a closed form cumulative distribution function, +as the numerical integration alternative is computationally expensive. + +num_params is the number of parameter objects in the distribution, i.e. a vector / matrix is counted as 1. 
+num_scalar_params counts the number of unique scalar values across all parameter objects. + +Also note that pdf and cdf plots are only implemented for 2-dimensional distributions. + +FittedContinuousMultivariate +---------------------------- +This class is the fitted version of PreFitContinuousMultivariate's subclasses. +It implements the same methods as PreFitContinuousMultivariate, but does not require params as an argument. +It also implements the following additional methods and attributes: + +- params (the fitted parameters) +- num_variables (the number of variables the distribution is fitted too) +- fitted_num_data_points (the number of observations used to fit the distribution) +- converged (whether the fitting algorithm converged) +- summary (a summary of the fitted distribution) +- save (save the fitted distribution object) + +Multivariate Example +--------------------- +Here we use the multivariate normal and multivariate symmetric hyperbolic +distributions, though all methods and attributes are generalized:: + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + + # specifying the parameters of the multivariate normal distribution we are + # sampling from + my_mu: np.ndarray = np.array([33, 44], dtype=float) + my_corr: np.ndarray = np.array([[1, 0.7], [0.7, 1]], dtype=float) + my_sig: np.ndarray = np.array([1.3, 2.5]) + my_cov: np.ndarray = np.diag(my_sig) @ my_corr @ np.diag(my_sig) + my_mvn_params: tuple = (my_mu, my_cov) + + # generating multivariate random normal variables + from sklarpy.multivariate import mvt_normal + + rvs: np.ndarray = mvt_normal.rvs(1000, my_mvn_params) + rvs_df: pd.DataFrame = pd.DataFrame(rvs, columns=['Wife Age', 'Husband Age'], + dtype=float) + + # fitting a symmetric hyperbolic dist to our generated data using + # Maximum Likelihood Estimation + from sklarpy.multivariate import mvt_shyperbolic + + fitted_msh = mvt_shyperbolic.fit(rvs_df, method='mle', show_progress=True) + + # printing our fitted 
parameters + print(fitted_msh.params.to_dict) + print(fitted_msh.params.cov) + + +.. code-block:: text + + {'chi': 6.817911964473556, 'psi': 10.0, 'loc': array([[32.99012429], + [43.91822886]]), 'shape': array([[1.72408489, 2.27711492], + [2.27711492, 6.27443288]])} + + [[1.78702958 2.36025021] + [2.36025021 6.50350643]] + +Printing a summary of our fit:: + + print(fitted_msh.summary()) + +.. code-block:: text + + summary + Distribution mvt_shyperbolic + #Variables 2 + #Params 4 + #Scalar Params 7 + Converged True + Likelihood 0.0 + Log-Likelihood -3664.49604 + AIC 7342.99208 + BIC 7377.346367 + #Fitted Data Points 1000 + +Plotting our fitted distribution:: + + fitted_msh.pdf_plot(show=False) + fitted_msh.mc_cdf_plot(show=False) + fitted_msh.marginal_pairplot(show=False) + plt.show() + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/PDF_Mvt_Shyperbolic_PDF_Plot_Plot.png?raw=true + :alt: Symmetric Hyperbolic PDF + :scale: 60% + :align: center + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/MC_CDF_Mvt_Shyperbolic_MC_CDF_Plot_Plot.png?raw=true + :alt: Symmetric Hyperbolic PDF + :scale: 60% + :align: center + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/mvt_shyperbolic_marginal_pair_plot.png?raw=true + :alt: Symmetric Hyperbolic PDF + :scale: 60% + :align: center + +Saving our fitted parameters:: + + fitted_msh.params.save() + +Reloading and fitting to another distribution of the same type:: + + from sklarpy import load + + loaded_msh_params = load('mvt_shyperbolic.pickle') + param_fitted_msh = mvt_shyperbolic.fit(params=loaded_msh_params) diff --git a/docs/source/Univariate.rst b/docs/source/Univariate.rst new file mode 100644 index 0000000..4d79d44 --- /dev/null +++ b/docs/source/Univariate.rst @@ -0,0 +1,363 @@ +.. 
_univariate: + +######################### +Univariate Distributions +######################### + +This SklarPy package contains many different univariate distributions in addition to objects allowing for easy fitting. +With the exception of a handful of distributions, all univariate distribution objects are wrappers of scipy.stats univariate distributions, with added functionalities for plotting, fitting and saving. +This means that the distributions available in SklarPy are the same as those available in your installed version of scipy. + +There is also the UnivariateFitter object, which allows for easy fitting of univariate distributions to data and for determining the best / statistically significant distribution(s). + +Why is my interpreter unable to find univariate distributions? +-------------------------------------------------------------- + +If you try:: + + from sklarpy.univariate import normal + +You will likely find that your interpreter flags an error along the lines of "cannot find reference 'normal' in __init__.py". +Do not worry, this is to be expected as a side effect of the dynamic way SklarPy univariate distributions are created from scipy.stats distributions. +At runtime, *your code will work without any errors*! + +But how do I know which distributions are available? +---------------------------------------------------- +Good question! You can use the following code to print out a list of all available univariate distributions:: + + from sklarpy.univariate import distributions_map + print(distributions_map) + +For scipy version 1.11.4 you should get an output along the lines of: + +.. 
code-block:: text + + {'all': ('ksone', 'kstwo', 'kstwobign', 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'burr12', 'fisk', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'expon', 'exponnorm', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f', 'foldnorm', 'weibull_min', 'truncweibull_min', 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gamma', 'erlang', 'gengamma', 'genhalflogistic', 'genhyperbolic', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'gausshyper', 'invgamma', 'invgauss', 'geninvgauss', 'norminvgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'laplace', 'laplace_asymmetric', 'levy', 'levy_l', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gibrat', 'maxwell', 'mielke', 'kappa4', 'kappa3', 'moyal', 'nakagami', 'ncx2', 'ncf', 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'rayleigh', 'loguniform', 'reciprocal', 'rice', 'recipinvgauss', 'semicircular', 'skewcauchy', 'skewnorm', 'trapezoid', 'trapz', 'triang', 'truncexpon', 'truncnorm', 'truncpareto', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', 'wald', 'wrapcauchy', 'gennorm', 'halfgennorm', 'crystalball', 'argus', 'studentized_range', 'rel_breitwigner', 'gh', 'gig', 'ig', 'normal', 'student_t', 'gaussian_kde', 'empirical', 'poisson', 'planck', 'discrete_laplace', 'discrete_uniform', 'geometric', 'discrete_empirical'), 'all continuous': ('ksone', 'kstwo', 'kstwobign', 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'burr12', 'fisk', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'expon', 'exponnorm', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f', 'foldnorm', 'weibull_min', 'truncweibull_min', 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gamma', 'erlang', 'gengamma', 'genhalflogistic', 'genhyperbolic', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 
'halfnorm', 'hypsecant', 'gausshyper', 'invgamma', 'invgauss', 'geninvgauss', 'norminvgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'laplace', 'laplace_asymmetric', 'levy', 'levy_l', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gibrat', 'maxwell', 'mielke', 'kappa4', 'kappa3', 'moyal', 'nakagami', 'ncx2', 'ncf', 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'rayleigh', 'loguniform', 'reciprocal', 'rice', 'recipinvgauss', 'semicircular', 'skewcauchy', 'skewnorm', 'trapezoid', 'trapz', 'triang', 'truncexpon', 'truncnorm', 'truncpareto', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', 'wald', 'wrapcauchy', 'gennorm', 'halfgennorm', 'crystalball', 'argus', 'studentized_range', 'rel_breitwigner', 'gh', 'gig', 'ig', 'normal', 'student_t', 'gaussian_kde', 'empirical'), 'all discrete': ('poisson', 'planck', 'discrete_laplace', 'discrete_uniform', 'geometric', 'discrete_empirical'), 'all common': ('cauchy', 'chi2', 'expon', 'gamma', 'lognorm', 'powerlaw', 'rayleigh', 'uniform', 'discrete_laplace', 'discrete_uniform', 'geometric', 'poisson'), 'all multimodal': ('arcsine', 'beta'), 'all parametric': ('ksone', 'kstwo', 'kstwobign', 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'burr12', 'fisk', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'expon', 'exponnorm', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f', 'foldnorm', 'weibull_min', 'truncweibull_min', 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gamma', 'erlang', 'gengamma', 'genhalflogistic', 'genhyperbolic', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'gausshyper', 'invgamma', 'invgauss', 'geninvgauss', 'norminvgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'laplace', 'laplace_asymmetric', 'levy', 'levy_l', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gibrat', 'maxwell', 'mielke', 'kappa4', 'kappa3', 'moyal', 'nakagami', 'ncx2', 'ncf', 'nct', 
'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'rayleigh', 'loguniform', 'reciprocal', 'rice', 'recipinvgauss', 'semicircular', 'skewcauchy', 'skewnorm', 'trapezoid', 'trapz', 'triang', 'truncexpon', 'truncnorm', 'truncpareto', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', 'wald', 'wrapcauchy', 'gennorm', 'halfgennorm', 'crystalball', 'argus', 'studentized_range', 'rel_breitwigner', 'gh', 'gig', 'ig', 'normal', 'student_t', 'poisson', 'planck', 'discrete_laplace', 'discrete_uniform', 'geometric'), 'all numerical': ('gaussian_kde', 'empirical', 'discrete_empirical'), 'all continuous parametric': ('ksone', 'kstwo', 'kstwobign', 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'burr12', 'fisk', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'expon', 'exponnorm', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f', 'foldnorm', 'weibull_min', 'truncweibull_min', 'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gamma', 'erlang', 'gengamma', 'genhalflogistic', 'genhyperbolic', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'gausshyper', 'invgamma', 'invgauss', 'geninvgauss', 'norminvgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'laplace', 'laplace_asymmetric', 'levy', 'levy_l', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gibrat', 'maxwell', 'mielke', 'kappa4', 'kappa3', 'moyal', 'nakagami', 'ncx2', 'ncf', 'nct', 'pareto', 'lomax', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'rayleigh', 'loguniform', 'reciprocal', 'rice', 'recipinvgauss', 'semicircular', 'skewcauchy', 'skewnorm', 'trapezoid', 'trapz', 'triang', 'truncexpon', 'truncnorm', 'truncpareto', 'tukeylambda', 'uniform', 'vonmises', 'vonmises_line', 'wald', 'wrapcauchy', 'gennorm', 'halfgennorm', 'crystalball', 'argus', 'studentized_range', 'rel_breitwigner', 'gh', 'gig', 'ig', 'normal', 'student_t'), 'all discrete parametric': ('poisson', 
'planck', 'discrete_laplace', 'discrete_uniform', 'geometric'), 'all continuous numerical': ('gaussian_kde', 'empirical'), 'all discrete numerical': ('discrete_empirical',), 'common continuous': ('cauchy', 'chi2', 'expon', 'gamma', 'lognorm', 'powerlaw', 'rayleigh', 'uniform'), 'common discrete': ('discrete_laplace', 'discrete_uniform', 'geometric', 'poisson'), 'continuous multimodal': ('arcsine', 'beta'), 'discrete multimodal': ()} + +So you have a lot to choose from! + +Name differences between SklarPy and SciPy +------------------------------------------- +Whilst we have generally kept most of the distribution names consistent with SciPy, there are a few notable exceptions. +These are: + +.. csv-table:: Distribution Name Discrepancies + :file: univariate_table.csv + :header-rows: 1 + +PreFitUnivariateBase +--------------------- +This class and its subclasses contain the following methods / functions: + +- pdf (probability density function) +- cdf (cumulative distribution function) +- ppf (percent point function / cumulative inverse function) +- support +- ppf_approx (approximate ppf) +- cdf_approx (approximate cdf) +- rvs (random variate generator / sampler) +- logpdf (log of the probability density function) +- likelihood (likelihood function) +- loglikelihood (log of the likelihood function) +- aic (Akaike information criterion) +- bic (Bayesian information criterion) +- sse (Sum of squared errors) +- gof (goodness of fit) +- plot (plotting) +- fit (fitting the distribution to data) + +Many / all of these methods take params as an argument. This is a tuple containing the parameters of the associated scipy.stats distribution object. + +ppf_approx and cdf_approx are approximations of the ppf and cdf functions respectively, which may be useful for distributions where the cdf and therefore ppf functions require numerical integration to evaluate. + +FittedUnivariateBase +--------------------- +This class is the fitted version of PreFitUnivariateBase's subclasses. 
+It implements the same methods as PreFitUnivariateBase, but does not require params as an argument.
+It also implements the following additional methods and attributes:
+
+- summary (summary of the distribution fit)
+- params (the fitted parameters)
+- fitted_domain (the domain over which the distribution is fitted)
+- fitted_num_data_points (the number of data points used to fit the distribution)
+- save (save the fitted distribution to a pickle file)
+
+.. automodule:: sklarpy.univariate.univariate_fitter
+    :members:
+    :exclude-members: UnivariateFitter
+
+    .. autoclass:: UnivariateFitter
+        :members:
+
+        .. automethod:: __init__
+
+        .. automethod:: fit
+
+        .. caution::
+
+            If 'use_processpoolexecutor' is set to True, the UnivariateFitter object will use the ProcessPoolExecutor to parallelize the fitting process. However, if the code is run outside 'if __name__ == "__main__":', you may receive a runtime error.
+
+        .. automethod:: get_summary
+
+        .. automethod:: get_best
+
+        .. automethod:: plot
+
+        .. automethod:: fitted_distributions
+
+
+.. automodule:: sklarpy.univariate._prefit_dists
+    :members:
+    :exclude-members: PreFitUnivariateBase, PreFitNumericalUnivariateBase
+
+
+.. 
automodule:: sklarpy.univariate._fitted_dists + :members: + :exclude-members: FittedUnivariateBase + +Continuous Example +--------------------- +Here we use the normal and gamma distributions, though all methods and attributes are generalized:: + + import numpy as np + import pandas as pd + + # generating random variables + from sklarpy.univariate import normal + + num_generate: int = 1000 + + # generating a 1d array of N(1, 1) random variables + normal_rvs1: np.ndarray = normal.rvs((num_generate,), (1, 1)) + # generating a 1d array of N(2, 3) random variables + normal_rvs2: np.ndarray = normal.rvs((num_generate,), (0, 3)) + rvs = normal_rvs1 * normal_rvs2 + + # fitting a gamma distribution to our product of normal random variables + from sklarpy.univariate import gamma + + fitted_gamma = gamma.fit(rvs) + + # we can easily retrieve the fitted parameters + fitted_gamma_params: tuple = fitted_gamma.params + print(fitted_gamma_params) + +.. code-block:: text + + (9754.44976841112, -411.8704014945831, 0.042211986922603084) + +We can also print a summary of our fit:: + + summary: pd.DataFrame = fitted_gamma.summary + print(summary) + +.. code-block:: text + + summary + Parametric/Non-Parametric Parametric + Discrete/Continuous continuous + Distribution gamma + #Params 3 + param0 9754.449768 + param1 -411.870401 + param2 0.042212 + Support (-411.8704014945831, inf) + Fitted Domain (-20.13664960054484, 17.86802768972715) + Cramér-von Mises statistic 3.411862 + Cramér-von Mises p-value 0.0 + Cramér-von Mises @ 10% False + Cramér-von Mises @ 5% False + Cramér-von Mises @ 1% False + Kolmogorov-Smirnov statistic 0.094371 + Kolmogorov-Smirnov p-value 0.0 + Kolmogorov-Smirnov @ 10% False + Kolmogorov-Smirnov @ 5% False + Kolmogorov-Smirnov @ 1% False + Likelihood 0.0 + Log-Likelihood -2846.513514 + AIC 5699.027028 + BIC 5713.750294 + Sum of Squared Error 12.319097 + #Fitted Data Points 1000 + +And plot our fitted distribution:: + + fitted_gamma.plot() + +.. 
image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/univariate_continuous_example_figure1.png?raw=true + :alt: gamma plot + :align: center + +And save:: + + fitted_gamma.save() + + +We can then easily reload our saved model:: + + from sklarpy import load + + loaded_fitted_gamma = load('gamma.pickle') + + + +Discrete Example +--------------------- +Here we use the poisson distribution, though all methods and attributes are generalized. +We see this works in exactly the same way as continuous distributions.:: + + import numpy as np + import pandas as pd + + # generating random variables + from sklarpy.univariate import poisson + + num_generate: int = 10000 + poisson_rvs: np.ndarray = poisson.rvs((num_generate, ), (4,)) + rvs_df: pd.DataFrame = pd.DataFrame(poisson_rvs, columns=['rvs'], dtype=int) + + # fitting a poisson distribution to a dataframe of rvs + fitted_poisson = poisson.fit(rvs_df) + + # we can easily retrieve the fitted parameters + fitted_poisson_params: tuple = fitted_poisson.params + print(fitted_poisson_params) + +.. code-block:: text + + (3.992,) + +We can also print a summary of our fit:: + + summary: pd.DataFrame = fitted_poisson.summary + print(summary) + +.. code-block:: text + + summary + Parametric/Non-Parametric Parametric + Discrete/Continuous discrete + Distribution poisson + #Params 1 + param0 3.985 + Support (0, inf) + Fitted Domain (0, 12) + chi-square statistic 7.059903 + chi-square p-value 1.0 + chi-square @ 10% True + chi-square @ 5% True + chi-square @ 1% True + Likelihood 0.0 + Log-Likelihood -2100.955867 + AIC 4203.911734 + BIC 4208.819489 + Sum of Squared Error 0.044802 + #Fitted Data Points 1000 + +And plot our fitted distribution:: + + fitted_poisson.plot() + +.. 
image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/univariate_discrete_example_figure1.png?raw=true
+    :alt: poisson plot
+    :align: center
+
+And save::
+
+    fitted_poisson.save()
+
+
+We can then easily reload our saved model::
+
+    from sklarpy import load
+
+    loaded_fitted_poisson = load('poisson.pickle')
+
+UnivariateFitter Example
+-------------------------
+Here we use the UnivariateFitter object to fit a distribution to a dataset::
+
+    import numpy as np
+
+    # generating random variables
+    from sklarpy.univariate import normal
+
+    num_generate: int = 10000
+    # generating a 1d array of N(1, 1) random variables
+    normal_rvs1: np.ndarray = normal.rvs((num_generate,), (1, 1))
+    # generating a 1d array of N(0, 3) random variables
+    normal_rvs2: np.ndarray = normal.rvs((num_generate,), (0, 3))
+    rvs = normal_rvs1 * normal_rvs2
+
+    # applying UnivariateFitter to our product of normal random variables
+    from sklarpy.univariate import UnivariateFitter
+
+    ufitter: UnivariateFitter = UnivariateFitter(rvs)
+    ufitter.fit()
+
+    # printing out the summary of our fits
+    from sklarpy import print_full
+    print_full()
+
+    print(ufitter.get_summary())
+
+.. 
code-block:: text + + Parametric/Non-Parametric Discrete/Continuous Distribution #Params param0 param1 param2 Support Fitted Domain Cramér-von Mises statistic Cramér-von Mises p-value Cramér-von Mises @ 10% Cramér-von Mises @ 5% Cramér-von Mises @ 1% Kolmogorov-Smirnov statistic Kolmogorov-Smirnov p-value Kolmogorov-Smirnov @ 10% Kolmogorov-Smirnov @ 5% Kolmogorov-Smirnov @ 1% Likelihood Log-Likelihood AIC BIC Sum of Squared Error #Fitted Data Points + chi2 Parametric continuous chi2 3 448.683161 -68.423622 0.15222 (-68.42362151895298, inf) (-24.241200503425766, 21.971575538054054) 3.955007 0.0 False False False 0.099469 0.0 False False False 0.0 -2916.834582 5839.669164 5854.39243 12.84073 1000 + powerlaw Parametric continuous powerlaw 3 1.485383 -24.284621 46.256197 (-24.28462141839885, 21.97157553805406) (-24.241200503425766, 21.971575538054054) 53.515366 0.0 False False False 0.393459 0.0 False False False 0.0 -3765.295723 7536.591446 7551.314712 23.1246 1000 + cauchy Parametric continuous cauchy 2 -0.141171 1.744522 NaN (-inf, inf) (-24.241200503425766, 21.971575538054054) 0.223919 0.225566 True True True 0.03747 0.117619 True True True 0.0 -2848.628202 5701.256403 5711.071914 7.057125 1000 + expon Parametric continuous expon 2 -24.241201 24.121323 NaN (-24.241200503425766, inf) (-24.241200503425766, 21.971575538054054) 68.507136 0.0 False False False 0.465333 0.0 False False False 0.0 -4183.09624 8370.19248 8380.007991 24.962541 1000 + lognorm Parametric continuous lognorm 3 0.024195 -185.928209 185.754474 (-185.92820884247777, inf) (-24.241200503425766, 21.971575538054054) 3.726801 0.0 False False False 0.093801 0.0 False False False 0.0 -2910.878606 5827.757211 5842.480477 12.702458 1000 + rayleigh Parametric continuous rayleigh 2 -24.268255 17.360527 NaN (-24.268254515672, inf) (-24.241200503425766, 21.971575538054054) 45.036613 0.0 False False False 0.364332 0.0 False False False 0.0 -3548.608918 7101.217836 7111.033346 21.635708 1000 + gamma Parametric 
continuous gamma 3 614.186953 -110.593183 0.179857 (-110.5931825074225, inf) (-24.241200503425766, 21.971575538054054) 3.612011 0.0 False False False 0.094024 0.0 False False False 0.0 -2911.657958 5829.315916 5844.039182 12.618159 1000 + uniform Parametric continuous uniform 2 -24.241201 46.212776 NaN (-24.241200503425766, 21.971575538054054) (-24.241200503425766, 21.971575538054054) 43.325309 0.0 False False False 0.328626 0.0 False False False 0.0 -3833.256298 7670.512595 7680.328106 23.507262 1000 + +finding our best fit:: + + best_fit = ufitter.get_best(significant=False) + print(best_fit.summary) + best_fit.plot() + +.. code-block:: text + + summary + Parametric/Non-Parametric Parametric + Discrete/Continuous continuous + Distribution cauchy + #Params 2 + param0 -0.070741 + param1 1.642212 + Support (-inf, inf) + Fitted Domain (-16.627835918238397, 20.41344998969709) + Cramér-von Mises statistic 0.272381 + Cramér-von Mises p-value 0.162046 + Cramér-von Mises @ 10% True + Cramér-von Mises @ 5% True + Cramér-von Mises @ 1% True + Kolmogorov-Smirnov statistic 0.034967 + Kolmogorov-Smirnov p-value 0.169277 + Kolmogorov-Smirnov @ 10% True + Kolmogorov-Smirnov @ 5% True + Kolmogorov-Smirnov @ 1% True + Likelihood 0.0 + Log-Likelihood -2791.769256 + AIC 5587.538511 + BIC 5597.354022 + Sum of Squared Error 9.18869 + #Fitted Data Points 1000 + +.. image:: https://github.com/tfm000/sklarpy/blob/docs/readthedocs/media/univariate_fitter_example_figure1.png?raw=true + :alt: poisson plot + :align: center + +We can also save our UnivariateFitter object:: + + ufitter.save() + +We can then easily reload this:: + + from sklarpy import load + + loaded_ufitter = load('UnivariateFitter.pickle') + loaded_best_fit = loaded_ufitter.get_best(significant=False) diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..6dbbbe7 --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,41 @@ +# Configuration file for the Sphinx documentation builder. 
+import os +import sys + +sys.path.insert(0, os.path.abspath(os.path.join('..', '..'))) + +# -- Project information + +project = 'SklarPy' +copyright = '2023, Tyler Mitchell' +author = 'Tyler Mitchell' + +release = '1.0.0' +version = '1.0.0' + +# -- General configuration + +extensions = [ + 'sphinx.ext.duration', + 'sphinx.ext.doctest', + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', +] + +intersphinx_mapping = { + 'python': ('https://docs.python.org/3/', None), + 'sphinx': ('https://www.sphinx-doc.org/en/master/', None), +} +intersphinx_disabled_domains = ['std'] + +templates_path = ['_templates'] + +# -- Options for HTML output + +html_theme = 'sphinx_rtd_theme' + +# -- Options for EPUB output +epub_show_urls = 'footnote' diff --git a/docs/source/copula_table.csv b/docs/source/copula_table.csv new file mode 100644 index 0000000..4b60d1b --- /dev/null +++ b/docs/source/copula_table.csv @@ -0,0 +1,16 @@ +Family,Name,Dimensions,SklarPy Model +Normal Mixture,Normal / Gaussian,Multivariate,gaussian_copula +Normal Mixture,Student-T,Multivariate,student_t_copula +Normal Mixture,Skewed-T,Multivariate,skewed_t_copula +Normal Mixture,Generalized Hyperbolic,Multivariate,gh_copula +Normal Mixture,Symmetric Generalized Hyperbolic,Multivariate,sgh_copula +Normal Mixture,Hyperbolic,Multivariate,hyperbolic_copula +Normal Mixture,Symmetric Hyperbolic,Multivariate,shyperbolic_copula +Normal Mixture,Normal-Inverse Gaussian (NIG),Multivariate,nig_copula +Normal Mixture,Symmetric Normal-Inverse Gaussian,Multivariate,snig_copula +Normal Mixture,Marginal Hyperbolic,Multivariate,mh_copula +Normal Mixture,Symmetric Marginal Hyperbolic,Multivariate,smh_copula +Archimedean,Clayton,Multivariate,clayton_copula +Archimedean,Gumbel,Multivariate,gumbel_copula +Archimedean,Frank,Bivariate,frank_copula +Numerical,Gaussian KDE,Multivariate,gaussian_kde_copula \ No newline at end of file diff --git 
a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..55a20bf --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,120 @@ +.. image:: https://github.com/tfm000/sklarpy/blob/main/media/logo.png?raw=true + :alt: SklarPy logo + :scale: 60% + :align: center + +.. raw:: html + + + + + + +

+ + MIT license   + + build   + + build   + + downloads   + + maintained +

+ +

+ + mac os + + windows +

+ + +SklarPy (pronounced 'Sky-Lar-Pee' or 'Sky-La-Pie') is an open-source software for probability distribution fitting. +It contains useful tools for fitting Copula, Multivariate and Univariate probability distributions. +In addition to over 100 univariate distributions, we implement many multivariate normal mixture distributions and their copulas, including Gaussian, Student-T, Skewed-T and Generalized Hyperbolic distributions. +Named after Sklar's theorem and Abe Sklar, the American mathematician who proved that multivariate cumulative distribution functions can be expressed in terms of copulas and their marginals. + +This library has many different possible use cases, ranging from machine learning to finance. + +Contents +-------- + +.. toctree:: + :maxdepth: 2 + Installation + Univariate + Multivariate + Copulas + Misc + +Why we are better +----------------- +- Unlike other Python implementations of copulas, we implement more than the Gaussian and Archimedean copulas. A full list of our implementated copula models can be found in the documentation, though it includes many normal mean-variance mixture models as well as Archimedean and non-parametric models. +- We allow for easy parameter fitting of both the univariate marginals and the multivariate copula distribution. +- We allow for easy plotting of all our distributions, allowing you to visualize your models. +- We use scipy.stats as a backend for all our univariate models, meaning as scipy expands and improves their model selection, so will ours! +- We provide multivariate and univariate distributions, in addition to our copula models, meaning SklarPy can act as a one-stop-shop for all probability distribution fitting. A full list of our implemented multivariate distributions can be found in the documentation. +- We are continuing to expand our library, and are open to suggestions for new models to implement. If you have a model you would like to see implemented, please open an issue on our GitHub page. 
+
+Example
+--------
+
+Here we show a quick example of working with SklarPy.
+For more information, see the specific documentation::
+
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from sklarpy.copulas import gh_copula
+
+    # generating random data
+    n: int = 1000
+    obs: np.ndarray = np.full((n, 2), np.nan)
+    obs[:, 0] = np.random.normal(3,4, size=(n,))
+    obs[:, 1] = obs[:, 0] + 0.5 * np.random.normal(3, 5, size=(n,))
+    obvs_df: pd.DataFrame = pd.DataFrame(obs, columns=['Process A', 'Process B'])
+
+    # fitting our copula model
+    fitted_copula = gh_copula.fit(obvs_df)
+
+    # printing our fitted copula parameters
+    print(fitted_copula.copula_params.to_dict)
+
+    # printing our fitted marginal distributions
+    print(fitted_copula.mdists)
+
+    # plotting our fit
+    fitted_copula.pdf_plot(show=False)
+    fitted_copula.copula_pdf_plot(show=False)
+    plt.show()
+
+This outputs:
+
+.. code-block:: text
+
+    {'lamb': -10.0, 'chi': 4.227038325195731, 'psi': 10.0,
+    'loc': array([[0.], [0.]]),
+    'shape': array([[1. , 0.84273015],
+    [0.84273015, 1.]]),
+    'gamma': array([[0.99696041], [0.99913161]])}
+
+    {0: lognorm(0.02, -203.22, 206.18), 1: lognorm(0.04, -110.89, 115.4)}
+
+.. image:: https://github.com/tfm000/sklarpy/blob/main/media/PDF_Gh_PDF_Plot_Plot.png?raw=true
+    :alt: GH PDF
+    :scale: 60%
+    :align: center
+
+.. 
image:: https://github.com/tfm000/sklarpy/blob/main/media/Copula_PDF_Gh_Copula_PDF_Plot_Plot.png?raw=true + :alt: GH Copula PDF + :scale: 60% + :align: center diff --git a/docs/source/mvt_table.csv b/docs/source/mvt_table.csv new file mode 100644 index 0000000..3b41026 --- /dev/null +++ b/docs/source/mvt_table.csv @@ -0,0 +1,13 @@ +Family,Name,SklarPy Model +Normal Mixture,Normal / Gaussian,mvt_normal +Normal Mixture,Student-T,mvt_student_t +Normal Mixture,Skewed-T,mvt_skewed_t +Normal Mixture,Generalized Hyperbolic,mvt_gh +Normal Mixture,Symmetric Generalized Hyperbolic,mvt_sgh +Normal Mixture,Hyperbolic,mvt_hyperbolic +Normal Mixture,Symmetric Hyperbolic,mvt_shyperbolic +Normal Mixture,Normal-Inverse Gaussian (NIG),mvt_nig +Normal Mixture,Symmetric Normal-Inverse Gaussian,mvt_snig +Normal Mixture,Marginal Hyperbolic,mvt_mh +Normal Mixture,Symmetric Marginal Hyperbolic,mvt_smh +Numerical,Gaussian KDE,mvt_gaussian_kde \ No newline at end of file diff --git a/docs/source/univariate_table.csv b/docs/source/univariate_table.csv new file mode 100644 index 0000000..2e409cb --- /dev/null +++ b/docs/source/univariate_table.csv @@ -0,0 +1,6 @@ +SciPy Stats,SklarPy Univariate +norm,normal +t,student_t +dlaplace,discrete_laplace +randint,discrete_uniform +geom,geometric \ No newline at end of file diff --git a/examples/multivariate_examples/multivariate_example.py b/examples/multivariate_examples/multivariate_example.py index ed1093b..ad6b928 100644 --- a/examples/multivariate_examples/multivariate_example.py +++ b/examples/multivariate_examples/multivariate_example.py @@ -34,7 +34,7 @@ # printing a summary of our fit print(fitted_msh.summary) -# can plot +# # can plot fitted_msh.pdf_plot(show=False) fitted_msh.mc_cdf_plot(show=False) fitted_msh.marginal_pairplot(show=False) diff --git a/examples/univariate_examples/discrete_example.py b/examples/univariate_examples/discrete_example.py index e29b1a3..409cc8b 100644 --- a/examples/univariate_examples/discrete_example.py +++ 
b/examples/univariate_examples/discrete_example.py @@ -8,7 +8,7 @@ # generating random variables from sklarpy.univariate import poisson -num_generate: int = 10000 +num_generate: int = 100 poisson_rvs: np.ndarray = poisson.rvs((num_generate, ), (4,)) rvs_df: pd.DataFrame = pd.DataFrame(poisson_rvs, columns=['rvs'], dtype=int) diff --git a/examples/univariate_examples/univariate_fitter_example.py b/examples/univariate_examples/univariate_fitter_example.py index 9b88fa1..9ceb4c4 100644 --- a/examples/univariate_examples/univariate_fitter_example.py +++ b/examples/univariate_examples/univariate_fitter_example.py @@ -5,7 +5,7 @@ # generating random variables from sklarpy.univariate import normal -num_generate: int = 10000 +num_generate: int = 1000 # generating a 1d array of N(1, 1) random variables normal_rvs1: np.ndarray = normal.rvs((num_generate,), (1, 1)) # generating a 1d array of N(2, 3) random variables diff --git a/media/Copula_PDF_Gh_Copula_PDF_Plot_Plot2.png b/media/Copula_PDF_Gh_Copula_PDF_Plot_Plot2.png new file mode 100644 index 0000000..54c42f5 Binary files /dev/null and b/media/Copula_PDF_Gh_Copula_PDF_Plot_Plot2.png differ diff --git a/media/MC_CDF_Gh_MC_CDF_Plot_Plot2.png b/media/MC_CDF_Gh_MC_CDF_Plot_Plot2.png new file mode 100644 index 0000000..f15de01 Binary files /dev/null and b/media/MC_CDF_Gh_MC_CDF_Plot_Plot2.png differ diff --git a/media/MC_CDF_Mvt_Shyperbolic_MC_CDF_Plot_Plot.png b/media/MC_CDF_Mvt_Shyperbolic_MC_CDF_Plot_Plot.png new file mode 100644 index 0000000..b15ab55 Binary files /dev/null and b/media/MC_CDF_Mvt_Shyperbolic_MC_CDF_Plot_Plot.png differ diff --git a/media/PDF_Gh_PDF_Plot_Plot2.png b/media/PDF_Gh_PDF_Plot_Plot2.png new file mode 100644 index 0000000..2d79e89 Binary files /dev/null and b/media/PDF_Gh_PDF_Plot_Plot2.png differ diff --git a/media/PDF_Mvt_Shyperbolic_PDF_Plot_Plot.png b/media/PDF_Mvt_Shyperbolic_PDF_Plot_Plot.png new file mode 100644 index 0000000..dc91230 Binary files /dev/null and 
b/media/PDF_Mvt_Shyperbolic_PDF_Plot_Plot.png differ diff --git a/media/mfitter_pairplot.png b/media/mfitter_pairplot.png new file mode 100644 index 0000000..d4bb98c Binary files /dev/null and b/media/mfitter_pairplot.png differ diff --git a/media/mvt_shyperbolic_marginal_pair_plot.png b/media/mvt_shyperbolic_marginal_pair_plot.png new file mode 100644 index 0000000..52b5330 Binary files /dev/null and b/media/mvt_shyperbolic_marginal_pair_plot.png differ diff --git a/media/univariate_continuous_example_figure1.png b/media/univariate_continuous_example_figure1.png new file mode 100644 index 0000000..fa8310e Binary files /dev/null and b/media/univariate_continuous_example_figure1.png differ diff --git a/media/univariate_discrete_example_figure1.png b/media/univariate_discrete_example_figure1.png new file mode 100644 index 0000000..a34b324 Binary files /dev/null and b/media/univariate_discrete_example_figure1.png differ diff --git a/media/univariate_fitter_example_figure1.png b/media/univariate_fitter_example_figure1.png new file mode 100644 index 0000000..b2853e5 Binary files /dev/null and b/media/univariate_fitter_example_figure1.png differ diff --git a/pyproject.toml b/pyproject.toml index 20fe848..406b08d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,9 @@ dev = [ "pytest==7.1.2", "tox==3.25.1", ] +docs = [ + "sphinx~=4.2.0", +] [project.urls] # Optional "Homepage" = "https://github.com/tfm000/sklarpy" diff --git a/sklarpy/univariate/_fitted_dists.py b/sklarpy/univariate/_fitted_dists.py index 05866ab..372bd93 100644 --- a/sklarpy/univariate/_fitted_dists.py +++ b/sklarpy/univariate/_fitted_dists.py @@ -393,7 +393,7 @@ def plot(self, xrange: np.ndarray = None, include_empirical: bool = True, "no empirical data to display.") # getting xrange and qrange - eps: float = 10 ** -4 + eps: float = 0.05 prob_bounds: tuple = (eps, 1 - eps) if xrange is None: if not (isinstance(num_to_plot, int) and num_to_plot >= 1): @@ -446,8 +446,10 @@ def plot(self, 
xrange: np.ndarray = None, include_empirical: bool = True, alpha=empirical_alpha, label=empirical_label) ax[3].plot(xrange, xrange, color=qqplot_yx_color, alpha=qqplot_yx_alpha, label='y=x') - ax[3].plot(self.ppf(qrange), empirical_ppf_values, color=color, + ppf_values = self.ppf(qrange) + ax[3].plot(ppf_values, empirical_ppf_values, color=color, alpha=alpha, label=self.name) + ax[3].set_xlim([ppf_values.min(), ppf_values.max()]) # plotting distribution ax[0].plot(xrange, self.pdf(xrange), color=color, diff --git a/sklarpy/univariate/_prefit_dists.py b/sklarpy/univariate/_prefit_dists.py index addac3a..29d4aad 100644 --- a/sklarpy/univariate/_prefit_dists.py +++ b/sklarpy/univariate/_prefit_dists.py @@ -598,7 +598,7 @@ def plot(self, params: tuple, xrange: np.ndarray = None, "show are all boolean.") # getting xrange and qrange - eps: float = 10 ** -4 + eps: float = 0.05 prob_bounds: tuple = (eps, 1 - eps) if xrange is None: if not (isinstance(num_to_plot, int) and num_to_plot >= 1):