diff --git a/.requirements/base.in b/.requirements/base.in index 7e0dc2eca..48fd6a34c 100644 --- a/.requirements/base.in +++ b/.requirements/base.in @@ -6,3 +6,4 @@ natsort pandas_flavor multipledispatch scipy +lazy_loader diff --git a/AUTHORS.md b/AUTHORS.md index d457f72ad..6efebccb8 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -109,3 +109,4 @@ Contributors - [@ethompsy](https://github.com/ethompsy) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aethompsy) - [@apatao](https://github.com/apatao) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3Aapatao) - [@OdinTech3](https://github.com/OdinTech3) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1094) +- [@asmirnov69](https://github.com/asmirnov69) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues/1059) diff --git a/CHANGELOG.md b/CHANGELOG.md index c92fee966..c8d43a601 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## [Unreleased] +- [ENH] Add lazy imports to speed up the time taken to load pyjanitor (part 2) - [DOC] Updated developer guide docs. - [ENH] Allow column selection/renaming within conditional_join. Issue #1102. Also allow first or last match. Issue #1020 @samukweku. - [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521 diff --git a/environment-dev.yml b/environment-dev.yml index 4f26797fb..0416b06d0 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -55,3 +55,4 @@ dependencies: - xorg-libxrender - pip: - mknotebooks + - lazy-loader diff --git a/janitor/__init__.py b/janitor/__init__.py index 9ec599cf7..9b19b7a43 100644 --- a/janitor/__init__.py +++ b/janitor/__init__.py @@ -1,15 +1,24 @@ -"""Top-level janitor API lives here.""" -try: - import janitor.xarray # noqa: F401 -except ImportError: - pass +"""Top-level janitor API lives here. + +Lazy loading used here to speed up imports. 
+""" +import lazy_loader as lazy + +from .accessors import * # noqa: F403, F401 from .functions import * # noqa: F403, F401 from .io import * # noqa: F403, F401 from .math import * # noqa: F403, F401 from .ml import get_features_targets as _get_features_targets from .utils import refactored_function -from .accessors import * # noqa: F403, F401 + + +_ = lazy.load("pandas_flavor") + +try: + jxr = lazy.load("janitor.xarray") # noqa: F401 +except ImportError: + pass @refactored_function( diff --git a/janitor/accessors/__init__.py b/janitor/accessors/__init__.py index 77dd81c08..6f9e707c3 100644 --- a/janitor/accessors/__init__.py +++ b/janitor/accessors/__init__.py @@ -1,2 +1,17 @@ -"""Top-level imports for pyjanitor's dataframe accessors.""" -from .data_description import DataDescription +"""Miscellaneous mathematical operators. + +Lazy loading used here to speed up imports. +""" + +import warnings +from typing import Tuple + + +import lazy_loader as lazy + +scipy_special = lazy.load("scipy.special") +ss = lazy.load("scipy.stats") +pf = lazy.load("pandas_flavor") +pd = lazy.load("pandas") +np = lazy.load("numpy") +pdtypes = lazy.load("pandas.api.types") diff --git a/janitor/accessors/data_description.py b/janitor/accessors/data_description.py index 37014e5e5..5224fdbb3 100644 --- a/janitor/accessors/data_description.py +++ b/janitor/accessors/data_description.py @@ -1,6 +1,8 @@ +import lazy_loader as lazy from typing import Dict, List, Union -import pandas_flavor as pf -import pandas as pd + +pf = lazy.load("pandas_flavor") +pd = lazy.load("pandas") @pf.register_dataframe_accessor("data_description") diff --git a/janitor/functions/impute.py b/janitor/functions/impute.py index 942bcad15..4a948958a 100644 --- a/janitor/functions/impute.py +++ b/janitor/functions/impute.py @@ -1,13 +1,15 @@ """Implementation of `impute` function""" from typing import Any, Hashable, Optional +import lazy_loader as lazy import numpy as np import pandas_flavor as pf import pandas as pd 
-from scipy.stats import mode from janitor.utils import deprecated_alias +ss = lazy.load("scipy.stats") + @pf.register_dataframe_method @deprecated_alias(column="column_name") @@ -100,7 +102,7 @@ def impute( "mean": np.mean, "average": np.mean, # aliased "median": np.median, - "mode": mode, + "mode": ss.mode, "minimum": np.min, "min": np.min, # aliased "maximum": np.max, diff --git a/janitor/math.py b/janitor/math.py index 08aa11766..aeeca4b21 100644 --- a/janitor/math.py +++ b/janitor/math.py @@ -1,16 +1,20 @@ -""" Miscellaneous mathematical operators. """ +"""Miscellaneous mathematical operators. + +Lazy loading used here to speed up imports. +""" import warnings from typing import Tuple -import numpy as np -import pandas as pd -import pandas_flavor as pf -from pandas.api.types import is_numeric_dtype -from scipy.special import expit -from scipy.special import logit as scipy_logit -from scipy.special import softmax as scipy_softmax -from scipy.stats import norm + +import lazy_loader as lazy + +scipy_special = lazy.load("scipy.special") +ss = lazy.load("scipy.stats") +pf = lazy.load("pandas_flavor") +pd = lazy.load("pandas") +np = lazy.load("numpy") +pdtypes = lazy.load("pandas.api.types") @pf.register_series_method @@ -94,7 +98,7 @@ def sigmoid(s: pd.Series) -> pd.Series: :param s: Input Series. :return: Transformed Series. """ - return expit(s) + return scipy_special.expit(s) @pf.register_series_method @@ -125,7 +129,7 @@ def softmax(s: pd.Series) -> pd.Series: :return: Transformed Series. """ - return pd.Series(scipy_softmax(s), index=s.index, name=s.name) + return pd.Series(scipy_special.softmax(s), index=s.index, name=s.name) @pf.register_series_method @@ -168,7 +172,7 @@ def logit(s: pd.Series, error: str = "warn") -> pd.Series: else: pass s[outside_support] = np.nan - return scipy_logit(s) + return scipy_special.logit(s) @pf.register_series_method @@ -188,7 +192,7 @@ def normal_cdf(s: pd.Series) -> pd.Series: :param s: Input Series. 
:return: Transformed Series. """ - return pd.Series(norm.cdf(s), index=s.index) + return pd.Series(ss.norm.cdf(s), index=s.index) @pf.register_series_method @@ -229,7 +233,7 @@ def probit(s: pd.Series, error: str = "warn") -> pd.Series: pass s[outside_support] = np.nan with np.errstate(all="ignore"): - out = pd.Series(norm.ppf(s), index=s.index) + out = pd.Series(ss.norm.ppf(s), index=s.index) return out @@ -313,7 +317,7 @@ def ecdf(s: pd.Series) -> Tuple[np.ndarray, np.ndarray]: :raises TypeError: if series is not numeric. :raises ValueError: if series contains nulls. """ - if not is_numeric_dtype(s): + if not pdtypes.is_numeric_dtype(s): raise TypeError(f"series {s.name} must be numeric!") if not s.isna().sum() == 0: raise ValueError(f"series {s.name} contains nulls. Please drop them.") diff --git a/janitor/utils.py b/janitor/utils.py index 3ee11d54e..3101bed0f 100644 --- a/janitor/utils.py +++ b/janitor/utils.py @@ -1,14 +1,23 @@ -"""Miscellaneous internal PyJanitor helper functions.""" +"""Miscellaneous mathematical operators. + +Lazy loading used here to speed up imports. 
+""" import os -import socket import sys -from warnings import warn -from functools import singledispatch, wraps +import socket from typing import Callable, Dict, Iterable, Union +from functools import singledispatch, wraps +from warnings import warn + +import lazy_loader as lazy -import numpy as np -import pandas as pd +scipy_special = lazy.load("scipy.special") +ss = lazy.load("scipy.stats") +pf = lazy.load("pandas_flavor") +pd = lazy.load("pandas") +np = lazy.load("numpy") +pdtypes = lazy.load("pandas.api.types") def check(varname: str, value, expected_types: list): diff --git a/janitor/xarray/functions.py b/janitor/xarray/functions.py index 7a015477c..1e6e2712c 100644 --- a/janitor/xarray/functions.py +++ b/janitor/xarray/functions.py @@ -5,17 +5,14 @@ from typing import Union +import lazy_loader -import numpy as np -from pandas_flavor import ( - register_xarray_dataarray_method, - register_xarray_dataset_method, -) +np = lazy_loader.load("numpy") +xr = lazy_loader.load("xarray") +pf = lazy_loader.load("pandas_flavor") -import xarray as xr - -@register_xarray_dataarray_method +@pf.register_xarray_dataarray_method def clone_using( da: xr.DataArray, np_arr: np.array, @@ -104,8 +101,8 @@ def clone_using( ) -@register_xarray_dataset_method -@register_xarray_dataarray_method +@pf.register_xarray_dataset_method +@pf.register_xarray_dataarray_method def convert_datetime_to_number( da_or_ds: Union[xr.DataArray, xr.Dataset], time_units: str, diff --git a/mkdocs/development/lazy_imports.md b/mkdocs/development/lazy_imports.md new file mode 100644 index 000000000..2f4fbca0f --- /dev/null +++ b/mkdocs/development/lazy_imports.md @@ -0,0 +1,55 @@ +# Lazy Imports + +In `pyjanitor`, we use lazy imports to speed up `import janitor`. +Prior to using lazy imports, `import janitor` would take about 1-2 seconds to complete, +thereby causing significant delays for downstream consumers of `pyjanitor`. 
+Slow importing is undesirable as it would slow down programs that demand low latency. + +## A brief history of the decision + +The original issue was raised by @ericmjl +in issue ([#1059](https://github.com/pyjanitor-devs/pyjanitor/issues/1059)). +The basis there is that the scientific Python community +was hurting with imports that took a long time, +especially the ones that depended on SciPy and Pandas. +As `pyjanitor` is a package that depends on `pandas`, +it was important for us to see if we could improve the speed at which imports happened. + +## Current Speed Benchmark + +As of 5 April 2022, imports take about 0.5 seconds (give or take) to complete +on a GitHub Codespaces workspace. +This is much more desirable than the original 1-2 seconds, +also measured on a GitHub Codespaces workspace. + +## How to benchmark + +To benchmark, we run the following line: + +```bash +python -X importtime -c "import janitor" 2> timing.log +``` + +Then, using the `tuna` CLI tool, we can view the timing log: + +```bash +tuna timing.log +``` + +Note: You may need to install tuna using `pip install -U tuna`. +`tuna`'s development repository is [on GitHub][tuna]. + +[tuna]: https://github.com/nschloe/tuna + +You'll be redirected to your browser, +where the web UI will allow you to see +which imports are causing time delays. + +![Tuna's Web UI](./images/tuna.png) + +## Which imports to lazily load + +Generally speaking, the _external_ imports are the ones that, +when lazily loaded, will give the maximal gain in speed. +You can also opt to lazily load `pyjanitor` submodules, +but we doubt they will give much advantage in speed.