FIX: dist_plot only displaying first numerical column #242

Merged
6 changes: 4 additions & 2 deletions .pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
@@ -9,7 +9,9 @@ repos:
       - id: check-toml
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.5.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --show-fixes]
+      - id: ruff-format
+        args: [--line-length=100]
2 changes: 1 addition & 1 deletion examples/klib_data_cleaning.ipynb
@@ -793,4 +793,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

15 changes: 12 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "klib"
-version = "1.1.2"
+version = "1.1.3"
 description = "Customized data preprocessing functions for frequent tasks."
 authors = ["Andreas Kanz <andreas@akanz.de>"]
 license = "MIT"
@@ -84,9 +84,18 @@ lint.unfixable = [

 lint.select = ["ALL"]

-lint.ignore = ["T201", "FBT001", "FBT002", "PLR0913", "D213", "D203", "UP038"]
+lint.ignore = [
+    "T201",
+    "FBT001",
+    "FBT002",
+    "PLR0913",
+    "D213",
+    "D203",
+    "UP038",
+    "PD901",
+]

-line-length = 88
+line-length = 100
 target-version = "py310"

 [tool.ruff.lint.mccabe]
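Aside: PD901 is the pandas-vet rule in Ruff that flags the generic DataFrame variable name df. Ignoring it project-wide is what allows the inline # noqa: PD901 suppressions to be deleted in clean.py and describe.py below. A minimal sketch of the effect, assuming Ruff runs with the configuration above:

import pandas as pd

# With "PD901" in lint.ignore, Ruff no longer flags the generic name "df",
# so the inline suppression comment becomes unnecessary and can be removed:
df = pd.DataFrame({"a": [1, 2, 3]})  # before this PR: required "# noqa: PD901"
print(df.describe())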
32 changes: 0 additions & 32 deletions readthedocs.yml

This file was deleted.

27 changes: 13 additions & 14 deletions src/klib/clean.py
@@ -2,6 +2,7 @@

 :author: Andreas Kanz
 """
+
 from __future__ import annotations

 import itertools
@@ -28,7 +29,7 @@


 def _optimize_ints(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
-    df = pd.DataFrame(data).copy()  # noqa: PD901
+    df = pd.DataFrame(data).copy()
     ints = df.select_dtypes(include=["int64"]).columns.tolist()
     df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
     return df
@@ -41,7 +42,7 @@ def _optimize_floats(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
     return data


-def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
+def clean_column_names(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
Contributor:
issue (code-quality): Low code quality found in clean_column_names - 24% (low-code-quality)


Explanation: The quality score for this function is below the quality threshold of 25%.
This score is a combination of the method length, cognitive complexity and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable.

  • Reduce the function length by extracting pieces of functionality out into
    their own functions. This is the most important thing you can do - ideally a
    function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts
    sits together within the function rather than being scattered.
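For illustration only (not code from this PR): a guard-clause and extract-helper refactor in the spirit of this advice might look like the sketch below. The helper names _normalize_name and _hint_on_long_names are hypothetical.

import pandas as pd


def _normalize_name(name: str) -> str:
    """Hypothetical helper: lower-case a name, replace spaces/dashes with underscores."""
    return name.strip().lower().replace(" ", "_").replace("-", "_")


def _hint_on_long_names(columns: list[str], max_len: int = 25) -> None:
    """Hypothetical helper: warn about long column names, returning early (guard clause)."""
    too_long = [col for col in columns if len(col) > max_len]
    if not too_long:  # guard clause: nothing to report
        return
    print(f"Consider shortening these column names: {too_long}")


def clean_column_names_sketch(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
    """Sketch of the refactor: a short body that delegates to small helpers."""
    data = data.rename(columns=_normalize_name)
    if hints:
        _hint_on_long_names(list(data.columns))
    return data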

"""Clean the column names of the provided Pandas Dataframe.

Optionally provides hints on duplicate and long column names.
Expand All @@ -58,6 +59,7 @@ def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
-------
pd.DataFrame
Pandas DataFrame with cleaned column names

"""
_validate_input_bool(hints, "hints")

@@ -157,6 +159,7 @@ def convert_datatypes(
     -------
     pd.DataFrame
         Pandas DataFrame with converted Datatypes
+
     """
     # Validate Inputs
     _validate_input_bool(category, "Category")
@@ -216,6 +219,7 @@ def drop_missing(
     Notes
     -----
     Columns are dropped first
+
     """
     # Validate Inputs
     _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
@@ -316,6 +320,7 @@ def data_cleaning(
     -----
     The category dtype is not grouped in the summary, unless it contains exactly the \
     same categories.
+
     """
     if col_exclude is None:
         col_exclude = []
@@ -424,6 +429,7 @@ def mv_col_handling(
     optional:
         cols_mv: Columns with missing values included in the analysis
         drop_cols: List of dropped columns
+
     """
     # Validate Inputs
     _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
@@ -434,9 +440,7 @@
     data_local = data.copy()
     mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
     cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
-    data_local[cols_mv] = (
-        data_local[cols_mv].applymap(lambda x: x if pd.isna(x) else 1).fillna(0)
-    )
+    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: x if pd.isna(x) else 1).fillna(0)

     high_corr_features = []
     data_temp = data_local.copy()
@@ -450,9 +454,7 @@
     if target is None:
         data = data.drop(columns=high_corr_features)
     else:
-        corrs = corr_mat(data_local, target=target, colored=False).loc[
-            high_corr_features
-        ]
+        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
         drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
         data = data.drop(columns=drop_cols)
@@ -509,6 +511,7 @@ def pool_duplicate_subsets(
     optional:
         subset_cols: List of columns used as subset
+
     """
     # Input validation
     _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
@@ -524,9 +527,7 @@
     for i in range(data.shape[1] + 1 - min_col_pool):
         # Consider only columns with lots of duplicates
         check = [
-            col
-            for col in data.columns
-            if data.duplicated(subset=col).mean() > col_dupl_thresh
+            col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh
         ]

         # Identify all possible combinations for the current interation
@@ -552,9 +553,7 @@
         subset_cols = best_subset.columns.tolist()

         unique_subset = (
-            best_subset.drop_duplicates()
-            .reset_index()
-            .rename(columns={"index": "pooled_vars"})
+            best_subset.drop_duplicates().reset_index().rename(columns={"index": "pooled_vars"})
         )
         data = data.merge(unique_subset, how="left", on=subset_cols).drop(
             columns=subset_cols,
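The bare * added to clean_column_names (and to corr_mat, corr_plot, and corr_interactive_plot below) makes the boolean flags keyword-only: passing them positionally now raises a TypeError. A small sketch of the new calling convention, using a toy stand-in rather than the real function:

import pandas as pd


def clean_column_names_demo(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
    """Toy stand-in mirroring the new keyword-only signature."""
    if hints:
        print("hints enabled")
    return data


df = pd.DataFrame({"A Column": [1, 2]})
clean_column_names_demo(df, hints=False)  # OK: flag passed by keyword
# clean_column_names_demo(df, False)      # TypeError: takes 1 positional argument but 2 were given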
22 changes: 12 additions & 10 deletions src/klib/describe.py
@@ -3,6 +3,7 @@
 :author: Andreas Kanz

 """
+
 from __future__ import annotations

 from typing import Any
@@ -199,6 +200,7 @@ def corr_mat(
     threshold: float = 0,
     target: pd.DataFrame | pd.Series | np.ndarray | str | None = None,
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
+    *,
     colored: bool = True,
 ) -> pd.DataFrame | pd.Series:
     """Return a color-encoded correlation matrix.
@@ -282,6 +284,7 @@ def corr_plot(
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
     cmap: str = "BrBG",
     figsize: tuple[float, float] = (12, 10),
+    *,
     annot: bool = True,
     dev: bool = False,
     **kwargs,  # noqa: ANN003
@@ -439,6 +442,7 @@ def corr_interactive_plot(  # noqa: C901
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
     cmap: str = "BrBG",
     figsize: tuple[float, float] = (12, 10),
+    *,
     annot: bool = True,
     **kwargs,  # noqa: ANN003
 ) -> go.Figure:
@@ -713,26 +717,22 @@ def dist_plot(

     # Handle dictionary defaults
     kde_kws = (
-        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8}
-        if kde_kws is None
-        else kde_kws.copy()
+        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8} if kde_kws is None else kde_kws.copy()
     )
     rug_kws = (
         {"color": "#ff3333", "alpha": 0.15, "lw": 3, "height": 0.075}
         if rug_kws is None
         else rug_kws.copy()
     )
-    fill_kws = (
-        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
-    )
+    fill_kws = {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
     font_kws = (
         {"color": "#111111", "weight": "normal", "size": 11}
         if font_kws is None
         else font_kws.copy()
     )

     data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
-    df = data.copy()  # noqa: PD901
+    df = data.copy()
     data = data.loc[:, data.nunique() > 2]  # noqa: PLR2004
     if data.shape[0] > 10000:  # noqa: PLR2004
         data = data.sample(n=10000, random_state=408)
@@ -754,6 +754,9 @@
         "the first 20 numerical features. Override this by setting showall=True.",
     )
     cols = cols[:20]
+    if not cols:
+        print("No columns with numeric data were detected.")
+        return None

     for col in cols:
         col_data = data[col].dropna(axis=0)
@@ -808,7 +811,7 @@ def dist_plot(
             ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
             ls=":",
             color=".5",
-            label="\u03BC \u00B1 \u03C3",
+            label="\u03bc \u00b1 \u03c3",
         )

         g.axes[0, 0].set_ylim(0)
@@ -855,8 +858,7 @@ def dist_plot(
         )
         g.axes[0, 0].legend(loc="upper right")

-        return g.axes[0, 0]
-    return None
+    return g.axes[0, 0]


 def missingval_plot(  # noqa: PLR0915
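The last two hunks contain the actual bug fix: the old return g.axes[0, 0] sat inside the for col in cols: loop, so dist_plot returned after rendering only the first numerical column. The fix moves the return after the loop and adds a guard for the case where no numeric columns survive filtering. A minimal, seaborn-free sketch of the before/after control flow:

def plot_each_buggy(cols: list[str]) -> str | None:
    """Mirrors the old control flow: returns inside the loop."""
    for col in cols:
        result = f"plotted {col}"
        return result  # bug: only the first column is ever processed
    return None


def plot_each_fixed(cols: list[str]) -> str | None:
    """Mirrors the fixed control flow: guard clause, then return after the loop."""
    if not cols:  # guard added in this PR for all-non-numeric input
        print("No columns with numeric data were detected.")
        return None
    result = None
    for col in cols:
        result = f"plotted {col}"  # every column is processed
    return result


print(plot_each_buggy(["a", "b", "c"]))  # -> plotted a
print(plot_each_fixed(["a", "b", "c"]))  # -> plotted c (all three handled)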
1 change: 1 addition & 0 deletions src/klib/scripts/performance.py
@@ -2,6 +2,7 @@

 :author: Andreas Kanz
 """
+
 import functools
 from pathlib import Path
 from time import perf_counter
18 changes: 10 additions & 8 deletions src/klib/utils.py
@@ -3,6 +3,7 @@
 :author: Andreas Kanz

 """
+
 from __future__ import annotations

 from typing import Literal
@@ -34,6 +35,7 @@ def _corr_selector(
     -------
     pd.DataFrame
         List or matrix of (filtered) correlations
+
     """
     if split == "pos":
         corr = corr.where((corr >= threshold) & (corr > 0))
@@ -102,6 +104,7 @@ def _diff_report(
     -------
     None
         Print statement highlighting the datasets or changes between the two datasets.
+
     """
     if show not in ["changes", "all"]:
         return
@@ -125,17 +128,15 @@
     )

     print(
-        f"Shape of cleaned data: {data_cleaned.shape} - "
-        f"Remaining NAs: {data_cl_mv_tot}\n\n",
+        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}\n\n",
     )
     print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
     print(
-        f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",  # noqa: E501
+        f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",
     )
     print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
     print(
-        f" of which {len(single_val_cols)} single valued."
-        f" Columns: {single_val_cols}",
+        f" of which {len(single_val_cols)} single valued. Columns: {single_val_cols}",
     )
     print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
     mem_change = data_mem - data_cl_mem
@@ -170,6 +171,7 @@ def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]
     -------
     Tuple[pd.DataFrame, List]
         Deduplicated Pandas DataFrame and Index Object of rows dropped
+
     """
     data = pd.DataFrame(data).copy()
     dupl_rows = data[data.duplicated()].index.tolist()
@@ -192,6 +194,7 @@ def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
     -------
     float
         Memory usage in megabytes
+
     """
     return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
@@ -222,6 +225,7 @@ def _missing_vals(data: pd.DataFrame) -> MVResult:
         mv_cols: float, number of missing values in each column
         mv_rows_ratio: float, ratio of missing values for each row
         mv_cols_ratio: float, ratio of missing values for each column
+
     """
     data = pd.DataFrame(data).copy()
     mv_total: int = data.isna().sum().sum()
@@ -265,9 +269,7 @@ def _validate_input_smaller(value1: int, value2: int, desc: str) -> None:

 def _validate_input_sum_smaller(limit: float, desc: str, *args) -> None:  # noqa: ANN002
     if sum(args) > limit:
-        msg = (
-            f"The sum of input values for '{desc}' should be less or equal to {limit}."
-        )
+        msg = f"The sum of input values for '{desc}' should be less or equal to {limit}."
         raise ValueError(msg)