FIX: dist_plot only displaying first numerical column #242

Merged
6 changes: 4 additions & 2 deletions .pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: check-yaml
       - id: end-of-file-fixer
@@ -9,7 +9,9 @@ repos:
       - id: check-toml
       - id: debug-statements
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.5.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --show-fixes]
+      - id: ruff-format
+        args: [--line-length=100]
2 changes: 1 addition & 1 deletion examples/klib_data_cleaning.ipynb
@@ -793,4 +793,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

15 changes: 12 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "klib"
-version = "1.1.2"
+version = "1.1.3"
 description = "Customized data preprocessing functions for frequent tasks."
 authors = ["Andreas Kanz <andreas@akanz.de>"]
 license = "MIT"
@@ -84,9 +84,18 @@ lint.unfixable = [

 lint.select = ["ALL"]

-lint.ignore = ["T201", "FBT001", "FBT002", "PLR0913", "D213", "D203", "UP038"]
+lint.ignore = [
+    "T201",
+    "FBT001",
+    "FBT002",
+    "PLR0913",
+    "D213",
+    "D203",
+    "UP038",
+    "PD901",
+]

-line-length = 88
+line-length = 100
 target-version = "py310"

 [tool.ruff.lint.mccabe]
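Aside: PD901 is the pandas-vet rule in Ruff that flags the generic DataFrame variable name df. Ignoring it project-wide is what allows the inline # noqa: PD901 suppressions to be deleted in clean.py and describe.py below. A minimal sketch of the effect, assuming Ruff runs with the configuration above:

import pandas as pd

# With "PD901" in lint.ignore, Ruff no longer flags the generic name "df",
# so the inline suppression comment becomes unnecessary and can be removed:
df = pd.DataFrame({"a": [1, 2, 3]})  # before this PR: required "# noqa: PD901"
print(df.describe())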
32 changes: 0 additions & 32 deletions readthedocs.yml

This file was deleted.

27 changes: 13 additions & 14 deletions src/klib/clean.py
@@ -2,6 +2,7 @@

 :author: Andreas Kanz
 """
+
 from __future__ import annotations

 import itertools
@@ -28,7 +29,7 @@


 def _optimize_ints(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
-    df = pd.DataFrame(data).copy()  # noqa: PD901
+    df = pd.DataFrame(data).copy()
     ints = df.select_dtypes(include=["int64"]).columns.tolist()
     df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
     return df
@@ -41,7 +42,7 @@ def _optimize_floats(data: pd.Series | pd.DataFrame) -> pd.DataFrame:
     return data


-def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
+def clean_column_names(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
Contributor:
issue (code-quality): Low code quality found in clean_column_names - 24% (low-code-quality)


Explanation: The quality score for this function is below the quality threshold of 25%.
This score is a combination of the method length, cognitive complexity and working memory.

How can you solve this?

It might be worth refactoring this function to make it shorter and more readable.

  • Reduce the function length by extracting pieces of functionality out into
    their own functions. This is the most important thing you can do - ideally a
    function should be less than 10 lines.
  • Reduce nesting, perhaps by introducing guard clauses to return early.
  • Ensure that variables are tightly scoped, so that code using related concepts
    sits together within the function rather than being scattered.
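For illustration only (not code from this PR): a guard-clause and extract-helper refactor in the spirit of this advice might look like the sketch below. The helper names _normalize_name and _hint_on_long_names are hypothetical.

import pandas as pd


def _normalize_name(name: str) -> str:
    """Hypothetical helper: lower-case a name, replace spaces/dashes with underscores."""
    return name.strip().lower().replace(" ", "_").replace("-", "_")


def _hint_on_long_names(columns: list[str], max_len: int = 25) -> None:
    """Hypothetical helper: warn about long column names, returning early (guard clause)."""
    too_long = [col for col in columns if len(col) > max_len]
    if not too_long:  # guard clause: nothing to report
        return
    print(f"Consider shortening these column names: {too_long}")


def clean_column_names_sketch(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
    """Sketch of the refactor: a short body that delegates to small helpers."""
    data = data.rename(columns=_normalize_name)
    if hints:
        _hint_on_long_names(list(data.columns))
    return data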

"""Clean the column names of the provided Pandas Dataframe.

Optionally provides hints on duplicate and long column names.
Expand All @@ -58,6 +59,7 @@ def clean_column_names(data: pd.DataFrame, hints: bool = True) -> pd.DataFrame:
-------
pd.DataFrame
Pandas DataFrame with cleaned column names

"""
_validate_input_bool(hints, "hints")

@@ -157,6 +159,7 @@ def convert_datatypes(
     -------
     pd.DataFrame
         Pandas DataFrame with converted Datatypes
+
     """
     # Validate Inputs
     _validate_input_bool(category, "Category")
@@ -216,6 +219,7 @@ def drop_missing(
     Notes
     -----
     Columns are dropped first
+
     """
     # Validate Inputs
     _validate_input_range(drop_threshold_cols, "drop_threshold_cols", 0, 1)
@@ -316,6 +320,7 @@ def data_cleaning(
     -----
     The category dtype is not grouped in the summary, unless it contains exactly the \
     same categories.
+
     """
     if col_exclude is None:
         col_exclude = []
@@ -424,6 +429,7 @@ def mv_col_handling(
     optional:
         cols_mv: Columns with missing values included in the analysis
         drop_cols: List of dropped columns
+
     """
     # Validate Inputs
     _validate_input_range(mv_threshold, "mv_threshold", 0, 1)
@@ -434,9 +440,7 @@
     data_local = data.copy()
     mv_ratios = _missing_vals(data_local)["mv_cols_ratio"]
     cols_mv = mv_ratios[mv_ratios > mv_threshold].index.tolist()
-    data_local[cols_mv] = (
-        data_local[cols_mv].applymap(lambda x: x if pd.isna(x) else 1).fillna(0)
-    )
+    data_local[cols_mv] = data_local[cols_mv].applymap(lambda x: x if pd.isna(x) else 1).fillna(0)

     high_corr_features = []
     data_temp = data_local.copy()
@@ -450,9 +454,7 @@
     if target is None:
         data = data.drop(columns=high_corr_features)
     else:
-        corrs = corr_mat(data_local, target=target, colored=False).loc[
-            high_corr_features
-        ]
+        corrs = corr_mat(data_local, target=target, colored=False).loc[high_corr_features]
         drop_cols = corrs.loc[abs(corrs.iloc[:, 0]) < corr_thresh_target].index.tolist()
         data = data.drop(columns=drop_cols)
@@ -509,6 +511,7 @@ def pool_duplicate_subsets(
     optional:
         subset_cols: List of columns used as subset
+
     """
     # Input validation
     _validate_input_range(col_dupl_thresh, "col_dupl_thresh", 0, 1)
@@ -524,9 +527,7 @@
     for i in range(data.shape[1] + 1 - min_col_pool):
         # Consider only columns with lots of duplicates
         check = [
-            col
-            for col in data.columns
-            if data.duplicated(subset=col).mean() > col_dupl_thresh
+            col for col in data.columns if data.duplicated(subset=col).mean() > col_dupl_thresh
         ]

         # Identify all possible combinations for the current interation
@@ -552,9 +553,7 @@
         subset_cols = best_subset.columns.tolist()

         unique_subset = (
-            best_subset.drop_duplicates()
-            .reset_index()
-            .rename(columns={"index": "pooled_vars"})
+            best_subset.drop_duplicates().reset_index().rename(columns={"index": "pooled_vars"})
         )
         data = data.merge(unique_subset, how="left", on=subset_cols).drop(
             columns=subset_cols,
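The bare * added to clean_column_names (and to corr_mat, corr_plot, and corr_interactive_plot below) makes the boolean flags keyword-only: passing them positionally now raises a TypeError. A small sketch of the new calling convention, using a toy stand-in rather than the real function:

import pandas as pd


def clean_column_names_demo(data: pd.DataFrame, *, hints: bool = True) -> pd.DataFrame:
    """Toy stand-in mirroring the new keyword-only signature."""
    if hints:
        print("hints enabled")
    return data


df = pd.DataFrame({"A Column": [1, 2]})
clean_column_names_demo(df, hints=False)  # OK: flag passed by keyword
# clean_column_names_demo(df, False)      # TypeError: takes 1 positional argument but 2 were given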
22 changes: 12 additions & 10 deletions src/klib/describe.py
@@ -3,6 +3,7 @@
 :author: Andreas Kanz

 """
+
 from __future__ import annotations

 from typing import Any
@@ -199,6 +200,7 @@ def corr_mat(
     threshold: float = 0,
     target: pd.DataFrame | pd.Series | np.ndarray | str | None = None,
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
+    *,
     colored: bool = True,
 ) -> pd.DataFrame | pd.Series:
     """Return a color-encoded correlation matrix.
@@ -282,6 +284,7 @@ def corr_plot(
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
     cmap: str = "BrBG",
     figsize: tuple[float, float] = (12, 10),
+    *,
     annot: bool = True,
     dev: bool = False,
     **kwargs,  # noqa: ANN003
@@ -439,6 +442,7 @@ def corr_interactive_plot(  # noqa: C901
     method: Literal["pearson", "spearman", "kendall"] = "pearson",
     cmap: str = "BrBG",
     figsize: tuple[float, float] = (12, 10),
+    *,
     annot: bool = True,
     **kwargs,  # noqa: ANN003
 ) -> go.Figure:
@@ -713,26 +717,22 @@ def dist_plot(

     # Handle dictionary defaults
     kde_kws = (
-        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8}
-        if kde_kws is None
-        else kde_kws.copy()
+        {"alpha": 0.75, "linewidth": 1.5, "bw_adjust": 0.8} if kde_kws is None else kde_kws.copy()
     )
     rug_kws = (
         {"color": "#ff3333", "alpha": 0.15, "lw": 3, "height": 0.075}
         if rug_kws is None
         else rug_kws.copy()
     )
-    fill_kws = (
-        {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
-    )
+    fill_kws = {"color": "#80d4ff", "alpha": 0.2} if fill_kws is None else fill_kws.copy()
     font_kws = (
         {"color": "#111111", "weight": "normal", "size": 11}
         if font_kws is None
         else font_kws.copy()
     )

     data = pd.DataFrame(data.copy()).dropna(axis=1, how="all")
-    df = data.copy()  # noqa: PD901
+    df = data.copy()
     data = data.loc[:, data.nunique() > 2]  # noqa: PLR2004
     if data.shape[0] > 10000:  # noqa: PLR2004
         data = data.sample(n=10000, random_state=408)
@@ -754,6 +754,9 @@
         "the first 20 numerical features. Override this by setting showall=True.",
     )
     cols = cols[:20]
+    if not cols:
+        print("No columns with numeric data were detected.")
+        return None

     for col in cols:
         col_data = data[col].dropna(axis=0)
@@ -808,7 +811,7 @@ def dist_plot(
             ymax=[np.interp(mean - std, x, y), np.interp(mean + std, x, y)],
             ls=":",
             color=".5",
-            label="\u03BC \u00B1 \u03C3",
+            label="\u03bc \u00b1 \u03c3",
         )

         g.axes[0, 0].set_ylim(0)
@@ -855,8 +858,7 @@ def dist_plot(
         )
         g.axes[0, 0].legend(loc="upper right")

-        return g.axes[0, 0]
-    return None
+    return g.axes[0, 0]


 def missingval_plot(  # noqa: PLR0915
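The last two hunks contain the actual bug fix: the old return g.axes[0, 0] sat inside the for col in cols: loop, so dist_plot returned after rendering only the first numerical column. The fix moves the return after the loop and adds a guard for the case where no numeric columns survive filtering. A minimal, seaborn-free sketch of the before/after control flow:

def plot_each_buggy(cols: list[str]) -> str | None:
    """Mirrors the old control flow: returns inside the loop."""
    for col in cols:
        result = f"plotted {col}"
        return result  # bug: only the first column is ever processed
    return None


def plot_each_fixed(cols: list[str]) -> str | None:
    """Mirrors the fixed control flow: guard clause, then return after the loop."""
    if not cols:  # guard added in this PR for all-non-numeric input
        print("No columns with numeric data were detected.")
        return None
    result = None
    for col in cols:
        result = f"plotted {col}"  # every column is processed
    return result


print(plot_each_buggy(["a", "b", "c"]))  # -> plotted a
print(plot_each_fixed(["a", "b", "c"]))  # -> plotted c (all three handled)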
1 change: 1 addition & 0 deletions src/klib/scripts/performance.py
@@ -2,6 +2,7 @@

 :author: Andreas Kanz
 """
+
 import functools
 from pathlib import Path
 from time import perf_counter
18 changes: 10 additions & 8 deletions src/klib/utils.py
@@ -3,6 +3,7 @@
 :author: Andreas Kanz

 """
+
 from __future__ import annotations

 from typing import Literal
@@ -34,6 +35,7 @@ def _corr_selector(
     -------
     pd.DataFrame
         List or matrix of (filtered) correlations
+
     """
     if split == "pos":
         corr = corr.where((corr >= threshold) & (corr > 0))
@@ -102,6 +104,7 @@ def _diff_report(
     -------
     None
         Print statement highlighting the datasets or changes between the two datasets.
+
     """
     if show not in ["changes", "all"]:
         return
@@ -125,17 +128,15 @@
     )

     print(
-        f"Shape of cleaned data: {data_cleaned.shape} - "
-        f"Remaining NAs: {data_cl_mv_tot}\n\n",
+        f"Shape of cleaned data: {data_cleaned.shape} - Remaining NAs: {data_cl_mv_tot}\n\n",
     )
     print(f"Dropped rows: {data.shape[0]-data_cleaned.shape[0]}")
     print(
-        f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",  # noqa: E501
+        f" of which {len(dupl_rows)} duplicates. (Rows (first 150 shown): {dupl_rows[:150]})\n",
     )
     print(f"Dropped columns: {data.shape[1]-data_cleaned.shape[1]}")
     print(
-        f" of which {len(single_val_cols)} single valued."
-        f" Columns: {single_val_cols}",
+        f" of which {len(single_val_cols)} single valued. Columns: {single_val_cols}",
     )
     print(f"Dropped missing values: {data_mv_tot-data_cl_mv_tot}")
     mem_change = data_mem - data_cl_mem
@@ -170,6 +171,7 @@ def _drop_duplicates(data: pd.DataFrame) -> tuple[pd.DataFrame, list[str | int]]
     -------
     Tuple[pd.DataFrame, List]
         Deduplicated Pandas DataFrame and Index Object of rows dropped
+
     """
     data = pd.DataFrame(data).copy()
     dupl_rows = data[data.duplicated()].index.tolist()
@@ -192,6 +194,7 @@ def _memory_usage(data: pd.DataFrame, deep: bool = True) -> float:
     -------
     float
         Memory usage in megabytes
+
     """
     return round(data.memory_usage(index=True, deep=deep).sum() / (1024**2), 2)
@@ -222,6 +225,7 @@ def _missing_vals(data: pd.DataFrame) -> MVResult:
         mv_cols: float, number of missing values in each column
         mv_rows_ratio: float, ratio of missing values for each row
         mv_cols_ratio: float, ratio of missing values for each column
+
     """
     data = pd.DataFrame(data).copy()
     mv_total: int = data.isna().sum().sum()
@@ -265,9 +269,7 @@ def _validate_input_smaller(value1: int, value2: int, desc: str) -> None:

 def _validate_input_sum_smaller(limit: float, desc: str, *args) -> None:  # noqa: ANN002
     if sum(args) > limit:
-        msg = (
-            f"The sum of input values for '{desc}' should be less or equal to {limit}."
-        )
+        msg = f"The sum of input values for '{desc}' should be less or equal to {limit}."
         raise ValueError(msg)