Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance Numeric Data Inspection and Introduce Positive/Negative Filtering #217

Merged
merged 19 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 143 additions & 65 deletions sdgx/data_models/inspectors/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,69 +14,132 @@ class NumericInspector(Inspector):

This class is a subclass of `Inspector` and is designed to provide methods for inspecting
and analyzing numeric data. It includes methods for detecting int or float data type.

In August 2024, we introduced a new feature that will continue to judge the positivity or
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we should indicate the PR and release version here, rather than the date?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, I also have another branch in development, I'll release after merging another PR. Due to various reasons, we haven't released a new version for a long time :(

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Never mind, thanks for your work!

negativity after determining the type, thereby effectively improving the quality of synthetic
data in subsequent processing.
"""

int_columns: set = set()
"""
A set of column names that contain integer values.
"""

float_columns: set = set()
"""
A set of column names that contain float values.
"""

positive_columns: set = set()
"""
A set of column names that contain only positive numeric values.
"""

negative_columns: set = set()
"""
A set of column names that contain only negative numeric values.
"""

pos_threshold: float = 0.95
"""
The threshold proportion of positive values in a column to consider it as a positive column.
"""

negative_threshold: float = 0.95
"""
The threshold proportion of negative values in a column to consider it as a negative column.
"""

def __init__(self, *args, **kwargs):
    """Initialise the per-instance inspection state.

    All positional and keyword arguments are forwarded unchanged to the
    parent initializer.
    """
    super().__init__(*args, **kwargs)

    # Give every instance its own result sets. The class-level defaults are
    # single ``set()`` objects shared by all instances, so an in-place
    # mutation made before ``fit`` rebinds them would leak between instances.
    self.int_columns: set[str] = set()
    self.float_columns: set[str] = set()
    self.positive_columns: set[str] = set()
    self.negative_columns: set[str] = set()

    # A column is treated as integer when more than this share of its
    # numeric values has no fractional part.
    self._int_rate = 0.9

    # Row count of the most recently fitted DataFrame (0 until ``fit`` runs).
    self.df_length = 0

def _is_int_column(self, col_series: pd.Series):
def _is_int_column(self, col_series: pd.Series) -> bool:
"""
Determine whether a column of pd.DataFrame is of type int
In the original pd.DataFrame automatically updated dtype, some int types will be marked as float.
In fact, we can make an accurate result by getting the decimal part of the value.
Determine if a column contains predominantly integer values.

This method checks if the proportion of integer values in the given column
exceeds a predefined threshold.

Args:
col_series (pd.Series): One single column of the raw data.
col_series (pd.Series): The column series to be inspected.

Returns:
bool: True if the column is predominantly integer, False otherwise.
"""
# Convert the column series to numeric values, coercing errors to NaN and dropping them
numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

def is_decimal_part_zero(num: float):
"""
Is the decimal part == 0.0 ?

Args:
col_series (float): The number.
"""
try:
decimal_part = num - int(num)
except ValueError:
return None
if decimal_part == 0.0:
return True
else:
return False

# Initialize the counter for values with zero decimal part
int_cnt = 0
col_length = self.df_length

# Iterate over each value in the series
for each_val in col_series:
decimal_zer0 = is_decimal_part_zero(each_val)
# If the decimal part is zero, increment the counter and continue to the next value
if decimal_zer0 is True:
int_cnt += 1
continue
# If the decimal part is not zero or not a decimal number
# decrease the length of the series and continue to the next value
if decimal_zer0 is None:
col_length -= 1
continue

# Calculate the rate of values with zero decimal part
if col_length <= 0:
int_rate = 0
else:
int_rate = int_cnt / col_length

# Check if the rate is greater than the predefined rate
if int_rate > self._int_rate:
return True
else:
return False
# Count how many of the numeric values are integers
int_cnt = (numeric_values == numeric_values.astype(int)).sum()

# Calculate the ratio of integer values to the total numeric values
int_rate = int_cnt / len(numeric_values)

# Return True if the integer rate is greater than the predefined threshold
return int_rate > self._int_rate

def _is_positive_or_negative_column(
self, col_series: pd.Series, threshold: float, comparison_func
) -> bool:
"""
Determine if a column contains predominantly positive or negative values.

This method checks if the proportion of values that satisfy a given comparison
function exceeds a predefined threshold.

Args:
col_series (pd.Series): The column series to be inspected.
threshold (float): The proportion threshold for considering the column as positive or negative.
comparison_func (function): A function that takes a numeric value and returns a boolean.

Returns:
bool: True if the column satisfies the condition, False otherwise.
"""
# Convert the column series to numeric values, coercing errors to NaN and dropping NaN values
numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()

# Apply the comparison function to the numeric values and sum the results
count = comparison_func(numeric_values).sum()

# Calculate the proportion of values that meet the comparison criteria
proportion = count / len(numeric_values)

# Return True if the proportion meets or exceeds the threshold, otherwise False
return proportion >= threshold

def _is_positive_column(self, col_series: pd.Series) -> bool:
"""
Determine if a column contains predominantly positive values.

This method checks if the proportion of positive values in the given column
exceeds a predefined threshold.

Args:
col_series (pd.Series): The column series to be inspected.

Returns:
bool: True if the column is predominantly positive, False otherwise.
"""
return self._is_positive_or_negative_column(col_series, self.pos_threshold, lambda x: x > 0)

def _is_negative_column(self, col_series: pd.Series) -> bool:
"""
Determine if a column contains predominantly negative values.

This method checks if the proportion of negative values in the given column
exceeds a predefined threshold.

Args:
col_series (pd.Series): The column series to be inspected.

Returns:
bool: True if the column is predominantly negative, False otherwise.
"""
return self._is_positive_or_negative_column(
col_series, self.negative_threshold, lambda x: x < 0
)

def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
MooooCat marked this conversation as resolved.
Show resolved Hide resolved
"""Fit the inspector.
Expand All @@ -87,33 +150,48 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
raw_data (pd.DataFrame): Raw data
"""

# Initialize sets for integer and float columns
self.int_columns = set()
self.float_columns = set()

self.df_length = len(raw_data)
# Initialize sets for positive and negative columns
self.positive_columns = set()
self.negative_columns = set()

float_candidate = self.float_columns.union(
set(raw_data.select_dtypes(include=["float64"]).columns)
)

for candidate in float_candidate:
if self._is_int_column(raw_data[candidate]):
self.int_columns.add(candidate)
else:
self.float_columns.add(candidate)

self.int_columns = self.int_columns.union(
set(raw_data.select_dtypes(include=["int64"]).columns)
)
# Store the length of the DataFrame
self.df_length = len(raw_data)

# Iterate over all columns and determine the final data type
for col in raw_data.columns:
if raw_data[col].dtype in ["int64", "float64"]:
# float or int
if self._is_int_column(raw_data[col]):
self.int_columns.add(col)
else:
self.float_columns.add(col)

# positive? negative?
if self._is_positive_column(raw_data[col]):
self.positive_columns.add(col)
elif self._is_negative_column(raw_data[col]):
self.negative_columns.add(col)

# Mark the inspector as ready
self.ready = True

def inspect(self, *args, **kwargs) -> dict[str, Any]:
"""Inspect raw data and generate metadata."""

# Positive and negative columns should not be strictly considered as label columns
# We use the format dict to inspect and output to metadata
numeric_format: dict = {}
numeric_format["positive"] = sorted(list(self.positive_columns))
numeric_format["negative"] = sorted(list(self.negative_columns))

return {
"int_columns": list(self.int_columns),
"float_columns": list(self.float_columns),
"numeric_format": numeric_format,
}


Expand Down
1 change: 1 addition & 0 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def check_column_list(cls, value) -> Any:
datetime_columns: Set[str] = set()
const_columns: Set[str] = set()
datetime_format: Dict = defaultdict(str)
numeric_format: Dict = defaultdict(list)

# version info
version: str = "1.0"
Expand Down
11 changes: 11 additions & 0 deletions sdgx/data_processors/filter/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from __future__ import annotations

from sdgx.data_processors.base import DataProcessor


class Filter(DataProcessor):
    """
    Base class for all data filters.

    A filter applies rules to sampled data and removes the data that does not
    conform to those rules. This base class defines no behaviour of its own;
    everything is inherited from ``DataProcessor``, and concrete filters
    subclass it to implement their rules.
    """
109 changes: 109 additions & 0 deletions sdgx/data_processors/filter/positive_negative.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from __future__ import annotations

from typing import Any

import pandas as pd

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.filter.base import Filter
from sdgx.utils import logger


class PositiveNegativeFilter(Filter):
    """
    A data processor for filtering positive and negative values.

    This filter is used to ensure that values in specific columns keep the
    sign recorded in the metadata. During the reverse conversion process,
    rows that do not meet the expected positivity or negativity are removed.

    Attributes:
        int_columns (set): A set of column names containing integer values.
        float_columns (set): A set of column names containing float values.
        positive_columns (set): A set of column names that should contain positive values.
        negative_columns (set): A set of column names that should contain negative values.
    """

    int_columns: set = set()
    """
    A set of column names that contain integer values.
    """

    float_columns: set = set()
    """
    A set of column names that contain float values.
    """

    positive_columns: set = set()
    """
    A set of column names that are identified as containing positive numeric values.
    """

    negative_columns: set = set()
    """
    A set of column names that are identified as containing negative numeric values.
    """

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit method for the data filter.

        Copies the integer/float column sets and the positive/negative column
        lists out of ``metadata`` so later calls do not depend on (or mutate)
        the metadata object.

        Args:
            metadata (Metadata | None): Metadata describing the dataset. It is
                required in practice; ``None`` is rejected.
            **kwargs: Accepted for interface compatibility and ignored.

        Raises:
            ValueError: If ``metadata`` is None.
        """
        if metadata is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("PositiveNegativeFilter.fit requires a Metadata instance, got None.")

        # Record int and float columns; copy so the filter does not alias
        # the sets owned by the metadata object.
        self.int_columns = set(metadata.int_columns)
        self.float_columns = set(metadata.float_columns)

        # Record positive and negative columns. ``.get`` tolerates a plain
        # dict without these keys and never inserts keys into a defaultdict.
        numeric_format = metadata.numeric_format
        self.positive_columns = set(numeric_format.get("positive", []))
        self.negative_columns = set(numeric_format.get("negative", []))

        self.fitted = True

        # Log only once fitting has actually completed.
        logger.info("PositiveNegativeFilter Fitted.")

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert method for data filter (No Action).

        Args:
            raw_data (pd.DataFrame): The data to convert.

        Returns:
            pd.DataFrame: ``raw_data`` itself, unchanged.
        """

        logger.info("Converting data using PositiveNegativeFilter... Finished (No Action)")

        return raw_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Reverse_convert method for the pos_neg data filter.

        Drops every row that has a negative value in one of ``positive_columns``
        or a positive value in one of ``negative_columns``. Zero is accepted in
        both kinds of column. Columns named in the sets but absent from
        ``processed_data`` are ignored.

        Args:
            processed_data (pd.DataFrame): The data to filter.

        Returns:
            pd.DataFrame: The rows of ``processed_data`` that satisfy every
                sign constraint, with their original index labels.
        """
        logger.info(
            f"Data reverse-converted by PositiveNegativeFilter Start with Shape: {processed_data.shape}."
        )

        # Boolean mask marking the rows to retain; starts with every row kept.
        mask = pd.Series(True, index=processed_data.index)

        # NOTE(review): a NaN compares False under both ``>= 0`` and ``<= 0``,
        # so rows with a missing value in a checked column are dropped as
        # well - confirm this is intended.

        # Check positive_columns: keep rows whose value is zero or above.
        for col in self.positive_columns:
            if col in processed_data.columns:
                mask &= processed_data[col] >= 0

        # Check negative_columns: keep rows whose value is zero or below.
        for col in self.negative_columns:
            if col in processed_data.columns:
                mask &= processed_data[col] <= 0

        # Apply the mask to filter the data
        filtered_data = processed_data[mask]

        logger.info(
            f"Data reverse-converted by PositiveNegativeFilter with Output Shape: {filtered_data.shape}."
        )

        return filtered_data


@hookimpl
def register(manager):
    """Register ``PositiveNegativeFilter`` with ``manager`` under the name "PositiveNegativeFilter"."""
    manager.register("PositiveNegativeFilter", PositiveNegativeFilter)
1 change: 1 addition & 0 deletions sdgx/data_processors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class DataProcessorManager(Manager):
]
] + [
"ConstValueTransformer".lower(),
"PositiveNegativeFilter".lower(),
"EmptyTransformer".lower(),
"ColumnOrderTransformer".lower(),
]
Expand Down
3 changes: 3 additions & 0 deletions tests/data_models/inspector/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def test_inspector(inspector: NumericInspector, raw_data):
)
assert not inspector.float_columns
assert inspector.inspect_level == 10
assert inspector.negative_columns == set()
assert inspector.positive_columns == {"age", "hours-per-week", "fnlwgt", "educational-num"}
assert set(inspector.inspect().keys()) == {"int_columns", "float_columns", "numeric_format"}


if __name__ == "__main__":
Expand Down
Loading
Loading