Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reformatted dataprofiler/profilers/order_column_profile.py using flak… #548

Merged
merged 2 commits into from
Jul 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ repos:
hooks:
- id: isort
language_version: python3
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
exclude: (^docs/|^dataprofiler/tests/)
language_version: python3
# General fixers: format files for white spaces and trailing new lines, warn on debug statements
# https://github.com/pre-commit/pre-commit-hooks#hooks-available
- repo: https://github.com/pre-commit/pre-commit-hooks
Expand Down
40 changes: 24 additions & 16 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
"""
This is the order-column profiler module.

This profiler handles index columns.
"""

from . import BaseColumnProfiler, utils
from .profiler_options import OrderOptions


class OrderColumn(BaseColumnProfiler):
"""
Index column profile subclass of BaseColumnProfiler. Represents a column in
the dataset which is an index column.
Index column profile subclass of BaseColumnProfiler.

Represents a column in the dataset which is an index column.
"""

type = "order"

def __init__(self, name, options=None):
"""
Initialization of column base properties and itself.
Initialize column base properties and self.

:param name: Name of the data
:type name: String
Expand All @@ -34,7 +41,7 @@ def __init__(self, name, options=None):
@staticmethod
def _is_intersecting(first_value1, last_value1, first_value2, last_value2):
"""
Checks to see if the range of the datasets intersect
Check to see if the ranges of the datasets intersect.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
Expand Down Expand Up @@ -68,7 +75,7 @@ def _is_intersecting(first_value1, last_value1, first_value2, last_value2):
@staticmethod
def _is_enveloping(first_value1, last_value1, first_value2, last_value2):
"""
Checks to see if the range of the dataset 1 envelopes dataset 2
Check to see if the range of dataset 1 envelops dataset 2.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
Expand Down Expand Up @@ -105,7 +112,7 @@ def _merge_order(
piecewise2,
):
"""
Adds the order of two datasets together
Add the order of two datasets together.

:param order1: order of original dataset
:param first_value1: beginning value of original dataset
Expand Down Expand Up @@ -210,7 +217,7 @@ def _merge_order(

def __add__(self, other):
"""
Merges the properties of two OrderColumn profiles
Merge the properties of two OrderColumn profiles.

:param self: first profile
:param other: second profile
Expand Down Expand Up @@ -269,7 +276,7 @@ def profile(self):

def diff(self, other_profile, options=None):
"""
Generates the differences between the orders of two OrderColumns
Generate the differences between the orders of two OrderColumns.

:return: Dict containing the differences between orders in their
appropriate output formats
Expand All @@ -287,9 +294,10 @@ def diff(self, other_profile, options=None):
@BaseColumnProfiler._timeit(name="order")
def _get_data_order(self, df_series):
"""
Retrieves the order profile of a given data series.
Will return either: ascending, descending, constant value, or random.
Additionally, returns the first and last value of the series.
Retrieve the order profile of a given data series.

Return either: ascending, descending, constant value, or random.
Additionally, return the first and last value of the series.

:param df_series: a given column
:type df_series: pandas.core.series.Series
Expand Down Expand Up @@ -327,8 +335,9 @@ def _update_order(
self, df_series, prev_dependent_properties=None, subset_properties=None
):
"""
Updates the order profile with order information attained
from the new dataset in two steps:
Update order profile with order info obtained from new dataset.

Do this in the following two steps:
1. Get order information from input column data.
2. Merge information between existing profile and new column
order information.
Expand Down Expand Up @@ -365,8 +374,7 @@ def _update_order(

def _update_helper(self, df_series_clean, profile):
"""
Method for updating the column profile properties with a cleaned
dataset and the known null parameters of the dataset.
Update col profile properties with clean dataset and its known null parameters.

:param df_series_clean: df series with nulls removed
:type df_series_clean: pandas.core.series.Series
Expand All @@ -378,7 +386,7 @@ def _update_helper(self, df_series_clean, profile):

def update(self, df_series):
"""
Updates the column profile.
Update the column profile.

:param df_series: df series
:type df_series: pandas.core.series.Series
Expand Down
6 changes: 4 additions & 2 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1844,7 +1844,9 @@ def _get_correlation(self, clean_samples, batch_properties):
columns = self.options.correlation.columns
column_ids = list(range(len(self._profile)))
if columns is not None:
column_ids = [idx for col_name in columns for idx in self._col_name_to_idx[col_name]]
column_ids = [
idx for col_name in columns for idx in self._col_name_to_idx[col_name]
]
clean_column_ids = []
for idx in column_ids:
data_type = (
Expand All @@ -1858,7 +1860,7 @@ def _get_correlation(self, clean_samples, batch_properties):
means = {index: mean for index, mean in enumerate(batch_properties["mean"])}
data = data.fillna(value=means)
data = data[clean_column_ids]

# Update the counts/std if needed (i.e. if null rows exist)
if (len(data) != batch_properties["count"]).any():
adjusted_stds = np.sqrt(
Expand Down
4 changes: 4 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
[flake8]
max-line-length = 88
extend-ignore = E203

[isort]
multi_line_output=3
skip=dataprofiler/tests/data/,venv/
Expand Down