capitalone · taylorfturner · Jul 19, 2022 · Jul 19, 2022 · Jul 19, 2022
@@ -16,6 +16,15 @@ repos:
     hooks:
       - id: isort
         language_version: python3
+  # Flake8: complexity and style checking
+  # https://flake8.pycqa.org/en/latest/user/using-hooks.html
+  - repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
+    hooks:
+      - id: flake8
+        additional_dependencies: [flake8-docstrings]
+        exclude: (^docs/|^dataprofiler/tests/)
+        language_version: python3
   # General fixers: format files for white spaces and trailing new lines, warn on debug statements
   # https://github.com/pre-commit/pre-commit-hooks#hooks-available
   - repo: https://github.com/pre-commit/pre-commit-hooks

@@ -1,18 +1,25 @@
+"""
+This is the order-column profiler module.
+
+This profiler handles index columns.
+"""
+
 from . import BaseColumnProfiler, utils
 from .profiler_options import OrderOptions
 
 
 class OrderColumn(BaseColumnProfiler):
     """
-    Index column profile subclass of BaseColumnProfiler. Represents a column in
-    the dataset which is an index column.
+    Index column profile subclass of BaseColumnProfiler.
+
+    Represents a column in the dataset which is an index column.
     """
 
     type = "order"
 
     def __init__(self, name, options=None):
         """
-        Initialization of column base properties and itself.
+        Initialize column base properties and self.
 
         :param name: Name of the data
         :type name: String
@@ -34,7 +41,7 @@ def __init__(self, name, options=None):
     @staticmethod
     def _is_intersecting(first_value1, last_value1, first_value2, last_value2):
         """
-        Checks to see if the range of the datasets intersect
+        Check to see if the ranges of the datasets intersect.
 
         :param first_value1: beginning value of dataset 1
         :type first_value1: Integer
@@ -68,7 +75,7 @@ def _is_intersecting(first_value1, last_value1, first_value2, last_value2):
     @staticmethod
     def _is_enveloping(first_value1, last_value1, first_value2, last_value2):
         """
-        Checks to see if the range of the dataset 1 envelopes dataset 2
+        Check to see if the range of dataset 1 envelops dataset 2.
 
         :param first_value1: beginning value of dataset 1
         :type first_value1: Integer
@@ -105,7 +112,7 @@ def _merge_order(
         piecewise2,
     ):
         """
-        Adds the order of two datasets together
+        Add the order of two datasets together.
 
         :param order1: order of original dataset
         :param first_value1: beginning value of original dataset
@@ -210,7 +217,7 @@ def _merge_order(
 
     def __add__(self, other):
         """
-        Merges the properties of two OrderColumn profiles
+        Merge the properties of two OrderColumn profiles.
 
         :param self: first profile
         :param other: second profile
@@ -269,7 +276,7 @@ def profile(self):
 
     def diff(self, other_profile, options=None):
         """
-        Generates the differences between the orders of two OrderColumns
+        Generate the differences between the orders of two OrderColumns.
 
         :return: Dict containing the differences between orders in their
         appropriate output formats
@@ -287,9 +294,10 @@ def diff(self, other_profile, options=None):
     @BaseColumnProfiler._timeit(name="order")
     def _get_data_order(self, df_series):
         """
-        Retrieves the order profile of a given data series.
-        Will return either: ascending, descending, constant value, or random.
-        Additionally, returns the first and last value of the series.
+        Retrieve the order profile of a given data series.
+
+        Return either: ascending, descending, constant value, or random.
+        Additionally, return the first and last value of the series.
 
         :param df_series: a given column
         :type df_series: pandas.core.series.Series
@@ -327,8 +335,9 @@ def _update_order(
         self, df_series, prev_dependent_properties=None, subset_properties=None
     ):
         """
-        Updates the order profile with order information attained
-        from the new dataset in two steps:
+        Update order profile with order info attained from new dataset.
+
+        Do this in the following two steps:
         1. Get order information from input column data.
         2. Merge information between existing profile and new column
            order information.
@@ -365,8 +374,7 @@ def _update_order(
 
     def _update_helper(self, df_series_clean, profile):
         """
-        Method for updating the column profile properties with a cleaned
-        dataset and the known null parameters of the dataset.
+        Update col profile properties with clean dataset and its known null parameters.
 
         :param df_series_clean: df series with nulls removed
         :type df_series_clean: pandas.core.series.Series
@@ -378,7 +386,7 @@ def _update_helper(self, df_series_clean, profile):
 
     def update(self, df_series):
         """
-        Updates the column profile.
+        Update the column profile.
 
         :param df_series: df series
         :type df_series: pandas.core.series.Series

@@ -1844,7 +1844,9 @@ def _get_correlation(self, clean_samples, batch_properties):
         columns = self.options.correlation.columns
         column_ids = list(range(len(self._profile)))
         if columns is not None:
-            column_ids = [idx for col_name in columns for idx in self._col_name_to_idx[col_name]]
+            column_ids = [
+                idx for col_name in columns for idx in self._col_name_to_idx[col_name]
+            ]
         clean_column_ids = []
         for idx in column_ids:
             data_type = (
@@ -1858,7 +1860,7 @@ def _get_correlation(self, clean_samples, batch_properties):
         means = {index: mean for index, mean in enumerate(batch_properties["mean"])}
         data = data.fillna(value=means)
         data = data[clean_column_ids]
-        
+
         # Update the counts/std if needed (i.e. if null rows or exist)
         if (len(data) != batch_properties["count"]).any():
             adjusted_stds = np.sqrt(

@@ -1,3 +1,7 @@
+[flake8]
+max-line-length = 88
+extend-ignore = E203
+
 [isort]
 multi_line_output=3
 skip=dataprofiler/tests/data/,venv/