HOT FIX: resolving black and isort (#538)
* test

* fixing black
taylorfturner authored Jul 13, 2022
1 parent c09c756 commit fcd881d
Showing 5 changed files with 121 additions and 66 deletions.
65 changes: 43 additions & 22 deletions dataprofiler/data_readers/graph_data.py
@@ -3,21 +3,22 @@
 import networkx as nx
 from numpy import source
 
+from . import data_utils
 from .base_data import BaseData
 from .csv_data import CSVData
-from . import data_utils
 from .filepath_or_buffer import FileOrBufferHandler
 
 
 class GraphData(BaseData):
     """
     GraphData class to identify, read, and load graph data
     """
-    data_type = 'graph'
+
+    data_type = "graph"
 
     def __init__(self, input_file_path=None, options=None, data=None):
         """
-        Data class for identifying, reading, and loading graph data. Current
+        Data class for identifying, reading, and loading graph data. Current
         implementation only accepts file path as input. An options parameter is
         also passed in to specify properties of the input file.
@@ -62,12 +63,18 @@ def __init__(self, input_file_path=None, options=None, data=None):
 
         self._source_node = options.get("source_node", None)
         self._destination_node = options.get("destination_node", None)
-        self._target_keywords = options.get("target_keywords", ['target', 'destination', 'dst'])
-        self._source_keywords = options.get("source_keywords", ['source', 'src', 'origin'])
-        self._column_names = options.get("column_names", self.csv_column_names(self.input_file_path, self.options))
+        self._target_keywords = options.get(
+            "target_keywords", ["target", "destination", "dst"]
+        )
+        self._source_keywords = options.get(
+            "source_keywords", ["source", "src", "origin"]
+        )
+        self._column_names = options.get(
+            "column_names", self.csv_column_names(self.input_file_path, self.options)
+        )
         self._delimiter = options.get("delimiter", None)
         self._quotechar = options.get("quotechar", None)
-        self._header = options.get("header", 'auto')
+        self._header = options.get("header", "auto")
 
     @classmethod
     def _find_target_string_in_column(self, column_names, keyword_list):
@@ -104,8 +111,8 @@ def csv_column_names(cls, file_path, options):
         """
         column_names = []
         if options.get("header") is None:
-            return column_names
+            return column_names
 
         with FileOrBufferHandler(file_path) as csv_file:
             csv_reader = csv.reader(csv_file, delimiter=options.get("delimiter", ","))
 
@@ -115,7 +122,7 @@ def csv_column_names(cls, file_path, options):
             if row_count is options.get("header"):
                 column_names.append(row)
                 break
-            row_count+=1
+            row_count += 1
         column_names = column_names[0]
 
         # replace all whitespaces in the column names
@@ -147,23 +154,30 @@ def is_match(cls, file_path, options=None):
         has_target = True if destination_index >= 0 else False
 
         if has_target and has_source:
-            options.update(source_node = source_index)
-            options.update(destination_node = destination_index)
-            options.update(destination_list = target_keywords)
-            options.update(source_list = source_keywords)
-            options.update(column_names = column_names)
+            options.update(source_node=source_index)
+            options.update(destination_node=destination_index)
+            options.update(destination_list=target_keywords)
+            options.update(source_list=source_keywords)
+            options.update(column_names=column_names)
             return True
         return False
 
     def _format_data_networkx(self):
-        '''
+        """
         Formats the input file into a networkX graph
-        '''
+        """
         networkx_graph = nx.DiGraph()
 
         # read lines from csv
         csv_as_list = []
-        data_as_pd = data_utils.read_csv_df(self.input_file_path,self._delimiter,self._header,[],read_in_string=True,encoding=self.file_encoding)
+        data_as_pd = data_utils.read_csv_df(
+            self.input_file_path,
+            self._delimiter,
+            self._header,
+            [],
+            read_in_string=True,
+            encoding=self.file_encoding,
+        )
+        data_as_pd = data_as_pd.apply(lambda x: x.str.strip())
         csv_as_list = data_as_pd.values.tolist()
@@ -177,11 +191,18 @@ def _format_data_networkx(self):
             for column in range(0, len(csv_as_list[0])):
                 if csv_as_list[line][column] is None:
                     continue
-                if column is not self._source_node or column is not self._destination_node:
-                    attributes[self._column_names[column]] = (csv_as_list[line][column])
+                if (
+                    column is not self._source_node
+                    or column is not self._destination_node
+                ):
+                    attributes[self._column_names[column]] = csv_as_list[line][column]
                 elif column is self._source_node or column is self._destination_node:
                     networkx_graph.add_node(csv_as_list[line][column])
-            networkx_graph.add_edge(csv_as_list[line][self._source_node], csv_as_list[line][self._destination_node], **attributes)
+            networkx_graph.add_edge(
+                csv_as_list[line][self._source_node],
+                csv_as_list[line][self._destination_node],
+                **attributes
+            )
 
         # get NetworkX object from list
         return networkx_graph
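For orientation, here is a minimal usage sketch of the GraphData reader whose formatting changed above. The CSV path is hypothetical; is_match and _format_data_networkx are the methods shown in this diff, and the options mirror those used in the tests further down.

```python
from dataprofiler.data_readers.graph_data import GraphData

# Hypothetical edge-list CSV; options mirror the repo's test fixtures.
options = {"header": 0, "delimiter": ","}
if GraphData.is_match("edges.csv", options):
    # is_match fills source_node/destination_node indices into `options`,
    # which _format_data_networkx uses when building the nx.DiGraph.
    graph = GraphData("edges.csv", options=options)._format_data_networkx()
    print(graph.number_of_nodes(), graph.number_of_edges())
```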
14 changes: 8 additions & 6 deletions dataprofiler/labelers/character_level_cnn_model.py
@@ -292,8 +292,8 @@ def load_from_disk(cls, dirpath):
         # use f1 score metric
         custom_objects = {
             "F1Score": labeler_utils.F1Score(
-                num_classes=max(label_mapping.values()) + 1,
-                average='micro'),
+                num_classes=max(label_mapping.values()) + 1, average="micro"
+            ),
             "CharacterLevelCnnModel": cls,
         }
         with tf.keras.utils.custom_object_scope(custom_objects):
@@ -514,8 +514,9 @@ def encoding_function(input_str):
 
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average='micro')
-        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
+            num_classes=num_labels, average="micro"
+        )
+        metrics = {softmax_output_layer_name: ["acc", f1_score_training]}
 
         self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
@@ -575,8 +576,9 @@ def _reconstruct_model(self):
 
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average='micro')
-        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
+            num_classes=num_labels, average="micro"
+        )
+        metrics = {softmax_output_layer_name: ["acc", f1_score_training]}
 
         self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
         self._epoch_id = 0
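The three hunks above only reflow the F1Score metric setup. As a rough sketch of the pattern being formatted, with a toy model whose layer size and loss are illustrative assumptions rather than code from this commit:

```python
import tensorflow as tf
from dataprofiler.labelers import labeler_utils

num_labels = 5
f1 = labeler_utils.F1Score(num_classes=num_labels, average="micro")
model = tf.keras.Sequential([tf.keras.layers.Dense(num_labels, activation="softmax")])
# Same shape as the compile calls above: accuracy plus micro-averaged F1.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["acc", f1])
```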
38 changes: 26 additions & 12 deletions dataprofiler/labelers/labeler_utils.py
@@ -1,11 +1,11 @@
+import logging
 import os
 import warnings
-import logging
 
 import numpy as np
 import scipy
-from sklearn.exceptions import UndefinedMetricWarning
 import tensorflow as tf
+from sklearn.exceptions import UndefinedMetricWarning
 
 from .. import dp_logging
 from .classification_report_utils import classification_report
@@ -217,31 +217,36 @@ def hide_tf_logger_warnings():
     """
     Filters out a set of warnings from the tf logger.
     """
+
     class NoV1ResourceMessageFilter(logging.Filter):
         """Removes TF2 warning for using TF1 model which has resources."""
+
         def filter(self, record):
-            msg = 'is a problem, consider rebuilding the SavedModel after ' + \
-                  'running tf.compat.v1.enable_resource_variables()'
+            msg = (
+                "is a problem, consider rebuilding the SavedModel after "
+                + "running tf.compat.v1.enable_resource_variables()"
+            )
             return msg not in record.getMessage()
 
-    tf_logger = logging.getLogger('tensorflow')
+    tf_logger = logging.getLogger("tensorflow")
     tf_logger.addFilter(NoV1ResourceMessageFilter())
 
 
-def protected_register_keras_serializable(package='Custom', name=None):
+def protected_register_keras_serializable(package="Custom", name=None):
     """
     Protects against already registered keras serializable layers. This
     ensures that if it was already registered, it will not try to register it
     again.
     """
+
     def decorator(arg):
         """Protects against double registration of a keras layer."""
         class_name = name if name is not None else arg.__name__
-        registered_name = package + '>' + class_name
+        registered_name = package + ">" + class_name
         if tf.keras.utils.get_registered_object(registered_name) is None:
             tf.keras.utils.register_keras_serializable(package, name)(arg)
         return arg
+
     return decorator
 
 
@@ -290,8 +295,16 @@ class FBetaScore(tf.keras.metrics.Metric):
     """
 
     # Modification: remove the run-time type checking for functions
-    def __init__(self, num_classes, average=None, beta=1.0, threshold=None,
-                 name="fbeta_score", dtype=None, **kwargs):
+    def __init__(
+        self,
+        num_classes,
+        average=None,
+        beta=1.0,
+        threshold=None,
+        name="fbeta_score",
+        dtype=None,
+        **kwargs,
+    ):
         super().__init__(name=name, dtype=dtype)
 
         if average not in (None, "micro", "macro", "weighted"):
@@ -442,8 +455,9 @@ class F1Score(FBetaScore):
     """
 
     # Modification: remove the run-time type checking for functions
-    def __init__(self, num_classes, average=None, threshold=None,
-                 name="f1_score", dtype=None):
+    def __init__(
+        self, num_classes, average=None, threshold=None, name="f1_score", dtype=None
+    ):
         super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
 
     def get_config(self):
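A sketch of how the protected_register_keras_serializable decorator reformatted above is typically applied; the layer class here is hypothetical, not part of this commit:

```python
import tensorflow as tf
from dataprofiler.labelers.labeler_utils import protected_register_keras_serializable

@protected_register_keras_serializable(package="Custom", name="MyDense")
class MyDense(tf.keras.layers.Dense):
    pass

# A repeat registration is silently skipped, because the decorator checks
# tf.keras.utils.get_registered_object before registering again.
protected_register_keras_serializable(package="Custom", name="MyDense")(MyDense)
```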
51 changes: 35 additions & 16 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
@@ -20,27 +20,43 @@ def setUpClass(cls):
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-positive.csv"
                 ),
-                list_nodes = ['1','2','3','4','5','6','7','8','9'],
-                list_edges = [('2', '1'),('3', '2'),('4', '2'),('5','2'),('6','2'),('7','2'),('8','2'),('9','2')],
-                options = {"header": 0, "delimiter": ","},
+                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_edges=[
+                    ("2", "1"),
+                    ("3", "2"),
+                    ("4", "2"),
+                    ("5", "2"),
+                    ("6", "2"),
+                    ("7", "2"),
+                    ("8", "2"),
+                    ("9", "2"),
+                ],
+                options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
             ),
             dict(
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-standard-positive.csv"
                 ),
-                list_nodes = ['1','2','3','4'],
-                list_edges = [('2', '1'),('1', '3'),('2', '4')],
-                options = {"header": 0, "delimiter": ","},
+                list_nodes=["1", "2", "3", "4"],
+                list_edges=[("2", "1"), ("1", "3"), ("2", "4")],
+                options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
             ),
             dict(
-                path=os.path.join(
-                    test_dir, "csv/graph-data-input-positive-header.csv"
-                ),
-                list_nodes = ['1','2','3','4','5','6','7','8','9'],
-                list_edges = [('2', '1'),('3', '2'),('4', '2'),('5','2'),('6','2'),('7','2'),('8','2'),('9','2')],
-                options = {"header": 2, "delimiter": ","},
+                path=os.path.join(test_dir, "csv/graph-data-input-positive-header.csv"),
+                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_edges=[
+                    ("2", "1"),
+                    ("3", "2"),
+                    ("4", "2"),
+                    ("5", "2"),
+                    ("6", "2"),
+                    ("7", "2"),
+                    ("8", "2"),
+                    ("9", "2"),
+                ],
+                options={"header": 2, "delimiter": ","},
                 encoding="utf-8",
             ),
         ]
@@ -124,7 +140,10 @@ def test_csv_column_names(self):
             "open_date_dst",
         ]
         for input_file in self.input_file_names_pos:
-            self.assertEqual(GraphData.csv_column_names(input_file["path"], input_file["options"]), column_names)
+            self.assertEqual(
+                GraphData.csv_column_names(input_file["path"], input_file["options"]),
+                column_names,
+            )
 
     # test is_match for true output w/ different options
     def test_is_graph_positive_1(self):
@@ -140,8 +159,8 @@ def test_is_graph_negative_1(self):
         Determine if the input CSV file can be automatically recognized as not being a graph w/ no options selected
         """
         for input_file in self.input_file_names_neg:
-            self.assertFalse(GraphData.is_match(input_file["path"]))
+            self.assertFalse(GraphData.is_match(input_file["path"]))
 
     # test loading data
     def test_data_loader_nodes(self):
         """
@@ -174,5 +193,5 @@ def test_data_loader_edges(self):
         self.assertTrue(all_edges_present)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
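As a hedged illustration of what test_csv_column_names above exercises: with header=2 the reader walks the CSV until row index 2 and takes it as the header row, then normalizes whitespace in the names. The fixture path below is an assumption about where test_dir resolves, and the printed output is not captured from this commit:

```python
from dataprofiler.data_readers.graph_data import GraphData

options = {"header": 2, "delimiter": ","}
# Assumed fixture location; the test itself resolves this via test_dir.
names = GraphData.csv_column_names(
    "dataprofiler/tests/data/csv/graph-data-input-positive-header.csv", options
)
print(names)  # e.g. [..., "open_date_dst"], per the expected list in the test
```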
19 changes: 9 additions & 10 deletions dataprofiler/tests/labelers/test_labeler_utils.py
@@ -1,6 +1,6 @@
+import logging
 import unittest
 from unittest import mock
-import logging
 
 import numpy as np
 import pandas as pd
@@ -272,28 +272,27 @@ def test_save_conf_mat(self, mock_dataframe):
 
 
 class TestTFFunctions(unittest.TestCase):
-
     def test_get_tf_layer_index_from_name(self):
         model = tf.keras.Sequential()
-        model.add(tf.keras.Input((1, 2), name='input'))
-        model.add(tf.keras.layers.Dense(units=4, name='dense0'))
-        model.add(tf.keras.layers.Dense(units=3, name='dense1'))
+        model.add(tf.keras.Input((1, 2), name="input"))
+        model.add(tf.keras.layers.Dense(units=4, name="dense0"))
+        model.add(tf.keras.layers.Dense(units=3, name="dense1"))
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'not a layer')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "not a layer")
         self.assertIsNone(ind)
 
         # input is not counted in the layer
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'input')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "input")
         self.assertIsNone(ind)
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense1')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "dense1")
         self.assertEqual(1, ind)
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense0')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "dense0")
         self.assertEqual(0, ind)
 
     def test_hide_tf_logger_warnings(self):
-        logger = logging.getLogger('tensorflow')
+        logger = logging.getLogger("tensorflow")
         num_loggers = len(logger.filters)
 
         # make change and validate updated filter
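A quick sketch of the behavior test_hide_tf_logger_warnings checks, assuming each call appends one filter to the "tensorflow" logger:

```python
import logging
from dataprofiler.labelers import labeler_utils

logger = logging.getLogger("tensorflow")
before = len(logger.filters)
labeler_utils.hide_tf_logger_warnings()
# One NoV1ResourceMessageFilter instance should now be attached.
assert len(logger.filters) == before + 1
```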