HOT FIX: resolving black and isort (#538)
* test

* fixing black
taylorfturner authored Jul 13, 2022
1 parent c09c756 commit fcd881d
Showing 5 changed files with 121 additions and 66 deletions.
65 changes: 43 additions & 22 deletions dataprofiler/data_readers/graph_data.py
@@ -3,21 +3,22 @@
 import networkx as nx
 from numpy import source
 
+from . import data_utils
 from .base_data import BaseData
 from .csv_data import CSVData
-from . import data_utils
 from .filepath_or_buffer import FileOrBufferHandler
 
 
 class GraphData(BaseData):
     """
     GraphData class to identify, read, and load graph data
     """
-    data_type = 'graph'
+
+    data_type = "graph"
 
     def __init__(self, input_file_path=None, options=None, data=None):
         """
-        Data class for identifying, reading, and loading graph data. Current
+        Data class for identifying, reading, and loading graph data. Current
         implementation only accepts file path as input. An options parameter is
         also passed in to specify properties of the input file.
@@ -62,12 +63,18 @@ def __init__(self, input_file_path=None, options=None, data=None):
 
         self._source_node = options.get("source_node", None)
         self._destination_node = options.get("destination_node", None)
-        self._target_keywords = options.get("target_keywords", ['target', 'destination', 'dst'])
-        self._source_keywords = options.get("source_keywords", ['source', 'src', 'origin'])
-        self._column_names = options.get("column_names", self.csv_column_names(self.input_file_path, self.options))
+        self._target_keywords = options.get(
+            "target_keywords", ["target", "destination", "dst"]
+        )
+        self._source_keywords = options.get(
+            "source_keywords", ["source", "src", "origin"]
+        )
+        self._column_names = options.get(
+            "column_names", self.csv_column_names(self.input_file_path, self.options)
+        )
         self._delimiter = options.get("delimiter", None)
         self._quotechar = options.get("quotechar", None)
-        self._header = options.get("header", 'auto')
+        self._header = options.get("header", "auto")
 
     @classmethod
     def _find_target_string_in_column(self, column_names, keyword_list):
@@ -104,8 +111,8 @@ def csv_column_names(cls, file_path, options):
         """
         column_names = []
         if options.get("header") is None:
-            return column_names
+            return column_names
 
         with FileOrBufferHandler(file_path) as csv_file:
             csv_reader = csv.reader(csv_file, delimiter=options.get("delimiter", ","))
 
@@ -115,7 +122,7 @@ def csv_column_names(cls, file_path, options):
             if row_count is options.get("header"):
                 column_names.append(row)
                 break
-            row_count+=1
+            row_count += 1
         column_names = column_names[0]
 
         # replace all whitespaces in the column names
@@ -147,23 +154,30 @@ def is_match(cls, file_path, options=None):
         has_target = True if destination_index >= 0 else False
 
         if has_target and has_source:
-            options.update(source_node = source_index)
-            options.update(destination_node = destination_index)
-            options.update(destination_list = target_keywords)
-            options.update(source_list = source_keywords)
-            options.update(column_names = column_names)
+            options.update(source_node=source_index)
+            options.update(destination_node=destination_index)
+            options.update(destination_list=target_keywords)
+            options.update(source_list=source_keywords)
+            options.update(column_names=column_names)
             return True
         return False
 
     def _format_data_networkx(self):
-        '''
+        """
         Formats the input file into a networkX graph
-        '''
+        """
         networkx_graph = nx.DiGraph()
 
         # read lines from csv
         csv_as_list = []
-        data_as_pd = data_utils.read_csv_df(self.input_file_path,self._delimiter,self._header,[],read_in_string=True,encoding=self.file_encoding)
+        data_as_pd = data_utils.read_csv_df(
+            self.input_file_path,
+            self._delimiter,
+            self._header,
+            [],
+            read_in_string=True,
+            encoding=self.file_encoding,
+        )
+        data_as_pd = data_as_pd.apply(lambda x: x.str.strip())
         csv_as_list = data_as_pd.values.tolist()
@@ -177,11 +191,18 @@ def _format_data_networkx(self):
             for column in range(0, len(csv_as_list[0])):
                 if csv_as_list[line][column] is None:
                     continue
-                if column is not self._source_node or column is not self._destination_node:
-                    attributes[self._column_names[column]] = (csv_as_list[line][column])
+                if (
+                    column is not self._source_node
+                    or column is not self._destination_node
+                ):
+                    attributes[self._column_names[column]] = csv_as_list[line][column]
                 elif column is self._source_node or column is self._destination_node:
                     networkx_graph.add_node(csv_as_list[line][column])
-            networkx_graph.add_edge(csv_as_list[line][self._source_node], csv_as_list[line][self._destination_node], **attributes)
+            networkx_graph.add_edge(
+                csv_as_list[line][self._source_node],
+                csv_as_list[line][self._destination_node],
+                **attributes
+            )
 
         # get NetworkX object from list
         return networkx_graph
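For orientation, here is a minimal usage sketch of the GraphData reader whose formatting changed above. The CSV path is hypothetical; is_match and _format_data_networkx are the methods shown in this diff, and the options mirror those used in the tests further down.

```python
from dataprofiler.data_readers.graph_data import GraphData

# Hypothetical edge-list CSV; options mirror the repo's test fixtures.
options = {"header": 0, "delimiter": ","}
if GraphData.is_match("edges.csv", options):
    # is_match fills source_node/destination_node indices into `options`,
    # which _format_data_networkx uses when building the nx.DiGraph.
    graph = GraphData("edges.csv", options=options)._format_data_networkx()
    print(graph.number_of_nodes(), graph.number_of_edges())
```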
14 changes: 8 additions & 6 deletions dataprofiler/labelers/character_level_cnn_model.py
@@ -292,8 +292,8 @@ def load_from_disk(cls, dirpath):
         # use f1 score metric
         custom_objects = {
             "F1Score": labeler_utils.F1Score(
-                num_classes=max(label_mapping.values()) + 1,
-                average='micro'),
+                num_classes=max(label_mapping.values()) + 1, average="micro"
+            ),
             "CharacterLevelCnnModel": cls,
         }
         with tf.keras.utils.custom_object_scope(custom_objects):
@@ -514,8 +514,9 @@ def encoding_function(input_str):
 
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average='micro')
-        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
+            num_classes=num_labels, average="micro"
+        )
+        metrics = {softmax_output_layer_name: ["acc", f1_score_training]}
 
         self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
@@ -575,8 +576,9 @@ def _reconstruct_model(self):
 
         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
-            num_classes=num_labels, average='micro')
-        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
+            num_classes=num_labels, average="micro"
+        )
+        metrics = {softmax_output_layer_name: ["acc", f1_score_training]}
 
         self._model.compile(loss=losses, optimizer="adam", metrics=metrics)
         self._epoch_id = 0
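The three hunks above only reflow the F1Score metric setup. As a rough sketch of the pattern being formatted, with a toy model whose layer size and loss are illustrative assumptions rather than code from this commit:

```python
import tensorflow as tf
from dataprofiler.labelers import labeler_utils

num_labels = 5
f1 = labeler_utils.F1Score(num_classes=num_labels, average="micro")
model = tf.keras.Sequential([tf.keras.layers.Dense(num_labels, activation="softmax")])
# Same shape as the compile calls above: accuracy plus micro-averaged F1.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["acc", f1])
```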
38 changes: 26 additions & 12 deletions dataprofiler/labelers/labeler_utils.py
@@ -1,11 +1,11 @@
+import logging
 import os
 import warnings
-import logging
 
 import numpy as np
 import scipy
-from sklearn.exceptions import UndefinedMetricWarning
 import tensorflow as tf
+from sklearn.exceptions import UndefinedMetricWarning
 
 from .. import dp_logging
 from .classification_report_utils import classification_report
@@ -217,31 +217,36 @@ def hide_tf_logger_warnings():
     """
     Filters out a set of warnings from the tf logger.
     """
+
     class NoV1ResourceMessageFilter(logging.Filter):
         """Removes TF2 warning for using TF1 model which has resources."""
+
         def filter(self, record):
-            msg = 'is a problem, consider rebuilding the SavedModel after ' + \
-                  'running tf.compat.v1.enable_resource_variables()'
+            msg = (
+                "is a problem, consider rebuilding the SavedModel after "
+                + "running tf.compat.v1.enable_resource_variables()"
+            )
             return msg not in record.getMessage()
 
-    tf_logger = logging.getLogger('tensorflow')
+    tf_logger = logging.getLogger("tensorflow")
     tf_logger.addFilter(NoV1ResourceMessageFilter())
 
 
-def protected_register_keras_serializable(package='Custom', name=None):
+def protected_register_keras_serializable(package="Custom", name=None):
     """
     Protects against already registered keras serializable layers. This
     ensures that if it was already registered, it will not try to register it
     again.
     """
+
     def decorator(arg):
         """Protects against double registration of a keras layer."""
         class_name = name if name is not None else arg.__name__
-        registered_name = package + '>' + class_name
+        registered_name = package + ">" + class_name
         if tf.keras.utils.get_registered_object(registered_name) is None:
             tf.keras.utils.register_keras_serializable(package, name)(arg)
         return arg
+
     return decorator
 
 
@@ -290,8 +295,16 @@ class FBetaScore(tf.keras.metrics.Metric):
     """
 
     # Modification: remove the run-time type checking for functions
-    def __init__(self, num_classes, average=None, beta=1.0, threshold=None,
-                 name="fbeta_score", dtype=None, **kwargs):
+    def __init__(
+        self,
+        num_classes,
+        average=None,
+        beta=1.0,
+        threshold=None,
+        name="fbeta_score",
+        dtype=None,
+        **kwargs,
+    ):
         super().__init__(name=name, dtype=dtype)
 
         if average not in (None, "micro", "macro", "weighted"):
@@ -442,8 +455,9 @@ class F1Score(FBetaScore):
     """
 
     # Modification: remove the run-time type checking for functions
-    def __init__(self, num_classes, average=None, threshold=None,
-                 name="f1_score", dtype=None):
+    def __init__(
+        self, num_classes, average=None, threshold=None, name="f1_score", dtype=None
+    ):
         super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
 
     def get_config(self):
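A sketch of how the protected_register_keras_serializable decorator reformatted above is typically applied; the layer class here is hypothetical, not part of this commit:

```python
import tensorflow as tf
from dataprofiler.labelers.labeler_utils import protected_register_keras_serializable

@protected_register_keras_serializable(package="Custom", name="MyDense")
class MyDense(tf.keras.layers.Dense):
    pass

# A repeat registration is silently skipped, because the decorator checks
# tf.keras.utils.get_registered_object before registering again.
protected_register_keras_serializable(package="Custom", name="MyDense")(MyDense)
```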
51 changes: 35 additions & 16 deletions dataprofiler/tests/data_readers/test_csv_graph_data.py
@@ -20,27 +20,43 @@ def setUpClass(cls):
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-positive.csv"
                 ),
-                list_nodes = ['1','2','3','4','5','6','7','8','9'],
-                list_edges = [('2', '1'),('3', '2'),('4', '2'),('5','2'),('6','2'),('7','2'),('8','2'),('9','2')],
-                options = {"header": 0, "delimiter": ","},
+                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_edges=[
+                    ("2", "1"),
+                    ("3", "2"),
+                    ("4", "2"),
+                    ("5", "2"),
+                    ("6", "2"),
+                    ("7", "2"),
+                    ("8", "2"),
+                    ("9", "2"),
+                ],
+                options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
             ),
             dict(
                 path=os.path.join(
                     test_dir, "csv/graph-differentiator-input-standard-positive.csv"
                 ),
-                list_nodes = ['1','2','3','4'],
-                list_edges = [('2', '1'),('1', '3'),('2', '4')],
-                options = {"header": 0, "delimiter": ","},
+                list_nodes=["1", "2", "3", "4"],
+                list_edges=[("2", "1"), ("1", "3"), ("2", "4")],
+                options={"header": 0, "delimiter": ","},
                 encoding="utf-8",
             ),
             dict(
-                path=os.path.join(
-                    test_dir, "csv/graph-data-input-positive-header.csv"
-                ),
-                list_nodes = ['1','2','3','4','5','6','7','8','9'],
-                list_edges = [('2', '1'),('3', '2'),('4', '2'),('5','2'),('6','2'),('7','2'),('8','2'),('9','2')],
-                options = {"header": 2, "delimiter": ","},
+                path=os.path.join(test_dir, "csv/graph-data-input-positive-header.csv"),
+                list_nodes=["1", "2", "3", "4", "5", "6", "7", "8", "9"],
+                list_edges=[
+                    ("2", "1"),
+                    ("3", "2"),
+                    ("4", "2"),
+                    ("5", "2"),
+                    ("6", "2"),
+                    ("7", "2"),
+                    ("8", "2"),
+                    ("9", "2"),
+                ],
+                options={"header": 2, "delimiter": ","},
                 encoding="utf-8",
             ),
         ]
@@ -124,7 +140,10 @@ def test_csv_column_names(self):
             "open_date_dst",
         ]
         for input_file in self.input_file_names_pos:
-            self.assertEqual(GraphData.csv_column_names(input_file["path"], input_file["options"]), column_names)
+            self.assertEqual(
+                GraphData.csv_column_names(input_file["path"], input_file["options"]),
+                column_names,
+            )
 
     # test is_match for true output w/ different options
     def test_is_graph_positive_1(self):
@@ -140,8 +159,8 @@ def test_is_graph_negative_1(self):
         Determine if the input CSV file can be automatically recognized as not being a graph w/ no options selected
         """
         for input_file in self.input_file_names_neg:
-            self.assertFalse(GraphData.is_match(input_file["path"]))
+            self.assertFalse(GraphData.is_match(input_file["path"]))
 
     # test loading data
     def test_data_loader_nodes(self):
         """
@@ -174,5 +193,5 @@ def test_data_loader_edges(self):
         self.assertTrue(all_edges_present)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
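As a hedged illustration of what test_csv_column_names above exercises: with header=2 the reader walks the CSV until row index 2 and takes it as the header row, then normalizes whitespace in the names. The fixture path below is an assumption about where test_dir resolves, and the printed output is not captured from this commit:

```python
from dataprofiler.data_readers.graph_data import GraphData

options = {"header": 2, "delimiter": ","}
# Assumed fixture location; the test itself resolves this via test_dir.
names = GraphData.csv_column_names(
    "dataprofiler/tests/data/csv/graph-data-input-positive-header.csv", options
)
print(names)  # e.g. [..., "open_date_dst"], per the expected list in the test
```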
19 changes: 9 additions & 10 deletions dataprofiler/tests/labelers/test_labeler_utils.py
@@ -1,6 +1,6 @@
+import logging
 import unittest
 from unittest import mock
-import logging
 
 import numpy as np
 import pandas as pd
@@ -272,28 +272,27 @@ def test_save_conf_mat(self, mock_dataframe):
 
 
 class TestTFFunctions(unittest.TestCase):
-
     def test_get_tf_layer_index_from_name(self):
         model = tf.keras.Sequential()
-        model.add(tf.keras.Input((1, 2), name='input'))
-        model.add(tf.keras.layers.Dense(units=4, name='dense0'))
-        model.add(tf.keras.layers.Dense(units=3, name='dense1'))
+        model.add(tf.keras.Input((1, 2), name="input"))
+        model.add(tf.keras.layers.Dense(units=4, name="dense0"))
+        model.add(tf.keras.layers.Dense(units=3, name="dense1"))
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'not a layer')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "not a layer")
         self.assertIsNone(ind)
 
         # input is not counted in the layer
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'input')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "input")
         self.assertIsNone(ind)
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense1')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "dense1")
         self.assertEqual(1, ind)
 
-        ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense0')
+        ind = labeler_utils.get_tf_layer_index_from_name(model, "dense0")
         self.assertEqual(0, ind)
 
     def test_hide_tf_logger_warnings(self):
-        logger = logging.getLogger('tensorflow')
+        logger = logging.getLogger("tensorflow")
         num_loggers = len(logger.filters)
 
         # make change and validate updated filter
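A quick sketch of the behavior test_hide_tf_logger_warnings checks, assuming each call appends one filter to the "tensorflow" logger:

```python
import logging
from dataprofiler.labelers import labeler_utils

logger = logging.getLogger("tensorflow")
before = len(logger.filters)
labeler_utils.hide_tf_logger_warnings()
# One NoV1ResourceMessageFilter instance should now be attached.
assert len(logger.filters) == before + 1
```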