Reformatted dataprofiler/tests using black 22.3.0. #521

Merged (2 commits, Jul 11, 2022)
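The diff below is the result of running black 22.3.0 over dataprofiler/tests. Two rules from the black 22.x style account for nearly every hunk: the power operator is written without surrounding spaces when both operands are simple (`1024 ** 3` becomes `1024**3`), and a "magic" trailing comma inside a call forces one argument per line. The PR does not record the exact invocation used; as a minimal sketch, and assuming black is pinned to 22.3.0, the same transformations can be reproduced with black's `format_str` helper:

```python
# Minimal sketch (not part of the PR): reproduce the two black 22.3.0 rules
# visible in this diff. Assumes `pip install black==22.3.0`.
import black

# Rule 1: the power operator hugs simple operands, so the spaces are dropped.
src_pow = "max_allows_file_size = 1024 ** 3  # 1GB\n"
print(black.format_str(src_pow, mode=black.Mode()), end="")
# -> max_allows_file_size = 1024**3  # 1GB

# Rule 2: a trailing ("magic") comma forces the call onto one argument per line.
src_call = 'p = CharPreprocessor(max_length=5, default_label="UNKNOWN", pad_label="PAD",)\n'
print(black.format_str(src_call, mode=black.Mode()), end="")
# Output:
#     p = CharPreprocessor(
#         max_length=5,
#         default_label="UNKNOWN",
#         pad_label="PAD",
#     )
```

In practice the reformat was presumably produced from the command line (something like `black dataprofiler/tests` with that version installed), but the exact command and flags are not shown in the PR.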
4 changes: 2 additions & 2 deletions dataprofiler/tests/data_readers/test_data.py
@@ -78,7 +78,7 @@ def chunk_file(filepath, c_size):
def test_read_url_content_overflow(self, mock_request_get):
# assumed chunk size
c_size = 8192
max_allows_file_size = 1024 ** 3 # 1GB
max_allows_file_size = 1024**3 # 1GB

try:
# mock the iter_content to return just under 1GB so no error raises
@@ -112,7 +112,7 @@ def test_read_url_content_overflow(self, mock_request_get):
def test_read_url_header_overflow(self, mock_request_get):
# assumed chunk size
c_size = 8192
max_allows_file_size = 1024 ** 3 # 1GB
max_allows_file_size = 1024**3 # 1GB

# set valid content length size
content_length = 5000
4 changes: 2 additions & 2 deletions dataprofiler/tests/data_readers/test_data_utils.py
@@ -53,7 +53,7 @@ def test_file_UTF_encoding_detection(self):
def test_nth_loc_detection(self):
"""
Tests the ability for the `data_utils.find_nth_location` to detect the
nth index of a search_query in a string.
nth index of a search_query in a string.
"""
# Input args: string, query, n
# Expected results: index, occurrences
@@ -122,7 +122,7 @@ def test_nth_loc_detection(self):

def test_load_as_str_from_file(self):
"""
Tests if the load_as_str_file function can appropriately load files
Tests if the load_as_str_file function can appropriately load files
thresholded by bytes or max lines.
"""

@@ -24,7 +24,12 @@ def test_classification_report(self):
"f1-score": 1 / 2,
"support": 2,
},
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
21 changes: 16 additions & 5 deletions dataprofiler/tests/labelers/test_data_processing.py
@@ -634,7 +634,9 @@ def test_process_batch_helper(self):

def test_process(self):
preprocessor = CharPreprocessor(
max_length=5, default_label="UNKNOWN", pad_label="PAD",
max_length=5,
default_label="UNKNOWN",
pad_label="PAD",
)

label_mapping = {
@@ -1769,7 +1771,10 @@ def test_flatten_convert(self):
)
postprocessor = CharPostprocessor(flatten_separator=flatten_separator)

output_generator = preprocessor.process(test_sentences, batch_size=batch_size,)
output_generator = preprocessor.process(
test_sentences,
batch_size=batch_size,
)

# mimic model output as a sentence instead of prediction
output = [
@@ -1832,7 +1837,9 @@ def test_flatten_convert(self):
list(data[0]) for batch_data in output_generator for data in batch_data
]
output = postprocessor.process(
test_sentences, dict(pred=output), label_mapping=dict(test=0),
test_sentences,
dict(pred=output),
label_mapping=dict(test=0),
)
reconstructed_test_sentences = [
"".join(sentence) for sentence in output["pred"]
@@ -2065,7 +2072,9 @@ def test_get_parameters(self):

def test_convert_to_unstructured_format(self):
preprocessor = StructCharPreprocessor(
max_length=10, default_label="UNKNOWN", pad_label="PAD",
max_length=10,
default_label="UNKNOWN",
pad_label="PAD",
)

# test a single sentence
@@ -2114,7 +2123,9 @@ def test_convert_to_unstructured_format(self):

def test_process(self):
preprocessor = StructCharPreprocessor(
max_length=10, default_label="UNKNOWN", pad_label="PAD",
max_length=10,
default_label="UNKNOWN",
pad_label="PAD",
)

label_mapping = {
@@ -240,7 +240,10 @@ def test_check_pipeline_overlap_mismatch(self):

# make preprocess and model the same, but different from postprocessor
data_labeler.set_params(
{"model": {"default_label": "a"}, "postprocessor": {"default_label": "b"},}
{
"model": {"default_label": "a"},
"postprocessor": {"default_label": "b"},
}
)
with self.assertRaisesRegex(
RuntimeError,
36 changes: 31 additions & 5 deletions dataprofiler/tests/labelers/test_labeler_utils.py
@@ -28,7 +28,12 @@ def test_no_omit_class(self):
"f1-score": 1 / 2,
"support": 2,
},
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -65,7 +70,12 @@ def test_omit_1_class(self):
def test_omit_1_class(self):

expected_output = {
"UNKNOWN": {"precision": 0, "recall": 0, "f1-score": 0, "support": 1,},
"UNKNOWN": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 1,
},
"OTHER": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -153,8 +163,18 @@ def test_no_support_classes(self):
"f1-score": 2 / 3,
"support": 3,
},
"NO_SUPPORT": {"precision": 0, "recall": 0, "f1-score": 0, "support": 0,},
"NO_SUPPORT2": {"precision": 0, "recall": 0, "f1-score": 0, "support": 0,},
"NO_SUPPORT": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 0,
},
"NO_SUPPORT2": {
"precision": 0,
"recall": 0,
"f1-score": 0,
"support": 0,
},
"micro avg": {
"precision": 2 / 3,
"recall": 2 / 3,
@@ -218,7 +238,13 @@ def test_save_conf_mat(self, mock_dataframe):

# ideally mock out the actual contents written to file, but
# would be difficult to get this completely worked out.
expected_conf_mat = np.array([[1, 0, 1], [1, 0, 0], [0, 1, 2],])
expected_conf_mat = np.array(
[
[1, 0, 1],
[1, 0, 0],
[0, 1, 2],
]
)
expected_row_col_names = dict(
columns=["pred:PAD", "pred:UNKNOWN", "pred:OTHER"],
index=["true:PAD", "true:UNKNOWN", "true:OTHER"],
67 changes: 61 additions & 6 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -233,12 +233,44 @@ def test_mixed_categorical_col_integer_string(self):
def test_categorical_mapping(self):

df1 = pd.Series(
["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2", np.nan,]
[
"abcd",
"aa",
"abcd",
"aa",
"b",
"4",
"3",
"2",
"dfd",
"2",
np.nan,
]
)
df2 = pd.Series(
["1", "null", "ee", "NaN", "ff", "nan", "gg", "None", "aa", "b", "ee",]
[
"1",
"null",
"ee",
"NaN",
"ff",
"nan",
"gg",
"None",
"aa",
"b",
"ee",
]
)
df3 = pd.Series(
[
"NaN",
"b",
"nan",
"c",
None,
]
)
df3 = pd.Series(["NaN", "b", "nan", "c", None,])

column_profile = StructuredColProfiler(df1)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
@@ -297,7 +329,20 @@ def test_categorical_mapping(self):

def test_true_categorical_report(self):
df_categorical = pd.Series(
["a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c", "c",]
[
"a",
"a",
"a",
"b",
"b",
"b",
"b",
"c",
"c",
"c",
"c",
"c",
]
)
profile = CategoricalColumn(df_categorical.name)
profile.update(df_categorical)
@@ -338,7 +383,12 @@ def test_false_categorical_report(self):
self.assertIsNotNone(report.pop("times", None))
expected_profile = dict(
categorical=False,
statistics=dict([("unique_count", 20), ("unique_ratio", 1),]),
statistics=dict(
[
("unique_count", 20),
("unique_ratio", 1),
]
),
)
self.assertEqual(report, expected_profile)

@@ -447,7 +497,12 @@ def test_categorical_merge(self):
self.assertIsNotNone(report.pop("times", None))
expected_profile = dict(
categorical=False,
statistics=dict([("unique_count", 16), ("unique_ratio", 16 / 33),]),
statistics=dict(
[
("unique_count", 16),
("unique_ratio", 16 / 33),
]
),
)
self.assertEqual(report, expected_profile)

@@ -225,7 +225,11 @@ def test_profile(self):
min="03/10/13 15:43",
max="Mar 11, 2013",
histogram=None,
format=["%Y-%m-%d %H:%M:%S", "%m/%d/%y %H:%M", "%b %d, %Y",],
format=[
"%Y-%m-%d %H:%M:%S",
"%m/%d/%y %H:%M",
"%b %d, %Y",
],
times=defaultdict(float, {"datetime": 1.0}),
)
time_array = [float(i) for i in range(4, 0, -1)]
14 changes: 9 additions & 5 deletions dataprofiler/tests/profilers/test_float_column_profile.py
@@ -435,7 +435,7 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b):
delta = mean_b - mean_a
m_a = var_a * (count_a - 1)
m_b = var_b * (count_b - 1)
M2 = m_a + m_b + delta ** 2 * count_a * count_b / (count_a + count_b)
M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b)
return M2 / (count_a + count_b - 1)

data = np.linspace(-5, 5, 11).tolist()
@@ -668,10 +668,10 @@ def test_profiled_histogram(self):
list_data_test.append([df4, expected_histogram4])

# this data has only one unique value, overflow
df5 = pd.Series([-(10.0 ** 20)]).apply(str)
df5 = pd.Series([-(10.0**20)]).apply(str)
expected_histogram5 = {
"bin_counts": np.array([1]),
"bin_edges": np.array([-(10.0 ** 20), -(10.0 ** 20)]),
"bin_edges": np.array([-(10.0**20), -(10.0**20)]),
}
list_data_test.append([df5, expected_histogram5])

@@ -776,7 +776,7 @@ def test_histogram_with_varying_number_of_bin(self):
# this data uses large number of bins, which will be set to
# the max limit
df2 = pd.Series(
[3.195103249264023e18, 9999995.0, 9999999.0, 0.0, -(10 ** 10)]
[3.195103249264023e18, 9999995.0, 9999999.0, 0.0, -(10**10)]
).apply(str)
profiler2 = FloatColumn(df2.name)
profiler2.max_histogram_bin = 50
@@ -1063,7 +1063,11 @@ def test_profile(self):
"bin_counts": np.array([1, 1, 0, 1]),
"bin_edges": np.array([2.5, 5.0, 7.5, 10.0, 12.5]),
},
quantiles={0: 2.5075, 1: 5.005, 2: 12.4925,},
quantiles={
0: 2.5075,
1: 5.005,
2: 12.4925,
},
times=defaultdict(
float,
{
8 changes: 6 additions & 2 deletions dataprofiler/tests/profilers/test_int_column_profile.py
@@ -280,7 +280,7 @@ def batch_variance(mean_a, var_a, count_a, mean_b, var_b, count_b):
delta = mean_b - mean_a
m_a = var_a * (count_a - 1)
m_b = var_b * (count_b - 1)
M2 = m_a + m_b + delta ** 2 * count_a * count_b / (count_a + count_b)
M2 = m_a + m_b + delta**2 * count_a * count_b / (count_a + count_b)
return M2 / (count_a + count_b - 1)

data = np.linspace(-5, 5, 11).tolist()
@@ -542,7 +542,11 @@ def test_profile(self):
"bin_counts": np.array([1, 0, 1]),
"bin_edges": np.array([2.0, 10.0 / 3.0, 14.0 / 3.0, 6.0]),
},
quantiles={0: 2.002, 1: 4, 2: 5.998,},
quantiles={
0: 2.002,
1: 4,
2: 5.998,
},
times=defaultdict(
float,
{
@@ -660,7 +660,11 @@ def test_profile(self):
"bin_counts": np.array([1, 1, 1]),
"bin_edges": np.array([1.0, 2.0, 3.0, 4.0]),
},
quantiles={0: 2.0, 1: 3.0, 2: 4.0,},
quantiles={
0: 2.0,
1: 3.0,
2: 4.0,
},
num_zeros=0, # default
num_negatives=0, # default
times=defaultdict(float), # default