
Updated setup.cfg mypy flags and resolved related errors. #703

Merged
merged 31 commits, Nov 9, 2022
Commits
a74ef96
updated setup.cfg mypy flags and resolved errors
Sanketh7 Nov 1, 2022
8b1d198
Merge branch 'main' into mypy-flags
Sanketh7 Nov 1, 2022
cb58600
Merge branch 'main' into mypy-flags
Sanketh7 Nov 2, 2022
1eb64ae
updated return type of _is_each_row_float
Sanketh7 Nov 2, 2022
0ad2a38
updated mypy hook to include numpy as a dependency and fixed relevant…
Sanketh7 Nov 2, 2022
0313deb
updated _is_each_row_float
Sanketh7 Nov 2, 2022
2d5307f
updated _get_data_as_records
Sanketh7 Nov 2, 2022
82cbf42
Merge branch 'main' into mypy-flags
taylorfturner Nov 3, 2022
81f9f36
clean up
taylorfturner Nov 3, 2022
c1b2bd1
Merge branch 'main' into mypy-flags
taylorfturner Nov 3, 2022
3c9aaa3
updated evaluate_accuracy types
Sanketh7 Nov 3, 2022
1de9958
removed float cast in biased_skew
Sanketh7 Nov 3, 2022
cc8224e
typed self.match_count
Sanketh7 Nov 4, 2022
2538ea3
updated estimate_stats_from_histogram type
Sanketh7 Nov 4, 2022
b737249
Merge branch 'main' into mypy-flags
Sanketh7 Nov 4, 2022
60a2202
update biased skewness to Union[float, np.float64]
Sanketh7 Nov 4, 2022
fde3a97
updated _correct_bias_skewness to return Union[float, np.float64]
Sanketh7 Nov 4, 2022
481a8ec
updated biased kurtosis to be Union[float, np.float64]
Sanketh7 Nov 4, 2022
9a261e1
added generics to AutoSubRegistrationMeta
Sanketh7 Nov 4, 2022
06762be
Merge branch 'main' into mypy-flags
Sanketh7 Nov 7, 2022
25094c2
update np_type_to_type return type
Sanketh7 Nov 7, 2022
1ee80eb
changed float to float64 where needed
Sanketh7 Nov 7, 2022
a9d097b
updated _estimate_mode_from_histogram to not use Union
Sanketh7 Nov 8, 2022
d7ee476
Merge branch 'main' into mypy-flags
Sanketh7 Nov 8, 2022
841adf5
return 0.0 instead of 0
Sanketh7 Nov 8, 2022
9c2d636
Merge branch 'main' into mypy-flags
Sanketh7 Nov 9, 2022
78f57fa
revert AutoSubRegistrationMeta changes
Sanketh7 Nov 9, 2022
ffbcb43
Update base_model.py
taylorfturner Nov 9, 2022
e417189
Update base_model.py
taylorfturner Nov 9, 2022
ec2745b
isort fix
taylorfturner Nov 9, 2022
e6399be
DS_Store fix
taylorfturner Nov 9, 2022
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/avro_data.py
@@ -94,7 +94,7 @@ def is_match(
if data_utils.is_stream_buffer(file_path):
starting_location = file_path.tell()

is_valid_avro = fastavro.is_avro(file_path)
is_valid_avro: bool = fastavro.is_avro(file_path)

# return to original position in stream
if data_utils.is_stream_buffer(file_path):
15 changes: 9 additions & 6 deletions dataprofiler/data_readers/json_data.py
@@ -9,6 +9,7 @@
import pandas as pd
from six import StringIO

from .._typing import JSONType
from . import data_utils
from .base_data import BaseData
from .filepath_or_buffer import FileOrBufferHandler
@@ -236,36 +237,38 @@ def _get_data_as_flattened_dataframe(self, json_lines):

return data

def _load_data_from_str(self, data_as_str: str) -> List:
def _load_data_from_str(self, data_as_str: str) -> JSONType:
"""
Load the data from a string.

:param data_as_str: data in string format.
:type data_as_str: str
:return: dict
:return: JSONType
"""
data: JSONType
try:
data = json.loads(data_as_str)
except json.JSONDecodeError:
data = data_utils.data_generator(data_as_str.splitlines())
data_generator = data_utils.data_generator(data_as_str.splitlines())
data = data_utils.read_json(
data_generator=data,
data_generator=data_generator,
selected_columns=self.selected_keys,
read_in_string=False,
)
return data

def _load_data_from_file(self, input_file_path: str) -> List:
def _load_data_from_file(self, input_file_path: str) -> JSONType:
"""
Load the data from a file.

:param input_file_path: file path to file being loaded.
:type input_file_path: str
:return:
:return: JSONType
"""
with FileOrBufferHandler(
input_file_path, "r", encoding=self.file_encoding
) as input_file:
data: JSONType
try:
data = json.load(input_file)
except (json.JSONDecodeError, UnicodeDecodeError):
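The diff above imports JSONType from dataprofiler's _typing module but does not show its definition. A minimal sketch of such an alias and how it pins the result of json.loads (which mypy otherwise sees as Any) might look like this; the project's real JSONType may be defined differently:

```python
import json
from typing import Any, Dict, List, Union

# Hypothetical JSONType alias; dataprofiler's real _typing.JSONType may differ.
JSONType = Union[str, int, float, bool, None, List[Any], Dict[str, Any]]


def load_data_from_str(data_as_str: str) -> JSONType:
    # json.loads returns Any, so the annotation on `data` pins the type
    # for mypy, mirroring the `data: JSONType` declaration in the diff.
    data: JSONType = json.loads(data_as_str)
    return data


print(load_data_from_str('{"a": [1, 2]}'))
```

Annotating the local variable up front is what lets both the try and except branches assign to it without a type conflict.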
4 changes: 2 additions & 2 deletions dataprofiler/data_readers/structured_mixins.py
@@ -72,12 +72,12 @@ def _get_data_as_df(self, data: pd.DataFrame) -> pd.DataFrame:
def _get_data_as_records(self, data: Any) -> List[str]:
"""Return data records."""
records_per_line = min(len(data), self.SAMPLES_PER_LINE_DEFAULT)
data = [
data_ = [
str(
"\n".join(data[i * records_per_line : (i + 1) * records_per_line])
.encode("UTF-8")
.decode()
)
for i in range((len(data) + records_per_line - 1) // records_per_line)
]
return data
return data_
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_data_labeler.py
@@ -463,7 +463,7 @@ def _load_parameters(dirpath: str, load_options: Dict = None) -> Dict[str, Dict]
load_options = {}

with open(os.path.join(dirpath, "data_labeler_parameters.json")) as fp:
params = json.load(fp)
params: Dict[str, Dict] = json.load(fp)

if "model_class" in load_options:
model_class = load_options.get("model_class")
@@ -677,7 +677,7 @@ def load_with_components(
data_labeler.set_preprocessor(preprocessor)
data_labeler.set_model(model)
data_labeler.set_postprocessor(postprocessor)
return data_labeler
return cast(BaseDataLabeler, data_labeler)

def _save_model(self, dirpath: str) -> None:
"""
@@ -914,4 +914,4 @@ def load_with_components(
data_labeler.set_preprocessor(preprocessor)
data_labeler.set_model(model)
data_labeler.set_postprocessor(postprocessor)
return data_labeler
return cast(TrainableDataLabeler, data_labeler)
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
@@ -21,7 +21,7 @@ def __new__(
cls, clsname, bases, attrs
)
new_class._register_subclass()
return new_class
return cast(AutoSubRegistrationMeta, new_class)
Contributor:

This might be better as a T generic, because the returned class should really keep the original input type. Otherwise, would this cast convert anything using this metaclass to AutoSubRegistrationMeta later in the code?

Is that why loading has the issue as well?

Contributor Author:

The issue is that new_class needs to be typed as Any initially because otherwise mypy complains that ._register_subclass() doesn't exist. I'm thinking we could also do:

new_class: AutoSubRegistrationMeta = super(AutoSubRegistrationMeta, cls).__new__(
    cls, clsname, bases, attrs
)
new_class._register_subclass() # type: ignore
return new_class

which does pass mypy.

Contributor:

I agree that new_class: AutoSubRegistrationMeta = ... would not work either. However, I was wondering if there's a generic approach that could fix this, allowing us to do new_class: T = ... where T is the cls or something of the like. If it is too complex, fair. Either way, cast(AutoSubRegistrationMeta, new_class) doesn't feel right to me; shouldn't it be some cast(T, new_class), since the output is not literally an AutoSubRegistrationMeta?

Contributor Author:

T = TypeVar("T", bound="AutoSubRegistrationMeta")


class AutoSubRegistrationMeta(abc.ABCMeta):
    """For registering subclasses."""

    def __new__(
        cls: Type[T], clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
    ) -> T:
        """Create auto registration object and return new class."""
        new_class: T = super(AutoSubRegistrationMeta, cls).__new__(
            cls, clsname, bases, attrs
        )
        new_class._register_subclass()  # type: ignore
        return new_class

Was able to get it working with this change (which I just committed). Couldn't get around the ._register_subclass() line because that's done dynamically.
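The pattern above can be exercised end to end. In this runnable stand-in, a plain registry dict replaces the project's dynamic _register_subclass hook, so it is a sketch of the TypeVar-bound metaclass technique rather than dataprofiler's actual code:

```python
import abc
from typing import Dict, Tuple, Type, TypeVar

T = TypeVar("T", bound="AutoSubRegistrationMeta")


class AutoSubRegistrationMeta(abc.ABCMeta):
    """For registering subclasses."""

    # Stand-in for the dynamic _register_subclass machinery.
    registry: Dict[str, type] = {}

    def __new__(
        cls: Type[T], clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
    ) -> T:
        """Create auto registration object and return new class."""
        new_class: T = super().__new__(cls, clsname, bases, attrs)  # type: ignore
        AutoSubRegistrationMeta.registry[clsname] = new_class
        return new_class


class BaseModel(metaclass=AutoSubRegistrationMeta):
    pass


class CharModel(BaseModel):
    pass


# Every class built with the metaclass is recorded automatically.
print(sorted(AutoSubRegistrationMeta.registry))
```

Binding T to the metaclass means each call site gets back the concrete cls it passed in, which is what the cast(AutoSubRegistrationMeta, ...) version lost.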



class BaseModel(object, metaclass=abc.ABCMeta):
3 changes: 2 additions & 1 deletion dataprofiler/labelers/labeler_utils.py
@@ -79,7 +79,7 @@ def evaluate_accuracy(
predicted_entities_in_index: List[List[int]],
true_entities_in_index: List[List[int]],
num_labels: int,
entity_rev_dict: Dict,
entity_rev_dict: Dict[int, Any],
Contributor:

I believe the Any is required to be str

Contributor Author:

Makes sense. Made the change.

verbose: bool = True,
omitted_labels: Tuple[str, ...] = ("PAD", "UNKNOWN"),
confusion_matrix_file: str = None,
@@ -125,6 +125,7 @@ def evaluate_accuracy(
true_labels_flatten = np.hstack(true_labels_padded) # type: ignore
predicted_labels_flatten = np.hstack(predicted_entities_in_index)

all_labels: List[str] = []
if entity_rev_dict:
all_labels = [entity_rev_dict[key] for key in sorted(entity_rev_dict.keys())]

1 change: 1 addition & 0 deletions dataprofiler/profilers/base_column_profilers.py
@@ -267,6 +267,7 @@ def __init__(self, name: Optional[str]) -> None:
# Number of values that match the column type. eg. how many floats match
# in the float column
self.match_count: int = 0
self.sample_size: int # inherited from BaseColumnProfiler

def _update_column_base_properties(self, profile: Dict) -> None:
"""
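The self.sample_size: int line above is an annotation-only declaration: it tells mypy the attribute's type without assigning a value, which is useful when the attribute is populated elsewhere (here, by the inherited BaseColumnProfiler). A small sketch of the runtime behavior:

```python
class Profile:
    def __init__(self) -> None:
        self.match_count: int = 0  # annotated AND assigned
        self.sample_size: int      # annotation only: no attribute is created


p = Profile()
# The annotation-only line does not create the attribute at runtime;
# it only informs the type checker.
print(p.match_count, hasattr(p, "sample_size"))
```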
6 changes: 3 additions & 3 deletions dataprofiler/profilers/float_column_profile.py
@@ -3,7 +3,7 @@

import copy
import re
from typing import Dict, List, Optional
from typing import Dict, Optional

import numpy as np
import pandas as pd
@@ -285,7 +285,7 @@ def _get_float_precision(
return subset_precision

@classmethod
def _is_each_row_float(cls, df_series: pd.Series) -> List[bool]:
def _is_each_row_float(cls, df_series: pd.Series) -> pd.Series[bool]:
"""
Determine if each value in a dataframe is a float.

@@ -297,7 +297,7 @@ def _is_each_row_float(cls, df_series: pd.Series) -> List[bool]:
:param df_series: series of values to evaluate
:type df_series: pandas.core.series.Series
:return: is_float_col
:rtype: list
:rtype: pandas.Series[bool]
"""
if len(df_series) == 0:
return list()
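One caveat with the pd.Series[bool] annotation above: subscripting pd.Series is not guaranteed to work at runtime on older pandas versions, so deferring annotation evaluation keeps it safe. A hedged sketch (the per-value check here is simplified; the profiler's actual float test is more involved):

```python
from __future__ import annotations  # defer evaluation of pd.Series[bool]

import pandas as pd


def is_each_row_float(df_series: pd.Series) -> pd.Series[bool]:
    """Return a boolean Series marking which values parse as floats."""

    def is_float(x: object) -> bool:
        # Simplified stand-in for the profiler's float-matching logic.
        try:
            float(str(x))
            return True
        except ValueError:
            return False

    return df_series.apply(is_float)


print(is_each_row_float(pd.Series(["1.5", "abc", "2"])).tolist())
```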
8 changes: 4 additions & 4 deletions dataprofiler/profilers/graph_profiler.py
@@ -4,7 +4,7 @@
import pickle
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Union
from typing import Dict, List, Optional, Tuple, Union, cast

import networkx as nx
import numpy as np
@@ -330,12 +330,12 @@ def _update_categorical_distribution(
@BaseColumnProfiler._timeit(name="num_nodes")
def _get_num_nodes(self, graph: nx.Graph) -> int:
"""Compute the number of nodes."""
return graph.number_of_nodes()
return cast(int, graph.number_of_nodes())

@BaseColumnProfiler._timeit(name="num_edges")
def _get_num_edges(self, graph: nx.Graph) -> int:
"""Compute the number of edges."""
return graph.number_of_edges()
return cast(int, graph.number_of_edges())

@BaseColumnProfiler._timeit(name="categorical_attributes")
def _get_categorical_attributes(self, graph: nx.Graph) -> List[str]:
@@ -362,7 +362,7 @@ def _get_global_max_component_size(self, graph: nx.Graph) -> int:
nx.connected_components(graph), key=len, reverse=True
)
largest_component: nx.Graph = graph.subgraph(graph_connected_components[0])
return largest_component.size()
return cast(int, largest_component.size())

@BaseColumnProfiler._timeit(name="continuous_distribution")
def _get_continuous_distribution(
7 changes: 2 additions & 5 deletions dataprofiler/profilers/histogram_utils.py
@@ -7,11 +7,8 @@
from typing import List, Optional, Tuple, Union

import numpy as np
from numpy.lib.histograms import ( # type: ignore
_get_outer_edges,
_hist_bin_selectors,
_unsigned_subtract,
)
from numpy.lib.histograms import _get_outer_edges # type: ignore
from numpy.lib.histograms import _hist_bin_selectors, _unsigned_subtract


def _get_bin_edges(
46 changes: 23 additions & 23 deletions dataprofiler/profilers/numerical_column_stats.py
@@ -6,7 +6,7 @@
import copy
import itertools
import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

import numpy as np
import pandas as pd
@@ -399,7 +399,7 @@ def mean(self) -> float:
"""Return mean value."""
if self.match_count == 0:
return 0
return float(self.sum) / self.match_count
return float(self.sum) / float(self.match_count)
Contributor:

mypy complains if self.match_count isn't a float?

Contributor Author:

Actually, now that I look at the code again, I don't see self.match_count getting initialized anywhere. This would explain why it's being typed as Any (thus requiring some sort of cast).

Contributor Author:

I fixed it by manually adding a type for it in the constructor. I assume self.match_count is populated in some dynamic way so mypy isn't able to pick up its type automatically.


@property
def mode(self) -> List[float]:
@@ -422,7 +422,7 @@ def median(self) -> float:
:rtype: float
"""
if not self._has_histogram or not self._median_is_enabled:
return np.nan
return cast(float, np.nan)
return self._get_percentile([50])[0]

@property
@@ -438,8 +438,8 @@ def variance(self) -> float:
def stddev(self) -> float:
"""Return stddev value."""
if self.match_count == 0:
return np.nan
return np.sqrt(self.variance)
return cast(float, np.nan)
return cast(float, np.sqrt(self.variance))
Contributor:

I think we convert later to a float because this could be an np.float64. Probably good to say this could be np.float64 so that we are returning the real type.

Contributor Author:

I see. This would still require a cast though because numpy's type annotations for sqrt on a scalar has a return type of Any.

Contributor Author:

I'm trying to cast it to np.float64 but the issue becomes that there's not really a compatible return type annotation because np.nan is of type float. Using something like np.float_ causes mypy to complain about the np.nan case and sticking with float causes mypy to complain about the np.sqrt case.

Contributor Author:

Changing it to a Union causes issues with line 385:

            "stddev": utils.find_diff_of_numbers(self.stddev, other_profile.stddev),
dataprofiler/profilers/numerical_column_stats.py:385: error: Value of type variable "T" of "find_diff_of_numbers" cannot be "object"

Contributor:

So def stddev(self) -> Union[float, np.float64]:
with the cast(np.float64, np.sqrt(self.variance)) doesn't work?

Contributor Author:

Yes. It seems to cause issues with other functions.

Contributor:

Can you elaborate? We shouldn't be forcing the cast just to satisfy mypy; the typing should be valid. In this case, casting an np.float64 to float isn't valid. This is problematic because code that later applies json.dumps to this value would fail if it wasn't converted into a float first.

Contributor:

I would assume def stddev(self) -> Union[float, np.float64]: even without the cast should be functional.

But if this is erroring elsewhere, then we should fix that once we do this correctly.


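The resolution the thread converges on can be shown in isolation: np.nan is a plain Python float, while np.sqrt on a scalar yields an np.float64, so an honest signature returns the Union. A minimal sketch, with hypothetical free-function arguments standing in for the profiler's attributes:

```python
from typing import Union

import numpy as np


def stddev(variance: float, match_count: int) -> Union[float, np.float64]:
    if match_count == 0:
        return np.nan  # np.nan is a plain Python float
    return np.sqrt(variance)  # np.sqrt on a scalar returns np.float64


# The two branches really do produce different concrete types.
print(type(stddev(1.0, 0)).__name__, type(stddev(4.0, 10)).__name__)
```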
@property
def skewness(self) -> float:
@@ -563,7 +563,7 @@ def _merge_biased_variance(
elif match_count2 < 1:
return biased_variance1
elif np.isnan(biased_variance1) or np.isnan(biased_variance2):
return np.nan
return cast(float, np.nan)

curr_count = match_count1
delta = mean2 - mean1
@@ -586,7 +586,7 @@ def _correct_bias_variance(match_count: int, biased_variance: float) -> float:
"False in ProfilerOptions.",
RuntimeWarning,
)
return np.nan
return cast(float, np.nan)

variance = match_count / (match_count - 1) * biased_variance
return variance
@@ -621,7 +621,7 @@ def _merge_biased_skewness(
elif match_count2 < 1:
return biased_skewness1
elif np.isnan(biased_skewness1) or np.isnan(biased_skewness2):
return np.nan
return cast(float, np.nan)

delta = mean2 - mean1
N = match_count1 + match_count2
@@ -645,7 +645,7 @@ def _merge_biased_skewness(
third_term = 3 * delta * (match_count1 * M2_2 - match_count2 * M2_1) / N
M3 = first_term + second_term + third_term

biased_skewness = np.sqrt(N) * M3 / np.sqrt(M2**3)
biased_skewness: float = np.sqrt(N) * M3 / np.sqrt(M2**3)
Contributor:

is this a float or is it a np.float64?

Contributor Author:

This does seem to be a np.float64. Because biased_skewness could be either np.nan or some np.float64, I've updated any usage of biased skewness to be a Union[float, np.float64].

return biased_skewness

@staticmethod
@@ -665,9 +665,9 @@ def _correct_bias_skewness(match_count: int, biased_skewness: float) -> float:
"False in ProfilerOptions.",
RuntimeWarning,
)
return np.nan
return cast(float, np.nan)

skewness = (
skewness: float = (
Contributor:

as above

Contributor Author:

Updated _correct_bias_skewness to return Union[float, np.float64]

np.sqrt(match_count * (match_count - 1))
* biased_skewness
/ (match_count - 2)
@@ -708,7 +708,7 @@ def _merge_biased_kurtosis(
elif match_count2 < 1:
return biased_kurtosis1
elif np.isnan(biased_kurtosis1) or np.isnan(biased_kurtosis2):
return np.nan
return cast(float, np.nan)

delta = mean2 - mean1
N = match_count1 + match_count2
@@ -742,7 +742,7 @@ def _merge_biased_kurtosis(
fourth_term = 4 * delta * (match_count1 * M3_2 - match_count2 * M3_1) / N
M4 = first_term + second_term + third_term + fourth_term

biased_kurtosis = N * M4 / M2**2 - 3
biased_kurtosis: float = N * M4 / M2**2 - 3
Contributor:

as above

Contributor Author:

Updated _merge_biased_kurtosis to return Union[float, np.float64]

return biased_kurtosis

@staticmethod
@@ -762,7 +762,7 @@ def _correct_bias_kurtosis(match_count: int, biased_kurtosis: float) -> float:
"False in ProfilerOptions.",
RuntimeWarning,
)
return np.nan
return cast(float, np.nan)

kurtosis = (
(match_count - 1)
@@ -803,15 +803,15 @@ def _estimate_mode_from_histogram(self) -> List[float]:
mode = (
bin_edges[highest_idxs] + bin_edges[highest_idxs + 1] # type: ignore
) / 2
return mode.tolist()
return cast(List[float], mode.tolist())
Contributor:

is this a list of float or np.float64?

Contributor Author:

bin_edges doesn't necessarily need to contain numpy float values.

Contributor:

The issue is if we prescribe it to contain something that isn't true. I'd rather say it could contain either, if that is the case, because someone relying on static typing to ensure the function's behavior could otherwise infer incorrectly.

Contributor Author:

Done

Contributor Author:

I did some more investigating and numpy functions end up converting the values to float64 so this should only be List[np.float64]

Contributor Author:

Actually it would be List[float] because .tolist() converts values to the compatible python type.


def _estimate_stats_from_histogram(self) -> float:
# test estimated mean and var
bin_counts = self._stored_histogram["histogram"]["bin_counts"]
bin_edges = self._stored_histogram["histogram"]["bin_edges"]
mids = 0.5 * (bin_edges[1:] + bin_edges[:-1])
mean = np.average(mids, weights=bin_counts)
var = np.average((mids - mean) ** 2, weights=bin_counts)
var: float = np.average((mids - mean) ** 2, weights=bin_counts)
Contributor:

as above

Contributor Author:

This will end up being a np.float64. I changed the type annotation and method return type to reflect this.

return var

def _total_histogram_bin_variance(
@@ -858,7 +858,7 @@ def _histogram_bin_error(self, input_array: Union[np.ndarray, pd.Series]) -> float:
# reset the edge
bin_edges[-1] = temp_last_edge

sum_error = sum(
sum_error: float = sum(
(input_array - (bin_edges[inds] + bin_edges[inds - 1]) / 2) ** 2
)

@@ -1180,7 +1180,7 @@ def _get_best_histogram_for_profile(self) -> Dict:
self.histogram_selection = method
best_hist_loss = hist_loss

return self.histogram_methods[self.histogram_selection]["histogram"]
return cast(Dict, self.histogram_methods[self.histogram_selection]["histogram"])

def _get_percentile(
self, percentiles: Union[np.ndarray, List[float]]
@@ -1220,7 +1220,7 @@ def _get_percentile(
)
if median_value:
quantiles[percentiles == 50] = median_value
return quantiles.tolist()
return cast(List[float], quantiles.tolist())

@staticmethod
def _fold_histogram(
@@ -1295,7 +1295,7 @@ def median_abs_deviation(self) -> float:
:return: median absolute deviation
"""
if not self._has_histogram or not self._median_abs_dev_is_enabled:
return np.nan
return cast(float, np.nan)

bin_counts = self._stored_histogram["histogram"]["bin_counts"]
bin_edges = self._stored_histogram["histogram"]["bin_edges"]
@@ -1344,9 +1344,9 @@

median_inds = np.abs(bin_counts_impose - 0.5) < 1e-10
if np.sum(median_inds) > 1:
return np.mean(bin_edges_impose[median_inds])
return cast(float, np.mean(bin_edges_impose[median_inds]))
Contributor:

I think these would also be np.float64

Contributor Author:

Casting this to np.float64 and changing the return type to Union[float, np.float64] gives me a similar issue as one of the other methods:

"median_absolute_deviation": utils.find_diff_of_numbers(
                self.median_abs_deviation, other_profile.median_abs_deviation
            ),
dataprofiler/profilers/numerical_column_stats.py:381: error: Value of type variable "T" of "find_diff_of_numbers" cannot be "object"

Contributor:

Did googling provide any response to this?

Contributor:

So it has something to do with the return type of these numpy funcs being Any, maybe?

Contributor Author:

Should be fixed. See #703 (comment)

Contributor:

This would be a float64.

Contributor Author:

I'm getting a similar issue as before:

"median_absolute_deviation": utils.find_diff_of_numbers(
         self.median_abs_deviation, other_profile.median_abs_deviation
 ),
dataprofiler/profilers/numerical_column_stats.py:381: error: Value of type variable "T" of "find_diff_of_numbers" cannot be "object"

Looking into it more, I believe it's because we're trying to assign a Union to a TypeVar with bound=Subtractable and the Union itself does not follow the Subtractable protocol. So far the only reasonable solution I can think of is doing:

"median_absolute_deviation": utils.find_diff_of_numbers(
         cast(float, self.median_abs_deviation), cast(float, other_profile.median_abs_deviation)
 ),

which isn't very ideal. I'll look into mypy generics more to see if there's a workaround.

Contributor Author:

Fixed. See #703 (comment)
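The TypeVar failure described in this thread can be reproduced in isolation. The sketch below mirrors the names from the discussion (the project's actual utils.find_diff_of_numbers may differ): mypy joins mixed Union arguments to object, which lacks __sub__ and so cannot satisfy the Subtractable bound, even though the call works fine at runtime.

```python
from typing import Any, Protocol, TypeVar, Union

import numpy as np


class Subtractable(Protocol):
    def __sub__(self, other: Any) -> Any: ...


T = TypeVar("T", bound=Subtractable)


def find_diff_of_numbers(a: T, b: T) -> Any:
    return a - b


x: Union[float, np.float64] = np.float64(3.5)
# mypy would report on the next call:
#   Value of type variable "T" of "find_diff_of_numbers" cannot be "object"
# because the Union is joined to `object`, which is not Subtractable.
# At runtime, however, the subtraction is perfectly fine:
print(find_diff_of_numbers(x, 1.5))
```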


return np.interp(0.5, bin_counts_impose, bin_edges_impose)
return cast(float, np.interp(0.5, bin_counts_impose, bin_edges_impose))

def _get_quantiles(self) -> None:
"""
@@ -1670,7 +1670,7 @@ def is_int(x: str) -> bool:
return a == b

@staticmethod
def np_type_to_type(val: Any) -> Union[int, float]:
def np_type_to_type(val: Any) -> Union[int, float, Any]:
Contributor:

If this can be Any, we technically don't need the Union, right?

Contributor:

However, will this error then because we are using Any?

Contributor Author:

I guess the Union was redundant. Changing to Any actually doesn't cause issues with our mypy flag warn_return_any = True because that only complains when we return Any and the function return type is not Any.

Contributor:

nice

"""
Convert numpy variables to base python type variables.

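The diff does not show np_type_to_type's body, so here is a hedged sketch of what such a converter can look like (the project's actual implementation may differ): numpy scalars all derive from np.generic and expose .item(), which returns the corresponding built-in Python type, while any other value passes through unchanged, which is why the effective return type is Any.

```python
from typing import Any

import numpy as np


def np_type_to_type(val: Any) -> Any:
    """Convert numpy scalar variables to base Python type variables."""
    # np.generic covers every numpy scalar type (np.int64, np.float64, ...).
    if isinstance(val, np.generic):
        return val.item()
    return val


print(type(np_type_to_type(np.int64(3))).__name__)
print(type(np_type_to_type("s")).__name__)
```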