Updated setup.cfg mypy flags and resolved related errors. #703

Merged
merged 31 commits into main from mypy-flags on Nov 9, 2022

Commits (31)
a74ef96  updated setup.cfg mypy flags and resolved errors (Sanketh7, Nov 1, 2022)
8b1d198  Merge branch 'main' into mypy-flags (Sanketh7, Nov 1, 2022)
cb58600  Merge branch 'main' into mypy-flags (Sanketh7, Nov 2, 2022)
1eb64ae  updated return type of _is_each_row_float (Sanketh7, Nov 2, 2022)
0ad2a38  updated mypy hook to include numpy as a dependency and fixed relevant… (Sanketh7, Nov 2, 2022)
0313deb  updated _is_each_row_float (Sanketh7, Nov 2, 2022)
2d5307f  updated _get_data_as_records (Sanketh7, Nov 2, 2022)
82cbf42  Merge branch 'main' into mypy-flags (taylorfturner, Nov 3, 2022)
81f9f36  clean up (taylorfturner, Nov 3, 2022)
c1b2bd1  Merge branch 'main' into mypy-flags (taylorfturner, Nov 3, 2022)
3c9aaa3  updated evaluate_accuracy types (Sanketh7, Nov 3, 2022)
1de9958  removed float cast in biased_skew (Sanketh7, Nov 3, 2022)
cc8224e  typed self.match_count (Sanketh7, Nov 4, 2022)
2538ea3  updated estimate_stats_from_histogram type (Sanketh7, Nov 4, 2022)
b737249  Merge branch 'main' into mypy-flags (Sanketh7, Nov 4, 2022)
60a2202  update biased skewness to Union[float, np.float64] (Sanketh7, Nov 4, 2022)
fde3a97  updated _correct_bias_skewness to return Union[float, np.float64] (Sanketh7, Nov 4, 2022)
481a8ec  updated biased kurtosis to be Union[float, np.float64] (Sanketh7, Nov 4, 2022)
9a261e1  added generics to AutoSubRegistrationMeta (Sanketh7, Nov 4, 2022)
06762be  Merge branch 'main' into mypy-flags (Sanketh7, Nov 7, 2022)
25094c2  update np_type_to_type return type (Sanketh7, Nov 7, 2022)
1ee80eb  changed float to float64 where needed (Sanketh7, Nov 7, 2022)
a9d097b  updated _estimate_mode_from_histogram to not use Union (Sanketh7, Nov 8, 2022)
d7ee476  Merge branch 'main' into mypy-flags (Sanketh7, Nov 8, 2022)
841adf5  return 0.0 instead of 0 (Sanketh7, Nov 8, 2022)
9c2d636  Merge branch 'main' into mypy-flags (Sanketh7, Nov 9, 2022)
78f57fa  revert AutoSubRegistrationMeta changes (Sanketh7, Nov 9, 2022)
ffbcb43  Update base_model.py (taylorfturner, Nov 9, 2022)
e417189  Update base_model.py (taylorfturner, Nov 9, 2022)
ec2745b  isort fix (taylorfturner, Nov 9, 2022)
e6399be  DS_Store fix (taylorfturner, Nov 9, 2022)
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml

@@ -44,7 +44,7 @@ repos:
       exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
       language_version: python3
       additional_dependencies: ['types-setuptools', 'types-python-dateutil',
-                        'types-requests', 'types-chardet', 'types-six']
+                        'types-requests', 'types-chardet', 'types-six', 'numpy']
   # Check-manifest: ensures required non-Python files are included in MANIFEST.in
   # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
   - repo: https://github.com/mgedmin/check-manifest
2 changes: 2 additions & 0 deletions MANIFEST.in

@@ -1,3 +1,5 @@
+global-exclude .DS_Store
+
 include *.txt
 include CODEOWNERS
 recursive-include dataprofiler *.avro
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/avro_data.py

@@ -94,7 +94,7 @@ def is_match(
         if data_utils.is_stream_buffer(file_path):
             starting_location = file_path.tell()

-        is_valid_avro = fastavro.is_avro(file_path)
+        is_valid_avro: bool = fastavro.is_avro(file_path)

         # return to original position in stream
         if data_utils.is_stream_buffer(file_path):
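A minimal sketch of the annotation pattern used here, assuming fastavro carries no type stubs in this setup: annotating the variable gives mypy a concrete bool where the library call would otherwise be inferred as Any. The helper name below is illustrative, not the module's.

    import fastavro  # assumed untyped here, so its calls come back as Any

    def file_is_avro(file_path: str) -> bool:  # hypothetical helper
        # The explicit annotation pins the type mypy sees for the untyped call.
        is_valid_avro: bool = fastavro.is_avro(file_path)
        return is_valid_avro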
15 changes: 9 additions & 6 deletions dataprofiler/data_readers/json_data.py

@@ -9,6 +9,7 @@
 import pandas as pd
 from six import StringIO

+from .._typing import JSONType
 from . import data_utils
 from .base_data import BaseData
 from .filepath_or_buffer import FileOrBufferHandler
@@ -236,36 +237,38 @@ def _get_data_as_flattened_dataframe(self, json_lines):

         return data

-    def _load_data_from_str(self, data_as_str: str) -> List:
+    def _load_data_from_str(self, data_as_str: str) -> JSONType:
         """
         Load the data from a string.

         :param data_as_str: data in string format.
         :type data_as_str: str
-        :return: dict
+        :return: JSONType
         """
+        data: JSONType
         try:
             data = json.loads(data_as_str)
         except json.JSONDecodeError:
-            data = data_utils.data_generator(data_as_str.splitlines())
+            data_generator = data_utils.data_generator(data_as_str.splitlines())
             data = data_utils.read_json(
-                data_generator=data,
+                data_generator=data_generator,
                 selected_columns=self.selected_keys,
                 read_in_string=False,
             )
         return data

-    def _load_data_from_file(self, input_file_path: str) -> List:
+    def _load_data_from_file(self, input_file_path: str) -> JSONType:
         """
         Load the data from a file.

         :param input_file_path: file path to file being loaded.
         :type input_file_path: str
-        :return:
+        :return: JSONType
         """
         with FileOrBufferHandler(
             input_file_path, "r", encoding=self.file_encoding
         ) as input_file:
+            data: JSONType
             try:
                 data = json.load(input_file)
             except (json.JSONDecodeError, UnicodeDecodeError):
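The JSONType alias is imported from dataprofiler/_typing.py, which this diff does not show. A hypothetical reconstruction, assuming the usual shape of a JSON value alias; the real definition may differ:

    # Hypothetical sketch of the JSONType alias in dataprofiler/_typing.py.
    from typing import Any, Dict, List, Union

    JSONType = Union[str, int, float, bool, None, List[Any], Dict[str, Any]]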
4 changes: 2 additions & 2 deletions dataprofiler/data_readers/structured_mixins.py

@@ -1,6 +1,6 @@
 """Contains mixin data class for loading datasets of tye SpreadSheet."""
 from logging import Logger
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, cast

 import pandas as pd

@@ -80,4 +80,4 @@ def _get_data_as_records(self, data: Any) -> List[str]:
             )
             for i in range((len(data) + records_per_line - 1) // records_per_line)
         ]
-        return data
+        return cast(List[str], data)
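A minimal sketch of the cast() idiom adopted here: the parameter arrives as Any, is rebuilt in place, and cast() asserts the declared return type to mypy with no runtime effect. The names below are illustrative, not the library's.

    from typing import Any, List, cast

    def get_data_as_records(data: Any) -> List[str]:  # illustrative signature
        data = ["record: %s" % row for row in data]  # mypy still tracks `data` as Any
        return cast(List[str], data)                 # no-op at runtime; narrows the type

    print(get_data_as_records([1, 2]))  # ['record: 1', 'record: 2']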
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_data_labeler.py

@@ -463,7 +463,7 @@ def _load_parameters(dirpath: str, load_options: Dict = None) -> Dict[str, Dict]
             load_options = {}

         with open(os.path.join(dirpath, "data_labeler_parameters.json")) as fp:
-            params = json.load(fp)
+            params: Dict[str, Dict] = json.load(fp)

         if "model_class" in load_options:
             model_class = load_options.get("model_class")
@@ -677,7 +677,7 @@ def load_with_components(
         data_labeler.set_preprocessor(preprocessor)
         data_labeler.set_model(model)
         data_labeler.set_postprocessor(postprocessor)
-        return data_labeler
+        return cast(BaseDataLabeler, data_labeler)

     def _save_model(self, dirpath: str) -> None:
         """
@@ -914,4 +914,4 @@ def load_with_components(
         data_labeler.set_preprocessor(preprocessor)
         data_labeler.set_model(model)
         data_labeler.set_postprocessor(postprocessor)
-        return data_labeler
+        return cast(TrainableDataLabeler, data_labeler)
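The `params` change leans on the fact that json.load and json.loads are typed to return Any, so a variable annotation at the assignment is what mypy checks against. A small runnable sketch with made-up parameter content:

    import json
    from typing import Dict

    # json.loads returns Any; the annotation pins the expected shape for mypy.
    params: Dict[str, Dict] = json.loads('{"model": {"class": "CharacterLevelCnnModel"}}')
    print(params["model"])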
11 changes: 7 additions & 4 deletions dataprofiler/labelers/base_model.py

@@ -5,20 +5,23 @@
 import copy
 import inspect
 import warnings
-from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union, cast

 from dataprofiler._typing import DataArray

+T = TypeVar("T", bound="BaseModel")
+

 class AutoSubRegistrationMeta(abc.ABCMeta):
     """For registering subclasses."""

     def __new__(
         cls, clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
-    ) -> AutoSubRegistrationMeta:
+    ) -> type[T]:
         """Create auto registration object and return new class."""
-        new_class: Any = super(AutoSubRegistrationMeta, cls).__new__(
-            cls, clsname, bases, attrs
+        new_class = cast(
+            Type[T],
+            super(AutoSubRegistrationMeta, cls).__new__(cls, clsname, bases, attrs),
         )
         new_class._register_subclass()
         return new_class
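A self-contained sketch of the auto-registration pattern this metaclass implements, simplified from the diff; the `_registry` dict and the `_register_subclass` body below are illustrative, and the real BaseModel carries much more machinery:

    import abc
    from typing import Dict, Tuple

    class AutoSubRegistrationMeta(abc.ABCMeta):
        """Registers every class created with this metaclass."""

        def __new__(
            mcs, clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
        ) -> type:
            new_class = super().__new__(mcs, clsname, bases, attrs)
            new_class._register_subclass()  # type: ignore[attr-defined]
            return new_class

    class BaseModel(metaclass=AutoSubRegistrationMeta):
        _registry: Dict[str, type] = {}  # illustrative registry

        @classmethod
        def _register_subclass(cls) -> None:
            cls._registry[cls.__name__] = cls

    class FakeModel(BaseModel):  # defining the class is enough to register it
        pass

    assert "FakeModel" in BaseModel._registry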
2 changes: 1 addition & 1 deletion dataprofiler/labelers/character_level_cnn_model.py

@@ -58,7 +58,7 @@ def create_glove_char(n_dims: int, source_file: str = None) -> None:
     embd_table = build_embd_dictionary(source_file)
     embd_words: List[str]
     embd_matrix: List[np.ndarray]
-    embd_words, embd_matrix = [
+    embd_words, embd_matrix = [  # type: ignore
         np.asarray(ls) if i > 0 else list(ls)  # type: ignore
         for i, ls in enumerate(zip(*embd_table.items()))
     ]

taylorfturner (Contributor):
don't love the `# type: ignore`. If there isn't a way to avoid this, that's okay. But IIRC, in prior PRs we tried to avoid this, @Sanketh7?

Sanketh7 (Contributor, Author):
The issue I was having was that there's just a lot of types involved, and numpy's type annotations aren't specific enough to help mypy out. Mypy sort of gives up and says that you're trying to assign an object to List[str] and List[np.ndarray].

taylorfturner (Contributor):
i see
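For context on the exchange above, a sketch of why mypy balks: zip(*embd_table.items()) yields tuples whose per-position element types mypy cannot track, so the two-target unpacking is inferred as object. A hypothetical rewrite that type-checks without the ignore (not the PR's code):

    from typing import Dict, List
    import numpy as np

    embd_table: Dict[str, np.ndarray] = {"a": np.zeros(3), "b": np.ones(3)}

    # Splitting keys and values avoids the untypeable zip(*...) unpacking.
    embd_words: List[str] = list(embd_table.keys())
    embd_matrix: np.ndarray = np.asarray(list(embd_table.values()))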
8 changes: 5 additions & 3 deletions dataprofiler/labelers/data_processing.py

@@ -26,6 +26,7 @@
 )

 import numpy as np
+import numpy.typing as npt
 import pkg_resources

 default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
@@ -1416,19 +1417,20 @@ def get_parameters(self, param_list: List[str] = None) -> Dict:
         return params

     def convert_to_unstructured_format(
-        self, data: np.ndarray, labels: Optional[List[str]]
+        self, data: np.ndarray, labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
     ) -> Tuple[str, Optional[List[Tuple[int, int, str]]]]:
         """
         Convert data samples list to StructCharPreprocessor required input data format.

         :param data: list of strings
         :type data: numpy.ndarray
         :param labels: labels for each input character
-        :type labels: list
+        :type labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
         :return: data in the following format
             text="<SAMPLE><SEPARATOR><SAMPLE>...",
             entities=[(start=<INT>, end=<INT>, label="<LABEL>"),
                       ...(num_samples in data)])
+        :rtype: Tuple[str, Optional[List[Tuple[int, int, str]]]]
         """
         separator: str = self._parameters["flatten_separator"]
         default_label: str = self._parameters["default_label"]
@@ -1507,7 +1509,7 @@ def process(  # type: ignore
             # with rework, can be tuned to be batches > size 1
             for ind in range(len(data)):
                 batch_data: np.ndarray = data[ind : ind + 1]
-                batch_labels: Optional[List[str]] = (
+                batch_labels: Optional[Union[npt.NDArray[np.str_], List[str]]] = (
                     None if labels is None else labels[ind : ind + 1]
                 )
                 (
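A small sketch of the numpy.typing annotations adopted in this file: npt.NDArray parametrizes ndarray by dtype, so string-labeled arrays, plain lists, and None can all share one annotation.

    from typing import List, Optional, Union
    import numpy as np
    import numpy.typing as npt

    labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
    labels = np.array(["PAD", "UNKNOWN"], dtype=np.str_)  # ndarray of str_ is accepted
    labels = ["PAD", "UNKNOWN"]                           # so is a plain list
    labels = None                                         # and so is None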
3 changes: 2 additions & 1 deletion dataprofiler/labelers/labeler_utils.py

@@ -79,7 +79,7 @@ def evaluate_accuracy(
     predicted_entities_in_index: List[List[int]],
     true_entities_in_index: List[List[int]],
     num_labels: int,
-    entity_rev_dict: Dict,
+    entity_rev_dict: Dict[int, str],
     verbose: bool = True,
     omitted_labels: Tuple[str, ...] = ("PAD", "UNKNOWN"),
     confusion_matrix_file: str = None,
@@ -125,6 +125,7 @@ def evaluate_accuracy(
     true_labels_flatten = np.hstack(true_labels_padded)  # type: ignore
     predicted_labels_flatten = np.hstack(predicted_entities_in_index)

+    all_labels: List[str] = []
    if entity_rev_dict:
        all_labels = [entity_rev_dict[key] for key in sorted(entity_rev_dict.keys())]
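The `all_labels` line illustrates a common strict-mypy fix: initialize with a typed default before the conditional so the name is bound, and typed, on every path. A minimal sketch with sample data:

    from typing import Dict, List

    entity_rev_dict: Dict[int, str] = {1: "PAD", 2: "UNKNOWN"}  # sample mapping

    all_labels: List[str] = []  # bound and typed even when the dict is empty
    if entity_rev_dict:
        all_labels = [entity_rev_dict[key] for key in sorted(entity_rev_dict)]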
1 change: 1 addition & 0 deletions dataprofiler/profilers/base_column_profilers.py

@@ -267,6 +267,7 @@ def __init__(self, name: Optional[str]) -> None:
         # Number of values that match the column type. eg. how many floats match
         # in the float column
         self.match_count: int = 0
+        self.sample_size: int  # inherited from BaseColumnProfiler

     def _update_column_base_properties(self, profile: Dict) -> None:
         """
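The added line is a bare annotation: it declares the attribute's type for mypy without assigning, so the value set by the parent class is untouched. A minimal sketch of the pattern:

    class Parent:
        def __init__(self) -> None:
            self.sample_size = 0

    class Child(Parent):
        def __init__(self) -> None:
            super().__init__()
            self.sample_size: int  # declaration only; no assignment happens

    c = Child()
    print(c.sample_size)  # 0, inherited value preserved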
8 changes: 5 additions & 3 deletions dataprofiler/profilers/float_column_profile.py

@@ -3,7 +3,7 @@

 import copy
 import re
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 import numpy as np
 import pandas as pd
@@ -285,7 +285,9 @@ def _get_float_precision(
         return subset_precision

     @classmethod
-    def _is_each_row_float(cls, df_series: pd.Series) -> List[bool]:
+    def _is_each_row_float(
+        cls, df_series: pd.Series
+    ) -> Union[List[bool], pd.Series[bool]]:
         """
         Determine if each value in a dataframe is a float.

@@ -297,7 +299,7 @@ def _is_each_row_float(
         :param df_series: series of values to evaluate
         :type df_series: pandas.core.series.Series
         :return: is_float_col
-        :rtype: list
+        :rtype: Union[List[bool], pandas.Series[bool]]
         """
         if len(df_series) == 0:
             return list()
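One subtlety with the new return annotation: depending on the pandas version, pd.Series may not be subscriptable at runtime, so spellings like pd.Series[bool] usually rely on annotations staying unevaluated, e.g. under `from __future__ import annotations` or as string literals. A sketch under that assumption, with an illustrative body:

    from __future__ import annotations  # keeps pd.Series[bool] unevaluated

    from typing import List, Union
    import pandas as pd

    def is_each_row_float(df_series: pd.Series) -> Union[List[bool], pd.Series[bool]]:
        if len(df_series) == 0:
            return []
        return df_series.map(lambda value: isinstance(value, float))

    print(is_each_row_float(pd.Series([1.0, "x", 2.5])).tolist())  # [True, False, True]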
8 changes: 4 additions & 4 deletions dataprofiler/profilers/graph_profiler.py

@@ -4,7 +4,7 @@
 import pickle
 from collections import defaultdict
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, cast

 import networkx as nx
 import numpy as np
@@ -330,12 +330,12 @@ def _update_categorical_distribution(
     @BaseColumnProfiler._timeit(name="num_nodes")
     def _get_num_nodes(self, graph: nx.Graph) -> int:
         """Compute the number of nodes."""
-        return graph.number_of_nodes()
+        return cast(int, graph.number_of_nodes())

     @BaseColumnProfiler._timeit(name="num_edges")
     def _get_num_edges(self, graph: nx.Graph) -> int:
         """Compute the number of edges."""
-        return graph.number_of_edges()
+        return cast(int, graph.number_of_edges())

     @BaseColumnProfiler._timeit(name="categorical_attributes")
     def _get_categorical_attributes(self, graph: nx.Graph) -> List[str]:
@@ -362,7 +362,7 @@ def _get_global_max_component_size(self, graph: nx.Graph) -> int:
             nx.connected_components(graph), key=len, reverse=True
         )
         largest_component: nx.Graph = graph.subgraph(graph_connected_components[0])
-        return largest_component.size()
+        return cast(int, largest_component.size())
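The graph changes repeat the cast() idiom for networkx, which mypy treats as untyped here: the calls come back as Any, and cast(int, ...) pins the declared return type without any runtime check. A runnable sketch:

    from typing import cast
    import networkx as nx

    def get_num_nodes(graph: nx.Graph) -> int:
        # number_of_nodes() is Any to mypy without stubs; cast() narrows it.
        return cast(int, graph.number_of_nodes())

    print(get_num_nodes(nx.Graph([(1, 2), (2, 3)])))  # 3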