Updated setup.cfg mypy flags and resolved related errors. #703

Merged
merged 31 commits into main from mypy-flags on Nov 9, 2022

Commits (31)
a74ef96  updated setup.cfg mypy flags and resolved errors (Sanketh7, Nov 1, 2022)
8b1d198  Merge branch 'main' into mypy-flags (Sanketh7, Nov 1, 2022)
cb58600  Merge branch 'main' into mypy-flags (Sanketh7, Nov 2, 2022)
1eb64ae  updated return type of _is_each_row_float (Sanketh7, Nov 2, 2022)
0ad2a38  updated mypy hook to include numpy as a dependency and fixed relevant… (Sanketh7, Nov 2, 2022)
0313deb  updated _is_each_row_float (Sanketh7, Nov 2, 2022)
2d5307f  updated _get_data_as_records (Sanketh7, Nov 2, 2022)
82cbf42  Merge branch 'main' into mypy-flags (taylorfturner, Nov 3, 2022)
81f9f36  clean up (taylorfturner, Nov 3, 2022)
c1b2bd1  Merge branch 'main' into mypy-flags (taylorfturner, Nov 3, 2022)
3c9aaa3  updated evaluate_accuracy types (Sanketh7, Nov 3, 2022)
1de9958  removed float cast in biased_skew (Sanketh7, Nov 3, 2022)
cc8224e  typed self.match_count (Sanketh7, Nov 4, 2022)
2538ea3  updated estimate_stats_from_histogram type (Sanketh7, Nov 4, 2022)
b737249  Merge branch 'main' into mypy-flags (Sanketh7, Nov 4, 2022)
60a2202  update biased skewness to Union[float, np.float64] (Sanketh7, Nov 4, 2022)
fde3a97  updated _correct_bias_skewness to return Union[float, np.float64] (Sanketh7, Nov 4, 2022)
481a8ec  updated biased kurtosis to be Union[float, np.float64] (Sanketh7, Nov 4, 2022)
9a261e1  added generics to AutoSubRegistrationMeta (Sanketh7, Nov 4, 2022)
06762be  Merge branch 'main' into mypy-flags (Sanketh7, Nov 7, 2022)
25094c2  update np_type_to_type return type (Sanketh7, Nov 7, 2022)
1ee80eb  changed float to float64 where needed (Sanketh7, Nov 7, 2022)
a9d097b  updated _estimate_mode_from_histogram to not use Union (Sanketh7, Nov 8, 2022)
d7ee476  Merge branch 'main' into mypy-flags (Sanketh7, Nov 8, 2022)
841adf5  return 0.0 instead of 0 (Sanketh7, Nov 8, 2022)
9c2d636  Merge branch 'main' into mypy-flags (Sanketh7, Nov 9, 2022)
78f57fa  revert AutoSubRegistrationMeta changes (Sanketh7, Nov 9, 2022)
ffbcb43  Update base_model.py (taylorfturner, Nov 9, 2022)
e417189  Update base_model.py (taylorfturner, Nov 9, 2022)
ec2745b  isort fix (taylorfturner, Nov 9, 2022)
e6399be  DS_Store fix (taylorfturner, Nov 9, 2022)
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml

@@ -44,7 +44,7 @@ repos:
       exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
       language_version: python3
       additional_dependencies: ['types-setuptools', 'types-python-dateutil',
-                        'types-requests', 'types-chardet', 'types-six']
+                        'types-requests', 'types-chardet', 'types-six', 'numpy']
   # Check-manifest: ensures required non-Python files are included in MANIFEST.in
   # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
   - repo: https://github.com/mgedmin/check-manifest
2 changes: 2 additions & 0 deletions MANIFEST.in

@@ -1,3 +1,5 @@
+global-exclude .DS_Store
+
 include *.txt
 include CODEOWNERS
 recursive-include dataprofiler *.avro
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/avro_data.py

@@ -94,7 +94,7 @@ def is_match(
         if data_utils.is_stream_buffer(file_path):
             starting_location = file_path.tell()

-        is_valid_avro = fastavro.is_avro(file_path)
+        is_valid_avro: bool = fastavro.is_avro(file_path)

         # return to original position in stream
         if data_utils.is_stream_buffer(file_path):
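A minimal sketch of the annotation pattern used here, assuming fastavro carries no type stubs in this setup: annotating the variable gives mypy a concrete bool where the library call would otherwise be inferred as Any. The helper name below is illustrative, not the module's.

    import fastavro  # assumed untyped here, so its calls come back as Any

    def file_is_avro(file_path: str) -> bool:  # hypothetical helper
        # The explicit annotation pins the type mypy sees for the untyped call.
        is_valid_avro: bool = fastavro.is_avro(file_path)
        return is_valid_avro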
15 changes: 9 additions & 6 deletions dataprofiler/data_readers/json_data.py

@@ -9,6 +9,7 @@
 import pandas as pd
 from six import StringIO

+from .._typing import JSONType
 from . import data_utils
 from .base_data import BaseData
 from .filepath_or_buffer import FileOrBufferHandler
@@ -236,36 +237,38 @@ def _get_data_as_flattened_dataframe(self, json_lines):

         return data

-    def _load_data_from_str(self, data_as_str: str) -> List:
+    def _load_data_from_str(self, data_as_str: str) -> JSONType:
         """
         Load the data from a string.

         :param data_as_str: data in string format.
         :type data_as_str: str
-        :return: dict
+        :return: JSONType
         """
+        data: JSONType
         try:
             data = json.loads(data_as_str)
         except json.JSONDecodeError:
-            data = data_utils.data_generator(data_as_str.splitlines())
+            data_generator = data_utils.data_generator(data_as_str.splitlines())
             data = data_utils.read_json(
-                data_generator=data,
+                data_generator=data_generator,
                 selected_columns=self.selected_keys,
                 read_in_string=False,
             )
         return data

-    def _load_data_from_file(self, input_file_path: str) -> List:
+    def _load_data_from_file(self, input_file_path: str) -> JSONType:
         """
         Load the data from a file.

         :param input_file_path: file path to file being loaded.
         :type input_file_path: str
-        :return:
+        :return: JSONType
         """
         with FileOrBufferHandler(
             input_file_path, "r", encoding=self.file_encoding
         ) as input_file:
+            data: JSONType
             try:
                 data = json.load(input_file)
             except (json.JSONDecodeError, UnicodeDecodeError):
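The JSONType alias is imported from dataprofiler/_typing.py, which this diff does not show. A hypothetical reconstruction, assuming the usual shape of a JSON value alias; the real definition may differ:

    # Hypothetical sketch of the JSONType alias in dataprofiler/_typing.py.
    from typing import Any, Dict, List, Union

    JSONType = Union[str, int, float, bool, None, List[Any], Dict[str, Any]]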
4 changes: 2 additions & 2 deletions dataprofiler/data_readers/structured_mixins.py

@@ -1,6 +1,6 @@
 """Contains mixin data class for loading datasets of tye SpreadSheet."""
 from logging import Logger
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, cast

 import pandas as pd

@@ -80,4 +80,4 @@ def _get_data_as_records(self, data: Any) -> List[str]:
             )
             for i in range((len(data) + records_per_line - 1) // records_per_line)
         ]
-        return data
+        return cast(List[str], data)
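A minimal sketch of the cast() idiom adopted here: the parameter arrives as Any, is rebuilt in place, and cast() asserts the declared return type to mypy with no runtime effect. The names below are illustrative, not the library's.

    from typing import Any, List, cast

    def get_data_as_records(data: Any) -> List[str]:  # illustrative signature
        data = ["record: %s" % row for row in data]  # mypy still tracks `data` as Any
        return cast(List[str], data)                 # no-op at runtime; narrows the type

    print(get_data_as_records([1, 2]))  # ['record: 1', 'record: 2']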
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_data_labeler.py

@@ -463,7 +463,7 @@ def _load_parameters(dirpath: str, load_options: Dict = None) -> Dict[str, Dict]
             load_options = {}

         with open(os.path.join(dirpath, "data_labeler_parameters.json")) as fp:
-            params = json.load(fp)
+            params: Dict[str, Dict] = json.load(fp)

         if "model_class" in load_options:
             model_class = load_options.get("model_class")
@@ -677,7 +677,7 @@ def load_with_components(
         data_labeler.set_preprocessor(preprocessor)
         data_labeler.set_model(model)
         data_labeler.set_postprocessor(postprocessor)
-        return data_labeler
+        return cast(BaseDataLabeler, data_labeler)

     def _save_model(self, dirpath: str) -> None:
         """
@@ -914,4 +914,4 @@ def load_with_components(
         data_labeler.set_preprocessor(preprocessor)
         data_labeler.set_model(model)
         data_labeler.set_postprocessor(postprocessor)
-        return data_labeler
+        return cast(TrainableDataLabeler, data_labeler)
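The `params` change leans on the fact that json.load and json.loads are typed to return Any, so a variable annotation at the assignment is what mypy checks against. A small runnable sketch with made-up parameter content:

    import json
    from typing import Dict

    # json.loads returns Any; the annotation pins the expected shape for mypy.
    params: Dict[str, Dict] = json.loads('{"model": {"class": "CharacterLevelCnnModel"}}')
    print(params["model"])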
11 changes: 7 additions & 4 deletions dataprofiler/labelers/base_model.py

@@ -5,20 +5,23 @@
 import copy
 import inspect
 import warnings
-from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union, cast

 from dataprofiler._typing import DataArray

+T = TypeVar("T", bound="BaseModel")
+

 class AutoSubRegistrationMeta(abc.ABCMeta):
     """For registering subclasses."""

     def __new__(
         cls, clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
-    ) -> AutoSubRegistrationMeta:
+    ) -> type[T]:
         """Create auto registration object and return new class."""
-        new_class: Any = super(AutoSubRegistrationMeta, cls).__new__(
-            cls, clsname, bases, attrs
+        new_class = cast(
+            Type[T],
+            super(AutoSubRegistrationMeta, cls).__new__(cls, clsname, bases, attrs),
         )
         new_class._register_subclass()
         return new_class
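A self-contained sketch of the auto-registration pattern this metaclass implements, simplified from the diff; the `_registry` dict and the `_register_subclass` body below are illustrative, and the real BaseModel carries much more machinery:

    import abc
    from typing import Dict, Tuple

    class AutoSubRegistrationMeta(abc.ABCMeta):
        """Registers every class created with this metaclass."""

        def __new__(
            mcs, clsname: str, bases: Tuple[type, ...], attrs: Dict[str, object]
        ) -> type:
            new_class = super().__new__(mcs, clsname, bases, attrs)
            new_class._register_subclass()  # type: ignore[attr-defined]
            return new_class

    class BaseModel(metaclass=AutoSubRegistrationMeta):
        _registry: Dict[str, type] = {}  # illustrative registry

        @classmethod
        def _register_subclass(cls) -> None:
            cls._registry[cls.__name__] = cls

    class FakeModel(BaseModel):  # defining the class is enough to register it
        pass

    assert "FakeModel" in BaseModel._registry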
2 changes: 1 addition & 1 deletion dataprofiler/labelers/character_level_cnn_model.py

@@ -58,7 +58,7 @@ def create_glove_char(n_dims: int, source_file: str = None) -> None:
     embd_table = build_embd_dictionary(source_file)
     embd_words: List[str]
     embd_matrix: List[np.ndarray]
-    embd_words, embd_matrix = [
+    embd_words, embd_matrix = [  # type: ignore
         np.asarray(ls) if i > 0 else list(ls)  # type: ignore
         for i, ls in enumerate(zip(*embd_table.items()))
     ]

taylorfturner (Contributor):
don't love the `# type: ignore`. If there isn't a way to avoid this, that's okay. But IIRC, in prior PRs we tried to avoid this, @Sanketh7?

Sanketh7 (Contributor, Author):
The issue I was having was that there's just a lot of types involved, and numpy's type annotations aren't specific enough to help mypy out. Mypy sort of gives up and says that you're trying to assign an object to List[str] and List[np.ndarray].

taylorfturner (Contributor):
i see
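For context on the exchange above, a sketch of why mypy balks: zip(*embd_table.items()) yields tuples whose per-position element types mypy cannot track, so the two-target unpacking is inferred as object. A hypothetical rewrite that type-checks without the ignore (not the PR's code):

    from typing import Dict, List
    import numpy as np

    embd_table: Dict[str, np.ndarray] = {"a": np.zeros(3), "b": np.ones(3)}

    # Splitting keys and values avoids the untypeable zip(*...) unpacking.
    embd_words: List[str] = list(embd_table.keys())
    embd_matrix: np.ndarray = np.asarray(list(embd_table.values()))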
8 changes: 5 additions & 3 deletions dataprofiler/labelers/data_processing.py

@@ -26,6 +26,7 @@
 )

 import numpy as np
+import numpy.typing as npt
 import pkg_resources

 default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
@@ -1416,19 +1417,20 @@ def get_parameters(self, param_list: List[str] = None) -> Dict:
         return params

     def convert_to_unstructured_format(
-        self, data: np.ndarray, labels: Optional[List[str]]
+        self, data: np.ndarray, labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
     ) -> Tuple[str, Optional[List[Tuple[int, int, str]]]]:
         """
         Convert data samples list to StructCharPreprocessor required input data format.

         :param data: list of strings
         :type data: numpy.ndarray
         :param labels: labels for each input character
-        :type labels: list
+        :type labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
         :return: data in the following format
             text="<SAMPLE><SEPARATOR><SAMPLE>...",
             entities=[(start=<INT>, end=<INT>, label="<LABEL>"),
                       ...(num_samples in data)])
+        :rtype: Tuple[str, Optional[List[Tuple[int, int, str]]]]
         """
         separator: str = self._parameters["flatten_separator"]
         default_label: str = self._parameters["default_label"]
@@ -1507,7 +1509,7 @@ def process(  # type: ignore
             # with rework, can be tuned to be batches > size 1
             for ind in range(len(data)):
                 batch_data: np.ndarray = data[ind : ind + 1]
-                batch_labels: Optional[List[str]] = (
+                batch_labels: Optional[Union[npt.NDArray[np.str_], List[str]]] = (
                     None if labels is None else labels[ind : ind + 1]
                 )
                 (
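A small sketch of the numpy.typing annotations adopted in this file: npt.NDArray parametrizes ndarray by dtype, so string-labeled arrays, plain lists, and None can all share one annotation.

    from typing import List, Optional, Union
    import numpy as np
    import numpy.typing as npt

    labels: Optional[Union[List[str], npt.NDArray[np.str_]]]
    labels = np.array(["PAD", "UNKNOWN"], dtype=np.str_)  # ndarray of str_ is accepted
    labels = ["PAD", "UNKNOWN"]                           # so is a plain list
    labels = None                                         # and so is None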
3 changes: 2 additions & 1 deletion dataprofiler/labelers/labeler_utils.py

@@ -79,7 +79,7 @@ def evaluate_accuracy(
     predicted_entities_in_index: List[List[int]],
     true_entities_in_index: List[List[int]],
     num_labels: int,
-    entity_rev_dict: Dict,
+    entity_rev_dict: Dict[int, str],
     verbose: bool = True,
     omitted_labels: Tuple[str, ...] = ("PAD", "UNKNOWN"),
     confusion_matrix_file: str = None,
@@ -125,6 +125,7 @@ def evaluate_accuracy(
     true_labels_flatten = np.hstack(true_labels_padded)  # type: ignore
     predicted_labels_flatten = np.hstack(predicted_entities_in_index)

+    all_labels: List[str] = []
    if entity_rev_dict:
        all_labels = [entity_rev_dict[key] for key in sorted(entity_rev_dict.keys())]
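The `all_labels` line illustrates a common strict-mypy fix: initialize with a typed default before the conditional so the name is bound, and typed, on every path. A minimal sketch with sample data:

    from typing import Dict, List

    entity_rev_dict: Dict[int, str] = {1: "PAD", 2: "UNKNOWN"}  # sample mapping

    all_labels: List[str] = []  # bound and typed even when the dict is empty
    if entity_rev_dict:
        all_labels = [entity_rev_dict[key] for key in sorted(entity_rev_dict)]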
1 change: 1 addition & 0 deletions dataprofiler/profilers/base_column_profilers.py

@@ -267,6 +267,7 @@ def __init__(self, name: Optional[str]) -> None:
         # Number of values that match the column type. eg. how many floats match
         # in the float column
         self.match_count: int = 0
+        self.sample_size: int  # inherited from BaseColumnProfiler

     def _update_column_base_properties(self, profile: Dict) -> None:
         """
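The added line is a bare annotation: it declares the attribute's type for mypy without assigning, so the value set by the parent class is untouched. A minimal sketch of the pattern:

    class Parent:
        def __init__(self) -> None:
            self.sample_size = 0

    class Child(Parent):
        def __init__(self) -> None:
            super().__init__()
            self.sample_size: int  # declaration only; no assignment happens

    c = Child()
    print(c.sample_size)  # 0, inherited value preserved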
8 changes: 5 additions & 3 deletions dataprofiler/profilers/float_column_profile.py

@@ -3,7 +3,7 @@

 import copy
 import re
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 import numpy as np
 import pandas as pd
@@ -285,7 +285,9 @@ def _get_float_precision(
         return subset_precision

     @classmethod
-    def _is_each_row_float(cls, df_series: pd.Series) -> List[bool]:
+    def _is_each_row_float(
+        cls, df_series: pd.Series
+    ) -> Union[List[bool], pd.Series[bool]]:
         """
         Determine if each value in a dataframe is a float.

@@ -297,7 +299,7 @@ def _is_each_row_float(
         :param df_series: series of values to evaluate
         :type df_series: pandas.core.series.Series
         :return: is_float_col
-        :rtype: list
+        :rtype: Union[List[bool], pandas.Series[bool]]
         """
         if len(df_series) == 0:
             return list()
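One subtlety with the new return annotation: depending on the pandas version, pd.Series may not be subscriptable at runtime, so spellings like pd.Series[bool] usually rely on annotations staying unevaluated, e.g. under `from __future__ import annotations` or as string literals. A sketch under that assumption, with an illustrative body:

    from __future__ import annotations  # keeps pd.Series[bool] unevaluated

    from typing import List, Union
    import pandas as pd

    def is_each_row_float(df_series: pd.Series) -> Union[List[bool], pd.Series[bool]]:
        if len(df_series) == 0:
            return []
        return df_series.map(lambda value: isinstance(value, float))

    print(is_each_row_float(pd.Series([1.0, "x", 2.5])).tolist())  # [True, False, True]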
8 changes: 4 additions & 4 deletions dataprofiler/profilers/graph_profiler.py

@@ -4,7 +4,7 @@
 import pickle
 from collections import defaultdict
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union, cast

 import networkx as nx
 import numpy as np
@@ -330,12 +330,12 @@ def _update_categorical_distribution(
     @BaseColumnProfiler._timeit(name="num_nodes")
     def _get_num_nodes(self, graph: nx.Graph) -> int:
         """Compute the number of nodes."""
-        return graph.number_of_nodes()
+        return cast(int, graph.number_of_nodes())

     @BaseColumnProfiler._timeit(name="num_edges")
     def _get_num_edges(self, graph: nx.Graph) -> int:
         """Compute the number of edges."""
-        return graph.number_of_edges()
+        return cast(int, graph.number_of_edges())

     @BaseColumnProfiler._timeit(name="categorical_attributes")
     def _get_categorical_attributes(self, graph: nx.Graph) -> List[str]:
@@ -362,7 +362,7 @@ def _get_global_max_component_size(self, graph: nx.Graph) -> int:
             nx.connected_components(graph), key=len, reverse=True
         )
         largest_component: nx.Graph = graph.subgraph(graph_connected_components[0])
-        return largest_component.size()
+        return cast(int, largest_component.size())
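The graph changes repeat the cast() idiom for networkx, which mypy treats as untyped here: the calls come back as Any, and cast(int, ...) pins the declared return type without any runtime check. A runnable sketch:

    from typing import cast
    import networkx as nx

    def get_num_nodes(graph: nx.Graph) -> int:
        # number_of_nodes() is Any to mypy without stubs; cast() narrows it.
        return cast(int, graph.number_of_nodes())

    print(get_num_nodes(nx.Graph([(1, 2), (2, 3)])))  # 3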