Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Applying isort and resolving circular imports #453

Merged
merged 5 commits into from
Apr 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .isort.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[settings]
skip=dataprofiler/tests/data/
multi_line_output=2
14 changes: 6 additions & 8 deletions dataprofiler/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
from . import settings
from .data_readers.data import Data
from .profilers.profile_builder import StructuredProfiler, \
UnstructuredProfiler, Profiler
from .dp_logging import get_logger, set_verbosity
from .labelers.data_labelers import DataLabeler, StructuredDataLabeler, \
UnstructuredDataLabeler, train_structured_labeler
from .profilers.profile_builder import Profiler, StructuredProfiler, \
UnstructuredProfiler
from .profilers.profiler_options import ProfilerOptions
from .labelers.data_labelers import train_structured_labeler, DataLabeler, \
StructuredDataLabeler, \
UnstructuredDataLabeler
from .reports import graphs
from .validators.base_validators import Validator
from .dp_logging import set_verbosity, get_logger
from .version import __version__
from . import settings


try:
import snappy
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/data_readers/avro_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler
import fastavro

from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler

from . import data_utils
from .base_data import BaseData
from .json_data import JSONData
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/data_readers/base_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import sys
import locale
import sys
from collections import OrderedDict
from io import StringIO

import numpy as np
import pandas as pd
from io import StringIO

from .. import dp_logging
from . import data_utils
Expand Down
9 changes: 4 additions & 5 deletions dataprofiler/data_readers/csv_data.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import csv
from io import BytesIO
import re
from six import StringIO

import random
import re
from collections import Counter
from io import BytesIO

import numpy as np
from six import StringIO

from . import data_utils
from .avro_data import AVROData
from .base_data import BaseData
from .json_data import JSONData
from .parquet_data import ParquetData
from .avro_data import AVROData
from .structured_mixins import SpreadSheetDataMixin


Expand Down
11 changes: 5 additions & 6 deletions dataprofiler/data_readers/data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import absolute_import, division

from .. import dp_logging
from .avro_data import AVROData
from .csv_data import CSVData
from .data_utils import is_valid_url, url_to_bytes
from .json_data import JSONData
from .text_data import TextData
from .parquet_data import ParquetData
from .avro_data import AVROData
from .data_utils import is_valid_url, url_to_bytes
from .. import dp_logging
from .text_data import TextData

logger = dp_logging.get_child_logger(__name__)

Expand Down
31 changes: 8 additions & 23 deletions dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from builtins import next
import re
import json
from io import open, StringIO, BytesIO, TextIOWrapper
from collections import OrderedDict
import dateutil
import requests
import re
import urllib
from builtins import next
from collections import OrderedDict
from io import BytesIO, StringIO, TextIOWrapper, open

import dateutil
import pandas as pd
import pyarrow.parquet as pq
import requests
from chardet.universaldetector import UniversalDetector

from .filepath_or_buffer import FileOrBufferHandler
from .. import dp_logging
from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer

logger = dp_logging.get_child_logger(__name__)

Expand Down Expand Up @@ -395,7 +395,7 @@ def _decode_is_valid(encoding):
if not _decode_is_valid(encoding):
try:
from charset_normalizer import CharsetNormalizerMatches as CnM

# Try with small sample
with FileOrBufferHandler(file_path, 'rb') as input_file:
raw_data = input_file.read(10000)
Expand Down Expand Up @@ -605,21 +605,6 @@ def load_as_str_from_file(file_path, file_encoding=None, max_lines=10,
return data_as_str


def is_stream_buffer(filepath_or_buffer):
    """
    Determine whether the given argument is an in-memory stream buffer.

    Only ``StringIO`` and ``BytesIO`` instances (including instances of
    their subclasses) are treated as buffers; every other value, such as a
    file path string, is not.

    :param filepath_or_buffer: path to the file or buffer
    :type filepath_or_buffer: Union[str, StringIO, BytesIO]
    :return: True if the argument is a StringIO/BytesIO buffer, False
        otherwise
    :rtype: bool
    """
    # isinstance already yields the boolean we need; no branching required.
    return isinstance(filepath_or_buffer, (StringIO, BytesIO))


def is_valid_url(url_as_string):
"""
Determines whether a given string is a valid URL
Expand Down
19 changes: 16 additions & 3 deletions dataprofiler/data_readers/filepath_or_buffer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
from io import open, StringIO, BytesIO, TextIOWrapper
from io import BytesIO, StringIO, TextIOWrapper, open

from . import data_utils

def is_stream_buffer(filepath_or_buffer):
    """
    Determine whether the given argument is an in-memory stream buffer.

    Only ``StringIO`` and ``BytesIO`` instances (including instances of
    their subclasses) are treated as buffers; every other value, such as a
    file path string, is not.

    :param filepath_or_buffer: path to the file or buffer
    :type filepath_or_buffer: Union[str, StringIO, BytesIO]
    :return: True if the argument is a StringIO/BytesIO buffer, False
        otherwise
    :rtype: bool
    """
    # isinstance already yields the boolean we need; no branching required.
    return isinstance(filepath_or_buffer, (StringIO, BytesIO))


class FileOrBufferHandler:
Expand Down Expand Up @@ -43,7 +56,7 @@ def __enter__(self):
TextIOWrapper(self._filepath_or_buffer, encoding=self._encoding)
self._is_wrapped = True

elif not data_utils.is_stream_buffer(self._filepath_or_buffer):
elif not is_stream_buffer(self._filepath_or_buffer):
# Raise AttributeError if attribute value not found.
raise AttributeError(f'Type {type(self._filepath_or_buffer)} is '
f'invalid. filepath_or_buffer must be a '
Expand Down
7 changes: 4 additions & 3 deletions dataprofiler/data_readers/json_data.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from collections import OrderedDict
import json
import warnings
from six import StringIO
from collections import OrderedDict

import numpy as np
import pandas as pd
from six import StringIO

from . import data_utils
from .base_data import BaseData
from .structured_mixins import SpreadSheetDataMixin
from .filepath_or_buffer import FileOrBufferHandler
from .structured_mixins import SpreadSheetDataMixin


class JSONData(SpreadSheetDataMixin, BaseData):
Expand Down
6 changes: 4 additions & 2 deletions dataprofiler/data_readers/text_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import unicode_literals # at top of module
from __future__ import unicode_literals # at top of module
from __future__ import print_function
from past.builtins import basestring

from io import StringIO

from past.builtins import basestring

from . import data_utils
from .base_data import BaseData

Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/dp_logging.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import threading
import sys
import threading

_dp_logger = None
_dp_logger_lock = threading.Lock()
Expand Down
13 changes: 5 additions & 8 deletions dataprofiler/labelers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,12 @@
2. structured_model
3. regex_model
"""
# import models
from .base_data_labeler import BaseDataLabeler

# import data processors
from .data_processing import CharPreprocessor, CharPostprocessor, \
StructCharPreprocessor, StructCharPostprocessor, \
DirectPassPreprocessor, RegexPostProcessor

# import data labelers
# import models
from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler
from .data_labelers import DataLabeler, StructuredDataLabeler, \
UnstructuredDataLabeler
# import data processors
from .data_processing import CharPostprocessor, CharPreprocessor, \
DirectPassPreprocessor, RegexPostProcessor, StructCharPostprocessor, \
StructCharPreprocessor
6 changes: 3 additions & 3 deletions dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import sys
import json
import os
import sys
import warnings
import json
import pkg_resources

import numpy as np
import pandas as pd
import pkg_resources

from .. import data_readers
from . import data_processing
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ def num_labels(self):
def get_class(cls, class_name):

# Import possible internal models
from .regex_model import RegexModel
from .character_level_cnn_model import CharacterLevelCnnModel
from .regex_model import RegexModel

return cls._BaseModel__subclasses.get(class_name.lower(), None)

Expand Down
11 changes: 5 additions & 6 deletions dataprofiler/labelers/character_level_cnn_model.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import json
import copy
import json
import logging
import os
import sys
import time
import logging
from collections import defaultdict

import tensorflow as tf
import numpy as np
import tensorflow as tf
from sklearn import decomposition

from . import labeler_utils
from .base_model import BaseModel, BaseTrainableModel
from .base_model import AutoSubRegistrationMeta
from .. import dp_logging
from . import labeler_utils
from .base_model import AutoSubRegistrationMeta, BaseModel, BaseTrainableModel

_file_dir = os.path.dirname(os.path.abspath(__file__))

Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import pkg_resources

import pandas as pd
import pkg_resources

from .. import data_readers
from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler
Expand Down
13 changes: 6 additions & 7 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
import os
import pkg_resources
import abc
import types
import copy
import json
import inspect
from collections import Counter
import random
import json
import math
import os
import random
import types
import warnings
import copy
from collections import Counter

import numpy as np
import pkg_resources

default_labeler_dir = pkg_resources.resource_filename(
'resources', 'labelers'
Expand Down
4 changes: 2 additions & 2 deletions dataprofiler/labelers/labeler_utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import os
import warnings

import scipy
import numpy as np
import scipy
from sklearn.exceptions import UndefinedMetricWarning

from .classification_report_utils import classification_report
from .. import dp_logging
from .classification_report_utils import classification_report

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

Expand Down
7 changes: 3 additions & 4 deletions dataprofiler/labelers/regex_model.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import copy
import json
import os
import sys
import re
import copy
import sys

import numpy as np

from .base_model import BaseModel
from .base_model import AutoSubRegistrationMeta
from .. import dp_logging
from .base_model import AutoSubRegistrationMeta, BaseModel

logger = dp_logging.get_child_logger(__name__)

Expand Down
14 changes: 6 additions & 8 deletions dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
from .base_column_profilers import BaseColumnProfiler
from .numerical_column_stats import NumericStatsMixin
from .categorical_column_profile import CategoricalColumn
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .int_column_profile import IntColumn
from .float_column_profile import FloatColumn
from .text_column_profile import TextColumn

from .categorical_column_profile import CategoricalColumn
from .int_column_profile import IntColumn
from .numerical_column_stats import NumericStatsMixin
from .order_column_profile import OrderColumn

from .data_labeler_column_profile import DataLabelerColumn
from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler
from .text_column_profile import TextColumn
from .unstructured_labeler_profile import UnstructuredLabelerProfile

from .profile_builder import StructuredProfiler, UnstructuredProfiler, Profiler
"""
The purpose of this package is to provide statistics and predictions for a
given dataset.
Expand Down
Loading