diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml new file mode 100644 index 000000000..8164f174e --- /dev/null +++ b/.github/workflows/pipeline.yaml @@ -0,0 +1,25 @@ +name: Test pipeline +on: + push: + branches: + - main + pull_request: +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11'] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install --upgrade pip + pip install poetry==1.4.0 + poetry install + - name: Test with pre-commit + run: poetry run pre-commit run --all-files --show-diff-on-failure diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..45dc504b0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +ci: + autoupdate_branch: "main" + autoupdate_schedule: monthly +repos: + - repo: local + hooks: + - id: pylint + name: pylint + entry: pylint + language: system + types: [python] + files: "^express/" + args: + [ + "-rn", # Only display messages + "-sn", # Don't display the score + "--disable=fixme" + ] diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index c3b5c5064..000000000 --- a/.pylintrc +++ /dev/null @@ -1,587 +0,0 @@ -[MASTER] - -# A comma-separated list of package or module names from where C extensions may -# be loaded. Extensions are loading into the active Python interpreter and may -# run arbitrary code. -extension-pkg-whitelist=cv2 - -# Add files or directories to the blacklist. They should be base names, not -# paths. -ignore=CVS - -# Add files or directories matching the regex patterns to the blacklist. The -# regex matches against base names, not paths. -ignore-patterns= - -# Python code to execute, usually for sys.path manipulation such as -# pygtk.require(). -#init-hook= - -# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the -# number of processors available to use. -jobs=1 - -# Control the amount of potential inferred values when inferring a single -# object. This can help the performance when dealing with large functions or -# complex, nested conditions. -limit-inference-results=100 - -# List of plugins (as comma separated values of python module names) to load, -# usually to register additional checkers. -load-plugins= - -# Pickle collected data for later comparisons. -persistent=yes - -# Specify a configuration file. -#rcfile= - -# When enabled, pylint would attempt to guess common misconfiguration and emit -# user-friendly hints instead of false-positive error messages. -suggestion-mode=yes - -# Allow loading of arbitrary C extensions. Extensions are imported into the -# active Python interpreter and may run arbitrary code. -unsafe-load-any-extension=no - - -[MESSAGES CONTROL] - -# Only show warnings with the listed confidence levels. Leave empty to show -# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. -confidence= - -# Disable the message, report, category or checker with the given id(s). You -# can either give multiple identifiers separated by comma (,) or put this -# option multiple times (only on the command line, not in the configuration -# file where it should appear only once). You can also use "--disable=all" to -# disable everything first and then reenable specific checks. For example, if -# you want to run only the similarities checker, you can use "--disable=all -# --enable=similarities". If you want to run only the classes checker, but have -# no Warning level messages displayed, use "--disable=all --enable=classes -# --disable=W". -disable=print-statement, - parameter-unpacking, - unpacking-in-except, - old-raise-syntax, - backtick, - long-suffix, - old-ne-operator, - old-octal-literal, - import-star-module-level, - non-ascii-bytes-literal, - raw-checker-failed, - bad-inline-option, - locally-disabled, - file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - apply-builtin, - basestring-builtin, - buffer-builtin, - cmp-builtin, - coerce-builtin, - execfile-builtin, - file-builtin, - long-builtin, - raw_input-builtin, - reduce-builtin, - standarderror-builtin, - unicode-builtin, - xrange-builtin, - coerce-method, - delslice-method, - getslice-method, - setslice-method, - no-absolute-import, - old-division, - dict-iter-method, - dict-view-method, - next-method-called, - metaclass-assignment, - indexing-exception, - raising-string, - reload-builtin, - oct-method, - hex-method, - nonzero-method, - cmp-method, - input-builtin, - round-builtin, - intern-builtin, - unichr-builtin, - map-builtin-not-iterating, - zip-builtin-not-iterating, - range-builtin-not-iterating, - filter-builtin-not-iterating, - using-cmp-argument, - eq-without-hash, - div-method, - idiv-method, - rdiv-method, - exception-message-attribute, - invalid-str-codec, - sys-max-int, - bad-python3-import, - deprecated-string-function, - deprecated-str-translate-call, - deprecated-itertools-function, - deprecated-types-field, - next-method-defined, - dict-items-not-iterating, - dict-keys-not-iterating, - dict-values-not-iterating, - deprecated-operator-function, - deprecated-urllib-function, - xreadlines-attribute, - deprecated-sys-function, - exception-escape, - comprehension-escape, - no-else-return, # Explicit elif / else can improve readability - bad-continuation, # Checked by pycodestyle - missing-final-newline, # Too strict to enforce - trailing-newlines # Too strict to enforce - -# Enable the message, report, category or checker with the given id(s). You can -# either give multiple identifier separated by comma (,) or put this option -# multiple time (only on the command line, not in the configuration file where -# it should appear only once). See also the "--disable" option for examples. -enable=c-extension-no-member - - -[REPORTS] - -# Python expression which should return a score less than or equal to 10. You -# have access to the variables 'error', 'warning', 'refactor', and 'convention' -# which contain the number of messages in each category, as well as 'statement' -# which is the total number of statements analyzed. This score is used by the -# global evaluation report (RP0004). -evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) - -# Template used to display messages. This is a python new-style format string -# used to format the message information. See doc for all details. -#msg-template= - -# Set the output format. Available formats are text, parseable, colorized, json -# and msvs (visual studio). You can also give a reporter class, e.g. -# mypackage.mymodule.MyReporterClass. -output-format=text - -# Tells whether to display a full report or only the messages. -reports=no - -# Activate the evaluation score. -score=yes - - -[REFACTORING] - -# Maximum number of nested blocks for function / method body -max-nested-blocks=5 - -# Complete name of functions that never returns. When checking for -# inconsistent-return-statements if a never returning function is called then -# it will be considered as an explicit return statement and no message will be -# printed. -never-returning-functions=sys.exit - - -[VARIABLES] - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid defining new builtins when possible. -additional-builtins= - -# Tells whether unused global variables should be treated as a violation. -allow-global-unused-variables=yes - -# List of strings which can identify a callback function by name. A callback -# name must start or end with one of those strings. -callbacks=cb_, - _cb - -# A regular expression matching the name of dummy variables (i.e. expected to -# not be used). -dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ - -# Argument names that match this expression will be ignored. Default to name -# with leading underscore. -ignored-argument-names=_.*|^ignored_|^unused_ - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# List of qualified module names which can have objects that can redefine -# builtins. -redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io - - -[MISCELLANEOUS] - -# List of note tags to take in consideration, separated by a comma. -notes=FIXME, - XXX - - -[SPELLING] - -# Limits count of emitted suggestions for spelling mistakes. -max-spelling-suggestions=4 - -# Spelling dictionary name. Available dictionaries: none. To make it work, -# install the python-enchant package. -spelling-dict= - -# List of comma separated words that should not be checked. -spelling-ignore-words= - -# A path to a file that contains the private dictionary; one word per line. -spelling-private-dict-file= - -# Tells whether to store unknown words to the private dictionary (see the -# --spelling-private-dict-file option) instead of raising a message. -spelling-store-unknown-words=no - - -[TYPECHECK] - -# List of decorators that produce context managers, such as -# contextlib.contextmanager. Add to this list to register other decorators that -# produce valid context managers. -contextmanager-decorators=contextlib.contextmanager - -# List of members which are set dynamically and missed by pylint inference -# system, and so shouldn't trigger E1101 when accessed. Python regular -# expressions are accepted. -generated-members= - -# Tells whether missing members accessed in mixin class should be ignored. A -# mixin class is detected if its name ends with "mixin" (case insensitive). -ignore-mixin-members=yes - -# Tells whether to warn about missing members when the owner of the attribute -# is inferred to be None. -ignore-none=yes - -# This flag controls whether pylint should warn about no-member and similar -# checks whenever an opaque object is returned when inferring. The inference -# can return multiple potential results while evaluating a Python object, but -# some branches might not be evaluated, which results in partial inference. In -# that case, it might be useful to still emit no-member and other checks for -# the rest of the inferred objects. -ignore-on-opaque-inference=yes - -# List of class names for which member attributes should not be checked (useful -# for classes with dynamically set attributes). This supports the use of -# qualified names. -ignored-classes=optparse.Values,thread._local,_thread._local - -# List of module names for which member attributes should not be checked -# (useful for modules/projects where namespaces are manipulated during runtime -# and thus existing member attributes cannot be deduced by static analysis). It -# supports qualified module names, as well as Unix pattern matching. -ignored-modules=cv2 - -# Show a hint with possible names when a member name was not found. The aspect -# of finding the hint is based on edit distance. -missing-member-hint=yes - -# The minimum edit distance a name should have in order to be considered a -# similar match for a missing member name. -missing-member-hint-distance=1 - -# The total number of similar names that should be taken in consideration when -# showing a hint for a missing member. -missing-member-max-choices=1 - -# List of decorators that change the signature of a decorated function. -signature-mutators= - - -[BASIC] - -# Naming style matching correct argument names. -argument-naming-style=snake_case - -# Regular expression matching correct argument names. Overrides argument- -# naming-style. -#argument-rgx= - -# Naming style matching correct attribute names. -attr-naming-style=snake_case - -# Regular expression matching correct attribute names. Overrides attr-naming- -# style. -#attr-rgx= - -# Bad variable names which should always be refused, separated by a comma. -bad-names=foo, - bar, - baz, - toto, - tutu, - tata - -# Naming style matching correct class attribute names. -class-attribute-naming-style=any - -# Regular expression matching correct class attribute names. Overrides class- -# attribute-naming-style. -#class-attribute-rgx= - -# Naming style matching correct class names. -class-naming-style=PascalCase - -# Regular expression matching correct class names. Overrides class-naming- -# style. -#class-rgx= - -# Naming style matching correct constant names. -const-naming-style=UPPER_CASE - -# Regular expression matching correct constant names. Overrides const-naming- -# style. -#const-rgx= - -# Minimum line length for functions/classes that require docstrings, shorter -# ones are exempt. -docstring-min-length=-1 - -# Naming style matching correct function names. -function-naming-style=snake_case - -# Regular expression matching correct function names. Overrides function- -# naming-style. -#function-rgx= - -# Good variable names which should always be accepted, separated by a comma. -good-names=e, - f, - i, - j, - k, - df, - ex, - Run, - _ - -# Include a hint for the correct naming format with invalid-name. -include-naming-hint=no - -# Naming style matching correct inline iteration names. -inlinevar-naming-style=any - -# Regular expression matching correct inline iteration names. Overrides -# inlinevar-naming-style. -#inlinevar-rgx= - -# Naming style matching correct method names. -method-naming-style=snake_case - -# Regular expression matching correct method names. Overrides method-naming- -# style. -#method-rgx= - -# Naming style matching correct module names. -module-naming-style=snake_case - -# Regular expression matching correct module names. Overrides module-naming- -# style. -#module-rgx= - -# Colon-delimited sets of names that determine each other's naming style when -# the name regexes allow several styles. -name-group= - -# Regular expression which should only match function or class names that do -# not require a docstring. -no-docstring-rgx=^_ - -# List of decorators that produce properties, such as abc.abstractproperty. Add -# to this list to register other decorators that produce valid properties. -# These decorators are taken in consideration only for invalid-name. -property-classes=abc.abstractproperty - -# Naming style matching correct variable names. -variable-naming-style=snake_case - -# Regular expression matching correct variable names. Overrides variable- -# naming-style. -#variable-rgx= - - -[STRING] - -# This flag controls whether the implicit-str-concat-in-sequence should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=no - - -[LOGGING] - -# Format style used to check logging format string. `old` means using % -# formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. -logging-format-style=old - -# Logging modules to check that the string format arguments are in logging -# function parameter format. -logging-modules=logging - - -[FORMAT] - -# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. -expected-line-ending-format= - -# Regexp for a line that is allowed to be longer than the limit. -ignore-long-lines=^\s*(# )??$ - -# Number of spaces of indent required inside a hanging or continued line. -indent-after-paren=4 - -# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 -# tab). -indent-string=' ' - -# Maximum number of characters on a single line. -max-line-length=100 - -# Maximum number of lines in a module. -max-module-lines=1000 - -# List of optional constructs for which whitespace checking is disabled. `dict- -# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. -# `trailing-comma` allows a space between comma and closing bracket: (a, ). -# `empty-line` allows space-only lines. -no-space-check=trailing-comma, - dict-separator - -# Allow the body of a class to be on the same line as the declaration if body -# contains single statement. -single-line-class-stmt=no - -# Allow the body of an if to be on the same line as the test if there is no -# else. -single-line-if-stmt=no - - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no - -# Minimum lines number of a similarity. -min-similarity-lines=7 - - -[DESIGN] - -# Maximum number of arguments for function / method. -max-args=5 - -# Maximum number of attributes for a class (see R0902). -max-attributes=7 - -# Maximum number of boolean expressions in an if statement (see R0916). -max-bool-expr=5 - -# Maximum number of branch for function / method body. -max-branches=12 - -# Maximum number of locals for function / method body. -max-locals=15 - -# Maximum number of parents for a class (see R0901). -max-parents=7 - -# Maximum number of public methods for a class (see R0904). -max-public-methods=20 - -# Maximum number of return / yield for function / method body. -max-returns=6 - -# Maximum number of statements in function / method body. -max-statements=50 - -# Minimum number of public methods for a class (see R0903). -min-public-methods=2 - - -[IMPORTS] - -# List of modules that can be imported at any level, not just the top level -# one. -allow-any-import-level= - -# Allow wildcard imports from modules that define __all__. -allow-wildcard-with-all=no - -# Analyse import fallback blocks. This can be used to support both Python 2 and -# 3 compatible code, which means that the block might have code that exists -# only in one or another interpreter, leading to false positives when analysed. -analyse-fallback-blocks=no - -# Deprecated modules which should not be used, separated by a comma. -deprecated-modules=optparse,tkinter.tix - -# Create a graph of external dependencies in the given file (report RP0402 must -# not be disabled). -ext-import-graph= - -# Create a graph of every (i.e. internal and external) dependencies in the -# given file (report RP0402 must not be disabled). -import-graph= - -# Create a graph of internal dependencies in the given file (report RP0402 must -# not be disabled). -int-import-graph= - -# Force import order to recognize a module as part of the standard -# compatibility libraries. -known-standard-library= - -# Force import order to recognize a module as part of a third party library. -known-third-party=enchant - -# Couples of modules and preferred modules, separated by a comma. -preferred-modules= - - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__, - __new__, - setUp, - __post_init__ - -# List of member names, which should be excluded from the protected access -# warning. -exclude-protected=_asdict, - _fields, - _replace, - _source, - _make - -# List of valid names for the first argument in a class method. -valid-classmethod-first-arg=cls - -# List of valid names for the first argument in a metaclass class method. -valid-metaclass-classmethod-first-arg=cls - - -[EXCEPTIONS] - -# Exceptions that will emit a warning when being caught. Defaults to -# "BaseException, Exception". -overgeneral-exceptions=BaseException, - Exception diff --git a/README.md b/README.md index 4d0e4092b..55f7f902a 100644 --- a/README.md +++ b/README.md @@ -87,5 +87,13 @@ Example use cases of Express include: Check out the [examples folder](examples) for some illustrations. +## Contributing +We use [poetry](https://python-poetry.org/docs/) and pre-commit to enable a smooth developer flow. Run the following commands to +set up your development environment: +```commandline +pip install poetry +poetry install +pre-commit install +``` \ No newline at end of file diff --git a/express/components/__init__.py b/express/components/__init__.py index a9429195d..be2e64770 100644 --- a/express/components/__init__.py +++ b/express/components/__init__.py @@ -1 +1,3 @@ -from .pandas_components import PandasLoaderComponent, PandasTransformComponent \ No newline at end of file +"""Module holding base components to build on.""" + +from .pandas_components import PandasLoaderComponent, PandasTransformComponent diff --git a/express/components/common.py b/express/components/common.py index 5257b8151..3dbc84889 100644 --- a/express/components/common.py +++ b/express/components/common.py @@ -18,11 +18,11 @@ STORAGE_MODULE_PATH = StorageHandlerModule().to_dict()[os.environ.get('CLOUD_ENV', 'GCP')] STORAGE_HANDLER = importlib.import_module(STORAGE_MODULE_PATH).StorageHandler() -TIndex = TypeVar('TIndex') -TData = TypeVar('TData') +IndexT = TypeVar('IndexT') +DataT = TypeVar('DataT') -class ExpressDataset(ABC, Generic[TIndex, TData]): +class ExpressDataset(ABC, Generic[IndexT, DataT]): """ An abstract wrapper class that gives read access to Express Datasets. It can be extended to create a draft for a new (output) dataset. @@ -36,19 +36,19 @@ def __init__(self, manifest: DataManifest): self.manifest = manifest self._index_data = self.load_index() - def extend(self) -> 'ExpressDatasetDraft[TIndex, TData]': + def extend(self) -> 'ExpressDatasetDraft[IndexT, DataT]': """ Create an `ExpressDatasetDraft` that extends this dataset. """ return ExpressDatasetDraft.extend(self) @abstractmethod - def load_index(self) -> TIndex: + def load_index(self) -> IndexT: """ Loads the index data. """ - def load(self, data_source: str, for_index: Optional[TIndex] = None) -> TData: + def load(self, data_source: str, for_index: Optional[IndexT] = None) -> DataT: """ Load data from a named data source. @@ -70,7 +70,7 @@ def load(self, data_source: str, for_index: Optional[TIndex] = None) -> TData: @staticmethod @abstractmethod - def _load_data_source(data_source: DataSource, for_index: Optional[TIndex]) -> TData: + def _load_data_source(data_source: DataSource, index_filter: Optional[IndexT]) -> DataT: """ Load data from a (possibly remote) path. This method can be subclassed to present data in a specific way. For example, as a local @@ -78,7 +78,7 @@ def _load_data_source(data_source: DataSource, for_index: Optional[TIndex]) -> T Args: data_source (DataSource): The DataSource to load the data from. - for_index (Optional[TIndex]): Pass in an index to filter the data on. + index_filter (Optional[TIndex]): Pass in an index to filter the data on. Returns: TData: Data of type TData @@ -87,7 +87,7 @@ def _load_data_source(data_source: DataSource, for_index: Optional[TIndex]) -> T # TODO should we keep track of both input and output in the manifests? -class ExpressDatasetDraft(ABC, Generic[TIndex, TData]): +class ExpressDatasetDraft(ABC, Generic[IndexT, DataT]): """ Draft of an `ExpressDataset`, tracking both preexisting data sources and local data that still needs to be uploaded. @@ -103,11 +103,11 @@ class ExpressDatasetDraft(ABC, Generic[TIndex, TData]): """ def __init__(self, - index: Optional[Union[DataSource, TIndex]] = None, - data_sources: Dict[str, Union[DataSource, TData]] = None, - extending_dataset: Optional[ExpressDataset[TIndex, TData]] = None): + index: Optional[Union[DataSource, IndexT]] = None, + data_sources: Dict[str, Union[DataSource, DataT]] = None, + extending_dataset: Optional[ExpressDataset[IndexT, DataT]] = None): self.index = index - self.data_sources = data_sources or dict() + self.data_sources = data_sources or {} if not (extending_dataset is None) ^ (index is None): raise ValueError( "A dataset draft needs to have a single valid index. Either pass an index " @@ -124,14 +124,14 @@ def __init__(self, self.with_data_source(name, dataset, replace_ok=False) @classmethod - def extend(cls, dataset: ExpressDataset[TIndex, TData]) -> 'ExpressDatasetDraft[TIndex, TData]': + def extend(cls, dataset: ExpressDataset[IndexT, DataT]) -> 'ExpressDatasetDraft[IndexT, DataT]': """ Creates a new Express Dataset draft extending the given dataset, which will take over both its index and all data sources. """ return cls(extending_dataset=dataset) - def with_index(self, index: TData) -> 'ExpressDatasetDraft[TIndex, TData]': + def with_index(self, index: DataT) -> 'ExpressDatasetDraft[IndexT, DataT]': """ Replaces the current index with the given index. @@ -141,8 +141,8 @@ def with_index(self, index: TData) -> 'ExpressDatasetDraft[TIndex, TData]': self.index = index return self - def with_data_source(self, name: str, data: Union[TData, DataSource], replace_ok=False) \ - -> 'ExpressDatasetDraft[TIndex, TData]': + def with_data_source(self, name: str, data: Union[DataT, DataSource], replace_ok=False) \ + -> 'ExpressDatasetDraft[IndexT, DataT]': """ Adds a new data source or replaces a preexisting data source with the same name. @@ -166,7 +166,7 @@ def with_data_source(self, name: str, data: Union[TData, DataSource], replace_ok # pylint: disable=too-few-public-methods -class ExpressDatasetHandler(ABC, Generic[TIndex, TData]): +class ExpressDatasetHandler(ABC, Generic[IndexT, DataT]): """ Abstract mixin class to read from and write to Express Datasets. Can be subclassed to deal with a specific type of parsed data representations, like reading to a @@ -192,7 +192,7 @@ def _path_for_upload(metadata: Metadata, name: str) -> str: @classmethod @abstractmethod - def _load_dataset(cls, input_manifest: DataManifest) -> ExpressDataset[TIndex, TData]: + def _load_dataset(cls, input_manifest: DataManifest) -> ExpressDataset[IndexT, DataT]: """ Parses a manifest to an ExpressDataset of a specific type, for downstream use by transform components. @@ -200,7 +200,7 @@ def _load_dataset(cls, input_manifest: DataManifest) -> ExpressDataset[TIndex, T @classmethod @abstractmethod - def _upload_index(cls, index: TIndex, remote_path: str) -> DataSource: + def _upload_index(cls, index: IndexT, remote_path: str) -> DataSource: """ Uploads index data of a certain type as parquet and creates a new DataSource. @@ -214,7 +214,7 @@ def _upload_index(cls, index: TIndex, remote_path: str) -> DataSource: @classmethod @abstractmethod - def _upload_data_source(cls, name: str, data: TData, remote_path: str) -> DataSource: + def _upload_data_source(cls, name: str, data: DataT, remote_path: str) -> DataSource: """ Uploads data of a certain type as parquet and creates a new DataSource. @@ -265,7 +265,7 @@ def _update_metadata(cls, return Metadata.from_dict(metadata_dict) @classmethod - def _create_output_dataset(cls, draft: ExpressDatasetDraft[TIndex, TData], + def _create_output_dataset(cls, draft: ExpressDatasetDraft[IndexT, DataT], metadata: Metadata, save_path: str) -> DataManifest: """ Processes a dataset draft of a specific type, uploading all local data to storage and @@ -277,7 +277,7 @@ def _create_output_dataset(cls, draft: ExpressDatasetDraft[TIndex, TData], remote_path = cls._path_for_upload(metadata, "index") index = cls._upload_index(draft.index, remote_path) - data_sources = dict() + data_sources = {} for name, dataset in draft.data_sources.items(): if isinstance(dataset, DataSource): data_sources[name] = dataset @@ -290,11 +290,11 @@ def _create_output_dataset(cls, draft: ExpressDatasetDraft[TIndex, TData], metadata=metadata ) Path(save_path).parent.mkdir(parents=True, exist_ok=True) - Path(save_path).write_text(manifest.to_json()) + Path(save_path).write_text(manifest.to_json(), encoding="utf-8") return manifest -class ExpressTransformComponent(ExpressDatasetHandler, Generic[TIndex, TData]): +class ExpressTransformComponent(ExpressDatasetHandler, Generic[IndexT, DataT]): """ An abstract component that facilitates end-to-end transformation of Express Datasets. It can be subclassed or used with a mixin to support reading and writing of a specific data @@ -347,9 +347,9 @@ def _parse_args(cls): @classmethod @abstractmethod - def transform(cls, data: ExpressDataset[TIndex, TData], + def transform(cls, data: ExpressDataset[IndexT, DataT], extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) \ - -> ExpressDatasetDraft[TIndex, TData]: + -> ExpressDatasetDraft[IndexT, DataT]: """ Applies transformations to the input dataset and creates a draft for a new dataset. The recommended pattern for a transform is to extend the input dataset with a filtered index @@ -371,7 +371,7 @@ def transform(cls, data: ExpressDataset[TIndex, TData], """ -class ExpressLoaderComponent(ExpressDatasetHandler, Generic[TIndex, TData]): +class ExpressLoaderComponent(ExpressDatasetHandler, Generic[IndexT, DataT]): """ An abstract component that facilitates creation of a new Express Dataset. This will commonly be the first component in an Express Pipeline. It can be subclassed or used @@ -421,7 +421,7 @@ def _parse_args(cls): @classmethod @abstractmethod def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) -> \ - ExpressDatasetDraft[TIndex, TData]: + ExpressDatasetDraft[IndexT, DataT]: """ Loads data from an arbitrary source to create a draft for a new dataset. @@ -432,4 +432,4 @@ def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = No Returns: ExpressDatasetDraft[TIndex, TData]: draft of output dataset, to be uploaded after this loader completes. - """ \ No newline at end of file + """ diff --git a/express/components/pandas_components.py b/express/components/pandas_components.py index f80cd64de..0a44e8fcb 100644 --- a/express/components/pandas_components.py +++ b/express/components/pandas_components.py @@ -8,10 +8,15 @@ import pandas as pd -from .common import ExpressDatasetHandler, ExpressDataset, ExpressTransformComponent, \ - ExpressDatasetDraft, ExpressLoaderComponent -from express.storage_interface import StorageHandlerModule +from express.components.common import ( + ExpressDatasetHandler, + ExpressDataset, + ExpressTransformComponent, + ExpressDatasetDraft, + ExpressLoaderComponent +) from express.manifest import DataManifest, DataSource, DataType +from express.storage_interface import StorageHandlerModule # Define interface of pandas draft # pylint: disable=unsubscriptable-object @@ -36,7 +41,7 @@ def load_index(self) -> pd.Series: @staticmethod def _load_data_source(data_source: DataSource, index_filter: Union[pd.DataFrame, pd.Series, List[str]]) -> pd.DataFrame: - if data_source.type != DataType.parquet: + if data_source.type != DataType.PARQUET: raise TypeError("Only reading from parquet is currently supported.") with tempfile.TemporaryDirectory() as tmp_dir: @@ -78,7 +83,7 @@ def _upload_parquet(data: Union[pd.DataFrame, pd.Series], destination=fully_qualified_blob_path) return DataSource( location=fully_qualified_blob_path, - type=DataType.parquet, + type=DataType.PARQUET, extensions=["parquet"], n_files=1, n_items=len(data) @@ -127,4 +132,4 @@ class PandasLoaderComponent(PandasDatasetHandler, ExpressLoaderComponent[List[st @abstractmethod def load(cls, extra_args: Optional[Dict[str, Union[str, int, float, bool]]] = None) \ -> PandasDatasetDraft: - """Load initial dataset""" \ No newline at end of file + """Load initial dataset""" diff --git a/express/gcp_storage.py b/express/gcp_storage.py index 67841f71c..3c1188a9b 100644 --- a/express/gcp_storage.py +++ b/express/gcp_storage.py @@ -11,7 +11,7 @@ from express.storage_interface import StorageHandlerInterface, DecodedBlobPath from express import io -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class StorageHandler(StorageHandlerInterface): @@ -87,7 +87,7 @@ def copy_folder(source: str, destination: str, copy_source_content: bool = False ["gsutil", '-o', '"GSUtil:use_gcloud_storage=True"', '-q', "-m", "cp", "-r", source, destination], check=True) - LOGGER.info("Copying folder from %s to %s [DONE]", source, destination) + logger.info("Copying folder from %s to %s [DONE]", source, destination) folder_name = io.get_file_name(source) @@ -107,19 +107,19 @@ def copy_files(source_files: List[str], destination: str): # Write file paths to a text file before piping with tempfile.TemporaryDirectory() as temp_folder: upload_text_file = os.path.join(temp_folder, "files_to_upload.txt") - with open(upload_text_file, "w") as out_file: + with open(upload_text_file, "w", encoding="utf-8") as out_file: for file in source_files: out_file.write(file) out_file.write("\n") # Write files to tmp director - pipe_file_list = subprocess.Popen(["cat", upload_text_file], # nosec - stdout=subprocess.PIPE) - subprocess.call( # nosec - ['gsutil', '-o', '"GSUtil:use_gcloud_storage=True"', '-q', '-m', 'cp', '-I', - destination], stdin=pipe_file_list.stdout) + with subprocess.Popen(["cat", upload_text_file], stdout=subprocess.PIPE) \ + as pipe_file_list: + subprocess.call( # nosec + ['gsutil', '-o', '"GSUtil:use_gcloud_storage=True"', '-q', '-m', 'cp', '-I', + destination], stdin=pipe_file_list.stdout) - LOGGER.info("A total of %s files were copied to %s", len(source_files), destination) + logger.info("A total of %s files were copied to %s", len(source_files), destination) def copy_file(self, source_file: str, destination: str) -> str: """ @@ -133,4 +133,4 @@ def copy_file(self, source_file: str, destination: str) -> str: """ self.copy_files([source_file], destination) file_name = io.get_file_name(source_file, return_extension=True) - return os.path.join(destination, file_name) \ No newline at end of file + return os.path.join(destination, file_name) diff --git a/express/io.py b/express/io.py index a8117fa20..9011cb072 100644 --- a/express/io.py +++ b/express/io.py @@ -5,7 +5,7 @@ import logging import os -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) def get_file_extension(file_url: str) -> str: diff --git a/express/kfp.py b/express/kfp.py index 0f9451ba8..dc11189c4 100644 --- a/express/kfp.py +++ b/express/kfp.py @@ -2,9 +2,7 @@ import ast import logging -import torch - -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) def parse_kfp_list(kfp_parsed_string: str) -> list: diff --git a/express/logger.py b/express/logger.py index 3e1dbd945..6ef49da18 100644 --- a/express/logger.py +++ b/express/logger.py @@ -19,4 +19,4 @@ def configure_logging() -> None: handler = logging.StreamHandler(stream=sys.stdout) handler.setFormatter( logging.Formatter(fmt='[%(asctime)s | %(name)s | %(levelname)s] %(message)s')) - logger.addHandler(handler) \ No newline at end of file + logger.addHandler(handler) diff --git a/express/manifest.py b/express/manifest.py index 4c0005757..3e277c356 100644 --- a/express/manifest.py +++ b/express/manifest.py @@ -14,8 +14,8 @@ class DataType(str, Enum): """ Supported types for stored data. """ - parquet = 'parquet' - blob = 'blob' + PARQUET = 'parquet' + BLOB = 'blob' @classmethod def is_valid(cls, data_type: str) -> bool: @@ -90,7 +90,7 @@ class DataManifest: metadata: Metadata = field(default_factory=Metadata) # TODO: make mandatory during construction def __post_init__(self): - if (self.index.type != DataType.parquet) or (not DataType.is_valid(self.index.type)): + if (self.index.type != DataType.PARQUET) or (not DataType.is_valid(self.index.type)): raise TypeError("Index needs to be of type 'parquet'.") for name, dataset in self.data_sources.items(): if not DataType.is_valid(dataset.type): @@ -100,7 +100,7 @@ def __post_init__(self): @classmethod def from_path(cls, manifest_path): """Load data manifest from a given manifest path""" - with open(manifest_path) as f: - manifest_load = json.load(f) + with open(manifest_path, encoding="utf-8") as file_: + manifest_load = json.load(file_) # pylint: disable=no-member - return DataManifest.from_dict(manifest_load) \ No newline at end of file + return DataManifest.from_dict(manifest_load) diff --git a/express/storage_interface.py b/express/storage_interface.py index b9b656303..c53431561 100644 --- a/express/storage_interface.py +++ b/express/storage_interface.py @@ -83,9 +83,8 @@ def copy_folder(source: str, destination: str, copy_source_content: bool = False str: the path to the destination copied folder """ - @staticmethod @abstractmethod - def copy_file(source_file: str, destination: str) -> str: + def copy_file(self, source_file: str, destination: str) -> str: """ Function that copies source files from a remote/local source to a local/remote location respectively @@ -128,4 +127,4 @@ def copy_parquet(self, parquet_path: str, destination: str) -> str: parquet_path, destination) - return local_parquet_path \ No newline at end of file + return local_parquet_path diff --git a/pylint.py b/pylint.py deleted file mode 100644 index 0e0f05c7c..000000000 --- a/pylint.py +++ /dev/null @@ -1,104 +0,0 @@ -""" -Pylint script to run pylint from the root of a module. This ensures that -all paths and imports are resolved correctly. -""" - -import os -import sys -import logging -import argparse -import subprocess # nosec -from typing import Any -from pathlib import Path -from setuptools import find_packages - -# pylint: disable=invalid-name -parser = argparse.ArgumentParser() -module_parser = parser.add_mutually_exclusive_group() -module_parser.add_argument('--module', - help='Module to pylint', - type=str, - action='append') -module_parser.add_argument('--modules-file', - help='Path to file with modules', - type=str) - -GREEN, RED, NC = '\033[0;32m', '\033[0;31m', '\033[0m' - - -def install_requirements(module_path: str) -> None: - """Install the requirements.txt file of a module on best effort basis. If - the installation fails, the error is printed but ignored. - - Args: - module_path: Path to the root of the module. - - """ - try: - subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', # nosec - 'requirements.txt', '--force-reinstall'], - cwd=module_path, - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - except subprocess.CalledProcessError as e: - logging.error('Failed to install requirements') - logging.error(e.stderr.decode()) - - -def pylint_module(module_path: str) -> int: - """Run pylint on a module with correct path configuration. - - Args: - module_path: Path to the module. - - Returns: - Pylint return code. - - """ - print(f'Linting module {module_path}') - requirements = Path(os.path.join(module_path, "requirements.txt")) - if requirements.exists(): - install_requirements(module_path) - - dir_path = os.path.dirname(os.path.realpath(__file__)) - init_hook = f'import sys; sys.path.append(r"{module_path}")' - - process = subprocess.run(['pylint', module_path, # nosec - f'--rcfile={os.path.join(dir_path, ".pylintrc")}', - f'--init-hook={init_hook}'], - check=False) - - return process.returncode - - -def main(args: Any) -> None: - """Run pylint on provided modules.""" - if args.modules_file: - # Read in all modules to be pylinted - with open(args.modules_file, 'r') as f: - modules = [line.strip('\n') for line in f.readlines()] - elif args.module: - modules = args.module - else: - # If no modules were given, detect them automatically - logging.warning('No modules were supplied, detecting them automatically') - modules = [module for module in find_packages() if '.' not in module] - - logging.warning('Running pylint for %s', modules) - - # Run pylint for each module from the module directory - failures = [module for module in modules - if pylint_module(os.path.abspath(module)) != 0] - - # Print some encouraging messages with info - if failures: - print(RED + "Failure :'(" + NC) - print("Following modules failed: {}".format(failures)) - sys.exit(1) - else: - print(GREEN + 'Success!' + NC) - - -if __name__ == '__main__': - main(parser.parse_args()) diff --git a/pyproject.toml b/pyproject.toml index 681725f9d..71ffcbd69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ pandas = "^1.3.5" [tool.poetry.group.test.dependencies] bandit = { version = "^1.7.4", extras = ["toml"] } liccheck = "^0.7.3" -pylint = "^2.5.3" +pylint = "2.16.4" +pre-commit = "^3.1.1" [tool.bandit] exclude_dirs = ["*/mlpipelines/components/sd_finetuning_component/src/train_text_to_image.py"]