Skip to content

Commit

Permalink
Rewrite Builder without dask dependencies (#19)
Browse files Browse the repository at this point in the history
  • Loading branch information
andersy005 authored May 24, 2021
1 parent 36ffcf2 commit c8287c2
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 15 deletions.
12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
exclude: schema/generic_schema.yaml
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -11,7 +11,7 @@ repos:
- id: double-quote-string-fixer

- repo: https://github.com/ambv/black
rev: 21.4b2
rev: 21.5b1
hooks:
- id: black

Expand All @@ -21,7 +21,7 @@ repos:
- id: blackdoc

- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.1
rev: 3.9.2
hooks:
- id: flake8

Expand All @@ -35,15 +35,15 @@ repos:
- id: isort

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.2.1
rev: v2.3.0
hooks:
- id: prettier

- repo: https://github.com/nbQA-dev/nbQA
rev: 0.7.1
rev: 0.9.0
hooks:
- id: nbqa-black
additional_dependencies: [black==21.4b2]
additional_dependencies: [black==21.5b1]
- id: nbqa-pyupgrade
additional_dependencies: [pyupgrade==2.7.3]
- id: nbqa-isort
Expand Down
7 changes: 4 additions & 3 deletions ci/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@ channels:
dependencies:
- cf_xarray
- codecov
- dask
- intake-esm
- joblib
- nbsphinx
- ncar-jobqueue
- netcdf4
- numpydoc
- pip
- pre-commit
- pydantic
- pytest
- pytest-cov
- pytest-sugar
- pytest-xdist
- pyyaml
- rich
- typer
- xarray
2 changes: 1 addition & 1 deletion ecgtools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""Top-level module for ecgtools ."""
from pkg_resources import DistributionNotFound, get_distribution

from .core import Builder
from .builder import INVALID_ASSET, TRACEBACK, Builder

try:
__version__ = get_distribution(__name__).version
Expand Down
110 changes: 110 additions & 0 deletions ecgtools/builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import fnmatch
import itertools
import pathlib
import typing

import joblib
import pandas as pd
import pydantic

# Column names used to report files that could not be parsed: a parsing function
# returns a record keyed by these names, and `Builder.clean_dataframe` splits
# such records out of the catalog into `Builder.invalid_assets`.
INVALID_ASSET = 'INVALID_ASSET'
TRACEBACK = 'TRACEBACK'


@pydantic.dataclasses.dataclass
class Builder:
    """
    Generates a catalog from a list of files.

    The methods are designed to be chained, each returning the builder itself:
    ``Builder(...).get_directories().get_filelist().parse().clean_dataframe()``,
    followed by ``save(...)``.

    Parameters
    ----------
    root_path : str
        Path of root directory. Validated as an existing directory.
    extension : str, optional
        Glob pattern used to select files, by default "*.nc".
    depth : int, optional
        Directory depth used to pick the crawl starting points: directories located
        ``depth + 1`` levels below `root_path` are globbed recursively, by default 0.
        If no such directory exists, `root_path` itself is crawled.
    exclude_patterns : list, optional
        Directory, file patterns (``fnmatch`` style) to exclude during catalog
        generation, by default None
    parsing_func : callable, optional
        Function called with each file path; expected to return a dict-like record
        that becomes one row of the catalog, by default None
    njobs : int, optional
        The maximum number of concurrently running jobs, forwarded to
        ``joblib.Parallel`` as ``n_jobs``, by default -1
    """

    root_path: pydantic.types.DirectoryPath
    extension: str = '*.nc'
    depth: int = 0
    exclude_patterns: typing.Optional[typing.List[str]] = None
    parsing_func: typing.Optional[typing.Callable] = None
    njobs: int = -1

    def __post_init__(self):
        # Results of the successive pipeline stages; populated by the methods below.
        self.df = pd.DataFrame()
        self.invalid_assets = pd.DataFrame()
        self.dirs = None
        self.filelist = None
        self.entries = None

    def get_directories(self):
        """Collect the directories to crawl into ``self.dirs`` and return the builder."""
        # One '*/' per level: depth=0 selects the immediate sub-directories.
        pattern = '*/' * (self.depth + 1)
        dirs = [path for path in self.root_path.glob(pattern) if path.is_dir()]
        # Fall back to the root itself when nothing exists at the requested depth.
        self.dirs = dirs or [self.root_path]
        return self

    def get_filelist(self):
        """Get a list of files from a list of directories.

        Recursively globs every directory in ``self.dirs`` for ``self.extension``,
        drops paths matching any of ``self.exclude_patterns`` and stores the result
        in ``self.filelist``. Returns the builder.
        """

        def _is_included(path):
            return not any(
                fnmatch.fnmatch(path, pat=exclude_pattern)
                for exclude_pattern in self.exclude_patterns
            )

        def _glob_dir(directory, extension):
            return list(directory.rglob(f'{extension}'))

        per_directory = joblib.Parallel(n_jobs=self.njobs, verbose=5)(
            joblib.delayed(_glob_dir)(directory, self.extension) for directory in self.dirs
        )
        filelist = itertools.chain.from_iterable(per_directory)
        if self.exclude_patterns:
            filelist = filter(_is_included, filelist)
        self.filelist = list(filelist)
        return self

    def parse(self, parsing_func: typing.Optional[typing.Callable] = None):
        """Apply a parsing function to every file in ``self.filelist``.

        Parameters
        ----------
        parsing_func : callable, optional
            Overrides the `parsing_func` given at construction time, by default None

        Returns
        -------
        Builder
            The builder, with ``self.entries`` holding the raw records and
            ``self.df`` the dataframe built from them.

        Raises
        ------
        ValueError
            If neither the argument nor the instance provides a parsing function.
        """
        func = parsing_func or self.parsing_func
        if func is None:
            raise ValueError(f'`parsing_func` must be a valid Callable. Got {type(func)}')
        entries = joblib.Parallel(n_jobs=self.njobs, verbose=5)(
            joblib.delayed(func)(file) for file in self.filelist
        )
        self.entries = entries
        self.df = pd.DataFrame(entries)
        return self

    def clean_dataframe(self):
        """Move records flagged as invalid out of ``self.df`` into ``self.invalid_assets``.

        A record is considered invalid when its `INVALID_ASSET` column is not null.
        Does nothing when the dataframe has no `INVALID_ASSET` column. Returns the builder.
        """
        if INVALID_ASSET in self.df.columns:
            # A parser may report INVALID_ASSET without a TRACEBACK; only touch
            # the reporting columns that are actually present.
            report_columns = [
                column for column in (INVALID_ASSET, TRACEBACK) if column in self.df.columns
            ]
            is_invalid = self.df[INVALID_ASSET].notnull()
            self.invalid_assets = self.df.loc[is_invalid, report_columns]
            self.df = self.df.loc[~is_invalid].drop(columns=report_columns)
        return self

    def save(
        self,
        catalog_file: typing.Union[pathlib.Path, str],
        **kwargs,
    ):
        """Write the catalog to a CSV file.

        When invalid assets were collected, they are written next to the catalog
        in a file named ``invalid_assets_<catalog file name>``.

        Parameters
        ----------
        catalog_file : str or pathlib.Path
            Destination of the catalog CSV file.
        **kwargs
            Additional keyword arguments forwarded to ``pandas.DataFrame.to_csv``.
            ``index`` defaults to False when omitted.
        """
        catalog_file = pathlib.Path(catalog_file)
        # `index` is optional: pop with a default so omitting it does not raise KeyError.
        index = kwargs.pop('index', None) or False
        self.df.to_csv(catalog_file, index=index, **kwargs)
        if not self.invalid_assets.empty:
            invalid_assets_report_file = (
                catalog_file.parent / f'invalid_assets_{catalog_file.name}'
            )
            self.invalid_assets.to_csv(invalid_assets_report_file, index=False)
        print(f'Saved catalog location: {catalog_file}')
7 changes: 3 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
cf_xarray
dask[bag]
dask[delayed]
ncar-jobqueue
joblib
netCDF4
rich
typer
xarray
pyyaml
pydantic
pandas
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ select = B,C,E,F,W,T4,B9

[isort]
known_first_party=ecgtools
known_third_party=cf_xarray,dask,distributed,ncar_jobqueue,pandas,pkg_resources,rich,setuptools,typer,xarray
known_third_party=cf_xarray,dask,distributed,joblib,ncar_jobqueue,pandas,pkg_resources,pydantic,pytest,rich,setuptools,typer,xarray
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
Expand Down
101 changes: 101 additions & 0 deletions tests/test_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import os
import pathlib
import traceback

import pandas as pd
import pydantic
import pytest

from ecgtools import INVALID_ASSET, TRACEBACK, Builder

# Sample data used by the tests: the `sample_data` directory that sits one level
# above the directory containing this test module.
sample_data_dir = pathlib.Path(os.path.dirname(__file__)).parent / 'sample_data'


def parsing_func(file):
    """Minimal parser for the tests: wrap the given file in a one-key record."""
    record = {'path': file}
    return record


def parsing_func_errors(file):
    """Parser for the tests that reports the file as an invalid asset when parsing fails.

    Calls ``file.is_valid()``; if that raises, returns a record holding the file's
    POSIX path under `INVALID_ASSET` and the formatted traceback under `TRACEBACK`.
    Returns None when the call succeeds.
    """
    try:
        file.is_valid()
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate.
        return {INVALID_ASSET: file.as_posix(), TRACEBACK: traceback.format_exc()}
    return None


def test_root_path_error():
    """Building from the path 'test_directory' must fail pydantic validation."""
    missing_directory = 'test_directory'
    with pytest.raises(pydantic.ValidationError):
        Builder(missing_directory)


@pytest.mark.parametrize(
    'root_path',
    [
        sample_data_dir.joinpath('cmip', 'CMIP6'),
        sample_data_dir.joinpath('cmip', 'cmip5'),
        sample_data_dir.joinpath('cesm'),
    ],
)
def test_init(root_path):
    """A Builder can be constructed for each sample data directory."""
    Builder(root_path)


@pytest.mark.parametrize(
    'root_path',
    [
        sample_data_dir.joinpath('cmip', 'CMIP6'),
        sample_data_dir.joinpath('cmip', 'cmip5'),
        sample_data_dir.joinpath('cesm'),
    ],
)
def test_get_filelist(root_path):
    """Directory and file discovery both yield non-empty lists of pathlib.Path objects."""
    excluded = ['*/files/*', '*/latest/*']
    builder = Builder(root_path, exclude_patterns=excluded)

    # Stage 1: directory discovery.
    builder = builder.get_directories()
    assert builder.dirs
    assert isinstance(builder.dirs[0], pathlib.Path)

    # Stage 2: file discovery within those directories.
    builder = builder.get_filelist()
    assert builder.filelist
    assert isinstance(builder.filelist[0], pathlib.Path)


def test_parse_error():
    """Calling parse() without any parsing function raises ValueError."""
    builder = Builder(sample_data_dir / 'cesm')
    builder = builder.get_directories()
    builder = builder.get_filelist()

    with pytest.raises(ValueError):
        builder.parse()


@pytest.mark.parametrize(
    'root_path',
    [
        sample_data_dir.joinpath('cmip', 'CMIP6'),
        sample_data_dir.joinpath('cmip', 'cmip5'),
        sample_data_dir.joinpath('cesm'),
    ],
)
def test_parse(root_path):
    """The full pipeline produces dict entries and a non-empty dataframe."""
    builder = Builder(
        root_path,
        exclude_patterns=['*/files/*', '*/latest/*'],
        parsing_func=parsing_func,
    )
    builder = builder.get_directories().get_filelist().parse()

    assert builder.entries
    assert isinstance(builder.entries[0], dict)
    assert isinstance(builder.df, pd.DataFrame)
    assert not builder.df.empty


def test_parse_invalid_assets():
    """Records flagged by the failing parser end up in `invalid_assets` after cleaning."""
    builder = Builder(sample_data_dir / 'cesm')
    builder = builder.get_directories().get_filelist()
    builder = builder.parse(parsing_func=parsing_func_errors).clean_dataframe()

    assert not builder.invalid_assets.empty
    assert set(builder.invalid_assets.columns) == {INVALID_ASSET, TRACEBACK}

0 comments on commit c8287c2

Please sign in to comment.