-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rewrite
Builder
without dask dependencies (#19)
- Loading branch information
1 parent
36ffcf2
commit c8287c2
Showing
7 changed files
with
226 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
import fnmatch | ||
import itertools | ||
import pathlib | ||
import typing | ||
|
||
import joblib | ||
import pandas as pd | ||
import pydantic | ||
|
||
# Sentinel column names used to flag assets that failed parsing; rows carrying
# these columns are split out of the catalog by Builder.clean_dataframe().
INVALID_ASSET = 'INVALID_ASSET'
TRACEBACK = 'TRACEBACK'
|
||
|
||
@pydantic.dataclasses.dataclass
class Builder:
    """
    Generates a catalog from a list of files.

    Parameters
    ----------
    root_path : str
        Path of root directory.
    extension : str, optional
        File extension, by default '*.nc'.
    depth : int, optional
        Recursion depth. Recursively crawl `root_path` up to a specified depth, by default 0
    exclude_patterns : list, optional
        Directory, file patterns to exclude during catalog generation, by default None
    parsing_func : callable, optional
        Function applied to each file to produce a catalog entry (dict), by default None
    njobs : int, optional
        The maximum number of concurrently running jobs, by default -1 (all CPUs)
    """

    root_path: pydantic.types.DirectoryPath
    extension: str = '*.nc'
    depth: int = 0
    # Optional[...] makes the None defaults explicit instead of relying on
    # pydantic's implicit "default None implies optional" behavior.
    exclude_patterns: typing.Optional[typing.List[str]] = None
    parsing_func: typing.Optional[typing.Callable] = None
    njobs: int = -1

    def __post_init__(self):
        # Mutable state populated by the fluent pipeline:
        # get_directories() -> get_filelist() -> parse() -> clean_dataframe()
        self.df = pd.DataFrame()
        self.invalid_assets = pd.DataFrame()
        self.dirs = None
        self.filelist = None
        self.entries = None

    def get_directories(self):
        """Collect directories `depth` levels below `root_path`; fall back to the root itself."""
        pattern = '*/' * (self.depth + 1)
        dirs = [x for x in self.root_path.glob(pattern) if x.is_dir()]
        if not dirs:
            dirs = [self.root_path]
        self.dirs = dirs
        return self

    def get_filelist(self):
        """Get a list of files from a list of directories."""

        def _filter_files(filelist):
            # Keep only paths that match none of the exclude patterns.
            return not any(
                fnmatch.fnmatch(filelist, pat=exclude_pattern)
                for exclude_pattern in self.exclude_patterns
            )

        def _glob_dir(directory, extension):
            return list(directory.rglob(f'{extension}'))

        filelist = joblib.Parallel(n_jobs=self.njobs, verbose=5)(
            joblib.delayed(_glob_dir)(directory, self.extension) for directory in self.dirs
        )
        filelist = itertools.chain(*filelist)
        if self.exclude_patterns:
            filelist = list(filter(_filter_files, filelist))
        self.filelist = list(filelist)
        return self

    def parse(self, parsing_func: typing.Callable = None):
        """Apply `parsing_func` (or the instance default) to every file, in parallel.

        Raises
        ------
        ValueError
            If no parsing function was supplied here or at construction time.
        """
        func = parsing_func or self.parsing_func
        if func is None:
            # Fixed message typo: "must a valid" -> "must be a valid".
            raise ValueError(f'`parsing_func` must be a valid Callable. Got {type(func)}')
        entries = joblib.Parallel(n_jobs=self.njobs, verbose=5)(
            joblib.delayed(func)(file) for file in self.filelist
        )
        self.entries = entries
        self.df = pd.DataFrame(entries)
        return self

    def clean_dataframe(self):
        """Split rows flagged as invalid out of `self.df` into `self.invalid_assets`."""
        if INVALID_ASSET in self.df.columns:
            invalid_assets = self.df[self.df[INVALID_ASSET].notnull()][[INVALID_ASSET, TRACEBACK]]
            df = self.df[self.df[INVALID_ASSET].isnull()].drop(columns=[INVALID_ASSET, TRACEBACK])
            self.invalid_assets = invalid_assets
            self.df = df
        return self

    def save(
        self,
        catalog_file: typing.Union[pathlib.Path, str],
        **kwargs,
    ):
        """Write the catalog to `catalog_file` as CSV.

        Invalid assets (if any) are written to a sibling
        `invalid_assets_<name>` report file. Extra `kwargs` are forwarded to
        `DataFrame.to_csv`.
        """
        catalog_file = pathlib.Path(catalog_file)
        # BUG FIX: kwargs.pop('index') raised KeyError whenever 'index' was not
        # supplied; give it a default so `index` is genuinely optional.
        index = kwargs.pop('index', False)
        self.df.to_csv(catalog_file, index=index, **kwargs)
        if not self.invalid_assets.empty:
            invalid_assets_report_file = (
                catalog_file.parent / f'invalid_assets_{catalog_file.parts[-1]}'
            )
            self.invalid_assets.to_csv(invalid_assets_report_file, index=False)
        print(f'Saved catalog location: {catalog_file}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,8 @@ | ||
cf_xarray | ||
dask[bag] | ||
dask[delayed] | ||
ncar-jobqueue | ||
joblib | ||
netCDF4 | ||
rich | ||
typer | ||
xarray | ||
pyyaml | ||
pydantic | ||
pandas |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import os | ||
import pathlib | ||
import traceback | ||
|
||
import pandas as pd | ||
import pydantic | ||
import pytest | ||
|
||
from ecgtools import INVALID_ASSET, TRACEBACK, Builder | ||
|
||
# Fixture root: the `sample_data` directory that is a sibling of this test module's parent.
sample_data_dir = pathlib.Path(os.path.dirname(__file__)).parent / 'sample_data'
|
||
|
||
def parsing_func(file):
    """Minimal parser used by the tests: record only the file path."""
    return dict(path=file)
|
||
|
||
def parsing_func_errors(file):
    """Parser that always fails, returning an invalid-asset record with the traceback."""
    try:
        file.is_valid()
    except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
        return {INVALID_ASSET: file.as_posix(), TRACEBACK: traceback.format_exc()}
|
||
|
||
def test_root_path_error():
    """A nonexistent root path must be rejected by pydantic validation."""
    pytest.raises(pydantic.ValidationError, Builder, 'test_directory')
|
||
|
||
@pytest.mark.parametrize(
    'root_path',
    [sample_data_dir / part for part in ('cmip/CMIP6', 'cmip/cmip5', 'cesm')],
)
def test_init(root_path):
    """Builder construction succeeds for each sample data tree."""
    Builder(root_path)
|
||
|
||
@pytest.mark.parametrize(
    'root_path',
    [sample_data_dir / part for part in ('cmip/CMIP6', 'cmip/cmip5', 'cesm')],
)
def test_get_filelist(root_path):
    """Directory discovery then globbing populates `dirs` and `filelist` with Paths."""
    builder = Builder(
        root_path,
        exclude_patterns=['*/files/*', '*/latest/*'],
    ).get_directories()
    assert builder.dirs
    assert isinstance(builder.dirs[0], pathlib.Path)

    builder = builder.get_filelist()
    assert builder.filelist
    assert isinstance(builder.filelist[0], pathlib.Path)
|
||
|
||
def test_parse_error():
    """Calling parse() without any parsing function raises ValueError."""
    builder = Builder(sample_data_dir / 'cesm').get_directories().get_filelist()
    with pytest.raises(ValueError):
        builder.parse()
|
||
|
||
@pytest.mark.parametrize(
    'root_path',
    [sample_data_dir / part for part in ('cmip/CMIP6', 'cmip/cmip5', 'cesm')],
)
def test_parse(root_path):
    """The full pipeline yields per-file dict entries and a non-empty dataframe."""
    builder = Builder(
        root_path,
        exclude_patterns=['*/files/*', '*/latest/*'],
        parsing_func=parsing_func,
    )
    builder = builder.get_directories().get_filelist().parse()
    assert builder.entries
    assert isinstance(builder.entries[0], dict)
    assert isinstance(builder.df, pd.DataFrame)
    assert not builder.df.empty
|
||
|
||
def test_parse_invalid_assets():
    """Failed parses are separated into `invalid_assets` with asset + traceback columns."""
    builder = Builder(sample_data_dir / 'cesm')
    builder = (
        builder.get_directories()
        .get_filelist()
        .parse(parsing_func=parsing_func_errors)
        .clean_dataframe()
    )

    assert not builder.invalid_assets.empty
    assert set(builder.invalid_assets.columns) == {INVALID_ASSET, TRACEBACK}