allow passing format_spec (#57)

* allow passing format_spec * add some docs * tests
mpytools · May 23, 2023 · 32e1210 · 32e1210
1 parent 1b14fb9
commit 32e1210
Show file tree

Hide file tree

Showing 6 changed files with 196 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,8 @@
 
 ## v0.2.0 - unreleased
 
+- Allow passing a format spec to the captured names for more precise name matching
+  ([#57](https://github.com/mathause/filefinder/pull/57)).
 - Add tests for the cmip functionality and fix issue with `filefinder.cmip.ensure_unique_grid`
   ([#35](https://github.com/mathause/filefinder/pull/35)).
 - Removed support for python 3.6.

diff --git a/README.md b/README.md
@@ -73,11 +73,46 @@ ff.find_files(category=["a1", "b2"], number=1)
 >>> 2  /root/b2/b2_file_1       b2      1
 ```
 
+## Format syntax
+
+You can pass format specifiers to allow more complex formats, see
+[format-specification](https://github.com/r1chardj0n3s/parse#format-specification) for details.
+Using format specifiers, you can parse names that are not possible otherwise.
+
+### Example
+
+```python
+from filefinder import FileFinder
+
+paths = ["a1_abc", "ab200_abcdef",]
+
+ff = FileFinder("", "{letters:l}{num:d}_{beg:2}{end}", test_paths=paths)
+
+fc = ff.find_files()
+
+fc
+```
+
+which results in the following:
+
+```python
+<FileContainer>
+       filename letters  num beg   end
+0        a1_abc       a    1  ab     c
+1  ab200_abcdef      ab  200  ab  cdef
+```
+
+Note that `fc.df.num` now has a data type of `int`, while without the `:d` it would be a
+string (or more precisely an object as pandas uses this dtype to represent strings).
+
+
 ## Filters
 
 Filters can postprocess the found paths in `<FileContainer>`. Currently only a `priority_filter`
 is implemented.
 
+### Example
+
 Assuming you have data for several models with different time resolution, e.g., 1 hourly
 (`"1h"`), 6 hourly (`"6h"`), and daily (`"1d"`), but not all models have all time resolutions:
 

diff --git a/filefinder/_filefinder.py b/filefinder/_filefinder.py
@@ -3,6 +3,7 @@
 import glob
 import logging
 import os
+import re
 
 import numpy as np
 import pandas as pd
@@ -28,6 +29,11 @@ def __init__(self, pattern, suffix=""):
         self.parser = parse.compile(self.pattern)
         self._suffix = suffix
 
+        # replace the fmt spec - add the capture group again
+        self._pattern_no_fmt_spec = re.sub(
+            r"\{([A-Za-z0-9_]+)(:.*?)\}", r"{\1}", pattern
+        )
+
     def create_name(self, keys=None, **keys_kwargs):
         """build name from keys
 
@@ -42,7 +48,7 @@ def create_name(self, keys=None, **keys_kwargs):
 
         keys = update_dict_with_kwargs(keys, **keys_kwargs)
 
-        return self.pattern.format(**keys)
+        return self._pattern_no_fmt_spec.format(**keys)
 
 
 class _Finder(_FinderBase):

diff --git a/filefinder/tests/test_filefinder_fmt.py b/filefinder/tests/test_filefinder_fmt.py
@@ -0,0 +1,126 @@
+import textwrap
+
+import pandas as pd
+import pytest
+
+from filefinder import FileFinder
+
+
+@pytest.fixture(scope="module", params=["from_filesystem", "from_string"])
+def test_paths(request, tmp_path):
+
+    if request.param == "from_filesystem":
+        return None
+
+    paths = ["a1/foo/file", "a2/foo/file"]
+    paths = [str(tmp_path / path) for path in paths]
+
+    return paths
+
+
+def test_pattern_no_fmt_spec():
+
+    path_pattern = "{path:l}_{pattern:2d}_{no_fmt}/"
+    file_pattern = "{file}_{pattern:2d}"
+
+    path_pattern_no_fmt = "{path}_{pattern}_{no_fmt}/"
+    file_pattern_no_fmt = "{file}_{pattern}"
+
+    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)
+
+    assert ff.path._pattern_no_fmt_spec == path_pattern_no_fmt
+    assert ff.file._pattern_no_fmt_spec == file_pattern_no_fmt
+
+    assert ff.full._pattern_no_fmt_spec == path_pattern_no_fmt + file_pattern_no_fmt
+
+
+def test_keys():
+
+    file_pattern = "{a:l}_{b}_{c:d}"
+    path_pattern = "{ab}_{c:d}"
+    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)
+
+    expected = {"a", "b", "c", "ab"}
+    assert ff.keys == expected
+
+    expected = {"a", "b", "c"}
+    assert ff.keys_file == expected
+
+    expected = {"ab", "c"}
+    assert ff.keys_path == expected
+
+
+def test_repr():
+
+    path_pattern = "/{a:l}/{b}"
+    file_pattern = "{b}_{c:d}"
+    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)
+
+    expected = """\
+    <FileFinder>
+    path_pattern: '/{a:l}/{b}/'
+    file_pattern: '{b}_{c:d}'
+
+    keys: 'a', 'b', 'c'
+    """
+    expected = textwrap.dedent(expected)
+    assert expected == ff.__repr__()
+
+
+def test_create_name():
+
+    path_pattern = "{a:w}/{b}"
+    file_pattern = "{b}_{c:l}"
+    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)
+
+    result = ff.create_path_name(a="a", b="b")
+    assert result == "a/b/"
+
+    result = ff.create_file_name(b="b", c="c")
+    assert result == "b_c"
+
+    result = ff.create_full_name(a="a", b="b", c="c")
+    assert result == "a/b/b_c"
+
+
+def test_create_name_dict():
+
+    path_pattern = "{a:w}/{b}"
+    file_pattern = "{b}_{c:d}"
+    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)
+
+    result = ff.create_path_name(dict(a="a", b="b"))
+    assert result == "a/b/"
+
+    result = ff.create_file_name(dict(b="b", c="c"))
+    assert result == "b_c"
+
+    result = ff.create_full_name(dict(a="a", b="b", c="c"))
+    assert result == "a/b/b_c"
+
+
+def test_find_paths_fmt():
+
+    test_paths = ["a1/a1_abc", "ab200/ab200_aicdef"]
+
+    path_pattern = "{letters:l}{num:d}"
+    file_pattern = "{letters:l}{num:d}_{beg:2}{end}"
+
+    ff = FileFinder(
+        path_pattern=path_pattern, file_pattern=file_pattern, test_paths=test_paths
+    )
+
+    expected = {
+        "filename": {0: "a1/a1_abc", 1: "ab200/ab200_aicdef"},
+        "letters": {0: "a", 1: "ab"},
+        "num": {0: 1, 1: 200},
+        "beg": {0: "ab", 1: "ai"},
+        "end": {0: "c", 1: "cdef"},
+    }
+    expected = pd.DataFrame.from_dict(expected)
+
+    result = ff.find_files()
+    pd.testing.assert_frame_equal(result.df, expected)
+
+    result = ff.find_files(num=[1])
+    pd.testing.assert_frame_equal(result.df, expected.iloc[[0]])
diff --git a/filefinder/tests/test_utils.py b/filefinder/tests/test_utils.py
@@ -9,10 +9,19 @@
 )
 
 
-def test_find_keys():
+@pytest.mark.parametrize(
+    "string, expected",
+    (
+        ["/path/{var_name}/{year}", {"var_name", "year"}],
+        ["{a:d}{b:d}", {"a", "b"}],
+        ["{a}{b:d}", {"a", "b"}],
+        ["{a:d}{b}", {"a", "b"}],
+        ["{a:d}{b:d}{c:d}", {"a", "b", "c"}],
+    ),
+)
+def test_find_keys(string, expected):
 
-    result = _find_keys("/path/{var_name}/{year}")
-    expected = {"var_name", "year"}
+    result = _find_keys(string)
 
     assert result == expected
 

diff --git a/filefinder/utils.py b/filefinder/utils.py
@@ -2,15 +2,27 @@
 import re
 
 
-def _find_keys(pattern):
+def _find_keys(string):
     """find keys in a format string
 
     find all keys enclosed by curly brackets
 
     >>> _find_keys("/path/{var_name}/{year}") == {"var_name", "year"}
     True
+
+    >>> _find_keys("/path/{var_name}/{year:d}") == {"var_name", "year"}
+    True
     """
-    keys = set(re.findall(r"\{([A-Za-z0-9_]+)\}", pattern))
+
+    # match group
+    pattern = (
+        r"\{"
+        r"([A-Za-z0-9_]+)"  # capturing group with one or more characters, number or _
+        r"(?::.*?)?"  # non-capturing group, non greedy matching any char, zero or once
+        r"\}"
+    )
+
+    keys = set(re.findall(pattern, string))
 
     return keys