Skip to content

Commit

Permalink
allow passing format_spec (#57)
Browse files Browse the repository at this point in the history
* allow passing format_spec

* add some docs

* tests
  • Loading branch information
mathause authored May 23, 2023
1 parent 1b14fb9 commit 32e1210
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 6 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

## v0.2.0 - unreleased

- Allow passing format spec to the captured names to allow more precise name matching
  ([#57](https://github.com/mathause/filefinder/pull/57)).
- Add tests for the cmip functionality and fix issue with `filefinder.cmip.ensure_unique_grid`
([#35](https://github.com/mathause/filefinder/pull/35)).
- Removed support for python 3.6.
Expand Down
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,46 @@ ff.find_files(category=["a1", "b2"], number=1)
>>> 2 /root/b2/b2_file_1 b2 1
```

## Format syntax

You can pass format specifiers to allow more complex formats, see
[format-specification](https://github.com/r1chardj0n3s/parse#format-specification) for details.
Using format specifiers, you can parse names that are not possible otherwise.

### Example

```python
from filefinder import FileFinder

paths = ["a1_abc", "ab200_abcdef",]

ff = FileFinder("", "{letters:l}{num:d}_{beg:2}{end}", test_paths=paths)

fc = ff.find_files()

fc
```

which results in the following:

```python
<FileContainer>
filename letters num beg end
0 a1_abc a 1 ab c
1 ab200_abcdef ab 200 ab cdef
```

Note that `fc.df.num` now has a data type of `int`, whereas without the `:d` it would be a
string (or, more precisely, an object, as pandas uses this dtype to represent strings).


## Filters

Filters can postprocess the found paths in `<FileContainer>`. Currently only a `priority_filter`
is implemented.

### Example

Assuming you have data for several models with different time resolution, e.g., 1 hourly
(`"1h"`), 6 hourly (`"6h"`), and daily (`"1d"`), but not all models have all time resolutions:

Expand Down
8 changes: 7 additions & 1 deletion filefinder/_filefinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import glob
import logging
import os
import re

import numpy as np
import pandas as pd
Expand All @@ -28,6 +29,11 @@ def __init__(self, pattern, suffix=""):
self.parser = parse.compile(self.pattern)
self._suffix = suffix

# replace the fmt spec - add the capture group again
self._pattern_no_fmt_spec = re.sub(
r"\{([A-Za-z0-9_]+)(:.*?)\}", r"{\1}", pattern
)

def create_name(self, keys=None, **keys_kwargs):
"""build name from keys
Expand All @@ -42,7 +48,7 @@ def create_name(self, keys=None, **keys_kwargs):

keys = update_dict_with_kwargs(keys, **keys_kwargs)

return self.pattern.format(**keys)
return self._pattern_no_fmt_spec.format(**keys)


class _Finder(_FinderBase):
Expand Down
126 changes: 126 additions & 0 deletions filefinder/tests/test_filefinder_fmt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import textwrap

import pandas as pd
import pytest

from filefinder import FileFinder


@pytest.fixture(scope="module", params=["from_filesystem", "from_string"])
def test_paths(request, tmp_path_factory):
    """Provide a value for the ``test_paths`` argument, once per parametrization.

    Returns
    -------
    None
        For the ``"from_filesystem"`` parametrization.
    list of str
        For the ``"from_string"`` parametrization: two file paths located
        below a fresh temporary directory (the files are not created).
    """

    if request.param == "from_filesystem":
        return None

    # ``tmp_path`` is function-scoped and cannot be requested from a
    # module-scoped fixture (pytest raises ScopeMismatch); the session-scoped
    # ``tmp_path_factory`` can be used from any scope.
    root = tmp_path_factory.mktemp("filefinder")

    return [str(root / path) for path in ("a1/foo/file", "a2/foo/file")]


def test_pattern_no_fmt_spec():
    """``_pattern_no_fmt_spec`` holds the pattern with all format specs removed."""

    finder = FileFinder(
        path_pattern="{path:l}_{pattern:2d}_{no_fmt}/",
        file_pattern="{file}_{pattern:2d}",
    )

    # the same patterns, with every ``:<spec>`` part dropped
    stripped_path = "{path}_{pattern}_{no_fmt}/"
    stripped_file = "{file}_{pattern}"

    assert finder.path._pattern_no_fmt_spec == stripped_path
    assert finder.file._pattern_no_fmt_spec == stripped_file

    # the full pattern is the path pattern followed by the file pattern
    assert finder.full._pattern_no_fmt_spec == stripped_path + stripped_file


def test_keys():
    """Keys are reported without their format spec, for path, file and both."""

    finder = FileFinder(path_pattern="{ab}_{c:d}", file_pattern="{a:l}_{b}_{c:d}")

    # union of the path and file keys
    assert finder.keys == {"a", "b", "c", "ab"}

    assert finder.keys_file == {"a", "b", "c"}

    assert finder.keys_path == {"ab", "c"}


def test_repr():
    """The repr shows the patterns including their format specs."""

    path_pattern = "/{a:l}/{b}"
    file_pattern = "{b}_{c:d}"
    ff = FileFinder(path_pattern=path_pattern, file_pattern=file_pattern)

    # NOTE(review): the expected path_pattern ends with "/" although the pattern
    # passed above does not - presumably FileFinder appends it; confirm.
    expected = """\
<FileFinder>
path_pattern: '/{a:l}/{b}/'
file_pattern: '{b}_{c:d}'
keys: 'a', 'b', 'c'
"""
    # remove common leading whitespace from the expected text
    expected = textwrap.dedent(expected)
    assert expected == ff.__repr__()


def test_create_name():
    """Names can be created from keyword arguments despite format specs."""

    finder = FileFinder(path_pattern="{a:w}/{b}", file_pattern="{b}_{c:l}")

    path_name = finder.create_path_name(a="a", b="b")
    assert path_name == "a/b/"

    file_name = finder.create_file_name(b="b", c="c")
    assert file_name == "b_c"

    full_name = finder.create_full_name(a="a", b="b", c="c")
    assert full_name == "a/b/b_c"


def test_create_name_dict():
    """Names can be created from a dict of keys despite format specs."""

    finder = FileFinder(path_pattern="{a:w}/{b}", file_pattern="{b}_{c:d}")

    path_name = finder.create_path_name({"a": "a", "b": "b"})
    assert path_name == "a/b/"

    # "c" is passed as a string although its pattern carries the ``:d`` spec
    file_name = finder.create_file_name({"b": "b", "c": "c"})
    assert file_name == "b_c"

    full_name = finder.create_full_name({"a": "a", "b": "b", "c": "c"})
    assert full_name == "a/b/b_c"


def test_find_paths_fmt():
    """Paths are parsed according to the format specs given in the patterns."""

    paths = ["a1/a1_abc", "ab200/ab200_aicdef"]

    finder = FileFinder(
        path_pattern="{letters:l}{num:d}",
        file_pattern="{letters:l}{num:d}_{beg:2}{end}",
        test_paths=paths,
    )

    # one column per key plus the filename; ``num`` is expected as integer
    columns = {
        "filename": {0: "a1/a1_abc", 1: "ab200/ab200_aicdef"},
        "letters": {0: "a", 1: "ab"},
        "num": {0: 1, 1: 200},
        "beg": {0: "ab", 1: "ai"},
        "end": {0: "c", 1: "cdef"},
    }
    expected = pd.DataFrame.from_dict(columns)

    # without restriction both paths are returned
    found = finder.find_files()
    pd.testing.assert_frame_equal(found.df, expected)

    # restricting ``num`` to 1 keeps only the first row
    found = finder.find_files(num=[1])
    pd.testing.assert_frame_equal(found.df, expected.iloc[[0]])
15 changes: 12 additions & 3 deletions filefinder/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,19 @@
)


@pytest.mark.parametrize(
    "string, expected",
    (
        ["/path/{var_name}/{year}", {"var_name", "year"}],
        ["{a:d}{b:d}", {"a", "b"}],
        ["{a}{b:d}", {"a", "b"}],
        ["{a:d}{b}", {"a", "b"}],
        ["{a:d}{b:d}{c:d}", {"a", "b", "c"}],
    ),
)
def test_find_keys(string, expected):
    """``_find_keys`` returns the key names, with or without a format spec."""

    result = _find_keys(string)

    assert result == expected

Expand Down
16 changes: 14 additions & 2 deletions filefinder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,27 @@
import re


def _find_keys(pattern):
def _find_keys(string):
"""find keys in a format string
find all keys enclosed by curly brackets
>>> _find_keys("/path/{var_name}/{year}") == {"var_name", "year"}
True
>>> _find_keys("/path/{var_name}/{year:d}") == {"var_name", "year"}
True
"""
keys = set(re.findall(r"\{([A-Za-z0-9_]+)\}", pattern))

# match group
pattern = (
r"\{"
r"([A-Za-z0-9_]+)" # capturing group with one or more characters, number or _
r"(?::.*?)?" # non-capturing group, non greedy matching any char, zero or once
r"\}"
)

keys = set(re.findall(pattern, string))

return keys

Expand Down

0 comments on commit 32e1210

Please sign in to comment.