Skip to content
This repository was archived by the owner on Feb 12, 2025. It is now read-only.

Commit 1db69cf

Browse files
committed
snapshotter: use wcmatch.glob.globmatch function
All existing Python glob-matching functions seem to have issues: Path.match does not accept '**', and with fnmatch, '*' also matches the '/' character, which is not what we want. To fix this mess, I've introduced a new third-party library, wcmatch, to handle the glob matching.
1 parent 4ea6911 commit 1db69cf

File tree

7 files changed

+142
-26
lines changed

7 files changed

+142
-26
lines changed

astacus/common/snapshot.py

+4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"""
55
from astacus.common.magic import DEFAULT_EMBEDDED_FILE_SIZE
66
from typing import Sequence
7+
from typing_extensions import Self
78

89
import dataclasses
910

@@ -15,3 +16,6 @@ class SnapshotGroup:
1516
excluded_names: Sequence[str] = ()
1617
# None means "no limit": all files matching the glob will be embedded
1718
embedded_file_size_max: int | None = DEFAULT_EMBEDDED_FILE_SIZE
19+
20+
def without_excluded_names(self) -> Self:
21+
return dataclasses.replace(self, excluded_names=())

astacus/node/memory_snapshot.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from astacus.common.snapshot import SnapshotGroup
1212
from astacus.node.snapshot import Snapshot
1313
from astacus.node.snapshotter import hash_hexdigest_readable, Snapshotter
14-
from glob import iglob
1514
from pathlib import Path
1615
from typing import Iterable, Iterator, Mapping, Sequence
1716

@@ -77,13 +76,11 @@ def get_all_digests(self) -> Iterable[SnapshotHash]:
7776
class MemorySnapshotter(Snapshotter[MemorySnapshot]):
7877
def _list_files(self, basepath: Path) -> list[FoundFile]:
7978
result_files = set()
80-
for group in self._groups:
81-
for p in iglob(group.root_glob, root_dir=basepath, recursive=True):
79+
for group in self._groups.groups:
80+
for p in group.glob(root_dir=basepath):
8281
path = basepath / p
8382
if not path.is_file() or path.is_symlink():
8483
continue
85-
if path.name in group.excluded_names:
86-
continue
8784
relpath = path.relative_to(basepath)
8885
for parent in relpath.parents:
8986
if parent.name == magic.ASTACUS_TMPDIR:
@@ -92,9 +89,7 @@ def _list_files(self, basepath: Path) -> list[FoundFile]:
9289
result_files.add(
9390
FoundFile(
9491
relative_path=relpath,
95-
group=SnapshotGroup(
96-
root_glob=group.root_glob, embedded_file_size_max=group.embedded_file_size_max
97-
),
92+
group=group.group.without_excluded_names(),
9893
)
9994
)
10095
return sorted(result_files, key=lambda found_file: found_file.relative_path)

astacus/node/snapshot_groups.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""
2+
3+
Copyright (c) 2023 Aiven Ltd
4+
See LICENSE for details
5+
6+
Classes for working with snapshot groups.
7+
8+
"""
9+
from astacus.common.snapshot import SnapshotGroup
10+
from pathlib import Path
11+
from typing import Iterable, Optional, Sequence
12+
from typing_extensions import Self
13+
from wcmatch.glob import GLOBSTAR, iglob, translate
14+
15+
import dataclasses
16+
import os
17+
import re
18+
19+
# Flags passed to every wcmatch call in this module (iglob and translate), so
# that listing files and matching paths use the same glob semantics.
# NOTE(review): GLOBSTAR presumably enables "**" matching across directory
# levels -- confirm against the wcmatch documentation.
WCMATCH_FLAGS = GLOBSTAR
20+
21+
22+
@dataclasses.dataclass
23+
class CompiledGroup:
24+
group: SnapshotGroup
25+
regex: re.Pattern
26+
27+
@classmethod
28+
def compile(cls, group: SnapshotGroup) -> Self:
29+
return cls(group, glob_compile(group.root_glob))
30+
31+
def matches(self, relative_path: Path) -> bool:
32+
return bool(self.regex.match(str(relative_path))) and relative_path.name not in self.group.excluded_names
33+
34+
def glob(self, root_dir: Optional[Path] = None) -> Iterable[str]:
35+
for path in iglob(self.group.root_glob, root_dir=root_dir, flags=WCMATCH_FLAGS):
36+
if os.path.basename(path) not in self.group.excluded_names:
37+
yield path
38+
39+
40+
@dataclasses.dataclass
class CompiledGroups:
    """A collection of compiled snapshot groups, queried as a whole."""

    # The compiled groups, in the order they were supplied.
    groups: Sequence[CompiledGroup]

    @classmethod
    def compile(cls, groups: Sequence[SnapshotGroup]) -> Self:
        """Compile every snapshot group and wrap the results."""
        return cls(list(map(CompiledGroup.compile, groups)))

    def get_matching(self, relative_path: Path) -> list[SnapshotGroup]:
        """Return the snapshot groups that ``relative_path`` matches, in order."""
        matching: list[SnapshotGroup] = []
        for compiled in self.groups:
            if compiled.matches(relative_path):
                matching.append(compiled.group)
        return matching

    def any_match(self, relative_path: Path) -> bool:
        """Return True as soon as one group matches ``relative_path``."""
        for compiled in self.groups:
            if compiled.matches(relative_path):
                return True
        return False

    def root_globs(self) -> list[str]:
        """Return the root glob of every group, in order."""
        return [compiled.group.root_glob for compiled in self.groups]
56+
57+
58+
def glob_compile(glob: str) -> re.Pattern:
    """Compile ``glob`` into a regular expression using WCMATCH_FLAGS.

    Only the first pattern of the first element returned by wcmatch's
    ``translate`` is used.
    NOTE(review): that element is presumably the list of inclusion patterns --
    confirm against the wcmatch documentation.
    """
    translated = translate(glob, flags=WCMATCH_FLAGS)
    first_group = translated[0]
    return re.compile(first_group[0])

astacus/node/snapshotter.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from astacus.common.progress import Progress
1111
from astacus.common.snapshot import SnapshotGroup
1212
from astacus.node.snapshot import Snapshot
13+
from astacus.node.snapshot_groups import CompiledGroups
1314
from multiprocessing import dummy
1415
from pathlib import Path
1516
from threading import Lock
@@ -28,7 +29,7 @@ def __init__(self, groups: Sequence[SnapshotGroup], src: Path, dst: Path, snapsh
2829
self.snapshot = snapshot
2930
self._src = src
3031
self._dst = dst
31-
self._groups = groups
32+
self._groups = CompiledGroups.compile(groups)
3233
self._parallel = parallel
3334
self._dst.mkdir(parents=True, exist_ok=True)
3435

@@ -45,9 +46,7 @@ def release(self, hexdigests: Iterable[str], *, progress: Progress) -> None:
4546
...
4647

4748
def get_snapshot_state(self) -> SnapshotState:
48-
return SnapshotState(
49-
root_globs=[group.root_glob for group in self._groups], files=list(self.snapshot.get_all_files())
50-
)
49+
return SnapshotState(root_globs=self._groups.root_globs(), files=list(self.snapshot.get_all_files()))
5150

5251
def _file_in_src(self, relative_path: Path) -> SnapshotFile:
5352
src_path = self._src / relative_path
@@ -71,10 +70,7 @@ def _cb(snapshotfile: SnapshotFile) -> SnapshotFile:
7170
yield from p.imap_unordered(_cb, files)
7271

7372
def _embedded_file_size_max_for_file(self, file: SnapshotFile) -> int | None:
74-
groups = []
75-
for group in self._groups:
76-
if file.relative_path.match(group.root_glob):
77-
groups.append(group)
73+
groups = self._groups.get_matching(file.relative_path)
7874
assert groups
7975
head, *tail = groups
8076
for group in tail:

astacus/node/sqlite_snapshot.py

+2-10
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from astacus.node.snapshot import Snapshot
1212
from astacus.node.snapshotter import Snapshotter
1313
from contextlib import closing
14-
from fnmatch import fnmatch
1514
from pathlib import Path
1615
from typing import Iterable, Sequence
1716
from typing_extensions import override
@@ -124,15 +123,8 @@ def _list_files_and_create_directories(self) -> Iterable[Path]:
124123
(self._dst / rel_dir).mkdir(parents=True, exist_ok=True)
125124
for f in files:
126125
rel_path = rel_dir / f
127-
full_path = dir_path / f
128-
if full_path.is_symlink():
129-
continue
130-
for group in self._groups:
131-
# fnmatch works strangely with paths until 3.13 so convert to string
132-
# https://github.com/python/cpython/issues/73435
133-
if fnmatch(str(rel_path), group.root_glob) and f not in group.excluded_names:
134-
yield rel_path
135-
break
126+
if not (dir_path / f).is_symlink() and self._groups.any_match(rel_path):
127+
yield rel_path
136128

137129
def _compare_current_snapshot(self, files: Iterable[Path]) -> Iterable[tuple[Path, SnapshotFile | None]]:
138130
with closing(self._con.cursor()) as cur:

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ install_requires =
1818
tabulate==0.9.0
1919
typing-extensions==4.7.1
2020
uvicorn==0.15.0
21+
wcmatch==8.4.1
2122
# Pinned transitive deps
2223
pydantic==1.10.2
2324

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""
2+
3+
Copyright (c) 2023 Aiven Ltd
4+
See LICENSE for details
5+
6+
"""
7+
from astacus.common.snapshot import SnapshotGroup
8+
from astacus.node.snapshot_groups import CompiledGroup, CompiledGroups, glob_compile
9+
from pathlib import Path
10+
11+
import os
12+
13+
POSITIVE_TEST_CASES: list[tuple[Path, str]] = [
14+
(Path("foo"), "foo"),
15+
(Path("foo"), "*"),
16+
(Path("foo/bar"), "*/bar"),
17+
(Path("foo"), "**"),
18+
(Path("foo/bar"), "**"),
19+
(Path("foo/bar/baz"), "**/*"),
20+
(Path("foo/bar"), "**/*"),
21+
(Path("foo/bar"), "**/**"),
22+
]
23+
24+
NEGATIVE_TEST_CASES: list[tuple[Path, str]] = [
25+
(Path("foo/bar/baz"), "*/*"),
26+
(Path("foo"), "foobar"),
27+
(Path("foo"), "*/foo"),
28+
]
29+
30+
31+
def test_compile() -> None:
    """Compiled globs match every positive case and reject every negative one."""
    for candidate, pattern in POSITIVE_TEST_CASES:
        compiled = glob_compile(pattern)
        assert compiled.match(str(candidate)) is not None
    for candidate, pattern in NEGATIVE_TEST_CASES:
        compiled = glob_compile(pattern)
        assert compiled.match(str(candidate)) is None
36+
37+
38+
def test_CompiledGroup_matches() -> None:
    """CompiledGroup.matches honours both the glob and the excluded names."""
    for candidate, pattern in POSITIVE_TEST_CASES:
        plain = SnapshotGroup(root_glob=pattern)
        assert CompiledGroup.compile(plain).matches(candidate)
        # Same glob, but the path's own basename is excluded: no match expected.
        excluding = SnapshotGroup(root_glob=pattern, excluded_names=[os.path.basename(candidate)])
        assert not CompiledGroup.compile(excluding).matches(candidate)
    for candidate, pattern in NEGATIVE_TEST_CASES:
        plain = SnapshotGroup(root_glob=pattern)
        assert not CompiledGroup.compile(plain).matches(candidate)
47+
48+
49+
def test_CompiledGroups() -> None:
    """Only the group that matches without excluding the path is reported."""
    for candidate, pattern in POSITIVE_TEST_CASES:
        matching = SnapshotGroup(root_glob=pattern)
        excluding = SnapshotGroup(root_glob=pattern, excluded_names=[os.path.basename(candidate)])
        unrelated = SnapshotGroup(root_glob="doesntmatch")
        compiled = CompiledGroups.compile([matching, excluding, unrelated])
        assert compiled.any_match(candidate)
        assert compiled.get_matching(candidate) == [matching]
57+
58+
59+
def test_CompiledGroup_glob(tmp_path: Path) -> None:
    """CompiledGroup.glob yields the positive-case paths and not the negative ones."""
    # Every case path is created under tmp_path as a directory, then touched.
    for relative, _ in POSITIVE_TEST_CASES + NEGATIVE_TEST_CASES:
        target = tmp_path / relative
        target.mkdir(parents=True, exist_ok=True)
        target.touch()
    for relative, pattern in POSITIVE_TEST_CASES:
        found = CompiledGroup.compile(SnapshotGroup(root_glob=pattern)).glob(tmp_path)
        assert str(relative) in found
    for relative, pattern in NEGATIVE_TEST_CASES:
        found = CompiledGroup.compile(SnapshotGroup(root_glob=pattern)).glob(tmp_path)
        assert str(relative) not in found

0 commit comments

Comments
 (0)