diff --git a/cylc/flow/pathutil.py b/cylc/flow/pathutil.py index 719e3bc4356..91b06d720ca 100644 --- a/cylc/flow/pathutil.py +++ b/cylc/flow/pathutil.py @@ -15,12 +15,11 @@ # along with this program. If not, see . """Functions to return paths to common workflow files and directories.""" -import glob import os from pathlib import Path import re from shutil import rmtree -from typing import Container, Dict, Iterable, List, Set, Union +from typing import Dict, Iterable, Set, Union from cylc.flow import LOG from cylc.flow.cfgspec.glbl_cfg import glbl_cfg @@ -314,57 +313,6 @@ def parse_rm_dirs(rm_dirs: Iterable[str]) -> Set[str]: return result -def glob_in_run_dir( - run_dir: Union[Path, str], pattern: str, symlink_dirs: Container[Path] -) -> List[Path]: - """Execute a (recursive) glob search in the given run directory. - - Returns list of any absolute paths that match the pattern. However: - * Does not follow symlinks (apart from the spcedified symlink dirs). - * Also does not return matching subpaths of matching directories (because - that would be redundant). - - Args: - run_dir: Absolute path of the workflow run dir. - pattern: The glob pattern. - symlink_dirs: Absolute paths to the workflow's symlink dirs. - """ - # Note: use os.path.join, not pathlib, to preserve trailing slash if - # present in pattern - pattern = os.path.join(glob.escape(str(run_dir)), pattern) - # Note: don't use pathlib.Path.glob() because when you give it an exact - # filename instead of pattern, it doesn't return broken symlinks - matches = sorted(Path(i) for i in glob.iglob(pattern, recursive=True)) - # sort guarantees parents come before their children - if len(matches) == 1 and not os.path.lexists(matches[0]): - # https://bugs.python.org/issue35201 - return [] - results: List[Path] = [] - subpath_excludes: Set[Path] = set() - for path in matches: - for rel_ancestor in reversed(path.relative_to(run_dir).parents): - ancestor = run_dir / rel_ancestor - if ancestor in subpath_excludes: - break - if ancestor.is_symlink() and ancestor not in symlink_dirs: - # Do not follow non-standard symlinks - subpath_excludes.add(ancestor) - break - if (not symlink_dirs) and ancestor in results: - # We can be sure all subpaths of this ancestor are redundant - subpath_excludes.add(ancestor) - break - if ancestor == path.parent: - # Final iteration over ancestors - if ancestor in matches and path not in symlink_dirs: - # Redundant (but don't exclude subpaths in case any of the - # subpaths are std symlink dirs) - break - else: # no break - results.append(path) - return results - - def is_relative_to(path1: Union[Path, str], path2: Union[Path, str]) -> bool: """Return whether or not path1 is relative to path2.""" # In future, we can just use pathlib.Path.is_relative_to() diff --git a/cylc/flow/workflow_files.py b/cylc/flow/workflow_files.py index b46e5530c2b..c3ed0babff2 100644 --- a/cylc/flow/workflow_files.py +++ b/cylc/flow/workflow_files.py @@ -20,6 +20,7 @@ from collections import deque from enum import Enum from functools import partial +import glob import logging import os from pathlib import Path @@ -29,8 +30,8 @@ from subprocess import Popen, PIPE, DEVNULL import time from typing import ( - Any, Deque, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple, - TYPE_CHECKING, Union + Any, Container, Deque, Dict, Iterable, List, NamedTuple, Optional, Set, + Tuple, TYPE_CHECKING, Union ) import zmq.auth @@ -47,7 +48,6 @@ from cylc.flow.pathutil import ( expand_path, get_workflow_run_dir, - glob_in_run_dir, make_localhost_symlinks, parse_rm_dirs, remove_dir_and_target, @@ -723,6 +723,57 @@ def get_symlink_dirs(reg: str, run_dir: Union[Path, str]) -> Dict[str, Path]: return ret +def glob_in_run_dir( + run_dir: Union[Path, str], pattern: str, symlink_dirs: Container[Path] +) -> List[Path]: + """Execute a (recursive) glob search in the given run directory. + + Returns list of any absolute paths that match the pattern. However: + * Does not follow symlinks (apart from the spcedified symlink dirs). + * Also does not return matching subpaths of matching directories (because + that would be redundant). + + Args: + run_dir: Absolute path of the workflow run dir. + pattern: The glob pattern. + symlink_dirs: Absolute paths to the workflow's symlink dirs. + """ + # Note: use os.path.join, not pathlib, to preserve trailing slash if + # present in pattern + pattern = os.path.join(glob.escape(str(run_dir)), pattern) + # Note: don't use pathlib.Path.glob() because when you give it an exact + # filename instead of pattern, it doesn't return broken symlinks + matches = sorted(Path(i) for i in glob.iglob(pattern, recursive=True)) + # sort guarantees parents come before their children + if len(matches) == 1 and not os.path.lexists(matches[0]): + # https://bugs.python.org/issue35201 + return [] + results: List[Path] = [] + subpath_excludes: Set[Path] = set() + for path in matches: + for rel_ancestor in reversed(path.relative_to(run_dir).parents): + ancestor = run_dir / rel_ancestor + if ancestor in subpath_excludes: + break + if ancestor.is_symlink() and ancestor not in symlink_dirs: + # Do not follow non-standard symlinks + subpath_excludes.add(ancestor) + break + if (not symlink_dirs) and ancestor in results: + # We can be sure all subpaths of this ancestor are redundant + subpath_excludes.add(ancestor) + break + if ancestor == path.parent: + # Final iteration over ancestors + if ancestor in matches and path not in symlink_dirs: + # Redundant (but don't exclude subpaths in case any of the + # subpaths are std symlink dirs) + break + else: # no break + results.append(path) + return results + + def _clean_using_glob( run_dir: Path, pattern: str, symlink_dirs: Iterable[str] ) -> None: diff --git a/tests/unit/filetree.py b/tests/unit/filetree.py new file mode 100644 index 00000000000..bef09e4e556 --- /dev/null +++ b/tests/unit/filetree.py @@ -0,0 +1,167 @@ +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +"""Utilities for testing workflow directory structure. + +(There should be no tests in this module.) + +A filetree is represented by a dict like so: + { + # Dirs are represented by dicts (which are also sub-filetrees): + 'dir': { + 'another-dir': { + # Files are represented by None: + 'file.txt': None + } + }, + # Symlinks are represented by pathlib.Path, with the target represented + # by the relative path from the tmp_path directory: + 'symlink': Path('dir/another-dir') + } +""" + +from pathlib import Path +from typing import Any, Dict, List + + +def create_filetree( + filetree: Dict[str, Any], location: Path, root: Path +) -> None: + """Create the directory structure represented by the filetree dict. + + Args: + filetree: The filetree to create. + location: The absolute path in which to create the filetree. + root: The top-level dir from which relative symlink targets are + located (typically tmp_path). + """ + for name, entry in filetree.items(): + path = location / name + if isinstance(entry, dict): + path.mkdir(exist_ok=True) + create_filetree(entry, path, root) + elif isinstance(entry, Path): + path.symlink_to(root / entry) + else: + + path.touch() + + +def get_filetree_as_list( + filetree: Dict[str, Any], location: Path +) -> List[str]: + """Return a list of the paths in a filetree. + + Args: + filetree: The filetree to listify. + location: The absolute path to the filetree. + """ + ret: List[str] = [] + for name, entry in filetree.items(): + path = location / name + ret.append(str(path)) + if isinstance(entry, dict): + ret.extend(get_filetree_as_list(entry, path)) + return ret + + +FILETREE_1 = { + 'cylc-run': {'foo': {'bar': { + '.service': {'db': None}, + 'flow.cylc': None, + 'log': Path('sym/cylc-run/foo/bar/log'), + 'mirkwood': Path('you-shall-not-pass/mirkwood'), + 'rincewind.txt': Path('you-shall-not-pass/rincewind.txt') + }}}, + 'sym': {'cylc-run': {'foo': {'bar': { + 'log': { + 'darmok': Path('you-shall-not-pass/darmok'), + 'temba.txt': Path('you-shall-not-pass/temba.txt'), + 'bib': { + 'fortuna.txt': None + } + } + }}}}, + 'you-shall-not-pass': { # Nothing in here should get deleted + 'darmok': { + 'jalad.txt': None + }, + 'mirkwood': { + 'spiders.txt': None + }, + 'rincewind.txt': None, + 'temba.txt': None + } +} + +FILETREE_2 = { + 'cylc-run': {'foo': {'bar': Path('sym-run/cylc-run/foo/bar')}}, + 'sym-run': {'cylc-run': {'foo': {'bar': { + '.service': {'db': None}, + 'flow.cylc': None, + 'share': Path('sym-share/cylc-run/foo/bar/share') + }}}}, + 'sym-share': {'cylc-run': {'foo': {'bar': { + 'share': { + 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') + } + }}}}, + 'sym-cycle': {'cylc-run': {'foo': {'bar': { + 'share': { + 'cycle': { + 'macklunkey.txt': None + } + } + }}}}, + 'you-shall-not-pass': {} +} + +FILETREE_3 = { + 'cylc-run': {'foo': {'bar': Path('sym-run/cylc-run/foo/bar')}}, + 'sym-run': {'cylc-run': {'foo': {'bar': { + '.service': {'db': None}, + 'flow.cylc': None, + 'share': { + 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') + } + }}}}, + 'sym-cycle': {'cylc-run': {'foo': {'bar': { + 'share': { + 'cycle': { + 'sokath.txt': None + } + } + }}}}, + 'you-shall-not-pass': {} +} + +FILETREE_4 = { + 'cylc-run': {'foo': {'bar': { + '.service': {'db': None}, + 'flow.cylc': None, + 'share': { + 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') + } + }}}, + 'sym-cycle': {'cylc-run': {'foo': {'bar': { + 'share': { + 'cycle': { + 'kiazi.txt': None + } + } + }}}}, + 'you-shall-not-pass': {} +} diff --git a/tests/unit/test_pathutil.py b/tests/unit/test_pathutil.py index dea560d4c7f..6b3cf174ec3 100644 --- a/tests/unit/test_pathutil.py +++ b/tests/unit/test_pathutil.py @@ -432,7 +432,7 @@ def test_parse_rm_dirs(dirs: List[str], expected: Set[str]): "cannot take paths that point to the run directory or above"), ] ) -def test_parse_rm_dirs_bad(dirs: List[str], err_msg: str): +def test_parse_rm_dirs__bad(dirs: List[str], err_msg: str): """Test parse_dirs() with bad inputs""" with pytest.raises(UserInputError) as exc: parse_rm_dirs(dirs) diff --git a/tests/unit/test_workflow_files.py b/tests/unit/test_workflow_files.py index 0cb045e0753..300dafea35a 100644 --- a/tests/unit/test_workflow_files.py +++ b/tests/unit/test_workflow_files.py @@ -20,7 +20,7 @@ from pathlib import Path import pytest import shutil -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union from unittest import mock from cylc.flow import CYLC_LOG @@ -39,12 +39,22 @@ _remote_clean_cmd, check_flow_file, check_nested_run_dirs, + get_symlink_dirs, get_workflow_source_dir, + glob_in_run_dir, reinstall_workflow, search_install_source_dirs ) from tests.unit.conftest import MonkeyMock +from tests.unit.filetree import ( + FILETREE_1, + FILETREE_2, + FILETREE_3, + FILETREE_4, + create_filetree, + get_filetree_as_list +) @pytest.mark.parametrize( @@ -518,24 +528,144 @@ def test_clean__rm_dir_not_file(pattern: str, tmp_run_dir: Callable): assert a_file.exists() +@pytest.mark.parametrize( + 'filetree, expected', + [ + pytest.param( + FILETREE_1, + {'log': 'sym/cylc-run/foo/bar/log'}, + id="filetree1" + ), + pytest.param( + FILETREE_2, + { + 'share/cycle': 'sym-cycle/cylc-run/foo/bar/share/cycle', + 'share': 'sym-share/cylc-run/foo/bar/share', + '': 'sym-run/cylc-run/foo/bar/' + }, + id="filetree2" + ), + pytest.param( + FILETREE_3, + { + 'share/cycle': 'sym-cycle/cylc-run/foo/bar/share/cycle', + '': 'sym-run/cylc-run/foo/bar/' + }, + id="filetree3" + ), + pytest.param( + FILETREE_4, + {'share/cycle': 'sym-cycle/cylc-run/foo/bar/share/cycle'}, + id="filetree4" + ), + ] +) +def test_get_symlink_dirs( + filetree: Dict[str, Any], + expected: Dict[str, Union[Path, str]], + tmp_run_dir: Callable, tmp_path: Path +): + """Test get_symlink_dirs(). + + Params: + filetree: The directory structure to test against. + expected: The expected return dictionary, except with the values being + relative to tmp_path instead of absolute paths. + """ + # Setup + cylc_run_dir = tmp_run_dir() + create_filetree(filetree, tmp_path, tmp_path) + reg = 'foo/bar' + for k, v in expected.items(): + expected[k] = Path(tmp_path / v) + # Test + assert get_symlink_dirs(reg, cylc_run_dir / reg) == expected + + +@pytest.mark.parametrize( + 'pattern, filetree, expected_matches', + [ + pytest.param( + '**', + FILETREE_1, + ['cylc-run/foo/bar', + 'cylc-run/foo/bar/log'], + id="filetree1 **" + ), + pytest.param( + '*/*', + FILETREE_1, + ['cylc-run/foo/bar/flow.cylc', + 'cylc-run/foo/bar/log', + 'cylc-run/foo/bar/mirkwood', + 'cylc-run/foo/bar/rincewind.txt'], + id="filetree1 */*" + ), + pytest.param( + '**/*.txt', + FILETREE_1, + ['cylc-run/foo/bar/log/bib/fortuna.txt', + 'cylc-run/foo/bar/log/temba.txt', + 'cylc-run/foo/bar/rincewind.txt'], + id="filetree1 **/*.txt" + ), + pytest.param( + '**', + FILETREE_2, + ['cylc-run/foo/bar', + 'cylc-run/foo/bar/share', + 'cylc-run/foo/bar/share/cycle'], + id="filetree2 **" + ), + pytest.param( + '**', + FILETREE_3, + ['cylc-run/foo/bar', + 'cylc-run/foo/bar/share/cycle'], + id="filetree3 **" + ), + pytest.param( + '**/s*', + FILETREE_3, + ['cylc-run/foo/bar/share', + 'cylc-run/foo/bar/share/cycle/sokath.txt'], + id="filetree3 **/s*" + ), + pytest.param( + '**', + FILETREE_4, + ['cylc-run/foo/bar', + 'cylc-run/foo/bar/share/cycle'], + id="filetree4 **" + ), + ] +) +def test_glob_in_run_dir( + pattern: str, + filetree: Dict[str, Any], + expected_matches: List[str], + tmp_path: Path, tmp_run_dir: Callable +) -> None: + """Test that glob_in_run_dir() returns the minimal set of results with + no redundant paths. + """ + # Setup + cylc_run_dir: Path = tmp_run_dir() + reg = 'foo/bar' + run_dir = cylc_run_dir / reg + create_filetree(filetree, tmp_path, tmp_path) + symlink_dirs = [run_dir / i for i in get_symlink_dirs(reg, run_dir)] + expected = [tmp_path / i for i in expected_matches] + # Test + assert glob_in_run_dir(run_dir, pattern, symlink_dirs) == expected + + @pytest.fixture def filetree_for_testing_cylc_clean(tmp_path: Path): """Fixture that creates a filetree from the given dict, and returns which files are expected to be deleted and which aren't. - A filetree is represented by a dict like so: - { - # Dirs are represented by dicts (which are also sub-filetrees): - 'dir': { - 'another-dir': { - # Files are represented by None: - 'file.txt': None - } - }, - # Symlinks are represented by pathlib.Path, with the target represented - # by the relative path from the tmp_path directory: - 'symlink': Path('dir/another-dir') - } + See tests/unit/filetree.py Args: reg: Workflow name. @@ -554,7 +684,7 @@ def _filetree_for_testing_cylc_clean( initial_filetree: Dict[str, Any], filetree_left_behind: Dict[str, Any] ) -> Tuple[Path, List[str], List[str]]: - create_filetree(initial_filetree, tmp_path) + create_filetree(initial_filetree, tmp_path, tmp_path) files_not_to_delete = [ os.path.normpath(i) for i in iglob(str(tmp_path / 'you-shall-not-pass/**'), recursive=True) @@ -569,64 +699,9 @@ def _filetree_for_testing_cylc_clean( ) run_dir = tmp_path / 'cylc-run' / reg return run_dir, files_to_delete, files_not_to_delete - - def create_filetree(filetree: Dict[str, Any], location: Path) -> None: - """Helper function""" - for name, entry in filetree.items(): - path = location / name - if isinstance(entry, dict): - path.mkdir(exist_ok=True) - create_filetree(entry, path) - elif isinstance(entry, Path): - path.symlink_to(tmp_path / entry) - else: - path.touch() - - def get_filetree_as_list( - filetree: Dict[str, Any], location: Path - ) -> List[str]: - """Helper function""" - ret: List[str] = [] - for name, entry in filetree.items(): - path = location / name - ret.append(str(path)) - if isinstance(entry, dict): - ret.extend(get_filetree_as_list(entry, path)) - return ret - return _filetree_for_testing_cylc_clean -FILETREE_1 = { - 'cylc-run': {'foo': {'bar': { - '.service': {'db': None}, - 'flow.cylc': None, - 'log': Path('sym/cylc-run/foo/bar/log'), - 'mirkwood': Path('you-shall-not-pass/mirkwood'), - 'rincewind.txt': Path('you-shall-not-pass/rincewind.txt') - }}}, - 'sym': {'cylc-run': {'foo': {'bar': { - 'log': { - 'darmok': Path('you-shall-not-pass/darmok'), - 'temba.txt': Path('you-shall-not-pass/temba.txt'), - 'bib': { - 'fortuna.txt': None - } - } - }}}}, - 'you-shall-not-pass': { # Nothing in here should get deleted - 'darmok': { - 'jalad.txt': None - }, - 'mirkwood': { - 'spiders.txt': None - }, - 'rincewind.txt': None, - 'temba.txt': None - } -} - - @pytest.mark.parametrize( 'pattern, initial_filetree, filetree_left_behind', [ @@ -702,66 +777,6 @@ def test__clean_using_glob( assert os.path.lexists(file) is False -FILETREE_2 = { - 'cylc-run': {'foo': {'bar': Path('sym-run/cylc-run/foo/bar')}}, - 'sym-run': {'cylc-run': {'foo': {'bar': { - '.service': {'db': None}, - 'flow.cylc': None, - 'share': Path('sym-share/cylc-run/foo/bar/share') - }}}}, - 'sym-share': {'cylc-run': {'foo': {'bar': { - 'share': { - 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') - } - }}}}, - 'sym-cycle': {'cylc-run': {'foo': {'bar': { - 'share': { - 'cycle': { - 'macklunkey.txt': None - } - } - }}}}, - 'you-shall-not-pass': {} -} - -FILETREE_3 = { - 'cylc-run': {'foo': {'bar': Path('sym-run/cylc-run/foo/bar')}}, - 'sym-run': {'cylc-run': {'foo': {'bar': { - '.service': {'db': None}, - 'flow.cylc': None, - 'share': { - 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') - } - }}}}, - 'sym-cycle': {'cylc-run': {'foo': {'bar': { - 'share': { - 'cycle': { - 'sokath.txt': None - } - } - }}}}, - 'you-shall-not-pass': {} -} - -FILETREE_4 = { - 'cylc-run': {'foo': {'bar': { - '.service': {'db': None}, - 'flow.cylc': None, - 'share': { - 'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle') - } - }}}, - 'sym-cycle': {'cylc-run': {'foo': {'bar': { - 'share': { - 'cycle': { - 'kiazi.txt': None - } - } - }}}}, - 'you-shall-not-pass': {} -} - - @pytest.mark.parametrize( 'rm_dirs, initial_filetree, filetree_left_behind', [