Skip to content

Commit

Permalink
Targeted clean: Overhaul processing of glob results
Browse files Browse the repository at this point in the history
This filters out all redundant matches, i.e. matches that are subpaths
of other matches (except for the std symlink dirs - we need to return
these as we need to follow their targets).

As was implemented in a previous commit, it still also filters out
subpaths of symlinks as we don't want to follow symlinks (again except
for the std symlink dirs).
  • Loading branch information
MetRonnie committed Jun 11, 2021
1 parent 82ec437 commit 2300f4d
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 78 deletions.
57 changes: 47 additions & 10 deletions cylc/flow/pathutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from pathlib import Path
import re
from shutil import rmtree
from typing import Dict, Iterable, List, Set, Union
from typing import Container, Dict, Iterable, List, Set, Union

from cylc.flow import LOG
from cylc.flow.cfgspec.glbl_cfg import glbl_cfg
Expand Down Expand Up @@ -314,18 +314,55 @@ def parse_rm_dirs(rm_dirs: Iterable[str]) -> Set[str]:
return result


def glob_in_run_dir(run_dir: Union[Path, str], pattern: str) -> List[Path]:
def glob_in_run_dir(
run_dir: Union[Path, str], pattern: str, symlink_dirs: Container[Path]
) -> List[Path]:
"""Execute a (recursive) glob search in the given run directory.
NOTE: this follows symlinks, so be careful what you do with the results.
Returns list of any absolute paths that match the pattern. However:
* Does not follow symlinks (apart from the spcedified symlink dirs).
* Also does not return matching subpaths of matching directories (because
that would be redundant).
Args:
run_dir: Absolute path of the workflow run dir.
pattern: The glob pattern.
symlink_dirs: Absolute paths to the workflow's symlink dirs.
"""
return sorted(
Path(i) for i in glob.iglob(
# Note: use os.path.join, not pathlib, to preserve trailing slash
os.path.join(glob.escape(str(run_dir)), pattern),
recursive=True
)
)
# Note: use os.path.join, not pathlib, to preserve trailing slash if
# present in pattern
pattern = os.path.join(glob.escape(str(run_dir)), pattern)
# Note: don't use pathlib.Path.glob() because when you give it an exact
# filename instead of pattern, it doesn't return broken symlinks
matches = sorted(Path(i) for i in glob.iglob(pattern, recursive=True))
# sort guarantees parents come before their children
if len(matches) == 1 and not os.path.lexists(matches[0]):
# https://bugs.python.org/issue35201
return []
results: List[Path] = []
subpath_excludes: Set[Path] = set()
for path in matches:
for rel_ancestor in reversed(path.relative_to(run_dir).parents):
ancestor = run_dir / rel_ancestor
if ancestor in subpath_excludes:
break
if ancestor.is_symlink() and ancestor not in symlink_dirs:
# Do not follow non-standard symlinks
subpath_excludes.add(ancestor)
break
if (not symlink_dirs) and ancestor in results:
# We can be sure all subpaths of this ancestor are redundant
subpath_excludes.add(ancestor)
break
if ancestor == path.parent:
# Final iteration over ancestors
if ancestor in matches and path not in symlink_dirs:
# Redundant (but don't exclude subpaths in case any of the
# subpaths are std symlink dirs)
break
else: # no break
results.append(path)
return results


def is_relative_to(path1: Union[Path, str], path2: Union[Path, str]) -> bool:
Expand Down
73 changes: 31 additions & 42 deletions cylc/flow/workflow_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
expand_path,
get_workflow_run_dir,
glob_in_run_dir,
is_relative_to,
make_localhost_symlinks,
parse_rm_dirs,
remove_dir_and_target,
Expand Down Expand Up @@ -212,21 +211,20 @@ class Install:
SOURCE = 'source'
"""Symlink to the workflow definition (For run dir)."""

RESERVED_DIRNAMES = [
LOG_DIR, SHARE_DIR, WORK_DIR, RUN_N, Service.DIRNAME, Install.DIRNAME]
RESERVED_DIRNAMES = frozenset([
LOG_DIR, SHARE_DIR, WORK_DIR, RUN_N, Service.DIRNAME, Install.DIRNAME
])
"""Reserved directory names that cannot be present in a source dir."""

RESERVED_NAMES = [FLOW_FILE, SUITE_RC, *RESERVED_DIRNAMES]
RESERVED_NAMES = frozenset([FLOW_FILE, SUITE_RC, *RESERVED_DIRNAMES])
"""Reserved filenames that cannot be used as run names."""

SYMLINK_DIRS = sorted(
[SHARE_CYCLE_DIR, SHARE_DIR, LOG_DIR, WORK_DIR, ''],
reverse=True
)
"""The symlink dirs that may be set in global.cylc[symlink dirs] (apart
from the run dir). '' represents the run dir. Note: these should be ordered
by deepest to shallowest (i.e. share/cycle/ before share/, '' last),
hence the use of `sorted()` to make sure."""
SYMLINK_DIRS = frozenset([
SHARE_CYCLE_DIR, SHARE_DIR, LOG_DIR, WORK_DIR, ''
])
"""The paths of the symlink dirs that may be set in
global.cylc[symlink dirs], relative to the run dir
('' represents the run dir)."""


class ContactFileFields:
Expand Down Expand Up @@ -707,7 +705,8 @@ def get_symlink_dirs(reg: str, run_dir: Union[Path, str]) -> Dict[str, Path]:
Raises WorkflowFilesError if a symlink points to an unexpected place.
"""
ret: Dict[str, Path] = {}
for _dir in WorkflowFiles.SYMLINK_DIRS:
for _dir in sorted(WorkflowFiles.SYMLINK_DIRS, reverse=True):
# ordered by deepest to shallowest
path = Path(run_dir, _dir)
if path.is_symlink():
target = path.resolve()
Expand All @@ -730,41 +729,31 @@ def _clean_using_glob(
"""Delete the files/dirs in the run dir that match the pattern.
Does not follow symlinks (apart from the standard symlink dirs).
Args:
run_dir: Absolute path of workflow run dir.
pattern: The glob pattern.
symlink_dirs: Paths of the workflow's symlink dirs relative to
the run dir.
"""
matches = glob_in_run_dir(run_dir, pattern)
abs_symlink_dirs = tuple(sorted(
(run_dir / d for d in symlink_dirs),
reverse=True # ordered by deepest to shallowest
))
matches = glob_in_run_dir(run_dir, pattern, abs_symlink_dirs)
if not matches:
return
abs_symlink_dirs = [run_dir / d for d in symlink_dirs]
# First clean any matching symlink dirs
if abs_symlink_dirs:
for path in abs_symlink_dirs:
if path in matches:
remove_dir_and_target(path)
if path == run_dir:
# We have deleted the run dir
return
# Redo glob as contents have changed
matches = glob_in_run_dir(run_dir, pattern)
for path in abs_symlink_dirs:
if path in matches:
remove_dir_and_target(path)
if path == run_dir:
# We have deleted the run dir
return
matches.remove(path)
# Now clean the rest
if matches and matches[0] == run_dir:
remove_dir_or_file(run_dir)
return
skip_children_of: Optional[Path] = None
for path in matches:
# Skip path if inside a dir we have already dealt with
if skip_children_of and is_relative_to(path, skip_children_of):
continue
# Important: exclude subpaths of symlinks (apart from the standard
# symlink dirs)
for a in list(reversed(path.relative_to(run_dir).parents))[1:]:
ancestor = Path(run_dir, a)
if ancestor.is_symlink() and ancestor not in abs_symlink_dirs:
skip_children_of = ancestor
break
else: # no break
if os.path.lexists(path):
remove_dir_or_file(path)
skip_children_of = path
remove_dir_or_file(path)


def remote_clean(
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_pathutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
import logging
import os
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Set
import pytest
from typing import Callable, Dict, Iterable, List, Set
from unittest.mock import Mock, patch, call

from cylc.flow.exceptions import UserInputError, WorkflowFilesError
Expand Down
Loading

0 comments on commit 2300f4d

Please sign in to comment.