-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: top level params and metrics #8529
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,14 +68,50 @@ def __init__( | |
if stages is not None: | ||
self.stages: List["Stage"] = stages | ||
self._collected_targets: Dict[int, List["StageInfo"]] = {} | ||
self._metrics: Dict[str, List[str]] = {} | ||
self._plots: Dict[str, Any] = {} | ||
self._params: Dict[str, List[str]] = {} | ||
|
||
@cached_property
def stages(self) -> List["Stage"]:  # pylint: disable=method-hidden
    """Return the stages of the repository, collecting them on first use.

    The work is delegated to ``self._collect()``; the ``cached_property``
    decorator keeps the result on the instance afterwards.
    """
    # note that ideally we should be keeping this in a set as it is unique,
    # hashable and has no concept of orderliness on its own. But we depend
    # on this to be somewhat ordered for status/metrics/plots, etc.
    return self._collect()
|
||
@cached_property
def _top_metrics(self):
    """Return ``self._metrics`` after making sure collection has run."""
    # Run collection before reading so the attribute is not read in its
    # initial, empty state.
    self._collect()
    return self._metrics
|
||
@cached_property
def _top_plots(self):
    """Return ``self._plots`` after making sure collection has run."""
    # Run collection before reading so the attribute is not read in its
    # initial, empty state.
    self._collect()
    return self._plots
|
||
@cached_property
def _top_params(self):
    """Return ``self._params`` after making sure collection has run."""
    # Run collection before reading so the attribute is not read in its
    # initial, empty state.
    self._collect()
    return self._params
|
||
def _collect(self):
    """Collect stages plus top-level metrics, plots and params.

    If ``stages`` is already stored on the instance (assigned in
    ``__init__`` or by an earlier call), it is returned as-is and nothing
    is collected again. Otherwise the stage collector is asked for
    everything in one pass and the results are stored on ``self.stages``,
    ``self._metrics``, ``self._plots`` and ``self._params``.

    Returns:
        The list of stages.
    """
    if "stages" in self.__dict__:
        return self.stages

    onerror = self.repo.stage_collection_error_handler

    # The former ``return self.stage_collector.collect_repo(...)`` that sat
    # here made everything below unreachable, so the top-level sections
    # were never stored; collection now goes through the combined call.
    # NOTE(review): a reviewer remarked that collection "needs to be
    # brought inside" another component; the target name is missing from
    # the text this was taken from - confirm before refactoring.
    # pylint: disable=protected-access
    (
        stages,
        metrics,
        plots,
        params,
    ) = self.stage_collector._collect_all_from_repo(onerror=onerror)

    self.stages = stages
    self._metrics = metrics
    self._plots = plots
    self._params = params
    return stages
|
||
def __repr__(self) -> str: | ||
from dvc.fs import LocalFileSystem | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -502,3 +502,104 @@ def is_out_or_ignored(root, directory): | |
|
||
def collect_repo(self, onerror: Callable[[str, Exception], None] = None):
    """Return everything yielded by ``self._collect_repo`` as a list.

    Args:
        onerror (optional): passed through unchanged to
            ``self._collect_repo``.
    """
    return list(self._collect_repo(onerror))
|
||
def _load_file(self, path):
    """Load one DVC file and return its stages and top-level sections.

    Args:
        path: path of the file to load; resolved through
            ``self._get_filepath`` first.

    Returns:
        A ``(stages, metrics_data, plots_data, params_data)`` tuple, where
        ``stages`` is a list and the remaining three values are read from
        the loader object exposed as ``dvcfile.stages``.
    """
    # NOTE(review): a reviewer flagged this method as "Duplicated from"
    # an existing one; the name of that method is missing from the text
    # this was taken from - confirm before deduplicating.
    from dvc.dvcfile import Dvcfile
    from dvc.stage.loader import SingleStageLoader, StageLoader

    path = self._get_filepath(path)
    dvcfile = Dvcfile(self.repo, path)
    # `dvcfile.stages` is not cached
    stages = dvcfile.stages  # type: ignore

    if isinstance(stages, SingleStageLoader):
        # A single-stage loader is indexed with ``None`` to get its stage.
        stages_ = [stages[None]]
    else:
        assert isinstance(stages, StageLoader)
        keys = self._get_keys(stages)
        stages_ = [stages[key] for key in keys]

    return (
        stages_,
        stages.metrics_data,
        stages.plots_data,
        stages.params_data,
    )
|
||
def _collect_all_from_repo(
    self, onerror: Callable[[str, Exception], None] = None
):
    """Collects all of the stages present in the DVC repo, together with
    the top-level metrics, plots and params of every loaded file.

    Args:
        onerror (optional): callable that will be called with two args:
            the filepath whose collection failed and the exc instance.
            It can report the error to continue with the collection
            (and, skip failed ones), or raise the exception to abort
            the collection.

    Returns:
        A ``(stages, metrics, plots, params)`` tuple. ``stages`` is a flat
        list of every stage loaded; the other three are dicts keyed by
        file path holding the truthy value ``self._load_file`` returned
        for that file (files with an empty section are left out).

    Raises:
        DvcException: re-raised from ``self._load_file`` when no
            ``onerror`` callable is given.
    """
    # NOTE(review): the author's own review comment reads "Bad naming I
    # know, also duplicates logic from" an existing method; the name of
    # that method is missing from the text this was taken from.
    from dvc.dvcfile import is_valid_filename
    from dvc.fs import LocalFileSystem

    scm = self.repo.scm
    sep = self.fs.sep
    outs: Set[str] = set()

    is_local_fs = isinstance(self.fs, LocalFileSystem)

    def is_ignored(path):
        # apply only for the local fs
        return is_local_fs and scm.is_ignored(path)

    def is_dvcfile_and_not_ignored(root, file):
        return is_valid_filename(file) and not is_ignored(
            f"{root}{sep}{file}"
        )

    def is_out_or_ignored(root, directory):
        dir_path = f"{root}{sep}{directory}"
        # trailing slash needed to check if a directory is gitignored
        return dir_path in outs or is_ignored(f"{dir_path}{sep}")

    walk_iter = self.repo.dvcignore.walk(self.fs, self.repo.root_dir)
    if logger.isEnabledFor(logging.TRACE):  # type: ignore[attr-defined]
        walk_iter = log_walk(walk_iter)

    stages = []
    metrics = {}
    plots = {}
    params = {}

    for root, dirs, files in walk_iter:
        dvcfile_filter = partial(is_dvcfile_and_not_ignored, root)
        for file in filter(dvcfile_filter, files):
            file_path = self.fs.path.join(root, file)
            try:
                (
                    new_stages,
                    new_metrics,
                    new_plots,
                    new_params,
                ) = self._load_file(file_path)
            except DvcException as exc:
                # With a handler: report and move on to the next file.
                # Without one: let the failure propagate.
                if onerror:
                    onerror(relpath(file_path), exc)
                    continue
                raise

            stages.extend(new_stages)
            # Only record sections that actually have content.
            if new_metrics:
                metrics[file_path] = new_metrics
            if new_plots:
                plots[file_path] = new_plots
            if new_params:
                params[file_path] = new_params

            # Remember local outputs so directories that are stage
            # outputs can be filtered out of ``dirs`` below.
            outs.update(
                out.fspath
                for stage in new_stages
                for out in stage.outs
                if out.protocol == "local"
            )
        # In-place update of ``dirs``; presumably the walker honours it
        # the way ``os.walk`` does and skips the removed directories -
        # TODO confirm against ``dvcignore.walk``.
        dirs[:] = [d for d in dirs if not is_out_or_ignored(root, d)]
    return stages, metrics, plots, params
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,9 @@ def __init__(self, dvcfile, data, lockfile_data=None): | |
self.dvcfile = dvcfile | ||
self.data = data or {} | ||
self.stages_data = self.data.get("stages", {}) | ||
self.metrics_data = self.data.get("metrics", []) | ||
self.params_data = self.data.get("params", []) | ||
self.plots_data = self.data.get("plots", {}) | ||
Comment on lines
+27
to
+29
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The whole PR is standing on these 3 lines. It feels weird to use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to rearchitect here. |
||
self.repo = self.dvcfile.repo | ||
|
||
lockfile_data = lockfile_data or {} | ||
|
@@ -171,6 +174,9 @@ def __init__(self, dvcfile, stage_data, stage_text=None): | |
self.dvcfile = dvcfile | ||
self.stage_data = stage_data or {} | ||
self.stage_text = stage_text | ||
self.metrics_data = [] | ||
self.params_data = [] | ||
self.plots_data = {} | ||
|
||
def __getitem__(self, item): | ||
if item: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These all might be temporary, we probably should unify top-level metrics and stage-level metrics. Hence, keeping them as private.