Skip to content

Commit 8d5f9ab

Browse files
authored
add: do not delete stage files before add (#6239)
* add: do not delete stage files before add Because of the way we collect stages and cache them, we were not able to collect them for `add` without first removing them from the workspace. Had we not removed them, we'd have two same/similar stages - one collected from the workspace and the other just created in-memory by `dvc add`. This would raise errors during graph checks, so we started to delete them and reset them (which is a very recent change, see #2886 and #3349). By deleting the file before we even do any checks, we are making DVC fragile, and this results in data loss for users after even simple mistakes. This change should make it more reliable and robust. Also, we have recently started to keep state for a lot of things, so by resetting it on each stage we waste a lot of performance, especially on gitignores. We cache dulwich's IgnoreManager, which, when reset too many times, wastes a lot of our time just collecting the ignores again next time (see #6227). It's hard to say how much this improves things, as it very much depends on the number of gitignores in the repo (which can be assumed to be quite large for a dvc repo) and the number of files that we are adding (e.g. `-R` adding a large directory). On a directory with 10,000 files (in a dataset-registry repo), creating stages on `dvc add -R` went from 64 files/sec to 1.1k files/sec. * add tests * make the test more specific
1 parent 07f5f6b commit 8d5f9ab

File tree

2 files changed

+48
-6
lines changed

2 files changed

+48
-6
lines changed

dvc/repo/add.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,11 @@ def add( # noqa: C901
115115
**kwargs,
116116
)
117117

118+
# remove existing stages that are to-be replaced with these
119+
# new stages for the graph checks.
120+
old_stages = set(repo.stages) - set(stages)
118121
try:
119-
repo.check_modified_graph(stages)
122+
repo.check_modified_graph(stages, list(old_stages))
120123
except OverlappingOutputPathsError as exc:
121124
msg = (
122125
"Cannot add '{out}', because it is overlapping with other "
@@ -250,7 +253,6 @@ def _create_stages(
250253
transfer=False,
251254
**kwargs,
252255
):
253-
from dvc.dvcfile import Dvcfile
254256
from dvc.stage import Stage, create_stage, restore_meta
255257

256258
expanded_targets = glob_targets(targets, glob=glob)
@@ -276,12 +278,9 @@ def _create_stages(
276278
external=external,
277279
)
278280
restore_meta(stage)
279-
Dvcfile(repo, stage.path).remove()
280281
if desc:
281282
stage.outs[0].desc = desc
282283

283-
repo._reset() # pylint: disable=protected-access
284-
285284
if not stage:
286285
if pbar is not None:
287286
pbar.total -= 1

tests/func/test_add.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@
2424
from dvc.hash_info import HashInfo
2525
from dvc.main import main
2626
from dvc.objects.db import ODBManager
27-
from dvc.output import OutputAlreadyTrackedError, OutputIsStageFileError
27+
from dvc.output import (
28+
OutputAlreadyTrackedError,
29+
OutputDoesNotExistError,
30+
OutputIsStageFileError,
31+
)
2832
from dvc.stage import Stage
2933
from dvc.stage.exceptions import (
3034
StageExternalOutputsError,
@@ -1190,3 +1194,42 @@ def test_add_ignored(tmp_dir, scm, dvc):
11901194
assert str(exc.value) == ("bad DVC file name '{}' is git-ignored.").format(
11911195
os.path.join("dir", "subdir.dvc")
11921196
)
1197+
1198+
1199+
def test_add_on_not_existing_file_should_not_remove_stage_file(tmp_dir, dvc):
1200+
(stage,) = tmp_dir.dvc_gen("foo", "foo")
1201+
(tmp_dir / "foo").unlink()
1202+
dvcfile_contents = (tmp_dir / stage.path).read_text()
1203+
1204+
with pytest.raises(OutputDoesNotExistError):
1205+
dvc.add("foo")
1206+
assert (tmp_dir / "foo.dvc").exists()
1207+
assert (tmp_dir / stage.path).read_text() == dvcfile_contents
1208+
1209+
1210+
@pytest.mark.parametrize(
1211+
"target",
1212+
[
1213+
"dvc.repo.Repo.check_modified_graph",
1214+
"dvc.stage.Stage.save",
1215+
"dvc.stage.Stage.commit",
1216+
],
1217+
)
1218+
def test_add_does_not_remove_stage_file_on_failure(
1219+
tmp_dir, dvc, mocker, target
1220+
):
1221+
(stage,) = tmp_dir.dvc_gen("foo", "foo")
1222+
tmp_dir.gen("foo", "foobar") # update file
1223+
dvcfile_contents = (tmp_dir / stage.path).read_text()
1224+
1225+
exc_msg = f"raising error from mocked '{target}'"
1226+
mocker.patch(
1227+
target,
1228+
side_effect=DvcException(exc_msg),
1229+
)
1230+
1231+
with pytest.raises(DvcException) as exc_info:
1232+
dvc.add("foo")
1233+
assert str(exc_info.value) == exc_msg
1234+
assert (tmp_dir / "foo.dvc").exists()
1235+
assert (tmp_dir / stage.path).read_text() == dvcfile_contents

0 commit comments

Comments
 (0)