diff --git a/dvc/repo/data.py b/dvc/repo/data.py
index da0e555297..9e0cf3333a 100644
--- a/dvc/repo/data.py
+++ b/dvc/repo/data.py
@@ -323,7 +323,6 @@ def _get_entries_not_in_remote(
     filter_keys: Optional[Iterable["DataIndexKey"]] = None,
     granular: bool = False,
     remote_refresh: bool = False,
-    batch_size: Optional[int] = None,
 ) -> list[str]:
     """Get entries that are not in remote storage."""
     from dvc.repo.worktree import worktree_view
@@ -337,10 +336,6 @@ def _get_entries_not_in_remote(
 
     missing_entries = []
 
-    to_check: dict[FileSystem, dict[str, list[DataIndexEntry]]] = defaultdict(
-        lambda: defaultdict(list)
-    )
-
     storage_map = view.storage_map
     with TqdmCallback(size=0, desc="Checking remote", unit="entry") as cb:
         for key, entry in view.iteritems(shallow=not granular):
@@ -358,28 +353,12 @@ def _get_entries_not_in_remote(
                 continue
 
             k = (*key, "") if entry.meta and entry.meta.isdir else key
-            if remote_refresh:
-                # on remote_refresh, collect all entries to check
-                # then check them in batches below
-                try:
-                    remote_fs, remote_path = storage_map.get_remote(entry)
-                    to_check[remote_fs][remote_path].append(entry)
-                    cb.size += 1
-                    cb.relative_update(0)  # try to update the progress bar
-                except StorageKeyError:
-                    pass
-            else:
-                try:
-                    if not storage_map.remote_exists(entry, refresh=remote_refresh):
-                        missing_entries.append(os.path.sep.join(k))
-                        cb.relative_update()  # no need to update the size
-                except StorageKeyError:
-                    pass
-    missing_entries.extend(
-        _get_missing_paths(
-            to_check, batch_size=batch_size, callback=StorageCallback(cb)
-        )
-    )
+            try:
+                if not storage_map.remote_exists(entry, refresh=remote_refresh):
+                    missing_entries.append(os.path.sep.join(k))
+                    cb.relative_update()  # no need to update the size
+            except StorageKeyError:
+                pass
 
     return missing_entries
 
@@ -428,7 +407,6 @@ def status(
         filter_keys=filter_keys,
         granular=granular,
         remote_refresh=remote_refresh,
-        batch_size=batch_size,
     )
 
     try:
diff --git a/tests/func/test_data_status.py b/tests/func/test_data_status.py
index e6fb151fbc..fa3f2777a2 100644
--- a/tests/func/test_data_status.py
+++ b/tests/func/test_data_status.py
@@ -881,3 +881,40 @@ def test_filter_targets_not_in_cache(
     assert dvc.data_status(targets, granular=True, not_in_remote=not_in_remote) == d | {
         key: granular
     }
+
+
+def test_compat_legacy_new_cache_types(M, tmp_dir, dvc, scm):
+    tmp_dir.gen({"foo": "foo", "bar": "bar"})
+    (tmp_dir / "foo.dvc").dump(
+        {
+            "outs": [
+                {"path": "foo", "md5": "acbd18db4cc2f85cedef654fccc4a4d8", "size": 3},
+            ]
+        }
+    )
+    dvc.add(tmp_dir / "bar", no_commit=True)
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": M.unordered("foo", "bar"),
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }
+
+    dvc.commit("foo")
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": ["bar"],
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }
+
+    dvc.commit("bar")
+
+    assert dvc.data_status() == {
+        **EMPTY_STATUS,
+        "not_in_cache": [],
+        "committed": {"added": M.unordered("foo", "bar")},
+        "git": M.dict(),
+    }