compact: add --stats option
With --stats, it will be as slow as before, listing all repo objects.

Without --stats, it will be faster by using the cached chunks index.
ThomasWaldmann committed Nov 24, 2024
1 parent 4c1e2bc commit a46131b
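
To illustrate the difference, here is a hypothetical session (object counts and sizes are invented; the log lines are the strings emitted by the code in this commit):

$ borg -v compact --stats     # slow path: lists all repository objects
Starting compaction / garbage collection...
Getting object IDs present in the repository...
Computing object IDs used by archives...
Source data size was 20 GB in 100000 files.
Repository size is 12 GB in 42123 objects.
Compaction saved 1 GB.
Finished compaction / garbage collection...

$ borg -v compact             # fast path: uses the cached chunks index
Starting compaction / garbage collection...
Getting object IDs from cached chunks index...
Computing object IDs used by archives...
Source data size was 20 GB in 100000 files.
Repository has data stored in 42123 objects.
Finished compaction / garbage collection...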
Showing 2 changed files with 73 additions and 28 deletions.
66 changes: 47 additions & 19 deletions src/borg/archiver/compact_cmd.py
@@ -3,7 +3,7 @@
 
 from ._common import with_repository
 from ..archive import Archive
-from ..cache import write_chunkindex_to_repo_cache
+from ..cache import write_chunkindex_to_repo_cache, build_chunkindex_from_repo
 from ..constants import *  # NOQA
 from ..hashindex import ChunkIndex, ChunkIndexEntry
 from ..helpers import set_ec, EXIT_WARNING, EXIT_ERROR, format_file_size, bin_to_hex
@@ -18,25 +18,25 @@
 
 
 class ArchiveGarbageCollector:
-    def __init__(self, repository, manifest):
+    def __init__(self, repository, manifest, *, stats):
         self.repository = repository
         assert isinstance(repository, (Repository, RemoteRepository))
         self.manifest = manifest
         self.chunks = None  # a ChunkIndex, here used for: id -> (is_used, stored_size)
         self.total_files = None  # overall number of source files written to all archives in this repo
         self.total_size = None  # overall size of source file content data written to all archives
         self.archives_count = None  # number of archives
+        self.stats = stats  # compute repo space usage before/after - lists all repo objects, can be slow.
 
     @property
     def repository_size(self):
-        if self.chunks is None:
+        if self.chunks is None or not self.stats:
             return None
         return sum(entry.size for id, entry in self.chunks.iteritems())  # sum of stored sizes
 
     def garbage_collect(self):
         """Removes unused chunks from a repository."""
         logger.info("Starting compaction / garbage collection...")
-        logger.info("Getting object IDs present in the repository...")
         self.chunks = self.get_repository_chunks()
         logger.info("Computing object IDs used by archives...")
         (self.missing_chunks, self.reappeared_chunks, self.total_files, self.total_size, self.archives_count) = (
@@ -47,20 +47,30 @@ def garbage_collect(self):
         logger.info("Finished compaction / garbage collection...")
 
     def get_repository_chunks(self) -> ChunkIndex:
-        """Build a dict id -> size of all chunks present in the repository"""
-        chunks = ChunkIndex()
-        for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
-            # we add this id to the chunks index (as unused chunk), because
-            # we do not know yet whether it is actually referenced from some archives.
-            # we "abuse" the size field here. usually there is the plaintext size,
-            # but we use it for the size of the stored object here.
-            chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        """return a chunks index"""
+        if self.stats:  # slow method: build a fresh chunks index, with stored chunk sizes.
+            logger.info("Getting object IDs present in the repository...")
+            chunks = ChunkIndex()
+            for id, stored_size in repo_lister(self.repository, limit=LIST_SCAN_LIMIT):
+                # we add this id to the chunks index (as unused chunk), because
+                # we do not know yet whether it is actually referenced from some archives.
+                # we "abuse" the size field here. usually there is the plaintext size,
+                # but we use it for the size of the stored object here.
+                chunks[id] = ChunkIndexEntry(flags=ChunkIndex.F_NONE, size=stored_size)
+        else:  # faster: rely on existing chunks index (with flags F_NONE and size 0).
+            logger.info("Getting object IDs from cached chunks index...")
+            chunks = build_chunkindex_from_repo(self.repository, cache_immediately=True)
         return chunks
 
     def save_chunk_index(self):
-        # write_chunkindex_to_repo now removes all flags and size infos.
-        # we need this, as we put the wrong size in there.
-        write_chunkindex_to_repo_cache(self.repository, self.chunks, clear=True, force_write=True, delete_other=True)
+        if self.stats:
+            # write_chunkindex_to_repo now removes all flags and size infos.
+            # we need this, as we put the wrong size in there to support --stats computations.
+            write_chunkindex_to_repo_cache(
+                self.repository, self.chunks, clear=True, force_write=True, delete_other=True
+            )
+        else:
+            self.chunks.clear()  # we already have updated the repo cache in get_repository_chunks
         self.chunks = None  # nothing there (cleared!)
 
     def analyze_archives(self) -> Tuple[Set, Set, int, int, int]:
@@ -75,7 +85,8 @@ def use_it(id, *, wanted=False):
                     # chunk id is from chunks_healthy list: a lost chunk has re-appeared!
                     reappeared_chunks.add(id)
             else:
-                # we do NOT have this chunk in the repository!
+                # with --stats: we do NOT have this chunk in the repository!
+                # without --stats: we do not have this chunk or the chunks index is incomplete.
                 missing_chunks.add(id)
 
         missing_chunks: set[bytes] = set()
@@ -153,15 +164,18 @@ def report_and_delete(self):
         logger.info(
             f"Source data size was {format_file_size(self.total_size, precision=0)} in {self.total_files} files."
         )
-        logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
-        logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        if self.stats:
+            logger.info(f"Repository size is {format_file_size(repo_size_after, precision=0)} in {count} objects.")
+            logger.info(f"Compaction saved {format_file_size(repo_size_before - repo_size_after, precision=0)}.")
+        else:
+            logger.info(f"Repository has data stored in {count} objects.")
 
 
 class CompactMixIn:
     @with_repository(exclusive=True, compatibility=(Manifest.Operation.DELETE,))
     def do_compact(self, args, repository, manifest):
         """Collect garbage in repository"""
-        ArchiveGarbageCollector(repository, manifest).garbage_collect()
+        ArchiveGarbageCollector(repository, manifest, stats=args.stats).garbage_collect()
 
     def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
         from ._common import process_epilog
@@ -198,6 +212,16 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
             might not want to do that unless there are signs of lost archives (e.g. when
             seeing fatal errors when creating backups or when archives are missing in
             ``borg repo-list``).
+
+            When giving the ``--stats`` option, borg will internally list all repository
+            objects to determine their existence AND stored size. It will build a fresh
+            chunks index from that information and cache it in the repository. For some
+            types of repositories, this might be very slow. It will tell you the sum of
+            stored object sizes, before and after compaction.
+
+            Without ``--stats``, borg will rely on the cached chunks index to determine
+            existing object IDs (but there is no stored size information in the index,
+            thus it can't compute before/after compaction size statistics).
             """
         )
         subparser = subparsers.add_parser(
@@ -210,3 +234,7 @@ def build_parser_compact(self, subparsers, common_parser, mid_common_parser):
             help="compact repository",
         )
         subparser.set_defaults(func=self.do_compact)
+
+        subparser.add_argument(
+            "-s", "--stats", dest="stats", action="store_true", help="print statistics (might be much slower)"
+        )
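
The epilog above describes the two index-building strategies this commit implements. Here is a minimal, self-contained Python sketch of the idea — not borg's real API; the repository dict, the cached id set, and build_index are invented for illustration (borg's real code uses repo_lister and build_chunkindex_from_repo as shown in the diff):

# Toy model of the two chunk-index strategies added in this commit.
# NOTE: invented names for illustration only.
from typing import Dict, Optional, Set


def build_index(
    repo_objects: Dict[bytes, int], cached_ids: Set[bytes], stats: bool
) -> Dict[bytes, Optional[int]]:
    """Return id -> stored_size (stats=True) or id -> None (stats=False)."""
    if stats:
        # slow path: enumerate every repository object to learn its stored size,
        # which is what enables before/after size statistics
        return dict(repo_objects)
    # fast path: trust the cached index; object ids are known, sizes are not
    return {oid: None for oid in cached_ids}


repo = {b"chunk-a": 1000, b"chunk-b": 2500}
print(build_index(repo, set(repo), stats=True))   # {b'chunk-a': 1000, b'chunk-b': 2500}
print(build_index(repo, set(repo), stats=False))  # {b'chunk-a': None, b'chunk-b': None}

With stats=False there is no size information at all, which is exactly why report_and_delete can only print an object count in that mode.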
35 changes: 26 additions & 9 deletions src/borg/testsuite/archiver/compact_cmd_test.py
@@ -1,44 +1,61 @@
+import pytest
+
 from ...constants import *  # NOQA
 from . import cmd, create_src_archive, generate_archiver_tests, RK_ENCRYPTION
 
 pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local,remote,binary")  # NOQA
 
 
-def test_compact_empty_repository(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_empty_repository(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
 
     cmd(archiver, "repo-create", RK_ENCRYPTION)
 
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output
 
 
-def test_compact_after_deleting_all_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_all_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
 
     cmd(archiver, "repo-create", RK_ENCRYPTION)
     create_src_archive(archiver, "archive")
     cmd(archiver, "delete", "-a", "archive", exit_code=0)
 
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." in output
+    else:
+        assert "Repository has data stored in 0 objects." in output
     assert "Finished compaction" in output
 
 
-def test_compact_after_deleting_some_archives(archivers, request):
+@pytest.mark.parametrize("stats", (True, False))
+def test_compact_after_deleting_some_archives(archivers, request, stats):
     archiver = request.getfixturevalue(archivers)
 
     cmd(archiver, "repo-create", RK_ENCRYPTION)
     create_src_archive(archiver, "archive1")
     create_src_archive(archiver, "archive2")
     cmd(archiver, "delete", "-a", "archive1", exit_code=0)
 
-    output = cmd(archiver, "compact", "-v", exit_code=0)
+    args = ("-v", "--stats") if stats else ("-v",)
+    output = cmd(archiver, "compact", *args, exit_code=0)
     assert "Starting compaction" in output
     assert "Deleting " in output
-    assert "Repository size is 0 B in 0 objects." not in output
+    if stats:
+        assert "Repository size is 0 B in 0 objects." not in output
+    else:
+        assert "Repository has data stored in 0 objects." not in output
     assert "Finished compaction" in output
