From c182b749521bbbaf7fe48b8bc5deebf2d57e0729 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Tue, 3 May 2022 03:01:03 +0200
Subject: [PATCH 1/8] transfer: copy archives from another repo

This is somewhat similar to borg recreate, but with a different focus and much simpler:

- not changing the compression algo
- not changing the chunking
- not excluding files inside an archive by path match
- only dealing with complete archives

but:

- different src and dst repo
- only reading each chunk once
- keeping the compressed payload (no decompression/recompression effort)

--dry-run can be used before and afterwards to check.
---
 src/borg/archiver.py | 101 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index de9489ff0a..8331c49070 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -338,6 +338,70 @@ def do_serve(self, args):
         ).serve()
         return EXIT_SUCCESS
 
+    @with_other_repository(manifest=True, key=True, compatibility=(Manifest.Operation.READ,))
+    @with_repository(exclusive=True, manifest=True, cache=True, compatibility=(Manifest.Operation.WRITE,))
+    def do_transfer(self, args, *,
+                    repository, manifest, key, cache,
+                    other_repository=None, other_manifest=None, other_key=None):
+        """transfer archives from another repository"""
+        dry_run = args.dry_run
+
+        args.consider_checkpoints = True
+        archive_names = tuple(x.name for x in other_manifest.archives.list_considering(args))
+        if not archive_names:
+            return EXIT_SUCCESS
+
+        for name in archive_names:
+            transfer_size = 0
+            present_size = 0
+            if name in manifest.archives and not dry_run:
+                print(f"{name}: archive is already present in destination repo, skipping.")
+            else:
+                if not dry_run:
+                    print(f"{name}: copying archive to destination repo...")
+                other_archive = Archive(other_repository, other_key, other_manifest, name)
+                archive = Archive(repository, key, manifest, name, cache=cache, create=True) if not dry_run else None
+                for item in other_archive.iter_items():
+                    if 'chunks' in item:
+                        chunks = []
+                        for chunk_id, size, _ in item.chunks:
+                            refcount = cache.seen_chunk(chunk_id, size)
+                            if refcount == 0:  # target repo does not yet have this chunk
+                                if not dry_run:
+                                    cdata = other_repository.get(chunk_id)
+                                    # keep compressed payload same, avoid decompression / recompression
+                                    data = other_key.decrypt(chunk_id, cdata, decompress=False)
+                                    chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False,
+                                                                  compress=False, size=size)
+                                    cache.repository.async_response(wait=False)
+                                    chunks.append(chunk_entry)
+                                transfer_size += size
+                            else:
+                                if not dry_run:
+                                    chunk_entry = cache.chunk_incref(chunk_id, archive.stats)
+                                    chunks.append(chunk_entry)
+                                present_size += size
+                        if not dry_run:
+                            item.chunks = chunks  # overwrite! IDs and sizes are same, csizes are likely different
+                            archive.stats.nfiles += 1
+                    # TODO: filter the item data, get rid of legacy crap
+                    if not dry_run:
+                        archive.add_item(item)
+                if not dry_run:
+                    additional_metadata = {}
+                    # keep all metadata except archive version and stats. also do not keep
+                    # recreate_source_id, recreate_args, recreate_partial_chunks which were used only in 1.1.0b1 .. b2.
+                    for attr in ('cmdline', 'hostname', 'username', 'time', 'time_end', 'comment',
+                                 'chunker_params', 'recreate_cmdline'):
+                        if hasattr(other_archive.metadata, attr):
+                            additional_metadata[attr] = getattr(other_archive.metadata, attr)
+                    archive.save(stats=archive.stats, additional_metadata=additional_metadata)
+                    print(f"{name}: finished. "
+                          f"transfer_size: {transfer_size} present_size: {present_size}")
+                else:
+                    print(f"{name}: completed" if transfer_size == 0 else
+                          f"{name}: incomplete, transfer_size: {transfer_size} present_size: {present_size}")
+        return EXIT_SUCCESS
+
     @with_repository(create=True, exclusive=True, manifest=False)
     @with_other_repository(key=True, compatibility=(Manifest.Operation.READ, ))
     def do_init(self, args, repository, *, other_repository=None, other_key=None):
@@ -4083,6 +4147,43 @@ def define_borg_mount(parser):
                                help='archives to delete')
         define_archive_filters_group(subparser)
 
+        # borg transfer
+        transfer_epilog = process_epilog("""
+        This command transfers archives from one repository to another repository.
+
+        Suggested use:
+
+        # initialize DST_REPO reusing key material from SRC_REPO, so that
+        # chunking and chunk id generation will work in the same way as before.
+        borg init --other-location=SRC_REPO --encryption=DST_ENC DST_REPO
+
+        # transfer archives from SRC_REPO to DST_REPO
+        borg transfer --dry-run SRC_REPO DST_REPO  # check what it would do
+        borg transfer SRC_REPO DST_REPO            # do it!
+        borg transfer --dry-run SRC_REPO DST_REPO  # check! anything left?
+
+        The default is to transfer all archives, including checkpoint archives.
+
+        You can use the archive filter options (e.g. --prefix) to limit which
+        archives are transferred. For big repositories with multiple data sets,
+        this is recommended to keep the runtime per invocation lower.
+        """)
+        subparser = subparsers.add_parser('transfer', parents=[common_parser], add_help=False,
+                                          description=self.do_transfer.__doc__,
+                                          epilog=transfer_epilog,
+                                          formatter_class=argparse.RawDescriptionHelpFormatter,
+                                          help='transfer archives from another repository')
+        subparser.set_defaults(func=self.do_transfer)
+        subparser.add_argument('-n', '--dry-run', dest='dry_run', action='store_true',
+                               help='do not change repository, just check')
+        subparser.add_argument('other_location', metavar='SRC_REPOSITORY',
+                               type=location_validator(archive=False, other=True),
+                               help='source repository')
+        subparser.add_argument('location', metavar='DST_REPOSITORY',
+                               type=location_validator(archive=False, other=False),
+                               help='destination repository')
+        define_archive_filters_group(subparser)
+
         # borg diff
         diff_epilog = process_epilog("""
         This command finds differences (file contents, user/group/mode) between archives.

From 01ca5c71ffa73f9913a62352648e8ec8db6afebd Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Tue, 3 May 2022 16:58:57 +0200
Subject: [PATCH 2/8] transfer: clean item of attic 0.13 'acl' bug remnants

---
 src/borg/archiver.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 8331c49070..3a920d38a0 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -344,6 +344,12 @@ def do_transfer(self, args, *,
                     repository, manifest, key, cache,
                     other_repository=None, other_manifest=None, other_key=None):
         """transfer archives from another repository"""
+
+        def upgrade_item(item):
+            """upgrade item as needed, get rid of legacy crap"""
+            item._dict.pop('acl', None)  # remove remnants of bug in attic <= 0.13
+            return item
+
         dry_run = args.dry_run
 
         args.consider_checkpoints = True
@@ -384,9 +390,8 @@ def do_transfer(self, args, *,
                         if not dry_run:
                             item.chunks = chunks  # overwrite!
IDs and sizes are same, csizes are likely different archive.stats.nfiles += 1 - # TODO: filter the item data, get rid of legacy crap if not dry_run: - archive.add_item(item) + archive.add_item(upgrade_item(item)) if not dry_run: additional_metadata = {} # keep all metadata except archive version and stats. also do not keep From 27cc50c9b40137910ac11a5cda58e45ad16a4896 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 17:13:37 +0200 Subject: [PATCH 3/8] transfer: make sure items with chunks have precomputed size --- src/borg/archiver.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 3a920d38a0..0fba5c15ac 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -348,6 +348,7 @@ def do_transfer(self, args, *, def upgrade_item(item): """upgrade item as needed, get rid of legacy crap""" item._dict.pop('acl', None) # remove remnants of bug in attic <= 0.13 + item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks return item dry_run = args.dry_run From af6c34222a5f8c5103ace01e3065661954bbc34a Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Tue, 3 May 2022 20:51:43 +0200 Subject: [PATCH 4/8] transfer: remove the zlib type bytes hack hack: see the docstring of ZLIB_legacy class. New clean ZLIB class that works as every other compressor. ZLIB ID 0x0500, ZLIB_legacy ID 0x.8.. --- src/borg/archiver.py | 8 +++++- src/borg/compress.pyx | 51 ++++++++++++++++++++++++++++++---- src/borg/testsuite/archiver.py | 4 +-- src/borg/testsuite/compress.py | 4 +-- 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 0fba5c15ac..14efb43dd3 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -44,7 +44,7 @@ from .archive import has_link from .cache import Cache, assert_secure, SecurityManager from .constants import * # NOQA - from .compress import CompressionSpec + from .compress import CompressionSpec, ZLIB, ZLIB_legacy from .crypto.key import key_creator, key_argument_names, tam_required_file, tam_required from .crypto.key import RepoKey, KeyfileKey, Blake2RepoKey, Blake2KeyfileKey, FlexiKey from .crypto.keymanager import KeyManager @@ -351,6 +351,11 @@ def upgrade_item(item): item.get_size(memorize=True) # if not already present: compute+remember size for items with chunks return item + def upgrade_compressed_chunk(chunk): + if ZLIB_legacy.detect(chunk): + chunk = ZLIB.ID + chunk # get rid of the attic legacy: prepend separate type bytes for zlib + return chunk + dry_run = args.dry_run args.consider_checkpoints = True @@ -378,6 +383,7 @@ def upgrade_item(item): cdata = other_repository.get(chunk_id) # keep compressed payload same, avoid decompression / recompression data = other_key.decrypt(chunk_id, cdata, decompress=False) + data = upgrade_compressed_chunk(data) chunk_entry = cache.add_chunk(chunk_id, data, archive.stats, wait=False, compress=False, size=size) cache.repository.async_response(wait=False) diff --git a/src/borg/compress.pyx b/src/borg/compress.pyx index 1900480025..bbc2eaaf8b 100644 --- a/src/borg/compress.pyx +++ b/src/borg/compress.pyx @@ -331,14 +331,52 @@ class ZSTD(DecidingCompressor): return dest[:osize] -class ZLIB(CompressorBase): +class ZLIB(DecidingCompressor): """ zlib compression / decompression (python stdlib) """ - ID = b'\x08\x00' # not used here, see detect() - # avoid all 0x.8.. IDs elsewhere! 
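+    # note: 0x0500 is a fresh ID pair; the 0x.8.. patterns stay reserved for ZLIB_legacy autodetection.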
+ ID = b'\x05\x00' name = 'zlib' + def __init__(self, level=6, **kwargs): + super().__init__(**kwargs) + self.level = level + + def _decide(self, data): + """ + Decides what to do with *data*. Returns (compressor, zlib_data). + + *zlib_data* is the ZLIB result if *compressor* is ZLIB as well, otherwise it is None. + """ + zlib_data = zlib.compress(data, self.level) + if len(zlib_data) < len(data): + return self, zlib_data + else: + return NONE_COMPRESSOR, None + + def decompress(self, data): + data = super().decompress(data) + try: + return zlib.decompress(data) + except zlib.error as e: + raise DecompressionError(str(e)) from None + + +class ZLIB_legacy(CompressorBase): + """ + zlib compression / decompression (python stdlib) + + Note: This is the legacy ZLIB support as used by borg < 1.3. + It still suffers from attic *only* supporting zlib and not having separate + ID bytes to differentiate between differently compressed chunks. + This just works because zlib compressed stuff always starts with 0x.8.. bytes. + Newer borg uses the ZLIB class that has separate ID bytes (as all the other + compressors) and does not need this hack. + """ + ID = b'\x08\x00' # not used here, see detect() + # avoid all 0x.8.. IDs elsewhere! + name = 'zlib_legacy' + @classmethod def detect(cls, data): # matches misc. patterns 0x.8.. used by zlib @@ -502,13 +540,14 @@ COMPRESSOR_TABLE = { CNONE.name: CNONE, LZ4.name: LZ4, ZLIB.name: ZLIB, + ZLIB_legacy.name: ZLIB_legacy, LZMA.name: LZMA, Auto.name: Auto, ZSTD.name: ZSTD, ObfuscateSize.name: ObfuscateSize, } # List of possible compression types. Does not include Auto, since it is a meta-Compressor. -COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, LZMA, ObfuscateSize, ] # check fast stuff first +COMPRESSOR_LIST = [LZ4, ZSTD, CNONE, ZLIB, ZLIB_legacy, LZMA, ObfuscateSize, ] # check fast stuff first def get_compressor(name, **kwargs): cls = COMPRESSOR_TABLE[name] @@ -554,7 +593,7 @@ class CompressionSpec: self.name = values[0] if self.name in ('none', 'lz4', ): return - elif self.name in ('zlib', 'lzma', ): + elif self.name in ('zlib', 'lzma', 'zlib_legacy'): # zlib_legacy just for testing if count < 2: level = 6 # default compression level in py stdlib elif count == 2: @@ -597,7 +636,7 @@ class CompressionSpec: def compressor(self): if self.name in ('none', 'lz4', ): return get_compressor(self.name) - elif self.name in ('zlib', 'lzma', 'zstd', ): + elif self.name in ('zlib', 'lzma', 'zstd', 'zlib_legacy'): return get_compressor(self.name, level=self.level) elif self.name == 'auto': return get_compressor(self.name, compressor=self.inner.compressor) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 65fe2a446a..0a4f862c6d 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -2442,7 +2442,7 @@ def test_compression_none_uncompressible(self): def test_compression_zlib_compressible(self): size, csize = self._get_sizes('zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 + assert csize == 37 def test_compression_zlib_uncompressible(self): size, csize = self._get_sizes('zlib', compressible=False) @@ -2451,7 +2451,7 @@ def test_compression_zlib_uncompressible(self): def test_compression_auto_compressible(self): size, csize = self._get_sizes('auto,zlib', compressible=True) assert csize < size * 0.1 - assert csize == 35 # same as compression 'zlib' + assert csize == 37 # same as compression 'zlib' def test_compression_auto_uncompressible(self): size, csize = self._get_sizes('auto,zlib', 
compressible=False)

diff --git a/src/borg/testsuite/compress.py b/src/borg/testsuite/compress.py
index 3942c3537f..c93dd3bb67 100644
--- a/src/borg/testsuite/compress.py
+++ b/src/borg/testsuite/compress.py
@@ -88,11 +88,11 @@ def test_autodetect_invalid():
         Compressor(**params).decompress(b'\x08\x00notreallyzlib')
 
 
-def test_zlib_compat():
+def test_zlib_legacy_compat():
     # for compatibility reasons, we do not add an extra header for zlib,
     # nor do we expect one when decompressing / autodetecting
     for level in range(10):
-        c = get_compressor(name='zlib', level=level)
+        c = get_compressor(name='zlib_legacy', level=level)
         cdata1 = c.compress(data)
         cdata2 = zlib.compress(data, level)
         assert cdata1 == cdata2

From c3a35b3db73eff5779e3d409350d2da902e27bcc Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Wed, 4 May 2022 01:58:24 +0200
Subject: [PATCH 5/8] transfer: all hardlinks have chunks, maybe chunks_healthy, hlid

Item.hlid: same id, same hardlink (sha256 digest)
Item.hardlink_master: not used for new archives any more
Item.source: not used for hardlink slaves any more

---
 src/borg/archiver.py  | 15 +++++++++++++++
 src/borg/constants.py |  2 +-
 src/borg/item.pyx     |  3 ++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 14efb43dd3..4c5eb1e54b 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -347,6 +347,20 @@ def do_transfer(self, args, *,
 
         def upgrade_item(item):
             """upgrade item as needed, get rid of legacy crap"""
+            if item.get('hardlink_master', True) and 'source' not in item and hardlinkable(item.mode):
+                item._dict['hlid'] = hlid = hashlib.sha256(item._dict['path']).digest()
+                hardlink_masters[hlid] = (item._dict.get('chunks'), item._dict.get('chunks_healthy'))
+            elif 'source' in item and hardlinkable(item.mode):
+                item._dict['hlid'] = hlid = hashlib.sha256(item._dict['source']).digest()
+                chunks, chunks_healthy = hardlink_masters.get(hlid, (None, None))
+                if chunks is not None:
+                    item._dict['chunks'] = chunks
+                    for chunk_id, _, _ in chunks:
+                        cache.chunk_incref(chunk_id, archive.stats)
+                if chunks_healthy is not None:
+                    item._dict['chunks_healthy'] = chunks_healthy
+                item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            item._dict.pop('hardlink_master', None)  # not used for hardlinks any more, replaced by hlid
             item._dict.pop('acl', None)  # remove remnants of bug in attic <= 0.13
             item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
             return item
@@ -371,6 +385,7 @@ def upgrade_compressed_chunk(chunk):
             else:
                 if not dry_run:
                     print(f"{name}: copying archive to destination repo...")
+                hardlink_masters = {}
                 other_archive = Archive(other_repository, other_key, other_manifest, name)
diff --git a/src/borg/constants.py b/src/borg/constants.py
index 0b2ef16a16..13eb8bd232 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -1,5 +1,5 @@
 # this set must be kept complete, otherwise the RobustUnpacker might malfunction:
-ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master',
+ITEM_KEYS = frozenset(['path', 'source', 'rdev', 'chunks', 'chunks_healthy', 'hardlink_master', 'hlid',
                        'mode', 'user', 'group', 'uid', 'gid', 'mtime', 'atime', 'ctime', 'birthtime', 'size',
                        'xattrs', 'bsdflags', 'acl_nfs4', 'acl_access', 'acl_default', 'acl_extended',
                        'part'])
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 48debf1839..0b2598ffe3 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -181,7 +181,8 @@ class Item(PropDict):
     # compatibility note: this is a new feature, in old archives size will be missing.
     size = PropDict._make_property('size', int)
 
-    hardlink_master = PropDict._make_property('hardlink_master', bool)
+    hlid = PropDict._make_property('hlid', bytes)  # hard link id: same value means same hard link.
+    hardlink_master = PropDict._make_property('hardlink_master', bool)  # legacy
 
     chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None')
     chunks_healthy = PropDict._make_property('chunks_healthy', (list, type(None)), 'list or None')

From d7b07a9a5edd93d7cadf54db9e445c7435421124 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Wed, 4 May 2022 10:34:33 +0200
Subject: [PATCH 6/8] transfer: convert timestamps int/bigint -> msgpack.Timestamp, see #2323

Timestamp scales to 64 or 96 bit serialization formats, that should be
enough for everybody. We use this in archived items and also in the
files cache.

---
 src/borg/archiver.py            |  4 ++++
 src/borg/cache.py               | 11 ++++++-----
 src/borg/helpers/msgpack.py     | 17 ++++++++++++++---
 src/borg/helpers/parseformat.py |  3 +++
 src/borg/item.pyx               | 11 +++++------
 src/borg/testsuite/item.py      |  7 ++++---
 6 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index 4c5eb1e54b..9303cd533e 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -360,6 +360,10 @@ def upgrade_item(item):
                 if chunks_healthy is not None:
                     item._dict['chunks_healthy'] = chunks_healthy
                 item._dict.pop('source')  # not used for hardlinks any more, replaced by hlid
+            for attr in 'atime', 'ctime', 'mtime', 'birthtime':
+                if attr in item:
+                    ns = getattr(item, attr)  # decode (bigint or Timestamp) --> int ns
+                    setattr(item, attr, ns)  # encode int ns --> msgpack.Timestamp only, no bigint any more
             item._dict.pop('hardlink_master', None)  # not used for hardlinks any more, replaced by hlid
             item._dict.pop('acl', None)  # remove remnants of bug in attic <= 0.13
             item.get_size(memorize=True)  # if not already present: compute+remember size for items with chunks
diff --git a/src/borg/cache.py b/src/borg/cache.py
index 6fa74e692d..6cd6123590 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -19,7 +19,7 @@
 from .helpers import Error
 from .helpers import Manifest
 from .helpers import get_cache_dir, get_security_dir
-from .helpers import int_to_bigint, bigint_to_int, bin_to_hex, parse_stringified_list
+from .helpers import bin_to_hex, parse_stringified_list
 from .helpers import format_file_size
 from .helpers import safe_ns
 from .helpers import yes
@@ -28,6 +28,7 @@
 from .helpers import set_ec, EXIT_WARNING
 from .helpers import safe_unlink
 from .helpers import msgpack
+from .helpers.msgpack import int_to_timestamp, timestamp_to_int
 from .item import ArchiveItem, ChunkListEntry
 from .crypto.key import PlaintextKey
 from .crypto.file_integrity import IntegrityCheckedFile, DetachedIntegrityCheckedFile, FileIntegrityError
@@ -623,7 +624,7 @@ def commit(self):
                     # this is to avoid issues with filesystem snapshots and cmtime granularity.
                     # Also keep files from older backups that have not reached BORG_FILES_CACHE_TTL yet.
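                    # note: entry.cmtime may be an old int/bigint value (from a borg < 1.3 files cache)
                    # or a msgpack Timestamp (written from now on); timestamp_to_int accepts both.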
entry = FileCacheEntry(*msgpack.unpackb(item)) - if entry.age == 0 and bigint_to_int(entry.cmtime) < self._newest_cmtime or \ + if entry.age == 0 and timestamp_to_int(entry.cmtime) < self._newest_cmtime or \ entry.age > 0 and entry.age < ttl: msgpack.pack((path_hash, entry), fd) entry_count += 1 @@ -1018,10 +1019,10 @@ def file_known_and_unchanged(self, hashed_path, path_hash, st): if 'i' in cache_mode and entry.inode != st.st_ino: files_cache_logger.debug('KNOWN-CHANGED: file inode number has changed: %r', hashed_path) return True, None - if 'c' in cache_mode and bigint_to_int(entry.cmtime) != st.st_ctime_ns: + if 'c' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_ctime_ns: files_cache_logger.debug('KNOWN-CHANGED: file ctime has changed: %r', hashed_path) return True, None - elif 'm' in cache_mode and bigint_to_int(entry.cmtime) != st.st_mtime_ns: + elif 'm' in cache_mode and timestamp_to_int(entry.cmtime) != st.st_mtime_ns: files_cache_logger.debug('KNOWN-CHANGED: file mtime has changed: %r', hashed_path) return True, None # we ignored the inode number in the comparison above or it is still same. @@ -1049,7 +1050,7 @@ def memorize_file(self, hashed_path, path_hash, st, ids): elif 'm' in cache_mode: cmtime_type = 'mtime' cmtime_ns = safe_ns(st.st_mtime_ns) - entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_bigint(cmtime_ns), chunk_ids=ids) + entry = FileCacheEntry(age=0, inode=st.st_ino, size=st.st_size, cmtime=int_to_timestamp(cmtime_ns), chunk_ids=ids) self.files[path_hash] = msgpack.packb(entry) self._newest_cmtime = max(self._newest_cmtime or 0, cmtime_ns) files_cache_logger.debug('FILES-CACHE-UPDATE: put %r [has %s] <- %r', diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 2ace88feef..411f00fec4 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -24,7 +24,7 @@ from msgpack import unpack as mp_unpack from msgpack import version as mp_version -from msgpack import ExtType +from msgpack import ExtType, Timestamp from msgpack import OutOfData @@ -164,7 +164,7 @@ def get_limited_unpacker(kind): return Unpacker(**args) -def bigint_to_int(mtime): +def bigint_to_int(mtime): # legacy """Convert bytearray to int """ if isinstance(mtime, bytes): @@ -172,7 +172,7 @@ def bigint_to_int(mtime): return mtime -def int_to_bigint(value): +def int_to_bigint(value): # legacy """Convert integers larger than 64 bits to bytearray Smaller integers are left alone @@ -180,3 +180,14 @@ def int_to_bigint(value): if value.bit_length() > 63: return value.to_bytes((value.bit_length() + 9) // 8, 'little', signed=True) return value + + +def int_to_timestamp(ns): + return Timestamp.from_unix_nano(ns) + + +def timestamp_to_int(ts): + if isinstance(ts, Timestamp): + return ts.to_unix_nano() + # legacy support note: we need to keep the bigint conversion for compatibility with borg < 1.3 archives. + return bigint_to_int(ts) diff --git a/src/borg/helpers/parseformat.py b/src/borg/helpers/parseformat.py index b681571283..99026f478d 100644 --- a/src/borg/helpers/parseformat.py +++ b/src/borg/helpers/parseformat.py @@ -19,6 +19,7 @@ from .errors import Error from .fs import get_keys_dir +from .msgpack import Timestamp from .time import OutputTimestamp, format_time, to_localtime, safe_timestamp, safe_s from .. import __version__ as borg_version from .. 
import __version_tuple__ as borg_version_tuple @@ -1050,6 +1051,8 @@ def decode(d): value = decode_tuple(value) elif isinstance(value, bytes): value = decode_bytes(value) + elif isinstance(value, Timestamp): + value = value.to_unix_nano() if isinstance(key, bytes): key = key.decode() res[key] = value diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 0b2598ffe3..764279db03 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -3,9 +3,9 @@ from collections import namedtuple from .constants import ITEM_KEYS, ARCHIVE_KEYS from .helpers import safe_encode, safe_decode -from .helpers import bigint_to_int, int_to_bigint from .helpers import StableDict from .helpers import format_file_size +from .helpers.msgpack import timestamp_to_int, int_to_timestamp cdef extern from "_item.c": @@ -171,11 +171,10 @@ class Item(PropDict): rdev = PropDict._make_property('rdev', int) bsdflags = PropDict._make_property('bsdflags', int) - # note: we need to keep the bigint conversion for compatibility with borg 1.0 archives. - atime = PropDict._make_property('atime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) - ctime = PropDict._make_property('ctime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) - mtime = PropDict._make_property('mtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) - birthtime = PropDict._make_property('birthtime', int, 'bigint', encode=int_to_bigint, decode=bigint_to_int) + atime = PropDict._make_property('atime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int) + ctime = PropDict._make_property('ctime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int) + mtime = PropDict._make_property('mtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int) + birthtime = PropDict._make_property('birthtime', int, 'int (ns)', encode=int_to_timestamp, decode=timestamp_to_int) # size is only present for items with a chunk list and then it is sum(chunk_sizes) # compatibility note: this is a new feature, in old archives size will be missing. diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index aa40cc0660..80b38edce4 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -3,6 +3,7 @@ from ..cache import ChunkListEntry from ..item import Item from ..helpers import StableDict +from ..helpers.msgpack import Timestamp def test_item_empty(): @@ -77,15 +78,15 @@ def test_item_int_property(): item.mode = "invalid" -def test_item_bigint_property(): +def test_item_mptimestamp_property(): item = Item() small, big = 42, 2 ** 65 item.atime = small assert item.atime == small - assert item.as_dict() == {'atime': small} + assert item.as_dict() == {'atime': Timestamp.from_unix_nano(small)} item.atime = big assert item.atime == big - assert item.as_dict() == {'atime': b'\0' * 8 + b'\x02'} + assert item.as_dict() == {'atime': Timestamp.from_unix_nano(big)} def test_item_user_group_none(): From 9447273d382fb723c7052b34c3880a9c061ff673 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Thu, 5 May 2022 19:36:02 +0200 Subject: [PATCH 7/8] cleanup msgpack related str/bytes mess, see #968 see ticket and borg.helpers.msgpack docstring. 
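
For illustration (a sketch, not part of the change itself; 'item', 'old' and 'new'
below are made-up names), this is how the settings described in the docstring
behave with msgpack-python:

    import msgpack

    item = {'path': '/home/user', 'data': b'\x00\x01'}

    # borg < 1.3 packed with use_bin_type=False: str and bytes both end up
    # in the old "raw" format and become indistinguishable.
    old = msgpack.packb(item, use_bin_type=False)
    print(msgpack.unpackb(old, raw=True))
    # {b'path': b'/home/user', b'data': b'\x00\x01'}

    # borg >= 1.3 packs with use_bin_type=True (msgpack 2.0 spec):
    # str -> raw/str, bytes -> bin, so the types survive the round trip.
    new = msgpack.packb(item, use_bin_type=True)
    print(msgpack.unpackb(new, raw=False))
    # {'path': '/home/user', 'data': b'\x00\x01'}

    # transitional borg still unpacks everything with raw=True, so even
    # new-format str comes back as bytes and must be decoded where str is wanted.
    print(msgpack.unpackb(new, raw=True))
    # {b'path': b'/home/user', b'data': b'\x00\x01'}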
--- src/borg/archive.py | 16 ++- src/borg/archiver.py | 12 +-- src/borg/cache_sync/unpack.h | 36 ++++--- src/borg/helpers/manifest.py | 46 ++++----- src/borg/helpers/msgpack.py | 101 +++++++++++++------ src/borg/item.pyx | 172 +++++++++++++++++++++++++++------ src/borg/remote.py | 49 +++++----- src/borg/testsuite/archive.py | 14 +-- src/borg/testsuite/archiver.py | 30 +++--- src/borg/testsuite/item.py | 6 +- 10 files changed, 319 insertions(+), 163 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 49edb816b2..89f0766698 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -34,7 +34,7 @@ from .platform import uid2user, user2uid, gid2group, group2gid from .helpers import parse_timestamp, to_localtime from .helpers import OutputTimestamp, format_timedelta, format_file_size, file_status, FileSize -from .helpers import safe_encode, safe_decode, make_path_safe, remove_surrogates +from .helpers import safe_encode, make_path_safe, remove_surrogates from .helpers import StableDict from .helpers import bin_to_hex from .helpers import safe_ns @@ -492,7 +492,6 @@ def _load_meta(self, id): def load(self, id): self.id = id self.metadata = self._load_meta(self.id) - self.metadata.cmdline = [safe_decode(arg) for arg in self.metadata.cmdline] self.name = self.metadata.name self.comment = self.metadata.get('comment', '') @@ -1550,7 +1549,7 @@ class RobustUnpacker: """ def __init__(self, validator, item_keys): super().__init__() - self.item_keys = [msgpack.packb(name.encode()) for name in item_keys] + self.item_keys = [msgpack.packb(name) for name in item_keys] self.validator = validator self._buffered_data = [] self._resync = False @@ -1769,7 +1768,7 @@ def valid_archive(obj): # lost manifest on a older borg version than the most recent one that was ever used # within this repository (assuming that newer borg versions support more item keys). manifest = Manifest(self.key, self.repository) - archive_keys_serialized = [msgpack.packb(name.encode()) for name in ARCHIVE_KEYS] + archive_keys_serialized = [msgpack.packb(name) for name in ARCHIVE_KEYS] pi = ProgressIndicatorPercent(total=len(self.chunks), msg="Rebuilding manifest %6.2f%%", step=0.01, msgid='check.rebuild_manifest') for chunk_id, _ in self.chunks.iteritems(): @@ -1916,9 +1915,9 @@ def robust_iterator(archive): Missing item chunks will be skipped and the msgpack stream will be restarted """ - item_keys = frozenset(key.encode() for key in self.manifest.item_keys) - required_item_keys = frozenset(key.encode() for key in REQUIRED_ITEM_KEYS) - unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and b'path' in item, + item_keys = self.manifest.item_keys + required_item_keys = REQUIRED_ITEM_KEYS + unpacker = RobustUnpacker(lambda item: isinstance(item, StableDict) and 'path' in item, self.manifest.item_keys) _state = 0 @@ -1943,7 +1942,7 @@ def valid_item(obj): # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item. # We ignore it here, should it exist. See test_attic013_acl_bug for details. 
obj.pop(b'acl', None) - keys = set(obj) + keys = set(k.decode('utf-8', errors='replace') for k in obj) if not required_item_keys.issubset(keys): return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys) if not keys.issubset(item_keys): @@ -2029,7 +2028,6 @@ def valid_item(obj): archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) if archive.version != 1: raise Exception('Unknown archive metadata version') - archive.cmdline = [safe_decode(arg) for arg in archive.cmdline] items_buffer = ChunkBuffer(self.key) items_buffer.write_chunk = add_callback for item in robust_iterator(archive): diff --git a/src/borg/archiver.py b/src/borg/archiver.py index 9303cd533e..d90321f9e7 100644 --- a/src/borg/archiver.py +++ b/src/borg/archiver.py @@ -54,7 +54,7 @@ from .helpers import PrefixSpec, GlobSpec, CommentSpec, SortBySpec, FilesCacheMode from .helpers import BaseFormatter, ItemFormatter, ArchiveFormatter from .helpers import format_timedelta, format_file_size, parse_file_size, format_archive - from .helpers import safe_encode, remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes + from .helpers import remove_surrogates, bin_to_hex, prepare_dump_dict, eval_escapes from .helpers import interval, prune_within, prune_split, PRUNING_PATTERNS from .helpers import timestamp from .helpers import get_cache_dir, os_stat @@ -1939,12 +1939,12 @@ def do_upgrade(self, args, repository, manifest=None, key=None): print('This repository is not encrypted, cannot enable TAM.') return EXIT_ERROR - if not manifest.tam_verified or not manifest.config.get(b'tam_required', False): + if not manifest.tam_verified or not manifest.config.get('tam_required', False): # The standard archive listing doesn't include the archive ID like in borg 1.1.x print('Manifest contents:') for archive_info in manifest.archives.list(sort_by=['ts']): print(format_archive(archive_info), '[%s]' % bin_to_hex(archive_info.id)) - manifest.config[b'tam_required'] = True + manifest.config['tam_required'] = True manifest.write() repository.commit(compact=False) if not key.tam_required: @@ -1967,7 +1967,7 @@ def do_upgrade(self, args, repository, manifest=None, key=None): print('Key updated') if hasattr(key, 'find_key'): print('Key location:', key.find_key()) - manifest.config[b'tam_required'] = False + manifest.config['tam_required'] = False manifest.write() repository.commit(compact=False) else: @@ -2300,7 +2300,7 @@ def do_debug_dump_archive(self, args, repository, manifest, key): """dump decoded archive metadata (not: data)""" try: - archive_meta_orig = manifest.archives.get_raw_dict()[safe_encode(args.location.archive)] + archive_meta_orig = manifest.archives.get_raw_dict()[args.location.archive] except KeyError: raise Archive.DoesNotExist(args.location.archive) @@ -2317,7 +2317,7 @@ def output(fd): fd.write(do_indent(prepare_dump_dict(archive_meta_orig))) fd.write(',\n') - data = key.decrypt(archive_meta_orig[b'id'], repository.get(archive_meta_orig[b'id'])) + data = key.decrypt(archive_meta_orig['id'], repository.get(archive_meta_orig['id'])) archive_org_dict = msgpack.unpackb(data, object_hook=StableDict) fd.write(' "_meta":\n') diff --git a/src/borg/cache_sync/unpack.h b/src/borg/cache_sync/unpack.h index f6dd9ca8e8..bba566419f 100644 --- a/src/borg/cache_sync/unpack.h +++ b/src/borg/cache_sync/unpack.h @@ -384,19 +384,11 @@ static inline int unpack_callback_map_end(unpack_user* u) static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* p, unsigned int length) { - /* raw = what Borg 
uses for binary stuff and strings as well */ + /* raw = what Borg uses for text stuff */ /* Note: p points to an internal buffer which contains l bytes. */ (void)b; switch(u->expect) { - case expect_key: - if(length != 32) { - SET_LAST_ERROR("Incorrect key length"); - return -1; - } - memcpy(u->current.key, p, 32); - u->expect = expect_size; - break; case expect_map_key: if(length == 6 && !memcmp("chunks", p, 6)) { u->expect = expect_chunks_begin; @@ -409,19 +401,31 @@ static inline int unpack_callback_raw(unpack_user* u, const char* b, const char* u->expect = expect_map_item_end; } break; - default: - if(u->inside_chunks) { - SET_LAST_ERROR("Unexpected bytes in chunks structure"); - return -1; - } } return 0; } static inline int unpack_callback_bin(unpack_user* u, const char* b, const char* p, unsigned int length) { - (void)u; (void)b; (void)p; (void)length; - UNEXPECTED("bin"); + /* bin = what Borg uses for binary stuff */ + /* Note: p points to an internal buffer which contains l bytes. */ + (void)b; + + switch(u->expect) { + case expect_key: + if(length != 32) { + SET_LAST_ERROR("Incorrect key length"); + return -1; + } + memcpy(u->current.key, p, 32); + u->expect = expect_size; + break; + default: + if(u->inside_chunks) { + SET_LAST_ERROR("Unexpected bytes in chunks structure"); + return -1; + } + } return 0; } diff --git a/src/borg/helpers/manifest.py b/src/borg/helpers/manifest.py index 425feb4e68..1b9d91fb49 100644 --- a/src/borg/helpers/manifest.py +++ b/src/borg/helpers/manifest.py @@ -12,7 +12,7 @@ logger = create_logger() from .datastruct import StableDict -from .parseformat import bin_to_hex, safe_encode, safe_decode +from .parseformat import bin_to_hex from .time import parse_timestamp from .. import shellpattern from ..constants import * # NOQA @@ -39,39 +39,35 @@ class Archives(abc.MutableMapping): str timestamps or datetime timestamps. 
""" def __init__(self): - # key: encoded archive name, value: dict(b'id': bytes_id, b'time': bytes_iso_ts) + # key: str archive name, value: dict('id': bytes_id, 'time': str_iso_ts) self._archives = {} def __len__(self): return len(self._archives) def __iter__(self): - return iter(safe_decode(name) for name in self._archives) + return iter(self._archives) def __getitem__(self, name): assert isinstance(name, str) - _name = safe_encode(name) - values = self._archives.get(_name) + values = self._archives.get(name) if values is None: raise KeyError - ts = parse_timestamp(values[b'time'].decode()) - return ArchiveInfo(name=name, id=values[b'id'], ts=ts) + ts = parse_timestamp(values['time']) + return ArchiveInfo(name=name, id=values['id'], ts=ts) def __setitem__(self, name, info): assert isinstance(name, str) - name = safe_encode(name) assert isinstance(info, tuple) id, ts = info assert isinstance(id, bytes) if isinstance(ts, datetime): ts = ts.replace(tzinfo=None).strftime(ISO_FORMAT) assert isinstance(ts, str) - ts = ts.encode() - self._archives[name] = {b'id': id, b'time': ts} + self._archives[name] = {'id': id, 'time': ts} def __delitem__(self, name): assert isinstance(name, str) - name = safe_encode(name) del self._archives[name] def list(self, *, glob=None, match_end=r'\Z', sort_by=(), consider_checkpoints=True, first=None, last=None, reverse=False): @@ -116,8 +112,8 @@ def list_considering(self, args): def set_raw_dict(self, d): """set the dict we get from the msgpack unpacker""" for k, v in d.items(): - assert isinstance(k, bytes) - assert isinstance(v, dict) and b'id' in v and b'time' in v + assert isinstance(k, str) + assert isinstance(v, dict) and 'id' in v and 'time' in v self._archives[k] = v def get_raw_dict(self): @@ -196,10 +192,10 @@ def load(cls, repository, operations, key=None, force_tam_not_required=False): manifest.timestamp = m.get('timestamp') manifest.config = m.config # valid item keys are whatever is known in the repo or every key we know - manifest.item_keys = ITEM_KEYS | frozenset(key.decode() for key in m.get('item_keys', [])) + manifest.item_keys = ITEM_KEYS | frozenset(m.get('item_keys', [])) if manifest.tam_verified: - manifest_required = manifest.config.get(b'tam_required', False) + manifest_required = manifest.config.get('tam_required', False) security_required = tam_required(repository) if manifest_required and not security_required: logger.debug('Manifest is TAM verified and says TAM is required, updating security database...') @@ -214,32 +210,32 @@ def load(cls, repository, operations, key=None, force_tam_not_required=False): def check_repository_compatibility(self, operations): for operation in operations: assert isinstance(operation, self.Operation) - feature_flags = self.config.get(b'feature_flags', None) + feature_flags = self.config.get('feature_flags', None) if feature_flags is None: return - if operation.value.encode() not in feature_flags: + if operation.value not in feature_flags: continue - requirements = feature_flags[operation.value.encode()] - if b'mandatory' in requirements: - unsupported = set(requirements[b'mandatory']) - self.SUPPORTED_REPO_FEATURES + requirements = feature_flags[operation.value] + if 'mandatory' in requirements: + unsupported = set(requirements['mandatory']) - self.SUPPORTED_REPO_FEATURES if unsupported: - raise MandatoryFeatureUnsupported([f.decode() for f in unsupported]) + raise MandatoryFeatureUnsupported(list(unsupported)) def get_all_mandatory_features(self): result = {} - feature_flags = 
self.config.get(b'feature_flags', None) + feature_flags = self.config.get('feature_flags', None) if feature_flags is None: return result for operation, requirements in feature_flags.items(): - if b'mandatory' in requirements: - result[operation.decode()] = {feature.decode() for feature in requirements[b'mandatory']} + if 'mandatory' in requirements: + result[operation] = set(requirements['mandatory']) return result def write(self): from ..item import ManifestItem if self.key.tam_required: - self.config[b'tam_required'] = True + self.config['tam_required'] = True # self.timestamp needs to be strictly monotonically increasing. Clocks often are not set correctly if self.timestamp is None: self.timestamp = datetime.utcnow().strftime(ISO_FORMAT) diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py index 411f00fec4..5a2edecd67 100644 --- a/src/borg/helpers/msgpack.py +++ b/src/borg/helpers/msgpack.py @@ -1,21 +1,56 @@ +""" +wrapping msgpack +================ + +Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it - +to avoid having lots of clutter in the calling code. see tickets #968 and #3632. + +Packing +------- +- use_bin_type = True (used by borg since borg 1.3) + This is used to generate output according to new msgpack 2.0 spec. + This cleanly keeps bytes and str types apart. + +- use_bin_type = False (used by borg < 1.3) + This creates output according to the older msgpack spec. + BAD: str and bytes were packed into same "raw" representation. + +- unicode_errors = 'surrogateescape' + Guess backup applications are one of the rare cases when this needs to be used. + It is needed because borg also needs to deal with data that does not cleanly encode/decode using utf-8. + There's a lot of crap out there, e.g. in filenames and as a backup tool, we must keep them as good as possible. + +Unpacking +--------- +- raw = True (the old way, used by borg <= 1.3) + This is currently still needed to not try to decode "raw" msgpack objects. + These could come either from str (new or old msgpack) or bytes (old msgpack). + Thus, we basically must know what we want and either keep the bytes we get + or decode them to str, if we want str. + +- raw = False (the new way) + This can be used in future, when we do not have to deal with data any more that was packed the old way. + It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str. + +- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False). + +As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968. +borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely. +But from now on, borg only **writes** new data according to the new msgpack spec, +thus we can complete the fix for #968 in a later borg release. + +current way in msgpack terms +---------------------------- + +- pack with use_bin_type=True (according to msgpack 2.0 spec) +- packs str -> raw and bytes -> bin +- unpack with raw=True (aka "the old way") +- unpacks raw to bytes (thus we always need to decode manually if we want str) +""" + from .datastruct import StableDict from ..constants import * # NOQA -# wrapping msgpack --------------------------------------------------------------------------------------------------- -# -# due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it - -# to avoid having lots of clutter in the calling code. 
see tickets #968 and #3632. -# -# Packing -# ------- -# use_bin_type = False is needed to generate the old msgpack format (not msgpack 2.0 spec) as borg always did. -# unicode_errors = None is needed because usage of it is deprecated -# -# Unpacking -# --------- -# raw = True is needed to unpack the old msgpack format to bytes (not str, about the decoding see item.pyx). -# unicode_errors = None is needed because usage of it is deprecated - from msgpack import Packer as mp_Packer from msgpack import packb as mp_packb from msgpack import pack as mp_pack @@ -30,6 +65,10 @@ version = mp_version +USE_BIN_TYPE = True +RAW = True # should become False later when we do not need to read old stuff any more +UNICODE_ERRORS = 'surrogateescape' # previously done by safe_encode, safe_decode + class PackException(Exception): """Exception while msgpack packing""" @@ -40,10 +79,10 @@ class UnpackException(Exception): class Packer(mp_Packer): - def __init__(self, *, default=None, unicode_errors=None, - use_single_float=False, autoreset=True, use_bin_type=False, + def __init__(self, *, default=None, unicode_errors=UNICODE_ERRORS, + use_single_float=False, autoreset=True, use_bin_type=USE_BIN_TYPE, strict_types=False): - assert unicode_errors is None + assert unicode_errors == UNICODE_ERRORS super().__init__(default=default, unicode_errors=unicode_errors, use_single_float=use_single_float, autoreset=autoreset, use_bin_type=use_bin_type, strict_types=strict_types) @@ -55,16 +94,16 @@ def pack(self, obj): raise PackException(e) -def packb(o, *, use_bin_type=False, unicode_errors=None, **kwargs): - assert unicode_errors is None +def packb(o, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs): + assert unicode_errors == UNICODE_ERRORS try: return mp_packb(o, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs) except Exception as e: raise PackException(e) -def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs): - assert unicode_errors is None +def pack(o, stream, *, use_bin_type=USE_BIN_TYPE, unicode_errors=UNICODE_ERRORS, **kwargs): + assert unicode_errors == UNICODE_ERRORS try: return mp_pack(o, stream, use_bin_type=use_bin_type, unicode_errors=unicode_errors, **kwargs) except Exception as e: @@ -72,13 +111,13 @@ def pack(o, stream, *, use_bin_type=False, unicode_errors=None, **kwargs): class Unpacker(mp_Unpacker): - def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=True, + def __init__(self, file_like=None, *, read_size=0, use_list=True, raw=RAW, object_hook=None, object_pairs_hook=None, list_hook=None, - unicode_errors=None, max_buffer_size=0, + unicode_errors=UNICODE_ERRORS, max_buffer_size=0, ext_hook=ExtType, strict_map_key=False): - assert raw is True - assert unicode_errors is None + assert raw == RAW + assert unicode_errors == UNICODE_ERRORS kw = dict(file_like=file_like, read_size=read_size, use_list=use_list, raw=raw, object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, unicode_errors=unicode_errors, max_buffer_size=max_buffer_size, @@ -105,10 +144,11 @@ def __next__(self): next = __next__ -def unpackb(packed, *, raw=True, unicode_errors=None, +def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS, strict_map_key=False, **kwargs): - assert unicode_errors is None + assert raw == RAW + assert unicode_errors == UNICODE_ERRORS try: kw = dict(raw=raw, unicode_errors=unicode_errors, strict_map_key=strict_map_key) @@ -118,10 +158,11 @@ def unpackb(packed, *, raw=True, unicode_errors=None, raise 
UnpackException(e)
 
 
-def unpack(stream, *, raw=True, unicode_errors=None,
+def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
            strict_map_key=False, **kwargs):
-    assert unicode_errors is None
+    # assert raw == RAW
+    assert unicode_errors == UNICODE_ERRORS
     try:
         kw = dict(raw=raw, unicode_errors=unicode_errors,
                   strict_map_key=strict_map_key)
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 764279db03..a2142a8ee5 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -2,7 +2,6 @@ import stat
 from collections import namedtuple
 
 from .constants import ITEM_KEYS, ARCHIVE_KEYS
-from .helpers import safe_encode, safe_decode
 from .helpers import StableDict
 from .helpers import format_file_size
 from .helpers.msgpack import timestamp_to_int, int_to_timestamp
@@ -16,6 +15,51 @@ cdef extern from "_item.c":
 API_VERSION = '1.2_01'
 
 
+def fix_key(data, key):
+    """if key is a bytes-typed key, migrate key/value to a str-typed key in dict data"""
+    if isinstance(key, bytes):
+        value = data.pop(key)
+        key = key.decode()
+        data[key] = value
+    assert isinstance(key, str)
+    return key
+
+
+def fix_str_value(data, key, errors='surrogateescape'):
+    """makes sure that data[key] is a str (decode if it is bytes)"""
+    assert isinstance(key, str)  # fix_key must be called first
+    value = data[key]
+    if isinstance(value, bytes):
+        value = value.decode('utf-8', errors=errors)
+        data[key] = value
+    assert isinstance(value, str)
+    return value
+
+
+def fix_list_of_str(t):
+    """make sure we have a list of str"""
+    assert isinstance(t, (tuple, list))
+    l = [e.decode() if isinstance(e, bytes) else e for e in t]
+    assert all(isinstance(e, str) for e in l), repr(l)
+    return l
+
+
+def fix_tuple_of_str(t):
+    """make sure we have a tuple of str"""
+    assert isinstance(t, (tuple, list))
+    t = tuple(e.decode() if isinstance(e, bytes) else e for e in t)
+    assert all(isinstance(e, str) for e in t), repr(t)
+    return t
+
+
+def fix_tuple_of_str_and_int(t):
+    """make sure we have a tuple of str and int"""
+    assert isinstance(t, (tuple, list))
+    t = tuple(e.decode() if isinstance(e, bytes) else e for e in t)
+    assert all(isinstance(e, (str, int)) for e in t), repr(t)
+    return t
+
+
 class PropDict:
     """
     Manage a dictionary via properties.
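Example (not part of the patch; the dict below is made up), showing how these
helpers migrate old-msgpack data in place:

    d = {b'path': b'/etc/motd'}
    k = fix_key(d, b'path')    # d == {'path': b'/etc/motd'}, k == 'path'
    v = fix_str_value(d, k)    # d == {'path': '/etc/motd'}, v == '/etc/motd'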
@@ -155,10 +199,10 @@ class Item(PropDict): # properties statically defined, so that IDEs can know their names: - path = PropDict._make_property('path', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - source = PropDict._make_property('source', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode) - group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None', encode=safe_encode, decode=safe_decode) + path = PropDict._make_property('path', str, 'surrogate-escaped str') + source = PropDict._make_property('source', str, 'surrogate-escaped str') + user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None') + group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None') acl_access = PropDict._make_property('acl_access', bytes) acl_default = PropDict._make_property('acl_default', bytes) @@ -291,6 +335,14 @@ class Item(PropDict): except AttributeError: return False + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k in ('path', 'source', 'user', 'group'): + v = fix_str_value(d, k) + self._dict[k] = v + class EncryptedKey(PropDict): """ @@ -310,7 +362,7 @@ class EncryptedKey(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - algorithm = PropDict._make_property('algorithm', str, encode=str.encode, decode=bytes.decode) + algorithm = PropDict._make_property('algorithm', str) iterations = PropDict._make_property('iterations', int) salt = PropDict._make_property('salt', bytes) hash = PropDict._make_property('hash', bytes) @@ -318,7 +370,17 @@ class EncryptedKey(PropDict): argon2_time_cost = PropDict._make_property('argon2_time_cost', int) argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int) argon2_parallelism = PropDict._make_property('argon2_parallelism', int) - argon2_type = PropDict._make_property('argon2_type', str, encode=str.encode, decode=bytes.decode) + argon2_type = PropDict._make_property('argon2_type', str) + + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k in ('algorithm', 'argon2_type'): + v = fix_str_value(d, k) + self._dict[k] = v class Key(PropDict): @@ -345,17 +407,13 @@ class Key(PropDict): chunk_seed = PropDict._make_property('chunk_seed', int) tam_required = PropDict._make_property('tam_required', bool) - -def tuple_encode(t): - """encode a tuple that might contain str items""" - # we have str, but want to give bytes to msgpack.pack - return tuple(safe_encode(e) if isinstance(e, str) else e for e in t) - - -def tuple_decode(t): - """decode a tuple that might contain bytes items""" - # we get bytes objects from msgpack.unpack, but want str - return tuple(safe_decode(e) if isinstance(e, bytes) else e for e in t) + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + self._dict[k] = v class 
ArchiveItem(PropDict): @@ -375,15 +433,15 @@ class ArchiveItem(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - name = PropDict._make_property('name', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) + name = PropDict._make_property('name', str, 'surrogate-escaped str') items = PropDict._make_property('items', list) cmdline = PropDict._make_property('cmdline', list) # list of s-e-str - hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - username = PropDict._make_property('username', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - time = PropDict._make_property('time', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - time_end = PropDict._make_property('time_end', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - comment = PropDict._make_property('comment', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) - chunker_params = PropDict._make_property('chunker_params', tuple, 'chunker-params tuple', encode=tuple_encode, decode=tuple_decode) + hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str') + username = PropDict._make_property('username', str, 'surrogate-escaped str') + time = PropDict._make_property('time', str) + time_end = PropDict._make_property('time_end', str) + comment = PropDict._make_property('comment', str, 'surrogate-escaped str') + chunker_params = PropDict._make_property('chunker_params', tuple) recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. 
b2 recreate_source_id = PropDict._make_property('recreate_source_id', bytes) @@ -396,6 +454,22 @@ class ArchiveItem(PropDict): csize_parts = PropDict._make_property('csize_parts', int) nfiles_parts = PropDict._make_property('nfiles_parts', int) + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k in ('name', 'hostname', 'username', 'comment'): + v = fix_str_value(d, k) + if k in ('time', 'time_end'): + v = fix_str_value(d, k, 'replace') + if k == 'chunker_params': + v = fix_tuple_of_str_and_int(v) + if k in ('cmdline', 'recreate_cmdline'): + v = fix_list_of_str(v) + self._dict[k] = v + class ManifestItem(PropDict): """ @@ -414,10 +488,52 @@ class ManifestItem(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - archives = PropDict._make_property('archives', dict) # name -> dict - timestamp = PropDict._make_property('timestamp', str, 'surrogate-escaped str', encode=safe_encode, decode=safe_decode) + archives = PropDict._make_property('archives', dict, 'dict of str -> dict') # name -> dict + timestamp = PropDict._make_property('timestamp', str) config = PropDict._make_property('config', dict) - item_keys = PropDict._make_property('item_keys', tuple) + item_keys = PropDict._make_property('item_keys', tuple, 'tuple of str') + + def update_internal(self, d): + # legacy support for migration (data from old msgpacks comes in as bytes always, but sometimes we want str) + for k, v in list(d.items()): + k = fix_key(d, k) + if k == 'version': + assert isinstance(v, int) + if k == 'archives': + ad = v + assert isinstance(ad, dict) + for ak, av in list(ad.items()): + ak = fix_key(ad, ak) + assert isinstance(av, dict) + for ik, iv in list(av.items()): + ik = fix_key(av, ik) + assert set(av) == {'id', 'time'} + assert isinstance(av['id'], bytes) + fix_str_value(av, 'time') + if k == 'timestamp': + v = fix_str_value(d, k, 'replace') + if k == 'config': + cd = v + assert isinstance(cd, dict) + for ck, cv in list(cd.items()): + ck = fix_key(cd, ck) + if ck == 'tam_required': + assert isinstance(cv, bool) + if ck == 'feature_flags': + assert isinstance(cv, dict) + ops = {'read', 'check', 'write', 'delete'} + for op, specs in list(cv.items()): + op = fix_key(cv, op) + assert op in ops + for speck, specv in list(specs.items()): + speck = fix_key(specs, speck) + if speck == 'mandatory': + specs[speck] = fix_tuple_of_str(specv) + assert set(cv).issubset(ops) + if k == 'item_keys': + v = fix_tuple_of_str(v) + self._dict[k] = v + class ItemDiff: """ diff --git a/src/borg/remote.py b/src/borg/remote.py index 2870d71d98..8de302871f 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -38,7 +38,8 @@ RPC_PROTOCOL_VERSION = 2 BORG_VERSION = parse_version(__version__) -MSGID, MSG, ARGS, RESULT = b'i', b'm', b'a', b'r' +MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' # pack +MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r' # unpack MAX_INFLIGHT = 100 @@ -216,9 +217,9 @@ def serve(self): for unpacked in unpacker: if isinstance(unpacked, dict): dictFormat = True - msgid = unpacked[MSGID] - method = unpacked[MSG].decode() - args = decode_keys(unpacked[ARGS]) + msgid = unpacked[MSGIDB] + method = unpacked[MSGB].decode() + args = decode_keys(unpacked[ARGSB]) elif isinstance(unpacked, tuple) and len(unpacked) == 4: dictFormat = False # 
The first field 'type' was always 1 and has always been ignored @@ -256,21 +257,21 @@ def serve(self): try: msg = msgpack.packb({MSGID: msgid, - b'exception_class': e.__class__.__name__, - b'exception_args': e.args, - b'exception_full': ex_full, - b'exception_short': ex_short, - b'exception_trace': ex_trace, - b'sysinfo': sysinfo()}) + 'exception_class': e.__class__.__name__, + 'exception_args': e.args, + 'exception_full': ex_full, + 'exception_short': ex_short, + 'exception_trace': ex_trace, + 'sysinfo': sysinfo()}) except TypeError: msg = msgpack.packb({MSGID: msgid, - b'exception_class': e.__class__.__name__, - b'exception_args': [x if isinstance(x, (str, bytes, int)) else None - for x in e.args], - b'exception_full': ex_full, - b'exception_short': ex_short, - b'exception_trace': ex_trace, - b'sysinfo': sysinfo()}) + 'exception_class': e.__class__.__name__, + 'exception_args': [x if isinstance(x, (str, bytes, int)) else None + for x in e.args], + 'exception_full': ex_full, + 'exception_short': ex_short, + 'exception_trace': ex_trace, + 'sysinfo': sysinfo()}) os_write(stdout_fd, msg) else: @@ -570,7 +571,7 @@ def __init__(self, location, create=False, exclusive=False, lock_wait=None, lock try: try: version = self.call('negotiate', {'client_data': { - b'client_version': BORG_VERSION, + 'client_version': BORG_VERSION, }}) except ConnectionClosed: raise ConnectionClosedWithHint('Is borg working on the server?') from None @@ -791,7 +792,7 @@ def handle_error(unpacked): if b'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULT] + yield unpacked[RESULTB] if not waiting_for and not calls: return except KeyError: @@ -811,7 +812,7 @@ def handle_error(unpacked): if b'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULT] + yield unpacked[RESULTB] if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT): w_fds = [self.stdin_fd] else: @@ -828,15 +829,15 @@ def handle_error(unpacked): self.unpacker.feed(data) for unpacked in self.unpacker: if isinstance(unpacked, dict): - msgid = unpacked[MSGID] + msgid = unpacked[MSGIDB] elif isinstance(unpacked, tuple) and len(unpacked) == 4: # The first field 'type' was always 1 and has always been ignored _, msgid, error, res = unpacked if error: # ignore res, because it is only a fixed string anyway. - unpacked = {MSGID: msgid, b'exception_class': error} + unpacked = {MSGIDB: msgid, b'exception_class': error} else: - unpacked = {MSGID: msgid, RESULT: res} + unpacked = {MSGIDB: msgid, RESULTB: res} else: raise UnexpectedRPCDataFormatFromServer(data) if msgid in self.ignore_responses: @@ -847,7 +848,7 @@ def handle_error(unpacked): else: # we currently do not have async result values except "None", # so we do not add them into async_responses. 
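Note on the str/bytes constant split above: at this point in the series, packing already
uses use_bin_type=True, so map keys are sent as msgpack raw (str), but unpacking still
runs with raw=True and therefore hands every key back as bytes. A minimal round trip
showing this (plain msgpack-python, not borg's wrapper):

    import msgpack

    packed = msgpack.packb({'i': 42}, use_bin_type=True)
    assert msgpack.unpackb(packed, raw=True) == {b'i': 42}    # old-style unpack: bytes keys
    assert msgpack.unpackb(packed, raw=False) == {'i': 42}    # new-style unpack: str keys

Hence MSGID/MSG/ARGS/RESULT (str) for packing and MSGIDB/MSGB/ARGSB/RESULTB (bytes) for
unpacking, until the last patch of this series flips RAW to False and the *B constants
can go away again.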
- if unpacked[RESULT] is not None: + if unpacked[RESULTB] is not None: self.async_responses[msgid] = unpacked else: self.responses[msgid] = unpacked diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index db9ed573b5..c6b60fbeba 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -157,8 +157,8 @@ def process(self, input): return result def test_extra_garbage_no_sync(self): - chunks = [(False, [self.make_chunks([b'foo', b'bar'])]), - (False, [b'garbage'] + [self.make_chunks([b'boo', b'baz'])])] + chunks = [(False, [self.make_chunks(['foo', 'bar'])]), + (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])] result = self.process(chunks) self.assert_equal(result, [ {b'path': b'foo'}, {b'path': b'bar'}, @@ -174,19 +174,19 @@ def split(self, left, length): return parts def test_correct_stream(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 2) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2) input = [(False, chunks)] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}]) def test_missing_chunk(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, chunks[4:])] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) def test_corrupt_chunk(self): - chunks = self.split(self.make_chunks([b'foo', b'bar', b'boo', b'baz']), 4) + chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])] result = self.process(input) self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) @@ -213,7 +213,7 @@ def test_invalid_msgpacked_item(packed, item_keys_serialized): @pytest.mark.parametrize('packed', [msgpack.packb(o) for o in [ - {b'path': b'/a/b/c'}, # small (different msgpack mapping type!) + {'path': b'/a/b/c'}, # small (different msgpack mapping type!) 
OrderedDict((k, b'') for k in IK), # as big (key count) as it gets OrderedDict((k, b'x' * 1000) for k in IK), # as big (key count and volume) as it gets ]]) @@ -222,7 +222,7 @@ def test_valid_msgpacked_items(packed, item_keys_serialized): def test_key_length_msgpacked_items(): - key = b'x' * 32 # 31 bytes is the limit for fixstr msgpack type + key = 'x' * 32 # 31 bytes is the limit for fixstr msgpack type data = {key: b''} item_keys_serialized = [msgpack.packb(key), ] assert valid_msgpacked_dict(msgpack.packb(data), item_keys_serialized) diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 0a4f862c6d..e97ce08528 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -1787,7 +1787,7 @@ def test_create_dry_run(self): def add_unknown_feature(self, operation): with Repository(self.repository_path, exclusive=True) as repository: manifest, key = Manifest.load(repository, Manifest.NO_OPERATION_CHECK) - manifest.config[b'feature_flags'] = {operation.value.encode(): {b'mandatory': [b'unknown-feature']}} + manifest.config['feature_flags'] = {operation.value: {'mandatory': ['unknown-feature']}} manifest.write() repository.commit(compact=False) @@ -3617,13 +3617,13 @@ def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, ex with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm + assert key[b'algorithm'] == expected_algorithm.encode() def test_change_passphrase_does_not_change_algorithm_argon2(self): - self.verify_change_passphrase_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305') + self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') def test_change_passphrase_does_not_change_algorithm_pbkdf2(self): - self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', b'sha256') + self.verify_change_passphrase_does_not_change_algorithm('pbkdf2', 'sha256') def verify_change_location_does_not_change_algorithm(self, given_algorithm, expected_algorithm): self.cmd('init', '--encryption=keyfile', '--key-algorithm', given_algorithm, self.repository_location) @@ -3632,13 +3632,13 @@ def verify_change_location_does_not_change_algorithm(self, given_algorithm, expe with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm + assert key[b'algorithm'] == expected_algorithm.encode() def test_change_location_does_not_change_algorithm_argon2(self): - self.verify_change_location_does_not_change_algorithm('argon2', b'argon2 chacha20-poly1305') + self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') def test_change_location_does_not_change_algorithm_pbkdf2(self): - self.verify_change_location_does_not_change_algorithm('pbkdf2', b'sha256') + self.verify_change_location_does_not_change_algorithm('pbkdf2', 'sha256') def test_key_change_algorithm(self): self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location) @@ -3915,15 +3915,15 @@ class Attic013Item: def as_dict(self): return { # These are required - b'path': '1234', - b'mtime': 0, - b'mode': 0, - b'user': b'0', - b'group': b'0', - b'uid': 0, - b'gid': 0, + 'path': '1234', + 'mtime': 0, + 'mode': 0, + 'user': '0', + 'group': '0', + 'uid': 0, + 'gid': 0, # acl is the offending key. 
- b'acl': None, + 'acl': None, } archive, repository = self.open_archive('archive1') diff --git a/src/borg/testsuite/item.py b/src/borg/testsuite/item.py index 80b38edce4..94167e7ea9 100644 --- a/src/borg/testsuite/item.py +++ b/src/borg/testsuite/item.py @@ -102,7 +102,7 @@ def test_item_se_str_property(): item = Item() item.path = '/a/b/c' assert item.path == '/a/b/c' - assert item.as_dict() == {'path': b'/a/b/c'} + assert item.as_dict() == {'path': '/a/b/c'} del item.path assert item.as_dict() == {} with pytest.raises(TypeError): @@ -111,11 +111,11 @@ def test_item_se_str_property(): # non-utf-8 path, needing surrogate-escaping for latin-1 u-umlaut item = Item(internal_dict={'path': b'/a/\xfc/c'}) assert item.path == '/a/\udcfc/c' # getting a surrogate-escaped representation - assert item.as_dict() == {'path': b'/a/\xfc/c'} + assert item.as_dict() == {'path': '/a/\udcfc/c'} del item.path assert 'path' not in item item.path = '/a/\udcfc/c' # setting using a surrogate-escaped representation - assert item.as_dict() == {'path': b'/a/\xfc/c'} + assert item.as_dict() == {'path': '/a/\udcfc/c'} def test_item_list_property(): From b2cfad46ea2f410754c5047e396d513e79c573a3 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 6 May 2022 03:59:10 +0200 Subject: [PATCH 8/8] cleanup msgpack related str/bytes mess, fixes #968 see ticket and borg.helpers.msgpack docstring. this changeset implements the full migration to msgpack 2.0 spec (use_bin_type=True, raw=False). still needed compat to the past is done via want_bytes decoder in borg.item. --- src/borg/archive.py | 11 ++--- src/borg/archiver.py | 2 +- src/borg/crypto/key.py | 10 ++--- src/borg/helpers/msgpack.py | 34 +++++++------- src/borg/item.pyx | 35 +++++++++------ src/borg/remote.py | 77 +++++++++++++++----------------- src/borg/repository.py | 44 +++++++++--------- src/borg/testsuite/archive.py | 14 +++--- src/borg/testsuite/archiver.py | 10 ++--- src/borg/testsuite/key.py | 16 +++---- src/borg/testsuite/repository.py | 4 +- 11 files changed, 126 insertions(+), 131 deletions(-) diff --git a/src/borg/archive.py b/src/borg/archive.py index 89f0766698..36bf13aa51 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -1753,13 +1753,10 @@ def rebuild_manifest(self): Iterates through all objects in the repository looking for archive metadata blocks. """ - required_archive_keys = frozenset(key.encode() for key in REQUIRED_ARCHIVE_KEYS) - def valid_archive(obj): if not isinstance(obj, dict): return False - keys = set(obj) - return required_archive_keys.issubset(keys) + return REQUIRED_ARCHIVE_KEYS.issubset(obj) logger.info('Rebuilding missing manifest, this might take some time...') # as we have lost the manifest, we do not know any more what valid item keys we had. @@ -1939,10 +1936,10 @@ def list_keys_safe(keys): def valid_item(obj): if not isinstance(obj, StableDict): return False, 'not a dictionary' - # A bug in Attic up to and including release 0.13 added a (meaningless) b'acl' key to every item. + # A bug in Attic up to and including release 0.13 added a (meaningless) 'acl' key to every item. # We ignore it here, should it exist. See test_attic013_acl_bug for details. 
- obj.pop(b'acl', None)
- keys = set(k.decode('utf-8', errors='replace') for k in obj)
+ obj.pop('acl', None)
+ keys = set(obj)
if not required_item_keys.issubset(keys):
return False, 'missing required keys: ' + list_keys_safe(required_item_keys - keys)
if not keys.issubset(item_keys):
diff --git a/src/borg/archiver.py b/src/borg/archiver.py
index d90321f9e7..1fdb0c1764 100644
--- a/src/borg/archiver.py
+++ b/src/borg/archiver.py
@@ -2327,7 +2327,7 @@ def output(fd):
unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
first = True
- for item_id in archive_org_dict[b'items']:
+ for item_id in archive_org_dict['items']:
data = key.decrypt(item_id, repository.get(item_id))
unpacker.feed(data)
for item in unpacker:
diff --git a/src/borg/crypto/key.py b/src/borg/crypto/key.py
index 6ca6fbac02..15df53d001 100644
--- a/src/borg/crypto/key.py
+++ b/src/borg/crypto/key.py
@@ -232,24 +232,24 @@ def unpack_and_verify_manifest(self, data, force_tam_not_required=False):
unpacker = get_limited_unpacker('manifest')
unpacker.feed(data)
unpacked = unpacker.unpack()
- if b'tam' not in unpacked:
+ if 'tam' not in unpacked:
if tam_required:
raise TAMRequiredError(self.repository._location.canonical_path())
else:
logger.debug('TAM not found and not required')
return unpacked, False
- tam = unpacked.pop(b'tam', None)
+ tam = unpacked.pop('tam', None)
if not isinstance(tam, dict):
raise TAMInvalid()
- tam_type = tam.get(b'type', b'').decode('ascii', 'replace')
+ tam_type = tam.get('type', '')
if tam_type != 'HKDF_HMAC_SHA512':
if tam_required:
raise TAMUnsupportedSuiteError(repr(tam_type))
else:
logger.debug('Ignoring TAM made with unsupported suite, since TAM is not required: %r', tam_type)
return unpacked, False
- tam_hmac = tam.get(b'hmac')
- tam_salt = tam.get(b'salt')
+ tam_hmac = tam.get('hmac')
+ tam_salt = tam.get('salt')
if not isinstance(tam_salt, bytes) or not isinstance(tam_hmac, bytes):
raise TAMInvalid()
offset = data.index(tam_hmac)
diff --git a/src/borg/helpers/msgpack.py b/src/borg/helpers/msgpack.py
index 5a2edecd67..268ee30e75 100644
--- a/src/borg/helpers/msgpack.py
+++ b/src/borg/helpers/msgpack.py
@@ -2,8 +2,7 @@
wrapping msgpack
================
-Due to the planned breaking api changes in upstream msgpack, we wrap it the way we need it -
-to avoid having lots of clutter in the calling code. see tickets #968 and #3632.
+We wrap msgpack here the way we need it - to avoid having lots of clutter in the calling code.
Packing
-------
@@ -22,30 +21,27 @@
Unpacking
---------
-- raw = True (the old way, used by borg <= 1.3)
- This is currently still needed to not try to decode "raw" msgpack objects.
- These could come either from str (new or old msgpack) or bytes (old msgpack).
- Thus, we basically must know what we want and either keep the bytes we get
- or decode them to str, if we want str.
-
-- raw = False (the new way)
- This can be used in future, when we do not have to deal with data any more that was packed the old way.
+- raw = False (used by borg since borg 1.3)
+ We can already use this with borg 1.3 due to the want_bytes decoder.
+ This decoder can be removed in the future, once we no longer have to deal with data that was packed the old way.
It will then unpack according to the msgpack 2.0 spec format and directly output bytes or str.
+- raw = True (the old way, used by borg < 1.3)
+
- unicode_errors = 'surrogateescape' -> see description above (will be used when raw is False).
-As of borg 1.3, we have the first part on the way to fix the msgpack str/bytes mess, #968.
-borg now still needs to **read** old repos, archives, keys, ... so we can not yet fix it completely.
-But from now on, borg only **writes** new data according to the new msgpack spec,
-thus we can complete the fix for #968 in a later borg release.
+As of borg 1.3, we have fixed most of the msgpack str/bytes mess, #968.
+Borg still needs to **read** old repos, archives, keys, ... so we cannot yet fix it completely.
+But from now on, borg only **writes** new data according to the new msgpack 2.0 spec,
+thus we can remove some legacy support in a later borg release (some places are marked with "legacy").
current way in msgpack terms
----------------------------
- pack with use_bin_type=True (according to msgpack 2.0 spec)
- packs str -> raw and bytes -> bin
-- unpack with raw=True (aka "the old way")
-- unpacks raw to bytes (thus we always need to decode manually if we want str)
+- unpack with raw=False (according to msgpack 2.0 spec, using unicode_errors='surrogateescape')
+- unpacks bin to bytes and raw to str (thus we need to re-encode manually if we want bytes from "raw")
"""
from .datastruct import StableDict
@@ -66,8 +62,8 @@
version = mp_version
USE_BIN_TYPE = True
-RAW = True # should become False later when we do not need to read old stuff any more
-UNICODE_ERRORS = 'surrogateescape' # previously done by safe_encode, safe_decode
+RAW = False
+UNICODE_ERRORS = 'surrogateescape'
class PackException(Exception):
@@ -161,7 +157,7 @@ def unpackb(packed, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
def unpack(stream, *, raw=RAW, unicode_errors=UNICODE_ERRORS,
strict_map_key=False, **kwargs):
- # assert raw == RAW
+ assert raw == RAW
assert unicode_errors == UNICODE_ERRORS
try:
kw = dict(raw=raw, unicode_errors=unicode_errors,
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index a2142a8ee5..128e317444 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -60,6 +60,15 @@ def fix_tuple_of_str_and_int(t):
return t
+def want_bytes(v):
+ """we know that we want bytes and the value should be bytes"""
+ # legacy support: it being str can be caused by msgpack unpack decoding old data that was packed with use_bin_type=False
+ if isinstance(v, str):
+ v = v.encode('utf-8', errors='surrogateescape')
+ assert isinstance(v, bytes)
+ return v
+
+
class PropDict:
"""
Manage a dictionary via properties.
@@ -204,10 +213,10 @@ class Item(PropDict):
user = PropDict._make_property('user', (str, type(None)), 'surrogate-escaped str or None')
group = PropDict._make_property('group', (str, type(None)), 'surrogate-escaped str or None')
- acl_access = PropDict._make_property('acl_access', bytes)
- acl_default = PropDict._make_property('acl_default', bytes)
- acl_extended = PropDict._make_property('acl_extended', bytes)
- acl_nfs4 = PropDict._make_property('acl_nfs4', bytes)
+ acl_access = PropDict._make_property('acl_access', bytes, decode=want_bytes)
+ acl_default = PropDict._make_property('acl_default', bytes, decode=want_bytes)
+ acl_extended = PropDict._make_property('acl_extended', bytes, decode=want_bytes)
+ acl_nfs4 = PropDict._make_property('acl_nfs4', bytes, decode=want_bytes)
mode = PropDict._make_property('mode', int)
uid = PropDict._make_property('uid', int)
@@ -224,7 +233,7 @@
# compatibility note: this is a new feature, in old archives size will be missing.
size = PropDict._make_property('size', int)
- hlid = PropDict._make_property('hlid', bytes) # hard link id: same value means same hard link.
+ hlid = PropDict._make_property('hlid', bytes, decode=want_bytes) # hard link id: same value means same hard link. hardlink_master = PropDict._make_property('hardlink_master', bool) # legacy chunks = PropDict._make_property('chunks', (list, type(None)), 'list or None') @@ -364,9 +373,9 @@ class EncryptedKey(PropDict): version = PropDict._make_property('version', int) algorithm = PropDict._make_property('algorithm', str) iterations = PropDict._make_property('iterations', int) - salt = PropDict._make_property('salt', bytes) - hash = PropDict._make_property('hash', bytes) - data = PropDict._make_property('data', bytes) + salt = PropDict._make_property('salt', bytes, decode=want_bytes) + hash = PropDict._make_property('hash', bytes, decode=want_bytes) + data = PropDict._make_property('data', bytes, decode=want_bytes) argon2_time_cost = PropDict._make_property('argon2_time_cost', int) argon2_memory_cost = PropDict._make_property('argon2_memory_cost', int) argon2_parallelism = PropDict._make_property('argon2_parallelism', int) @@ -400,10 +409,10 @@ class Key(PropDict): __slots__ = ("_dict", ) # avoid setting attributes not supported by properties version = PropDict._make_property('version', int) - repository_id = PropDict._make_property('repository_id', bytes) - enc_key = PropDict._make_property('enc_key', bytes) - enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes) - id_key = PropDict._make_property('id_key', bytes) + repository_id = PropDict._make_property('repository_id', bytes, decode=want_bytes) + enc_key = PropDict._make_property('enc_key', bytes, decode=want_bytes) + enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes, decode=want_bytes) + id_key = PropDict._make_property('id_key', bytes, decode=want_bytes) chunk_seed = PropDict._make_property('chunk_seed', int) tam_required = PropDict._make_property('tam_required', bool) @@ -444,7 +453,7 @@ class ArchiveItem(PropDict): chunker_params = PropDict._make_property('chunker_params', tuple) recreate_cmdline = PropDict._make_property('recreate_cmdline', list) # list of s-e-str # recreate_source_id, recreate_args, recreate_partial_chunks were used in 1.1.0b1 .. 
b2 - recreate_source_id = PropDict._make_property('recreate_source_id', bytes) + recreate_source_id = PropDict._make_property('recreate_source_id', bytes, decode=want_bytes) recreate_args = PropDict._make_property('recreate_args', list) # list of s-e-str recreate_partial_chunks = PropDict._make_property('recreate_partial_chunks', list) # list of tuples size = PropDict._make_property('size', int) diff --git a/src/borg/remote.py b/src/borg/remote.py index 8de302871f..6ea51d3c3d 100644 --- a/src/borg/remote.py +++ b/src/borg/remote.py @@ -38,8 +38,7 @@ RPC_PROTOCOL_VERSION = 2 BORG_VERSION = parse_version(__version__) -MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' # pack -MSGIDB, MSGB, ARGSB, RESULTB = b'i', b'm', b'a', b'r' # unpack +MSGID, MSG, ARGS, RESULT = 'i', 'm', 'a', 'r' MAX_INFLIGHT = 100 @@ -139,10 +138,6 @@ def __init__(self, data): } -def decode_keys(d): - return {k.decode(): d[k] for k in d} - - class RepositoryServer: # pragma: no cover rpc_methods = ( '__len__', @@ -217,14 +212,13 @@ def serve(self): for unpacked in unpacker: if isinstance(unpacked, dict): dictFormat = True - msgid = unpacked[MSGIDB] - method = unpacked[MSGB].decode() - args = decode_keys(unpacked[ARGSB]) + msgid = unpacked[MSGID] + method = unpacked[MSG] + args = unpacked[ARGS] elif isinstance(unpacked, tuple) and len(unpacked) == 4: dictFormat = False # The first field 'type' was always 1 and has always been ignored _, msgid, method, args = unpacked - method = method.decode() args = self.positional_to_named(method, args) else: if self.repository is not None: @@ -308,7 +302,7 @@ def negotiate(self, client_data): # clients since 1.1.0b3 use a dict as client_data # clients since 1.1.0b6 support json log format from server if isinstance(client_data, dict): - self.client_version = client_data[b'client_version'] + self.client_version = client_data['client_version'] level = logging.getLevelName(logging.getLogger('').level) setup_logging(is_serve=True, json=True, level=level) logger.debug('Initialized logging system for JSON-based protocol') @@ -370,7 +364,6 @@ def open(self, path, create=False, lock_wait=None, lock=True, exclusive=None, ap return self.repository.id def inject_exception(self, kind): - kind = kind.decode() s1 = 'test string' s2 = 'test string2' if kind == 'DoesNotExist': @@ -484,35 +477,35 @@ class RemoteRepository: class RPCError(Exception): def __init__(self, unpacked): - # for borg < 1.1: unpacked only has b'exception_class' as key - # for borg 1.1+: unpacked has keys: b'exception_args', b'exception_full', b'exception_short', b'sysinfo' + # for borg < 1.1: unpacked only has 'exception_class' as key + # for borg 1.1+: unpacked has keys: 'exception_args', 'exception_full', 'exception_short', 'sysinfo' self.unpacked = unpacked def get_message(self): - if b'exception_short' in self.unpacked: - return b'\n'.join(self.unpacked[b'exception_short']).decode() + if 'exception_short' in self.unpacked: + return '\n'.join(self.unpacked['exception_short']) else: return self.exception_class @property def traceback(self): - return self.unpacked.get(b'exception_trace', True) + return self.unpacked.get('exception_trace', True) @property def exception_class(self): - return self.unpacked[b'exception_class'].decode() + return self.unpacked['exception_class'] @property def exception_full(self): - if b'exception_full' in self.unpacked: - return b'\n'.join(self.unpacked[b'exception_full']).decode() + if 'exception_full' in self.unpacked: + return '\n'.join(self.unpacked['exception_full']) else: return self.get_message() 
+ '\nRemote Exception (see remote log for the traceback)' @property def sysinfo(self): - if b'sysinfo' in self.unpacked: - return self.unpacked[b'sysinfo'].decode() + if 'sysinfo' in self.unpacked: + return self.unpacked['sysinfo'] else: return '' @@ -577,9 +570,9 @@ def __init__(self, location, create=False, exclusive=False, lock_wait=None, lock raise ConnectionClosedWithHint('Is borg working on the server?') from None if version == RPC_PROTOCOL_VERSION: self.dictFormat = False - elif isinstance(version, dict) and b'server_version' in version: + elif isinstance(version, dict) and 'server_version' in version: self.dictFormat = True - self.server_version = version[b'server_version'] + self.server_version = version['server_version'] else: raise Exception('Server insisted on using unsupported protocol version %s' % version) @@ -734,9 +727,9 @@ def pop_preload_msgid(chunkid): return msgid def handle_error(unpacked): - error = unpacked[b'exception_class'].decode() - old_server = b'exception_args' not in unpacked - args = unpacked.get(b'exception_args') + error = unpacked['exception_class'] + old_server = 'exception_args' not in unpacked + args = unpacked.get('exception_args') if error == 'DoesNotExist': raise Repository.DoesNotExist(self.location.processed) @@ -748,29 +741,29 @@ def handle_error(unpacked): if old_server: raise IntegrityError('(not available)') else: - raise IntegrityError(args[0].decode()) + raise IntegrityError(args[0]) elif error == 'AtticRepository': if old_server: raise Repository.AtticRepository('(not available)') else: - raise Repository.AtticRepository(args[0].decode()) + raise Repository.AtticRepository(args[0]) elif error == 'PathNotAllowed': if old_server: raise PathNotAllowed('(unknown)') else: - raise PathNotAllowed(args[0].decode()) + raise PathNotAllowed(args[0]) elif error == 'ParentPathDoesNotExist': - raise Repository.ParentPathDoesNotExist(args[0].decode()) + raise Repository.ParentPathDoesNotExist(args[0]) elif error == 'ObjectNotFound': if old_server: raise Repository.ObjectNotFound('(not available)', self.location.processed) else: - raise Repository.ObjectNotFound(args[0].decode(), self.location.processed) + raise Repository.ObjectNotFound(args[0], self.location.processed) elif error == 'InvalidRPCMethod': if old_server: raise InvalidRPCMethod('(not available)') else: - raise InvalidRPCMethod(args[0].decode()) + raise InvalidRPCMethod(args[0]) else: raise self.RPCError(unpacked) @@ -789,10 +782,10 @@ def handle_error(unpacked): try: unpacked = self.responses.pop(waiting_for[0]) waiting_for.pop(0) - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULTB] + yield unpacked[RESULT] if not waiting_for and not calls: return except KeyError: @@ -809,10 +802,10 @@ def handle_error(unpacked): else: return else: - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: handle_error(unpacked) else: - yield unpacked[RESULTB] + yield unpacked[RESULT] if self.to_send or ((calls or self.preload_ids) and len(waiting_for) < MAX_INFLIGHT): w_fds = [self.stdin_fd] else: @@ -829,26 +822,26 @@ def handle_error(unpacked): self.unpacker.feed(data) for unpacked in self.unpacker: if isinstance(unpacked, dict): - msgid = unpacked[MSGIDB] + msgid = unpacked[MSGID] elif isinstance(unpacked, tuple) and len(unpacked) == 4: # The first field 'type' was always 1 and has always been ignored _, msgid, error, res = unpacked if error: # ignore res, because it is only a fixed string anyway. 
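For readers of the tuple branch below: servers that predate the dict-based protocol reply
with a 4-tuple (type, msgid, error, res) instead of a dict, and the client folds that
tuple into the same dict shape so a single code path handles both formats afterwards.
Roughly, with made-up wire values (illustration only, not actual borg output):

    unpacked = (1, 42, 'DoesNotExist', 'fixed string')  # type, msgid, error, res
    _, msgid, error, res = unpacked
    if error:
        unpacked = {'i': msgid, 'exception_class': error}  # res carries no information
    else:
        unpacked = {'i': msgid, 'r': res}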
- unpacked = {MSGIDB: msgid, b'exception_class': error} + unpacked = {MSGID: msgid, 'exception_class': error} else: - unpacked = {MSGIDB: msgid, RESULTB: res} + unpacked = {MSGID: msgid, RESULT: res} else: raise UnexpectedRPCDataFormatFromServer(data) if msgid in self.ignore_responses: self.ignore_responses.remove(msgid) # async methods never return values, but may raise exceptions. - if b'exception_class' in unpacked: + if 'exception_class' in unpacked: self.async_responses[msgid] = unpacked else: # we currently do not have async result values except "None", # so we do not add them into async_responses. - if unpacked[RESULTB] is not None: + if unpacked[RESULT] is not None: self.async_responses[msgid] = unpacked else: self.responses[msgid] = unpacked diff --git a/src/borg/repository.py b/src/borg/repository.py index 9267fe0e6c..3fcc72aadd 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -516,16 +516,16 @@ def _read_integrity(self, transaction_id, key): integrity = msgpack.unpack(fd) except FileNotFoundError: return - if integrity.get(b'version') != 2: - logger.warning('Unknown integrity data version %r in %s', integrity.get(b'version'), integrity_file) + if integrity.get('version') != 2: + logger.warning('Unknown integrity data version %r in %s', integrity.get('version'), integrity_file) return - return integrity[key].decode() + return integrity[key] def open_index(self, transaction_id, auto_recover=True): if transaction_id is None: return NSIndex() index_path = os.path.join(self.path, 'index.%d' % transaction_id) - integrity_data = self._read_integrity(transaction_id, b'index') + integrity_data = self._read_integrity(transaction_id, 'index') try: with IntegrityCheckedFile(index_path, write=False, integrity_data=integrity_data) as fd: return NSIndex.read(fd) @@ -575,7 +575,7 @@ def prepare_txn(self, transaction_id, do_cleanup=True): self.io.cleanup(transaction_id) hints_path = os.path.join(self.path, 'hints.%d' % transaction_id) index_path = os.path.join(self.path, 'index.%d' % transaction_id) - integrity_data = self._read_integrity(transaction_id, b'hints') + integrity_data = self._read_integrity(transaction_id, 'hints') try: with IntegrityCheckedFile(hints_path, write=False, integrity_data=integrity_data) as fd: hints = msgpack.unpack(fd) @@ -588,23 +588,23 @@ def prepare_txn(self, transaction_id, do_cleanup=True): self.check_transaction() self.prepare_txn(transaction_id) return - if hints[b'version'] == 1: + if hints['version'] == 1: logger.debug('Upgrading from v1 hints.%d', transaction_id) - self.segments = hints[b'segments'] + self.segments = hints['segments'] self.compact = FreeSpace() self.storage_quota_use = 0 self.shadow_index = {} - for segment in sorted(hints[b'compact']): + for segment in sorted(hints['compact']): logger.debug('Rebuilding sparse info for segment %d', segment) self._rebuild_sparse(segment) logger.debug('Upgrade to v2 hints complete') - elif hints[b'version'] != 2: - raise ValueError('Unknown hints file version: %d' % hints[b'version']) + elif hints['version'] != 2: + raise ValueError('Unknown hints file version: %d' % hints['version']) else: - self.segments = hints[b'segments'] - self.compact = FreeSpace(hints[b'compact']) - self.storage_quota_use = hints.get(b'storage_quota_use', 0) - self.shadow_index = hints.get(b'shadow_index', {}) + self.segments = hints['segments'] + self.compact = FreeSpace(hints['compact']) + self.storage_quota_use = hints.get('storage_quota_use', 0) + self.shadow_index = hints.get('shadow_index', {}) 
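Context for the hints changes above: hints files are written through borg's msgpack
wrapper with use_bin_type=True and read back with raw=False, so the str keys round-trip
unchanged; strict_map_key=False is required because the 'segments' mapping is keyed by
int segment numbers, which msgpack-python >= 1.0 rejects by default. A small sketch using
msgpack-python directly (hypothetical hints values, for illustration):

    import io
    import msgpack

    hints = {'version': 2, 'segments': {0: 1}, 'compact': {},
             'storage_quota_use': 0, 'shadow_index': {}}
    buf = io.BytesIO()
    msgpack.pack(hints, buf, use_bin_type=True)
    buf.seek(0)
    assert msgpack.unpack(buf, raw=False, strict_map_key=False) == hints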
self.log_storage_quota() # Drop uncommitted segments in the shadow index for key, shadowed_segments in self.shadow_index.items(): @@ -621,16 +621,16 @@ def rename_tmp(file): os.rename(file + '.tmp', file) hints = { - b'version': 2, - b'segments': self.segments, - b'compact': self.compact, - b'storage_quota_use': self.storage_quota_use, - b'shadow_index': self.shadow_index, + 'version': 2, + 'segments': self.segments, + 'compact': self.compact, + 'storage_quota_use': self.storage_quota_use, + 'shadow_index': self.shadow_index, } integrity = { # Integrity version started at 2, the current hints version. # Thus, integrity version == hints version, for now. - b'version': 2, + 'version': 2, } transaction_id = self.io.get_segments_transaction_id() assert transaction_id is not None @@ -647,7 +647,7 @@ def rename_tmp(file): with IntegrityCheckedFile(hints_file + '.tmp', filename=hints_name, write=True) as fd: msgpack.pack(hints, fd) flush_and_sync(fd) - integrity[b'hints'] = fd.integrity_data + integrity['hints'] = fd.integrity_data # Write repository index index_name = 'index.%d' % transaction_id @@ -656,7 +656,7 @@ def rename_tmp(file): # XXX: Consider using SyncFile for index write-outs. self.index.write(fd) flush_and_sync(fd) - integrity[b'index'] = fd.integrity_data + integrity['index'] = fd.integrity_data # Write integrity file, containing checksums of the hints and index files integrity_name = 'integrity.%d' % transaction_id diff --git a/src/borg/testsuite/archive.py b/src/borg/testsuite/archive.py index c6b60fbeba..6fd766f9b9 100644 --- a/src/borg/testsuite/archive.py +++ b/src/borg/testsuite/archive.py @@ -142,7 +142,7 @@ def make_chunks(self, items): return b''.join(msgpack.packb({'path': item}) for item in items) def _validator(self, value): - return isinstance(value, dict) and value.get(b'path') in (b'foo', b'bar', b'boo', b'baz') + return isinstance(value, dict) and value.get('path') in ('foo', 'bar', 'boo', 'baz') def process(self, input): unpacker = RobustUnpacker(validator=self._validator, item_keys=ITEM_KEYS) @@ -161,10 +161,10 @@ def test_extra_garbage_no_sync(self): (False, [b'garbage'] + [self.make_chunks(['boo', 'baz'])])] result = self.process(chunks) self.assert_equal(result, [ - {b'path': b'foo'}, {b'path': b'bar'}, + {'path': 'foo'}, {'path': 'bar'}, 103, 97, 114, 98, 97, 103, 101, - {b'path': b'boo'}, - {b'path': b'baz'}]) + {'path': 'boo'}, + {'path': 'baz'}]) def split(self, left, length): parts = [] @@ -177,19 +177,19 @@ def test_correct_stream(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 2) input = [(False, chunks)] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'bar'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'bar'}, {'path': 'boo'}, {'path': 'baz'}]) def test_missing_chunk(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, chunks[4:])] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 'baz'}]) def test_corrupt_chunk(self): chunks = self.split(self.make_chunks(['foo', 'bar', 'boo', 'baz']), 4) input = [(False, chunks[:3]), (True, [b'gar', b'bage'] + chunks[3:])] result = self.process(input) - self.assert_equal(result, [{b'path': b'foo'}, {b'path': b'boo'}, {b'path': b'baz'}]) + self.assert_equal(result, [{'path': 'foo'}, {'path': 'boo'}, {'path': 
'baz'}]) @pytest.fixture diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index e97ce08528..51bfa85640 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -3600,14 +3600,14 @@ def test_init_defaults_to_argon2(self): self.cmd('init', '--encryption=repokey', self.repository_location) with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == b'argon2 chacha20-poly1305' + assert key['algorithm'] == 'argon2 chacha20-poly1305' def test_init_with_explicit_key_algorithm(self): """https://github.com/borgbackup/borg/issues/747#issuecomment-1076160401""" self.cmd('init', '--encryption=repokey', '--key-algorithm=pbkdf2', self.repository_location) with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == b'sha256' + assert key['algorithm'] == 'sha256' def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, expected_algorithm): self.cmd('init', '--encryption=repokey', '--key-algorithm', given_algorithm, self.repository_location) @@ -3617,7 +3617,7 @@ def verify_change_passphrase_does_not_change_algorithm(self, given_algorithm, ex with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm.encode() + assert key['algorithm'] == expected_algorithm def test_change_passphrase_does_not_change_algorithm_argon2(self): self.verify_change_passphrase_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') @@ -3632,7 +3632,7 @@ def verify_change_location_does_not_change_algorithm(self, given_algorithm, expe with Repository(self.repository_path) as repository: key = msgpack.unpackb(a2b_base64(repository.load_key())) - assert key[b'algorithm'] == expected_algorithm.encode() + assert key['algorithm'] == expected_algorithm def test_change_location_does_not_change_algorithm_argon2(self): self.verify_change_location_does_not_change_algorithm('argon2', 'argon2 chacha20-poly1305') @@ -3975,7 +3975,7 @@ def test_not_required(self): key.change_passphrase(key._passphrase) manifest = msgpack.unpackb(key.decrypt(Manifest.MANIFEST_ID, repository.get(Manifest.MANIFEST_ID))) - del manifest[b'tam'] + del manifest['tam'] repository.put(Manifest.MANIFEST_ID, key.encrypt(Manifest.MANIFEST_ID, msgpack.packb(manifest))) repository.commit(compact=False) output = self.cmd('list', '--debug', self.repository_location) diff --git a/src/borg/testsuite/key.py b/src/borg/testsuite/key.py index e0be752fae..0ad5fd9e07 100644 --- a/src/borg/testsuite/key.py +++ b/src/borg/testsuite/key.py @@ -360,23 +360,23 @@ def test_round_trip(self, key): assert blob.startswith(b'\x82') unpacked = msgpack.unpackb(blob) - assert unpacked[b'tam'][b'type'] == b'HKDF_HMAC_SHA512' + assert unpacked['tam']['type'] == 'HKDF_HMAC_SHA512' unpacked, verified = key.unpack_and_verify_manifest(blob) assert verified - assert unpacked[b'foo'] == b'bar' - assert b'tam' not in unpacked + assert unpacked['foo'] == 'bar' + assert 'tam' not in unpacked - @pytest.mark.parametrize('which', (b'hmac', b'salt')) + @pytest.mark.parametrize('which', ('hmac', 'salt')) def test_tampered(self, key, which): data = {'foo': 'bar'} blob = key.pack_and_authenticate_metadata(data) assert blob.startswith(b'\x82') unpacked = msgpack.unpackb(blob, object_hook=StableDict) - assert len(unpacked[b'tam'][which]) == 64 - unpacked[b'tam'][which] = 
unpacked[b'tam'][which][0:32] + bytes(32) - assert len(unpacked[b'tam'][which]) == 64 + assert len(unpacked['tam'][which]) == 64 + unpacked['tam'][which] = unpacked['tam'][which][0:32] + bytes(32) + assert len(unpacked['tam'][which]) == 64 blob = msgpack.packb(unpacked) with pytest.raises(TAMInvalid): @@ -421,4 +421,4 @@ def to_dict(key): load_me = RepoKey.detect(repository, manifest_data=None) assert to_dict(load_me) == to_dict(save_me) - assert msgpack.unpackb(a2b_base64(saved))[b'algorithm'] == expected_algorithm.encode() + assert msgpack.unpackb(a2b_base64(saved))['algorithm'] == expected_algorithm diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py index 83cf728e4b..4b4d6487cb 100644 --- a/src/borg/testsuite/repository.py +++ b/src/borg/testsuite/repository.py @@ -655,8 +655,8 @@ def _subtly_corrupted_hints_setup(self): hints = msgpack.unpack(fd) fd.seek(0) # Corrupt segment refcount - assert hints[b'segments'][2] == 1 - hints[b'segments'][2] = 0 + assert hints['segments'][2] == 1 + hints['segments'][2] = 0 msgpack.pack(hints, fd) fd.truncate()
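Closing note on the compat story: the want_bytes decoder added in item.pyx is what allows
flipping RAW to False while old archives stay readable. Values that legacy archives packed
as msgpack raw now decode to str under raw=False, and want_bytes re-encodes them to bytes
on property access; surrogateescape makes this lossless even for non-utf8 bytes. The same
idea as a standalone sketch (mirroring the helper from the item.pyx hunk above):

    def want_bytes(v):
        # legacy data packed with use_bin_type=False may arrive as str under raw=False
        if isinstance(v, str):
            v = v.encode('utf-8', errors='surrogateescape')
        assert isinstance(v, bytes)
        return v

    assert want_bytes(b'\x00\xff') == b'\x00\xff'  # new-style bytes pass through
    assert want_bytes('abc') == b'abc'             # legacy str is re-encoded
    assert want_bytes('\udcff') == b'\xff'         # surrogate escape restores raw bytes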