diff --git a/CHANGELOG.md b/CHANGELOG.md
index e568febf3..751857e11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,14 @@ Please note that compatibility for 0.x releases (software or repositories) isn't
 
 _When adding new entries to the changelog, please include issue/PR numbers wherever possible._
 
+## 0.10.1
+
+#### Fix for `kart upgrade`
+Fixed `kart upgrade` so that it preserves more complicated (or yet-to-be-released) features of V2 repos as they are upgraded to V3. [#448](https://github.com/koordinates/kart/issues/448)
+
+Specifically:
+* `generated-pks.json` metadata - extra metadata found in datasets that have an automatically generated primary key and that are maintained by repeatedly importing from a primary-key-less datasource
+* attachments (which are not yet fully supported by Kart) - arbitrary files kept alongside datasets, such as license or readme files.
 
 ## 0.10.0
 
diff --git a/kart/dataset3.py b/kart/dataset3.py
index a91d8dd5f..b04b95bf2 100644
--- a/kart/dataset3.py
+++ b/kart/dataset3.py
@@ -332,6 +332,9 @@ def import_iter_meta_blobs(self, repo, source):
 
             yield full_path, content
 
+        for rel_path, content in source.attachment_items():
+            yield self.full_attachment_path(rel_path), content
+
     def iter_legend_blob_data(self):
         """
         Generates (full_path, blob_data) tuples for each legend in this dataset
diff --git a/kart/fast_import.py b/kart/fast_import.py
index 019aafe70..55a57123e 100644
--- a/kart/fast_import.py
+++ b/kart/fast_import.py
@@ -42,7 +42,7 @@ class _CommitMissing(Exception):
     pass
 
 
-def _safe_walk_repo(repo):
+def _safe_walk_repo(repo, from_commit):
     """
     Contextmanager. Walk the repo log, yielding each commit.
     If a commit isn't present, raises _CommitMissing.
     """
     do_raise = False
     try:
-        for commit in repo.walk(repo.head.target):
+        for commit in repo.walk(from_commit.id):
             try:
                 yield commit
             except KeyError:
@@ -65,7 +65,10 @@ def should_compare_imported_features_against_old_features(
-    repo, source, replacing_dataset
+    repo,
+    source,
+    replacing_dataset,
+    from_commit,
 ):
     """
     Returns True iff we should compare feature blobs to the previous feature blobs
@@ -91,7 +94,7 @@
     # Walk the log until we encounter a relevant schema change
     try:
-        for commit in _safe_walk_repo(repo):
+        for commit in _safe_walk_repo(repo, from_commit):
             datasets = repo.datasets(commit.oid)
             try:
                 old_dataset = datasets[replacing_dataset.path]
@@ -149,14 +152,22 @@ def fast_import_clear_trees(*, procs, replace_ids, replacing_dataset, source):
     if replacing_dataset is None:
         # nothing to do
         return
 
-    dest_inner_path = f"{source.dest_path}/{replacing_dataset.DATASET_DIRNAME}"
+    dest_path = source.dest_path
+    dest_inner_path = f"{dest_path}/{replacing_dataset.DATASET_DIRNAME}"
     for i, proc in enumerate(procs):
         if replace_ids is None:
             # Delete the existing dataset, before we re-import it.
             proc.stdin.write(f"D {source.dest_path}\n".encode("utf8"))
         else:
-            # delete and reimport meta/
+            # Delete and reimport any attachments at dest_path
+            attachment_names = [
+                obj.name for obj in replacing_dataset.tree if obj.type_str == "blob"
+            ]
+            for name in attachment_names:
+                proc.stdin.write(f"D {dest_path}/{name}\n".encode("utf8"))
+            # Delete and reimport /meta/
             proc.stdin.write(f"D {dest_inner_path}/meta\n".encode("utf8"))
+            # delete all features not pertaining to this process.
             # we also delete the features that *do*, but we do it further down
             # so that we don't have to iterate the IDs more than once.
@@ -174,20 +185,25 @@ def fast_import_clear_trees(*, procs, replace_ids, replacing_dataset, source):
         pass
 
 
+UNSPECIFIED = object()
+
+
 def fast_import_tables(
     repo,
     sources,
     *,
     verbosity=1,
     num_processes=4,
-    header=None,
     message=None,
     replace_existing=ReplaceExisting.DONT_REPLACE,
+    from_commit=UNSPECIFIED,
     replace_ids=None,
     allow_empty=False,
     limit=None,
     max_pack_size="2G",
     max_delta_depth=0,
+    # Advanced use - used by kart upgrade.
+    header=None,
     extra_cmd_args=(),
 ):
     """
@@ -200,13 +216,16 @@ fast_import_tables(
         1: basic status information
         2: full output of `git-fast-import --stats ...`
     num_processes: how many import processes to run in parallel
-    header - the commit-header to supply git-fast-import. Generated if not supplied - see generate_header.
     message - the commit-message used when generating the header. Generated if not supplied - see generate_message.
     replace_existing - See ReplaceExisting enum
+    from_commit - the commit to use as the starting point for the import.
     replace_ids - list of PK values to replace, or None
     limit - maximum number of features to import per source.
     max_pack_size - maximum size of pack files. Affects performance.
     max_delta_depth - maximum depth of delta-compression chains. Affects performance.
+
+    The following extra options are used by kart upgrade.
+    header - the commit-header to supply git-fast-import. Generated if not supplied - see generate_header.
     extra_cmd_args - any extra args for the git-fast-import command.
     """
 
@@ -220,28 +239,26 @@ fast_import_tables(
         # too many processes it won't be very even.
         raise ValueError(f"Can't import with more than {MAX_PROCESSES} processes")
 
-    # The tree this repo was at before this function was called.
-    # May be None (repo is empty)
-    orig_tree = repo.head_tree
-
-    # The tree we look at for considering what datasets already exist
-    # depends what we want to replace.
-    if replace_existing == ReplaceExisting.ALL:
-        starting_tree = None
+    # The commit that this import is using as the basis for the new commit.
+    # If we are replacing everything, we start from scratch, so from_commit is None.
+    if replace_existing is ReplaceExisting.ALL:
+        from_commit = None
     else:
-        starting_tree = repo.head_tree
+        if from_commit is UNSPECIFIED:
+            raise RuntimeError(
+                "Caller should specify from_commit when requesting an import that doesn't start from scratch"
+            )
 
-    if not starting_tree:
-        replace_existing = ReplaceExisting.ALL
+    from_tree = from_commit.peel(pygit2.Tree) if from_commit else repo.empty_tree
 
     assert repo.version in SUPPORTED_REPO_VERSIONS
-    extra_blobs = extra_blobs_for_version(repo.version) if not starting_tree else []
+    extra_blobs = extra_blobs_for_version(repo.version) if not from_commit else []
 
     ImportSource.check_valid(sources)
 
     if replace_existing == ReplaceExisting.DONT_REPLACE:
         for source in sources:
-            if source.dest_path in starting_tree:
+            if source.dest_path in from_tree:
                 raise InvalidOperation(
                     f"Cannot import to {source.dest_path}/ - already exists in repository"
                 )
@@ -260,7 +277,6 @@ def fast_import_tables(
     for arg in extra_cmd_args:
         cmd.append(arg)
 
-    orig_commit = repo.head_commit
     import_refs = []
 
     if verbosity >= 1:
@@ -291,11 +307,13 @@
             import_ref = f"refs/kart-import/{uuid.uuid4()}"
             import_refs.append(import_ref)
 
-            # may be None, if head is detached
+            # orig_branch may be None, if head is detached
+            # FIXME - this code relies upon the fact that we always either a) import at HEAD (import flow)
+            # or b) fix up the branch heads later (upgrade flow).
             orig_branch = repo.head_branch
-            proc_header = generate_header(repo, sources, message, import_ref)
-            if replace_existing != ReplaceExisting.ALL:
-                proc_header += f"from {orig_commit.oid}\n"
+            proc_header = generate_header(
+                repo, sources, message, import_ref, from_commit
+            )
 
             proc = stack.enter_context(_git_fast_import(repo, *cmd))
             procs.append(proc)
@@ -303,10 +321,7 @@
 
     # Write the extra blob that records the repo's version:
     for i, blob_path in write_blobs_to_stream(procs[0].stdin, extra_blobs):
-        if (
-            replace_existing != ReplaceExisting.ALL
-            and blob_path in starting_tree
-        ):
+        if replace_existing != ReplaceExisting.ALL and blob_path in from_tree:
            raise ValueError(f"{blob_path} already exists")
 
     if num_processes == 1:
@@ -326,6 +341,7 @@ def proc_for_feature_path(path):
                 repo,
                 source,
                 replace_existing,
+                from_commit,
                 procs,
                 proc_for_feature_path,
                 replace_ids,
@@ -363,7 +379,7 @@ def proc_for_feature_path(path):
         else:
             new_tree = trees[0]
         if not allow_empty:
-            if new_tree == orig_tree:
+            if new_tree == from_tree:
                 raise NotFound("No changes to commit", exit_code=NO_CHANGES)
 
         # use the existing commit details we already imported, but use the new tree
@@ -380,16 +396,14 @@ def proc_for_feature_path(path):
         # remove the import branches
         for b in import_refs:
             if b in repo.references:
-                try:
-                    repo.references.delete(b)
-                except KeyError:
-                    pass  # Nothing to delete, probably due to some earlier failure.
+                repo.references.delete(b)
 
 
 def _import_single_source(
     repo,
     source,
     replace_existing,
+    from_commit,
     procs,
     proc_for_feature_path,
     replace_ids,
@@ -400,6 +414,7 @@
     repo - the Kart repo to import into.
     source - an individual ImportSource
     replace_existing - See ReplaceExisting enum
+    from_commit - the commit to use as the starting point for the import.
     procs - all the processes to be used (for parallel imports)
     proc_for_feature_path - function, given a feature path returns the process to use to import it
     replace_ids - list of PK values to replace, or None
@@ -412,7 +427,7 @@ def _import_single_source(
     replacing_dataset = None
     if replace_existing == ReplaceExisting.GIVEN:
         try:
-            replacing_dataset = repo.datasets()[source.dest_path]
+            replacing_dataset = repo.datasets(refish=from_commit)[source.dest_path]
         except KeyError:
             # no such dataset; no problem
             replacing_dataset = None
@@ -478,7 +493,10 @@ def _ids():
                 )
             elif should_compare_imported_features_against_old_features(
-                repo, source, replacing_dataset
+                repo,
+                source,
+                replacing_dataset,
+                from_commit,
             ):
                 feature_blob_iter = dataset.import_iter_feature_blobs(
                     repo,
@@ -536,18 +554,21 @@ def copy_existing_blob_to_stream(stream, blob_path, blob_sha):
     stream.write(f"M 644 {blob_sha} {blob_path}\n".encode("utf8"))
 
 
-def generate_header(repo, sources, message, branch):
+def generate_header(repo, sources, message, branch, from_commit):
     if message is None:
         message = generate_message(sources)
 
     author = repo.author_signature()
     committer = repo.committer_signature()
-    return (
+    result = (
         f"commit {branch}\n"
         f"author {author.name} <{author.email}> {author.time} {minutes_to_tz_offset(author.offset)}\n"
         f"committer {committer.name} <{committer.email}> {committer.time} {minutes_to_tz_offset(committer.offset)}\n"
         f"data {len(message.encode('utf8'))}\n{message}\n"
     )
+    if from_commit:
+        result += f"from {from_commit.oid}\n"
+    return result
 
 
 def generate_message(sources):
diff --git a/kart/import_source.py b/kart/import_source.py
index b719488b3..b336d5252 100644
--- a/kart/import_source.py
+++ b/kart/import_source.py
@@ -101,6 +101,14 @@ def meta_items(self):
         """
         raise NotImplementedError()
 
+    def attachment_items(self):
+        """
+        Returns (name, content) tuples for all the attachment items that need to be imported.
+        These are files that will be imported verbatim to dest_path, but not hidden inside the dataset.
+        This could be a license or a readme.
+        """
+        return ()
+
     def crs_definitions(self):
         """
         Returns an {identifier: definition} dict containing every CRS definition.
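The `attachment_items()` hook added to `ImportSource` above is the same contract that `import_iter_meta_blobs` in `kart/dataset3.py` consumes, and that `UpgradeSourceDataset2` implements further down. As a rough illustration only: a custom import source that wants to carry attachments through an import would just override this hook. The `LicensedImportSource` class and its `attachment_paths` argument below are hypothetical (not part of this change), and the other `ImportSource` methods a real source must implement are omitted.

```python
from pathlib import Path

from kart.import_source import ImportSource


class LicensedImportSource(ImportSource):
    """Hypothetical import source that carries a LICENSE/README alongside the dataset."""

    def __init__(self, attachment_paths=()):
        # Files to import verbatim next to the dataset, eg ["LICENSE", "README.md"].
        self.attachment_paths = [Path(p) for p in attachment_paths]

    def attachment_items(self):
        # Yields the same (name, content) tuples that the fast-import machinery expects.
        for path in self.attachment_paths:
            yield path.name, path.read_bytes()
```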
diff --git a/kart/init.py b/kart/init.py
index 8e5832cce..a36c5112b 100644
--- a/kart/init.py
+++ b/kart/init.py
@@ -326,15 +326,17 @@ def import_(
         import_sources.append(import_source)
 
     ImportSource.check_valid(import_sources, param_hint="tables")
+    replace_existing_enum = (
+        ReplaceExisting.GIVEN if replace_existing else ReplaceExisting.DONT_REPLACE
+    )
     fast_import_tables(
         repo,
         import_sources,
         verbosity=ctx.obj.verbosity + 1,
         message=message,
         max_delta_depth=max_delta_depth,
-        replace_existing=ReplaceExisting.GIVEN
-        if replace_existing
-        else ReplaceExisting.DONT_REPLACE,
+        replace_existing=replace_existing_enum,
+        from_commit=repo.head_commit,
         replace_ids=replace_ids,
         allow_empty=allow_empty,
         num_processes=num_processes or get_default_num_processes(),
@@ -458,6 +460,7 @@ def init(
     fast_import_tables(
         repo,
         sources,
+        from_commit=None,
         message=message,
         max_delta_depth=max_delta_depth,
         num_processes=num_processes or get_default_num_processes(),
diff --git a/kart/pk_generation.py b/kart/pk_generation.py
index ce9c97541..666a40064 100644
--- a/kart/pk_generation.py
+++ b/kart/pk_generation.py
@@ -3,7 +3,7 @@
 from .import_source import ImportSource
 from .dataset2 import Dataset2
 from .dataset3 import Dataset3
-from .schema import ColumnSchema, Schema
+from .schema import ColumnSchema
 
 
 class PkGeneratingImportSource(ImportSource):
diff --git a/kart/rich_base_dataset.py b/kart/rich_base_dataset.py
index f01b25ef2..0e1532b96 100644
--- a/kart/rich_base_dataset.py
+++ b/kart/rich_base_dataset.py
@@ -201,9 +201,9 @@ def diff_feature(self, other, feature_filter=UNFILTERED, reverse=False):
         """
         feature_filter = feature_filter or UNFILTERED
 
-        params = {}
+        params = {"flags": pygit2.GIT_DIFF_SKIP_BINARY_CHECK}
         if reverse:
-            params = {"swap": True}
+            params["swap"] = True
 
         if other is None:
             diff_index = self.inner_tree.diff_to_tree(**params)
diff --git a/kart/upgrade/__init__.py b/kart/upgrade/__init__.py
index f7b1d23f0..943468bc6 100644
--- a/kart/upgrade/__init__.py
+++ b/kart/upgrade/__init__.py
@@ -10,7 +10,7 @@
 from kart.dataset2 import Dataset2
 from kart.exceptions import InvalidOperation, NotFound
 from kart.fast_import import fast_import_tables, ReplaceExisting
-from kart.repo import KartRepo
+from kart.repo import KartRepo, KartConfigKeys
 from kart.repo_version import DEFAULT_NEW_REPO_VERSION
 from kart.structure import RepoStructure
 from kart.timestamps import minutes_to_tz_offset
@@ -27,22 +27,29 @@ def dataset_class_for_legacy_version(version, in_place=False):
         return Dataset1
     elif version == 2:
         if in_place:
-            return InPlaceUpgradingDataset2
+            return InPlaceUpgradeSourceDataset2
         else:
-            return XmlUpgradingDataset2
+            return UpgradeSourceDataset2
     return None
 
 
-class XmlUpgradingDataset2(Dataset2):
-    """Variant of Dataset2 that extracts the metadata.xml out of dataset/metadata.json."""
+class UpgradeSourceDataset2(Dataset2):
+    """
+    Variant of Dataset2 that:
+    - preserves all meta_items, even non-standard ones
+    - preserves attachments
+    - upgrades dataset/metadata.json to metadata.xml
+    """
+
+    def meta_items(self):
+        # We also want to include hidden items like generated-pks.json, in the odd case where we have it.
+        return super().meta_items(only_standard_items=False)
 
     def get_meta_item(self, name, missing_ok=True):
         if name == "metadata/dataset.json":
             return None
-
         result = super().get_meta_item(name, missing_ok=missing_ok)
-
         if result is None and name == "metadata.xml":
             metadata_json = super().get_meta_item("metadata/dataset.json")
             if metadata_json:
@@ -53,8 +60,13 @@
                 return next(iter(metadata_xml))["text/xml"]
         return result
 
+    def attachment_items(self):
+        attachments = [obj for obj in self.tree if obj.type_str == "blob"]
+        for attachment in attachments:
+            yield attachment.name, attachment.data
 
-class InPlaceUpgradingDataset2(XmlUpgradingDataset2):
+
+class InPlaceUpgradeSourceDataset2(UpgradeSourceDataset2):
     @property
     def feature_blobs_already_written(self):
         return True
@@ -209,6 +221,11 @@ def upgrade(ctx, source, dest, in_place):
         subctx.obj.user_repo_path = str(dest)
         subctx.invoke(checkout.create_workingcopy)
 
+    if in_place:
+        dest_repo.config[KartConfigKeys.KART_REPOSTRUCTURE_VERSION] = str(
+            DEFAULT_NEW_REPO_VERSION
+        )
+
     click.secho("\nUpgrade complete", fg="green", bold=True)
 
 
@@ -218,7 +235,7 @@ def _upgrade_commit(
     source_repo,
     source_commit,
     source_dataset_class,
-    dest_parents,
+    dest_parent_ids,
     dest_repo,
     commit_map,
 ):
@@ -254,26 +271,28 @@ def _upgrade_commit(
         ds_path, ds_diff = sole_dataset_diff
         source_datasets = [source_rs.datasets[ds_path]]
         replace_existing = ReplaceExisting.GIVEN
-        from_parent = commit_map[source_commit.parents[0].hex]
-        merge_parents = [p for p in dest_parents if p != from_parent]
+        from_id = commit_map[source_commit.parents[0].hex]
+        from_commit = dest_repo[from_id]
+        merge_ids = [p for p in dest_parent_ids if p != from_id]
         replace_ids = list(ds_diff.get("feature", {}).keys())
         feature_count = len(replace_ids)
     else:
-        from_parent = None
-        merge_parents = dest_parents
+        from_id = from_commit = None
+        merge_ids = dest_parent_ids
        replace_existing = ReplaceExisting.ALL
         replace_ids = None
         feature_count = sum(s.feature_count for s in source_datasets)
 
-    if from_parent:
-        header += f"from {from_parent}\n"
-    header += "".join(f"merge {p}\n" for p in merge_parents)
+    if from_id:
+        header += f"from {from_id}\n"
+    header += "".join(f"merge {p}\n" for p in merge_ids)
 
     try:
         fast_import_tables(
             dest_repo,
             source_datasets,
             replace_existing=replace_existing,
+            from_commit=from_commit,
             replace_ids=replace_ids,
             verbosity=ctx.obj.verbosity,
             header=header,
diff --git a/tests/test_structure.py b/tests/test_structure.py
index e7a55f1c7..97a7ff6b1 100644
--- a/tests/test_structure.py
+++ b/tests/test_structure.py
@@ -1120,7 +1120,7 @@ def test_fast_import(data_archive, tmp_path, cli_runner, chdir):
             data / "nz-pa-points-topo-150k.gpkg", table=table
         )
 
-        fast_import.fast_import_tables(repo, [source])
+        fast_import.fast_import_tables(repo, [source], from_commit=None)
 
         assert not repo.is_empty
         assert repo.head.name == "refs/heads/main"
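Taken together, the `fast_import_tables` changes above replace its implicit reliance on `repo.head_commit` / `repo.head_tree` with an explicit `from_commit` argument. A minimal sketch of the two calling patterns, assuming `repo` is an existing `KartRepo` and `sources` is an already-prepared list of `ImportSource` objects (both placeholders here):

```python
from kart.fast_import import ReplaceExisting, fast_import_tables

# Normal import flow (as in kart/init.py): build the new commit on top of HEAD,
# replacing only the datasets being re-imported.
fast_import_tables(
    repo,
    sources,
    replace_existing=ReplaceExisting.GIVEN,
    from_commit=repo.head_commit,
)

# Replacing everything (the upgrade flow): there is no basis commit,
# so from_commit is None.
fast_import_tables(
    repo,
    sources,
    replace_existing=ReplaceExisting.ALL,
    from_commit=None,
)
```

Leaving `from_commit` at its `UNSPECIFIED` sentinel now raises a `RuntimeError` unless `replace_existing` is `ReplaceExisting.ALL`, so callers have to be explicit about which commit an import builds on.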