Optionally detect similar features during PK-less reimport. #336

Merged
merged 1 commit on Dec 18, 2020
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,10 @@ Please note that compatibility for 0.x releases (software or repositories) isn't

_When adding new entries to the changelog, please include issue/PR numbers wherever possible._

## 0.8.0 (UNRELEASED)

* Support for detecting features which have changed slightly during a re-import from a data source without a primary key, and re-importing them with the same primary key as last time so they show as edits rather than inserts. [#212](https://github.com/koordinates/sno/issues/212)

## 0.7.1

#### JSON syntax-highlighting fix
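The 0.8.0 entry above describes the behaviour only at a high level. As a rough conceptual sketch (not sno's actual PkGeneratingImportSource logic, which this diff does not show), the idea is: take the features that no longer match exactly, score new against old by how many field values agree, and reuse the old generated primary key for the best match so the change surfaces as an edit rather than a delete plus insert. The function and field names below are made up for illustration.

```python
# Illustrative sketch only -- not sno's implementation. Assumes features are plain
# dicts (as ImportSource.features documents) and that the auto-generated primary
# key column is called "auto_pk" purely for this example.
def reassign_similar_pks(unmatched_old, unmatched_new, limit=50, pk_field="auto_pk"):
    """Give each new feature the primary key of the most similar unmatched old feature.

    Only the first `limit` old features are considered, mirroring the
    --similarity-detection-limit cap added in this PR; limit=0 disables detection.
    """
    candidates = list(unmatched_old[:limit])
    for new_feature in unmatched_new:
        best, best_score = None, 0
        for old_feature in candidates:
            # Count fields (other than the generated PK) whose values are identical.
            score = sum(
                1
                for key, value in new_feature.items()
                if key != pk_field and old_feature.get(key) == value
            )
            if score > best_score:
                best, best_score = old_feature, score
        if best is not None:
            # Reuse the old PK so the re-import shows as an edit, not delete + insert.
            new_feature[pk_field] = best[pk_field]
            candidates.remove(best)
    return unmatched_new
```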
6 changes: 5 additions & 1 deletion sno/base_dataset.py
@@ -178,13 +178,17 @@ def geom_column_name(self):
def features(self):
"""
Yields a dict for every feature. Dicts contain key-value pairs for each feature property,
and geometries use sno.geometry.Geometry objects, as in the following example:
and geometries use sno.geometry.Geometry objects, as in the following example::

{
"fid": 123,
"geom": Geometry(b"..."),
"name": "..."
"last-modified": "..."
}

Each dict is guaranteed to iterate in the same order as the columns are ordered in the schema,
so that zip(schema.columns, feature.values()) matches each field with its column.
"""
for blob in self.feature_blobs():
yield self.get_feature(path=blob.name, data=memoryview(blob))
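The docstring above promises that each feature dict iterates in the same order as the schema's columns. A minimal usage sketch of that guarantee, assuming a `dataset` with `schema.columns` and `features()` as documented, and column objects exposing a `name` attribute (an assumption for this example):

```python
# Pair every value with its column, relying on the documented ordering guarantee.
for feature in dataset.features():
    for column, value in zip(dataset.schema.columns, feature.values()):
        print(f"{column.name} = {value!r}")
```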
2 changes: 1 addition & 1 deletion sno/fast_import.py
@@ -82,7 +82,7 @@ def fast_import_tables(
)

# Add primary keys if needed.
sources = PkGeneratingImportSource.wrap_if_needed(sources, repo)
sources = PkGeneratingImportSource.wrap_sources_if_needed(sources, repo)

cmd = [
"git",
6 changes: 5 additions & 1 deletion sno/import_source.py
@@ -130,13 +130,17 @@ def has_geometry(self):
def features(self):
"""
Yields a dict for every feature. Dicts contain key-value pairs for each feature property,
and geometries use sno.geometry.Geometry objects, as in the following example:
and geometries use sno.geometry.Geometry objects, as in the following example::

{
"fid": 123,
"geom": Geometry(b"..."),
"name": "..."
"last-modified": "..."
}

Each dict is guaranteed to iterate in the same order as the columns are ordered in the schema,
so that zip(schema.columns, feature.values()) matches each field with its column.
"""
raise NotImplementedError()

19 changes: 17 additions & 2 deletions sno/init.py
@@ -132,6 +132,17 @@ def _add_datasets_to_working_copy(repo, *datasets, replace_existing=False):
is_flag=True,
help="Replace existing dataset(s) of the same name.",
)
@click.option(
"--similarity-detection-limit",
hidden=True,
type=click.INT,
default=50,
help=(
"When replacing an existing dataset where primary keys are auto-generated: the maximum number of unmatched "
"features to search through for similar features, so that primary keys can be reassigned for features that "
"are similar but have had minor edits. Zero means that no similarity detection is performed. (Advanced users only)"
),
)
@click.option(
"--allow-empty",
is_flag=True,
@@ -167,6 +178,7 @@ def import_table(
tables,
table_info,
replace_existing,
similarity_detection_limit,
allow_empty,
max_delta_depth,
do_checkout,
@@ -247,8 +259,11 @@ def import_table(
# will result in a new schema object, and thus a new blob for every feature.
# Note that alignment works better if we add the generated-pk-column first (when needed),
# if one schema has this and the other lacks it they will be harder to align.
import_source = PkGeneratingImportSource.wrap_if_needed(
import_source, repo
import_source = PkGeneratingImportSource.wrap_source_if_needed(
import_source,
repo,
dest_path=dest_path,
similarity_detection_limit=similarity_detection_limit,
)
import_source.schema = existing_ds.schema.align_to_self(
import_source.schema
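For context, the new hidden option would be supplied when re-importing over an existing dataset. A hypothetical invocation via click's test runner; the `sno.cli` entry point, the `GPKG:` source spelling, the table name, and the `--replace-existing` flag spelling are assumptions, only `--similarity-detection-limit` comes from this diff:

```python
# Hypothetical invocation sketch; module path and most arguments are assumed.
from click.testing import CliRunner

from sno.cli import cli  # assumed CLI entry point

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "import",
        "GPKG:updated-data.gpkg",  # assumed data source without a primary key
        "mytable",  # assumed table name
        "--replace-existing",  # assumed flag spelling for replace_existing
        "--similarity-detection-limit", "100",  # option added in this PR (default 50)
    ],
)
print(result.output)
```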