From 0040864b46a2e36a9f2a81a435d9aa07cfad1515 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Thu, 3 Mar 2022 14:49:59 -0500 Subject: [PATCH] feat(dedupe): Handle Geonames records with overridden parents Background ========== In https://github.com/pelias/geonames/pull/93 we added some special case logic to the Geonames importer that ensures Geonames records in the `locality` and `localadmin` layer have themselves as parents in that layer. Before this change, they would have a Who's on First parent, but these parents didn't always line up perfectly. Sometimes it would lead to broken labels, and as I recall it could also break search queries that rely on locality/localadmin names. Hierarchy checks ================ This special logic causes problems with our hierarchy checks, which expect records that can be considered duplicates to share all parents higher than the _lower_ record. So for example, if a locality and localadmin are to be considered duplicates, the hierarchy must be the same from the country layer down to localadmin. Geonames localadmins ==================== Geonames seems to have a penchant for having both a `locality` _and_ a `localadmin` record for a given city, even when the local administrative divisions don't really support such nuance. These records often have a name following the format 'City of X', which makes them very disruptive and confusing when shown in a list of results. Deduplication ============= Our deduplication code can handle minor name differences like 'City of' after https://github.com/pelias/api/pull/1371, but can't handle the hierarchy differences that generally occur with these records. Generally, there will be one of two scenarios: - A WOF locality record for the city can't deduplicate with the Geonames localadmin because the WOF record is parented by a WOF localadmin - A WOF locality record for the city can't deduplicate with the Geonames localadmin beause the WOF record has no localadmin parent at all Concordances (from https://github.com/pelias/api/pull/1606) generally don't help either, since ther often isn't a localadmin in WOF to even have a concordance to the Geonames localadmin. Adding a hierarchy exception ============================ This PR works by skipping the hierarchy checks for any layer where a Geonames record has itself as a parent. This means that assuming all the other layers are the same, the names are compatible, etc, deduplication is still possible. Impact ====== Of the 314 cities in our [`top_us_cities`](https://github.com/pelias/fuzzy-tests/blob/master/test_cases/top_us_cities.json) fuzzy tests, most of them (125) had a 'City of X' record somewhere in the results when querying via the autocomplete endpoint. With this PR, there are only 15 cases. Potential regressions ===================== Theoretically, this could allow records that aren't actually duplicates to be deduped, but they would have to have a similar name and likely share at least a `county`, so it feels like the chance for error is limited. That said, there are no regressions in our acceptance tests, and quite a few improvements. --- helper/diffPlaces.js | 25 +++++++++++++++++++++++++ test/unit/helper/diffPlaces.js | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index 537a2d960..6909034f2 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -46,6 +46,26 @@ function isUsState(item) { return item.parent.country_a[0] === 'USA' && item.layer === 'region'; } +// Geonames records in the locality and localadmin layer are parented by themselves +// This breaks our other hierarchy logic, so check for this special case +function isGeonamesWithSelfParent(item, placeType) { + if (item.source !== 'geonames') { return false; } + if (item.layer !== placeType) { return false; } + + if (!item.parent) { return false; } + + // get the relevant parent id(s) for the placeType in question + const parent_records = item.parent[`${placeType}_id`] || []; + + // check if the parent ids at this layer match this Geonames record + // we have special cased Geonames parents in many cases + // handle both array and scalar values + if (item.source_id === parent_records) { return true; } + if (parent_records.includes(item.source_id)) { return true; } + + return false; +} + /** * Compare the parent properties if they exist. * Returns false if the objects are the same, else true. @@ -108,6 +128,11 @@ function isParentHierarchyDifferent(item1, item2){ // skip layers that are more granular than $maxRank if (rank > maxRank){ return false; } + // Special case Geonames records that are parented by themselves and would otherwise break these checks + if (isGeonamesWithSelfParent(item1, placeType) || isGeonamesWithSelfParent(item2, placeType)) { + return false; + } + // ensure the parent ids are the same for all placetypes return isPropertyDifferent( parent1, parent2, `${placeType}_id` ); }); diff --git a/test/unit/helper/diffPlaces.js b/test/unit/helper/diffPlaces.js index 5910219bb..3ad68b95a 100644 --- a/test/unit/helper/diffPlaces.js +++ b/test/unit/helper/diffPlaces.js @@ -790,6 +790,40 @@ module.exports.tests.geonames = function (test, common) { }); }; +module.exports.tests.geonames_self_parent = function (test, common) { + test('geonames records that parent themselves should not break hierarchy checks', function(t) { + const gn_record = { + source: 'geonames', + layer: 'localadmin', + source_id: '7163824', + name: { + default: 'City of New York' + }, + parent: { + country_id: ['85633793'], // USA, WOF + region_id: ['85688543'], // NY, WOF + localadmin_id: ['7163824'] // New York County, Geonames + } + }; + const wof_record = { + source: 'whosonfirst', + layer: 'locality', + source_id: '85977539', + name: { + default: 'New York' + }, + parent: { + country_id: ['85633793'], // USA, WOF + region_id: ['85688543'], // NY, WOF + //localadmin_id: null // no localadmin is set for NYC in WOF + } + }; + + t.false(isDifferent(gn_record, wof_record), 'should be the same based on hierarchy'); + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) {