Skip to content

Commit

Permalink
Merge pull request #1606 from pelias/concordance-dedupe
Browse files Browse the repository at this point in the history
Dedupe Geonames records with WOF concordances
  • Loading branch information
orangejulius authored Mar 10, 2022
2 parents ad93fc4 + 533884c commit 5d4617b
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
32 changes: 32 additions & 0 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ const unicode = require('./unicode');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
const codec = require('pelias-model').codec;

// only consider these layers as synonymous for deduplication purposes.
// when performing inter-layer deduping, layers coming earlier in this list take
Expand Down Expand Up @@ -186,11 +187,42 @@ function isAddressDifferent(item1, item2){
return false;
}

/**
 * Determine whether a Who's On First record and a Geonames record refer to the
 * same place, based on the `gn:id` concordance stored on the WOF record.
 *
 * The check only applies when one of the two items has source `whosonfirst`
 * and the other has source `geonames`; the order of the arguments is irrelevant.
 * The WOF record's `addendum.concordances` value is decoded with `codec.decode`
 * (presumably JSON text, as used in the unit tests - confirm against pelias-model).
 *
 * @param {object} item1 - first record being compared
 * @param {object} item2 - second record being compared
 * @returns {boolean} true only when the WOF record carries a numeric `gn:id`
 *   concordance whose string form equals the Geonames record's `source_id`;
 *   false in every other case, including missing or undecodable concordances.
 */
function isGeonamesConcordanceSame(item1, item2) {
  const items = [item1, item2];

  const wof_record = items.find(i => i.source === 'whosonfirst');
  const gn_record = items.find(i => i.source === 'geonames');

  // must have found one wof and one gn record or this check does not apply
  if (!wof_record || !gn_record) { return false; }

  const concordances = _.get(wof_record, 'addendum.concordances');

  if (!concordances) {
    return false;
  }

  // a corrupt or unexpected payload must not abort the whole comparison:
  // treat anything that cannot be decoded as "no usable concordance"
  let json;
  try {
    json = codec.decode(concordances);
  } catch (err) {
    return false;
  }

  // the decoded value may be null or a primitive (e.g. the text 'null'),
  // so read the property defensively instead of indexing into it directly
  const concordance_id = _.get(json, ['gn:id']);

  // only numeric ids are considered valid concordances
  if (!concordance_id || !_.isNumber(concordance_id)) { return false; }

  // only records with a matching concordance pass this check
  // (source_id is compared in its string form)
  return concordance_id.toString() === gn_record.source_id;
}

/**
* Compare the two records and return true if they differ and false if same.
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
*/
function isDifferent(item1, item2, requestLanguage){
// records that share a geonames concordance are the same, regardless of any other checks
if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; }

if( isLayerDifferent( item1, item2 ) ){ return true; }
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }
Expand Down
25 changes: 25 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,31 @@ module.exports.tests.layerDependentNormalization = function (test, common) {
});
};

// Deduplication driven by WOF -> Geonames concordances.
module.exports.tests.geonames = function (test, common) {
  test('geonames record with concordance is the same, regardless of anything else', function(t) {
    // WOF place whose concordances point at Geonames id 123
    const wofPlace = {
      source: 'whosonfirst',
      source_id: '345',
      name: {
        default: 'Different name'
      },
      addendum: {
        concordances: '{ "gn:id": 123 }'
      }
    };

    // Geonames place carrying the referenced id, but with a non-matching name
    const geonamesPlace = {
      source: 'geonames',
      source_id: '123',
      name: {
        'default': 'One name'
      }
    };

    // the concordance alone must be enough to treat the pair as duplicates
    const differs = isDifferent(geonamesPlace, wofPlace);

    t.false(differs, 'should be the same based on concordance');
    t.end();
  });
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit 5d4617b

Please sign in to comment.