Skip to content

Commit

Permalink
feat(dedupe): improved deduplication of places containing their place…
Browse files Browse the repository at this point in the history
…type in the name

Connects pelias/geonames#395
  • Loading branch information
missinglink committed Mar 14, 2021
1 parent 0968e6d commit c53745d
Show file tree
Hide file tree
Showing 2 changed files with 225 additions and 0 deletions.
61 changes: 61 additions & 0 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,11 @@ function isNameDifferent(item1, item2, requestLanguage){
// note: this really shouldn't happen as name is a mandatory field
if( !isPojo1 || !isPojo2 ){ return false; }

// apply 'layer dependent normalization' to the names
// this ensures that 'Foo' and 'City of Foo' match for localities.
names1 = layerDependentNormalization(names1, _.get(item1, 'layer'));
names2 = layerDependentNormalization(names2, _.get(item2, 'layer'));

// else both have name info

// iterate over all the languages in item2, comparing them to the
Expand Down Expand Up @@ -229,6 +234,61 @@ function getPlaceTypeRank(item) {
* strip layer-specific place-type words from names (eg. 'City of Foo' or
* 'Foo County' become 'Foo') so that such variants compare as equal.
*/
/**
 * Strip place-type words from names according to the layer of the record,
 * so that eg. 'Foo' and 'City of Foo' compare as equal for localities.
 *
 * Removed per layer (case-insensitive):
 *  - region:              'State of X',  'X State'
 *  - county:              'County of X', 'X County'
 *  - locality/localadmin: 'City|Town|Township of X', 'X City|Town|Township'
 *
 * A prefix is kept when it is followed by the word 'the'
 * (eg. 'City of the Foo'), and when nothing would remain after removing it.
 *
 * The input is never mutated. If `names` is not a plain object or `layer`
 * is not a string, `names` itself is returned unchanged; otherwise a deep
 * clone is returned. For the layers listed above every value of the clone
 * is replaced by the array produced by field.getArrayValue(), with each
 * entry normalized.
 *
 * @param {Object} names - name values keyed by language code
 * @param {string} layer - layer of the record the names belong to
 * @returns {Object} normalized copy of `names` (or `names` itself when the
 *   inputs fail the sanity checks)
 */
function layerDependentNormalization(names, layer) {

  // sanity checking inputs
  if (!_.isPlainObject(names)) { return names; }
  if (!_.isString(layer)) { return names; }

  // clone the names to avoid mutating the response data
  const copy = _.cloneDeep(names);

  // layers without any place-type patterns are returned as an untouched clone
  const patterns = layerPlacetypePatterns(layer);
  if (patterns.length === 0) { return copy; }

  _.forEach(names, (value, lang) => {
    copy[lang] = field.getArrayValue(value).map(name => {
      // apply every pattern in order, each one keeping only its captured group
      return patterns
        .reduce((result, pattern) => result.replace(pattern, '$1'), name)
        .trim();
    });
  });

  return copy;
}

/**
 * List the place-type patterns to strip for a layer, in application order.
 *
 * Prefix patterns use a negative lookahead for the whole word 'the'
 * (followed by whitespace or end of string), so names which merely start
 * with those letters (eg. 'City of Thebes') are still normalized. They also
 * require a separator and a non-empty remainder, so 'State of' alone is
 * never reduced to an empty name.
 */
function layerPlacetypePatterns(layer) {
  switch (layer) {
    case 'region':
      return [
        /^state\sof(?!\s+the(?:\s|$))\s+(\S.*)$/i,
        /^(.*)\sstate$/i
      ];
    case 'county':
      return [
        /^county\sof(?!\s+the(?:\s|$))\s+(\S.*)$/i,
        /^(.*)\scounty$/i
      ];
    case 'locality':
    case 'localadmin':
      return [
        /^city\sof(?!\s+the(?:\s|$))\s+(\S.*)$/i,
        /^(.*)\scity$/i,
        /^town\sof(?!\s+the(?:\s|$))\s+(\S.*)$/i,
        /^(.*)\stown$/i,
        /^township\sof(?!\s+the(?:\s|$))\s+(\S.*)$/i,
        /^(.*)\stownship$/i
      ];
    default:
      return [];
  }
}

/**
 * Reduce a string to a simplified form for comparison.
 *
 * The string is passed through unicode.normalize() and removeAccents(),
 * lowercased, and every run of spaces, commas and hyphens is replaced by a
 * single space.
 *
 * @param {string} str - text to normalize
 * @returns {string} the normalized text
 */
function normalizeString(str){
  const unicodeNormalized = unicode.normalize(str);
  const withoutAccents = removeAccents(unicodeNormalized);

  // split on runs of separator characters, then re-join with one space
  const tokens = withoutAccents.toLowerCase().split(/[ ,-]+/);
  return tokens.join(' ');
}
Expand All @@ -237,3 +297,4 @@ module.exports.isDifferent = isDifferent;
// additional helpers exposed by this module; the unit tests require
// isNameDifferent, normalizeString and layerDependentNormalization directly
module.exports.layerPreferences = layerPreferences;
module.exports.isNameDifferent = isNameDifferent;
module.exports.normalizeString = normalizeString;
module.exports.layerDependentNormalization = layerDependentNormalization;
164 changes: 164 additions & 0 deletions test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
const _ = require('lodash');
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent;
const normalizeString = require('../../../helper/diffPlaces').normalizeString;
const layerDependentNormalization = require('../../../helper/diffPlaces').layerDependentNormalization;

// registry of test suites: each suite is attached below as a function(test, common)
module.exports.tests = {};

Expand Down Expand Up @@ -411,6 +413,99 @@ module.exports.tests.isNameDifferent = function (test, common) {
{ name: { default: 'Malmö', eng: 'Malmo' } }
), 'Malmö');

t.false(isNameDifferent(
{ name: { default: 'State of New York' }, layer: 'region' },
{ name: { default: 'New York' } }
), 'State of *');

t.false(isNameDifferent(
{ name: { default: 'New York State' }, layer: 'region' },
{ name: { default: 'New York' } }
), '* State');

t.false(isNameDifferent(
{ name: { default: 'County of New York' }, layer: 'county' },
{ name: { default: 'New York' } }
), 'County of *');

t.false(isNameDifferent(
{ name: { default: 'New York County' }, layer: 'county' },
{ name: { default: 'New York' } }
), '* County');

t.false(isNameDifferent(
{ name: { default: 'City of New York' }, layer: 'locality' },
{ name: { default: 'New York' } }
), 'City of *');

t.false(isNameDifferent(
{ name: { default: 'New York City' }, layer: 'locality' },
{ name: { default: 'New York' } }
), '* City');

t.false(isNameDifferent(
{ name: { default: 'Town of New York' }, layer: 'locality' },
{ name: { default: 'New York' } }
), 'Town of *');

t.false(isNameDifferent(
{ name: { default: 'New York Town' }, layer: 'locality' },
{ name: { default: 'New York' } }
), '* Town');

t.false(isNameDifferent(
{ name: { default: 'Township of New York' }, layer: 'locality' },
{ name: { default: 'New York' } }
), 'Township of *');

t.false(isNameDifferent(
{ name: { default: 'New York Township' }, layer: 'locality' },
{ name: { default: 'New York' } }
), '* Township');

t.false(isNameDifferent(
{ name: { default: 'City of New York' }, layer: 'localadmin' },
{ name: { default: 'New York' } }
), 'City of *');

t.false(isNameDifferent(
{ name: { default: 'New York City' }, layer: 'localadmin' },
{ name: { default: 'New York' } }
), '* City');

t.false(isNameDifferent(
{ name: { default: 'Town of New York' }, layer: 'localadmin' },
{ name: { default: 'New York' } }
), 'Town of *');

t.false(isNameDifferent(
{ name: { default: 'New York Town' }, layer: 'localadmin' },
{ name: { default: 'New York' } }
), '* Town');

t.false(isNameDifferent(
{ name: { default: 'Township of New York' }, layer: 'localadmin' },
{ name: { default: 'New York' } }
), 'Township of *');

// localadmin counterpart of the locality '* Township' assertion; this case
// previously repeated layer 'locality', leaving localadmin suffixes untested
t.false(isNameDifferent(
  { name: { default: 'New York Township' }, layer: 'localadmin' },
  { name: { default: 'New York' } }
), '* Township');

t.end();
});
test('mutation tests', (t) => {
  // the record handed to isNameDifferent must come back untouched
  const place = { name: { default: 'New York City' }, layer: 'locality' };
  const counterpart = { name: { default: 'New York' } };

  // same comparison as the locality '* City' case, proving that the
  // names really were normalized during the comparison
  const differs = isNameDifferent(place, counterpart);
  t.false(differs, '* City');

  // the caller's record must still carry its original name afterwards
  t.equal(place.name.default, 'New York City');

  t.end();
});
};
Expand All @@ -437,6 +532,75 @@ module.exports.tests.normalizeString = function (test, common) {
});
};

// unit tests for layerDependentNormalization(): one case per supported layer,
// plus a check that other layers are left untouched
module.exports.tests.layerDependentNormalization = function (test, common) {

  // fix the layer up-front so each assertion only has to supply the names
  const normalizeFor = (layer) => (names) => layerDependentNormalization(names, layer);

  test('region', (t) => {
    const norm = normalizeFor('region');

    // 'State of X' prefix is removed
    t.deepEqual(
      norm({ default: ['State of Foo', 'State of Bar'], en: ['State of Baz'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    // 'State of the X' is left alone
    t.deepEqual(
      norm({ default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] }),
      { default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] }
    );

    // 'X State' suffix is removed
    t.deepEqual(
      norm({ default: ['Foo State', 'Bar State'], en: ['Baz State'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    t.end();
  });

  test('county', (t) => {
    const norm = normalizeFor('county');

    // 'County of X' prefix is removed
    t.deepEqual(
      norm({ default: ['County of Foo', 'County of Bar'], en: ['County of Baz'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    // 'County of the X' is left alone
    t.deepEqual(
      norm({ default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] }),
      { default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] }
    );

    // 'X County' suffix is removed
    t.deepEqual(
      norm({ default: ['Foo County', 'Bar County'], en: ['Baz County'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    t.end();
  });

  test('locality', (t) => {
    const norm = normalizeFor('locality');

    // 'City of X', 'Town of X' and 'Township of X' prefixes are removed
    t.deepEqual(
      norm({ default: ['City of Foo', 'Town of Bar'], en: ['Township of Baz'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    // '... of the X' is left alone
    t.deepEqual(
      norm({ default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] }),
      { default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] }
    );

    // 'X City', 'X Town' and 'X Township' suffixes are removed
    t.deepEqual(
      norm({ default: ['Foo City', 'Bar Town'], en: ['Baz Township'] }),
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    );

    t.end();
  });

  test('only applied to correct layer', (t) => {
    const norm = normalizeFor('venue');

    // a venue name starting with 'City of' must not be altered
    t.deepEqual(
      norm({ default: ['City of Los Angeles Fire Department Station'] }),
      { default: ['City of Los Angeles Fire Department Station'] }
    );

    t.end();
  });
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit c53745d

Please sign in to comment.