From c53745d6b951a2f86b3723c1d4d81620c1956485 Mon Sep 17 00:00:00 2001 From: missinglink Date: Wed, 9 Oct 2019 15:16:42 +0200 Subject: [PATCH] feat(dedupe): improved deduplication of places containing their placetype in the name Connects https://github.com/pelias/geonames/issues/395 --- helper/diffPlaces.js | 61 ++++++++++++ test/unit/helper/diffPlaces.js | 164 +++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) diff --git a/helper/diffPlaces.js b/helper/diffPlaces.js index c311bf145..ecb6fb9dd 100644 --- a/helper/diffPlaces.js +++ b/helper/diffPlaces.js @@ -111,6 +111,11 @@ function isNameDifferent(item1, item2, requestLanguage){ // note: this really shouldn't happen as name is a mandatory field if( !isPojo1 || !isPojo2 ){ return false; } + // apply 'layer dependent normalization' to the names + // this ensures that 'Foo' and 'City of Foo' match for localities. + names1 = layerDependentNormalization(names1, _.get(item1, 'layer')); + names2 = layerDependentNormalization(names2, _.get(item2, 'layer')); + // else both have name info // iterate over all the languages in item2, comparing them to the @@ -229,6 +234,61 @@ function getPlaceTypeRank(item) { * apply unicode normalization, lowercase characters and remove * diacritics and some punctuation. */ +function layerDependentNormalization(names, layer) { + + // sanity checking inputs + if (!_.isPlainObject(names)) { return names; } + if (!_.isString(layer)) { return names; } + + // clone the names to avoid mutating the response data + const copy = _.cloneDeep(names); + + // region + if (layer === 'region') { + _.forEach(names, (value, lang) => { + copy[lang] = field.getArrayValue(value).map(name => { + return name + .replace(/^state\sof(?!\s?the)\s?(.*)$/i, '$1') + .replace(/^(.*)\sstate$/i, '$1') + .trim(); + }); + }); + } + + // county + if( layer === 'county' ){ + _.forEach(names, (value, lang) => { + copy[lang] = field.getArrayValue(value).map(name => { + return name + .replace(/^county\sof(?!\s?the)\s?(.*)$/i, '$1') + .replace(/^(.*)\scounty$/i, '$1') + .trim(); + }); + }); + } + + // locality/localadmin + if (layer === 'locality' || layer === 'localadmin') { + _.forEach(names, (value, lang) => { + copy[lang] = field.getArrayValue(value).map(name => { + return name + .replace(/^city\sof(?!\s?the)\s?(.*)$/i, '$1') + .replace(/^(.*)\scity$/i, '$1') + .replace(/^town\sof(?!\s?the)\s?(.*)$/i, '$1') + .replace(/^(.*)\stown$/i, '$1') + .replace(/^township\sof(?!\s?the)\s?(.*)$/i, '$1') + .replace(/^(.*)\stownship$/i, '$1') + .trim(); + }); + }); + } + + return copy; +} + +/** + * lowercase characters and remove diacritics and some punctuation + */ function normalizeString(str){ return removeAccents(unicode.normalize(str)).toLowerCase().split(/[ ,-]+/).join(' '); } @@ -237,3 +297,4 @@ module.exports.isDifferent = isDifferent; module.exports.layerPreferences = layerPreferences; module.exports.isNameDifferent = isNameDifferent; module.exports.normalizeString = normalizeString; +module.exports.layerDependentNormalization = layerDependentNormalization; diff --git a/test/unit/helper/diffPlaces.js b/test/unit/helper/diffPlaces.js index a966943a7..201f5978e 100644 --- a/test/unit/helper/diffPlaces.js +++ b/test/unit/helper/diffPlaces.js @@ -1,6 +1,8 @@ +const _ = require('lodash'); const isDifferent = require('../../../helper/diffPlaces').isDifferent; const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent; const normalizeString = require('../../../helper/diffPlaces').normalizeString; +const layerDependentNormalization = require('../../../helper/diffPlaces').layerDependentNormalization; module.exports.tests = {}; @@ -411,6 +413,99 @@ module.exports.tests.isNameDifferent = function (test, common) { { name: { default: 'Malmö', eng: 'Malmo' } } ), 'Malmö'); + t.false(isNameDifferent( + { name: { default: 'State of New York' }, layer: 'region' }, + { name: { default: 'New York' } } + ), 'State of *'); + + t.false(isNameDifferent( + { name: { default: 'New York State' }, layer: 'region' }, + { name: { default: 'New York' } } + ), '* State'); + + t.false(isNameDifferent( + { name: { default: 'County of New York' }, layer: 'county' }, + { name: { default: 'New York' } } + ), 'County of *'); + + t.false(isNameDifferent( + { name: { default: 'New York County' }, layer: 'county' }, + { name: { default: 'New York' } } + ), '* County'); + + t.false(isNameDifferent( + { name: { default: 'City of New York' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), 'City of *'); + + t.false(isNameDifferent( + { name: { default: 'New York City' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), '* City'); + + t.false(isNameDifferent( + { name: { default: 'Town of New York' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), 'Town of *'); + + t.false(isNameDifferent( + { name: { default: 'New York Town' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), '* Town'); + + t.false(isNameDifferent( + { name: { default: 'Township of New York' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), 'Township of *'); + + t.false(isNameDifferent( + { name: { default: 'New York Township' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), '* Township'); + + t.false(isNameDifferent( + { name: { default: 'City of New York' }, layer: 'localadmin' }, + { name: { default: 'New York' } } + ), 'City of *'); + + t.false(isNameDifferent( + { name: { default: 'New York City' }, layer: 'localadmin' }, + { name: { default: 'New York' } } + ), '* City'); + + t.false(isNameDifferent( + { name: { default: 'Town of New York' }, layer: 'localadmin' }, + { name: { default: 'New York' } } + ), 'Town of *'); + + t.false(isNameDifferent( + { name: { default: 'New York Town' }, layer: 'localadmin' }, + { name: { default: 'New York' } } + ), '* Town'); + + t.false(isNameDifferent( + { name: { default: 'Township of New York' }, layer: 'localadmin' }, + { name: { default: 'New York' } } + ), 'Township of *'); + + t.false(isNameDifferent( + { name: { default: 'New York Township' }, layer: 'locality' }, + { name: { default: 'New York' } } + ), '* Township'); + + t.end(); + }); + test('mutation tests', function (t) { + // mutation test, $input data should not be mutated + const input = { name: { default: 'New York City' }, layer: 'locality' }; + const expected = { name: { default: 'New York' } }; + + // repeat previous test to ensure that the strings were actually changed + t.false(isNameDifferent(input, expected), '* City'); + + // test that input wasn't mutated in the process + t.equal(input.name.default, 'New York City'); + t.end(); }); }; @@ -437,6 +532,75 @@ module.exports.tests.normalizeString = function (test, common) { }); }; +module.exports.tests.layerDependentNormalization = function (test, common) { + test('region', function (t) { + const norm = _.bind(layerDependentNormalization, null, _, 'region'); + t.deepEqual(norm( + { default: ['State of Foo', 'State of Bar'], en: ['State of Baz'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.deepEqual(norm( + { default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] } + ), + { default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] } + ); + t.deepEqual(norm( + { default: ['Foo State', 'Bar State'], en: ['Baz State'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.end(); + }); + test('county', function (t) { + const norm = _.bind(layerDependentNormalization, null, _, 'county'); + t.deepEqual(norm( + { default: ['County of Foo', 'County of Bar'], en: ['County of Baz'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.deepEqual(norm( + { default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] } + ), + { default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] } + ); + t.deepEqual(norm( + { default: ['Foo County', 'Bar County'], en: ['Baz County'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.end(); + }); + test('locality', function (t) { + const norm = _.bind(layerDependentNormalization, null, _, 'locality'); + t.deepEqual(norm( + { default: ['City of Foo', 'Town of Bar'], en: ['Township of Baz'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.deepEqual(norm( + { default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] } + ), + { default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] } + ); + t.deepEqual(norm( + { default: ['Foo City', 'Bar Town'], en: ['Baz Township'] } + ), + { default: ['Foo', 'Bar'], en: ['Baz'] } + ); + t.end(); + }); + test('only applied to correct layer', function (t) { + const norm = _.bind(layerDependentNormalization, null, _, 'venue'); + t.deepEqual(norm( + { default: ['City of Los Angeles Fire Department Station'] } + ), + { default: ['City of Los Angeles Fire Department Station'] } + ); + t.end(); + }); +}; + module.exports.all = function (tape, common) { function test(name, testFunction) {