Skip to content

Commit

Permalink
improved string normalization for deduplication (#1370)
Browse files Browse the repository at this point in the history
* fix(dedupe): improved string normalization for deduplication

* test(dedupe): additional test coverage
  • Loading branch information
missinglink authored Mar 14, 2021
1 parent de97774 commit 45ec00d
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 4 deletions.
10 changes: 7 additions & 3 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
const _ = require('lodash');
const removeAccents = require('remove-accents');
const unicode = require('./unicode');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
const removeAccents = require('remove-accents');

// only consider these layers as synonymous for deduplication purposes.
// when performing inter-layer deduping, layers coming earlier in this list take
Expand Down Expand Up @@ -225,11 +226,14 @@ function getPlaceTypeRank(item) {
}

/**
 * Normalize a string so that superficially different spellings of the same
 * name compare equal during deduplication.
 *
 * The steps are applied in this order:
 *  1. unicode normalization, via the local `./unicode` helper
 *  2. removal of diacritics, via the `remove-accents` module
 *  3. lowercasing
 *  4. every run of spaces, commas and hyphens is replaced by a single space
 *
 * Separator runs at the very start or end of the input are not trimmed; each
 * becomes a single space.
 *
 * @param {string} str - the value to normalize
 * @returns {string} the normalized value
 */
function normalizeString(str){
  // unicode normalization must run first so that accent removal and
  // lowercasing operate on a canonical form of the input.
  return removeAccents(unicode.normalize(str)).toLowerCase().split(/[ ,-]+/).join(' ');
}

// public API of this module
Object.assign(module.exports, {
  isDifferent,
  layerPreferences,
  isNameDifferent,
  normalizeString,
});
102 changes: 101 additions & 1 deletion test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
var isDifferent= require('../../../helper/diffPlaces').isDifferent;
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent;
const normalizeString = require('../../../helper/diffPlaces').normalizeString;

module.exports.tests = {};

Expand Down Expand Up @@ -337,6 +339,104 @@ module.exports.tests.dedupe = function(test, common) {
});
};

/**
 * Test cases for isNameDifferent(): records with missing names, string vs
 * array name values, names stored under different language keys (with and
 * without a request language) and one real-world pair.
 */
module.exports.tests.isNameDifferent = (test, common) => {
  // build a fresh record holding only a `name` map
  const place = (name) => ({ name });

  test('missing names', (t) => {
    t.false(isNameDifferent({}, {}), 'both have no name');
    t.false(isNameDifferent(place({ default: 'a' }), {}), 'B has no name');
    t.false(isNameDifferent({}, place({ default: 'b' })), 'A has no name');
    t.end();
  });

  test('basic matching', (t) => {
    t.false(
      isNameDifferent(place({ default: 'a' }), place({ default: 'a' })),
      'basic match'
    );

    t.false(
      isNameDifferent(place({ default: 'a' }), place({ default: ['a'] })),
      'basic match - different types'
    );

    t.false(
      isNameDifferent(place({ default: ['a'] }), place({ default: 'a' })),
      'basic match - different types - inverse'
    );

    t.false(
      isNameDifferent(place({ default: 'a' }), place({ default: ['b','a'] })),
      'basic match - different positions'
    );

    t.false(
      isNameDifferent(place({ default: ['b', 'a'] }), place({ default: 'a' })),
      'basic match - different positions - inverse'
    );

    t.end();
  });

  test('inter-language matching', (t) => {
    t.false(
      isNameDifferent(place({ default: 'a' }), place({ foo: 'a' })),
      'match default with any lang'
    );

    t.false(
      isNameDifferent(place({ foo: 'a' }), place({ default: 'a' })),
      'match default with any lang - inverse'
    );

    // third argument: the request language
    t.false(
      isNameDifferent(place({ bar: 'a' }), place({ foo: 'a' }), 'bar'),
      'match using request lang'
    );

    t.false(
      isNameDifferent(place({ bar: 'a' }), place({ foo: 'a' }), 'foo'),
      'match using request lang - inverse'
    );

    // note: without a request language this pair is reported as different
    t.true(
      isNameDifferent(place({ foo: 'a' }), place({ bar: 'a' })),
      'different lang'
    );

    t.end();
  });

  test('real-world tests', (t) => {
    t.false(
      isNameDifferent(
        place({ default: 'Malmoe', eng: 'Malmo' }),
        place({ default: 'Malmö', eng: 'Malmo' })
      ),
      'Malmö'
    );

    t.end();
  });
};

/**
 * Table-driven test cases for normalizeString(): each entry is a test name
 * followed by [input, expected output] pairs, checked in the listed order.
 */
module.exports.tests.normalizeString = (test, common) => {
  const suites = [
    ['lowercase', [
      ['Foo Bar', 'foo bar'],
      ['FOOBAR', 'foobar'],
    ]],
    ['punctuation', [
      ['foo, bar', 'foo bar'],
      ['foo-bar', 'foo bar'],
      ['foo , - , - bar', 'foo bar'],
    ]],
    ['diacritics', [
      ['Malmö', 'malmo'],
      ['Grolmanstraße', 'grolmanstraße'],
      ['àáâãäåấắæầằçḉèéêëếḗềḕ', 'aaaaaaaaaeaacceeeeeeee'],
    ]],
  ];

  suites.forEach(([title, pairs]) => {
    test(title, (t) => {
      pairs.forEach(([input, expected]) => {
        t.equal(normalizeString(input), expected);
      });
      t.end();
    });
  });
};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit 45ec00d

Please sign in to comment.