Skip to content

Commit

Permalink
fix(dedupe): improved string normalization for deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
missinglink committed Mar 14, 2021
1 parent de97774 commit 98f2452
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 4 deletions.
9 changes: 6 additions & 3 deletions helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
const _ = require('lodash');
const removeAccents = require('remove-accents');
const unicode = require('./unicode');
const placeTypes = require('./placeTypes');
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
const field = require('../helper/fieldValue');
const removeAccents = require('remove-accents');

// only consider these layers as synonymous for deduplication purposes.
// when performing inter-layer deduping, layers coming earlier in this list take
Expand Down Expand Up @@ -225,11 +226,13 @@ function getPlaceTypeRank(item) {
}

/**
* Normalize a string so that equivalent names compare equal during
* deduplication.
*
* NOTE(review): this is a rendered diff without +/- markers, so the old and
* the new wording/implementation appear back to back. The first description
* and first `return` appear to be the removed version, the second ones the
* added version -- confirm against the repository before relying on this.
*
* previous: lowercase characters and remove some punctuation
* current:  apply unicode normalization, lowercase characters and remove
*           diacritics and some punctuation.
*
* In both versions any run of spaces, commas and hyphens is replaced by a
* single space (split on /[ ,-]+/ and re-join with ' ').
*
* @param {string} str - text to normalize; `.toLowerCase()` is called on it
*   (directly in the first form), so it is presumably always a string.
* @returns {string} the normalized text (the unit tests compare it to strings).
*/
function normalizeString(str){
// first form: lowercase -> collapse separators -> removeAccents()
return removeAccents(str.toLowerCase().split(/[ ,-]+/).join(' '));
// second form: unicode.normalize() -> removeAccents() -> lowercase -> collapse separators
// (as literally written here this line sits after a `return` and would never execute)
return removeAccents(unicode.normalize(str)).toLowerCase().split(/[ ,-]+/).join(' ');
}

// public API of this helper; normalizeString is exported as well so that the
// unit tests can require it and exercise it directly.
module.exports.isDifferent = isDifferent;
module.exports.layerPreferences = layerPreferences;
module.exports.normalizeString = normalizeString;
27 changes: 26 additions & 1 deletion test/unit/helper/diffPlaces.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
var isDifferent= require('../../../helper/diffPlaces').isDifferent;
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
const normalizeString = require('../../../helper/diffPlaces').normalizeString;

module.exports.tests = {};

Expand Down Expand Up @@ -337,6 +338,30 @@ module.exports.tests.dedupe = function(test, common) {
});
};

/**
 * Unit tests for normalizeString().
 *
 * The suite is table-driven: every entry names one test and lists the
 * [input, expected] pairs asserted inside it, in order.
 *
 * @param {Function} test - test registration function, called as test(name, callback)
 * @param {Object} common - shared test helpers (not used by this suite)
 */
module.exports.tests.normalizeString = function (test, common) {

  const suites = [
    ['lowercase', [
      ['Foo Bar', 'foo bar'],
      ['FOOBAR', 'foobar']
    ]],
    ['punctuation', [
      ['foo, bar', 'foo bar'],
      ['foo-bar', 'foo bar'],
      ['foo , - , - bar', 'foo bar']
    ]],
    ['diacritics', [
      ['Malmö', 'malmo'],
      // 'ß' is expected to pass through unchanged
      ['Grolmanstraße', 'grolmanstraße'],
      ['àáâãäåấắæầằçḉèéêëếḗềḕ', 'aaaaaaaaaeaacceeeeeeee']
    ]]
  ];

  suites.forEach(([title, expectations]) => {
    test(title, (t) => {
      expectations.forEach(([input, expected]) => {
        t.equal(normalizeString(input), expected);
      });
      t.end();
    });
  });

};

module.exports.all = function (tape, common) {

function test(name, testFunction) {
Expand Down

0 comments on commit 98f2452

Please sign in to comment.