From a1af69b8a1375572acec1c8c2942833f887d09bf Mon Sep 17 00:00:00 2001 From: Edward Mac Gillavry Date: Tue, 7 Sep 2021 17:27:27 +0200 Subject: [PATCH] feat(nl): improve support of NL addresses (#126) * Examples of correct addresses in NL. * Recognise 6 position postcodes in NL, addressing #127. * Check for -plein as street type. * Adjustments for typo 'plain' instead of 'plein' in libpostal resources. Addressing #128. * Put 'test' back in as per https://github.com/pelias/parser/pull/126#discussion_r566488438. * Prevent 'St' as a suffix in NL streetnames. Add Jr and Sr. Addressing #126. * First of 4 digits cannot be 0. No 'SA','SD' or 'SS'. Addressing #127. * Adjective 'korte' no longer a personal title. Addressing #129. * Tests and fix for typo 'BurgeRmeester'. Addressing #130. * Tests, code, and configs for '-daal', '-burg', '-baan'. Addressing #131 and #133. * Config for '-burg' not separable. Addressing #131. * Remove test for NL postal codes WITH spaces. Regex ok. #134 * Fix formatting spaces at the end of the line. * Also use directionals to parse Dutch street addresses. #137 * Add Dutch titulature for street name recognition. (#130) * Add 'plantsoen' as a street type. (#128). * Remove test for NL locality with stopword. --- classifier/CompoundStreetClassifier.js | 6 +++ classifier/DirectionalClassifier.js | 2 +- classifier/PersonalTitleClassifier.test.js | 2 +- classifier/PostcodeClassifier.js | 2 +- classifier/PostcodeClassifier.test.js | 5 ++ resources/chromium-i18n/ssl-address/NL.json | 2 +- .../libpostal/af/personal_titles.txt | 2 + .../nl/concatenated_suffixes_inseparable.txt | 1 + .../nl/concatenated_suffixes_separable.txt | 7 ++- .../libpostal/nl/directionals.txt | 4 ++ .../libpostal/nl/personal_suffixes.txt | 4 ++ .../libpostal/nl/personal_titles.txt | 43 ++++++++++++++++ test/address.nld.test.js | 50 +++++++++++++++++++ 13 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 resources/pelias/dictionaries/libpostal/af/personal_titles.txt create mode 100644 resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_inseparable.txt create mode 100644 resources/pelias/dictionaries/libpostal/nl/directionals.txt create mode 100644 resources/pelias/dictionaries/libpostal/nl/personal_suffixes.txt create mode 100644 resources/pelias/dictionaries/libpostal/nl/personal_titles.txt diff --git a/classifier/CompoundStreetClassifier.js b/classifier/CompoundStreetClassifier.js index ba809aba..ed15792e 100644 --- a/classifier/CompoundStreetClassifier.js +++ b/classifier/CompoundStreetClassifier.js @@ -15,6 +15,12 @@ class CompoundStreetClassifier extends WordClassifier { // this removes suffixes such as 'r.' which can be ambiguous minlength: 3 }) + + libpostal.load(this.suffixes, ['de', 'nl'], 'concatenated_suffixes_inseparable.txt', { + // remove any suffixes which contain less than 3 characters (excluding a period) + // this removes suffixes such as 'r.' which can be ambiguous + minlength: 3 + }) } each (span) { diff --git a/classifier/DirectionalClassifier.js b/classifier/DirectionalClassifier.js index b185f484..3e3cfc82 100644 --- a/classifier/DirectionalClassifier.js +++ b/classifier/DirectionalClassifier.js @@ -9,7 +9,7 @@ const libpostal = require('../resources/libpostal/libpostal') // optionally control which languages are included // note: reducing the languages will have a considerable performance benefit -const languages = ['en', 'es', 'de', 'fr'] +const languages = ['en', 'es', 'de', 'fr', 'nl'] class DirectionalClassifier extends WordClassifier { setup () { diff --git a/classifier/PersonalTitleClassifier.test.js b/classifier/PersonalTitleClassifier.test.js index 96321aab..d89c1b8d 100644 --- a/classifier/PersonalTitleClassifier.test.js +++ b/classifier/PersonalTitleClassifier.test.js @@ -24,7 +24,7 @@ module.exports.tests.contains_numerals = (test) => { module.exports.tests.classify = (test) => { let valid = [ 'Général', 'General', 'gal', - 'Saint', 'st', 'cdt', 'l\'Amiral' + 'Saint', 'st', 'cdt', 'l\'Amiral', 'Burgemeester' ] valid.forEach(token => { diff --git a/classifier/PostcodeClassifier.js b/classifier/PostcodeClassifier.js index 7fbbe587..c263c30e 100644 --- a/classifier/PostcodeClassifier.js +++ b/classifier/PostcodeClassifier.js @@ -10,7 +10,7 @@ const dictPath = path.join(__dirname, `../resources/chromium-i18n/ssl-address`) // const countryCodes = fs.readdirSync(dictPath) // .filter(p => p.endsWith('.json')) // .map(p => p.split('.')[0]) -const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru', 'br'] +const countryCodes = ['us', 'gb', 'fr', 'de', 'es', 'pt', 'au', 'nz', 'kr', 'jp', 'in', 'ru', 'br', 'nl'] class PostcodeClassifier extends WordClassifier { setup () { diff --git a/classifier/PostcodeClassifier.test.js b/classifier/PostcodeClassifier.test.js index 6321f631..f98835fe 100644 --- a/classifier/PostcodeClassifier.test.js +++ b/classifier/PostcodeClassifier.test.js @@ -72,6 +72,11 @@ module.exports.tests.classify = (test) => { t.deepEqual(s.classifications, { PostcodeClassification: new PostcodeClassification(1.0) }) t.end() }) + test('classify: NLD', (t) => { + let s = classify('7512EC') + t.deepEqual(s.classifications, { PostcodeClassification: new PostcodeClassification(1.0) }) + t.end() + }) } module.exports.all = (tape, common) => { diff --git a/resources/chromium-i18n/ssl-address/NL.json b/resources/chromium-i18n/ssl-address/NL.json index fd5c937d..2f3078a9 100644 --- a/resources/chromium-i18n/ssl-address/NL.json +++ b/resources/chromium-i18n/ssl-address/NL.json @@ -1 +1 @@ -{"zipex":"1234 AB,2490 AA","key":"NL","name":"NETHERLANDS","fmt":"%O%n%N%n%A%n%Z %C","require":"ACZ","zip":"\\d{4} ?[A-Z]{2}","posturl":"http://www.postnl.nl/voorthuis/","id":"data/NL"} \ No newline at end of file +{"zipex":"1234 AB,2490 AA","key":"NL","name":"NETHERLANDS","fmt":"%O%n%N%n%A%n%Z %C","require":"ACZ","zip":"[1-9][0-9]{3} ?(?!SA|SD|SS)[A-Z]{2}","posturl":"http://www.postnl.nl/voorthuis/","id":"data/NL"} \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/af/personal_titles.txt b/resources/pelias/dictionaries/libpostal/af/personal_titles.txt new file mode 100644 index 00000000..c3bb9013 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/af/personal_titles.txt @@ -0,0 +1,2 @@ +!kort|k +!korte|kte \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_inseparable.txt b/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_inseparable.txt new file mode 100644 index 00000000..a4904a2c --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_inseparable.txt @@ -0,0 +1 @@ +burg|brg|bg \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_separable.txt b/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_separable.txt index c4902598..d1762617 100644 --- a/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_separable.txt +++ b/resources/pelias/dictionaries/libpostal/nl/concatenated_suffixes_separable.txt @@ -1 +1,6 @@ -dijk \ No newline at end of file +baan +daal +dijk +!plain|pln. +plein|pln +plantsoen|plnts \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/nl/directionals.txt b/resources/pelias/dictionaries/libpostal/nl/directionals.txt new file mode 100644 index 00000000..ce2de416 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/nl/directionals.txt @@ -0,0 +1,4 @@ +noordzijde|nz|n.z.|n.z|n z +oostzijde|oz|o.z.|o.z|o z +westzijde|wz|w.z.|w.z|w z +zuidzijde|zz|z.z.|z.z|z z \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/nl/personal_suffixes.txt b/resources/pelias/dictionaries/libpostal/nl/personal_suffixes.txt new file mode 100644 index 00000000..37f65038 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/nl/personal_suffixes.txt @@ -0,0 +1,4 @@ +junior|jr|jnr +senior|sr|snr +# st is not a personal suffix in Dutch +!st \ No newline at end of file diff --git a/resources/pelias/dictionaries/libpostal/nl/personal_titles.txt b/resources/pelias/dictionaries/libpostal/nl/personal_titles.txt new file mode 100644 index 00000000..eed6d937 --- /dev/null +++ b/resources/pelias/dictionaries/libpostal/nl/personal_titles.txt @@ -0,0 +1,43 @@ +!kort|k +!korte|kte +aalmoezenier +admiraal|adm +bisschop|biss +#typo in LibPostal resource +!burgermeester|burg|bgm +burgemeester|burg|bgm +commissaris|comm +deken|dkn +directeur|dir +frater|fr +graaf +gravin +goeverneur|goev +gouverneur|gouv +heer|hr +jonker|jkr +juffrouw|juffr +hertog|htg +kanunnik|kan +kapelaan|kap +kapitein|kapt +keizer +luitenant generaal|lt gen +!mevrouw|mevr +mevrouw|mevr|mw +madame|mad +majoor|maj +notaris|not +overste|ov +pater|ptr +prelaat|prlt +rector|rect +schepen|sch +schout|sch +schout bij nacht|sbn +secretaris|secr +sekretaris|sekr +veldmaarschalk|veldm +vicaris|vic +wethouder|weth +zusters|zr \ No newline at end of file diff --git a/test/address.nld.test.js b/test/address.nld.test.js index fad71a0f..d2d95d98 100644 --- a/test/address.nld.test.js +++ b/test/address.nld.test.js @@ -12,6 +12,56 @@ const testcase = (test, common) => { assert('Bosserdijk, Hoogland', [ { street: 'Bosserdijk' }, { locality: 'Hoogland' } ]) + + assert('St Ludgerusstraat, Utrecht', [ + { street: 'Ludgerusstraat' }, { locality: 'Utrecht' } + ]) + + assert('Lange Groenendaal, Gouda', [ + { street: 'Lange Groenendaal' }, { locality: 'Gouda' } + ]) + + assert('Achter Clarenburg, Utrecht', [ + { street: 'Achter Clarenburg' }, { locality: 'Utrecht' } + ]) + + assert('Rozenburg', [ + [{ locality: 'Rozenburg' }], + [{ street: 'Rozenburg' }] + ], false) + + assert('Bloemendaal', [ + [{ locality: 'Bloemendaal' }], + [{ street: 'Bloemendaal' }] + ], false) + + assert('Brinkstraat 87, 7512EC, Enschede', [ + { street: 'Brinkstraat' }, { housenumber: '87' }, { postcode: '7512EC' }, { locality: 'Enschede' } + ]) + + assert('Weerdsingel O.Z., Utrecht', [ + { street: 'Weerdsingel O.Z.' }, { locality: 'Utrecht' } + ]) + + assert('Oranjelaan Westzijde 41, Puttershoek', [ + { street: 'Oranjelaan Westzijde' }, { housenumber: '41' }, { locality: 'Puttershoek' } + ]) + + assert('Rembrandtplein, Amsterdam', [ + { street: 'Rembrandtplein' }, { locality: 'Amsterdam' } + ]) + + assert('Korte Tiendeweg, Gouda', [ + { street: 'Korte Tiendeweg' }, { locality: 'Gouda' } + ]) + + assert('Burgemeester Martenssingel, Gouda', [ + { street: 'Burgemeester Martenssingel' }, { locality: 'Gouda' } + ]) + + assert('Agorabaan, Lelystad', [ + { street: 'Agorabaan' }, { locality: 'Lelystad' } + ]) } module.exports.all = (tape, common) => {