Merge pull request #207 from pelias/unicode

unicode: apply unicode normalization
pelias · Feb 8, 2022 · 5430492 · 5430492
2 parents 6b1a1dc + 8d21f63
commit 5430492
Show file tree

Hide file tree

Showing 7 changed files with 136 additions and 17 deletions.
diff --git a/lib/analysis.js b/lib/analysis.js
@@ -1,6 +1,6 @@
 
 const lowercase = require('lower-case').lowerCase;
-const removeAccents = require('remove-accents');
+const unicode = require('./unicode');
 
 const PARTIAL_TOKEN_SUFFIX = '\x26';
 
@@ -14,6 +14,9 @@ function normalize( input ){
   // sanity check arguments
   if( typeof input !== 'string' ){ return []; }
 
+  // apply unicode normalization
+  input = unicode.normalize(input);
+
   // trim input of superfluous whitespace
   input = input.trim();
 
@@ -98,11 +101,9 @@ function normalize( input ){
   return synonyms.map( function( synonym ){
     return synonym.replace(/\s{2,}/g, ' ').trim();
   })
-  // basic normalization
-  // note: lowercase MUST be run before removeAccents, please don't change the order
-  // see: https://github.com/pelias/placeholder/pull/12 for more detail.
+  // normalization
   .map( function( synonym ){
-    return removeAccents( lowercase( synonym ) );
+    return lowercase( unicode.fold( synonym ) );
   })
   // remove empty synonyms
   .filter( function( synonym ){

diff --git a/lib/unicode.js b/lib/unicode.js
@@ -0,0 +1,114 @@
+const _ = require('lodash');
+const regenerate = require('regenerate');
+const accentsDiacritics = require('remove-accents-diacritics');
+
+// non-printable control characters (every match is deleted by normalize() below)
+// ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
+const CONTROL_CODES = regenerate()
+  .addRange(0x0000, 0x001F) // C0 (0000-001F) -- NOTE(review): covers TAB/LF/CR, so they are deleted, not turned into a space; confirm intended
+  .add(0x007F) // Delete
+  .addRange(0x0080, 0x009F) // C1 (0080-009F)
+  .toRegExp('g'); // global flag so String#replace substitutes every occurrence
+
+// non-standard spaces (each match is replaced by a single ' ' in normalize() below)
+// ref: http://jkorpela.fi/chars/spaces.html
+const ALTERNATE_SPACES = regenerate()
+  .add(0x00A0) // NO-BREAK SPACE
+  .add(0x1680) // OGHAM SPACE MARK
+  .add(0x180E) // MONGOLIAN VOWEL SEPARATOR
+  .addRange(0x2000, 0x200B) // EN QUAD - ZERO WIDTH SPACE (note: the zero-width U+200B becomes a visible space)
+  .add(0x202F) // NARROW NO-BREAK SPACE
+  .add(0x205F) // MEDIUM MATHEMATICAL SPACE
+  .add(0x3000) // IDEOGRAPHIC SPACE
+  .add(0xFEFF) // ZERO WIDTH NO-BREAK SPACE (also the byte order mark code point; likewise becomes a visible space)
+  .toRegExp('g'); // global flag so String#replace substitutes every occurrence
+
+// pattern to match consecutive spaces
+// const CONSECUTIVE_SPACES = /\s{2,}/g;
+
+// unicode combining marks, plus ZERO WIDTH JOINER and the variation selectors; stripped by both normalize() and fold()
+// see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645
+// ref: https://en.wikipedia.org/wiki/Combining_character
+const COMBINING_MARKS = regenerate()
+  .add(0x200D) // ZERO WIDTH JOINER (U+200D)
+  .addRange(0x0300, 0x036F) // Combining Diacritical Marks (0300–036F)
+  .addRange(0x1AB0, 0x1AFF) // Combining Diacritical Marks Extended (1AB0–1AFF)
+  .addRange(0x1DC0, 0x1DFF) // Combining Diacritical Marks Supplement (1DC0–1DFF)
+  .addRange(0x20D0, 0x20FF) // Combining Diacritical Marks for Symbols (20D0–20FF)
+  .addRange(0xFE00, 0xFE0F) // Variation Selectors (FE00-FE0F)
+  .addRange(0xFE20, 0xFE2F) // Combining Half Marks (FE20–FE2F)
+  .add(0x3099) // combining dakuten (U+3099)
+  .add(0x309A) // combining handakuten (U+309A)
+  .toRegExp('g'); // global flag so String#replace substitutes every occurrence
+
+// miscellaneous symbols with no relevance to geocoding (every match is deleted by normalize() below)
+const MISC_UNSUPPORTED_SYMBOLS = regenerate()
+  // Superscripts and Subscripts (2070-209F)
+  // Currency Symbols (20A0-20CF)
+  // Letterlike Symbols (2100-214F)
+  // Number Forms (2150-218F)
+  // Arrows (2190-21FF)
+  // Mathematical Operators (2200-22FF)
+  // Miscellaneous Technical (2300-23FF)
+  // Control Pictures (2400-243F)
+  // Optical Character Recognition (2440-245F)
+  // Enclosed Alphanumerics (2460-24FF)
+  // Box Drawing (2500-257F)
+  // Block Elements (2580-259F)
+  // Geometric Shapes (25A0-25FF)
+  // Miscellaneous Symbols (2600-26FF)
+  // Dingbats (2700-27BF)
+  // Miscellaneous Mathematical Symbols-A (27C0-27EF)
+  // Supplemental Arrows-A (27F0-27FF)
+  // Braille Patterns (2800-28FF)
+  // Supplemental Arrows-B (2900-297F)
+  // Miscellaneous Mathematical Symbols-B (2980-29FF)
+  // Supplemental Mathematical Operators (2A00-2AFF)
+  // Miscellaneous Symbols and Arrows (2B00-2BFF)
+  .addRange(0x2070, 0x2BFF) // A Range Covering Consecutive Blocks Listed Above (also spans 20D0-20FF, which COMBINING_MARKS lists too)
+
+  // symbols
+  .addRange(0x02B0, 0x02FF) // Spacing Modifier Letters (02B0-02FF)
+  .addRange(0x1400, 0x167F) // Unified Canadian Aboriginal Syllabics (1400-167F) -- NOTE(review): a script rather than symbols; confirm dropping it is intended
+  .addRange(0x1D100, 0x1D1FF) // Musical Symbols (1D100-1D1FF)
+  .addRange(0x1D400, 0x1D7FF) // Mathematical Alphanumeric Symbols (1D400-1D7FF)
+
+  // emojis
+  .addRange(0x1F300, 0x1F5FF) // Miscellaneous Symbols and Pictographs (1F300-1F5FF)
+  .addRange(0x1F3FB, 0x1F3FF) // Emoji Modifier Fitzpatrick (skin tones) (1F3FB–1F3FF) -- already inside 1F300-1F5FF above; listed for documentation
+  .addRange(0x1F600, 0x1F64F) // Emoticons (1F600–1F64F)
+  .addRange(0x1F680, 0x1F6FF) // Transport and Map Symbols (1F680-1F6FF)
+  .addRange(0x1F900, 0x1F9FF) // Supplemental Symbols and Pictographs (1F900-1F9FF)
+  .toRegExp('g'); // global flag so String#replace substitutes every occurrence
+
+function normalize(str) {
+  // Canonicalises a raw string: NFKC form, then control codes, odd spaces, symbols and stray marks are filtered out.
+  // sanity checking: anything that is not a string is returned unchanged
+  if(!_.isString(str)){ return str; }
+
+  return str
+    .normalize('NFKC') // compatibility composition runs first, so the filters below see composed text
+    .replace(CONTROL_CODES, '') // delete C0/C1 control characters and DEL
+    .replace(ALTERNATE_SPACES, ' ') // map each non-standard space to a plain space (runs are not collapsed here)
+    .replace(MISC_UNSUPPORTED_SYMBOLS, '') // delete the symbol and emoji ranges
+    .replace(COMBINING_MARKS, ''); // delete combining marks, ZWJ and variation selectors that survived composition
+}
+
+/**
+ * Converts alphabetic, numeric, and symbolic characters that are not
+ * in the Basic Latin Unicode block (the first 128 code points, i.e. ASCII)
+ * to their ASCII equivalent, if one exists. For example, it changes à to a.
+ */
+function fold(str) {
+
+  // sanity checking: anything that is not a string is returned unchanged
+  if (!_.isString(str)) { return str; }
+
+  return accentsDiacritics.remove(str) // third-party folding pass (remove-accents-diacritics); its exact mapping is not visible here
+    .normalize('NFD') // decompose, so any accents still attached become separate combining marks
+    .replace(COMBINING_MARKS, '') // strip those marks; this also drops decomposed dakuten/handakuten (U+3099/U+309A)
+    .normalize('NFKC'); // recompose whatever remains into compatibility-composed form
+}
+
+module.exports.normalize = normalize;
+module.exports.fold = fold;
diff --git a/package.json b/package.json
@@ -35,14 +35,15 @@
     "async": "^3.0.1",
     "better-sqlite3": "^7.4.3",
     "express": "^4.15.2",
-    "lodash": "^4.17.4",
+    "lodash": "^4.17.21",
     "lower-case": "^2.0.0",
     "morgan": "^1.9.0",
     "pelias-blacklist-stream": "^1.1.0",
     "pelias-config": "^4.5.0",
     "pelias-logger": "^1.2.1",
     "pelias-whosonfirst": "^5.0.0",
-    "remove-accents": "^0.4.0",
+    "regenerate": "^1.4.2",
+    "remove-accents-diacritics": "^1.0.2",
     "require-dir": "^1.0.0",
     "sorted-intersect": "^0.1.4",
     "split2": "^3.0.0",

diff --git a/prototype/query.js b/prototype/query.js
@@ -2,7 +2,6 @@
 var async = require('async');
 var util = require('util');
 var Result = require('../lib/Result');
-var sorted = require('../lib/sorted');
 var debug = false;
 
 function reduce( index, res ){

diff --git a/prototype/tokenize.js b/prototype/tokenize.js
@@ -1,9 +1,9 @@
 
 // plugin for tokenize
-const _ = require('lodash'),
-    async = require('async'),
-    analysis = require('../lib/analysis'),
-    permutations = require('../lib/permutations');
+const _ = require('lodash');
+const async = require('async');
+const analysis = require('../lib/analysis');
+const permutations = require('../lib/permutations');
 
 function tokenize(input, cb){
 

diff --git a/test/lib/analysis.js b/test/lib/analysis.js
@@ -15,14 +15,18 @@ module.exports.normalize = function(test, common) {
   // Punctuation substitutions
   assert( 'Straße', [ 'strasse' ] );
   assert( 'Jǿ œ̆', [ 'jo oe' ] );
+  assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] );
+  assert( 'z︠h︡ovkva', [ 'zhovkva' ] );
+  assert( 'Žovkva', [ 'zovkva' ] );
+  assert( 'Żółkiew', [ 'zolkiew' ] );
   assert( 'Trinidad & Tobago', [ 'trinidad and tobago' ] );
 
   // Tests to confirm the order of function execution
   // see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570
-  test( 'order of execution', function(t) {
-    t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'i̇nceyol' ] );
-    t.equal( analysis.normalize( 'İnceyol' )[0].length, 8 );
-    t.equal( analysis.normalize( 'İ' )[0].length, 2 );
+  test('order of execution', function(t) {
+    t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] );
+    t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 );
+    t.equal( analysis.normalize( 'İ' )[0].length, 1 );
     t.end();
   });
 

diff --git a/test/prototype/tokenize_integration.js b/test/prototype/tokenize_integration.js
@@ -11,7 +11,7 @@ module.exports.tokenize = function(test, util) {
 
   assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]);
   assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]);
-  assert('ケープタウン 南アフリカ', [['ケープタウン', '南アフリカ']]);
+  assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]);
 
   // duplicates
   assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]);