From 509fbe2e1acaa35d9128252952ef5398cdda3d07 Mon Sep 17 00:00:00 2001 From: Julian Simioni Date: Fri, 20 Sep 2019 10:32:53 -0400 Subject: [PATCH] feat(filters): Remove complicated housenumber filter This removes the complex filtering logic known as the "complicated housenumber filter", or sometimes even the "goldbergian housenumber filter", originally described in https://github.com/pelias/schema/pull/133 and merged as part of our ES2 upgrade (https://github.com/pelias/schema/pull/138). The housenumber filter was made quite a long time ago, with the intention of removing partial numeric tokens. It's quite a lot of code, rather hard to understand, and may actually cause more problems than it solves. In particular, fully numeric postal codes cannot be found with autocomplete until the postal code is entered completely (see https://github.com/pelias/pelias/issues/676). Since this code was written, we've improved autocomplete considerably, and having some tokens missing could be resulting in autocomplete not doing its job as well as it could. However, there could also be downsides: this will increase the hit count for numeric inputs dramatically, so it could cause more slow queries. 
--- .../analyzer_peliasIndexOneEdgeGram.js | 16 +++---- settings.js | 47 ------------------- test/fixtures/expected.json | 41 ---------------- test/settings.js | 6 --- 4 files changed, 8 insertions(+), 102 deletions(-) diff --git a/integration/analyzer_peliasIndexOneEdgeGram.js b/integration/analyzer_peliasIndexOneEdgeGram.js index 03ee39cd..27b0f643 100644 --- a/integration/analyzer_peliasIndexOneEdgeGram.js +++ b/integration/analyzer_peliasIndexOneEdgeGram.js @@ -159,7 +159,7 @@ module.exports.tests.functional = function(test, common){ ]); assertAnalysis('address', '101 mapzen place', [ - '0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place' + '0:1', '0:10', '0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place' ]); suite.run( t.end ); @@ -189,23 +189,23 @@ module.exports.tests.address = function(test, common){ suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up assertAnalysis( 'address', '101 mapzen place', [ - '0:101', + '0:1', '0:10', '0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place' ]); assertAnalysis( 'address', '30 w 26 st', [ - '0:30', + '0:3', '0:30', '1:w', '1:we', '1:wes', '1:west', - '2:26', + '2:2', '2:26', '3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street' ]); assertAnalysis( 'address', '4B 921 83 st', [ - '0:4b', - '2:921', // @todo: this token position is incorrect - '3:83', - '4:s', '4:st', '4:str', '4:stre', '4:stree', '4:street' + '0:4', '0:4b', + '1:9', '1:92', '1:921', + '2:8', '2:83', + '3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street' ]); suite.run( t.end ); diff --git a/settings.js b/settings.js index 94a43d36..700a343f 100644 --- a/settings.js +++ b/settings.js @@ -63,13 +63,7 @@ function generate(){ "ampersand", "remove_ordinals", "removeAllZeroNumericPrefix", - "surround_single_characters_with_word_markers", - 
"house_number_word_delimiter", - "remove_single_characters", - "surround_house_numbers_with_word_markers", "peliasOneEdgeGramFilter", - "eliminate_tokens_starting_with_word_marker", - "remove_encapsulating_word_markers", "unique_only_same_position", "notnull" ] @@ -216,47 +210,6 @@ function generate(){ "pattern": " +", "replacement": " " }, - - // START OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS - // @see: https://github.com/pelias/schema/pull/133 - // note: we use \x02 (start-of-text) and \x03 (end-of-text) characters to mark word borders - "surround_single_characters_with_word_markers":{ - "description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)", - "type": "pattern_replace", - "pattern": "^(.{1})$", - "replacement": "\x02$1\x03" - }, - "house_number_word_delimiter": { - "description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]", - "type": "word_delimiter", - "split_on_numerics": "true", - "preserve_original": "true" - }, - "remove_single_characters": { - "description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number", - "type": "length", - "min": 2 - }, - "surround_house_numbers_with_word_markers": { - "description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming", - "type": "pattern_replace", - "pattern": "^([0-9]+[a-z]?)$", - "replacement": "\x02$1\x03" - }, - "eliminate_tokens_starting_with_word_marker": { - "description": "remove tokens starting but not ending with markers, saves whole house numbers wrapped in markers", - "type": "pattern_replace", - "pattern": "^\x02(.*[^\x03])?$", - "replacement": "" - }, - "remove_encapsulating_word_markers": { - "description": "extract the stuff between the markers, extract 14 from \x0214\x03 since we're done the prefix n-gramming 
step", - "type": "pattern_replace", - "pattern": "^\x02(.*)\x03$", - "replacement": "$1" - } - // END OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS - // more generated below }, "char_filter": { diff --git a/test/fixtures/expected.json b/test/fixtures/expected.json index f480bf9d..8ed7719c 100644 --- a/test/fixtures/expected.json +++ b/test/fixtures/expected.json @@ -46,13 +46,7 @@ "ampersand", "remove_ordinals", "removeAllZeroNumericPrefix", - "surround_single_characters_with_word_markers", - "house_number_word_delimiter", - "remove_single_characters", - "surround_house_numbers_with_word_markers", "peliasOneEdgeGramFilter", - "eliminate_tokens_starting_with_word_marker", - "remove_encapsulating_word_markers", "unique_only_same_position", "notnull" ] @@ -220,41 +214,6 @@ "pattern": " +", "replacement": " " }, - "surround_single_characters_with_word_markers": { - "description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)", - "type": "pattern_replace", - "pattern": "^(.{1})$", - "replacement": "\u0002$1\u0003" - }, - "house_number_word_delimiter": { - "description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]", - "type": "word_delimiter", - "split_on_numerics": "true", - "preserve_original": "true" - }, - "remove_single_characters": { - "description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number", - "type": "length", - "min": 2 - }, - "surround_house_numbers_with_word_markers": { - "description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming", - "type": "pattern_replace", - "pattern": "^([0-9]+[a-z]?)$", - "replacement": "\u0002$1\u0003" - }, - "eliminate_tokens_starting_with_word_marker": { - "description": "remove tokens starting but not ending with markers, saves whole house 
numbers wrapped in markers", - "type": "pattern_replace", - "pattern": "^\u0002(.*[^\u0003])?$", - "replacement": "" - }, - "remove_encapsulating_word_markers": { - "description": "extract the stuff between the markers, extract 14 from \u000214\u0003 since we're done the prefix n-gramming step", - "type": "pattern_replace", - "pattern": "^\u0002(.*)\u0003$", - "replacement": "$1" - }, "ampersand": { "type": "synonym", "synonyms": [ diff --git a/test/settings.js b/test/settings.js index d932ff3e..0a8b1e86 100644 --- a/test/settings.js +++ b/test/settings.js @@ -85,13 +85,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) { "ampersand", "remove_ordinals", "removeAllZeroNumericPrefix", - "surround_single_characters_with_word_markers", - "house_number_word_delimiter", - "remove_single_characters", - "surround_house_numbers_with_word_markers", "peliasOneEdgeGramFilter", - "eliminate_tokens_starting_with_word_marker", - "remove_encapsulating_word_markers", "unique_only_same_position", "notnull" ]);