Skip to content

Commit

Permalink
feat(filters): Remove complicated housenumber filter
Browse files Browse the repository at this point in the history
This removes the complex filtering logic known as the "complicated
housenumber filter", or sometimes even the "goldbergian housenumber
filter", originally described in
#133 and merged as part of our ES2
upgrade (#138).

The housenumber filter was made quite a long time ago, with the
intention of removing partial numeric tokens.

It's quite a lot of code, rather hard to understand, and may actually
cause more problems than it solves.

In particular, fully numeric postal codes cannot be found with
autocomplete until the postal code is entered completely (see
pelias/pelias#676).

Since this code was written, we've improved autocomplete considerably,
and having some tokens missing could be resulting in autocomplete not
doing its job as well as it could.

However, there could also be downsides: this will increase the hit count
for numeric inputs dramatically, so it could cause more slow queries.
  • Loading branch information
orangejulius authored and missinglink committed Nov 20, 2019
1 parent 4bc4ea2 commit 509fbe2
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 102 deletions.
16 changes: 8 additions & 8 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ module.exports.tests.functional = function(test, common){
]);

assertAnalysis('address', '101 mapzen place', [
'0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place'
'0:1', '0:10', '0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place'
]);

suite.run( t.end );
Expand Down Expand Up @@ -189,23 +189,23 @@ module.exports.tests.address = function(test, common){
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

assertAnalysis( 'address', '101 mapzen place', [
'0:101',
'0:1', '0:10', '0:101',
'1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen',
'2:p', '2:pl', '2:pla', '2:plac', '2:place'
]);

assertAnalysis( 'address', '30 w 26 st', [
'0:30',
'0:3', '0:30',
'1:w', '1:we', '1:wes', '1:west',
'2:26',
'2:2', '2:26',
'3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street'
]);

assertAnalysis( 'address', '4B 921 83 st', [
'0:4b',
'2:921', // @todo: this token position is incorrect
'3:83',
'4:s', '4:st', '4:str', '4:stre', '4:stree', '4:street'
'0:4', '0:4b',
'1:9', '1:92', '1:921',
'2:8', '2:83',
'3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street'
]);

suite.run( t.end );
Expand Down
47 changes: 0 additions & 47 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,7 @@ function generate(){
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]
Expand Down Expand Up @@ -216,47 +210,6 @@ function generate(){
"pattern": " +",
"replacement": " "
},

// START OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS
// @see: https://github.com/pelias/schema/pull/133
// note: we use \x02 (start-of-text) and \x03 (end-of-text) characters to mark word borders
"surround_single_characters_with_word_markers":{
"description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)",
"type": "pattern_replace",
"pattern": "^(.{1})$",
"replacement": "\x02$1\x03"
},
"house_number_word_delimiter": {
"description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]",
"type": "word_delimiter",
"split_on_numerics": "true",
"preserve_original": "true"
},
"remove_single_characters": {
"description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number",
"type": "length",
"min": 2
},
"surround_house_numbers_with_word_markers": {
"description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming",
"type": "pattern_replace",
"pattern": "^([0-9]+[a-z]?)$",
"replacement": "\x02$1\x03"
},
"eliminate_tokens_starting_with_word_marker": {
"description": "remove tokens starting but not ending with markers, saves whole house numbers wrapped in markers",
"type": "pattern_replace",
"pattern": "^\x02(.*[^\x03])?$",
"replacement": ""
},
"remove_encapsulating_word_markers": {
"description": "extract the stuff between the markers, extract 14 from \x0214\x03 since we're done the prefix n-gramming step",
"type": "pattern_replace",
"pattern": "^\x02(.*)\x03$",
"replacement": "$1"
}
// END OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS

// more generated below
},
"char_filter": {
Expand Down
41 changes: 0 additions & 41 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,7 @@
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]
Expand Down Expand Up @@ -220,41 +214,6 @@
"pattern": " +",
"replacement": " "
},
"surround_single_characters_with_word_markers": {
"description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)",
"type": "pattern_replace",
"pattern": "^(.{1})$",
"replacement": "\u0002$1\u0003"
},
"house_number_word_delimiter": {
"description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]",
"type": "word_delimiter",
"split_on_numerics": "true",
"preserve_original": "true"
},
"remove_single_characters": {
"description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number",
"type": "length",
"min": 2
},
"surround_house_numbers_with_word_markers": {
"description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming",
"type": "pattern_replace",
"pattern": "^([0-9]+[a-z]?)$",
"replacement": "\u0002$1\u0003"
},
"eliminate_tokens_starting_with_word_marker": {
"description": "remove tokens starting but not ending with markers, saves whole house numbers wrapped in markers",
"type": "pattern_replace",
"pattern": "^\u0002(.*[^\u0003])?$",
"replacement": ""
},
"remove_encapsulating_word_markers": {
"description": "extract the stuff between the markers, extract 14 from \u000214\u0003 since we're done the prefix n-gramming step",
"type": "pattern_replace",
"pattern": "^\u0002(.*)\u0003$",
"replacement": "$1"
},
"ampersand": {
"type": "synonym",
"synonyms": [
Expand Down
6 changes: 0 additions & 6 deletions test/settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]);
Expand Down

0 comments on commit 509fbe2

Please sign in to comment.