Skip to content

Commit

Permalink
feature(punctuation): amend punctuation list
Browse files Browse the repository at this point in the history
  • Loading branch information
SiarheiFedartsou authored and missinglink committed Sep 26, 2024
1 parent 9fb43bc commit f08bc0b
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 17 deletions.
3 changes: 2 additions & 1 deletion integration/analyzer_peliasPhrase.js
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ module.exports.tests.analyze = function(test, common){
// remove punctuation (handled by the char_filter)
assertAnalysis( 'punctuation', punctuation.all.join(''), ['0:&', '0:and', '0:und'] );
assertAnalysis( 'punctuation', 'Hawai‘i', ['hawaii'] );

assertAnalysis( 'punctuation - « in between', '«res»pub«lika»', ['respublika'] );

assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );

Expand Down
21 changes: 8 additions & 13 deletions punctuation.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
// These characters will be removed from ngrams/shingles
// @see: org/apache/lucene/analysis/cn/smart/stopwords.txt

module.exports.all = [
".","`","‘","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*",
"#","&","^","$","@","!","~",":",";","+","《","》","—","-",",","。",
"、", ":",";","!","·","?","„","“","”",")","(","【","】","[","]","●"
const all = [
".","`","‘","’","‛","-","_","=","?","'","|","\"","(",")","{","}","[","]","<",">","*",
"#","&","^","$","@","!","~",":",";","+","《","》","—","-",",","。","‹","›","⹂","〝","〞",
"、", ":",";","!","·","?","„","“","”","‟",")","(","【","】","[","]","●","«","»"
];

module.exports.allowed = [
const allowed = [
"-", // allow hyphens
"&" // allow ampersands
];

module.exports.blacklist = module.exports.all.slice();

// remove alowed chars from blacklist
module.exports.allowed.forEach(function(item){
var index = module.exports.blacklist.indexOf(item);
if( index > -1 ){
module.exports.blacklist.splice(index, 1);
}
});
const blacklist = all.filter(s => !allowed.includes(s));

module.exports = { all, allowed, blacklist };
14 changes: 12 additions & 2 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -2277,6 +2277,8 @@
".=>",
"`=>",
"‘=>",
"’=>",
"‛=>",
"_=>",
"==>",
"?=>",
Expand Down Expand Up @@ -2307,6 +2309,11 @@
"-=>",
",=>",
"。=>",
"‹=>",
"›=>",
"⹂=>",
"〝=>",
"〞=>",
"、=>",
":=>",
";=>",
Expand All @@ -2316,13 +2323,16 @@
"„=>",
"“=>",
"”=>",
"‟=>",
")=>",
"(=>",
"【=>",
"】=>",
"[=>",
"]=>",
"●=>"
"●=>",
"«=>",
"»=>"
]
},
"alphanumeric": {
Expand Down Expand Up @@ -3023,4 +3033,4 @@
},
"dynamic": "strict"
}
}
}
2 changes: 1 addition & 1 deletion test/settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ module.exports.tests.punctuationCharFilter = function(test, common) {
var char_filter = s.analysis.char_filter.punctuation;
t.equal(char_filter.type, 'mapping');
t.true(Array.isArray(char_filter.mappings));
t.equal(char_filter.mappings.length, 49);
t.equal(char_filter.mappings.length, 59);
t.end();
});
};
Expand Down

0 comments on commit f08bc0b

Please sign in to comment.