-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark.rb
93 lines (86 loc) · 4.1 KB
/
benchmark.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# rubocop:disable Style/FrozenStringLiteralComment
require 'benchmark'
require 'moji'
require_relative '../lib/neologdish/normalizer'
# original implementation came from https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja
# Reference NEologd-style normalizer, used as the baseline in this benchmark.
#
# The argument is never modified: the first step works on a copy, so the same
# string can be normalized repeatedly and frozen strings are accepted.
#
# @param norm [String] text to normalize
# @return [String] a new string holding the normalized text
def original_normalize_neologd(norm)
  # Non-destructive `tr`: `tr!` here would rewrite the caller's string in place
  # (and raise FrozenError on frozen input).
  # NOTE(review): the source and replacement sets below look identical as
  # written - confirm the intended characters survived the file's encoding.
  norm = norm.tr('0-9A-Za-z', '0-9A-Za-z')
  norm = Moji.han_to_zen(norm, Moji::HAN_KATA)
  # Replace the listed hyphen-like characters with '-'.
  hypon_reg = /(?:˗|֊|‐|‑|‒|–|⁃|⁻|₋|−)/
  norm.gsub!(hypon_reg, '-')
  # Replace the listed dash/bar characters with the long-vowel mark.
  choon_reg = /(?:﹣|-|ー|—|―|─|━)/
  norm.gsub!(choon_reg, 'ー')
  # Remove the listed tilde-like characters.
  chil_reg = /(?:~|∼|∾|〜|〰|~)/
  norm.gsub!(chil_reg, '')
  # Collapse a run of long-vowel marks into a single one.
  norm.gsub!(/ー+/, 'ー')
  norm.tr!(%q{!"#$%&'()*+,-.\/:;<=>?@[¥]^_`{|}~。、・「」"}, '!”#$%&’()*+,-./:;<=>?@[¥]^_`{|}〜。、・「」')
  norm.gsub!(/ /, ' ')
  # Collapse runs of spaces, then drop leading and trailing spaces on each line.
  norm.gsub!(/ {1,}/, ' ')
  norm.gsub!(/^ +(.+?)$/, '\\1')
  norm.gsub!(/^(.+?) +$/, '\\1')
  # The loops below repeat until no match is left, because one gsub! pass can
  # leave behind a space whose neighbour was consumed by the previous match.
  # rubocop:disable Layout/LineLength
  # Remove a single space between two CJK / kana / half- and full-width-form runs.
  while norm =~ /([\p{InCjkUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+?) {1}([\p{InCjkUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+?)/
    norm.gsub!(
      /([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+?) {1}([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+?)/, '\\1\\2'
    )
  end
  # Remove a single space between a Basic Latin run and a following CJK-class run.
  while norm =~ /(\p{InBasicLatin}+) {1}([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+)/
    norm.gsub!(
      /(\p{InBasicLatin}+) {1}([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+)/, '\\1\\2'
    )
  end
  # Remove a single space between a CJK-class run and a following Basic Latin run.
  while norm =~ /([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+) {1}(\p{InBasicLatin}+)/
    norm.gsub!(
      /([\p{InCJKUnifiedIdeographs}\p{InHiragana}\p{InKatakana}\p{InHalfwidthAndFullwidthForms}\p{InCJKSymbolsAndPunctuation}]+) {1}(\p{InBasicLatin}+)/, '\\1\\2'
    )
  end
  # rubocop:enable Layout/LineLength
  # Final character-for-character mapping of the punctuation set.
  norm.tr!(
    '!”#$%&’()*+,-./:;<>?@[¥]^_`{|}〜',
    %q{!"#$%&'()*+,-.\/:;<>?@[¥]^_`{|}~}
  )
  norm
end
# Sample inputs passed to both normalizers on every benchmark iteration.
samples = [
  '0123456789',
  'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
  'abcdefghijklmnopqrstuvwxyz',
  '!”#$%&’()*+,−./:;<>?@[¥]^_`{|}',
  '=。、・「」',
  'ハンカクダヨ',
  'o₋o',
  'majika━',
  'わ〰い',
  'スーパーーーー',
  'スーパーーーーマーーーーケット',
  '!#',
  'ゼンカク スペース',
  'お お',
  ' おお',
  'おお ',
  '検索 エンジン 自作 入門 を 買い ました!!!',
  'アルゴリズム C',
  ' PRML 副 読 本 ',
  'Coding the Matrix',
  'consecutive spaces are in latin words',
  'full width spaces are in latin words',
  '南アルプスの 天然水 Sparking Lemon レモン一絞り',
  '南アルプスの 天然水- Sparking* Lemon+ レモン一絞り',
  'ツギノ「ヌ゚」ハオカシナモジデス'
]

# Number of full passes over the sample list in each report.
iterations = 10_000

# Time both implementations over the same samples; 20 is the label column width.
Benchmark.bm(20) do |bench|
  bench.report('original normalizer:') do
    iterations.times do
      samples.each { |sample| original_normalize_neologd(sample) }
    end
  end

  bench.report('this library:') do
    iterations.times do
      samples.each { |sample| Neologdish::Normalizer.normalize(sample) }
    end
  end
end
# rubocop:enable Style/FrozenStringLiteralComment