-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathrule_util.py
63 lines (48 loc) · 1.84 KB
/
rule_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#-*- coding: utf-8 -*-
import re
# Non-greedy match of whatever sits between a 《 and the nearest following 》.
# The pattern contains no backslashes, so a plain u"" literal is equivalent to
# the Python 2-only ur"" prefix while also being valid Python 3 syntax.
quote_pattern = re.compile(u"《(.*?)》")


class RuleUtil(object):
    """Stateless helpers that rewrite whitespace-tokenised titles.

    Every method is a classmethod and keeps no state on the class. The code
    is written to run unchanged on Python 2 and Python 3.
    """

    # Separator that joins the two parts of a compound (foreign) name.
    _NAME_SEPARATOR = "·"

    @classmethod
    def process_text(cls, text):
        """Collapse the spaces inside every 《…》 span of *text*.

        Each bracketed span is rewritten as ``《 <content without spaces> 》``,
        i.e. the content becomes a single token padded by one space on each
        side. Text outside the brackets is left untouched.

        Args:
            text: Unicode text, possibly containing 《…》 spans.

        Returns:
            The rewritten text. *text* itself is returned unchanged (and
            nothing is printed) unless it contains both bracket characters.
        """
        if not (u"《" in text and u"》" in text):
            return text

        def _compact(match):
            # Rewrite only the matched span. A global str.replace() on the
            # captured content would also hit identical text outside the
            # brackets, and would explode on an empty capture (《》).
            return u"《 " + match.group(1).replace(u" ", u"") + u" 》"

        text = quote_pattern.sub(_compact, text)

        # Debug trace. On Python 2 (where ``str is bytes``) hand print UTF-8
        # bytes so it never has to guess an encoding for stdout; on Python 3
        # print the text directly.
        message = u"new_text: %s" % text
        print(message.encode("utf-8") if str is bytes else message)
        return text

    @classmethod
    def add_to_keywords(cls, title, keywords, words_set, once_flag=False):
        """Append tokens of *title* that appear in *words_set* to *keywords*.

        Args:
            title: Whitespace-separated token string.
            keywords: List that is extended in place; tokens already present
                are not added again.
            words_set: Container of accepted tokens (tested with ``in``).
            once_flag: When true, stop after the first token that is added.

        Returns:
            The same *keywords* list object that was passed in.
        """
        for token in title.split():
            if token in words_set and token not in keywords:
                keywords.append(token)
                # NOTE(review): nesting reconstructed from a copy that had
                # lost its indentation - confirm the early exit belongs
                # inside the "token was added" branch.
                if once_flag:
                    break
        return keywords

    @classmethod
    def recognize_foreign_names(cls, title, names, keywords):
        """Merge a ``first · second`` token run into one compound name.

        The search only runs when the separator is a standalone token of
        *title* and at least two candidate *names* are given. The first
        ``names[i] + "·" + names[j]`` combination (outer loop over ``i``)
        that occurs in the title with its whitespace removed wins.

        Args:
            title: Whitespace-separated token string.
            names: Candidate name parts found in the title.
            keywords: List that receives the compound name (mutated in place).

        Returns:
            ``(new_title, keywords)``. On a match, ``new_title`` starts with
            the compound name followed by the remaining tokens in their
            original order; otherwise *title* is returned unchanged.
        """
        tokens = title.split()
        if cls._NAME_SEPARATOR not in tokens or len(names) < 2:
            return title, keywords

        # Join the tokens instead of stripping " " so the whitespace handling
        # agrees with split() (tabs etc. are removed as well).
        new_name = cls._find_compound_name("".join(tokens), names)
        if not new_name:
            return title, keywords

        print("new_name: " + new_name)
        keywords.append(new_name)
        # Substring test on purpose: it drops the separator token and every
        # fragment of the name. Caveat: an unrelated token that happens to be
        # a substring of the compound name is dropped too.
        remaining = [token for token in tokens if token not in new_name]
        new_title = " ".join([new_name] + remaining)
        print("new_title: %s" % new_title)
        return new_title, keywords

    @classmethod
    def _find_compound_name(cls, raw_title, names):
        """Return the first ``first·second`` pairing of *names* contained in
        *raw_title*, or an empty string when no pairing occurs."""
        for first in names:
            for second in names:
                candidate = first + cls._NAME_SEPARATOR + second
                if candidate in raw_title:
                    return candidate
        return ""
if __name__ == "__main__":
    # Manual smoke run: print the result of normalising one bracketed title.
    # The parenthesised single-argument form prints the same thing under the
    # Python 2 print statement and is valid syntax on Python 3.
    print(RuleUtil.process_text(u"《 你 好 · 呀 》"))