-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocument.py
33 lines (29 loc) · 1009 Bytes
/
document.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from os import listdir
import re
from os.path import isfile, join
from helper import *
class Document:
    """A text document loaded from disk and searched for gazetteer entries.

    The full file contents are kept in ``self.text``.
    """

    def __init__(self, filepath):
        """Read the whole file at *filepath* into ``self.text``.

        The file is opened in text mode with the platform default encoding.
        """
        # Context manager so the handle is released even if read() raises.
        with open(filepath, "r") as f:
            self.text = f.read()

    def find_rules(self, gazetteer):
        """Collect context-word rules around occurrences of gazetteer entries.

        For each entry in *gazetteer*, every non-overlapping occurrence in
        ``self.text`` is located with ``str.find``. Occurrences for which
        ``subword_filter`` returns a truthy value are skipped (presumably
        matches embedded inside a larger word -- confirm in ``helper``).
        For each remaining occurrence, the word after and the word before
        the match are fetched with ``get_next_word`` / ``get_prev_word``.

        Args:
            gazetteer: iterable of strings to search for. Empty entries are
                ignored.

        Returns:
            A list of 2-tuples: ``(prev_word, "")`` for a preceding word and
            ``("", next_word)`` for a following word. A word only yields a
            rule when it is longer than one character. Duplicates are kept.
        """
        text = self.text
        rules = []
        for ne in gazetteer:
            # str.find("", pos) returns pos, so an empty entry would never
            # advance the cursor and the scan would loop forever.
            if not ne:
                continue
            width = len(ne)
            traverse = 0
            while traverse < len(text):
                index = text.find(ne, traverse)
                if index < 0:
                    break
                # Resume after this match so occurrences never overlap.
                traverse = index + width
                if subword_filter(text, index, ne):
                    continue
                next_word = get_next_word(text, index + width)
                prev_word = get_prev_word(text, index)
                if len(prev_word) > 1:
                    rules.append((prev_word, ""))
                if len(next_word) > 1:
                    rules.append(("", next_word))
        return rules