-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmakengram.py
executable file
·93 lines (83 loc) · 3.59 KB
/
makengram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/python3
# makengram.py
# script to make ngram models from the raw text
# by Anup pokhrel
# http://virtualanup.com/nepali-ngram-models/
from collections import defaultdict
import sys
class ngrammodel:
    """Count word n-grams in Nepali (Devanagari) text.

    Counts are accumulated in ``self.words``, a ``defaultdict(int)`` that maps
    a space-joined word sequence (e.g. ``'राम घर'`` for a bigram) to the
    number of times it was seen.
    """

    # Characters that terminate a sentence. There is no simple regex for a
    # Nepali "word", so text is first cut into sentences on these symbols.
    _SENTENCE_END_CHARS = '?!।;\n\r'
    # Translation table mapping every sentence terminator to '.', built once
    # instead of running one str.replace() per symbol on every call.
    _SENTENCE_END_TABLE = str.maketrans(
        _SENTENCE_END_CHARS, '.' * len(_SENTENCE_END_CHARS))

    # Punctuation inside a sentence that is treated as a word separator.
    _WORD_SEPARATOR_CHARS = '-,\'"\t()<>‘’“”–'
    _WORD_SEPARATOR_TABLE = str.maketrans(
        _WORD_SEPARATOR_CHARS, ' ' * len(_WORD_SEPARATOR_CHARS))

    # A token only counts as a word if it contains at least one of these
    # characters: Devanagari consonants, independent vowels, Devanagari
    # digits, or '#' (the sentence-boundary marker added for n > 1).
    # NOTE: because '#' is in this set, a literal '#' in the input text is
    # also accepted as a word character.
    _VALID_LETTERS = frozenset(
        'कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह'
        'अआइईउऊएऐओऔ'
        '०१२३४५६७८९'
        '#'
    )

    def __init__(self, n):
        """Create an empty model.

        n is the model order: 1 for unigram, 2 for bigram and so on.
        """
        self.n = n
        self.words = defaultdict(int)  # n-gram string -> occurrence count

    def processarticle(self, article):
        """Split *article* into sentences and count the n-grams of each.

        Every sentence terminator is normalised to '.', the text is split on
        '.', and each piece longer than 10 characters is passed to
        processsentence(); shorter pieces are ignored.
        """
        normalised = article.translate(self._SENTENCE_END_TABLE)
        for sentence in normalised.split('.'):
            # sentence must be of enough length
            if len(sentence) > 10:
                self.processsentence(sentence)

    def processsentence(self, sentence):
        """Split *sentence* into words and count every n-gram in it.

        Separator punctuation is replaced by spaces and the sentence is split
        on single spaces. For n > 1 the word list is wrapped in '#' markers
        so that sentence starts and ends appear in the n-grams. Tokens that
        contain none of the valid letters (empty strings, Latin words, stray
        symbols) are skipped and do not break the n-gram window.
        """
        tokens = sentence.translate(self._WORD_SEPARATOR_TABLE).split(' ')
        if self.n > 1:
            # record the start and end of sentences by #
            tokens = ['#'] + tokens + ['#']

        window = []  # the most recent valid words, at most n of them
        for token in tokens:
            if self._VALID_LETTERS.isdisjoint(token):
                continue  # not a word: no valid letter in it
            window.append(token)
            if len(window) == self.n:
                self.words[' '.join(window)] += 1
                del window[0]  # slide the window forward by one word

    def readfile(self, file):
        """Feed every line yielded by iterating *file* to processarticle()."""
        for line in file:
            self.processarticle(line)

    def saveoutput(self, file):
        """Write the counts to *file*, one ``<n-gram> <count>`` per line.

        Lines are ordered by descending count; n-grams with equal counts keep
        the order in which they were first seen (the sort is stable).
        """
        ranked = sorted(self.words.items(),
                        key=lambda item: item[1], reverse=True)
        for wordseq, count in ranked:
            file.write(wordseq + ' ' + str(count) + "\n")
if __name__ == '__main__':
    # Command line: ./makengram.py <model_number> <input_file> <output_file>
    # e.g. ./makengram.py 2 corpus.txt bigrams.txt builds a bigram model.
    if len(sys.argv) != 4:
        print("Syntax : " + sys.argv[0]
              + " <model_number> <input_file> <output_file>", len(sys.argv))
        sys.exit(1)

    # A non-numeric model number is reported like an out-of-range one
    # instead of crashing with a traceback.
    try:
        mn = int(sys.argv[1])
    except ValueError:
        print("Model number not supported")
        sys.exit(1)
    if mn < 1 or mn > 5:
        print("Model number not supported")
        sys.exit(1)

    model = ngrammodel(mn)
    # The corpus is Devanagari text, so read and write it explicitly as
    # UTF-8 rather than relying on the platform default encoding. The
    # output file is opened only after the input has been fully read.
    with open(sys.argv[2], encoding='utf-8') as infile:
        model.readfile(infile)
    with open(sys.argv[3], 'w', encoding='utf-8') as outfile:
        model.saveoutput(outfile)