Skip to content

Commit d87a7e8

Browse files
committed
Python server using N-Grams and Minimum Edit Distance
1 parent 2dcc310 commit d87a7e8

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

server.py

+104
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
Created on Thu Oct 26 16:54:56 2017
4+
5+
@author: VARUN
6+
"""
7+
8+
from flask import Flask, render_template, jsonify,request
9+
from pprint import pprint
10+
import simplejson as json
11+
import sys
12+
from nltk.corpus import brown
13+
from nltk.corpus import reuters
14+
import nltk
15+
from nltk.corpus import PlaintextCorpusReader
16+
17+
18+
def get_trigram_freq(tokens):
    """Build a trigram model mapping a word pair to what follows it.

    Args:
        tokens: Iterable of word tokens in corpus order.

    Returns:
        An ``nltk.ConditionalFreqDist`` whose conditions are ``(w1, w2)``
        tuples and whose samples are the words observed right after that
        pair. It has no conditions when ``tokens`` yields fewer than three
        items.
    """
    # Stream the trigrams straight into the distribution. Unzipping them into
    # three columns first raises ValueError when there are no trigrams at all
    # and holds several full-length temporary lists of the corpus in memory.
    return nltk.ConditionalFreqDist(
        ((first, second), third)
        for first, second, third in nltk.trigrams(tokens)
    )
24+
25+
def get_bigram_freq(tokens):
    """Build a bigram model mapping a word to what follows it.

    Args:
        tokens: Iterable of word tokens in corpus order.

    Returns:
        An ``nltk.ConditionalFreqDist`` whose conditions are single words and
        whose samples are the words observed immediately after them.
    """
    successors = nltk.ConditionalFreqDist()
    for previous_word, next_word in nltk.bigrams(tokens):
        successors[previous_word][next_word] += 1
    return successors
29+
30+
def appendwithcheck(preds, to_append):
    """Append ``to_append`` to ``preds`` unless its word is already listed.

    Entries are compared on their first element only, so an entry whose
    first element matches an existing one is dropped even if the rest of
    the entry differs.

    Args:
        preds: List of indexable entries; modified in place.
        to_append: Candidate entry to add.

    Returns:
        None.
    """
    already_listed = any(entry[0] == to_append[0] for entry in preds)
    if not already_listed:
        preds.append(to_append)
35+
36+
def incomplete_pred(words, n, limit=3):
    """Suggest completions for the partially typed word ``words[n-1]``.

    Candidates are the words that follow ``words[n-2]`` in the bigram model
    ``bgs_freq``. Candidates starting with the partial word are taken first,
    most frequent first; if fewer than ``limit`` are found, the remaining
    slots are filled with the candidates closest to the partial word by edit
    distance (with transpositions), skipping exact matches.

    Args:
        words: Words typed so far; the last one used is the partial word.
        n: Number of words to consider, normally ``len(words)``.
        limit: Maximum number of suggestions to return (default 3).

    Returns:
        A list of at most ``limit`` ``(word, score)`` tuples. ``score`` is the
        bigram count for prefix matches and the edit distance for fallback
        matches. The list is empty when ``n`` or ``limit`` is below 1.
    """
    if n < 1 or limit < 1:
        # Nothing typed yet: words[n-1] would raise IndexError.
        return []

    partial = words[n - 1]
    # NOTE: when n is 1, words[n - 2] is words[-1], so the partial word also
    # serves as its own context; kept as in the original implementation.
    context = words[n - 2]

    # Test membership before indexing: indexing a ConditionalFreqDist with an
    # unseen condition would insert an empty entry into the shared model.
    all_succeeding = bgs_freq[context].most_common() if context in bgs_freq else []

    preds = []
    for candidate in all_succeeding:
        if candidate[0].startswith(partial):
            appendwithcheck(preds, candidate)
            if len(preds) >= limit:
                return preds

    # Not enough prefix matches: rank every candidate by edit distance.
    # sorted() is stable, so ties keep their frequency order.
    by_distance = sorted(
        (
            (word, nltk.edit_distance(word, partial, transpositions=True))
            for word, _count in all_succeeding
        ),
        key=lambda scored: scored[1],
    )
    for scored in by_distance:
        if len(preds) >= limit:
            break
        # Distance 0 means the candidate equals the partial word itself.
        if scored[1] > 0:
            appendwithcheck(preds, scored)

    return preds
63+
64+
# Token stream for the language model, loaded once at import time: the Brown
# corpus followed by the words of my_corpus.txt, which is looked up relative
# to the current working directory.
new_corpus = PlaintextCorpusReader('./', '.*')
tokens = brown.words() + new_corpus.words('my_corpus.txt')

# Frequency distributions over every bigram and trigram of the token stream;
# the request handlers below query these.
bgs_freq = get_bigram_freq(tokens)
tgs_freq = get_trigram_freq(tokens)

app = Flask(__name__)
74+
75+
76+
@app.route("/test")
def output():
    """Serve the ``index.html`` template at ``/test``."""
    page = render_template("index.html")
    return page
79+
80+
81+
@app.route('/output', methods=['GET'])
def worker():
    """Return word suggestions for the text typed so far, as a JSON array.

    Query parameters:
        string: The text entered so far. A missing parameter is treated as
            empty text.
        work: ``'pred'`` requests the word that follows ``string``; any other
            value requests completions of its last, partially typed word.

    Returns:
        A JSON-encoded list of ``[word, score]`` pairs: up to five for
        ``'pred'``, otherwise whatever ``incomplete_pred`` returns. The list
        is empty when ``string`` contains no words.
    """
    # A missing 'string' parameter would otherwise fail on None.split().
    words = (request.args.get('string') or '').split()
    n = len(words)
    if not words:
        # No context to predict from; answer with an empty list rather than
        # returning None (which Flask rejects) or indexing an empty list.
        return json.dumps([])

    if request.args.get('work') != 'pred':
        return json.dumps(incomplete_pred(words, n))

    if n == 1:
        # Look up the token, not the raw string: surrounding whitespace in
        # the raw text would never match a corpus word.
        model, context = bgs_freq, words[0]
    else:
        model, context = tgs_freq, (words[-2], words[-1])

    # Test membership before indexing: indexing a ConditionalFreqDist with an
    # unseen condition would insert an empty entry into the shared model.
    following = model[context].most_common(5) if context in model else []
    return json.dumps(following)
101+
102+
103+
if __name__ == "__main__":
    # Start the Flask server only when this file is executed directly.
    app.run()

0 commit comments

Comments
 (0)