|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +Created on Thu Oct 26 16:54:56 2017 |
| 4 | +
|
| 5 | +@author: VARUN |
| 6 | +""" |
| 7 | + |
| 8 | +from flask import Flask, render_template, jsonify,request |
| 9 | +from pprint import pprint |
| 10 | +import simplejson as json |
| 11 | +import sys |
| 12 | +from nltk.corpus import brown |
| 13 | +from nltk.corpus import reuters |
| 14 | +import nltk |
| 15 | +from nltk.corpus import PlaintextCorpusReader |
| 16 | + |
| 17 | + |
def get_trigram_freq(tokens):
    """Build a conditional frequency distribution of trigram continuations.

    Every trigram ``(w1, w2, w3)`` found in *tokens* is recorded as the
    condition ``(w1, w2)`` with the sample ``w3``, so ``result[(w1, w2)]``
    counts the words observed immediately after that pair.

    Args:
        tokens: Sequence of word tokens.

    Returns:
        An ``nltk.ConditionalFreqDist`` keyed by ``(w1, w2)`` tuples. It is
        empty when *tokens* yields no trigrams (fewer than three tokens).
    """
    # Pair each leading bigram with its following word directly. Unzipping
    # the trigram list into three columns raises ValueError when there are
    # no trigrams, and it materialises several corpus-sized lists.
    return nltk.ConditionalFreqDist(
        ((first, second), third)
        for first, second, third in nltk.trigrams(tokens)
    )
| 24 | + |
def get_bigram_freq(tokens):
    """Build a conditional frequency distribution of bigram continuations.

    Each adjacent pair ``(w1, w2)`` in *tokens* is recorded as the condition
    ``w1`` with the sample ``w2``.

    Args:
        tokens: Sequence of word tokens.

    Returns:
        An ``nltk.ConditionalFreqDist`` keyed by the first word of each pair.
    """
    return nltk.ConditionalFreqDist(nltk.bigrams(tokens))
| 29 | + |
def appendwithcheck(preds, to_append):
    """Append *to_append* to *preds* unless its first item is already listed.

    Entries are compared on their first element only, so two entries with
    the same first element but different second elements count as
    duplicates. The list is modified in place and nothing is returned.

    Args:
        preds: List of indexable entries collected so far.
        to_append: Candidate entry to add.
    """
    already_listed = any(entry[0] == to_append[0] for entry in preds)
    if not already_listed:
        preds.append(to_append)
| 35 | + |
def incomplete_pred(words, n, limit=3):
    """Suggest completions for the partially typed last word.

    The word before the partial one (``words[n - 2]``) is used as context:
    its successors in the module-level ``bgs_freq`` distribution are
    scanned from most to least frequent, and those starting with the
    partial word are collected first. If fewer than *limit* are found, the
    remaining slots are filled with the successors closest to the partial
    word by edit distance (transpositions allowed).

    Args:
        words: List of whitespace-separated words typed so far; the last
            one is the incomplete word.
        n: Number of entries in *words*.
        limit: Maximum number of suggestions to return.

    Returns:
        A list of at most *limit* 2-tuples. Prefix matches are
        ``(word, count)`` pairs; edit-distance fallbacks are
        ``(word, distance)`` pairs. Empty when there is nothing to complete.
    """
    if n < 1 or not words:
        # Nothing typed yet: indexing words[n - 1] would raise IndexError.
        return []

    prefix = words[n - 1]
    # NOTE(review): with n == 1 the index n - 2 wraps around to the partial
    # word itself; that pre-existing behaviour is kept here unchanged.
    context = words[n - 2]

    # .get() avoids inserting an empty entry for every unseen context, which
    # plain indexing of the defaultdict-based distribution would do.
    dist = bgs_freq.get(context)
    all_succeeding = dist.most_common() if dist is not None else []

    preds = []
    for pred in all_succeeding:
        if pred[0].startswith(prefix):
            appendwithcheck(preds, pred)
            if len(preds) >= limit:
                return preds

    # Not enough prefix matches: rank every successor by edit distance.
    med = [
        (pred[0], nltk.edit_distance(pred[0], prefix, transpositions=True))
        for pred in all_succeeding
    ]
    med.sort(key=lambda item: item[1])

    # Walk the whole ranked list until the quota is filled. The previous
    # loop compared its index against len(preds) rather than len(med) and
    # so stopped after the first candidate or two.
    for candidate in med:
        if len(preds) >= limit:
            break
        if candidate[1] > 0:  # distance 0 is the typed word itself
            appendwithcheck(preds, candidate)

    return preds
| 63 | + |
# Flask application object; the route decorators further down register on it.
app = Flask(__name__)
# Plain-text corpus reader rooted at './' (resolved against the current
# working directory). The '.*' pattern matches every file there, although
# only 'my_corpus.txt' is read below.
new_corpus = PlaintextCorpusReader('./','.*')

# Alternative token source, kept for reference:
#tokens = nltk.word_tokenize(raw)
# Training tokens: the Brown corpus words followed by those of my_corpus.txt.
tokens = brown.words() + new_corpus.words('my_corpus.txt')
# Alternative token source, kept for reference:
#tokens = reuters.words()

# Compute the frequency distributions for all the bigrams and trigrams in the
# text. This runs once, at import time, and the results are shared by the
# request handlers.
bgs_freq = get_bigram_freq(tokens)
tgs_freq = get_trigram_freq(tokens)
| 74 | + |
| 75 | + |
@app.route("/test")
def output():
    """Render the ``index.html`` template and return it for ``/test``."""
    page = render_template("index.html")
    return page
| 79 | + |
| 80 | + |
@app.route('/output', methods=['GET'])
def worker():
    """Answer a prediction or completion query with a JSON list.

    Query parameters:
        string: Text typed so far; it is split on whitespace into words.
        work: ``'pred'`` requests next-word prediction; any other value
            requests completion of the last (partial) word.

    Returns:
        A JSON-encoded list of pairs. For ``'pred'`` these are the five
        most common successors of the last word (one word typed, from
        ``bgs_freq``) or of the last two words (two or more typed, from
        ``tgs_freq``). Otherwise it is whatever ``incomplete_pred``
        returns. An empty list is returned when no words were supplied.
    """
    # A missing 'string' parameter yields None; treat it as empty text
    # instead of failing on None.split().
    string = request.args.get('string') or ''
    work = request.args.get('work')
    words = string.split()
    n = len(words)

    if n == 0:
        # Previously this fell through and returned None (not a valid Flask
        # response) for 'pred', or raised IndexError for completion.
        return json.dumps([])

    if work == 'pred':
        # .get() avoids inserting an empty entry into the defaultdict-based
        # distributions for every unseen context a client sends.
        if n == 1:
            # Use the tokenised word rather than the raw string, so that
            # surrounding whitespace does not prevent a match.
            dist = bgs_freq.get(words[0])
        else:
            dist = tgs_freq.get((words[n - 2], words[n - 1]))
        return json.dumps(dist.most_common(5) if dist is not None else [])

    return json.dumps(incomplete_pred(words, n))
| 101 | + |
| 102 | + |
# Start the Flask server only when this file is executed directly, not when
# it is imported as a module.
if __name__=="__main__":
    app.run()
0 commit comments