ass.py
import glob
import operator
import os
from math import log, log10
from string import ascii_lowercase

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# grab stopword list, extend it a bit, and then turn it into a set for later
stop = stopwords.words('english')
stop.extend(['may','also','zero','one','two','three','four','five','six','seven','eight','nine','ten','across','among','beside','however','yet','within']+list(ascii_lowercase))
stop.extend(['using', 'way', 'would', 'use', 'i', 'don\'t', 'type', 'it', 'i\'m', 'is', 'get', '<a', 'want', 'another', 'could', '+', 'something', 'used', 'it\'s', 'like', '=', '-', '+', 'see', 'look', 'id', 'make', 'size', 'getting', 'let', 'height', 'maybe', 'know', 'thanks', 'fine', 'always', 'stored', 'us', 'bit', 'called', 'create', 'hand', 'menu'])
stop.extend( ['warn', 'main', 'localhost', 'http', 'startstop', 'bio', 'info', 'cluster', 'finest', 'chatapp', 'disabled', 'new', 'clientoutboundchannel', 'appenders', 'run', 'okdebug', 'monkey_', 'successful', 'finer', 'apr', 'edt', 'audit', 'jul', 'heartbeat', 'reconnection', 'uma', 'disconnected', 'ping', 'serv', 'jgroups', 'javawsapplicationmain'])
stop.extend( ['discovered', 'severe', 'remote', 'dc', 'maig', 'trace', 'wrappermanager', 'cest', 'connection', 'elasticsearch', 'maynard', 'roottracer', 'tiboldt', 'djavax', 'livebeansview', 'control', 'temporarily', 'completed', 'asyncio', 'bootstrap', 'cancelling', 'wthlnap', 'shutting', 'infowrapper', 'mchange', 'feb', 'cfpvednfsg'])
stop.extend( ['logged', 'appeventdao', 'datasourceutils', 'forwarding', 'elasticsearchmaynard', 'imyf', 'appeventdao', 'datasourceutils', 'forwarding', 'elasticsearchmaynard', 'imyf', 'connection[', 'unwanted', 'localvalue', 'exited', 'newsession', 'srvmain', 'gradle\\', 'lient', 'defuncting', 'org', 'errorexit', 'upclsch', 'permessage', 'httpsessionsecuritycontextrepository'] )
stop.extend(['ulimit', '_auth', 'stdout', 'dwrapper', 'org', 'inflight', 'setdebug', 'defaulting', 'debugfilter', 'wrapperinfo', 'alive', 'simplejdbccall', 'logpath', '\\hellogluon\\', 'jlogger', 'multicast', 'ramlprocessor', 'false[chatapp', 'servicemetadata', 'openglrenderer', 'jackrabbit', 'appthread', 'setuseparenthandlers', 'getprotocol', 'loginfo', 'cookie', 'isloggable', 'multicast'])
# turn the list into a set for fast membership tests
stop = set(stop)
def has_no_digits(s):
    # keep a token only if it contains no digits and no slashes
    # (filters out dates, timestamps, and file paths from log-like text)
    return not any(c.isdigit() or c == '/' or c == '\\' for c in s)
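# For example (hypothetical tokens, not taken from the corpus):
#   has_no_digits('heartbeat')   -> True   (kept)
#   has_no_digits('2019/04/12')  -> False  (dropped)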
lmtzr = WordNetLemmatizer()
names_of_doc = []  # document filenames, in the order they were read
doc_list = []      # raw text of each document
tf_array = {}      # filename -> {token: [positions where it occurs]}
doc_count = {}     # filename -> total token count
scores = {}        # filename -> accumulated tf-idf score
def read():
    for filename in glob.glob(os.path.join("/home/user/Desktop/Ranking/Documents", '*.txt')):
        names_of_doc.append(os.path.basename(filename))
        with open(filename, 'r') as f:
            para = f.readlines()
        # strip whitespace characters like `\n` at the end of each line
        para = [x.strip() for x in para]
        doc_list.append(' '.join(para))
    print('reading')

read()
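# The corpus directory is assumed to contain plain-text files, something like
# (hypothetical names, for illustration only):
#   /home/user/Desktop/Ranking/Documents/log_day1.txt
#   /home/user/Desktop/Ranking/Documents/log_day2.txt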
documents = doc_list
for j, doc in enumerate(documents):
    index_array = {}  # token -> list of positions within this document
    tokens = doc.split(" ")
    for pos, tok in enumerate(tokens, start=1):
        tok = tok.lower()
        if tok not in stop and len(tok) > 2:
            try:
                lemma = lmtzr.lemmatize(tok)
            except Exception:
                continue
            # drop any non-ASCII characters from the lemma
            word = lemma.encode('ascii', 'ignore').decode('ascii')
            if has_no_digits(word):
                if word in index_array:
                    index_array[word].append(pos)
                else:
                    index_array[word] = [pos]
    tf_array[names_of_doc[j]] = index_array
    doc_count[names_of_doc[j]] = len(tokens)
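# At this point tf_array maps each filename to a positional index, e.g.
# (hypothetical documents and tokens, for illustration only):
#   tf_array == {'doc1.txt': {'java': [3, 17], 'error': [4]},
#                'doc2.txt': {'java': [9]}}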
ask_for_query = input("Search: ").lower()
# keep only the query words that are not stopwords
query = [qq for qq in ask_for_query.split(" ") if qq not in stop]
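# For example, the (hypothetical) query "how to fix java error" reduces to
# ['fix', 'java', 'error'], since 'how' and 'to' are in the stopword set.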
def func():
    for query_word in query:
        # document frequency: how many documents contain the query word
        doc_freq = 0
        for doc in tf_array:
            if query_word in tf_array[doc]:
                doc_freq += 1
        doc_freq = max(doc_freq, 1)  # avoid division by zero for unseen words
        idf = log10(float(len(tf_array)) / doc_freq)
        for doc in tf_array:
            if query_word in tf_array[doc]:
                # log-damped term frequency
                term_freq = log(1 + float(len(tf_array[doc][query_word])))
            else:
                term_freq = 0
            if doc in scores:
                scores[doc] += idf * term_freq
            else:
                scores[doc] = idf * term_freq
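# Each document's score is a sum over the query words:
#   score(d) = sum_t log10(N / df_t) * ln(1 + tf_{t,d})
# where N is the number of documents, df_t the number of documents containing
# word t, and tf_{t,d} the count of t in d. A worked example with made-up
# numbers: with N = 4 documents and a word found in df = 2 of them, appearing
# 3 times in d, it contributes log10(4/2) * ln(1 + 3) ≈ 0.301 * 1.386 ≈ 0.417.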
func()
sort_dict = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
printed = 0
for name, score in sort_dict:
    # show the top-ranked documents, skipping zero scores
    if score != 0 and printed < 11:
        printed += 1
        print(name)
if printed == 0:
    print('No documents')
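# A sample session might look like this (hypothetical filenames, assuming the
# Documents directory holds .txt files that mention the query words):
#
#   $ python ass.py
#   reading
#   Search: exception thread
#   log_day2.txt
#   log_day7.txt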