-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathranker.py
211 lines (184 loc) · 6.51 KB
/
ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import os
import random
import re
import sys
import csv
DAMPING = 0.85
SAMPLES = 10000
"""
prompt
python pagerank.py -n --> simple counter based web ranker will be executed
python pagerank.py -sp --> pagerank with sampling will be executed
python pagerank.py -ip --> pagerank with iteration will be executed
"""
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python ranker.py [FLAG]")
flag = sys.argv[1]
corpus = read()
if (flag == "-n"):
print("------------------------------------------------")
print(f" Domain Name rankings using counter")
print("------------------------------------------------")
print()
print("................................................")
print(f" Domain Name \t\t\t Rank")
print("................................................")
print()
ranks = counter_ranker(corpus)
ranks = dict(sorted(ranks.items(), key=lambda item: item[1], reverse=True))
x = 0
for page in ranks:
x = x+1
print(x, ". ", page, " "*(40-len(page)), ranks[page])
print("-"*60)
elif (flag == "-sp"):
ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
ranks = dict(sorted(ranks.items(), key=lambda item: item[1], reverse=True))
print("-------------------------------------------------------------")
print(f" Domain Name ranking using PageRank: Sampling (n = {SAMPLES})")
print("-------------------------------------------------------------")
print()
print("................................................")
print(f" Domain Name \t\t\t Rank")
print("................................................")
print()
x = 0
for page in ranks:
x = x+1
print(x, ". ", page, " "*(40-len(page)), ranks[page])
print("-"*60)
elif (flag == "-ip"):
ranks = iterate_pagerank(corpus, DAMPING)
ranks = dict(sorted(ranks.items(), key=lambda item: item[1], reverse=True))
print("----------------------------------------------")
print(f" Domain Name ranking using PageRank: Iteration")
print("----------------------------------------------")
print()
print("................................................")
print(f" Domain Name \t\t\t Rank")
print("................................................")
print()
x = 0
for page in ranks:
x = x+1
print(x, ". ", page, " "*(40-len(page)), ranks[page])
print("-"*60)
else:
sys.exit("Usage: python ranker.py [FLAG]")
#who do we think we are
def read():
"""
Read the csv file line by line
Adding the first website of each line as key and rest as its values
Because we have the csv file like that:
first one of each line is current website and others are linked websites to that website
"""
pages = dict()
with open("OUTPUT/pagerank.csv", "r") as csv_file:
csv_reader = csv.reader(csv_file)
for row in csv_reader:
pages[row[0]] = set(
x for x in row[1:]
)
return pages
def counter_ranker(corpus):
"""Return the web rankings using a simple counter"""
counter = dict()
keys = list(corpus.keys())
for x in keys:
for link in corpus[x]:
counter[x] = 0
for x in keys:
for link in corpus[x]:
counter[x] +=1
return counter
def transition_model(corpus, page, damping_factor):
"""
Return a probability distribution over which page to visit next,
given a current page.
With probability `damping_factor`, choose a link at random
linked to by `page`. With probability `1 - damping_factor`, choose
a link at random chosen from all pages in the corpus.
"""
ans = dict()
n = len(corpus[page])
m = len(corpus.keys())
for x in corpus.keys():
if (x in corpus[page]):
ans[x] = ((damping_factor * (1/n)) + ((1-damping_factor)* (1/m)))
else:
ans[x] = ((1-damping_factor)* (1/m))
return ans
def sample_pagerank(corpus, damping_factor, n):
"""
Return PageRank values for each page by sampling `n` pages
according to transition model, starting with a page at random.
Return a dictionary where keys are page names, and values are
their estimated PageRank value (a value between 0 and 1). All
PageRank values should sum to 1.
"""
x = random.choices([x for x in corpus.keys()])[0]
pagerrank = dict()
for key in corpus.keys():
pagerrank[key] = 0
pagerrank[x] += (1/n)
for i in range(1,n):
model = transition_model(corpus,x,damping_factor)
keys = list(model.keys())
probability = list(model.values())
x = random.choices(keys,weights=probability)[0]
pagerrank[x] += (1/n)
return pagerrank
def iterate_pagerank(corpus, damping_factor):
"""
Return PageRank values for each page by iteratively updating
PageRank values until convergence.
Return a dictionary where keys are page names, and values are
their estimated PageRank value (a value between 0 and 1). All
PageRank values should sum to 1.
"""
pagerrank = dict()
# no of links in corpus
n = len(corpus.keys())
# set having all the links of corpus in it
s = set()
# initializing pagerank of all links equally
for x in corpus.keys():
pagerrank[x] = 1/n
s.add(x)
# creating dict which shows 'x' link can be accessed by which all links
thru = dict()
for x in s:
temp = set()
for t in s:
if (x in corpus[t]):
temp.add(t)
thru[x] = temp
# loop goes untill pagerrank[x] new - before > 0.001 for all x which is link
while(1):
before = [pagerrank[v] for v in pagerrank.keys()]
for x in s:
belong = thru[x]
# calculating page rank of page x
pg = 0
for v in belong:
pg += (pagerrank[v]/len(corpus[v]))
pg = pg*damping_factor
pg += ((1-damping_factor)/n)
pagerrank[x] = pg
if (nochange(before, [pagerrank[v] for v in pagerrank.keys()])):
break
return pagerrank
def nochange(before,new):
n = len(before)
count=0
for i in range(n):
if (abs(before[i]-new[i]) <= 0.001):
count+=1
if (count==n):
return 1
else:
return 0
if __name__ == "__main__":
main()