pagerank.py
import os
import random
import re
import sys

DAMPING = 0.85
SAMPLES = 10000


def main():
    if len(sys.argv) != 2:
        sys.exit("Usage: python pagerank.py corpus")
    corpus = crawl(sys.argv[1])
    ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
    print(f"PageRank Results from Sampling (n = {SAMPLES})")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
    ranks = iterate_pagerank(corpus, DAMPING)
    print("PageRank Results from Iteration")
    for page in sorted(ranks):
        print(f"  {page}: {ranks[page]:.4f}")
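
# Example invocation (illustrative; "corpus0" is a hypothetical directory
# of HTML pages):
#
#     python pagerank.py corpus0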


def crawl(directory):
    """
    Parse a directory of HTML pages and check for links to other pages.
    Return a dictionary where each key is a page, and values are
    a list of all other pages in the corpus that are linked to by the page.
    """
    pages = dict()

    # Extract all links from HTML files
    for filename in os.listdir(directory):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join(directory, filename)) as f:
            contents = f.read()
            links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
            pages[filename] = set(links) - {filename}

    # Only include links to other pages in the corpus
    for filename in pages:
        pages[filename] = set(
            link for link in pages[filename]
            if link in pages
        )

    return pages
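
# Example result (illustrative; a hypothetical three-page corpus):
#     crawl("corpus0") == {"1.html": {"2.html"},
#                          "2.html": {"1.html", "3.html"},
#                          "3.html": {"2.html"}}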


def transition_model(corpus, page, damping_factor):
    """
    Return a probability distribution over which page to visit next,
    given a current page.

    With probability `damping_factor`, choose a link at random
    linked to by `page`. With probability `1 - damping_factor`, choose
    a link at random chosen from all pages in the corpus.
    """
    model = dict()

    # Pages that the current page links to
    linked = corpus[page]
    total = len(corpus)

    # A page with no outgoing links is treated as linking to every page
    # in the corpus, so the next page is chosen uniformly at random
    if len(linked) == 0:
        return {key: 1 / total for key in corpus}

    # Each page starts with a base probability of (1 - damping_factor) / total;
    # every page linked to by the current page gets an additional
    # damping_factor / number_of_links on top of that
    base_prob = (1 - damping_factor) / total
    for key in corpus:
        model[key] = base_prob
        if key in linked:
            model[key] += damping_factor / len(linked)

    return model
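
# Worked example (illustrative): for the hypothetical corpus
#     {"1.html": {"2.html", "3.html"}, "2.html": {"3.html"}, "3.html": {"2.html"}}
# transition_model(corpus, "1.html", 0.85) returns
#     {"1.html": 0.05, "2.html": 0.475, "3.html": 0.475}
# since every page gets 0.15 / 3 = 0.05 and the two linked pages split the
# remaining 0.85 equally (0.425 each); the probabilities sum to 1.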


def sample_pagerank(corpus, damping_factor, n):
    """
    Return PageRank values for each page by sampling `n` pages
    according to transition model, starting with a page at random.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    rank = dict()

    # Initialize the rank of each page in the corpus to 0.0
    for key in corpus:
        rank[key] = 0.0

    # Start with a page chosen uniformly at random; each of the n samples
    # contributes 1 / n to the rank of the page it lands on
    sample = random.choice(list(corpus))
    rank[sample] += 1 / n

    # Pick each subsequent page at random, weighted by the transition
    # model of the current page, and credit it with 1 / n
    for _ in range(1, n):
        model = transition_model(corpus, sample, damping_factor)
        sample = random.choices(list(model), weights=list(model.values()))[0]
        rank[sample] += 1 / n

    return rank
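
# Sanity check (illustrative): each of the n samples adds exactly 1 / n,
# so the returned ranks sum to 1 up to floating-point error, e.g.
#     abs(sum(sample_pagerank(corpus, DAMPING, SAMPLES).values()) - 1) < 1e-6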


def iterate_pagerank(corpus, damping_factor):
    """
    Return PageRank values for each page by iteratively updating
    PageRank values until convergence.

    Return a dictionary where keys are page names, and values are
    their estimated PageRank value (a value between 0 and 1). All
    PageRank values should sum to 1.
    """
    ranks = dict()

    # Stop once no page's rank changes by more than this threshold
    threshold = 0.0005

    # N = number of pages; every page starts with rank 1 / N
    N = len(corpus)
    for key in corpus:
        ranks[key] = 1 / N

    # Repeatedly apply the PageRank formula to every page, counting how
    # many pages changed by less than the threshold. Once all N pages
    # have converged in the same pass, stop and return the ranks.
    while True:
        count = 0
        for key in corpus:
            new = (1 - damping_factor) / N
            sigma = 0
            for page in corpus:
                # A page with no outgoing links is treated as linking to
                # every page in the corpus (including itself)
                if len(corpus[page]) == 0:
                    sigma += ranks[page] / N
                elif key in corpus[page]:
                    sigma += ranks[page] / len(corpus[page])
            new += damping_factor * sigma
            if abs(ranks[key] - new) < threshold:
                count += 1
            ranks[key] = new
        if count == N:
            break

    return ranks
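
# The update in iterate_pagerank implements the standard PageRank recurrence
#
#     PR(p) = (1 - d) / N + d * sum(PR(i) / NumLinks(i) for pages i linking to p)
#
# where d is the damping factor and N is the number of pages; a page with
# no links is counted as linking to all N pages.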


if __name__ == "__main__":
    main()