web_crawler.py
import multiprocessing as mp
from urllib.request import urlopen
from urllib.parse import urljoin  # urljoin lives in urllib.parse, not urllib.request
from bs4 import BeautifulSoup
import re

base_url = "https://github.com/myhpiok"


def crawl(url):
    # Download a page and return its HTML as text.
    response = urlopen(url)
    return response.read().decode()


def parse(html):
    # Extract the page title, the set of internal links, and the canonical URL.
    soup = BeautifulSoup(html, "lxml")
    links = soup.find_all("a", {"href": re.compile("^/.+?/$")})
    title = soup.find("h1").get_text().strip()
    page_urls = {urljoin(base_url, link["href"]) for link in links}
    url = soup.find("meta", {"property": "og:url"})["content"]
    return title, page_urls, url
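

# A minimal parallel-download sketch (not part of the original script): the `mp`
# import above is otherwise unused, so this is one hypothetical way to put it to
# work. Note that on platforms using the "spawn" start method, the crawl loop
# below would also need an `if __name__ == "__main__":` guard before a Pool can
# be used safely.
def crawl_all(urls, processes=4):
    # Fetch several pages concurrently with a process pool.
    with mp.Pool(processes) as pool:
        return pool.map(crawl, list(urls))

# Usage: inside the crawl loop, the sequential list comprehension could be
# replaced with `htmls = crawl_all(unseen)`.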


unseen = {base_url}
seen = set()
count = 1

# Breadth-first crawl: fetch every unseen URL, parse it, then queue the newly
# discovered links that have not been seen yet.
while unseen:
    print("Crawling...")
    htmls = [crawl(url) for url in unseen]

    print("Parsing...")
    results = [parse(html) for html in htmls]

    print("Analysing...")
    seen.update(unseen)
    unseen.clear()
    for title, page_urls, url in results:
        print(count, title, url)
        count += 1
        unseen.update(page_urls - seen)