-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
61 lines (50 loc) · 1.69 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from pprint import pprint
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup
def initalSoup():
    """Fetch the popular-jokes landing page and parse it with BeautifulSoup.

    Fixes two defects in the original:
    * ``urllib.urlopen`` is Python 2 only — use ``urllib.request.urlopen``.
    * ``soup`` was bound as a function local and discarded, so the
      module-level ``soup`` that ``get_jokes_nav_urls`` reads was never
      defined. Bind it as a module global instead.

    Returns:
        The parsed ``BeautifulSoup`` object (also stored in the module
        global ``soup``).
    """
    global soup
    page = urllib.request.urlopen(
        'http://www.laughfactory.com/jokes/popular-jokes').read()
    soup = BeautifulSoup(page, "lxml")
    return soup
# Record the wall-clock start time; the end-of-script code computes and
# prints the total execution time from this value.
start_time = time.time()
def get_jokes_nav_urls(page_soup=None):
    """Collect every href from the jokes side-navigation bar.

    Parameters:
        page_soup: a parsed page (BeautifulSoup-like object) to scan.
            Defaults to the module-level ``soup`` created by
            ``initalSoup()`` — backward-compatible with the original
            zero-argument call.

    Returns:
        list[str]: href values of all anchors inside ``div.jokes-nav``.
    """
    if page_soup is None:
        page_soup = soup  # module global set by initalSoup()
    print("Get URLS from Page................")
    urls = []
    for nav in page_soup.find_all("div", {'class': 'jokes-nav'}):
        for anchor in nav.find_all('a', href=True):
            urls.append(anchor.get('href'))
    print("URLS are...............")
    print(urls)
    return urls
def append_url_parameters(urls):
    """Expand each base URL with every page-number suffix '/1'..'/5'.

    Fixes the original bug where ``url += paramsArray[1]`` appended the
    constant '/2' to every URL instead of iterating all five suffixes,
    and returns the result (the original only printed it; its caller
    ignored the return value, so adding one is backward-compatible).

    Parameters:
        urls: iterable of base URL strings.

    Returns:
        list[str]: one entry per (url, suffix) pair, in input order.
    """
    page_suffixes = ['/1', '/2', '/3', '/4', '/5']
    url_array = [url + suffix for url in urls for suffix in page_suffixes]
    print(url_array)
    return url_array
def get_jokes():
    """Download each jokes page and pretty-print every joke text found.

    Ported to Python 3: ``urllib.request.urlopen`` replaces the removed
    ``urllib.urlopen``, ``print`` is a function, and the Python-2-era
    ``.encode('utf-8')`` is dropped so the list holds ``str`` rather
    than ``bytes``.
    """
    print("Start Scraping.....")
    # urls = get_jokes_nav_urls()
    urls = ['http://www.laughfactory.com/jokes/popular-jokes',
            'http://www.laughfactory.com/jokes/latest-jokes']
    # NOTE(review): the returned paginated list is ignored here, exactly as
    # in the original flow — only the two base pages are fetched below.
    append_url_parameters(urls)
    jokes = []
    for joke_url in urls:
        page = urllib.request.urlopen(joke_url).read()
        page_soup = BeautifulSoup(page, "lxml")
        for joke_div in page_soup.find_all("div", {'class': 'joke-text'}):
            for child in joke_div.findChildren():
                jokes.append(child.text.strip())
    print("SHOWING ALL JOKES..........")
    pprint(jokes)
def scrape_data():
    """Run the full scrape: prime the landing-page soup, then collect jokes."""
    for step in (initalSoup, get_jokes):
        step()
# Start main script: run the full scrape, then report elapsed wall-clock time
# measured from the module-level start_time recorded above.
scrape_data()
end_time = time.time()
execution_time = end_time - start_time
print("Execution Time .... %s seconds......" %execution_time)