-
Notifications
You must be signed in to change notification settings - Fork 0
/
homepage.py
95 lines (76 loc) · 2.87 KB
/
homepage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import argparse
import csv
import logging
import gzip
from datetime import datetime
from scraper import SeleniumScraper
from notification import Notification
def setup_logger():
""" Set up logging
"""
# create logs dir if not exists
if not os.path.exists('./logs'):
os.makedirs('./logs')
now = datetime.utcnow()
logfilename = "./logs/homepage-{0:s}.log".format(now.strftime('%Y%m%d%H%M%S'))
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s',
datefmt='%m-%d %H:%M',
filename=logfilename,
filemode='w')
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter('%(message)s')
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
return logfilename
def download_webpage(url, filepath, compress=False):
    """Fetch a page via SeleniumScraper and write its HTML to disk.

    Args:
        url: address of the page to fetch.
        filepath: destination path for the saved HTML.
        compress: when True, write the file gzip-compressed.
    """
    scraper = SeleniumScraper()
    # Coalesce a falsy result (e.g. failed fetch) to an empty page.
    html = scraper.get(url) or ''
    logging.info("Saving to file {0:s}".format(filepath))
    if compress:
        with gzip.open(filepath, 'wb') as out:
            out.write(html.encode('utf-8'))
    else:
        with open(filepath, 'w', encoding='utf-8') as out:
            out.write(html)
if __name__ == "__main__":
    # Timestamp of the run, used in the notification subject line.
    timestamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    logfilename = setup_logger()

    parser = argparse.ArgumentParser(description='Homepages scraper')
    parser.add_argument('input', default=None,
                        help='Input CSV file with "src" and "url" columns')
    parser.add_argument('-c', '--config', default='top10.cfg',
                        help='Configuration file')
    parser.add_argument('-d', '--dir', default='homepages',
                        help='Output directory for HTML files')
    parser.add_argument('--compress', dest='compress',
                        action='store_true',
                        help='Compress downloaded HTML files')
    parser.set_defaults(compress=False)
    args = parser.parse_args()
    logging.info(args)

    # Output directory for scraped pages; exist_ok avoids the
    # check-then-create race of os.path.exists + makedirs.
    os.makedirs(args.dir, exist_ok=True)

    # newline='' as required by the csv module docs; explicit utf-8 so the
    # input encoding does not depend on the platform default.
    with open(args.input, encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        for r in reader:
            src = r['src']
            url = r['url']
            logging.info("Visit URL: {0:s}".format(url))
            # Per-page timestamp keeps repeated runs from overwriting files.
            dt = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
            name = '{0:s}_{1:s}.html'.format(src, dt)
            filepath = os.path.join(args.dir, name)
            if args.compress:
                filepath += '.gz'
            download_webpage(url, filepath, args.compress)

    logging.info("Done")
    # Email the run log so failures are noticed without checking the box.
    notification = Notification(args.config)
    subject = 'Homepages scraper ({0:s})'.format(timestamp)
    body = "Please check out log file for more detail."
    notification.send_email(subject, body, logfilename)