#!/bin/python
# Copyright (c) 2012-2013 Aneesh Dogra <lionaneesh@gmail.com>
# Third Party Modules
from BeautifulSoup import BeautifulSoup
# Standard Modules
import urllib2
import sys
import os
from urlparse import urljoin # to support relative urls
import re # url checking
import signal # for handling KeyboardInterrupts
import difflib # for computing differences between files
# My modules
from crawler_config import * # for handling Crawler configurations
from crawler_log import *
from crawler_db_handling import *
from time import time
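# Return 1 if the given URL matches the Django URL-validation regex below, 0 otherwise.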
def checkUrl(url) :
    # django regex for url validation
    regex = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # domain...
        r'localhost|' # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    if re.search(regex, url) == None :
        return 0
    else :
        return 1
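# SIGINT handler: persist the cache table and print the collected diffs before exiting.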
def handle_SIGINT(signal, frame) :
    save_all()
    print_result()
    error("Ctrl + C detected!\nExiting now!", 3)
def print_result():
    for changes in web_diff:
        sys.stderr.write('\n' + changes + web_diff[changes])
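# Save the url -> cache-file mapping into the sqlite 'cache' table.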
def save_all():
    for i in pages:
        # parameterised queries avoid breaking on quotes in urls or file paths
        c.execute("SELECT * from cache where url=?", (i,))
        data = c.fetchone()
        if data != None:
            c.execute("UPDATE cache SET content=? where url=?", (pages[i], i))
        else:
            c.execute("INSERT INTO cache(url, content) values(?, ?)", (i, pages[i]))
    con.commit()
web_list = read_config()
directory = os.getcwd() + '/cache'
data = []
crawled = []
tracked_pages = []
to_crawl = []
index = {}
graph = {}
content = ''
pages = {}
web_diff = {}
prefixes = ('http://', 'https://', 'ftp://') # prefixes to check whether the link is an absolute link
# connect to the sqlite database and load previously cached pages
c, con = db_connect()
db_setup_everything(c, con)
db_get_data(pages, c)
try:
    os.stat(directory)
except OSError:
    os.makedirs(directory)
#---- main -----
signal.signal(signal.SIGINT, handle_SIGINT)
for entry in pages:
    tracked_pages.append(entry)
for website in web_list:
    to_crawl.append(website.strip())
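# Crawl loop: fetch each queued URL, diff its prettified HTML against the cached
# copy (if any), record the result in web_diff, write a fresh cache file and
# queue any same-site links found on the page.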
for current_url in to_crawl :
    if checkUrl(current_url) == 0 :
        continue
    log("Crawling [%s]" % current_url)
    try :
        req = urllib2.Request(current_url)
        req.add_header('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11')
        html_page = urllib2.urlopen(req)
    except urllib2.URLError as reason :
        error("URLError : %s" % (reason,), 2)
        continue
    except ValueError :
        error("Invalid URL : %s" % current_url, 2)
        continue
    except KeyboardInterrupt :
        # mirror the SIGINT handler: save state, report diffs and exit
        save_all()
        print_result()
        sys.exit(1)
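    # Parse and pretty-print the page so the fresh copy and the cached copy
    # are compared in the same canonical form.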
    source = html_page.read()
    soup = BeautifulSoup(source, fromEncoding="utf-8")
    content = soup.prettify()
    diff = ''
    if current_url in tracked_pages:
        # See if there is any difference in the page
        text1 = content.split('\n')
        text1.pop() # drop the empty element left by the trailing newline
        tp = open(pages[current_url])
        text2 = tp.read().splitlines()
        tp.close()
        diff_generator = difflib.unified_diff(text1, text2)
        for differences in diff_generator:
            diff += differences + '\n'
        if diff == '': # no difference, so no point in creating another cache file
            continue
        web_diff[current_url] = '\n' + diff
        os.remove(pages[current_url])
    else: # new page added
        web_diff[current_url] = ' -- New Page Added\n'
    temp = directory + "/cache.%.7f.html" % time()
    fp = open(temp, 'w')
    fp.write(content)
    fp.close()
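    # Queue links found on this page so they are crawled in the same run.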
    anchor_tags = soup('a', limit=100) # find at most 100 anchor tags per page
    for tag in anchor_tags :
        try:
            url = tag['href']
        except KeyError:
            continue
        if url.startswith('#') : # anchor pointing to the same page
            continue
        if url.startswith(prefixes): # absolute link; we don't want to crawl other sites
            continue
        else : # relative link, we'll get a ton of invalid links here, example href='javascript:' etc.
            url = urljoin(current_url, url)
            if checkUrl(url) == 0:
                continue
        if url not in to_crawl and url not in crawled:
            to_crawl.append(url)
        try:
            # @TODO: Add Graph table
            graph[current_url].append(url)
        except KeyError:
            graph[current_url] = [url]
    pages[current_url] = temp
    crawled.append(current_url)
save_all()
print_result()