import urllib2, os, re, validators
from BeautifulSoup import BeautifulSoup
"""
# About
Creator : Dion Ajie
Location : Bandung, Indonesia
Mail : mail@dionajie.com
# Packages to install
pip install validators
pip install BeautifulSoup
(urllib2, os, and re ship with the Python 2 standard library)
# TO DO
- download images referenced in inline styles
  (see the getInlineImages() sketch before the __main__ block)
# Warning
Use this app wisely.
You are responsible for how you use it.
"""
def download(content):
    # Mirror a single relative asset locally, recreating its folder structure.
    # Relies on the module-level globals url and pathfolder set in __main__.
    parts = content.rsplit('/', 1)
    filename = parts[-1].encode('unicode-escape')
    path = parts[0].encode('unicode-escape') if len(parts) > 1 else ''
    print 'Download %s' % (filename)
    if not os.path.exists(pathfolder + path):
        os.makedirs(pathfolder + path)
    try:
        print 'File from ', str(url) + str(content)
        res = urllib2.urlopen(str(url) + str(content))
        with open(pathfolder + path + '/' + filename, "wb") as f:
            f.write(res.read())
        print "Download complete. Saved to %s \n" % (content)
    except urllib2.HTTPError as e:
        print e
    return pathfolder + path + '/' + filename
def getInsideCSS(path):
    # Scan a downloaded CSS file for url(...) references and fetch each asset.
    print "\ninside css -------------"
    with open(path, "r") as f:
        content = f.read()
    # drop the filename, then the local save folder, so what remains is the
    # CSS file's folder relative to the remote site root (e.g. 'css')
    path = '/'.join(path.split('/')[:-1])
    path = path.split('/', 1)[1]
    pathbefore = '/'.join(path.split('/')[:-1])
    print path
    match = re.findall(r'url\((.*?)\)', content)
    for row in match:
        # ex : row = '../fonts/ionicons.svg#Ionicons'
        if not validators.url(row):
            if '..' in row:
                # resolve one level up from the CSS file's folder
                rowAfter = pathbefore + '/' + row.split('/', 1)[-1]
            else:
                rowAfter = path + '/' + row
            # strip quotes and any #fragment or ?query from the reference
            rowAfter = rowAfter.replace("'", "").replace('"', '')
            rowAfter = rowAfter.split('#', 1)[0]
            rowAfter = rowAfter.split('?', 1)[0]
            print rowAfter
            download(rowAfter)
def getCSS(soup):
    # get css: download every locally hosted stylesheet, then scan it for assets
    exList = []  # extensions to exclude from the inside-CSS scan (none by default)
    print "\n css------"
    for css in soup.findAll("link"):
        href = css.get("href")
        if not href:
            continue
        if "stylesheet" in css.get("rel", []):
            if href[-4:] == '.css' and not validators.url(href):
                print href
                result = download(href)
                print result
                if not any(ext in result for ext in exList):
                    getInsideCSS(result)
        # except stylesheet: other <link> assets (e.g. favicons) are fetched as-is
        elif css.get("rel", []):
            result = download(href)
    print "end of css ----------"
def getJS(soup):
    # get js: download every locally hosted script referenced by the page
    print "\njs------"
    for js in soup.findAll("script"):
        src = js.get("src")
        # skip inline scripts and externally hosted ones
        if src and src[-3:] == '.js' and not validators.url(src):
            download(src)
    print "end of js ----------"
def getImages(soup):
    # get images: download every locally hosted image referenced by <img> tags
    print "\nimages------"
    for image in soup.findAll("img"):
        src = image.get("src")
        # only fetch relative paths; absolute URLs point off-site
        if src and not validators.url(src):
            print src
            download(src)
    print "end of images ----------"
def getFontAwesome(pathfolder):
    # check font-awesome/fonts: if the page ships Font Awesome CSS,
    # fetch the matching font files straight from the official repository
    if os.path.exists(pathfolder + 'font-awesome/'):
        print "\nFont Awesome------"
        if not os.path.exists(pathfolder + 'font-awesome/fonts'):
            os.makedirs(pathfolder + 'font-awesome/fonts/')
        FAfiles = ['FontAwesome.otf', 'fontawesome-webfont.eot',
                   'fontawesome-webfont.svg', 'fontawesome-webfont.ttf',
                   'fontawesome-webfont.woff', 'fontawesome-webfont.woff2']
        url = 'https://github.com/FortAwesome/Font-Awesome/blob/master/fonts/'
        for content in FAfiles:
            # ?raw=true makes GitHub serve the binary file, not its HTML page
            res = urllib2.urlopen(str(url) + str(content) + '?raw=true')
            print "\nDownload %s" % (content)
            print "File from %s" % (str(url) + str(content))
            with open(pathfolder + 'font-awesome/fonts/' + content, "wb") as f:
                f.write(res.read())
            print 'Download complete. Saved to %sfont-awesome/fonts/%s' % (pathfolder, content)
        print "end of font-awesome ----------"
def scrapPage(url, filenamePage, pathfolder):
    # Fetch a page, save its HTML, and mirror its CSS, JS, images, and fonts.
    if not os.path.exists(pathfolder):
        os.makedirs(pathfolder)
    # request page
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    # get HTML content
    webContent = response.read()
    # save page
    with open(pathfolder + filenamePage + '.html', 'w') as f:
        f.write(webContent)
    # create report
    report = 'url : ' + str(url) + '\n'
    with open(pathfolder + 'report.txt', 'w') as f:
        f.write(report)
    # get content page
    soup = BeautifulSoup(webContent)
    getCSS(soup)
    getJS(soup)
    getImages(soup)
    getFontAwesome(pathfolder)
    # update report
    with open(pathfolder + "report.txt", "r+") as f:
        old = f.read()
        f.seek(0)
        f.write(old + "\nstatus : complete")
    print "\nComplete"
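# --- TO DO sketch ------------------------------------------------------------
# A minimal, hedged sketch for the TO DO item in the docstring: download images
# referenced from inline style attributes, e.g. style="background: url(img/bg.jpg)".
# getInlineImages() is a new helper, not part of the original script; it reuses
# download() and the same relative-path filtering as the other get* functions.
def getInlineImages(soup):
    print "\ninline-style images------"
    # findAll(style=True) matches every tag that carries a style attribute
    for tag in soup.findAll(style=True):
        for row in re.findall(r'url\((.*?)\)', tag["style"]):
            # strip quotes plus any #fragment or ?query, as getInsideCSS does
            row = row.replace("'", "").replace('"', '')
            row = row.split('#', 1)[0].split('?', 1)[0]
            # only fetch relative paths; absolute URLs point off-site
            if row and not validators.url(row):
                print row
                download(row)
    print "end of inline-style images ----------"
# To wire it in, it could be called from scrapPage() right after getImages(soup).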
if __name__ == '__main__':
    # NOTE: download() reads url and pathfolder as module-level globals,
    # so they must be set here at module scope before scrapPage() runs
    url = 'http://code-pages.com/html/Revalia/'  # URL of the page to scrape
    filenamePage = 'index'  # HTML file name
    pathfolder = 'Revalia/'  # folder to save files
    scrapPage(url, filenamePage, pathfolder)
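# Expected layout after a run with the settings above: Revalia/index.html,
# Revalia/report.txt, css/js/image folders mirroring the site's relative
# paths, and Revalia/font-awesome/fonts/ when the page uses Font Awesome.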