-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadWebService.py
70 lines (63 loc) · 2.68 KB
/
DownloadWebService.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from __future__ import print_function
__author__ = 'Lithium'
import urllib2
import codecs
import time
# --- Configuration ---------------------------------------------------------
# CSV listing the pages to process, and the scratch file the updated list is
# written to while the script runs.
fname = 'dataSourceWebService.csv'
tempFileName = 'dataSourceWebServiceTemp.csv'
# Pause before each download, in seconds.
sleepTime = 1
# Directory prefix and extension used for the saved plaintext files.
outputPath = 'data/plaintext/'
fileExtension = '.txt'
# Base URL of the web service; each row's page URL is appended to it.
webService = 'http://ec2-54-172-230-29.compute-1.amazonaws.com/?url='

# --- Load the page list ----------------------------------------------------
# Read every row up front, dropping the trailing newline from each one.
content = []
with codecs.open(fname, "r", "utf-8") as csvFile:
    for row in csvFile:
        content.append(row.rstrip('\n'))

# Progress-counter state used when reporting "Page X of Y".
numOfPagesInCSV = len(content)
curPageNum = 1
# Walk the page list, downloading the rows flagged for update and writing the
# (possibly updated) row for every page into the temp file. The `with` block
# guarantees the temp file is flushed and closed even if a row raises.
with codecs.open(tempFileName, "w", "utf-8") as tempFile:
    # Set once the web service answers with an HTTP error; every later row is
    # then passed through without a download attempt.
    webServiceFailed = False
    # Each row has the form "<update flag>,<url>,<language>"; the flag is
    # compared case-insensitively against "true".
    # NOTE(review): rows are split on every comma, so a URL containing a comma
    # would be mis-parsed -- confirm the input never contains one.
    for curPageNum, line in enumerate(content, 1):
        print('Page %d of %d: ' % (curPageNum, numOfPagesInCSV), end='')
        line = line.rstrip('\r')  # Remove carriage returns from manual edits
        splitLine = line.split(',')
        if len(splitLine) < 3:
            # Blank or short row: keep it verbatim rather than aborting the
            # whole run with an IndexError.
            print('Malformed line skipped')
            tempFile.write(line + '\n')
            continue

        update = splitLine[0]
        queryUrl = splitLine[1]
        # Drop one leading slash, if present (safe on an empty field too).
        if queryUrl.startswith('/'):
            queryUrl = queryUrl[1:]
        url = webService + queryUrl
        lang = splitLine[2].upper()  # Language code, normalised to upper case

        # Output name is "<LANG>-<text after the last '/' of the request URL>".
        outputFile = outputPath + lang + '-' + url.split('/')[-1]
        # Cap the path (directory prefix included) at 170 characters; the
        # extension is appended after truncation.
        if len(outputFile) > 170:
            outputFile = outputFile[:170]
        outputFile += fileExtension

        # Download only rows flagged for update, and only while the web
        # service has not yet failed during this run.
        if update.lower() == 'true' and not webServiceFailed:
            print('Downloading and Parsing %s in %s' % (queryUrl, lang))
            time.sleep(sleepTime)
            try:
                # Only the network calls sit inside the try, so just their
                # HTTPError is handled here. Any other exception propagates
                # and aborts the run.
                response = urllib2.urlopen(url)
                try:
                    data = response.read()  # raw bytes
                finally:
                    response.close()  # always release the connection
            except urllib2.HTTPError:
                # Request was rejected: stop downloading and mark this row.
                webServiceFailed = True
                print('===============Download denied by Wikipedia terminating future downloads=========')
                tempFile.write(update + ',' + queryUrl + ',' + lang + ',' + 'FAILED HERE' + '\n')
            else:
                text = data.decode('utf-8')  # response is expected to be UTF-8 text
                with codecs.open(outputFile, 'w', encoding="utf-8") as outFile:
                    # First line of the saved file is the language code.
                    outFile.write("%s\n" % lang)
                    outFile.write(text)
                # Plaintext saved successfully; flag the row so it is not
                # downloaded again.
                tempFile.write('FALSE' + ',' + queryUrl + ',' + lang + '\n')
        else:
            print('Page %s skipped' % (queryUrl))
            tempFile.write(update + ',' + queryUrl + ',' + lang + '\n')
# Overwrite the original CSV with the contents of the temp file.
with codecs.open(tempFileName, "r", "utf-8") as updatedList:
    with codecs.open(fname, "w", "utf-8") as originalList:
        # writelines() consumes the reader line by line and writes the
        # concatenated text unchanged.
        originalList.writelines(updatedList)