# scrape_it_short_forms.py
# import necessary packages for webscraping.
from bs4 import BeautifulSoup
from urllib import request
from dateutil.parser import parse
import time
import random
import os
import re
import csv


def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    """
    try:
        parse(string, fuzzy=fuzzy)
        return True
    except ValueError:
        return False
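
# Illustrative only (hypothetical inputs): is_date("June 15, 2014") returns True,
# while is_date("Issue Five") returns False, so bare publication dates in a TOC get skipped.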


def get_issue_link(link):
    html = request.urlopen(link).read()
    soup = BeautifulSoup(html, 'html5lib')
    issue_toc = soup.select('div.textcontent')[0]
    raw_links = issue_toc.find_all('a')
    actual_issue_links = []
    for raw_link in raw_links:
        if raw_link.text.startswith('Issue') or raw_link['href'].startswith('https://jitp.commons.gc.cuny.edu/wp-content/plugins/peters-custom-anti-spam-image/custom_anti_spam.php') or raw_link['href'].startswith('https://jitp.commons.gc.cuny.edu/a-conversation-on-international-collaboration-in-digital-scholarship/'):
            pass
        elif raw_link.has_attr("title") and raw_link['title'] == "Share this article":
            pass
        elif raw_link['href'].endswith('#respond') or raw_link['href'].endswith('#comments') or '#comment' in raw_link['href']:
            pass
        elif raw_link.text in ['Attribution-NonCommercial-ShareAlike 4.0 International', 'Previous:', 'Next:', 'Learn how your comment data is processed', 'table of contents', 'Introduction /', 'JITP Issue 8 is now live! | Laura Wildemann Kane', 'JITP Issue 9 is now live! | Laura Wildemann Kane', 'Happenings – VREPS', 'Re-viewing Digital Technologies and Art History – DAHS', 'Wandering Volunteer Park /', '\n', 'Special Feature: Behind the Seams', 'Summer Supplement: A Conversation on International Collaboration in Digital Scholarship', 'Manifold']:
            pass
        elif raw_link['href'] in ['https://creativecommons.org/licenses/by-nc-sa/4.0/', 'http://teacherstech.net/?p=10236']:
            pass
        elif is_date(raw_link.text):
            pass
        elif raw_link.text in [text.replace(u'\xa0', u'') for link, text in actual_issue_links]:
            pass
        else:
            actual_issue_links.append((raw_link['href'], raw_link.text.replace(u'\xa0', u'')))
    return actual_issue_links
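
# get_issue_link returns the article links for one issue TOC as (href, link text) tuples,
# e.g. roughly ('https://jitp.commons.gc.cuny.edu/some-article/', 'Some Article Title')
# (hypothetical values; the real pairs come from whichever issue page is being scraped).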


def scrape_contents_of_an_article(article_link):
    html = request.urlopen(article_link).read()
    soup = BeautifulSoup(html, 'html5lib')
    # swap this line and the following one to grab only the article and not the comments
    # issue_contents = soup.select('article')[0]
    issue_contents = soup.select('div#main')[0]
    # add 'li.a[href$="#comments' to get rid of comments
    list_of_junk = ['div.tagslist', 'div.iw-social-share', 'a[href^="https://jitp.commons.gc.cuny.edu/category/issues"]', 'section#post-nav', 'div.comment-respond', 'p.akismet_comment_form_privacy_notice', 'p[style="display: none !important;"]', 'section.comments p.buttons', 'section.comments img', 'img.avatar', 'div.featimg.animated', 'div.cat']
    for junk in list_of_junk:
        # print(junk)
        try:
            issue_contents.select(junk)[0].decompose()
        except:
            pass
            # print('could not match ' + junk)
        try:
            for item in issue_contents.select(junk):
                item.decompose()
        except:
            print('not a thing we can loop over')
    # massaging of author bios
    try:
        for quote in issue_contents.find_all('blockquote'):
            try:
                if quote.h3.text == "About the Authors" or quote.h3.text == "About the Author":
                    quote.name = 'div'
                    quote['id'] = 'authorbio'
                    try:
                        quote.h3.name = 'h2'
                    except:
                        pass
            except:
                pass
            try:
                if quote.h2.text == "About the Authors" or quote.h2.text == "About the Author":
                    try:
                        quote.name = 'div'
                        quote['id'] = 'authorbio'
                    except:
                        pass
            except:
                pass
    except:
        pass
    # turn all address tags into p
    try:
        for address in issue_contents.find_all('address'):
            address.name = 'p'
    except:
        pass
    # remove bold from h2
    try:
        for abstract in issue_contents.find_all('h2'):
            if abstract.b:
                abstract.b.unwrap()
    except:
        pass
    # make all h3 abstracts into h2
    try:
        for h3 in issue_contents.find_all('h3'):
            if h3.text == "Abstract":
                h3.name = 'h2'
    except:
        pass
    # get rid of comment links and reformat date to not be a line item
    try:
        replace_text = issue_contents.select('ul.textinfo')[0].find_all('li')[1].text.strip()
        # replace_text = '<p>' + replace_text + '</p>'
        # replace_text = BeautifulSoup(replace_text)
        issue_contents.select('ul.textinfo')[0].replace_with(replace_text)
        # text_info = issue_contents.select('ul.textinfo')[0].find_all('li')
        # # text_info[0].decompose()
        # text_info.contents = text_info[1].text
    except:
        pass
    # search for image tags and replace direct links
    for img in issue_contents.find_all('img'):
        img['src'] = re.sub(
            r'https:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+|src="http:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+|https:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/EasyRotatorStorage\/user\-content\/erc\_62\_1406481274\/content\/assets',
            "images", img['src'])
        del img['srcset']
    for sup_tag in issue_contents.find_all("sup", {"class": "footnote"}):
        del sup_tag.findChild('a')['onclick']
    # setting the byline tag
    print(article_link)
    try:
        for hit in issue_contents.find_all('h2', {"class": 'byline'}):
            hit.name = 'p'
    except:
        # if there is no h2 with a byline class, just pass.
        # The issue right now is that sometimes there is no byline class but there is an h2;
        # you could say "turn the first h2 into a p tag", but you won't know universally that that is the case.
        print('Fail: ' + article_link)
        pass
    try:
        for thing in issue_contents.select('a[href^="https://jitp.commons.gc.cuny.edu/files"]'):
            thing.replaceWithChildren()
    except:
        pass
    try:
        for thing in issue_contents.select('a[href^="http://jitp.commons.gc.cuny.edu/files"]'):
            thing.replaceWithChildren()
    except:
        pass
    # strip brackets from notes
    for hit in issue_contents.find_all('a', {'class': 'ftn'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    for hit in issue_contents.find_all('a', {'class': 'ftnref'}):
        hit.string.replace_with(re.sub(r'[\[\]]', '', hit.string))
    return issue_contents
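
# clean_issue (below) wraps the scraped markup with a minimal <head> carrying a dc.title meta tag,
# presumably so downstream tools can recover the article title from the saved HTML file.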


def clean_issue(title, issue_contents):
    metadata_title = "<head>\n<meta name=\"dc.title\" content=\"" + title + "\">\n</head>"
    clean_contents = metadata_title + str(issue_contents)
    # re.sub(
    #     r'img src="https:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+|src="http:\/\/jitp\.commons\.gc\.cuny\.edu\/files\/[0-9]+\/[0-9]+', "<img src=\"images", clean_contents)
    return clean_contents
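
# For a hypothetical title, clean_issue('My Article', contents) yields a string that starts
# '<head>\n<meta name="dc.title" content="My Article">\n</head>' followed by str(contents).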


def scrape_issue(issue_title, issue_links):
    if not os.path.exists(issue_title):
        os.mkdir(issue_title)
    print('Processing ' + issue_title)
    print(issue_links)
    for link, title in issue_links:
        print(title)
        contents = scrape_contents_of_an_article(link)
        clean_contents = clean_issue(title, contents)
        with open(os.path.join(issue_title, title.replace('/', ' ') + '.html'), 'w') as fout:
            fout.write(clean_contents)
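
# scrape_issue writes each article to '<issue_title>/<article title>.html', swapping any '/'
# in the title for a space so the title is usable as a filename.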


def get_main_toc_links():
    # store the url we want to work with in the variable 'url'
    url = 'https://jitp.commons.gc.cuny.edu/issues/'
    html = request.urlopen(url).read()
    # turn it into soup
    soup = BeautifulSoup(html, 'html5lib')
    # grab just the toc from the page
    toc = soup.select('div.textcontent')[0]
    # grab all the anchor tags from the toc, but throw away the ones about CC licensing
    raw_links = toc.find_all('a')[1:-2]
    actual_links = [(raw_link['href'], raw_link.text.replace(u'\xa0', u' ').replace('Table of Contents: ', '')) for raw_link in raw_links]
    return actual_links
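
# get_all_issue_links walks every issue TOC (sleeping a random 0-5 seconds between requests,
# presumably to avoid hammering the server) and maps each issue title to its article links.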


def get_all_issue_links(main_toc_links):
    links_for_individual_issues = {}
    # MODIFY HERE TO REDUCE NUMBER OF ISSUES SCRAPED
    for issue_toc_link, issue_title in main_toc_links:
        if issue_title in ['Manifold', 'Issue Twenty-One', 'Summer Supplement: A Conversation on International Collaboration in Digital Scholarship']:
            pass
        else:
            max_sleep = 5
            time.sleep(random.random() * max_sleep)
            print('=====')
            print('Scraping ' + issue_title)
            links_for_individual_issues[issue_title] = get_issue_link(issue_toc_link)
    return links_for_individual_issues
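
# main() reads short-form-links.tsv with csv.DictReader, so the file is expected to have a
# tab-separated header row with at least 'short_form_type', 'link', and 'title' columns, e.g.
# (hypothetical row): tool tips<TAB>https://jitp.commons.gc.cuny.edu/some-tool-tip/<TAB>Some Tool Tip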


def main():
    # get all the main toc links
    # main_toc_links = get_main_toc_links()
    # main_toc_links.reverse()
    # use the main toc links to get each issue link
    links_for_individual_short_forms = {'tool tips': [], 'teaching fails': [], 'reviews': [], 'blueprints': [], 'assignments': [], 'behind the seams': []}
    with open('short-form-links.tsv', newline='') as csvfile:
        spamreader = csv.DictReader(csvfile, delimiter='\t', quotechar='|')
        for row in spamreader:
            print(row)
            print(links_for_individual_short_forms)
            print('======')
            links_for_individual_short_forms[row['short_form_type']].append((row['link'], row['title']))
    for issue_title, issue_links in links_for_individual_short_forms.items():
        scrape_issue(issue_title, issue_links)


if __name__ == "__main__":
    main()
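
# Running this module directly (python scrape_it_short_forms.py) expects short-form-links.tsv
# in the working directory and writes one folder of cleaned .html files per short-form type.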