-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenre_scraper.py
79 lines (66 loc) · 2.8 KB
/
genre_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from types import TracebackType
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from book import Book
import re
import json
from process_frequencies import get_word_frequencies
from remove_copyright import process_file
def get_genres_from_web(book_title):
    """Look up a book on Wikipedia and scrape the genres from its infobox.

    Opens a Chrome session, sends ``book_title`` to Wikipedia's search page
    and parses whatever page the browser ends up on. The genres are read from
    the infobox row whose markup contains "Genre".

    Args:
        book_title: Title of the book to search for.

    Returns:
        A ``(genres, url)`` tuple. ``genres`` is a list of lower-cased genre
        names with citation markers such as ``[1]`` removed and empty entries
        dropped; ``url`` is the address of the page that was scraped. If the
        lookup fails for any reason (browser unavailable, no infobox, no
        "Genre" row, ...) the result is ``([], None)``, so callers can always
        unpack two values.
    """
    from urllib.parse import quote_plus

    genres = []
    try:
        # requires chromedriver.exe in the same directory
        # NOTE(review): `executable_path` is deprecated in Selenium 4 and
        # rejected by newer releases - confirm against the installed version.
        with webdriver.Chrome(executable_path='chromedriver.exe') as driver:
            # Encode the title so spaces, '&', '#', ... cannot corrupt the query.
            driver.get(
                "https://en.wikipedia.org/wiki/Special:Search?search="
                + quote_plus(book_title)
            )
            url = driver.current_url
            html = driver.page_source

        # sets up BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        # navigates to the first info box row on the right hand side
        row = soup.find("th", class_="infobox-label").parent
        # walks the sibling rows until the "Genre" row is reached; running off
        # the end yields None and is handled by the except below
        while "Genre" not in str(row):
            row = row.next_sibling

        # genres on the Wikipedia page are delimited by ',' or '|'
        for raw in re.split(r',|\|', row.contents[1].getText()):
            # removes every citation marker, e.g. "Fantasy[1][2]" -> "Fantasy"
            genre = re.sub(r'\[[^\]]*\]', '', raw).strip()
            if genre:
                genres.append(genre.lower())
    except Exception:
        # Deliberately best-effort: any failure means "book not found".
        # Keep the same two-value shape as the success path.
        return [], None
    return genres, url
def get_frequency_table(book_list, json_path="test.json"):
    """Build the word-frequency / genre dataset for a collection of books.

    For every book whose genres can be found via ``get_genres_from_web``, a
    row ``[word_frequencies, genre_1, genre_2, ...]`` is added to the result.
    A ``[{"title": ...}, {"url": ...}, word_frequencies]`` record is also
    appended to the JSON file at ``json_path``. Books with no genres are
    skipped entirely.

    Args:
        book_list: Iterable of objects exposing ``title()`` and ``text()``.
        json_path: JSON file holding a list of records. It is created if it
            does not exist yet. Defaults to ``"test.json"``.

    Returns:
        The list of rows described above, one per book that was found.
    """
    # array that will contain the entire dataset
    table = []
    for book in book_list:
        title = book.title()

        # A failed lookup may come back either as a bare empty list or as a
        # pair whose genre list is empty; both mean the book is skipped.
        lookup = get_genres_from_web(title)
        if not lookup:
            continue
        genres, url = lookup
        if not genres:
            continue

        # word frequencies first, then each genre
        frequencies = get_word_frequencies(book.text())
        table.append([frequencies, *genres])

        ### json dump ###
        # Rewritten after every book so finished lookups are already on disk
        # if a later one fails.
        try:
            with open(json_path, "r") as infile:
                json_list = json.load(infile)
        except FileNotFoundError:
            json_list = []
        json_list.append([{"title": title}, {"url": url}, frequencies])
        # Serialise before opening for writing so that a serialisation error
        # cannot truncate the existing file.
        json_object = json.dumps(json_list, indent=4)
        with open(json_path, "w") as outfile:
            outfile.write(json_object)
    return table
# Sample run over the two bundled test files. This still executes at module
# level, exactly as before. The files stay open while the table is built
# because Book may read from its file handle lazily (TODO confirm), and are
# closed as soon as processing finishes.
with open("TestFile.txt", "r", encoding="utf-8") as first_file, \
        open("TestFile2.txt", "r", encoding="utf-8") as second_file:
    books = [Book(first_file), Book(second_file)]
    freq = get_frequency_table(books)
print(freq)