-
Notifications
You must be signed in to change notification settings - Fork 1
/
metacriticscrape.py
65 lines (53 loc) · 2 KB
/
metacriticscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import re
from bs4 import BeautifulSoup
from selenium import webdriver
# Start a Firefox browser session through Selenium. This runs at module level,
# and the scraping code further down loads the page through this ``driver``.
driver = webdriver.Firefox()
def list_to_string(s) -> str:
    """Concatenate the string items of *s* into one string.

    Items that are ``None`` are skipped instead of raising ``TypeError``.
    This matters for the callers in this file, which collect BeautifulSoup
    ``.string`` values that may be ``None``.

    Args:
        s: An iterable of strings (``None`` entries are allowed and ignored).

    Returns:
        The items joined with no separator; ``""`` for an empty iterable.
    """
    return "".join(piece for piece in s if piece is not None)
def _collect_score(soup, class_prefix):
    """Return the text of the score spans whose class starts with *class_prefix*.

    The span class is tried with the suffixes "positive", "mixed" and
    "negative", in that order, and the strings from the first variant that
    yields any are returned. Spans whose ``.string`` is ``None`` are skipped
    so the result can always be joined into a single string.

    Args:
        soup: The parsed page to search.
        class_prefix: The class attribute text shared by all three variants.

    Returns:
        A list of strings; empty when no variant produced any text.
    """
    for sentiment in ("positive", "mixed", "negative"):
        matches = [
            span.string
            for span in soup.find_all(
                'span', attrs={'class': class_prefix + " " + sentiment})
            if span.string is not None
        ]
        if matches:
            return matches
    return []


summary = []  # store plot summary (not populated yet)
stuff = []  # not populated yet
votes = []  # store votes (not populated yet)

# TODO: input handling is still undecided; whatever ends up supplying the
# movie page address should assign it here.
url = "https://www.metacritic.com/movie/the-emoji-movie"

try:
    driver.get(url)
    page_content = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails.
    # quit() ends the whole WebDriver session rather than only closing the
    # current window, so no browser/driver process is left behind.
    driver.quit()

soup = BeautifulSoup(page_content, "html.parser")

# The text before the word "Reviews" in the page <title> is used as the movie
# title. A missing or non-textual <title> yields an empty title, not a crash.
title_tag = soup.title
title = (title_tag.string if title_tag is not None else None) or ""
clean_title = title.split('Reviews')

meta_score_block = _collect_score(soup, 'metascore_w header_size movie')  # store metascore block
meta_score = list_to_string(meta_score_block)

user_score_block = _collect_score(soup, 'metascore_w user larger movie')  # store user score block
user_score = list_to_string(user_score_block)

# Put the text of every <span> on its own line, then look for "<digits> min".
a_string = "".join(str(span.string) + "\n" for span in soup.find_all('span'))
runtime = re.findall(r"[0-9]+ min", a_string)

data = {
    "Title": clean_title[0],
    "Metascore": meta_score,
    "Userscore": user_score,
    # Pages without a runtime produce an empty string, matching how a missing
    # score is written, instead of raising IndexError.
    "Runtime": runtime[0] if runtime else "",
}

with open('data.txt', 'w') as outfile:
    json.dump(data, outfile)