Skip to content

Commit 8a6f530

Browse files
committed
Moved MetaCriticScraper to its own class
1 parent 56344aa commit 8a6f530

File tree

3 files changed

+260
-42
lines changed

3 files changed

+260
-42
lines changed

.gitignore

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
#################
2+
## Eclipse
3+
#################
4+
5+
*.pydevproject
6+
.project
7+
.metadata
8+
bin/
9+
tmp/
10+
*.tmp
11+
*.bak
12+
*.swp
13+
*~.nib
14+
local.properties
15+
.classpath
16+
.settings/
17+
.loadpath
18+
19+
# External tool builders
20+
.externalToolBuilders/
21+
22+
# Locally stored "Eclipse launch configurations"
23+
*.launch
24+
25+
# CDT-specific
26+
.cproject
27+
28+
# PDT-specific
29+
.buildpath
30+
31+
32+
#################
33+
## Visual Studio
34+
#################
35+
36+
## Ignore Visual Studio temporary files, build results, and
37+
## files generated by popular Visual Studio add-ons.
38+
39+
# User-specific files
40+
*.suo
41+
*.user
42+
*.sln.docstates
43+
44+
# Build results
45+
[Dd]ebug/
46+
[Rr]elease/
47+
*_i.c
48+
*_p.c
49+
*.ilk
50+
*.meta
51+
*.obj
52+
*.pch
53+
*.pdb
54+
*.pgc
55+
*.pgd
56+
*.rsp
57+
*.sbr
58+
*.tlb
59+
*.tli
60+
*.tlh
61+
*.tmp
62+
*.vspscc
63+
.builds
64+
*.dotCover
65+
66+
## TODO: If you have NuGet Package Restore enabled, uncomment this
67+
#packages/
68+
69+
# Visual C++ cache files
70+
ipch/
71+
*.aps
72+
*.ncb
73+
*.opensdf
74+
*.sdf
75+
76+
# Visual Studio profiler
77+
*.psess
78+
*.vsp
79+
80+
# ReSharper is a .NET coding add-in
81+
_ReSharper*
82+
83+
# Installshield output folder
84+
[Ee]xpress
85+
86+
# DocProject is a documentation generator add-in
87+
DocProject/buildhelp/
88+
DocProject/Help/*.HxT
89+
DocProject/Help/*.HxC
90+
DocProject/Help/*.hhc
91+
DocProject/Help/*.hhk
92+
DocProject/Help/*.hhp
93+
DocProject/Help/Html2
94+
DocProject/Help/html
95+
96+
# Click-Once directory
97+
publish
98+
99+
# Others
100+
[Bb]in
101+
[Oo]bj
102+
sql
103+
TestResults
104+
*.Cache
105+
ClientBin
106+
stylecop.*
107+
~$*
108+
*.dbmdl
109+
# added for RIA/Silverlight projects
Generated_Code
110+
111+
# Backup & report files from converting an old project file to a newer
112+
# Visual Studio version. Backup files are not needed, because we have git ;-)
113+
_UpgradeReport_Files/
114+
Backup*/
115+
UpgradeLog*.XML
116+
117+
118+
119+
############
120+
## Windows
121+
############
122+
123+
# Windows image file caches
124+
Thumbs.db
125+
126+
# Folder config file
127+
Desktop.ini
128+
129+
130+
#############
131+
## Python
132+
#############
133+
134+
*.py[co]
135+
136+
# Packages
137+
*.egg
138+
*.egg-info
139+
dist
140+
build
141+
eggs
142+
parts
143+
bin
144+
var
145+
sdist
146+
develop-eggs
147+
.installed.cfg
148+
149+
# Installer logs
150+
pip-log.txt
151+
152+
# Unit test / coverage reports
153+
.coverage
154+
.tox
155+
156+
#Translations
157+
*.mo
158+
159+
#Mr Developer
160+
.mr.developer.cfg
161+
162+
# Mac crap
163+
.DS_Store

MetaCriticScraper.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from bs4 import BeautifulSoup
2+
import urllib2
3+
4+
class MetaCriticScraper(object):
    """Fetch a Metacritic game page and collect its summary fields.

    Constructing an instance downloads ``url`` and scrapes it right away.
    The results live in ``self.game``, a dict whose values all start as
    empty strings; a value stays empty when the element it is read from
    cannot be found in the page.
    """

    # Keys of ``self.game``.
    _FIELDS = (
        'url', 'title', 'platform', 'publisher', 'release_date',
        'critic_score', 'critic_outof', 'critic_count',
        'user_score', 'user_count', 'developer', 'genre', 'rating',
    )

    # Exceptions treated as "an expected element is not in the page".
    # ``find`` returns None when nothing matches, so navigating further
    # raises AttributeError. Anything else (KeyboardInterrupt, SystemExit,
    # genuine bugs) is deliberately left to propagate.
    _MISSING_ELEMENT_ERRORS = (AttributeError, TypeError)

    def __init__(self, url):
        """Download ``url``, parse the HTML and populate ``self.game``.

        Errors raised while opening or reading the URL are not caught here.
        """
        self.game = dict.fromkeys(self._FIELDS, '')

        response = urllib2.urlopen(url)
        try:
            # geturl() is the URL that was actually retrieved, which may
            # differ from ``url`` if a redirect was followed.
            self.game['url'] = response.geturl()
            html = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        # NOTE(review): no parser is named, so bs4 chooses among whichever
        # parsers are installed -- consider pinning one for stable results.
        self.soup = BeautifulSoup(html)
        self.scrape()

    @staticmethod
    def _digits_only(text):
        """Return the decimal digits of ``text`` in order, dropping the rest."""
        return ''.join(ch for ch in text if ch.isdigit())

    @staticmethod
    def _detail_text(container, css_class):
        """Return the stripped text of the ``span.data`` inside ``li.<css_class>``.

        Raises AttributeError when either element is missing.
        """
        item = container.find("li", class_=css_class)
        return item.find("span", class_="data").text.strip()

    def scrape(self):
        """Fill ``self.game`` from ``self.soup``, one group of fields at a time.

        Groups are independent: when an element a group needs is missing,
        a warning is printed and the fields of that group that were not
        yet assigned keep their current value. Later groups still run.
        """
        game = self.game
        soup = self.soup
        missing = self._MISSING_ELEMENT_ERRORS

        # Title and platform. If the site changes and the expected divs or
        # classes are gone, these stay as empty strings.
        try:
            title_div = soup.find("div", class_="product_title")
            game['title'] = title_div.a.text.strip()
            game['platform'] = title_div.span.a.text.strip()
        except missing:
            print("WARNING: Problem getting title and platform information")

        # Publisher and release date. The date is stored as the page's text;
        # parsing it with "%b %d, %Y" was sketched in the original but never
        # enabled.
        try:
            publisher_item = soup.find("li", class_="summary_detail publisher")
            game['publisher'] = publisher_item.a.text.strip()
            game['release_date'] = self._detail_text(
                soup, "summary_detail release_data")
        except missing:
            print("WARNING: Problem getting publisher and release date information")

        # Critic score, its scale and the number of critics.
        try:
            critics = soup.find("div", class_="details main_details")
            game['critic_score'] = critics.find(
                "span", class_="score_value").text.strip()
            game['critic_outof'] = critics.find(
                "span", class_="score_total").span.text.strip()
            game['critic_count'] = critics.find(
                "span", class_="count").a.span.text.strip()
        except missing:
            print("WARNING: Problem getting critic score information")

        # User score and number of user ratings.
        try:
            users = soup.find("div", class_="details side_details")
            game['user_score'] = users.find(
                "span", class_="score_value").text.strip()
            # The link text mixes the number with other characters; keep
            # only the digits.
            raw_users_count = users.find("span", class_="count").a.text
            game['user_count'] = self._digits_only(raw_users_count)
        except missing:
            print("WARNING: Problem getting user score information")

        # Developer, genre and rating from the product details section.
        try:
            product_info = soup.find(
                "div", class_="section product_details"
            ).find("div", class_="details side_details")
            game['developer'] = self._detail_text(
                product_info, "summary_detail developer")
            game['genre'] = self._detail_text(
                product_info, "summary_detail product_genre")
            game['rating'] = self._detail_text(
                product_info, "summary_detail product_rating")
        except missing:
            print("WARNING: Problem getting miscellaneous game information")

metacritc.py

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,20 @@
1-
from bs4 import BeautifulSoup
21
import sys
3-
import urllib2
4-
5-
url = urllib2.urlopen(sys.argv[1])
6-
html = url.read()
7-
8-
soup = BeautifulSoup(html)
9-
10-
product_title_div = soup.find("div", class_="product_title")
11-
title = product_title_div.a.text
12-
platform = product_title_div.span.a.text
13-
14-
publisher = soup.find("li", class_="summary_detail publisher").a.text
15-
release_date = soup.find("li", class_="summary_detail release_data").find("span", class_="data").text
16-
17-
critics = soup.find("div", class_="details main_details")
18-
critic_score = critics.find("span", class_="score_value").text
19-
critic_outof = critics.find("span", class_="score_total").span.text
20-
critic_count = critics.find("span", class_="count").a.span.text
21-
22-
users = soup.find("div", class_="details side_details")
23-
users_score = users.find("span", class_="score_value").text
24-
raw_users_count = users.find("span", class_="count").a.text
25-
users_count = ""
26-
for c in raw_users_count:
27-
if c.isdigit(): users_count += c
28-
29-
product_info = soup.find("div", class_="section product_details").find("div", class_="details side_details")
30-
developer = product_info.find("li", class_="summary_detail developer").find("span", class_="data").text
31-
genre = product_info.find("li", class_="summary_detail product_genre").find("span", class_="data").text
32-
rating = product_info.find("li", class_="summary_detail product_rating").find("span", class_="data").text
33-
34-
print "URL: " + url.geturl()
35-
print "Title: " + title.strip()
36-
print "Platform: " + platform.strip()
37-
print "Publisher: " + publisher.strip()
38-
print "Release Date: " + release_date.strip()
39-
print "Critic Score: " + critic_score.strip() + "/" + critic_outof + " (" + critic_count + " critics)"
40-
print "User Score: " + users_score.strip() + " (" + users_count + " users)"
41-
print "Developer: " + developer.strip()
42-
print "Genre: " + genre.strip()
43-
print "Rating: " + rating.strip()
2+
from MetaCriticScraper import MetaCriticScraper
3+
import time
4+
# Do we want to handle movies, tv shows, etc...?
5+
# FIXME: Handle URL better
6+
7+
def main(argv):
    """Scrape the Metacritic URL in ``argv[1]`` and print a short report.

    ``argv`` has the shape of ``sys.argv`` (program name first).
    Returns the process exit status: 0 after printing the report, 2 when
    no URL argument was supplied.
    """
    if len(argv) < 2:
        # Previously a missing argument surfaced as an IndexError traceback.
        program = argv[0] if argv else "metacritc.py"
        sys.stderr.write("usage: %s <metacritic-game-url>\n" % program)
        return 2

    # Only construction is timed; the constructor does the download and
    # the scraping.
    start_time = time.time()
    scraper = MetaCriticScraper(argv[1])
    elapsed_time = time.time() - start_time

    game = scraper.game
    print("URL: " + game['url'])
    print("Title: " + game['title'])
    print("Platform: " + game['platform'])
    print("Publisher: " + game['publisher'])
    print("Release Date: " + game['release_date'])
    print("Critic Score: " + game['critic_score'] + "/" + game['critic_outof']
          + " (" + game['critic_count'] + " critics)")
    print("User Score: " + game['user_score']
          + " (" + game['user_count'] + " users)")
    print("Developer: " + game['developer'])
    print("Genre: " + game['genre'])
    print("Rating: " + game['rating'])
    # Two spaces after the colon reproduce the output of the original
    # comma-separated print statement.
    print("Time to scrape:  %s secs" % round(elapsed_time, 2))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

0 commit comments

Comments
 (0)