
Commit 1b9354e

Full Scraper
1 parent 8a6f530 commit 1b9354e

File tree

4 files changed: +938 −6 lines changed


MetaCriticScraper.py

Lines changed: 9 additions & 6 deletions
@@ -17,12 +17,15 @@ def __init__(self, url):
             'genre': '',
             'rating': ''
         }
-
-        metacritic_url = urllib2.urlopen(url)
-        self.game['url'] = metacritic_url.geturl()
-        html = metacritic_url.read()
-        self.soup = BeautifulSoup(html)
-        self.scrape()
+
+        try:
+            metacritic_url = urllib2.urlopen(url)
+            self.game['url'] = metacritic_url.geturl()
+            html = metacritic_url.read()
+            self.soup = BeautifulSoup(html)
+            self.scrape()
+        except:
+            pass
 
     def scrape(self):
         # Get Title and Platform. If site changes and we can't find the right divs or classes
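The bare except is doing real work here: scrape.py (added below) passes url=None for platforms MetaCritic does not track, and the try block also wraps self.scrape(), whose div/class lookups can fail if the site layout changes. A narrower handler, offered only as a sketch and not part of this commit, might look like:

    try:
        metacritic_url = urllib2.urlopen(url)
        self.game['url'] = metacritic_url.geturl()
        html = metacritic_url.read()
        self.soup = BeautifulSoup(html)
        self.scrape()
    except (urllib2.URLError, AttributeError):
        # URLError: HTTP errors, DNS failures, timeouts.
        # AttributeError: url is None (platform not on MetaCritic), or a
        # missing tag inside scrape() after a MetaCritic layout change.
        pass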

gamedata-20130628-12_37_40.csv

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+DateTime:,2013-06-28 12:37:40.498000
+name,platform,release year,genre,publisher,north america sales,europe sales,japan sales,rest of world sales,global sales,release date,critic score,critic outof,critic count,user score,user count,developer,rating
+Wii Sports,Wii,2006,Sports,Nintendo,40.56,24.54,3.77,12.46,81.33,"Nov 19, 2006",76,100,51,8.3,207,Nintendo,E
+Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,,,
+Mario Kart Wii,Wii,2008,Racing,Nintendo,14.65,10.27,3.65,5.22,33.78,"Apr 27, 2008",82,100,73,8.2,436,Nintendo,E
+Wii Sports Resort,Wii,2009,Sports,Nintendo,14.95,8.46,3.20,5.10,31.71,"Jul 26, 2009",80,100,73,7.8,113,Nintendo,E
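The first row is a run timestamp and the second is the column header, so a consumer must skip one row before reading records. A minimal Python 2 sketch of reading this output back (the filename is this sample's; a real run produces its own timestamped name):

    import csv

    with open("gamedata-20130628-12_37_40.csv", "rb") as f:
        reader = csv.reader(f)
        reader.next()             # skip the 'DateTime:' stamp row
        header = reader.next()    # column names
        for row in reader:
            game = dict(zip(header, row))
            print game["name"], game["global sales"]

Note the Super Mario Bros. (NES) row: it has full sales figures but empty MetaCritic columns, because NES is not in the scraper's platform map and no MetaCritic page is fetched for it.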

scrape.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+import argparse
+import urllib2
+import datetime
+import time
+import csv
+import sys
+from bs4 import BeautifulSoup
+from MetaCriticScraper import MetaCriticScraper
+
+# This maps VGChartz platform names to MetaCritic platform names.
+# VGChartz has sales data for nearly every platform (NES, Atari, etc...),
+# but MetaCritic does not keep track of some of those older platforms,
+# so in that case the MetaCritic data will be blank.
+metacritic_platform = {'PS3': 'playstation-3',
+                       'X360': 'xbox-360',
+                       'PC': 'pc',
+                       'WiiU': 'wii-u',
+                       '3DS': '3ds',
+                       'PSV': 'playstation-vita',
+                       'iOS': 'ios',
+                       'Wii': 'wii',
+                       'DS': 'ds',
+                       'PSP': 'psp',
+                       'PS2': 'playstation-2',
+                       'PS': 'playstation',
+                       'XB': 'xbox',  # original xbox
+                       'GC': 'gamecube',
+                       'GBA': 'game-boy-advance',
+                       'DC': 'dreamcast'
+                       }
+
+# Parses a single row of game information from a VGChartz table.
+# Argument must be a BeautifulSoup'ed table row from VGChartz.
+def vgchartz_parse(row):
+    game = {}
+    data = row.find_all("td")
+    if data:
+        game["name"] = data[1].get_text()
+        game["url"] = data[1].a.get('href')
+        game["basename"] = game["url"].rsplit('/', 2)[1]
+        game["platform"] = data[2].get_text()
+        game["year"] = data[3].get_text()
+        game["genre"] = data[4].get_text()
+        game["publisher"] = data[5].get_text()
+        game["na_sales"] = data[6].get_text()
+        game["eu_sales"] = data[7].get_text()
+        game["ja_sales"] = data[8].get_text()
+        game["rest_sales"] = data[9].get_text()
+        game["global_sales"] = data[10].get_text()
+
+    return game
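A sketch of the input vgchartz_parse expects, using a hypothetical row shaped like VGChartz's chart table (eleven td cells, the name cell linking to the game's page; the URL here is illustrative):

    sample = BeautifulSoup("""<table><tr>
      <td>1</td>
      <td><a href="http://www.vgchartz.com/game/2667/wii-sports/">Wii Sports</a></td>
      <td>Wii</td><td>2006</td><td>Sports</td><td>Nintendo</td>
      <td>40.56</td><td>24.54</td><td>3.77</td><td>12.46</td><td>81.33</td>
    </tr></table>""")
    game = vgchartz_parse(sample.find("tr"))
    print game["basename"]   # wii-sports -- second-to-last segment of the href

Rows without td cells (such as the table's header row, assuming it uses th) come back as an empty dict, which is why the main loop below checks `if vg_game_info:`.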
+
+# Returns a url to MetaCritic based on game information from VGChartz.
+# Returns None if MetaCritic does not support the platform.
+# Argument must be a dictionary of game information from VGChartz.
+def make_metacritic_url(vg_game_info):
+    url = None
+    if vg_game_info["platform"] in metacritic_platform:
+        url = "http://www.metacritic.com/game/"
+        url = url + metacritic_platform[vg_game_info["platform"]] + "/"
+        url = url + vg_game_info["basename"]
+
+    return url
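For example, with illustrative values:

    print make_metacritic_url({"platform": "Wii", "basename": "wii-sports"})
    # http://www.metacritic.com/game/wii/wii-sports
    print make_metacritic_url({"platform": "NES", "basename": "super-mario-bros"})
    # None -- NES has no entry in metacritic_platform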
+
+# Command-line argument parser.
+# You can also specify -h, --help at the command line
+# to see which arguments are supported.
+parser = argparse.ArgumentParser(description='VGChartz and MetaCritic Game Scraper.')
+parser.add_argument('-n', '--number', dest='max_games', type=int, default=0, help='Maximum number of games to scrape (0 to disable).')
+parser.add_argument('-w', '--wait', type=int, default=0, help='Number of seconds to wait before each request to MetaCritic (0 to disable).')
+
+args = parser.parse_args()
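An example invocation, scraping at most 100 games with a 5-second pause before each MetaCritic request:

    python scrape.py --number 100 --wait 5

With both options left at their defaults of 0, the scraper runs unthrottled until it runs out of games with sales figures.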
+
+# Do we have games available to scrape?
+# This lets us break out of our loop.
+games_available = True
+
+games_scraped = 0  # Count of how many games we have scraped so far
+vgchartz_page = 1  # Which VGChartz page we are on
+
+# Open our CSV file and write the headers
+now = datetime.datetime.now()
+csvfilename = "gamedata-" + now.strftime("%Y%m%d-%H_%M_%S") + ".csv"
+csvfile = open(csvfilename, "wb")
+gamewriter = csv.writer(csvfile)
+gamewriter.writerow(['DateTime:', str(now)])
+gamewriter.writerow(['name', 'platform', 'release year', 'genre', 'publisher', 'north america sales', 'europe sales', 'japan sales', 'rest of world sales', 'global sales', 'release date', 'critic score', 'critic outof', 'critic count', 'user score', 'user count', 'developer', 'rating'])
+
+start_time = time.time()
+while games_available:
+    # Connect to the VGChartz table. There are 1000 results per page.
+    sys.stdout.write("Connecting to VGChartz Page " + str(vgchartz_page) + "...")
+    vgchartz_url = "http://www.vgchartz.com/gamedb/?page=" + str(vgchartz_page) + "&results=1000&name=&platform=&minSales=0&publisher=&genre=&sort=GL"
+    #vgchartz_url = "file:vgchartz.htm"  # DEBUG line - pulls VGChartz data from the filesystem. Comment it out for production.
+    vgchartz_conn = urllib2.urlopen(vgchartz_url)
+    vgchartz_html = vgchartz_conn.read()
+    sys.stdout.write("connected.\n")
+
+    vgsoup = BeautifulSoup(vgchartz_html)
+    rows = vgsoup.find("table", class_="chart").find_all("tr")
+
+    # For each row, scrape the game information from VGChartz.
+    # With that information, make a MetaCritic URL.
+    # Connect to MetaCritic and scrape more information about the game.
+    # Save all this information to the CSV file.
+    for row in rows:
+        vg_game_info = vgchartz_parse(row)
+        if vg_game_info:
+            print games_scraped + 1, vg_game_info["name"]
+            # VGChartz has many thousands of games in its database. A lot are old and have no sales figures.
+            # If a game has 0 sales, we are done looking for games. The table is sorted by sales,
+            # so all remaining games will also have 0 sales.
+            if (vg_game_info["global_sales"] == "0.00"):
+                print "No more games with sales figures. Ending."
+                games_available = False
+                break
+
+            # Make MetaCritic URL
+            metacritic_url = make_metacritic_url(vg_game_info)
+            if (args.wait > 0):
+                time.sleep(args.wait)  # Option to sleep before connecting so MetaCritic doesn't throttle/block us.
+            metacritic_scraper = MetaCriticScraper(metacritic_url)
+
+            # Write everything to the CSV. MetaCritic data will be blank if we could not get it.
+            gamewriter.writerow([vg_game_info["name"], vg_game_info["platform"], vg_game_info["year"], vg_game_info["genre"],
+                                 vg_game_info["publisher"], vg_game_info["na_sales"], vg_game_info["eu_sales"], vg_game_info["ja_sales"],
+                                 vg_game_info["rest_sales"], vg_game_info["global_sales"], metacritic_scraper.game["release_date"],
+                                 metacritic_scraper.game["critic_score"], metacritic_scraper.game["critic_outof"],
+                                 metacritic_scraper.game["critic_count"], metacritic_scraper.game["user_score"],
+                                 metacritic_scraper.game["user_count"], metacritic_scraper.game["developer"],
+                                 metacritic_scraper.game["rating"]])
+            #csvfile.flush()
+
+            # We successfully scraped a single game. If we hit max_games, quit. Otherwise, loop to the next game.
+            games_scraped += 1
+            if (args.max_games > 0 and args.max_games == games_scraped):
+                print "Reached max_games limit. Ending."
+                games_available = False
+                break
+    vgchartz_page += 1
+
+csvfile.close()
+elapsed_time = time.time() - start_time
+print "Scraped", games_scraped, "games in", round(elapsed_time, 2), "seconds."
+print "Wrote scraper data to", csvfilename

0 commit comments