Moved MetaCriticScraper to its own class

kareemy · kareemy · commit 8a6f5300ba36 · 2013-06-05T19:58:48.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,163 @@
+#################
+## Eclipse
+#################
+
+*.pydevproject
+.project
+.metadata
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.classpath
+.settings/
+.loadpath
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# PDT-specific
+.buildpath
+
+
+#################
+## Visual Studio
+#################
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+
+# Build results
+[Dd]ebug/
+[Rr]elease/
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.vspscc
+.builds
+*.dotCover
+
+## TODO: If you have NuGet Package Restore enabled, uncomment this
+#packages/
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+
+# Visual Studio profiler
+*.psess
+*.vsp
+
+# ReSharper is a .NET coding add-in
+_ReSharper*
+
+# Installshield output folder
+[Ee]xpress
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish
+
+# Others
+[Bb]in
+[Oo]bj
+sql
+TestResults
+*.Cache
+ClientBin
+stylecop.*
+~$*
+*.dbmdl
+# Added for RIA/Silverlight projects
+Generated_Code
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+
+
+
+############
+## Windows
+############
+
+# Windows image file caches
+Thumbs.db
+
+# Folder config file
+Desktop.ini
+
+
+#############
+## Python
+#############
+
+*.py[co]
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+
+#Translations
+*.mo
+
+#Mr Developer
+.mr.developer.cfg
+
+# Mac crap
+.DS_Store
diff --git a/MetaCriticScraper.py b/MetaCriticScraper.py
@@ -0,0 +1,78 @@
+from bs4 import BeautifulSoup
+import urllib2
+
+class MetaCriticScraper:
+	def __init__(self, url):
+		self.game = {'url': '',
+					 'title': '',
+					 'platform': '',
+					 'publisher': '',
+					 'release_date': '',
+					 'critic_score': '',
+					 'critic_outof': '',
+					 'critic_count': '',
+					 'user_score': '',
+					 'user_count': '',
+					 'developer': '',
+					 'genre': '',
+					 'rating': ''
+					}
+					
+		metacritic_url = urllib2.urlopen(url)
+		self.game['url'] = metacritic_url.geturl()
+		html = metacritic_url.read()
+		self.soup = BeautifulSoup(html)
+		self.scrape()
+	
+	def scrape(self):
+		# Get Title and Platform. If site changes and we can't find the right divs or classes
+		# skip and leave these values as empty strings
+		try:
+			product_title_div = self.soup.find("div", class_="product_title")
+			self.game['title'] = product_title_div.a.text.strip()
+			self.game['platform'] = product_title_div.span.a.text.strip()
+		except:
+			print "WARNING: Problem getting title and platform information"
+			pass
+			
+		# Get publisher and release date. 
+		try:
+			self.game['publisher'] = self.soup.find("li", class_="summary_detail publisher").a.text.strip()
+			self.game['release_date'] = self.soup.find("li", class_="summary_detail release_data").find("span", class_="data").text.strip()
+			#datetime.strptime(release_date.strip(), "%b %d, %Y")
+		except:
+			print "WARNING: Problem getting publisher and release date information"
+			pass
+			
+		# Get critic information
+		try:
+			critics = self.soup.find("div", class_="details main_details")
+			self.game['critic_score'] = critics.find("span", class_="score_value").text.strip()
+			self.game['critic_outof'] = critics.find("span", class_="score_total").span.text.strip()
+			self.game['critic_count'] = critics.find("span", class_="count").a.span.text.strip()
+		except:
+			print "WARNING: Problem getting critic score information"
+			pass
+			
+		# Get user information
+		try:
+			users = self.soup.find("div", class_="details side_details")
+			self.game['user_score'] = users.find("span", class_="score_value").text.strip()
+			raw_users_count = users.find("span", class_="count").a.text
+			user_count = ''
+			for c in raw_users_count:
+				if c.isdigit(): user_count += c
+			self.game['user_count'] = user_count.strip()
+		except:
+			print "WARNING: Problem getting user score information"
+			pass
+				
+		# Get remaining information
+		try:
+			product_info = self.soup.find("div", class_="section product_details").find("div", class_="details side_details")
+			self.game['developer'] = product_info.find("li", class_="summary_detail developer").find("span", class_="data").text.strip()
+			self.game['genre'] = product_info.find("li", class_="summary_detail product_genre").find("span", class_="data").text.strip()
+			self.game['rating'] = product_info.find("li", class_="summary_detail product_rating").find("span", class_="data").text.strip()
+		except:
+			print "WARNING: Problem getting miscellaneous game information"
+			pass
diff --git a/metacritc.py b/metacritc.py
@@ -1,43 +1,20 @@
-from bs4 import BeautifulSoup
 import sys
-import urllib2
-
-url = urllib2.urlopen(sys.argv[1])
-html = url.read()
-
-soup = BeautifulSoup(html)
-
-product_title_div = soup.find("div", class_="product_title")
-title = product_title_div.a.text
-platform = product_title_div.span.a.text
-
-publisher = soup.find("li", class_="summary_detail publisher").a.text
-release_date = soup.find("li", class_="summary_detail release_data").find("span", class_="data").text
-
-critics = soup.find("div", class_="details main_details")
-critic_score = critics.find("span", class_="score_value").text
-critic_outof = critics.find("span", class_="score_total").span.text
-critic_count = critics.find("span", class_="count").a.span.text
-
-users = soup.find("div", class_="details side_details")
-users_score = users.find("span", class_="score_value").text
-raw_users_count = users.find("span", class_="count").a.text
-users_count = ""
-for c in raw_users_count:
-	if c.isdigit(): users_count += c
-
-product_info = soup.find("div", class_="section product_details").find("div", class_="details side_details")
-developer = product_info.find("li", class_="summary_detail developer").find("span", class_="data").text
-genre = product_info.find("li", class_="summary_detail product_genre").find("span", class_="data").text
-rating = product_info.find("li", class_="summary_detail product_rating").find("span", class_="data").text
-
-print "URL: " + url.geturl()
-print "Title: " + title.strip()
-print "Platform: " + platform.strip()
-print "Publisher: " + publisher.strip()
-print "Release Date: " + release_date.strip()
-print "Critic Score: " + critic_score.strip() + "/" + critic_outof + " (" + critic_count + " critics)"
-print "User Score: " + users_score.strip() + " (" + users_count + " users)"
-print "Developer: " + developer.strip()
-print "Genre: " + genre.strip()
-print "Rating: " + rating.strip()
+from MetaCriticScraper import MetaCriticScraper
+import time
+# Do we want to handle movies, tv shows, etc...?
+# FIXME: Handle URL better
+
+start_time = time.time()
+scraper = MetaCriticScraper(sys.argv[1])
+elapsed_time = time.time() - start_time
+print "URL: " + scraper.game['url']
+print "Title: " + scraper.game['title']
+print "Platform: " + scraper.game['platform']
+print "Publisher: " + scraper.game['publisher']
+print "Release Date: " + scraper.game['release_date']
+print "Critic Score: " + scraper.game['critic_score'] + "/" + scraper.game['critic_outof'] + " (" + scraper.game['critic_count'] + " critics)"
+print "User Score: " + scraper.game['user_score'] + " (" + scraper.game['user_count'] + " users)"
+print "Developer: " + scraper.game['developer']
+print "Genre: " + scraper.game['genre']
+print "Rating: " + scraper.game['rating']
+print "Time to scrape: ", round(elapsed_time, 2), "secs"