Updates for 2016 NCAA site #2

Open · wants to merge 10 commits into master
9 changes: 6 additions & 3 deletions create_ind_stats.py
@@ -43,6 +43,9 @@
 # Parse the stats tables
 team_stats_total = [] # Create an empty list for storing the team stats
 alphanum = re.compile(r'[^\w\s:]+')
+
+extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
 for value, game in enumerate(game_mapping): # For each game in our dictionary
     if scrapersettings.debugmode == 1: print "Processing game " + str(game) + " (" + str(value+1) + " of " + str(len(game_mapping)) + ")"
     game_url = game_mapping[game][4]
@@ -51,7 +54,7 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    game_page_data_soup = BeautifulSoup(game_page_data)
+    game_page_data_soup = BeautifulSoup(game_page_data,"html.parser")
     neutral = game_mapping[game][3]
     tables = game_page_data_soup.findAll('table', class_='mytable')
     headertable = tables[0]
@@ -62,7 +65,7 @@
     away_team_header = headertable.findAll('tr')[1]
     tds = away_team_header.findAll('td')
     try:
-        away_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+        away_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
     except:
         away_team = 0
     try:
@@ -75,7 +78,7 @@
     home_team_header = headertable.findAll('tr')[2]
     tds = home_team_header.findAll('td')
     try:
-        home_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+        home_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
     except:
         home_team = 0
     try:
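The two substantive changes in this file are pinning BeautifulSoup to the "html.parser" backend (so parsing no longer depends on whichever parser happens to be installed) and swapping the old query-string split for the shared regex helper. A minimal sketch of why the split broke, using illustrative pre-2016 and 2016 style URLs and the 12260 year index; none of these values were read from a live page:

import re

old_href = "/team/index/12260?org_id=305"  # pre-2016 style: team ID lives in the query string
new_href = "/team/305/12260"               # 2016 style: team ID lives in the path

# Old approach: grab everything after the last "=". This finds nothing
# useful in the new path-style URL, which contains no "=" at all.
print old_href.split('=')[-1]              # -> "305"
print new_href.split('=')[-1]              # -> "/team/305/12260" (whole string)

# New approach: the compiled pattern from get_regex_extractTeamID()
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + "12260")
match = extractTeamID.match(new_href)
if match:
    print match.group(1)                   # -> "305"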
10 changes: 7 additions & 3 deletions create_player_mappings_and_agg_stats.py
@@ -41,10 +41,14 @@
 team_stats_total = []
 for value, team in enumerate(team_mapping): # For each team in our dictionary
     if scrapersettings.debugmode == 1: print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
-    roster_url = str(scrapersettings.domain_base) + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + str(scrapersettings.year_index)
+    roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
     team_name = team_mapping[team][0]
-    roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
-    roster_page_data_soup = BeautifulSoup(roster_page_data)
+    try:
+        roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
+    except:
+        print "Error getting data. Moving on to next team."
+        continue
+    roster_page_data_soup = BeautifulSoup(roster_page_data,"html.parser")
     stat_grid = roster_page_data_soup.select('#stat_grid')

 # Get Player Data
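Beyond the parser pin, this file gets the new path-style roster URL and a try/except so one unreachable team page no longer aborts the whole run. A hedged sketch of the URL change; the host, team ID, and year index below are illustrative assumptions, not values read from the site:

team = "305"           # illustrative team ID
year_index = "12260"   # illustrative year index
domain_base = "http://stats.ncaa.org"  # assumed value of scrapersettings.domain_base

old_url = domain_base + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + year_index
new_url = domain_base + "/team/" + team + "/stats/" + year_index
print new_url  # -> http://stats.ncaa.org/team/305/stats/12260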
11 changes: 7 additions & 4 deletions create_schedule_mappings.py
@@ -12,6 +12,7 @@
 import scrapersettings
 import csv
 from bs4 import BeautifulSoup
+import re

 if (scrapersettings.map_schedule == 1):
     print "Generating schedule mappings"
@@ -22,7 +23,9 @@
     # Grab data
     # Parse our mappings file to get our list of teams
     team_mapping = scraperfunctions.get_team_mappings()

+    extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
     # Create the schedule
     schedule_list = [] # Create an empty list for storing all of our games
     for value, team in enumerate(team_mapping): # For each team in our dictionary
@@ -32,13 +35,13 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data) # Soupify that page
+    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data,"html.parser") # Soupify that page
     gamelinks = [] # Create a blank list for each game
     for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document
         if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
             game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them
             try:
-                opponent_id = link.find_previous("td").find_previous("td").find("a").get('href').split("?org_id=")[1]
+                opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1))
             except:
                 opponent_id = 0
             opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip()
@@ -56,7 +59,7 @@
             date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game
             game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits)
             schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list
-
+
     schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in schedule_dict: # For each item on that list
         schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file
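The opponent lookup still walks two table cells back from each game link; only the ID extraction changed. A self-contained sketch of that navigation against a made-up row shaped roughly like the NCAA schedule table (the real markup may differ):

import re
from bs4 import BeautifulSoup

row = ('<table><tr><td>11/14/2015</td>'
       '<td><a href="/team/305/12260">Opponent U.</a></td>'
       '<td><a href="/game/index/3851842">W 71-62</a></td></tr></table>')
soup = BeautifulSoup(row, "html.parser")
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + "12260")  # as returned by the new helper

link = soup.find("a", href=re.compile("game/index/"))        # the game link
href = link.find_previous("td").find_previous("td").find("a").get("href")
match = extractTeamID.match(href)
opponent_id = match.group(1) if match else 0
print opponent_id  # -> "305"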
11 changes: 8 additions & 3 deletions create_team_mappings.py
@@ -10,6 +10,7 @@
 # Import modules and libraries
 import scraperfunctions
 import scrapersettings
+import re
 from bs4 import BeautifulSoup

 if (scrapersettings.map_teams == 1):
@@ -21,12 +22,16 @@
     # Grab data
     # Download the page with the list of teams
     teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
-    teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data
+    teamlist_data_soup = BeautifulSoup(teamlist_data,"html.parser") # Soupify that data
+
+    extractTeamID = scraperfunctions.get_regex_extractTeamID()

     # Create a mapping for teams
     for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
-        if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages)
-            team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL
+        linkMatch = extractTeamID.match(link.get('href')) # Test whether the hyperlink matches a team-page URL
+        if linkMatch: # If it does, parse onward
+            team_id = linkMatch.group(1) # Get the team ID from the URL
             team_name = str(link.get_text()) # Get the text associated with the hyperlink
             team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain
             team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add lines to our TSV file for archival purposes
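The rewritten loop no longer builds a "team/index/<year_index>?org_id=" needle for substring tests; the one compiled pattern both recognizes a team link and captures its ID. A minimal sketch with made-up hrefs and an illustrative year index:

import re

year_index = "12260"  # illustrative
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + year_index)

for href in ["/team/305/" + year_index, "/rankings", "/team/12/" + year_index]:
    linkMatch = extractTeamID.match(href)
    if linkMatch:
        print linkMatch.group(1)  # prints "305", then "12"; the non-team link is skipped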
7 changes: 6 additions & 1 deletion scraperfunctions.py
@@ -94,4 +94,9 @@ def get_game_mappings():
     game_map = open(scrapersettings.schedule_mappingfile, "rb")
     game_map = game_map.readlines()[1:]
     game_map = dict([(var.split("\t")[0], (var.split("\t")[1], var.split("\t")[2], var.split("\t")[3], var.split("\t")[4], var.split("\t")[5].strip("\n"))) for var in game_map])
-    return(game_map)
\ No newline at end of file
+    return(game_map)
+
+def get_regex_extractTeamID():
+    return re.compile(r'\/team\/([0-9]+)\/' + str(scrapersettings.year_index))
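Centralizing the pattern in get_regex_extractTeamID() means every script compiles the same expression once per run instead of re-deriving string needles in four places. One caveat worth noting: pattern.match() anchors at the start of the string, so it assumes site-relative hrefs like "/team/305/12260" (consistent with the URLs elsewhere in this diff); an absolute URL would need search() instead. A short sketch, with an illustrative year index:

import re

pattern = re.compile(r'\/team\/([0-9]+)\/' + "12260")
print pattern.match("/team/305/12260").group(1)                        # -> "305"
print pattern.match("http://stats.ncaa.org/team/305/12260") is None    # -> True
print pattern.search("http://stats.ncaa.org/team/305/12260").group(1)  # -> "305"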
11 changes: 9 additions & 2 deletions scrapersettings.py
@@ -9,9 +9,16 @@
 ##############################################################

 # Select year for parsing
-academic_year = "2014" # Set the academic year (2012 refers to 2011-2012 season). As of writing, this can range from 2010 to 2013.
-year_index = "11540" # Set the index that maps to the academic year. This may be obtained from looking at the team URLs on the list of available teams, for the given academic year. As of writing, the [academic_year, year_index] mappings are: [2013, 11220], [2012, 10740], [2011, 10440], and [2010, 10260]
+academic_year = "2016" # Set the academic year (2016 refers to the 2015-2016 season). As of writing, this can range from 2010 to 2016.
+
+# Set the index that maps to the academic year. This can be obtained from the team URLs on the list of available teams for the given academic year. As of writing, the [academic_year, year_index] mappings are contained in this dictionary:
+yearIndexDict = {"2015":"12020","2014":"11540","2013":"11220","2012":"10740","2011":"10440","2010":"10260"}
+# You can add new academic_year/year_index mappings by copying the line below and substituting new values:
+yearIndexDict.update({"2016":"12260"})
+
+# No need to modify this line any more; just update the dictionary and the right year_index will be used based on academic_year
+year_index = yearIndexDict[academic_year]

 # What do you want to do? (Note: Lower tiers need higher tiers, i.e., ind_game_stats requires map_players (Tier 2), which requires map_teams (Tier 1).)
 map_teams = 1 # Create a team mapping (0 = no, 1 = yes) -- TIER 1
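With the dictionary in place, bumping the scraper to a new season is a one-line change, and an unmapped academic_year fails fast with a KeyError instead of silently scraping the wrong season. A small usage sketch; the 2017 index below is a placeholder, not a real value:

yearIndexDict = {"2016": "12260", "2015": "12020", "2014": "11540"}

academic_year = "2016"
year_index = yearIndexDict[academic_year]  # -> "12260"

# Adding a future season is one update() call (placeholder index, not real):
yearIndexDict.update({"2017": "99999"})
print yearIndexDict["2017"]                # -> "99999"

academic_year = "2009"
# yearIndexDict[academic_year] would raise KeyError: '2009'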