diff --git a/create_ind_stats.py b/create_ind_stats.py index f346e5e..f6de79d 100644 --- a/create_ind_stats.py +++ b/create_ind_stats.py @@ -43,6 +43,9 @@ # Parse the stats tables team_stats_total = [] # Create an empty list for storing the team stats alphanum = re.compile(r'[^\w\s:]+') + + extractTeamID = scraperfunctions.get_regex_extractTeamID() + for value, game in enumerate(game_mapping): # For each game in our dictionary if scrapersettings.debugmode == 1: print "Processing game " + str(game) + " (" + str(value+1) + " of " + str(len(game_mapping)) + ")" game_url = game_mapping[game][4] @@ -51,7 +54,7 @@ except: print "Error getting data. Moving on to next game." continue - game_page_data_soup = BeautifulSoup(game_page_data) + game_page_data_soup = BeautifulSoup(game_page_data,"html.parser") neutral = game_mapping[game][3] tables = game_page_data_soup.findAll('table', class_='mytable') headertable = tables[0] @@ -62,7 +65,7 @@ away_team_header = headertable.findAll('tr')[1] tds = away_team_header.findAll('td') try: - away_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip()) + away_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1)) except: away_team = 0 try: @@ -75,7 +78,7 @@ home_team_header = headertable.findAll('tr')[2] tds = home_team_header.findAll('td') try: - home_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip()) + home_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1)) except: home_team = 0 try: diff --git a/create_player_mappings_and_agg_stats.py b/create_player_mappings_and_agg_stats.py index 2495500..95e21b3 100644 --- a/create_player_mappings_and_agg_stats.py +++ b/create_player_mappings_and_agg_stats.py @@ -41,10 +41,14 @@ team_stats_total = [] for value, team in enumerate(team_mapping): # For each team in our dictionary if scrapersettings.debugmode == 1: print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")" - 
roster_url = str(scrapersettings.domain_base) + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + str(scrapersettings.year_index) + roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index) team_name = team_mapping[team][0] - roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team - roster_page_data_soup = BeautifulSoup(roster_page_data) + try: + roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team + except: + print "Error getting data. Moving on to next game." + continue + roster_page_data_soup = BeautifulSoup(roster_page_data,"html.parser") stat_grid = roster_page_data_soup.select('#stat_grid') # Get Player Data diff --git a/create_schedule_mappings.py b/create_schedule_mappings.py index ba1630e..f21cbff 100644 --- a/create_schedule_mappings.py +++ b/create_schedule_mappings.py @@ -12,6 +12,7 @@ import scrapersettings import csv from bs4 import BeautifulSoup +import re if (scrapersettings.map_schedule == 1): print "Generating schedule mappings" @@ -22,7 +23,9 @@ # Grab data # Parse our mappings file to get our list of teams team_mapping = scraperfunctions.get_team_mappings() - + + extractTeamID = scraperfunctions.get_regex_extractTeamID() + # Create the schedule schedule_list = [] # Create an empty list for storing all of our games for value, team in enumerate(team_mapping): # For each team in our dictionary @@ -32,13 +35,13 @@ except: print "Error getting data. Moving on to next game." 
continue - team_mainpage_data_soup = BeautifulSoup(team_mainpage_data) # Soupify that page + team_mainpage_data_soup = BeautifulSoup(team_mainpage_data,"html.parser") # Soupify that page gamelinks = [] # Create a blank list for each game for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game... game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them try: - opponent_id = link.find_previous("td").find_previous("td").find("a").get('href').split("?org_id=")[1] + opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1)) except: opponent_id = 0 opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip() @@ -56,7 +59,7 @@ date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits) schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list - + schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries for item in schedule_dict: # For each item on that list schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file diff --git a/create_team_mappings.py b/create_team_mappings.py index e4afe95..9a4a5af 100644 --- a/create_team_mappings.py +++ b/create_team_mappings.py @@ -10,6 +10,7 @@ # Import modules and libraries import scraperfunctions import scrapersettings +import re from bs4 
import BeautifulSoup if (scrapersettings.map_teams == 1): @@ -21,12 +22,16 @@ # Grab data # Download the page with the list of teams teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page - teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data + teamlist_data_soup = BeautifulSoup(teamlist_data,"html.parser") # Soupify that data + extractTeamID = scraperfunctions.get_regex_extractTeamID() + # Create a mapping for teams for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page - if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages) - team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL + + linkMatch = extractTeamID.match(link.get('href')) # Match the hyperlink against the team-page URL pattern (limiting it only to team pages) + if linkMatch: # If it does, parse onward + team_id = linkMatch.group(1) # Get the team ID from the URL team_name = str(link.get_text()) # Get the text associated with the hyperlink team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add lines to our TSV file for archival purposes diff --git a/scraperfunctions.py b/scraperfunctions.py index 8a1e3bf..b202fda 100644 --- a/scraperfunctions.py +++ b/scraperfunctions.py @@ -94,4 +94,9 @@ def get_game_mappings(): game_map = open(scrapersettings.schedule_mappingfile, "rb") game_map = game_map.readlines()[1:] game_map = dict([(var.split("\t")[0], (var.split("\t")[1], var.split("\t")[2], var.split("\t")[3], var.split("\t")[4], var.split("\t")[5].strip("\n"))) for var in game_map]) - return(game_map) \ No newline at end of file + return(game_map) + +def get_regex_extractTeamID(): + return
re.compile(r'\/team\/([0-9]+)\/' + str(scrapersettings.year_index)) + + \ No newline at end of file diff --git a/scrapersettings.py b/scrapersettings.py index 342b056..16ccc56 100644 --- a/scrapersettings.py +++ b/scrapersettings.py @@ -9,9 +9,16 @@ ############################################################## # Select year for parsing -academic_year = "2014" # Set the academic year (2012 refers to 2011-2012 season). As of writing, this can range from 2010 to 2013. -year_index = "11540" # Set the index that maps to the academic year. This may be obtained from looking at the team URLs on the list of available teams, for the given academic year. As of writing, the [academic_year, year_index] mappings are: [2013, 11220], [2012, 10740], [2011, 10440], and [2010, 10260] +academic_year = "2016" # Set the academic year (2012 refers to 2011-2012 season). Valid values are the keys of yearIndexDict below. +# Set the index that maps to the academic year. This may be obtained from looking at the team URLs on the list of available teams, for the given academic year. As of writing, the [academic_year, year_index] mappings are contained in this dictionary: +yearIndexDict = {"2015":"12020","2014":"11540","2013":"11220","2012":"10740","2011":"10440","2010":"10260"} +#You can add new academic_year/year_index mappings by copying & pasting the line below with new mappings: +yearIndexDict.update({"2016":"12260"}) + + +#No need to modify this line any more, just update the dictionary and the right year_index will be used based on academic_year +year_index = yearIndexDict[academic_year] # What do you want to do? (Note: Lower tiers need higher tiers, i.e., ind_game_stats requires map_players (Tier 2), which requires map_teams (Tier 1).) map_teams = 1 # Create a team mapping (0 = no, 1 = yes) -- TIER 1