From 7ebd4ccaa0a6758bc2f7918f69ec86e9cb24232d Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:45:20 -0600
Subject: [PATCH 01/10] Reworked so year_index is pulled from a dictionary

The user can now update the code for a new academic year by adding a
line that defines the academic_year/year_index mapping in the
dictionary.
---
 scrapersettings.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/scrapersettings.py b/scrapersettings.py
index 342b056..b1d2f02 100644
--- a/scrapersettings.py
+++ b/scrapersettings.py
@@ -9,8 +9,12 @@
 ##############################################################
 
 # Select year for parsing
-academic_year = "2014" # Set the academic year (2012 refers to 2011-2012 season). As of writing, this can range from 2010 to 2013.
-year_index = "11540" # Set the index that maps to the academic year. This may be obtained from looking at the team URLs on the list of available teams, for the given academic year. As of writing, the [academic_year, year_index] mappings are: [2013, 11220], [2012, 10740], [2011, 10440], and [2010, 10260]
+academic_year = "2016" # Set the academic year (2012 refers to the 2011-2012 season). As of writing, this can range from 2010 to 2016.
+
+# Set the index that maps to the academic year. This may be obtained by looking at the team URLs on the list of available teams for the given academic year. As of writing, the [academic_year, year_index] mappings are contained in this dictionary:
+yearIndexDict = {"2015":"12020","2014":"11540","2013":"11220","2012":"10740","2011":"10440","2010":"10260"}
+# You can add a new academic_year/year_index mapping by copying the line below and updating the values:
+yearIndexDict.update({"2016":"12260"})
 
 # What do you want to do? (Note: Lower tiers need higher tiers, i.e., ind_game_stats requires map_players (Tier 2), which requires map_teams (Tier 1).)
@@ -38,6 +42,7 @@
 #### The variables below could be set, but probably don't need any modification #####
 debugmode = 1 # Output program steps (0 = off, 1 = on)
 params = { } # Any POST parameters that need to be sent (default)
+year_index = yearIndexDict[academic_year]
 http_header = {
 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0",
 "Accept": "text/plain, */*; q=0.01",
@@ -52,4 +57,7 @@
 } # Variables from the HTTP header (default)
 start_url = 'http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&academic_year=' + str(academic_year) + "&division=1" # URL to start from (Change this for different years). You can get this URL from http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&division=1. This URL is for the 2011-2012 season.
-domain_base = 'http://stats.ncaa.org' # Base domain
\ No newline at end of file
+
+print "start_url=",start_url
+
+domain_base = 'http://stats.ncaa.org' # Base domain
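
Taken on its own, the change above is just a dictionary lookup. A minimal sketch of the idea (Python 2, to match the scraper; the index values are copied from the patch, and the KeyError guard is an illustrative extra, not part of the patch):

    # Season-to-index lookup as introduced in PATCH 01
    yearIndexDict = {"2015": "12020", "2014": "11540", "2013": "11220",
                     "2012": "10740", "2011": "10440", "2010": "10260"}
    yearIndexDict.update({"2016": "12260"})  # one line per new season

    academic_year = "2016"
    try:
        year_index = yearIndexDict[academic_year]
    except KeyError:
        # illustrative guard, not part of the patch
        raise SystemExit("No year_index for " + academic_year + "; add it to yearIndexDict")
    print "year_index =", year_index  # -> year_index = 12260
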
From 9638914dc25cf83f01483d28e3f6684a366d3aaa Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:47:17 -0600
Subject: [PATCH 02/10] Tweaked for new NCAA site format in 2016

---
 create_player_mappings_and_agg_stats.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/create_player_mappings_and_agg_stats.py b/create_player_mappings_and_agg_stats.py
index 2495500..cb1ab76 100644
--- a/create_player_mappings_and_agg_stats.py
+++ b/create_player_mappings_and_agg_stats.py
@@ -41,10 +41,10 @@
 team_stats_total = []
 for value, team in enumerate(team_mapping): # For each team in our dictionary
     if scrapersettings.debugmode == 1: print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
-    roster_url = str(scrapersettings.domain_base) + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + str(scrapersettings.year_index)
+    roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
     team_name = team_mapping[team][0]
     roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
-    roster_page_data_soup = BeautifulSoup(roster_page_data)
+    roster_page_data_soup = BeautifulSoup(roster_page_data,"html.parser")
     stat_grid = roster_page_data_soup.select('#stat_grid')
 
 # Get Player Data
@@ -160,4 +160,4 @@ if (scrapersettings.map_players == 1):
     player_dict = dict([(case[0], (case[1:])) for case in player_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in player_dict: # For each item on that list
-        player_mappingfile_w.writelines(str(item) + "\t" + player_dict[item][1] + "\t" + player_dict[item][0] + "\n")
\ No newline at end of file
+        player_mappingfile_w.writelines(str(item) + "\t" + player_dict[item][1] + "\t" + player_dict[item][0] + "\n")
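
The substantive change here is the URL shape: the 2016 site redesign replaced the old query-string route with a path-style route. A sketch comparing the two (the team ID 697 is a made-up example; 12260 is the 2016 index from PATCH 01):

    domain_base = "http://stats.ncaa.org"
    team = "697"          # hypothetical team/org ID
    year_index = "12260"  # 2015-16 season, per yearIndexDict

    old_url = domain_base + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + year_index
    new_url = domain_base + "/team/" + team + "/stats/" + year_index
    print new_url  # -> http://stats.ncaa.org/team/697/stats/12260
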
From 51ad5ea04e1506f174654e2c55f61b64edba2072 Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:52:35 -0600
Subject: [PATCH 03/10] Pass "html.parser" to make BeautifulSoup happy

Gets rid of the warning BeautifulSoup throws when no parser is
specified.
---
 create_ind_stats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/create_ind_stats.py b/create_ind_stats.py
index f346e5e..0a22b22 100644
--- a/create_ind_stats.py
+++ b/create_ind_stats.py
@@ -51,7 +51,7 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    game_page_data_soup = BeautifulSoup(game_page_data)
+    game_page_data_soup = BeautifulSoup(game_page_data,"html.parser")
     neutral = game_mapping[game][3]
     tables = game_page_data_soup.findAll('table', class_='mytable')
     headertable = tables[0]
@@ -514,4 +514,4 @@
         writeline += "\n"
         team_data_w.writelines(writeline)
-    print "Successfully generated individual statistics for players and/or teams"
\ No newline at end of file
+    print "Successfully generated individual statistics for players and/or teams"

From c80c90b1c61b65b28f4f71e7d80d551858128862 Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:53:51 -0600
Subject: [PATCH 04/10] Pass "html.parser" to make BeautifulSoup happy

Gets rid of the warning BeautifulSoup throws when no parser is
specified.
---
 create_schedule_mappings.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/create_schedule_mappings.py b/create_schedule_mappings.py
index ba1630e..9abcaa1 100644
--- a/create_schedule_mappings.py
+++ b/create_schedule_mappings.py
@@ -32,7 +32,7 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data) # Soupify that page
+    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data,"html.parser") # Soupify that page
     gamelinks = [] # Create a blank list for each game
     for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document
         if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
@@ -56,8 +56,8 @@
             date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game
             game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits)
             schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list
-
+
     schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in schedule_dict: # For each item on that list
         schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file
-    print "Successfully generated schedule mappings"
\ No newline at end of file
+    print "Successfully generated schedule mappings"
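
Both patches make the same one-argument change. Without an explicit parser, bs4 warns and falls back to the best parser installed locally, so two machines can parse the same page differently; naming "html.parser" pins the behavior to the stdlib parser. A self-contained illustration with a toy document:

    from bs4 import BeautifulSoup

    html = "<table id='stat_grid'><tr><td>42</td></tr></table>"
    soup = BeautifulSoup(html, "html.parser")  # explicit parser: no warning, no extra dependencies
    print soup.select("#stat_grid")[0].td.get_text()  # -> 42
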
From 57ef7c6c43de689ce3814fa516c7cc0dfa4b5bde Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:55:29 -0600
Subject: [PATCH 05/10] Tweaked for new NCAA site format in 2016

URL formats changed, so the team ID is now parsed with a regex. Also
gets rid of the warning BeautifulSoup throws when no parser is
specified.
---
 create_team_mappings.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/create_team_mappings.py b/create_team_mappings.py
index e4afe95..a26fd9e 100644
--- a/create_team_mappings.py
+++ b/create_team_mappings.py
@@ -10,6 +10,7 @@
 # Import modules and libraries
 import scraperfunctions
 import scrapersettings
+import re
 from bs4 import BeautifulSoup
 
 if (scrapersettings.map_teams == 1):
@@ -21,12 +22,14 @@
     # Grab data
     # Download the page with the list of teams
     teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
-    teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data
+    teamlist_data_soup = BeautifulSoup(teamlist_data,"html.parser") # Soupify that data
 
     # Create a mapping for teams
     for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
-        if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages)
-            team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL
+
+        linkMatch = re.match(r'/team\/([0-9]+)\/' + str(scrapersettings.year_index),link.get('href')) # Match team links of the form /team/<id>/<year_index>
+        if linkMatch: # If it matches, parse onward
+            team_id = linkMatch.group(1) # Get the team ID from the URL
             team_name = str(link.get_text()) # Get the text associated with the hyperlink
             team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain
             team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add lines to our TSV file for archival purposes
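
The new regex is easiest to sanity-check in isolation. A small sketch (both hrefs are invented examples of the new and old link shapes):

    import re

    year_index = "12260"
    extractTeamID = re.compile(r'/team/([0-9]+)/' + year_index)

    new_style = "/team/697/12260"               # the 2016 path-style link: matches
    old_style = "/team/index/12260?org_id=697"  # the pre-2016 shape: no longer matches
    for href in (new_style, old_style):
        m = extractTeamID.match(href)
        print href, "->", m.group(1) if m else None
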
- domain_base = 'http://stats.ncaa.org' # Base domain

From fb59c4a4d0179b19d67fb60bd8a1677116bafab8 Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 11:59:38 -0600
Subject: [PATCH 06/10] Remove leftover debug print of start_url

Drops the temporary "print start_url" line added in PATCH 01 while
reworking the year_index lookup.
---
 scrapersettings.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scrapersettings.py b/scrapersettings.py
index b1d2f02..086fa3c 100644
--- a/scrapersettings.py
+++ b/scrapersettings.py
@@ -58,6 +58,4 @@
 start_url = 'http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&academic_year=' + str(academic_year) + "&division=1" # URL to start from (Change this for different years). You can get this URL from http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&division=1. This URL is for the 2011-2012 season.
 
-print "start_url=",start_url
-
 domain_base = 'http://stats.ncaa.org' # Base domain

From 3c65d2c855a61730c73d093c1a93deed83f0b0d4 Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 12:00:39 -0600
Subject: [PATCH 07/10] Remove stray blank line left by the debug print

---
 scrapersettings.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scrapersettings.py b/scrapersettings.py
index 086fa3c..569c0c5 100644
--- a/scrapersettings.py
+++ b/scrapersettings.py
@@ -57,5 +57,4 @@
 } # Variables from the HTTP header (default)
 start_url = 'http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&academic_year=' + str(academic_year) + "&division=1" # URL to start from (Change this for different years). You can get this URL from http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&division=1. This URL is for the 2011-2012 season.
-
 domain_base = 'http://stats.ncaa.org' # Base domain

From 85806e732da6de38c472a973f9a5d6f07a3321ea Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 14 Feb 2016 12:03:40 -0600
Subject: [PATCH 08/10] Strip trailing whitespace in create_schedule_mappings.py

---
 create_schedule_mappings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/create_schedule_mappings.py b/create_schedule_mappings.py
index 9abcaa1..7d9105a 100644
--- a/create_schedule_mappings.py
+++ b/create_schedule_mappings.py
@@ -56,7 +56,7 @@
             date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game
             game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits)
             schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list
-
+
     schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in schedule_dict: # For each item on that list
         schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file
From 5c4c7ed4b2f564a6bd483cf6d9d2b903e07e66ec Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 21 Feb 2016 08:53:56 -0600
Subject: [PATCH 09/10] Add try/except when getting data to catch errors

---
 create_player_mappings_and_agg_stats.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/create_player_mappings_and_agg_stats.py b/create_player_mappings_and_agg_stats.py
index cb1ab76..e3ca591 100644
--- a/create_player_mappings_and_agg_stats.py
+++ b/create_player_mappings_and_agg_stats.py
@@ -43,7 +43,11 @@
     if scrapersettings.debugmode == 1: print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
     roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
     team_name = team_mapping[team][0]
-    roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
+    try:
+        roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
+    except:
+        print "Error getting data. Moving on to next team."
+        continue
     roster_page_data_soup = BeautifulSoup(roster_page_data,"html.parser")
     stat_grid = roster_page_data_soup.select('#stat_grid')
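
The guard reduces to a simple shape: wrap the fetch, report, and move on rather than crash the run. The sketch below substitutes urllib2 for scraperfunctions.grabber (whose internals are not shown in this series), and the team IDs are hypothetical, so treat it as an approximation:

    import urllib2

    def fetch(url):
        # Return the page body, or None if anything goes wrong
        try:
            return urllib2.urlopen(url, timeout=10).read()
        except Exception, e:  # network error, HTTP error, timeout, ...
            print "Error getting data (" + str(e) + "). Moving on to next team."
            return None

    for team in ["697", "721"]:  # hypothetical team IDs
        page = fetch("http://stats.ncaa.org/team/" + team + "/stats/12260")
        if page is None:
            continue
        print "Fetched", len(page), "bytes for team", team
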
From d492923c78171155e0d38255556dc42956112801 Mon Sep 17 00:00:00 2001
From: numericOverflow
Date: Sun, 21 Feb 2016 14:28:36 -0600
Subject: [PATCH 10/10] Various updates to work with new NCAA site format

Reworked so year_index is pulled from a dictionary, added some extra
try/excepts, and fixed the team ID regex that was not working with the
new site format.
---
 create_ind_stats.py                     |  9 ++++++---
 create_player_mappings_and_agg_stats.py |  2 +-
 create_schedule_mappings.py             | 11 +++++++----
 create_team_mappings.py                 |  4 +++-
 scraperfunctions.py                     |  7 ++++++-
 scrapersettings.py                      |  6 ++++--
 6 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/create_ind_stats.py b/create_ind_stats.py
index 0a22b22..f6de79d 100644
--- a/create_ind_stats.py
+++ b/create_ind_stats.py
@@ -43,6 +43,9 @@
     # Parse the stats tables
     team_stats_total = [] # Create an empty list for storing the team stats
     alphanum = re.compile(r'[^\w\s:]+')
+
+    extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
     for value, game in enumerate(game_mapping): # For each game in our dictionary
         if scrapersettings.debugmode == 1: print "Processing game " + str(game) + " (" + str(value+1) + " of " + str(len(game_mapping)) + ")"
         game_url = game_mapping[game][4]
@@ -62,7 +65,7 @@
         away_team_header = headertable.findAll('tr')[1]
         tds = away_team_header.findAll('td')
         try:
-            away_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+            away_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
         except:
             away_team = 0
         try:
@@ -75,7 +78,7 @@
         home_team_header = headertable.findAll('tr')[2]
         tds = home_team_header.findAll('td')
         try:
-            home_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+            home_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
         except:
             home_team = 0
         try:
@@ -514,4 +517,4 @@
         writeline += "\n"
         team_data_w.writelines(writeline)
-    print "Successfully generated individual statistics for players and/or teams"
+    print "Successfully generated individual statistics for players and/or teams"
\ No newline at end of file

diff --git a/create_player_mappings_and_agg_stats.py b/create_player_mappings_and_agg_stats.py
index e3ca591..95e21b3 100644
--- a/create_player_mappings_and_agg_stats.py
+++ b/create_player_mappings_and_agg_stats.py
@@ -164,4 +164,4 @@ if (scrapersettings.map_players == 1):
     player_dict = dict([(case[0], (case[1:])) for case in player_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in player_dict: # For each item on that list
-        player_mappingfile_w.writelines(str(item) + "\t" + player_dict[item][1] + "\t" + player_dict[item][0] + "\n")
+        player_mappingfile_w.writelines(str(item) + "\t" + player_dict[item][1] + "\t" + player_dict[item][0] + "\n")
\ No newline at end of file

diff --git a/create_schedule_mappings.py b/create_schedule_mappings.py
index 7d9105a..f21cbff 100644
--- a/create_schedule_mappings.py
+++ b/create_schedule_mappings.py
@@ -12,6 +12,7 @@
 import scrapersettings
 import csv
 from bs4 import BeautifulSoup
+import re
 
 if (scrapersettings.map_schedule == 1):
     print "Generating schedule mappings"
@@ -22,7 +23,9 @@
     # Grab data
     # Parse our mappings file to get our list of teams
     team_mapping = scraperfunctions.get_team_mappings()
-
+
+    extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
     # Create the schedule
     schedule_list = [] # Create an empty list for storing all of our games
    for value, team in enumerate(team_mapping): # For each team in our dictionary
@@ -38,7 +41,7 @@
         if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
             game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them
             try:
-                opponent_id = link.find_previous("td").find_previous("td").find("a").get('href').split("?org_id=")[1]
+                opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1))
             except:
                 opponent_id = 0
             opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip()
@@ -56,8 +59,8 @@
             date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game
             game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits)
             schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list
-
+
     schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in schedule_dict: # For each item on that list
         schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file
-    print "Successfully generated schedule mappings"
+    print "Successfully generated schedule mappings"
\ No newline at end of file

diff --git a/create_team_mappings.py b/create_team_mappings.py
index a26fd9e..9a4a5af 100644
--- a/create_team_mappings.py
+++ b/create_team_mappings.py
@@ -24,10 +24,12 @@
     teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
     teamlist_data_soup = BeautifulSoup(teamlist_data,"html.parser") # Soupify that data
 
+    extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
     # Create a mapping for teams
     for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
 
-        linkMatch = re.match(r'/team\/([0-9]+)\/' + str(scrapersettings.year_index),link.get('href')) # Match team links of the form /team/<id>/<year_index>
+        linkMatch = extractTeamID.match(link.get('href')) # Match team links of the form /team/<id>/<year_index>
         if linkMatch: # If it matches, parse onward
             team_id = linkMatch.group(1) # Get the team ID from the URL
             team_name = str(link.get_text()) # Get the text associated with the hyperlink

diff --git a/scraperfunctions.py b/scraperfunctions.py
index 8a1e3bf..b202fda 100644
--- a/scraperfunctions.py
+++ b/scraperfunctions.py
@@ -94,4 +94,9 @@ def get_game_mappings():
     game_map = open(scrapersettings.schedule_mappingfile, "rb")
     game_map = game_map.readlines()[1:]
     game_map = dict([(var.split("\t")[0], (var.split("\t")[1], var.split("\t")[2], var.split("\t")[3], var.split("\t")[4], var.split("\t")[5].strip("\n"))) for var in game_map])
-    return(game_map)
\ No newline at end of file
+    return(game_map)
+
+def get_regex_extractTeamID():
+    return re.compile(r'\/team\/([0-9]+)\/' + str(scrapersettings.year_index))
+
\ No newline at end of file
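
Centralizing the compiled pattern in scraperfunctions means every script matches team links the same way; one thing to verify when applying the patch is that scraperfunctions.py imports re, since the new helper needs it. A standalone sketch of the helper, with year_index passed in rather than read from scrapersettings:

    import re

    def get_regex_extractTeamID(year_index):
        # Compile once, reuse for every href
        return re.compile(r'/team/([0-9]+)/' + str(year_index))

    extractTeamID = get_regex_extractTeamID("12260")
    for href in ["/team/697/12260", "/game/index/400123", "/teams/history"]:
        m = extractTeamID.match(href)
        print href, "->", m.group(1) if m else None
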
diff --git a/scrapersettings.py b/scrapersettings.py
index 569c0c5..16ccc56 100644
--- a/scrapersettings.py
+++ b/scrapersettings.py
@@ -17,6 +17,9 @@
 yearIndexDict.update({"2016":"12260"})
 
+# No need to modify this line anymore; just update the dictionary and the right year_index will be used based on academic_year
+year_index = yearIndexDict[academic_year]
+
 # What do you want to do? (Note: Lower tiers need higher tiers, i.e., ind_game_stats requires map_players (Tier 2), which requires map_teams (Tier 1).)
 map_teams = 1 # Create a team mapping (0 = no, 1 = yes) -- TIER 1
 map_schedule = 1 # Create schedule mapping (0 = no, 1 = yes)
@@ -42,7 +45,6 @@
 #### The variables below could be set, but probably don't need any modification #####
 debugmode = 1 # Output program steps (0 = off, 1 = on)
 params = { } # Any POST parameters that need to be sent (default)
-year_index = yearIndexDict[academic_year]
 http_header = {
 "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0",
 "Accept": "text/plain, */*; q=0.01",
@@ -57,4 +59,4 @@
 } # Variables from the HTTP header (default)
 start_url = 'http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&academic_year=' + str(academic_year) + "&division=1" # URL to start from (Change this for different years). You can get this URL from http://stats.ncaa.org/team/inst_team_list?sport_code=MBB&division=1. This URL is for the 2011-2012 season.
 
-domain_base = 'http://stats.ncaa.org' # Base domain
+domain_base = 'http://stats.ncaa.org' # Base domain
\ No newline at end of file
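
Pulling the series together: the settings file now derives everything from academic_year. A condensed sketch of the final flow, with values copied from the patches above:

    academic_year = "2016"
    yearIndexDict = {"2016": "12260", "2015": "12020", "2014": "11540"}
    year_index = yearIndexDict[academic_year]

    start_url = ('http://stats.ncaa.org/team/inst_team_list?sport_code=MBB'
                 '&academic_year=' + academic_year + '&division=1')
    print start_url   # the season's team list
    print year_index  # appended to /team/<id>/stats/ URLs by the other scripts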