Updates for 2016 NCAA site #2

Open · wants to merge 10 commits into master
9 changes: 6 additions & 3 deletions create_ind_stats.py
@@ -43,6 +43,9 @@
 # Parse the stats tables
 team_stats_total = [] # Create an empty list for storing the team stats
 alphanum = re.compile(r'[^\w\s:]+')
+
+extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
 for value, game in enumerate(game_mapping): # For each game in our dictionary
     if scrapersettings.debugmode == 1: print "Processing game " + str(game) + " (" + str(value+1) + " of " + str(len(game_mapping)) + ")"
     game_url = game_mapping[game][4]
@@ -51,7 +54,7 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    game_page_data_soup = BeautifulSoup(game_page_data)
+    game_page_data_soup = BeautifulSoup(game_page_data,"html.parser")
     neutral = game_mapping[game][3]
     tables = game_page_data_soup.findAll('table', class_='mytable')
     headertable = tables[0]
@@ -62,7 +65,7 @@
     away_team_header = headertable.findAll('tr')[1]
     tds = away_team_header.findAll('td')
     try:
-        away_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+        away_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
     except:
         away_team = 0
     try:
@@ -75,7 +78,7 @@
     home_team_header = headertable.findAll('tr')[2]
     tds = home_team_header.findAll('td')
     try:
-        home_team = str(tds[0].find('a').get('href').split('=')[-1].encode('utf-8').strip())
+        home_team = str(extractTeamID.match(tds[0].find('a').get('href')).group(1))
     except:
         home_team = 0
     try:
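The two substantive changes in this file are pinning BeautifulSoup to the "html.parser" backend (so parsing no longer depends on whichever parser happens to be installed) and swapping the old query-string split for the shared regex helper. A minimal sketch of why the split broke, using illustrative pre-2016 and 2016 style URLs and the 12260 year index; none of these values were read from a live page:

import re

old_href = "/team/index/12260?org_id=305"  # pre-2016 style: team ID lives in the query string
new_href = "/team/305/12260"               # 2016 style: team ID lives in the path

# Old approach: grab everything after the last "=". This finds nothing
# useful in the new path-style URL, which contains no "=" at all.
print old_href.split('=')[-1]              # -> "305"
print new_href.split('=')[-1]              # -> "/team/305/12260" (whole string)

# New approach: the compiled pattern from get_regex_extractTeamID()
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + "12260")
match = extractTeamID.match(new_href)
if match:
    print match.group(1)                   # -> "305"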
10 changes: 7 additions & 3 deletions create_player_mappings_and_agg_stats.py
@@ -41,10 +41,14 @@
 team_stats_total = []
 for value, team in enumerate(team_mapping): # For each team in our dictionary
     if scrapersettings.debugmode == 1: print "Processing team " + str(team) + " (" + str(value+1) + " of " + str(len(team_mapping)) + ")"
-    roster_url = str(scrapersettings.domain_base) + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + str(scrapersettings.year_index)
+    roster_url = str(scrapersettings.domain_base) + "/team/" + team + "/stats/" + str(scrapersettings.year_index)
     team_name = team_mapping[team][0]
-    roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
-    roster_page_data_soup = BeautifulSoup(roster_page_data)
+    try:
+        roster_page_data = scraperfunctions.grabber(roster_url, scrapersettings.params, scrapersettings.http_header) # Grab the main page for each team
+    except:
+        print "Error getting data. Moving on to next team."
+        continue
+    roster_page_data_soup = BeautifulSoup(roster_page_data,"html.parser")
     stat_grid = roster_page_data_soup.select('#stat_grid')

 # Get Player Data
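Beyond the parser pin, this file gets the new path-style roster URL and a try/except so one unreachable team page no longer aborts the whole run. A hedged sketch of the URL change; the host, team ID, and year index below are illustrative assumptions, not values read from the site:

team = "305"           # illustrative team ID
year_index = "12260"   # illustrative year index
domain_base = "http://stats.ncaa.org"  # assumed value of scrapersettings.domain_base

old_url = domain_base + "/team/stats?org_id=" + team + "&sport_year_ctl_id=" + year_index
new_url = domain_base + "/team/" + team + "/stats/" + year_index
print new_url  # -> http://stats.ncaa.org/team/305/stats/12260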
11 changes: 7 additions & 4 deletions create_schedule_mappings.py
@@ -12,6 +12,7 @@
 import scrapersettings
 import csv
 from bs4 import BeautifulSoup
+import re

 if (scrapersettings.map_schedule == 1):
     print "Generating schedule mappings"
@@ -22,7 +23,9 @@
     # Grab data
     # Parse our mappings file to get our list of teams
     team_mapping = scraperfunctions.get_team_mappings()

+    extractTeamID = scraperfunctions.get_regex_extractTeamID()
+
     # Create the schedule
     schedule_list = [] # Create an empty list for storing all of our games
     for value, team in enumerate(team_mapping): # For each team in our dictionary
@@ -32,13 +35,13 @@
     except:
         print "Error getting data. Moving on to next game."
         continue
-    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data) # Soupify that page
+    team_mainpage_data_soup = BeautifulSoup(team_mainpage_data,"html.parser") # Soupify that page
     gamelinks = [] # Create a blank list for each game
     for link in team_mainpage_data_soup.find_all('a'): # Locate all links in the document
         if "game/index/" in link.get('href'): # If they contain a URL segment suggesting it is a game...
             game_link = str(scrapersettings.domain_base + link.get('href')).split("?")[0] # Strip out any URL variables since we don't need them
             try:
-                opponent_id = link.find_previous("td").find_previous("td").find("a").get('href').split("?org_id=")[1]
+                opponent_id = str(extractTeamID.match(link.find_previous("td").find_previous("td").find("a").get('href')).group(1))
             except:
                 opponent_id = 0
             opponent_text = link.find_previous("td").find_previous("td").get_text().encode('utf-8').strip()
@@ -56,7 +59,7 @@
             date = link.find_previous("td").find_previous("td").find_previous("td").get_text() # Get the date for the game
             game_id = game_link.split("/")[-1] # Get the game ID from the URL (last set of digits)
             schedule_list.append([game_id, home_team, away_team, date, neutral, game_link]) # Append all of this information to our master schedule list
-
+
     schedule_dict = dict([(case[0], (case[1:])) for case in schedule_list]) # Create a dictionary from our list so we don't have any duplicate entries
     for item in schedule_dict: # For each item on that list
         schedule_mappingfile_w.writelines(item + "\t" + str(schedule_dict[item][0]) + "\t" + str(schedule_dict[item][1]) + "\t" + str(schedule_dict[item][2]) + "\t" + str(schedule_dict[item][3]) + "\t" + str(schedule_dict[item][4]) + "\n") # Write to our mapping file
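The opponent lookup still walks two table cells back from each game link; only the ID extraction changed. A self-contained sketch of that navigation against a made-up row shaped roughly like the NCAA schedule table (the real markup may differ):

import re
from bs4 import BeautifulSoup

row = ('<table><tr><td>11/14/2015</td>'
       '<td><a href="/team/305/12260">Opponent U.</a></td>'
       '<td><a href="/game/index/3851842">W 71-62</a></td></tr></table>')
soup = BeautifulSoup(row, "html.parser")
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + "12260")  # as returned by the new helper

link = soup.find("a", href=re.compile("game/index/"))        # the game link
href = link.find_previous("td").find_previous("td").find("a").get("href")
match = extractTeamID.match(href)
opponent_id = match.group(1) if match else 0
print opponent_id  # -> "305"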
11 changes: 8 additions & 3 deletions create_team_mappings.py
@@ -10,6 +10,7 @@
 # Import modules and libraries
 import scraperfunctions
 import scrapersettings
+import re
 from bs4 import BeautifulSoup

 if (scrapersettings.map_teams == 1):
@@ -21,12 +22,16 @@
     # Grab data
     # Download the page with the list of teams
     teamlist_data = scraperfunctions.grabber(scrapersettings.start_url, scrapersettings.params, scrapersettings.http_header) # Get data from main page
-    teamlist_data_soup = BeautifulSoup(teamlist_data) # Soupify that data
+    teamlist_data_soup = BeautifulSoup(teamlist_data,"html.parser") # Soupify that data
+
+    extractTeamID = scraperfunctions.get_regex_extractTeamID()

     # Create a mapping for teams
     for link in teamlist_data_soup.find_all('a'): # For each hyperlink on the page
-        if "team/index/" + str(scrapersettings.year_index) + "?org_id=" in link.get('href'): # If the hyperlink contains this string (limiting it only to team pages)
-            team_id = str(link.get('href').split("team/index/" + str(scrapersettings.year_index) + "?org_id=")[1]) # Get the team ID from the URL
+        linkMatch = extractTeamID.match(link.get('href')) # Test whether the hyperlink matches a team-page URL
+        if linkMatch: # If it does, parse onward
+            team_id = linkMatch.group(1) # Get the team ID from the URL
             team_name = str(link.get_text()) # Get the text associated with the hyperlink
             team_url = str(scrapersettings.domain_base + link.get('href')) # Get the URL and append the base domain
             team_mappingfile_w.writelines(str(team_id) + "\t" + str(team_name) + "\t" + str(team_url) + "\n") # Add lines to our TSV file for archival purposes
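The rewritten loop no longer builds a "team/index/<year_index>?org_id=" needle for substring tests; the one compiled pattern both recognizes a team link and captures its ID. A minimal sketch with made-up hrefs and an illustrative year index:

import re

year_index = "12260"  # illustrative
extractTeamID = re.compile(r'\/team\/([0-9]+)\/' + year_index)

for href in ["/team/305/" + year_index, "/rankings", "/team/12/" + year_index]:
    linkMatch = extractTeamID.match(href)
    if linkMatch:
        print linkMatch.group(1)  # prints "305", then "12"; the non-team link is skipped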
7 changes: 6 additions & 1 deletion scraperfunctions.py
@@ -94,4 +94,9 @@ def get_game_mappings():
     game_map = open(scrapersettings.schedule_mappingfile, "rb")
     game_map = game_map.readlines()[1:]
     game_map = dict([(var.split("\t")[0], (var.split("\t")[1], var.split("\t")[2], var.split("\t")[3], var.split("\t")[4], var.split("\t")[5].strip("\n"))) for var in game_map])
-    return(game_map)
\ No newline at end of file
+    return(game_map)
+
+def get_regex_extractTeamID():
+    return re.compile(r'\/team\/([0-9]+)\/' + str(scrapersettings.year_index))
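Centralizing the pattern in get_regex_extractTeamID() means every script compiles the same expression once per run instead of re-deriving string needles in four places. One caveat worth noting: pattern.match() anchors at the start of the string, so it assumes site-relative hrefs like "/team/305/12260" (consistent with the URLs elsewhere in this diff); an absolute URL would need search() instead. A short sketch, with an illustrative year index:

import re

pattern = re.compile(r'\/team\/([0-9]+)\/' + "12260")
print pattern.match("/team/305/12260").group(1)                        # -> "305"
print pattern.match("http://stats.ncaa.org/team/305/12260") is None    # -> True
print pattern.search("http://stats.ncaa.org/team/305/12260").group(1)  # -> "305"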
11 changes: 9 additions & 2 deletions scrapersettings.py
@@ -9,9 +9,16 @@
 ##############################################################

 # Select year for parsing
-academic_year = "2014" # Set the academic year (2012 refers to 2011-2012 season). As of writing, this can range from 2010 to 2013.
-year_index = "11540" # Set the index that maps to the academic year. This may be obtained from looking at the team URLs on the list of available teams, for the given academic year. As of writing, the [academic_year, year_index] mappings are: [2013, 11220], [2012, 10740], [2011, 10440], and [2010, 10260]
+academic_year = "2016" # Set the academic year (2016 refers to the 2015-2016 season). As of writing, this can range from 2010 to 2016.
+
+# Set the index that maps to the academic year. This can be obtained from the team URLs on the list of available teams for the given academic year. As of writing, the [academic_year, year_index] mappings are contained in this dictionary:
+yearIndexDict = {"2015":"12020","2014":"11540","2013":"11220","2012":"10740","2011":"10440","2010":"10260"}
+# You can add new academic_year/year_index mappings by copying the line below and substituting new values:
+yearIndexDict.update({"2016":"12260"})
+
+# No need to modify this line any more; just update the dictionary and the right year_index will be used based on academic_year
+year_index = yearIndexDict[academic_year]

 # What do you want to do? (Note: Lower tiers need higher tiers, i.e., ind_game_stats requires map_players (Tier 2), which requires map_teams (Tier 1).)
 map_teams = 1 # Create a team mapping (0 = no, 1 = yes) -- TIER 1
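With the dictionary in place, bumping the scraper to a new season is a one-line change, and an unmapped academic_year fails fast with a KeyError instead of silently scraping the wrong season. A small usage sketch; the 2017 index below is a placeholder, not a real value:

yearIndexDict = {"2016": "12260", "2015": "12020", "2014": "11540"}

academic_year = "2016"
year_index = yearIndexDict[academic_year]  # -> "12260"

# Adding a future season is one update() call (placeholder index, not real):
yearIndexDict.update({"2017": "99999"})
print yearIndexDict["2017"]                # -> "99999"

academic_year = "2009"
# yearIndexDict[academic_year] would raise KeyError: '2009'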