From f6781ddc8d588426199355ccbe31bf23be424522 Mon Sep 17 00:00:00 2001 From: Kevin Lane Date: Mon, 7 Oct 2024 07:34:26 -0700 Subject: [PATCH] Update references to old default branch (#624) - Updates branch references from `master` to `main` - Updates the GitHub Pages deploy action - Limit the main pipeline to run on pushes to `main` - Adds a note that the website is not updated regularly --- .github/workflows/gh-pages.yml | 16 +++++++--------- .github/workflows/main.yml | 2 +- README.md | 10 +++++----- docs/README.md | 2 +- docs/_includes/footer.html | 6 +++--- docs/_pages/covers.md | 2 +- docs/_pages/stats.md | 2 +- docs/index.md | 2 ++ notebooks/data-exploration.ipynb | 2 +- notebooks/feature-selection.ipynb | 2 +- notebooks/model-performance.ipynb | 2 +- notebooks/parameter-tuning.ipynb | 2 +- 12 files changed, 25 insertions(+), 25 deletions(-) diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index 9c25bee9..7a4e49cf 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -3,7 +3,7 @@ name: GitHub Pages on: push: branches: - - master + - main jobs: Deploy: @@ -11,18 +11,16 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2.2.2 + uses: actions/setup-python@v5 - name: Install JekyllNB run: pip install jekyllnb - name: Convert Notebooks run: jupyter jekyllnb --site-dir docs --page-dir _pages --image-dir assets/images notebooks/*.ipynb - name: Deploy to GitHub Pages - uses: JamesIves/github-pages-deploy-action@releases/v3 + uses: JamesIves/github-pages-deploy-action@v4 with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BRANCH: gh-pages - BASE_BRANCH: master - FOLDER: docs - CLEAN: true + branch: gh-pages + folder: docs + clean: true diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2325815c..84e92ad7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -3,7 +3,7 @@ name: 
Tests on: push: branches: - - '*' + - main paths-ignore: - 'docs/**' - '**.md' diff --git a/README.md b/README.md index 018c889b..96901269 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ This project combines my interest in data science with my love of sports. I atte Contents: -- [covers](https://github.com/klane/databall/tree/master/covers): Scrapy project to scrape point spreads and over/under lines from [covers.com](http://covers.com) -- [databall](https://github.com/klane/databall/tree/master/databall): Python module with support functions to perform tasks including collecting stats to a SQLite database, simulating seasons, and customizing plots -- [docs](https://github.com/klane/databall/tree/master/docs): Code required to build the GitHub Pages [site](https://klane.github.io/databall/) for this project -- [notebooks](https://github.com/klane/databall/tree/master/notebooks): Jupyter notebooks of all analyses -- [report](https://github.com/klane/databall/tree/master/report): LaTeX files for report and slides +- [covers](https://github.com/klane/databall/tree/main/databall/covers): Scrapy project to scrape point spreads and over/under lines from [covers.com](http://covers.com) +- [databall](https://github.com/klane/databall/tree/main/databall): Python module with support functions to perform tasks including collecting stats to a SQLite database, simulating seasons, and customizing plots +- [docs](https://github.com/klane/databall/tree/main/docs): Code required to build the GitHub Pages [site](https://klane.github.io/databall/) for this project +- [notebooks](https://github.com/klane/databall/tree/main/notebooks): Jupyter notebooks of all analyses +- [report](https://github.com/klane/databall/tree/main/report): LaTeX files for report and slides Link to a test database with data from 1990 - March 2020 [test nba.db file](https://drive.google.com/file/d/10CBcCLv2N_neFL39ThykcudUVUv5xqLB/view?usp=sharing) diff --git a/docs/README.md b/docs/README.md index 
0d2e1de9..9ebad6ce 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,3 +1,3 @@ # DataBall: Betting on the NBA with data -This folder contains the code required to build the GitHub Pages [site](https://klane.github.io/databall/) for this project. The site uses a slightly modified version of the [Jekyll](http://jekyllrb.com) theme [Hyde](https://github.com/poole/hyde). Several of the [pages](https://github.com/klane/databall/tree/master/docs/_pages) and all the [images](https://github.com/klane/databall/tree/master/docs/assets/images) were generated by converting the Jupyter notebooks to Markdown with [jekyllnb](https://github.com/klane/jekyllnb). +This folder contains the code required to build the GitHub Pages [site](https://klane.github.io/databall/) for this project. The site uses a slightly modified version of the [Jekyll](http://jekyllrb.com) theme [Hyde](https://github.com/poole/hyde). Several of the [pages](https://github.com/klane/databall/tree/gh-pages/_pages) and all the [images](https://github.com/klane/databall/tree/gh-pages/assets/images) were generated by converting the Jupyter notebooks to Markdown with [jekyllnb](https://github.com/klane/jekyllnb). diff --git a/docs/_includes/footer.html b/docs/_includes/footer.html index 8ca808ee..9f93b151 100644 --- a/docs/_includes/footer.html +++ b/docs/_includes/footer.html @@ -11,9 +11,9 @@ Downloads - slides | - .zip | - .tar.gz + slides | + .zip | + .tar.gz diff --git a/docs/_pages/covers.md b/docs/_pages/covers.md index 14921998..98859711 100644 --- a/docs/_pages/covers.md +++ b/docs/_pages/covers.md @@ -6,6 +6,6 @@ permalink: /data/covers/ I combined the stats with point spreads and over/under lines obtained from [covers.com](http://covers.com), which provides historical betting data going back to the 1990-91 season. 
Each team page contains season schedules like [this one](http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/nba/teams/pastresults/2016-2017/team403975.html) for the 2016-17 season of my hometown Sacramento Kings. In addition to game results, the pages include the betting lines (point spreads), over/under lines, and the results of both types of bets. The betting line results are categorized as W/L/P (win, lose, or push against the spread) and the over/under results as O/U/P (over, under, or equal to the over/under line). -I utilized the Python web scraping framework [Scrapy](https://scrapy.org/) to collect all the betting data and store it to the same database the stats were written to. The heavy lifting of the [Scrapy project](https://github.com/klane/databall/tree/master/covers) was performed by what Scrapy designates [spiders](https://doc.scrapy.org/en/latest/topics/spiders.html) and [pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html). The job of a Scrapy spider is to crawl a web page and extract the desired data into an [item](https://doc.scrapy.org/en/latest/topics/items.html) or number of items and pass them to all registered pipelines. Pipelines can perform a number of tasks ranging from data cleansing and validation to data storage, which is how I wrote betting information to the database. I only wrote data for games in which the team I was parsing was the home team. This avoids duplicating data and makes it easier to setup a machine learning problem similar to my [previous project](https://klane.github.io/databall1/) where I am concerned with predicting if the home team wins against the spread. +I utilized the Python web scraping framework [Scrapy](https://scrapy.org/) to collect all the betting data and store it to the same database the stats were written to. 
The heavy lifting of the [Scrapy project](https://github.com/klane/databall/tree/main/databall/covers) was performed by what Scrapy designates [spiders](https://doc.scrapy.org/en/latest/topics/spiders.html) and [pipelines](https://doc.scrapy.org/en/latest/topics/item-pipeline.html). The job of a Scrapy spider is to crawl a web page and extract the desired data into an [item](https://doc.scrapy.org/en/latest/topics/items.html) or number of items and pass them to all registered pipelines. Pipelines can perform a number of tasks ranging from data cleansing and validation to data storage, which is how I wrote betting information to the database. I only wrote data for games in which the team I was parsing was the home team. This avoids duplicating data and makes it easier to set up a machine learning problem similar to my [previous project](https://klane.github.io/databall1/) where I am concerned with predicting if the home team wins against the spread. Crawling the website provided a number of challenges including missing data and data entry errors. The site includes many games with missing betting data, such as two games for the [2000-01 Minnesota Timberwolves](http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/nba/teams/pastresults/2000-2001/team403995.html). Most of these instances occurred between 1995-1999, and none have happened since the 2000-01 season. These games get stored with null values for the missing data because they might have point spreads or over/under lines, just not both. Another edge case I had to account for is the rare "pick'em" game indicating the point spread is zero. However, the website displays the point spread as PK instead of 0, in which case I just replace it with a zero.
A curious error in the website lists a game between the Houston Rockets and Sacramento Kings on April 4, 1995 as being played in [Houston](http://www.covers.com/pageLoader/pageLoader.aspx?page=/data/nba/teams/pastresults/1994-1995/team403975.html), when in fact it was played in [Sacramento](https://www.basketball-reference.com/boxscores/199504040SAC.html). The last thing I had to account for is the confusing history of the [Charlotte Hornets](https://en.wikipedia.org/wiki/Charlotte_Hornets). They moved to New Orleans in 2002 and the NBA established the Charlotte Bobcats shortly after in 2004. In 2013, the Hornets rebranded as the Pelicans, which freed up the Hornets name and allowed the Bobcats to change in 2014. The NBA stats database lists old Hornets games as Charlotte, which they technically are, but covers.com lists them as New Orleans. In order to assign game IDs to the betting data to later join with the game information in the database, I had to switch the team to Charlotte in the pipeline for "New Orleans" games prior to the 2002-03 season. diff --git a/docs/_pages/stats.md b/docs/_pages/stats.md index fdd7378e..7e2c3934 100644 --- a/docs/_pages/stats.md +++ b/docs/_pages/stats.md @@ -56,7 +56,7 @@ The process for generating a database of NBA stats consists of four steps: 3. Write the stats DataFrame to the database. 4. Close the database connection. -Steps 2 and 3 can be wrapped in a loop to store stats for different seasons. I used this process to create a database with player and team stats for full seasons since the 1996-97 season and for individual games going back to the 1989-90 season. The season stats start later because the NBA stats website only includes [season stats](http://stats.nba.com/teams/traditional/) since the 1996-97 season, something I did not realize initially, but [box scores](http://stats.nba.com/teams/boxscores/) go back much further. 
All season stats moving forward are actually averaged from box scores to permit analysis of seasons prior to 1996. The code used to generate the database is located [here](https://github.com/klane/databall/blob/master/databall/database_builder.py). +Steps 2 and 3 can be wrapped in a loop to store stats for different seasons. I used this process to create a database with player and team stats for full seasons since the 1996-97 season and for individual games going back to the 1989-90 season. The season stats start later because the NBA stats website only includes [season stats](http://stats.nba.com/teams/traditional/) since the 1996-97 season, something I did not realize initially, but [box scores](http://stats.nba.com/teams/boxscores/) go back much further. All season stats moving forward are actually averaged from box scores to permit analysis of seasons prior to 1996. The code used to generate the database is located [here](https://github.com/klane/databall/blob/main/databall/database_builder.py). ## Calculating Advanced Stats diff --git a/docs/index.md b/docs/index.md index 1f50cfd1..f16dbe00 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,6 +5,8 @@ title: Home # DataBall +> **Note**: The project is undergoing significant changes and this site is not updated regularly at the moment. + Thank you for visiting my website. It explores a project that combines my interest in data science with my love of sports. The discussion that follows details the process I used to predict NBA game winners against betting lines, from acquiring data to evaluating models. The project's name was inspired by a [Grantland article](http://grantland.com/features/expected-value-possession-nba-analytics/) by Kirk Goldsberry. Several of the pages on this site are converted from [Jupyter Notebooks](http://jupyter.org/), in which case I provide a link to the original notebook hosted on [GitHub]({{ site.github.repository_url }}). 
This project is a continuation of a [previous project](https://klane.github.io/databall1/) in which I predicted NBA winners straight up using season-averaged stats. I was interested in predicting winners against the spread in a sequential manner to represent a real-life betting scenario, which is what sparked this project. Full disclosure, I do not recommend running off to Vegas next season and bet on games using the models presented here. Betting on the spread is a difficult problem to model. My first foray into machine learning in sports came in the form of a [Kaggle competition](https://www.kaggle.com/c/march-machine-learning-mania-2014), where competitors were tasked with calculating the odds one team would beat another for each potential matchup of the NCAA men's basketball tournament. Models were evaluated on the [log loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_error_function_and_logistic_regression) of their predicted probabilities for the games that actually occurred. This causes models that are incorrectly confident to be heavily penalized. Predicting all possible matchups instead of filling out a traditional bracket also allowed submissions to be easily compared against one another. It would otherwise have been difficult to determine who had the best model since filling out a perfect bracket is [near impossible](http://fivethirtyeight.com/features/the-odds-youll-fill-out-a-perfect-bracket/). This project is a natural progression of that initial work. diff --git a/notebooks/data-exploration.ipynb b/notebooks/data-exploration.ipynb index 22f807e4..729d98bc 100644 --- a/notebooks/data-exploration.ipynb +++ b/notebooks/data-exploration.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/master/notebooks/data-exploration.ipynb). 
It explores some of the data contained in or derived from the database. First we must import the necessary installed modules." + "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/main/notebooks/data-exploration.ipynb). It explores some of the data contained in or derived from the database. First we must import the necessary installed modules." ] }, { diff --git a/notebooks/feature-selection.ipynb b/notebooks/feature-selection.ipynb index 43176ab4..55131ace 100644 --- a/notebooks/feature-selection.ipynb +++ b/notebooks/feature-selection.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/master/notebooks/feature-selection.ipynb). It investigates which attributes in the database to select for further study. First we must import the necessary installed modules." + "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/main/notebooks/feature-selection.ipynb). It investigates which attributes in the database to select for further study. First we must import the necessary installed modules." ] }, { diff --git a/notebooks/model-performance.ipynb b/notebooks/model-performance.ipynb index 788e7ac9..48360d02 100644 --- a/notebooks/model-performance.ipynb +++ b/notebooks/model-performance.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/master/notebooks/model-performance.ipynb). It compares model performance using various algorithms. First we must import the necessary installed modules." + "This page was created from a Jupyter notebook. 
The original notebook can be found [here](https://github.com/klane/databall/blob/main/notebooks/model-performance.ipynb). It compares model performance using various algorithms. First we must import the necessary installed modules." ] }, { diff --git a/notebooks/parameter-tuning.ipynb b/notebooks/parameter-tuning.ipynb index d08930aa..5f9d5b8d 100644 --- a/notebooks/parameter-tuning.ipynb +++ b/notebooks/parameter-tuning.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/master/notebooks/parameter-tuning.ipynb). It investigates tuning model parameters to achieve better performance. First we must import the necessary installed modules." + "This page was created from a Jupyter notebook. The original notebook can be found [here](https://github.com/klane/databall/blob/main/notebooks/parameter-tuning.ipynb). It investigates tuning model parameters to achieve better performance. First we must import the necessary installed modules." ] }, {