-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add url checking file for gymnastics * Fix covid model
- Loading branch information
Showing
4 changed files
with
176 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
159 changes: 159 additions & 0 deletions
159
collaborations/sports-gymnastics-2024/gymnastics_score_crawl.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"example_url = \"https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/15/Stats.htm\"\n", | ||
"\n", | ||
"base_url = \"https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/\"\n", | ||
"end_url = \"/Stats.htm\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import datetime\n", | ||
"import time\n", | ||
"import os" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create a list of all dates for each year (list in list) for the years 2015-2024. Each year is a list and date as a string in yyyy/m/d format\n", | ||
"\n", | ||
"years = list(range(2000,2025))\n", | ||
"dates = []\n", | ||
"\n", | ||
"for year in years:\n", | ||
" year_dates = []\n", | ||
" for month in range(1,3):\n", | ||
" for day in range(1,32):\n", | ||
" try:\n", | ||
" date = datetime.date(year, month, day)\n", | ||
" year_dates.append(date.strftime(\"%Y/%-m/%-d\"))\n", | ||
" except ValueError:\n", | ||
" pass\n", | ||
" dates.append(year_dates)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# For each year, create a list of urls for each date in the dates list\n", | ||
"\n", | ||
"urls = []\n", | ||
"\n", | ||
"for year in dates:\n", | ||
" year_urls = []\n", | ||
" for date in year:\n", | ||
" year_urls.append(base_url + date + end_url)\n", | ||
" urls.append(year_urls)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# iterate through each url and check if url exists. If it does, add to list of valid urls. If not, pass. If there is a valid url for a year, skip to the next year.\n", | ||
"\n", | ||
"valid_urls = []\n", | ||
"\n", | ||
"for year in urls:\n", | ||
" for url in year:\n", | ||
" try:\n", | ||
" response = requests.get(url)\n", | ||
" if response.status_code == 200:\n", | ||
" valid_urls.append(url)\n", | ||
" break\n", | ||
" except:\n", | ||
" pass" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"['https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2017/1/7/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2018/1/7/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2019/1/5/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2020/1/5/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2021/1/24/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2022/1/18/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/8/Stats.htm',\n", | ||
" 'https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2024/1/10/Stats.htm']" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"valid_urls" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# export valid urls to a txt file\n", | ||
"\n", | ||
"with open(\"valid_urls.txt\", \"w\") as file:\n", | ||
" for url in valid_urls:\n", | ||
" file.write(url + \"\\n\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "base", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2017/1/7/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2018/1/7/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2019/1/5/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2020/1/5/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2021/1/24/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2022/1/18/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2023/1/8/Stats.htm | ||
https://s3.us-east-2.amazonaws.com/sidearm.nextgen.sites/uclabruins.com/documents/2024/1/10/Stats.htm |