-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathweb scraper with events.py
97 lines (81 loc) · 3.36 KB
/
web scraper with events.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 3 18:19:37 2023
@author: jespe
"""
import requests
from bs4 import BeautifulSoup
import csv
#iConferenceList = [3941, 3933, 3938, 3934, 3940, 3935, 3931, 3932, 3936, 3937, 3998,
# 4010, 4003, 4000, 3999, 4002, 4035, 4008, 4009, 4041, 4025, 4001,
# 4027, 4011, 4005, 4012, 4013, 4016, 4014, 4006, 4015, 4017, 4026,
# 3939, 4004, 4018, 3911, 4019, 4020, 4024, 4007, 4021, 4022, 4023,
# 3903]
# Numeric list ids to scrape; the scraping loop appends each one to
# 'https://www.tfrrs.org/lists/' to build the page URL.
oConferenceList = [
    4043, 4268, 4248, 4237, 4270, 4246, 4301, 4267,
    4242, 4245, 4244, 4273, 4239, 4241, 4300, 4247,
    4272, 4243, 4249, 4250, 4251, 4252, 4254, 4238,
    4253, 4274, 4258, 4294, 4240, 4255, 4256, 4269,
    4257, 4222, 4260, 4259, 4261, 4235, 4234, 4265,
    4262, 4236, 4263, 4264, 4271, 4266,
]
# Seconds to wait for a list page before giving up; without a timeout a
# stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT_SECONDS = 30


def normalize_row(cells):
    """Return a copy of one scraped table row, reordered and padded.

    The steps mirror the original script (event-type names come from the
    original author's comments; the exact page layout is not verified here):

    * A six-cell row is treated as a team-event row: cells 1, 2 and 3 are
      read as team, time and athlete names, rearranged to
      ``[cell0, athletes, "", team, time, "", "", cell4, cell5]``.
    * If cell 4 contains several whitespace-separated tokens, only the first
      is kept.
    * If cell 4 contains the letter ``m`` (a distance mark), cell 5 is
      blanked and an empty cell is inserted at index 4, moving the mark to
      index 5.
    * If cell 4 parses as an integer (a multi-event score), two empty cells
      are inserted at index 4, moving the score to index 6. Otherwise rows
      that still have 7 or 8 cells get empty cells inserted at indexes 5
      and 6.
    * A row that ends up with exactly 9 cells gets one trailing empty cell
      (the wind column).

    Args:
        cells: list of stripped cell strings for a single table row.

    Returns:
        A new list of strings; ``cells`` itself is not modified. Rows with
        fewer than five cells are returned unchanged because there is no
        cell 4 to inspect.
    """
    row = list(cells)

    # Team events: swap the athlete names forward and pad the missing cells.
    if len(row) == 6:
        place, team, result, athletes, second_last, last = row
        row = [place, athletes, "", team, result, "", "", second_last, last]

    # Every later step reads row[4]; shorter rows cannot be normalised.
    if len(row) < 5:
        return row

    # Remove unnecessary trailing information from the time/mark cell.
    tokens = row[4].split()
    if len(tokens) > 1:
        row[4] = tokens[0]

    # Distance events: blank the following cell and leave the time cell empty.
    if "m" in row[4]:
        if len(row) > 5:
            row[5] = ""
        row.insert(4, "")

    # Multi-events carry an integer score in cell 4; int() is used purely as
    # a probe, its value is not needed.
    try:
        int(row[4])
    except ValueError:
        if len(row) in (7, 8):
            row.insert(5, "")
            row.insert(6, "")
    else:
        row.insert(4, "")
        row.insert(5, "")

    # Add an empty wind cell when the row has none.
    if len(row) == 9:
        row.append("")
    return row


def scrape_list(list_id, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download one list page and return its rows ready for CSV output.

    Every ``<tr>`` on the page is visited in order. A row without ``<td>``
    cells marks the start of a new table: an empty row is emitted, followed
    by a one-cell row holding the text of the next ``<h3>`` after the
    current heading. All other rows go through ``normalize_row``.

    Args:
        list_id: id appended to ``https://www.tfrrs.org/lists/``.
        timeout: seconds passed to ``requests.get``.

    Returns:
        A list of rows (lists of strings) for this page.
    """
    url = f"https://www.tfrrs.org/lists/{list_id}/"
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    page_rows = []
    # The first <h3> is only the starting point; titles are always taken
    # from the heading that follows the current one.
    heading = soup.find('h3')
    for table_row in soup.find_all('tr'):
        cells = [cell.text.strip() for cell in table_row.find_all('td')]
        if cells:
            page_rows.append(normalize_row(cells))
            continue

        page_rows.append([])
        next_heading = heading.find_next('h3') if heading is not None else None
        if next_heading is None:
            # No further heading: keep the separator row but emit no title
            # instead of failing on a missing element.
            continue
        title = next_heading.text.strip().replace("\n", "")
        page_rows.append([" ".join(title.split())])
        heading = next_heading
    return page_rows


fullData = []
# Never filled: the code that collected team names is disabled. The name is
# kept so it still exists at module level.
teamList = []

if __name__ == "__main__":
    for list_id in oConferenceList:
        fullData.extend(scrape_list(list_id))

    with open("fullDataNoEvents.csv", 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(fullData)