-
Notifications
You must be signed in to change notification settings - Fork 0
/
Unemployment_Scraping.py
225 lines (144 loc) · 7.3 KB
/
Unemployment_Scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import requests
from bs4 import BeautifulSoup
import re
import os
import csv
import sqlite3
import json
def get_week_dates():
    """Collect week start dates from the saved national unemployment report.

    Parses ``unemployment_report.html``, takes the first ``<table>`` on the
    page and, for every selected row, reads the first entry of the
    ``headers`` attribute on that row's first ``<td>`` cell.  The entry is
    cut into month (first two characters), day (characters 3-4) and year
    (last four characters) and reassembled as ``YYYY-MM-DD``, the key format
    used by the rest of this file.

    NOTE(review): comments elsewhere in this file describe the selected
    window both as "March 2020 to May 2020" and as "March 14, 2020 to
    March 13, 2021" -- confirm against the saved report.

    Returns:
        list: One ``YYYY-MM-DD`` string per selected row, in table order.
    """
    with open("unemployment_report.html") as report:
        page = BeautifulSoup(report, "html.parser")

    table_rows = page.find("table").find_all('tr')

    week_starts = []
    # The slice keeps only the rows for the weeks the group decided to study.
    for table_row in table_rows[13:-42]:
        raw_date = table_row.find('td')['headers'][0]
        # Split the raw value into its parts so it can be reordered.
        month, day, year = raw_date[:2], raw_date[3:5], raw_date[-4:]
        week_starts.append('-'.join((year, month, day)))
    return week_starts
def get_national_initial_nsa_claims():
    """Read the weekly national initial NSA jobless claims from the saved report.

    Parses ``unemployment_report.html``, takes the first ``<table>`` on the
    page and, for every selected row, converts the text of the row's first
    ``<td>`` cell to an integer.

    NOTE(review): comments elsewhere in this file describe the selected
    window both as "March 14, 2020 to March 13, 2021" and as "March 2020 to
    May 2020" -- confirm against the saved report.

    Returns:
        list: One integer claim count per selected row, in table order.
    """
    with open("unemployment_report.html") as report:
        page = BeautifulSoup(report, "html.parser")

    table_rows = page.find("table").find_all('tr')

    # The slice keeps only the rows for the weeks the group decided to study.
    # Thousands separators are stripped so int() accepts the cell text.
    return [
        int(table_row.find('td').text.replace(',', ''))
        for table_row in table_rows[13:-42]
    ]
def get_mich_initial_nsa_claims():
    """Read the weekly Michigan initial NSA jobless claims from the saved report.

    Parses ``mich_unemployment.html``, takes the first ``<table>`` on the
    page and, for every selected row, converts the text of the row's first
    ``<td>`` cell to an integer.

    NOTE(review): comments elsewhere in this file describe the selected
    window both as "March 14, 2020 to March 13, 2021" and as "March 2020 to
    May 2020" -- confirm against the saved report.

    Returns:
        list: One integer claim count per selected row, in table order.
    """
    with open("mich_unemployment.html") as report:
        page = BeautifulSoup(report, "html.parser")

    table_rows = page.find("table").find_all('tr')

    # The slice keeps only the rows for the weeks the group decided to study.
    # Thousands separators are stripped so int() accepts the cell text.
    return [
        int(table_row.find('td').text.replace(',', ''))
        for table_row in table_rows[12:-4]
    ]
def get_national_weekly_cumulative_total():
    """Return the running total of national initial NSA claims, week by week.

    Element ``i`` of the result is the sum of the first ``i + 1`` values
    returned by ``get_national_initial_nsa_claims()``, so the result has the
    same length and order as that weekly list (an empty weekly list yields
    an empty result).

    Returns:
        list: Cumulative claim counts, one per week.
    """
    # Imported locally because the module's import block does not provide
    # itertools; accumulate replaces the hand-written running-sum loop.
    from itertools import accumulate

    return list(accumulate(get_national_initial_nsa_claims()))
def get_mich_weekly_cumulative_total():
    """Return the running total of Michigan initial NSA claims, week by week.

    Element ``i`` of the result is the sum of the first ``i + 1`` values
    returned by ``get_mich_initial_nsa_claims()``, so the result has the
    same length and order as that weekly list (an empty weekly list yields
    an empty result).

    Returns:
        list: Cumulative claim counts, one per week.
    """
    # Imported locally because the module's import block does not provide
    # itertools; accumulate replaces the hand-written running-sum loop.
    from itertools import accumulate

    return list(accumulate(get_mich_initial_nsa_claims()))
def make_national_dict():
    """Pair each week start date with that week's national initial NSA claims.

    Calls ``get_week_dates()`` and ``get_national_initial_nsa_claims()`` and
    matches their results position by position.  Pairing stops at the end of
    the shorter list.

    Returns:
        dict: Week date string -> national initial NSA claim count.
    """
    week_starts = get_week_dates()
    weekly_claims = get_national_initial_nsa_claims()
    return {
        week_start: claim_count
        for week_start, claim_count in zip(week_starts, weekly_claims)
    }
def make_mich_dict():
    """Pair each week start date with that week's Michigan initial NSA claims.

    Calls ``get_week_dates()`` and ``get_mich_initial_nsa_claims()`` and
    matches their results position by position.  Pairing stops at the end of
    the shorter list.

    Returns:
        dict: Week date string -> Michigan initial NSA claim count.
    """
    week_starts = get_week_dates()
    weekly_claims = get_mich_initial_nsa_claims()
    return {
        week_start: claim_count
        for week_start, claim_count in zip(week_starts, weekly_claims)
    }
def create_database(database_name):
    """Open (creating if needed) a SQLite database next to this source file.

    Args:
        database_name: File name of the database, appended to the directory
            that contains this module.

    Returns:
        tuple: ``(cursor, connection)`` for the opened database.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    connection = sqlite3.connect('/'.join((module_dir, database_name)))
    return (connection.cursor(), connection)
def _insert_claims(cur, loc, claims_by_week, totals_list):
    """Insert one location's weekly rows into unemployment_rates, skipping rows already stored."""
    for index, claim_num in enumerate(claims_by_week.values()):
        week_num = index + 1
        # unemployment_rates has no PRIMARY KEY / UNIQUE constraint, so
        # "INSERT OR IGNORE" can never ignore anything there.  The NOT EXISTS
        # guard keeps re-runs from duplicating (week, loc) rows without
        # changing the table schema.
        cur.execute(
            '''INSERT INTO unemployment_rates (week, loc, initial_nsa_claims, total_claims)
               SELECT ?, ?, ?, ?
               WHERE NOT EXISTS (
                   SELECT 1 FROM unemployment_rates WHERE week = ? AND loc = ?
               )''',
            (week_num, loc, claim_num, totals_list[index], week_num, loc),
        )


def fill_database(nat_unemployment_dict, mich_unemployment_dict, nat_totals_list, mich_totals_list, cur, conn):
    """Create and populate the Weeks and unemployment_rates tables.

    Weeks are numbered from 1 in the iteration order of each dictionary.
    Calling this function again with the same data leaves the tables
    unchanged: Weeks rows are keyed by ``id`` and unemployment_rates rows
    are only inserted when no row with the same ``(week, loc)`` exists.

    Args:
        nat_unemployment_dict: Week date string -> national initial NSA claims.
        mich_unemployment_dict: Week date string -> Michigan initial NSA claims.
        nat_totals_list: Cumulative national totals, indexed like the
            national dictionary's iteration order.
        mich_totals_list: Cumulative Michigan totals, indexed like the
            Michigan dictionary's iteration order.
        cur: SQLite cursor used for all statements.
        conn: SQLite connection that is committed once all rows are added.

    Raises:
        IndexError: If a totals list is shorter than its dictionary.
    """
    # creates weeks table
    cur.execute('''CREATE TABLE IF NOT EXISTS Weeks (id INTEGER PRIMARY KEY, week TEXT)''')
    # creates table of unemployment rates
    cur.execute('''CREATE TABLE IF NOT EXISTS unemployment_rates (week INTEGER, loc TEXT, initial_nsa_claims INTEGER, total_claims INTEGER)''')

    # The Weeks lookup table is driven by the national data only.
    for week_num, week_key in enumerate(nat_unemployment_dict, start=1):
        cur.execute('''INSERT OR IGNORE INTO Weeks (id, week) VALUES (?, ?)''', (week_num, week_key))

    _insert_claims(cur, "National", nat_unemployment_dict, nat_totals_list)
    _insert_claims(cur, "MI", mich_unemployment_dict, mich_totals_list)

    # Commit first so the message is only printed once the data is stored.
    conn.commit()
    print("Finished adding data")
def main():
    """Build the unemployment database from the saved HTML reports.

    Takes nothing as input and returns nothing.  Calls make_national_dict(),
    make_mich_dict(), get_national_weekly_cumulative_total() and
    get_mich_weekly_cumulative_total() to gather the data, then
    create_database() and fill_database() to store it in
    'unemployment_tables.db'.  The database connection is closed even if
    filling the tables fails.
    """
    weekly_national_unemployment_claims_dict = make_national_dict()
    weekly_mich_unemployment_claims_dict = make_mich_dict()
    cumulative_national_claims_list = get_national_weekly_cumulative_total()
    cumulative_mich_claims_list = get_mich_weekly_cumulative_total()

    cur, conn = create_database('unemployment_tables.db')
    try:
        fill_database(
            weekly_national_unemployment_claims_dict,
            weekly_mich_unemployment_claims_dict,
            cumulative_national_claims_list,
            cumulative_mich_claims_list,
            cur,
            conn,
        )
    finally:
        # Release the connection whether or not the inserts succeeded.
        conn.close()
# Run the scrape-and-store pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()