-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraping_utils.py
75 lines (66 loc) · 2.77 KB
/
scraping_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from bs4 import BeautifulSoup
import requests
import pandas as pd
import camelot
def payroll_beautiful_parser(URL):
"""Parse given URL into a BeautifulSoup Object"""
payroll_html = requests.get(URL).content
parsed_payroll_html = BeautifulSoup(payroll_html, "html.parser")
return parsed_payroll_html
def anchor_tag_URL_extraction(parsed_html_webpage, year: str):
"""Extract URL from anchor tags on WUSTL Payroll Site with PDF content"""
payroll_dates_pdf_url = ""
for anchor in parsed_html_webpage.find_all("a"):
if (
f"{year} Payroll Processing" in anchor.text
): # Filter Specifically for Year of Interest
if str(anchor.get("href")).endswith(".pdf"):
payroll_dates_pdf_url = anchor.get("href")
break
else:
continue
return payroll_dates_pdf_url
def pdf_to_dataframe_processing(pdf_file, calendar_year: int):
"""Process PDF File (.pdf) for Pay Dates"""
pdf_tables = camelot.read_pdf(pdf_file, pages="1,2") # Two pages worth of pay dates
table_one_dataframe = pdf_tables[0].df # Convert Camelot object from PDF page 1 into dataframe
table_two_dataframe = pdf_tables[1].df
table_one_dataframe = table_one_dataframe.rename(
columns=table_one_dataframe.iloc[0]
) # Set Column Header Names From 1st Row Values
table_two_dataframe = table_two_dataframe.rename(
columns=table_two_dataframe.iloc[0]
)
table_one_dataframe = (
table_one_dataframe.drop(
list(table_one_dataframe)[1:8], axis=1
) # Drop all rows from position 1 up to but not include position 8
.dropna(axis=0)
.query("`P/R` == 'MON/STP'")
.reset_index(drop=True)
)
table_two_dataframe = (
table_two_dataframe.drop(list(table_two_dataframe)[1:8], axis=1)
.dropna(axis=0)
.query("`P/R` == 'MON/STP'")
.reset_index(drop=True)
)
payroll_dataframe = pd.concat(
[table_one_dataframe, table_two_dataframe], ignore_index=True
).rename(columns={"P/R": "Payroll Type", "Pay Date": "Checks Released"})
payroll_dataframe["Checks Released"] = (
payroll_dataframe["Checks Released"] + f"/{str(calendar_year)}"
)
payroll_dataframe["Month"] = pd.to_datetime(
payroll_dataframe["Checks Released"]
).dt.strftime(
"%B"
) # Create Month Column
payroll_dataframe = payroll_dataframe[["Month", "Payroll Type", "Checks Released"]]
return payroll_dataframe
PAYROLL_URL: str = (
"https://financialservices.wustl.edu/wfin-topic/payroll/payroll-deadlines/"
)
parsed_html = payroll_beautiful_parser(PAYROLL_URL)
payroll_pdf_url = anchor_tag_URL_extraction(parsed_html, "2024")
processed_dataframe = pdf_to_dataframe_processing(payroll_pdf_url, 2024)