-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
131 lines (112 loc) · 5.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import pandas as pd
data_url = 'https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/PERM_Disclosure_Data_FY2023_Q2.xlsx'
data_url = 'https://www.dol.gov/sites/dolgov/files/ETA/oflc/pdfs/PERM_Disclosure_Data_FY2022_Q4.xlsx'
# Read the data from the URL
df = pd.read_excel(data_url)
df['DECISION_DATE_month'] = df['DECISION_DATE'].dt.to_period('M')
# Add the date decision time column
df['DECISION_TIME'] = df['DECISION_DATE'] - df['RECEIVED_DATE']
# Print general statistics about the approval rate
print(df['CASE_STATUS'].value_counts(normalize=True))
# Remove withdrawn cases
df = df[df['CASE_STATUS'] != 'Withdrawn']
# Concert the decision time to days
df['DECISION_TIME'] = df['DECISION_TIME'].dt.days
# Get mean decision time for each case status
print(df.groupby('CASE_STATUS')['DECISION_TIME'].mean())
# Get the mean decision time for all cases
print(df['DECISION_TIME'].mean())
# Get statistics about the decision time
print(df['DECISION_TIME'].describe(percentiles=[0.25, 0.5, 0.75, 0.80, 0.85, 0.9, 0.95, 0.99]))
#%%
# Plot the distribution of the decision time
import matplotlib.pyplot as plt
import seaborn as sns
# Plot histpgram of cases status per decision month
# plt.hist(df, x='DECISION_DATE_month', hue='CASE_STATUS', multiple='stack', log=True)
# plt.hist([x1, x2], bins=bins, stacked=True,
# color=colors, label=names, density=False)
df['DECISION_DATE_month'] = df['DECISION_DATE_month'].astype(str).replace('20', '', regex=True)
sns.histplot(df, x='DECISION_DATE_month', hue='CASE_STATUS', multiple='stack')
plt.xticks(fontsize=8)
plt.show()
#%%
# Keep only the first 7 character of SOC codes
df['PW_SOC_CODE'] = df['PW_SOC_CODE'].str[:7]
df['PW_SOC_CODE_ROOT'] = df['PW_SOC_CODE'].str[:2]
# Get the mean decision time for a given SOC codes
soc_codes = ['15-2031']
soc_codes_root = ['15']
df_soc = df[df['PW_SOC_CODE'].isin(soc_codes)]
df_soc_root = df[df['PW_SOC_CODE_ROOT'].isin(soc_codes_root)]
print('Statistics for SOC code: ', soc_codes)
print(df_soc['CASE_STATUS'].value_counts(normalize=True))
print(df_soc.groupby('CASE_STATUS')['DECISION_TIME'].mean())
print(df_soc['DECISION_TIME'].describe(percentiles=[0.25, 0.5, 0.75, 0.80, 0.85, 0.9, 0.95, 0.99]))
#%%
# Get the mean decision time for a given law firm
law_firms = 'fragomen'
df_law = df[~df['AGENT_ATTORNEY_FIRM_NAME'].isna()]
df_law['AGENT_ATTORNEY_FIRM_NAME'] = df_law['AGENT_ATTORNEY_FIRM_NAME'].astype(str)
df_law['AGENT_ATTORNEY_FIRM_NAME'] = df_law['AGENT_ATTORNEY_FIRM_NAME'].str.lower()
df_law = df_law[df_law['AGENT_ATTORNEY_FIRM_NAME'].str.contains(law_firms)]
print(f'Statistics for law firm: {law_firms}')
print(df_law['CASE_STATUS'].value_counts(normalize=True))
print(df_law['DECISION_TIME'].mean())
print(df_law['DECISION_TIME'].describe(percentiles=[0.25, 0.5, 0.75, 0.80, 0.85, 0.9, 0.95, 0.99]))
#%%
# Get the approval rate by waiting time
priority_date = pd.to_datetime('2022-05-19')
# get the number of days since the priority date and now
days_pending = (pd.to_datetime('today') - priority_date).days
df_pending = df[(df['DECISION_TIME'] >= days_pending - 5) & (df['DECISION_TIME'] <= days_pending + 5)]
print(df_pending['CASE_STATUS'].value_counts(normalize=True))
# Get decile of the decision time
print('Decile of the decision time in total: ')
print(df['DECISION_TIME'].lt(days_pending).sum()/df.shape[0])
print(f'Decile of the decision time for soc {soc_codes[0]}: ')
print(df_soc['DECISION_TIME'].lt(days_pending).sum()/df_soc.shape[0])
print(f'Decile of the decision time for lawyer firm {law_firms}: ')
print(df_law['DECISION_TIME'].lt(days_pending).sum()/df_law.shape[0])
#%%
# Denial analysis
df_denial = df[df['CASE_STATUS'] == 'Denied']
df_denial['PW_SOC_TITLE'] = df_denial['PW_SOC_TITLE'].str.lower()
df_denial['EMPLOYER_NAME'] = df_denial['EMPLOYER_NAME'].str.lower()
print('Number of denied cases: ')
print(df_denial.shape[0])
# Top SOC TITLE for denied cases
print(df_denial['PW_SOC_TITLE'].value_counts().head(20))
print(df_denial['PW_SOC_TITLE'].value_counts().tail(20))
# Top Employer count for denied cases
print(df_denial['EMPLOYER_NAME'].value_counts().head(10))
# Top education level for denied cases
print(df_denial['MINIMUM_EDUCATION'].value_counts().head(10))
# Top field of studies for denied cases
print(df_denial['MAJOR_FIELD_OF_STUDY'].value_counts().head(10))
# Top PW skill levels for denied cases
print(df_denial['PW_SKILL_LEVEL'].value_counts(normalize=True))
print(df['PW_SKILL_LEVEL'].value_counts(normalize=True))
# Foreign worker education level
print(df_denial.iloc[:, 130].value_counts(normalize=True))
#%%
# Build some visualizations
# Denial rate by decision date
# certified rate by decision date month
df_hist = df[['CASE_STATUS', 'DECISION_DATE_month']].groupby('DECISION_DATE_month', as_index=False).value_counts(normalize=True)
print(df_hist)
df_hist_law = df_law[['CASE_STATUS', 'DECISION_DATE_month']].groupby('DECISION_DATE_month', as_index=False).value_counts(normalize=True)
print(df_hist_law)
df_hist_soc_root = df_soc_root[['CASE_STATUS', 'DECISION_DATE_month', 'PW_SKILL_LEVEL']].groupby(['DECISION_DATE_month', 'PW_SKILL_LEVEL'], as_index=False).value_counts(normalize=True)
print(df_hist_soc_root)
def get_certified_rate(df):
df = pd.DataFrame(df)
try:
result = df.value_counts(normalize=True)[1]
except:
result = 0
return result
df_sorted = df.sort_values('DECISION_TIME')
df_sorted['CERTIFIED_INT'] = df_sorted['CASE_STATUS'].map({'Certified': 1, 'Denied': 0})
window = 10
df_sorted['CERTIFIED_RATE'] = df_sorted.rolling(window=window, on='DECISION_TIME', axis=0)['CERTIFIED_INT'].apply(get_certified_rate)