-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsource.R
119 lines (103 loc) · 2.44 KB
/
source.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
library(tidyverse)
library(readxl)
library(stringr)
library(sets)
library(dplyr)
# Problem a: load the first semester-dummy file.
# The first line of the CSV is skipped, so the header is taken from the
# second line; the four key columns are then coerced to numeric.
semester_df1 <- mutate(
  read.csv(
    file = "raw_data/semester_dummy/semester_data_1.csv",
    skip = 1,
    stringsAsFactors = FALSE,
    fileEncoding = "UTF-8"
  ),
  across(c(unitid, semester, quarter, year), as.numeric)
)
# Load the second semester-dummy file. The first line of the file is still
# treated as a header (the read.csv default); the resulting column names
# are replaced by the six names supplied in col.names.
semester_df2 <- read.csv(
  file = "raw_data/semester_dummy/semester_data_2.csv",
  col.names = c(
    "unitid", "instnm", "semester", "quarter", "year", "Y"
  ),
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8"
)
# Stack the two semester tables (columns are matched by name) and drop
# the column named "Y" from the combined result.
semester_dummy_tidy <- select(
  bind_rows(semester_df1, semester_df2),
  -"Y"
)
# Problem b: read every outcome workbook and stack them into one table.
file_name_list <- list.files(
  path = "raw_data/outcome",
  pattern = "\\.xlsx$",
  full.names = TRUE
)

# Each workbook is read and rescaled on its own, then all pieces are bound
# in a single bind_rows() call instead of growing the table inside a loop.
# The file list is reversed so the row order matches the previous
# loop-based result, which prepended each newly read file.
# women_gradrate_4yr is multiplied by 0.01 -- presumably converting a
# percentage into a proportion; confirm against the raw workbooks.
# With no matching files the result is an empty tibble rather than NULL.
gradrate_tidy <- file_name_list |>
  rev() |>
  lapply(\(file_name) {
    readxl::read_excel(file_name) |>
      mutate(women_gradrate_4yr = 0.01 * women_gradrate_4yr)
  }) |>
  bind_rows()
# Problem c: read the covariates workbook, rename the id column to
# `unitid`, remove the first "aaaa" occurrence from each id, and spread
# the category/value pairs into one column per category.
covariates_tidy <- "raw_data/covariates/covariates.xlsx" |>
  readxl::read_excel() |>
  rename(unitid = university_id) |>
  mutate(unitid = str_remove(unitid, pattern = "aaaa")) |>
  pivot_wider(
    values_from = "value",
    names_from = "category"
  )
# Problem d: derive the 4-year graduation rate for men and for all
# students, rounded to four decimals, and drop rows whose total rate is
# missing.
#
# m_4yrgrads and totcohortsize are passed through as.numeric() before the
# division. Both rates are computed and rounded in one mutate(): the
# previous version piped into a second mutate() that also received
# `gradrate_tidy` as a stray positional argument, which dplyr splices in
# as columns and which could overwrite a freshly computed rate if the raw
# table ever carried a column of the same name.
gradrate_ready <- gradrate_tidy |>
  mutate(
    men_gradrate_4yr = round(as.numeric(m_4yrgrads) / m_cohortsize, 4),
    tot_gradrate_4yr = round(tot4yrgrads / as.numeric(totcohortsize), 4)
  ) |>
  drop_na(tot_gradrate_4yr)
# Problem e: collect every year that appears in the semester table or in
# the graduation-rate table.
# The two vectors are concatenated before de-duplication because the
# second positional argument of unique() is `incomparables`, not another
# vector to merge; passing the graduation-rate years there left them out
# of the result.
# NOTE(review): this assumes the union of years is intended -- confirm
# whether only years common to both tables should be kept instead.
years_set <- unique(
  c(semester_dummy_tidy$year, gradrate_ready$year)
)
# Distinct institution ids found in the semester table.
unitid_set <- unique(semester_dummy_tidy$unitid)

# Keep only covariate rows whose year is in `years_set` and whose
# institution id is in `unitid_set`. Each key is converted to numeric
# immediately before it is compared.
covariates_ready <- covariates_tidy |>
  mutate(year = as.numeric(year)) |>
  dplyr::filter(year %in% years_set) |>
  mutate(unitid = as.numeric(unitid)) |>
  dplyr::filter(unitid %in% unitid_set)
# Problem f: build the master table. Starting from the semester table,
# attach the covariates and then the graduation rates, matching on year
# and unitid each time (left joins, so every semester row is kept).
# white_rate is white_cohortsize / totcohortsize, rounded to 4 decimals.
master <- semester_dummy_tidy |>
  left_join(covariates_ready, by = c("year", "unitid")) |>
  left_join(gradrate_ready, by = c("year", "unitid")) |>
  mutate(
    white_rate = round(
      as.numeric(white_cohortsize) / as.numeric(totcohortsize),
      4
    )
  )