forked from csaluski/interpretable_school_policy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
DCI pre-processing.rmd
343 lines (285 loc) · 12.6 KB
/
DCI pre-processing.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
---
title: "Interpretable Analysis of School Policy Decisions, Data Pre-processing"
author: "Charles Saluski"
# date: "1/4/2022"
output: pdf_document
---
```{r}
library(data.table)
library(openxlsx)
library(dplyr)
library(stringr)
library(purrr)
```
Process yearly NCES tables, removing non-universal columns
```{r}
nces.base.location <- "Data Sources/NCES Data - District-Building Characteristics/NCES annual data/"
nces.file.vec <- list.files(nces.base.location)
# not all data sets have same columns, so this method of renaming doesn't work
nces.cols <- c(
"School Name",
"State Name [Public School] Latest available year",
"Agency ID - NCES Assigned [Public School] Latest available year",
"School ID - NCES Assigned [Public School] Latest available year",
"School Name [Public School]",
"School Type [Public School]",
"Charter School [Public School]",
# "Urban-centric Locale [Public School]",
"School-wide Title I [Public School]",
"Title I Eligible School [Public School]",
"State School ID [Public School]",
"National School Lunch Program [Public School]",
"Lowest Grade Offered [Public School]",
"Highest Grade Offered [Public School]",
"Total Students, All Grades (Excludes AE) [Public School]",
"Total Students, All Grades (Includes AE) [Public School]",
"Free Lunch Eligible [Public School]",
# "Direct Certification [Public School]",
"Reduced-price Lunch Eligible Students [Public School]",
"Free and Reduced Lunch Students [Public School]",
"Full-Time Equivalent (FTE) Teachers [Public School]",
"Pupil/Teacher Ratio [Public School]"
)
getYearVec <- function(file.vec) {
temp.list <- list()
for (file in file.vec) {
temp.list[[file]] <- str_split(file, "_")[[1]][5]
}
as.numeric(temp.list)
}
combine_nces_files <- function(file.vec, base.location) {
temp.dt.list <- list()
nces.year.vec <- getYearVec(file.vec)
for (file.index in 1:length(file.vec)) {
file.loc <- paste(base.location, file.vec[[file.index]], sep = "")
curr.dt <- as.data.table(read.xlsx(file.loc, startRow = 7))
curr.names <- names(curr.dt)
# not all tables have locale, and we can assume that it does not change
# over time so we merge it from other data.
curr.locale.index <- (grepl("Locale", curr.names))
curr.has.locale <- (TRUE %in% curr.locale.index)
if (curr.has.locale) {
curr.dt <- curr.dt[, !which(curr.locale.index), with = FALSE]
}
# this column does not exist in all data sets, also not sure what it means.
curr.certification.index <- (grepl("Certification", curr.names))
curr.has.certification <- (TRUE %in% curr.certification.index)
if (curr.has.certification) {
curr.dt <- curr.dt[, !which(curr.certification.index), with = FALSE]
}
# print(names(curr.dt))
setnames(curr.dt, old = names(curr.dt), new = nces.cols)
curr.dt[, year := nces.year.vec[[file.index]]]
# replace expanded district codes with short district codes used in other
# data sets
districts <- curr.dt[["State School ID [Public School]"]]
# this creates some "MO-NA" entries, which is not ideal, but they will be
# dropped later
districts <- paste("MO-", str_extract(districts, "\\d{6}"), sep = "")
curr.dt[, "State School ID [Public School]" := districts]
temp.dt.list[[file.index]] <- curr.dt
}
temp.dt.list
}
nces.yearly.dt.list <- combine_nces_files(nces.file.vec, nces.base.location)
nces.yearly.dt <- do.call(rbind, nces.yearly.dt.list)
setnames(nces.yearly.dt, "State School ID [Public School]", "State.District.ID")
setnames(nces.yearly.dt, "School ID - NCES Assigned [Public School] Latest available year", "NCES.School.ID")
nces.dt <- data.table(read.xlsx("Data Sources/NCES Data - District-Building Characteristics/ncesdata_ECCDA30A NO HEADER.xlsx"))[, .(NCES.School.ID, get("Locale*"))]
setnames(nces.dt, "V2", "Locale")
nces.yearly.dt <- nces.yearly.dt[nces.dt, on="NCES.School.ID"]
setnames(
nces.yearly.dt,
old = c("Free Lunch Eligible [Public School]", "Reduced-price Lunch Eligible Students [Public School]", "Total Students, All Grades (Includes AE) [Public School]"),
new = c("Free.Lunch.Eligible", "Reduced.Lunch.Eligible", "Total.Students")
)
nces.yearly.dt[, Free.Reduced.Lunch.Rate := (as.numeric(get("Free and Reduced Lunch Students [Public School]"))) / as.numeric(Total.Students)]
nces.yearly.dt[is.na(Free.Reduced.Lunch.Rate), Free.Reduced.Lunch.Rate := 0]
nces.yearly.dt[Free.Reduced.Lunch.Rate == Inf, Free.Reduced.Lunch.Rate := 100]
no.rename.cols <- c("State.District.ID", "year")
for (col in names(nces.yearly.dt)) {
if (!col %in% no.rename.cols) {
prefix.name <- paste("NCES", col, sep="_")
setnames(nces.yearly.dt, col, prefix.name)
}
}
```
Load implementation data from Melvin's work, RQ_IC file aggregates CWIS survey results (implementation checklist)
```{r}
ic.file <- "Data Sources/Data Research Questions/RQ_IC.csv.xlsx"
ic.dt <- as.data.table(read.xlsx(ic.file))
ic.district.cols <- unique(ic.dt[["District.Code"]])
valid.ic.rows <- grepl("MO", ic.district.cols)
ic.district.cols <- ic.district.cols[valid.ic.rows]
setnames(ic.dt, "currentSchoolYear", "year")
setnames(ic.dt, "District.Code", "State.District.ID")
# set the columns to the correct names
ic.col.file <- "./Data Sources/DCI Data/Implementation Checklist/Cross_Walk.xlsx"
ic.col.dt <- as.data.table(read.xlsx(ic.col.file, colNames = FALSE, cols=2:3))
setnames(ic.col.dt, colnames(ic.col.dt), c("name", "description"))
for (index in 1:nrow(ic.col.dt)) {
ic.col.name <- ic.col.dt[index]$name
ic.col.desc <- str_trunc(ic.col.dt[index]$description, 50)
setnames(ic.dt, ic.col.name, ic.col.desc)
}
for (col in names(ic.dt)) {
if (!col %in% no.rename.cols) {
prefix.name <- paste("IC", col, sep="_")
setnames(ic.dt, col, prefix.name)
}
}
cwis.avg.file <- "Data Sources/Data Research Questions/CWIS_avg_per_domain_per_district_2017_untill_2021_revised_05.20.22.csv"
cwis.avg.dt <- as.data.table(fread(cwis.avg.file))
# cwis.avg.dt <- cwis.avg.dt[ic.district.cols, on = .(State.District.ID)]
cwis.district.cols <- cwis.avg.dt[["State.District.ID"]]
time_divide_cwis <- function(val) {
time.section <- 0
if (val >= 201608 && val < 201707) {
time.section <- 2016
} else if (val >= 201708 && val < 201807) {
time.section <- 2017
} else if (val >= 201808 && val < 201907) {
time.section <- 2018
} else if (val >= 201908 && val < 202007) {
time.section <- 2019
} else if (val >= 202008 && val < 202107) {
time.section <- 2020
}
time.section
}
cwis.avg.dt <- cwis.avg.dt[complete.cases(cwis.avg.dt[, ])]
cwis.avg.dt <- data.table(
cwis.avg.dt,
year = sapply(cwis.avg.dt$session, time_divide_cwis)
)
# Keep only the latest CWIS survey that was completed each year
cwis.avg.dt[, keep := seq_along(sort(-session)), by = c("State.District.ID", "year")]
cwis.avg.dt <- cwis.avg.dt[keep == 1]
cwis.avg.dt$keep <- NULL
for (col in names(cwis.avg.dt)) {
if (!col %in% no.rename.cols) {
prefix.name <- paste("CWIS", col, sep="_")
setnames(cwis.avg.dt, col, prefix.name)
}
}
```
```{r}
nces.computed.cols <- list()
# we will probably want this wide data
# By observation, there is a single locale per district ID,
# so we can not use one-hot encoding for our ML models.
# We may want to see if this can be broken down to a boolean variable,
# city vs rural
nces.computed.cols[["locale.cols"]] <- dcast(
nces.yearly.dt,
State.District.ID ~ NCES_Locale,
fun.aggregate = length
)
locale.dt <- nces.computed.cols$locale.cols
locale.prop.dt <- data.table(
State.District.ID = locale.dt$State.District.ID
)
locale.prop.list <- list()
# Rename columns indicating origin,
for (col in names(nces.computed.cols$locale.cols)) {
if (!col %in% no.rename.cols) {
locale.col.prop.name <- paste("NCES", "Locale", "prop", col, sep="_")
prop.vec <- locale.dt[[paste(col)]]/rowSums(locale.dt[, !c("State.District.ID")])
temp.prop.dt <- data.table(
State.District.ID = locale.prop.dt$State.District.ID,
prop.vec
)
setnames(temp.prop.dt, "prop.vec", locale.col.prop.name)
locale.prop.dt <- locale.prop.dt[temp.prop.dt, on = c("State.District.ID")]
prefix.name <- paste("NCES", "Locale", col, sep="_")
setnames(nces.computed.cols$locale.cols, col, prefix.name)
}
}
locale.dt <- data.table(locale.prop.list)
# recover the locale categories that were lost in earlier processing
nces.yearly.dt <- nces.yearly.dt[nces.computed.cols[["locale.cols"]],
on = .(State.District.ID)
]
# for every numeric column, compute the summary statistics,
# grouped by State.District.ID
numeric.col.ops <- c("min", "max", "mean", "sd", "median")
nces.yearly.numeric.cols <- c(
"Total Students, All Grades (Excludes AE) [Public School]",
"Full-Time Equivalent (FTE) Teachers [Public School]",
"Pupil/Teacher Ratio [Public School]",
"Free.Lunch.Eligible",
"Reduced.Lunch.Eligible",
"Total.Students",
"Free and Reduced Lunch Students [Public School]",
"Free.Reduced.Lunch.Rate"
)
nces.renamed.cols <- c()
for (col in nces.yearly.numeric.cols) {
nces.renamed.cols <- c(nces.renamed.cols, paste("NCES", col, sep="_"))
}
aggregate_dt_columns <- function(dt, which, what) {
what.computed.cols <- list()
for (col in which) {
dt[[col]] <- as.numeric(dt[[col]])
for (op in what) {
op.call <- function(x) {
fun <- get(op)
fun(x)
}
col.name <- paste("Calculated_", col, ".", op, sep = "")
# Should we ignore NA here?
what.computed.cols[[col.name]] <-
dt[, op.call(na.omit(get(col))), by = list(State.District.ID, year)]
setnames(what.computed.cols[[col.name]], "V1", col.name)
}
}
what.computed.cols %>% reduce(inner_join, by = c("State.District.ID", "year"))
}
nces.yearly.computed.dt <- aggregate_dt_columns(nces.yearly.dt, nces.renamed.cols, numeric.col.ops)
nces.yearly.computed.dt <- nces.yearly.computed.dt[nces.computed.cols[["locale.cols"]], on = .(State.District.ID)]
# nces.computed.dt <- {
# nces.computed.cols %>% reduce(inner_join, by = "State.District.ID")
# }
# this sets the NA values from the SD of a single number to 0
nces.yearly.computed.dt[is.na(nces.yearly.computed.dt)] <- 0
for (col_num in 1:ncol(nces.yearly.computed.dt)) set(nces.yearly.computed.dt, which(is.infinite(nces.yearly.computed.dt[[col_num]])), col_num, 0)
```
Merges we are interested in, IC + CWIS + NCES, CWIS + NCES. IC does not have as many rows as CWIS so both merges are interesting to consider.
```{r}
# subset nces.dt and nces.yearly.dt to rows in DCI program,
# i.e. those who provide integration checklists.
# nces.dt <- nces.dt[ic.district.cols, on = .(State.District.ID)]
# nces.yearly.dt <- nces.yearly.dt[ic.district.cols, on = .(State.District.ID)]
cwis.nces.computed.combined.dt <- cwis.avg.dt[nces.yearly.computed.dt, on = .(State.District.ID, year), nomatch = NULL]
ic.cwis.nces.computed.combined.dt <- ic.dt[cwis.nces.computed.combined.dt, on=.(State.District.ID, year), nomatch=NULL]
write.csv(nces.yearly.computed.dt, "./Data Sources CSV/nces.yearly.computed.csv")
write.csv(cwis.nces.computed.combined.dt, "./Data Sources CSV/cwis.nces.computed.combined.csv")
write.csv(ic.cwis.nces.computed.combined.dt, "./Data Sources CSV/ic.cwis.nces.computed.combined.csv")
```
Identifying rows that were lost in joins because of the nomatch=NULL parameter.
```{r}
str(unique(cwis.avg.dt[, .(State.District.ID, year)]))
str(unique(cwis.nces.computed.combined.dt[, .(State.District.ID, year)]))
```
We see that we lost 5 combinations of district ID and year, so which are they?
```{r}
cwis.unjoined.unique <- unique(cwis.avg.dt[, .(State.District.ID, year)])
cwis.joined.unique <- unique(cwis.nces.computed.combined.dt[, .(State.District.ID, year)])
cwis.unjoined.unique[!cwis.joined.unique, on=.(State.District.ID, year)]
```
We find that MO-118118 does not exist in the NCES data, but it does in the CWIS data for all 5 years that have data.
```{r}
str(unique(ic.dt[, .(State.District.ID, year)]))
str(unique(ic.cwis.nces.computed.combined.dt[, .(State.District.ID, year)]))
```
We also see that we lost 27 rows on this join, however this is expected as we know the IC dataset is smaller.
We may come back to this code
Coaching aggregation is now performed in `./choaching_aggeregation.rmd`.
```{r}
# interested.coach.cols <- c("State.District.ID", "Date.of.Event/Visit", "Interaction.Type", "Consultants")
# interested.cwis.cols <- c("State.District.ID", "ETL.AVERAGE")
# interested.coach.data <- coach.data[,..interested.coach.cols]
# interested.cwis.data <- cwis.data[,..interested.cwis.cols]
# interested.coach.data[, "Date.of.Event/Visit" := convertToDate(sapply(coach.data[, "Date.of.Event/Visit"], as.numeric))]
# joined.data <- nces.data[cwis.data, nomatch=0, on="State.District.ID"]
```