---
title: "Interpretable Analysis of School Policy Decisions, DCI Involvement Scores"
author: "Charles Saluski"
# date: "8/4/2022"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
```
We believe that the immersion level of a district may have an impact on the ETLP results of teachers within that district; however, this data is new as of 2022. To this end, we want to classify previous coaching years into immersion levels based on their coaching log data.

There are many approaches we could take to calculating how involved a district is, taking into account the number of visits, the duration of visits, and the consistency of visits, as well as other possible metrics, such as the number of visits per teacher, to account for school district size.

We check our scores against the 2022 IC self-reported immersion levels as a sanity check. These are factored into full immersion, partial immersion, and self-guided immersion. This can clearly be treated as a classification problem, but regression methods may also prove valuable. The levels may need adjustment, but within a regression model we could encode them as 1, 0.5, and 0.
```{r}
library(data.table)
library(mlr3)
library(openxlsx)
library(mlr3extralearners)
library(mlr3learners)
```
We create a data set using the product of every combination of variables, then optimize learners to find the best fit to our provided data set.
```{r}
year.month.to.school.year <- function(year.month.str) {
  # We expect the input to be in "%Y-%m" format, so we can use exact
  # substring extraction
  year <- as.numeric(substr(year.month.str, 1, 4))
  month <- as.numeric(substr(year.month.str, 6, 7))
  # July onward belongs to the school year that ends the following June
  if (month > 6) {
    year <- year + 1
  }
  year
}
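# e.g. year.month.to.school.year("2021-09") gives 2022, since September 2021
# falls in the school year ending June 2022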
cl.2022.loc <- "./Data Sources/DCI Data/Coaching Logs/CST Reporting Form 2021-22 (Responses)_08.04.22.xlsx"
cl.2022.dt <- data.table(read.xlsx(cl.2022.loc, sheet = "Condensed_df"))
setnames(cl.2022.dt, "Date.of.Event/Visit", "Date")
cl.2022.dt <- cl.2022.dt[, .(Date, Event.Duration, State.District.ID)]
# Excel stores dates and times in decimal format: date-times are days since
# the 1900 epoch, while times are just the fractional part of a day.
cl.2022.dt[, Date := convertToDate(Date)]
cl.2022.dt[, year.month := format(Date, "%Y-%m")]
# cl.2022.dt[, year := sapply(year.month, year.month.to.school.year)]
# Here we look at all reported coaching from this log, as it is considered the
# log for the 21-22 school year. In applying this model, we will subset by year
# cl.2022.dt <- cl.2022.dt[year == 2022]
cl.2022.dt[, Event.Duration := Event.Duration * 24]
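# e.g. a stored value of 0.0625 (one sixteenth of a day) becomes 1.5 hours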
# Correct special error cases that Melvin highlighted
cl.2022.dt[Event.Duration == 9.5, Event.Duration := 7.5]
cl.2022.dt[Event.Duration == 30, Event.Duration := 0.5]
cl.2022.dt[Event.Duration == 45, Event.Duration := 0.75]
cl.2022.dt[Event.Duration == 9.0, Event.Duration := 5.0]
cl.2022.dt[Event.Duration == 8.5, Event.Duration := 5.5]
nces.loc <- "./Data Sources/NCES Data - District-Building Characteristics/ncesdata_ECCDA30A NO HEADER.xlsx"
nces.dt <- data.table(read.xlsx(nces.loc))
setnames(nces.dt, "Teachers*", "Teachers")
# Some teacher counts are NA where data was not reported or is missing.
# Remove them so that they do not poison the merged data.
nces.dt[, Teachers := as.numeric(Teachers)]
nces.dt <- nces.dt[!is.na(Teachers)]
# If a teacher is represented in multiple districts, this sum could
# overcount
nces.dt[, total.teachers := sum(Teachers), by = State.District.ID]
nces.dt <- unique(nces.dt[, .(State.District.ID, total.teachers)])
cl.2022.dt <- cl.2022.dt[nces.dt, nomatch = NULL, on=c("State.District.ID")]
cl.2022.dt[, total.duration := sum(Event.Duration), by=State.District.ID]
cl.2022.dt[, teacher.inverse := 1/total.teachers]
cl.2022.dt[, visit.n := .N, by = State.District.ID]
cl.2022.dt[, n.months.visited := length(unique(year.month)), by = State.District.ID]
# We are not interested in individual interactions, so we filter down to the
# unique aggregate statistics per district
cl.2022.dt <- unique(cl.2022.dt[
, !c("Date", "Event.Duration", "year.month")
])
# We want a column for the product of every combination of variables, named
# after the variables that make up that product.
# Wrap colnames in c() to get a plain copy of the character vector instead
# of a reference to the data table's own names; otherwise the vector would
# grow as we add columns, and combn would start creating columns of nth
# powers of variables.
table.cols <- c(colnames(cl.2022.dt)[
  !colnames(cl.2022.dt) %in% c("State.District.ID", "year")
])
for (n in seq_along(table.cols)) {
  combn(table.cols, n, simplify = FALSE, FUN = function(selected.vars) {
    new.col.name <- paste(selected.vars, collapse = "_")
    # Build the text of the multiplication expression
    new.col.op <- paste(selected.vars, collapse = "*")
    # Parse the text into an expression and evaluate it in the
    # environment of the data table
    res <- with(cl.2022.dt, eval(parse(text = new.col.op)))
    cl.2022.dt[, (new.col.name) := res]
  })
}
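# e.g. the combination of total.duration and visit.n yields a column named
# "total.duration_visit.n" holding their product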
ic.2022.loc <- "./Data Sources/IC_schoolyear_2021_2022.xlsx"
ic.2022.dt <- data.table(read.xlsx(ic.2022.loc))
ic.2022.immersion <- ic.2022.dt[, .(District.ID, Immersion)]
setnames(ic.2022.immersion, "District.ID", "State.District.ID")
# Match the capitalization variants present in the raw data
full.immersion <- c(
  "Full Immersion with in-person coaching",
  "Full immersion with virtual coaching",
  "Full immersion with in-person coaching"
)
partial.immersion <- c("Partial immersion", "Partial Immersion")
# Convert to 3 immersion level factors, treating in-person and virtual
# coaching as the same level
cl.2022.dt <- cl.2022.dt[ic.2022.immersion, on = "State.District.ID"]
cl.2022.dt[, Immersion := ifelse(
  Immersion %in% full.immersion, "Full",
  ifelse(Immersion %in% partial.immersion, "Partial", "Minimal")
)]
cl.2022.dt[, Immersion := as.factor(Immersion)]
cl.2022.dt[, immersion.score := ifelse(
  Immersion == "Full", 1,
  ifelse(Immersion == "Partial", 0.5, 0)
)]
# A couple of districts in the immersion level set are not present in the
# other sets, and there is an "n/a" district; these are removed.
cl.2022.dt <- cl.2022.dt[complete.cases(cl.2022.dt)]
predict.cols <- c(colnames(cl.2022.dt)[
!colnames(cl.2022.dt)
%in% c("Date", "State.District.ID", "year.month", "year", "Immersion")])
predict.classif.cols <- c(colnames(cl.2022.dt)[
!colnames(cl.2022.dt)
%in% c("Date", "State.District.ID", "year.month", "year", "immersion.score")])
```
```{r}
set.seed(123)
num.folds <- 3
regr.task <- TaskRegr$new(
id = "Immersion score",
backend = cl.2022.dt[, ..predict.cols],
target = "immersion.score"
)
classif.backend <- cl.2022.dt[, ..predict.classif.cols]
write.csv(classif.backend, "./Data Sources CSV/immersion.classification.csv")
classif.task <- TaskClassif$new(
id = "Immersion Factor",
backend = classif.backend,
target = "Immersion"
)
regr.learner.list <- list()
regr.learner.list[["regr.featureless"]] <- LearnerRegrFeatureless$new()
regr.learner.list[["regr.ctree"]] <- LearnerRegrCTree$new()
# cv_glmnet effectively returns 2 models, one at lambda.1se and one at
# lambda.min
regr.learner.list[["regr.cv_glmnet"]] <- LearnerRegrCVGlmnet$new()
classif.learner.list <- list()
classif.learner.list[["classif.ctree"]] <- LearnerClassifCTree$new()
# Should this be interpretable or not? Lower depth is easier to understand, but
# might underfit. Is the objective to create a metric that can be understood
# and applied by people, or to create a metric that is most accurate?
classif.learner.list[["classif.ctree"]]$param_set$values <- list(maxdepth = 8)
# CVGlmnet has issues with small sample sizes, so we're not including it in
# this analysis
# classif.learner.list[["classif.cv_glmnet"]] <- LearnerClassifCVGlmnet$new()
classif.learner.list[["classif.cforest"]] <- LearnerClassifCForest$new()
classif.learner.list[["classif.xgboost"]] <- LearnerClassifXgboost$new()
classif.learner.list[["classif.featureless"]] <- LearnerClassifFeatureless$new()
resampling <- rsmp("cv", folds = num.folds)
regr.benchmark.obj <- benchmark_grid(
task = list(regr.task),
learners = regr.learner.list,
resamplings = list(resampling)
)
classif.benchmark.obj <- benchmark_grid(
task = list(classif.task),
learners = classif.learner.list,
resamplings = list(resampling)
)
regr.benchmark.res <- benchmark(regr.benchmark.obj, store_models = TRUE)
classif.benchmark.res <- benchmark(classif.benchmark.obj, store_models = TRUE)
regr.measure <- msr("regr.mse")
classif.measure <- msr("classif.ce")
regr.result.dt <- regr.benchmark.res$score(regr.measure)
classif.result.dt <- classif.benchmark.res$score(classif.measure)
classif.result.dt[, .(learner_id, iteration, classif.ce)]
```
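The per-fold scores above show the variance between folds; averaging each measure across folds gives a compact per-learner summary. A minimal addition using mlr3's `aggregate()`:
```{r}
regr.benchmark.res$aggregate(regr.measure)
classif.benchmark.res$aggregate(classif.measure)
```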
```{r}
library(ggplot2)
method.levels <- classif.result.dt[, .(mean = mean(classif.ce)), by = learner_id][order(-mean), learner_id]
classif.result.dt[, Method := factor(learner_id, method.levels)]
err.plot <- ggplot() +
geom_point(data = classif.result.dt, aes(x = classif.ce, y = Method)) +
facet_grid(task_id ~ .) +
xlab("Classification Error")
if (!dir.exists("./img_out/immersion_score/")) {
dir.create("./img_out/immersion_score/")
}
png(filename = "./img_out/immersion_score/classif.error.png", width = 6, height = 4, units = "in", res = 200)
print(err.plot)
dev.off()
err.plot
```
```{r}
# Should probably print out the confusion matrices of each result within
# classification tasks
dest <- "./img_out/immersion_score/trees"
if (!dir.exists(dest)) {
  dir.create(dest)
}
for (fold in 1:num.folds) {
  curr.tree <- classif.result.dt[
    learner_id == "classif.ctree" & iteration == fold
  ]$learner[[1]]$model
  filename <- file.path(dest, paste(fold, "tree.png", sep = "_"))
  png(filename = filename, width = 20, height = 6, units = "in", res = 200)
  plot(curr.tree)
  dev.off()
}
```
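The note at the top of the previous chunk suggests printing the confusion matrix of every result; a minimal sketch for the ctree learner, using the per-fold predictions stored by `$score()`:
```{r}
for (fold in 1:num.folds) {
  fold.pred <- classif.result.dt[
    learner_id == "classif.ctree" & iteration == fold
  ]$prediction[[1]]
  print(fold.pred$confusion)
}
```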
```{r}
table(cl.2022.dt$Immersion)
# Prepare figure for best model, confusion matrices of test + full, and overall
# classification error
best.ctree <- classif.result.dt[learner_id == "classif.ctree"][which.min(classif.ce)]
best.ctree[, .(iteration, classif.ce, learner_id)]
print(best.ctree$prediction[[1]]$confusion)
best.ctree.full.predict <- best.ctree$learner[[1]]$predict(classif.task)
print(best.ctree.full.predict$confusion)
plot(best.ctree$learner[[1]]$model)
best.model <- classif.result.dt[which.min(classif.ce)]
best.model[, .(iteration, classif.ce, learner_id)]
print(best.model$prediction[[1]]$confusion)
best.full.predict <- best.model$learner[[1]]$predict(classif.task)
print(best.full.predict$confusion)
best.learner <- best.model$learner[[1]]
if (!dir.exists("./obj_out/")) {
dir.create("./obj_out/")
}
save(best.learner, file = "./obj_out/immersion_classifier_model.RData", compress = TRUE)
```
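The saved learner can later be reloaded and applied to the aggregated coaching logs of another school year. A sketch, where `new.year.dt` is a hypothetical table built with the same aggregation and product-feature steps as above:
```{r, eval=FALSE}
load("./obj_out/immersion_classifier_model.RData") # restores best.learner
# new.year.dt is a hypothetical table with the same engineered feature columns
new.predictions <- best.learner$predict_newdata(new.year.dt)
```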
We look at which districts are misclassified, and check whether there is a trend in their misclassifications.
```{r}
cl.2022.w.predict.dt <- cbind(cl.2022.dt, as.data.table(best.full.predict))
wrong.predict.dt <- cl.2022.w.predict.dt[
truth != response,
.(total.teachers, total.duration, visit.n, n.months.visited, truth, response)]
# Most of these are misclassifications where I agree with the generated label,
# especially "Partial" being classified as "Full" when there are many long and
# consistent visits, or "Full" being classified as "Partial" when there are few
# and shorter visits.
wrong.predict.dt
table(wrong.predict.dt[, .(response, truth)])
# The only "Minimal" district classified as "Full" is one which had many visits
# and a high duration, but in relatively fewer months.
wrong.predict.dt[truth == "Minimal" & response == "Full"]
```