matching_districts.rmd

We will use the free and reduced lunch percentage (`FRL`), enrollment (`Enroll`), the percentage of non-white students (`nonwhitepercent`), and the percentage of students passing exams (`passratebase`) for our matching. 

```{r}
library(MatchIt)
library(data.table)
library(dplyr)
library(stringr)
library(purrr)
library(openxlsx)

# Load the district-level MAP results.
district.map.dt <- fread("Data Sources CSV/building_map_data/District MAP content area and grade all disag.csv")

# District key shared by every table in this chunk: "MO-" followed by the
# county-district code left-padded with zeros to six characters.
district.map.dt[, State.District.ID := paste0("MO-", str_pad(COUNTY_DISTRICT, 6, "left", "0"))]

# This introduces NA for any grade outside 03 to 08. This is appropriate for
# this analysis, as we are not concerned with grades outside this range,
# however this should be considered for other forms of analysis.
district.map.dt[, GRADE_LEVEL := as.numeric(GRADE_LEVEL)]

# Percentages that cannot be parsed as numbers become NA and are then counted
# as 0.
district.map.dt[, PROFICIENT_PCT := as.numeric(PROFICIENT_PCT)]
district.map.dt[, PROFICIENT_PCT := ifelse(is.na(PROFICIENT_PCT), 0, PROFICIENT_PCT)]

district.map.dt[, ADVANCED_PCT := as.numeric(ADVANCED_PCT)]
district.map.dt[, ADVANCED_PCT := ifelse(is.na(ADVANCED_PCT), 0, ADVANCED_PCT)]

# Pass rate is the sum of the proficient and advanced percentages.
district.map.dt[, passratebase := PROFICIENT_PCT + ADVANCED_PCT]

# Districts that have a grade 3 "IEP Non MAPA" row for 2021.
iep.reported.2021.districts <- unique(district.map.dt[GRADE_LEVEL == 3 & TYPE == "IEP Non MAPA" & YEAR == 2021]$State.District.ID)

district.demographic.dt <- data.table(read.xlsx("Data Sources/Comparison_Group/District Demographic Data.xlsx"))
district.enrollment.dt <- data.table(read.xlsx("Data Sources/Comparison_Group/District Enrollment.xlsx"))

district.demographic.dt[, State.District.ID := paste0("MO-", str_pad(COUNTY_DISTRICT_CODE, 6, "left", "0"))]
district.enrollment.dt[, State.District.ID := paste0("MO-", str_pad(COUNTY_DISTRICT_CODE, 6, "left", "0"))]

# Inner joins on year and district. `nomatch` is an argument of `[`, not an
# element of `on`: written inside c(...) the NULL is silently dropped and the
# join falls back to the default nomatch = NA, keeping unmatched rows filled
# with NA.
joined.dt <- district.map.dt[district.demographic.dt, on = c("YEAR", "State.District.ID"), nomatch = NULL]
joined.dt <- joined.dt[district.enrollment.dt, on = c("YEAR", "State.District.ID"), nomatch = NULL]
# match within a year against other schools
joined.dt <- joined.dt[YEAR %in% c(2018, 2019, 2020)]
```

```{r}
# Roster of buildings active in DCI, reduced to the distinct
# district / school-year combinations and ordered by school year.
dci.building.dt <- fread("Data Sources/DCI Data/Active Districts/Active_DCI_buildings_2017_2022.csv")

dci.districts.dt <- unique(
  dci.building.dt[, .(State.District.ID, currentSchoolYear)]
)
dci.districts.dt <- dci.districts.dt[order(currentSchoolYear)]

all.dci.districts <- unique(dci.districts.dt$State.District.ID)

# Number each district's rows in year order. Years a district took off from
# the program have no row, so they are not counted.
dci.districts.dt[, year.in.program := seq_len(.N), by = "State.District.ID"]

# Attach the program year to every row of the joined MAP/demographic table;
# rows for districts or years not in the roster keep NA.
dci.joined.dt <- dci.districts.dt[
  joined.dt,
  on = c("State.District.ID", "currentSchoolYear" = "YEAR"),
  nomatch = NA
]

# First program year of each district, and the districts present in 2021.
selected.year <- unique(
  dci.districts.dt[year.in.program == 1, .(currentSchoolYear, State.District.ID)]
)
dci.districts.2021 <- dci.districts.dt[currentSchoolYear == 2021]

# Matching covariates. Values that are missing or not numeric become 0.
dci.joined.dt[, Enroll := as.numeric(ENROLLMENT_GRADES_03)]
dci.joined.dt[is.na(Enroll), Enroll := 0]

dci.joined.dt[, FRL := as.numeric(LUNCH_COUNT_FREE_REDUCED_PCT)]
dci.joined.dt[is.na(FRL), FRL := 0]

dci.joined.dt[, ENROLLMENT_WHITE_PCT := as.numeric(ENROLLMENT_WHITE_PCT)]
dci.joined.dt[is.na(ENROLLMENT_WHITE_PCT), ENROLLMENT_WHITE_PCT := 0]
dci.joined.dt[, nonwhitepercent := 100 - ENROLLMENT_WHITE_PCT]

dci.joined.dt[, PROFICIENT_PCT := as.numeric(PROFICIENT_PCT)]
dci.joined.dt[is.na(PROFICIENT_PCT), PROFICIENT_PCT := 0]

# Treated districts: first program year is 2019 (or 2020) and the district is
# still on the roster in 2021.
started.2019 <- selected.year[
  currentSchoolYear == 2019 &
    State.District.ID %in% dci.districts.2021$State.District.ID
]
started.2020 <- selected.year[
  currentSchoolYear == 2020 &
    State.District.ID %in% dci.districts.2021$State.District.ID
]

# Every other DCI district; these are excluded from the control pool later.
other.dci.districts.2019 <- selected.year[
  !(State.District.ID %in% started.2019$State.District.ID) &
    State.District.ID %in% all.dci.districts
]
other.dci.districts.2020 <- selected.year[
  !(State.District.ID %in% started.2020$State.District.ID) &
    State.District.ID %in% all.dci.districts
]

# Schools that started in 2019 are 1 year
# These are schools that started in fall 2018, and then reported in this data 
# as the year 2019
print(table(
  unique(
    dci.joined.dt[currentSchoolYear == 2019, .(year.in.program, State.District.ID)]
  )$year.in.program
))
# Schools that started in 2020 are 1 year
# These are schools that started in fall 2019, and then reported in this data 
# as the year 2020
print(table(
  unique(
    dci.joined.dt[currentSchoolYear == 2020, .(year.in.program, State.District.ID)]
  )$year.in.program
))


```


```{r}
# Restrict to grade 3 English Language Arts rows of districts that have a
# 2021 grade 3 IEP row.
dci.joined.dt <- dci.joined.dt[
  GRADE_LEVEL == 3 &
    CONTENT_AREA == "Eng. Language Arts" &
    State.District.ID %in% iep.reported.2021.districts
]

# REPORTABLE relative to grade 3 enrollment. Enroll was set to 0 where it was
# missing, so a positive count over 0 gives Inf; those rows are dropped.
dci.joined.dt[, prop.iep := REPORTABLE / Enroll]
dci.joined.dt <- dci.joined.dt[!is.infinite(prop.iep)]

# cohort 2 started treatment in 2019, so 2018 is the year for the baseline
dci.cohort2.dt <- dci.joined.dt[
  currentSchoolYear == 2018 &
    !(State.District.ID %in% other.dci.districts.2019$State.District.ID)
]
dci.cohort2.dt[
  ,
  treated := State.District.ID %in% started.2019$State.District.ID
]
dci.cohort2.iep.dt <- dci.cohort2.dt[TYPE == "IEP Non MAPA"]

# cohort 3 started treatment in 2020, so 2019 is year for baseline
dci.cohort3.dt <- dci.joined.dt[
  currentSchoolYear == 2019 &
    !(State.District.ID %in% other.dci.districts.2020$State.District.ID)
]
dci.cohort3.dt[
  ,
  treated := State.District.ID %in% started.2020$State.District.ID
]
dci.cohort3.iep.dt <- dci.cohort3.dt[TYPE == "IEP Non MAPA"]
```

```{r}
# Optimal matching on a glm-estimated distance, run per cohort with three
# covariate sets: the base set, base + REPORTABLE, and base + prop.iep.

# Base covariates.
m.cohort2.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase,
  data = dci.cohort2.iep.dt,
  method = "optimal",
  distance = "glm"
)

m.cohort3.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase,
  data = dci.cohort3.iep.dt,
  method = "optimal",
  distance = "glm"
)

# Base covariates plus REPORTABLE.
m.cohort2.report.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase + REPORTABLE,
  data = dci.cohort2.iep.dt,
  method = "optimal",
  distance = "glm"
)

m.cohort3.report.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase + REPORTABLE,
  data = dci.cohort3.iep.dt,
  method = "optimal",
  distance = "glm"
)

# Base covariates plus prop.iep.
m.cohort2.iep.prop.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase + prop.iep,
  data = dci.cohort2.iep.dt,
  method = "optimal",
  distance = "glm"
)

m.cohort3.iep.prop.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase + prop.iep,
  data = dci.cohort3.iep.dt,
  method = "optimal",
  distance = "glm"
)

# Print the balance summary of each fit and keep its matched control rows.
summary(m.cohort2.matched.out)
matched.cohort2.districts.dt <- match.data(
  m.cohort2.matched.out,
  group = "control"
)

summary(m.cohort3.matched.out)
matched.cohort3.districts.dt <- match.data(
  m.cohort3.matched.out,
  group = "control"
)

summary(m.cohort2.report.matched.out)
matched.cohort2.report.districts.dt <- match.data(
  m.cohort2.report.matched.out,
  group = "control"
)

summary(m.cohort3.report.matched.out)
matched.cohort3.report.districts.dt <- match.data(
  m.cohort3.report.matched.out,
  group = "control"
)


summary(m.cohort2.iep.prop.matched.out)
matched.cohort2.iep.prop.districts.dt <- match.data(
  m.cohort2.iep.prop.matched.out,
  group = "control"
)

summary(m.cohort3.iep.prop.matched.out)
matched.cohort3.iep.prop.districts.dt <- match.data(
  m.cohort3.iep.prop.matched.out,
  group = "control"
)


# Treated rows are taken from the base-covariate fits only.
matched.cohort2.treated.dt <- match.data(m.cohort2.matched.out, group = "treated")
matched.cohort3.treated.dt <- match.data(m.cohort3.matched.out, group = "treated")
```

```{r}
# Label every matched table with its cohort and with the extra covariate that
# was added to the base formula (NA for the base formula).
matched.cohort2.treated.dt[, `:=`(cohort = 2, extra.var = NA)]
matched.cohort3.treated.dt[, `:=`(cohort = 3, extra.var = NA)]

matched.cohort2.districts.dt[, `:=`(cohort = 2, extra.var = NA)]
matched.cohort3.districts.dt[, `:=`(cohort = 3, extra.var = NA)]

matched.cohort2.report.districts.dt[, `:=`(cohort = 2, extra.var = "reportable")]
matched.cohort3.report.districts.dt[, `:=`(cohort = 3, extra.var = "reportable")]

matched.cohort2.iep.prop.districts.dt[, `:=`(cohort = 2, extra.var = "iep.prop")]
matched.cohort3.iep.prop.districts.dt[, `:=`(cohort = 3, extra.var = "iep.prop")]

# Stack treated rows and the three sets of control rows for both cohorts.
matched.cohorts.dt <- rbind(
  matched.cohort2.treated.dt,
  matched.cohort3.treated.dt,
  matched.cohort2.districts.dt,
  matched.cohort3.districts.dt,
  matched.cohort2.report.districts.dt,
  matched.cohort3.report.districts.dt,
  matched.cohort2.iep.prop.districts.dt,
  matched.cohort3.iep.prop.districts.dt
)

# 2021 grade 3 ELA pass rate of the IEP group for every matched district.
map.2021.result <- district.map.dt[
  YEAR == 2021 &
    TYPE == "IEP Non MAPA" &
    GRADE_LEVEL == 3 &
    CONTENT_AREA == "Eng. Language Arts" &
    State.District.ID %in% matched.cohorts.dt$State.District.ID,
  .(State.District.ID, passrate2021 = passratebase)
]

# Attach the 2021 outcome and compute the change against the baseline year.
# Only districts present in map.2021.result remain after this join.
matched.cohorts.dt <- matched.cohorts.dt[map.2021.result, on = c("State.District.ID")]
matched.cohorts.dt[, change := passrate2021 - passratebase]

# Mean, sd, sum and count of each covariate/outcome per treatment group,
# cohort and covariate set.
matched.summary.dt <- matched.cohorts.dt[
  ,
  unlist(
    recursive = FALSE,
    lapply(
      .(mean = mean, sd = sd, sum = sum, count = length),
      function(f) lapply(.SD, f)
    )
  ),
  by = c("treated", "cohort", "extra.var"),
  .SDcols = c(
    "FRL", "nonwhitepercent", "passratebase", "passrate2021",
    "change", "Enroll", "REPORTABLE", "prop.iep"
  )
]

write.csv(matched.summary.dt, file = "./Data Sources CSV/Matching/summary_different_methods.csv")
write.csv(matched.cohorts.dt, file = "./Data Sources CSV/Matching/matching_results.csv")

# Get the matched pairs for the base-covariate match of each cohort.
#
# match.data() labels each matched set with a `subclass` id, so treated and
# control rows are joined on `subclass`. Sorting the two sides independently
# by `distance` and binding them side by side does not recover the pairs, and
# breaks when the two sides have different lengths (districts without a 2021
# result were dropped by the earlier join).
#
# Output columns: subclass, Treated.District.ID, distance.treated,
# Matched.District.ID, distance.matched. Pairs missing either side are
# omitted.
treated.c2.dt <- unique(matched.cohorts.dt[
  cohort == 2 & treated == TRUE,
  .(State.District.ID, distance, subclass)
])
untreated.c2.dt <- unique(matched.cohorts.dt[
  cohort == 2 & treated == FALSE & is.na(extra.var),
  .(State.District.ID, distance, subclass)
])

treated.c2.dt <- treated.c2.dt[order(distance)]
untreated.c2.dt <- untreated.c2.dt[order(distance)]

setnames(treated.c2.dt, "State.District.ID", "Treated.District.ID")
setnames(untreated.c2.dt, "State.District.ID", "Matched.District.ID")


treated.c3.dt <- unique(matched.cohorts.dt[
  cohort == 3 & treated == TRUE,
  .(State.District.ID, distance, subclass)
])
untreated.c3.dt <- unique(matched.cohorts.dt[
  cohort == 3 & treated == FALSE & is.na(extra.var),
  .(State.District.ID, distance, subclass)
])

treated.c3.dt <- treated.c3.dt[order(distance)]
untreated.c3.dt <- untreated.c3.dt[order(distance)]

setnames(treated.c3.dt, "State.District.ID", "Treated.District.ID")
setnames(untreated.c3.dt, "State.District.ID", "Matched.District.ID")


# Inner join on the pair id; the shared `distance` column gets a suffix per
# side. Rows are ordered by the treated district's distance.
c2.matched.districts <- merge(
  treated.c2.dt,
  untreated.c2.dt,
  by = "subclass",
  suffixes = c(".treated", ".matched")
)
c2.matched.districts <- c2.matched.districts[order(distance.treated)]

c3.matched.districts <- merge(
  treated.c3.dt,
  untreated.c3.dt,
  by = "subclass",
  suffixes = c(".treated", ".matched")
)
c3.matched.districts <- c3.matched.districts[order(distance.treated)]

write.csv(c2.matched.districts, "./Data Sources CSV/Matching/cohort_2_matched.csv")
write.csv(c3.matched.districts, "./Data Sources CSV/Matching/cohort_3_matched.csv")

# Welch two-sample t-tests on the change in pass rate: treated districts
# against the controls from the base-covariate match (extra.var is NA).
print(t.test(
  matched.cohorts.dt[treated == TRUE & cohort == 2]$change,
  matched.cohorts.dt[treated == FALSE & cohort == 2 & is.na(extra.var)]$change
))
print(t.test(
  matched.cohorts.dt[treated == TRUE & cohort == 3]$change,
  matched.cohorts.dt[treated == FALSE & cohort == 3 & is.na(extra.var)]$change
))

# NOTE(review): presumably districts known to be inactive in DCI; the list is
# hard-coded and contains two repeated ids - confirm the source.
inactive.dci.districts <- c("MO-084001", "MO-106004", "MO-058112", "MO-011079", "MO-023101", "MO-004106", "MO-069107", "MO-063067", "MO-004110", "MO-054041", "MO-078003", "MO-078013", "MO-078013", "MO-033090", "MO-033090", "MO-068073", "MO-018050", "MO-088080")


# Inactive districts that ended up in the treated group of each cohort.
print(inactive.dci.districts[which(inactive.dci.districts %in% treated.c2.dt$Treated.District.ID)])
print(inactive.dci.districts[which(inactive.dci.districts %in% treated.c3.dt$Treated.District.ID)])
# Positions of inactive districts among the cohort 3 matched controls.
# c3.matched.districts has no State.District.ID column (it was renamed before
# the tables were combined), so `$State.District.ID` was NULL and this check
# could never find anything.
# NOTE(review): assumes the matched (control) ids were the intended target,
# since the treated ids are checked on the line above - confirm.
which(inactive.dci.districts %in% c3.matched.districts$Matched.District.ID)

```
        Welch Two Sample t-test

data:  matched.cohorts.dt[treated == T & cohort == 2]$change and matched.cohorts.dt[treated == F & cohort == 2 & is.na(extra.var)]$change
t = 0.1151, df = 57.017, p-value = 0.9088
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.13022  12.48780
sample estimates:
mean of x mean of y 
-1.596970 -2.275758 


        Welch Two Sample t-test

data:  matched.cohorts.dt[treated == T & cohort == 3]$change and matched.cohorts.dt[treated == F & cohort == 3 & is.na(extra.var)]$change
t = -1.8441, df = 141.24, p-value = 0.06726
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -15.7501038   0.5473641
sample estimates:
 mean of x  mean of y 
-8.1506849 -0.5493151 

Building level matching, use yearly building level NCES data from Melvin, and yearly building level MAP data from `./Data Sources CSV/building_map_data/` files. ESSA data is building level as well. 

Create 4 matched data sets, cohort 2 and 3, and ESSA/non-ESSA

```{r}
# Building-level MAP results are delivered as a 2018 file and a 2019-2021
# file; load both and stack them.
# COUNTY_DISTRICT and SCHOOL_CODE uniquely identify a building
map.buildings.2018 <- fread("Data Sources CSV/building_map_data/Building MAP content area and grade all disag 2018.csv")
map.buildings.19.21 <- fread("Data Sources CSV/building_map_data/Building MAP content area and grade all disag 2019-2021.csv")

map.building.dt <- rbind(map.buildings.19.21, map.buildings.2018)

# Building key: "MO-<district>-<school code><district>", where <district> is
# the county-district code left-padded with zeros to six characters.
map.building.dt[, State.School.ID := {
  padded.district <- str_pad(COUNTY_DISTRICT, 6, "left", "0")
  paste0("MO-", padded.district, "-", SCHOOL_CODE, padded.district)
}]

# Percentages that cannot be parsed as numbers become NA and are then set to
# 0; the pass rate is the sum of the proficient and advanced percentages.
map.building.dt[, PROFICIENT_PCT := as.numeric(PROFICIENT_PCT)]
map.building.dt[is.na(PROFICIENT_PCT), PROFICIENT_PCT := 0]

map.building.dt[, ADVANCED_PCT := as.numeric(ADVANCED_PCT)]
map.building.dt[is.na(ADVANCED_PCT), ADVANCED_PCT := 0]

map.building.dt[, passratebase := PROFICIENT_PCT + ADVANCED_PCT]

# Buildings that have a grade 03 "IEP Non MAPA" row for 2021.
iep.reported.2021.schools <- unique(
  map.building.dt[
    GRADE_LEVEL == "03" & TYPE == "IEP Non MAPA" & YEAR == 2021
  ]$State.School.ID
)


# Building-level demographic and enrollment workbooks.
nces.demographic.dt <- data.table(read.xlsx("./Data Sources/NCES Demographic/Building Demographic Data 2006 to Current.xlsx"))
nces.enrollment.dt <- data.table(read.xlsx("./Data Sources/NCES Demographic/Building Enrollment.xlsx"))

# Inner join of the two workbooks on building, year and names, restricted to
# the years 2018 through 2020.
nces.building.dt <- nces.demographic.dt[
  nces.enrollment.dt,
  on = c("COUNTY_DISTRICT_CODE", "SCHOOL_CODE", "YEAR", "DISTRICT_NAME", "SCHOOL_NAME"),
  nomatch = NULL
]
nces.building.dt <- nces.building.dt[YEAR %in% c(2018, 2019, 2020)]

# Same building key construction as used for the MAP table.
nces.building.dt[, State.School.ID := {
  padded.district <- str_pad(COUNTY_DISTRICT_CODE, 6, "left", "0")
  paste0("MO-", padded.district, "-", SCHOOL_CODE, padded.district)
}]

# Inner join of demographics/enrollment with the MAP rows per building-year.
joined.building.dt <- nces.building.dt[
  map.building.dt,
  on = c("YEAR", "State.School.ID"),
  nomatch = NULL
]


# Roster of buildings active in DCI, reduced to the distinct
# building / school-year combinations and ordered by school year.
dci.building.dt <- fread("Data Sources/DCI Data/Active Districts/Active_DCI_buildings_2017_2022.csv")

dci.building.dt <- unique(
  dci.building.dt[, .(State.School.ID, currentSchoolYear)]
)
dci.building.dt <- dci.building.dt[order(currentSchoolYear)]

all.dci.schools <- unique(dci.building.dt$State.School.ID)

# Number each building's rows in year order; years without a roster row are
# not counted.
dci.building.dt[, year.in.program := seq_len(.N), by = "State.School.ID"]

dci.year.dt <- dci.building.dt[
  ,
  .(year.in.program, currentSchoolYear, State.School.ID)
]

# First program year of each building, and the buildings present in 2021.
building.selected.year <- unique(
  dci.building.dt[year.in.program == 1, .(currentSchoolYear, State.School.ID)]
)
dci.buildings.2021 <- dci.building.dt[currentSchoolYear == 2021]

# Attach the program year to every row of the joined building table; rows for
# buildings or years not in the roster keep NA.
dci.building.dt <- dci.building.dt[
  joined.building.dt,
  on = c("State.School.ID", "currentSchoolYear" = "YEAR"),
  nomatch = NA
]

# Matching covariates. Values that are missing or not numeric become 0.
dci.building.dt[, Enroll := as.numeric(ENROLLMENT_GRADES_03)]
dci.building.dt[is.na(Enroll), Enroll := 0]

dci.building.dt[, FRL := as.numeric(LUNCH_COUNT_FREE_REDUCED_PCT)]
dci.building.dt[is.na(FRL), FRL := 0]

dci.building.dt[, ENROLLMENT_WHITE_PCT := as.numeric(ENROLLMENT_WHITE_PCT)]
dci.building.dt[is.na(ENROLLMENT_WHITE_PCT), ENROLLMENT_WHITE_PCT := 0]
dci.building.dt[, nonwhitepercent := 100 - ENROLLMENT_WHITE_PCT]

dci.building.dt[, PROFICIENT_PCT := as.numeric(PROFICIENT_PCT)]
dci.building.dt[is.na(PROFICIENT_PCT), PROFICIENT_PCT := 0]


# Treated buildings: first program year is 2019 (or 2020) and the building is
# still on the roster in 2021.
buildings.started.2019 <- building.selected.year[
  currentSchoolYear == 2019 &
    State.School.ID %in% dci.buildings.2021$State.School.ID
]
buildings.started.2020 <- building.selected.year[
  currentSchoolYear == 2020 &
    State.School.ID %in% dci.buildings.2021$State.School.ID
]

# Every other DCI building; these are excluded from the control pool later.
other.dci.buildings.2019 <- building.selected.year[
  !(State.School.ID %in% buildings.started.2019$State.School.ID) &
    State.School.ID %in% all.dci.schools
]
other.dci.buildings.2020 <- building.selected.year[
  !(State.School.ID %in% buildings.started.2020$State.School.ID) &
    State.School.ID %in% all.dci.schools
]

# ESSA (Title 1) building list by year.
# This file is not part of our main existing data set, it was provided to us on
# request
essa.years.dt <- data.table(
  read.xlsx(
    "./Data Sources/Building ESSA/Title 1 Schools 2018 - 2022.xlsx",
    sheet = "Title 1 Buildings",
    startRow = 5
  )
)

# Same building key construction as the MAP and NCES tables, built from the
# workbook's ACCTYDIS and ACSCHOOL columns.
essa.years.dt[, State.School.ID := {
  padded.district <- str_pad(ACCTYDIS, 6, "left", "0")
  paste0("MO-", padded.district, "-", ACSCHOOL, padded.district)
}]

# One row per ESSA building-year, carrying the DCI program year where the
# building is on the DCI roster for that year (NA otherwise).
essa.year.in.program.dt <- dci.year.dt[
  essa.years.dt,
  on = c("currentSchoolYear" = "ACYEAR", "State.School.ID")
]

# Add the ESSA columns to the building table. After this join the school year
# is held in the ACYEAR column.
dci.building.dt <- essa.years.dt[
  dci.building.dt,
  on = c("State.School.ID", "ACYEAR" = "currentSchoolYear")
]
# The following chunks subset this table by cohort and ESSA status before
# matching.
```

```{r}
# Restrict to grade 03 English Language Arts rows of buildings that have a
# 2021 grade 03 IEP row and a non-zero grade 3 enrollment.
dci.building.filtered.dt <- dci.building.dt[
  GRADE_LEVEL == "03" &
    CONTENT_AREA == "Eng. Language Arts" &
    State.School.ID %in% iep.reported.2021.schools &
    Enroll != 0
]
dci.building.filtered.dt[, prop.iep := REPORTABLE / Enroll]
dci.building.filtered.dt <- dci.building.filtered.dt[!is.infinite(prop.iep)]

# Buildings on the ESSA list in the year they started DCI (not the baseline
# year).
essa.schools.2019 <- essa.year.in.program.dt[currentSchoolYear == 2019]$State.School.ID
essa.schools.2020 <- essa.year.in.program.dt[currentSchoolYear == 2020]$State.School.ID

# cohort 2 started treatment in 2019, so 2018 is the year for the baseline
dci.building.cohort2.dt <- dci.building.filtered.dt[
  ACYEAR == 2018 &
    !(State.School.ID %in% other.dci.buildings.2019$State.School.ID)
]
dci.building.cohort2.dt[, `:=`(
  treated = State.School.ID %in% buildings.started.2019$State.School.ID,
  cohort = 2
)]

dci.building.cohort2.iep.dt <- dci.building.cohort2.dt[TYPE == "IEP Non MAPA"]
dci.cohort2.essa.dt <- dci.building.cohort2.iep.dt[
  State.School.ID %in% essa.schools.2019
]
dci.cohort2.non.essa.dt <- dci.building.cohort2.iep.dt[
  !(State.School.ID %in% essa.schools.2019)
]

# cohort 3 started treatment in 2020, so 2019 is year for baseline
dci.building.cohort3.dt <- dci.building.filtered.dt[
  ACYEAR == 2019 &
    !(State.School.ID %in% other.dci.buildings.2020$State.School.ID)
]
dci.building.cohort3.dt[, `:=`(
  treated = State.School.ID %in% buildings.started.2020$State.School.ID,
  cohort = 3
)]

dci.building.cohort3.iep.dt <- dci.building.cohort3.dt[TYPE == "IEP Non MAPA"]
dci.cohort3.essa.dt <- dci.building.cohort3.iep.dt[
  State.School.ID %in% essa.schools.2020
]
dci.cohort3.non.essa.dt <- dci.building.cohort3.iep.dt[
  !(State.School.ID %in% essa.schools.2020)
]
```

```{r}

# Optimal matching on a glm-estimated distance for the ESSA buildings of each
# cohort, using the base covariates. The non-ESSA subsets are not matched
# here.
m.cohort2.essa.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase,
  data = dci.cohort2.essa.dt,
  method = "optimal",
  distance = "glm"
)

m.cohort3.essa.matched.out <- matchit(
  treated ~ FRL + Enroll + nonwhitepercent + passratebase,
  data = dci.cohort3.essa.dt,
  method = "optimal",
  distance = "glm"
)

# Print the balance summaries.
summary(m.cohort2.essa.matched.out)
summary(m.cohort3.essa.matched.out)

# Matched rows of both groups (treated and control) per cohort.
matched.cohort2.essa.dt <- match.data(m.cohort2.essa.matched.out)
matched.cohort3.essa.dt <- match.data(m.cohort3.essa.matched.out)
```

```{r}
# Stack the matched ESSA buildings of both cohorts.
matched.buildings.dt <- rbind(matched.cohort2.essa.dt, matched.cohort3.essa.dt)

# 2021 grade 03 ELA pass rate of the IEP group for every matched building.
map.building.2021.result <- map.building.dt[
  YEAR == 2021 &
    TYPE == "IEP Non MAPA" &
    GRADE_LEVEL == "03" &
    CONTENT_AREA == "Eng. Language Arts" &
    State.School.ID %in% matched.buildings.dt$State.School.ID,
  .(State.School.ID, passrate2021 = passratebase)
]

# Attach the 2021 outcome and compute the change against the baseline year.
# Only buildings present in map.building.2021.result remain after this join.
matched.buildings.dt <- matched.buildings.dt[
  map.building.2021.result,
  on = c("State.School.ID")
]
matched.buildings.dt[, change := passrate2021 - passratebase]

# Mean, sd, sum and count of each covariate/outcome per treatment group and
# cohort.
matched.buildings.summary.dt <- matched.buildings.dt[
  ,
  unlist(
    recursive = FALSE,
    lapply(
      .(mean = mean, sd = sd, sum = sum, count = length),
      function(f) lapply(.SD, f)
    )
  ),
  by = c("treated", "cohort"),
  .SDcols = c(
    "FRL", "nonwhitepercent", "passratebase", "passrate2021",
    "change", "Enroll", "REPORTABLE", "prop.iep"
  )
]


# Welch two-sample t-tests on the change in pass rate, treated against
# matched control buildings, per cohort.
print(t.test(
  matched.buildings.dt[treated == TRUE & cohort == 2]$change,
  matched.buildings.dt[treated == FALSE & cohort == 2]$change
))
print(t.test(
  matched.buildings.dt[treated == TRUE & cohort == 3]$change,
  matched.buildings.dt[treated == FALSE & cohort == 3]$change
))


# Paired t-tests. A paired test compares element i of one vector with
# element i of the other, so each treated building must sit in the same
# position as its own matched control. Rows of matched.buildings.dt are not
# stored in pair order, so the two sides are aligned on `subclass`, the pair
# id added by match.data(). Pairs missing either side are left out.
# The paired results recorded beneath this chunk were produced without this
# alignment and should be regenerated.
paired.change.by.subclass <- function(matched.dt, cohort.id) {
  cohort.dt <- matched.dt[cohort == cohort.id]
  merge(
    cohort.dt[treated == TRUE, .(subclass, treated.change = change)],
    cohort.dt[treated == FALSE, .(subclass, control.change = change)],
    by = "subclass"
  )
}

c2.paired.change <- paired.change.by.subclass(matched.buildings.dt, 2)
c3.paired.change <- paired.change.by.subclass(matched.buildings.dt, 3)

print(t.test(
  c2.paired.change$treated.change,
  c2.paired.change$control.change,
  paired = TRUE
))
print(t.test(
  c3.paired.change$treated.change,
  c3.paired.change$control.change,
  paired = TRUE
))

# Output the resulting data tables, and matched pairs of buildings

write.csv(matched.buildings.summary.dt, file = "./Data Sources CSV/Matching/summary_buildings_different_methods.csv")
write.csv(matched.buildings.dt, file = "./Data Sources CSV/Matching/matching_building_results.csv")

# Get the matched pairs for each cohort.
#
# match.data() labels each matched set with a `subclass` id, so treated and
# control rows are joined on `subclass`. Sorting the two sides independently
# by `distance` and binding them side by side does not recover the pairs, and
# breaks when the two sides have different lengths.
#
# Output columns: subclass, Treated.School.ID, distance.treated,
# Matched.School.ID, distance.matched. Pairs missing either side are omitted.
treated.c2.dt <- unique(matched.buildings.dt[
  cohort == 2 & treated == TRUE,
  .(State.School.ID, distance, subclass)
])
untreated.c2.dt <- unique(matched.buildings.dt[
  cohort == 2 & treated == FALSE,
  .(State.School.ID, distance, subclass)
])

treated.c2.dt <- treated.c2.dt[order(distance)]
untreated.c2.dt <- untreated.c2.dt[order(distance)]

setnames(treated.c2.dt, "State.School.ID", "Treated.School.ID")
setnames(untreated.c2.dt, "State.School.ID", "Matched.School.ID")


treated.c3.dt <- unique(matched.buildings.dt[
  cohort == 3 & treated == TRUE,
  .(State.School.ID, distance, subclass)
])
untreated.c3.dt <- unique(matched.buildings.dt[
  cohort == 3 & treated == FALSE,
  .(State.School.ID, distance, subclass)
])

treated.c3.dt <- treated.c3.dt[order(distance)]
untreated.c3.dt <- untreated.c3.dt[order(distance)]

setnames(treated.c3.dt, "State.School.ID", "Treated.School.ID")
setnames(untreated.c3.dt, "State.School.ID", "Matched.School.ID")


# Inner join on the pair id; the shared `distance` column gets a suffix per
# side. Rows are ordered by the treated building's distance.
c2.matched.buildings <- merge(
  treated.c2.dt,
  untreated.c2.dt,
  by = "subclass",
  suffixes = c(".treated", ".matched")
)
c2.matched.buildings <- c2.matched.buildings[order(distance.treated)]

c3.matched.buildings <- merge(
  treated.c3.dt,
  untreated.c3.dt,
  by = "subclass",
  suffixes = c(".treated", ".matched")
)
c3.matched.buildings <- c3.matched.buildings[order(distance.treated)]

write.csv(c2.matched.buildings, "./Data Sources CSV/Matching/cohort_2_matched_buildings.csv")
write.csv(c3.matched.buildings, "./Data Sources CSV/Matching/cohort_3_matched_buildings.csv")
```

        Welch Two Sample t-test

data:  matched.buildings.dt[treated == T & cohort == 2]$change and matched.buildings.dt[treated == F & cohort == 2]$change
t = 1.3105, df = 117.42, p-value = 0.1926
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -2.891999 14.206753
sample estimates:
mean of x mean of y 
-3.693443 -9.350820 


        Welch Two Sample t-test

data:  matched.buildings.dt[treated == T & cohort == 3]$change and matched.buildings.dt[treated == F & cohort == 3]$change
t = -1.9733, df = 203.31, p-value = 0.04982
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -12.40086040  -0.00496484
sample estimates:
mean of x mean of y 
-8.022330 -1.819417 


        Paired t-test

data:  matched.buildings.dt[treated == T & cohort == 2]$change and matched.buildings.dt[treated == F & cohort == 2]$change
t = 1.5897, df = 60, p-value = 0.1171
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -1.461029 12.775783
sample estimates:
mean of the differences 
               5.657377 


        Paired t-test

data:  matched.buildings.dt[treated == T & cohort == 3]$change and matched.buildings.dt[treated == F & cohort == 3]$change
t = -1.837, df = 102, p-value = 0.06912
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -12.9003305   0.4945052
sample estimates:
mean of the differences 
              -6.202913