build_correlation_animint.rmd

---
title: "Creating a function to create correlation plots as shown in coaching_correlation.rmd"
author: "Charles Saluski"
---

```{r}
# Note that if caret is used to normalize data then it must be imported before
# animint2, as it imports ggplot2 and will shadow functions from it.
library(data.table)
# library(caret)
library(animint2)
library(ash)
```

```{r} 
# Open design questions, kept from the original author:
# - Should this have some form of height calculation for the coefficient and
#   correlation plots? As is, trying to plot more than 30 variables can start
#   getting really cramped, but we also have to be aware of the space
#   complexity of obs * vars^2, which gets a bit unwieldy with large counts of
#   vars or observations.
# - I think animint2 has some way to work with wide data tables instead of
#   long, but I don't know how to work with that option yet. That would allow
#   us to utilize the original data tables, avoiding additional space being
#   required.

#' Build linked coefficient / correlation / 2d plots for animint2
#'
#' Computes the Pearson correlation matrix of every non-id column of
#' `original_dt` and assembles three linked ggplot objects: model coefficients
#' per variable, a ranked strip of correlations for the selected variable, and
#' a scatter plot (or binned heat map) of the two selected variables.
#'
#' @param original_dt Wide data.table of observations. Every column that is
#'   neither an id column nor excluded is passed to `cor()`.
#' @param model_output_dt data.table of model output. The columns read here
#'   are `var`, `count` and `norm.coef`. It is not modified.
#' @param id_vars_vec Character vector of id column names in `original_dt`.
#'   They are left out of the correlation, used as melt ids / join keys, pasted
#'   together for the scatter tooltip, and the first one is the scatter plot's
#'   `clickSelects` variable.
#' @param exclude_vars_vec Optional character vector of variables to drop from
#'   both `original_dt` (columns) and `model_output_dt` (rows of `var`).
#' @param normalize Currently unused; kept so existing calls keep working.
#' @param heatmap If `TRUE`, draw 5 x 5 binned counts (via `ash::bin2`) instead
#'   of the jittered scatter plot.
#' @return A named list with the ggplot objects `varcoef`, `coefvec` and
#'   `varscatter` plus a `duration` list for the `var` and `corr.var`
#'   selectors; callers add a title and pass it to `animint2dir()`.
build_correlation_animint <- function(original_dt,
                                      model_output_dt,
                                      id_vars_vec,
                                      exclude_vars_vec = NULL,
                                      normalize = FALSE,
                                      heatmap = FALSE) {
  stopifnot(is.character(id_vars_vec), length(id_vars_vec) > 0)

  # Drop excluded variables once, up front. Testing the length (rather than
  # missing()) also covers an explicit NULL or empty vector, and selecting the
  # columns to keep never complains about names that are not present.
  if (length(exclude_vars_vec) > 0) {
    kept.cols <- setdiff(colnames(original_dt), exclude_vars_vec)
    original_dt <- original_dt[, ..kept.cols]
    model_output_dt <- model_output_dt[!var %in% exclude_vars_vec]
  }

  # Everything that is left and is not an id column gets correlated.
  corr.cols <- setdiff(colnames(original_dt), id_vars_vec)

  cormat.raw <- cor(
    original_dt[, ..corr.cols],
    method = "pearson"
  )

  # Long form of the full (symmetric, diagonal included) correlation matrix.
  cormat.long.full <- data.table(reshape2::melt(cormat.raw))
  setnames(
    cormat.long.full,
    c("Var1", "Var2", "value"),
    c("var", "corr.var", "corr.coef")
  )

  var.names <- unique(cormat.long.full$var)

  # One row per (variable, count) that appears in the model output and was
  # also correlated.
  select.var.dt <- unique(model_output_dt[, .(var, count)])
  select.var.dt <- select.var.dt[var %in% var.names]

  # Attach the model's count to every correlation whose corr.var is a model
  # variable.
  coef.corr.dt <- cormat.long.full[select.var.dt, , on = c(corr.var = "var")]

  # Within each variable, rank the correlating variables by absolute
  # correlation: 1 = strongest. rank() (not order()) is required so that the
  # value stored on a row is that row's own position; ties are broken by row
  # order and NA correlations are ranked last.
  coef.corr.dt[
    ,
    var.order := rank(-abs(corr.coef), ties.method = "first", na.last = TRUE),
    by = var
  ]

  if (heatmap) {
    # Every combination of 2 variable columns needs to have a heat map
    # generated, stored long as var, corr.var, x, y, count.
    heatmap.resolution <- 5
    bin.index <- seq_len(heatmap.resolution)
    heatmap.list <- list()
    for (colx in corr.cols) {
      for (coly in corr.cols) {
        current.cols <- c(colx, coly)
        binned <- bin2(
          as.matrix(original_dt[, ..current.cols]),
          nbin = c(heatmap.resolution, heatmap.resolution)
        )
        # Per the ash documentation bin2() returns the binning interval of
        # each dimension in the rows of `ab`; the tiles are placed at the bin
        # centers so they are in the data's own coordinates.
        x.range <- binned$ab[1, ]
        y.range <- binned$ab[2, ]
        x.centers <- x.range[1] +
          (bin.index - 0.5) * diff(x.range) / heatmap.resolution
        y.centers <- y.range[1] +
          (bin.index - 0.5) * diff(y.range) / heatmap.resolution
        # as.vector() walks the count matrix column by column, i.e. the x bin
        # (row) varies fastest, so x is cycled and y is repeated in blocks.
        heatmap.list[[paste(colx, coly, sep = ":")]] <- data.table(
          x = rep(x.centers, times = heatmap.resolution),
          y = rep(y.centers, each = heatmap.resolution),
          count = as.vector(binned$nc),
          var = colx,
          corr.var = coly
        )
      }
    }
    heatmap.dt <- rbindlist(heatmap.list)

    # we can't plot the text label at an arbitrary point because of the changes
    # in the coordinate systems, so we have to make a table with each location
    text.plot.dt <- coef.corr.dt[heatmap.dt, on = c("var", "corr.var")]
    text.plot.dt[, var.x := (min(x) + max(x)) / 2, by = var]
    text.plot.dt[, corr.y := (min(y) + max(y)) / 2, by = corr.var]
    text.plot.dt[, corr.x := min(x), by = var]
    text.plot.dt[, var.y := min(y), by = corr.var]
    text.plot.dt <- unique(text.plot.dt[
      ,
      c("var", "corr.var", "var.x", "var.y", "corr.x", "corr.y")
    ])
  } else {
    # make a giant long data table of every var1 var2 val1 val2; excluded
    # variables were already removed from original_dt above
    long.data.var <- melt(
      original_dt,
      id.vars = id_vars_vec,
      variable.name = "var",
      value.name = "var.val"
    )
    long.data.corr.var <- melt(
      original_dt,
      id.vars = id_vars_vec,
      variable.name = "corr.var",
      value.name = "corr.var.val"
    )
    scatter.dt <- long.data.var[
      long.data.corr.var,
      on = id_vars_vec,
      allow.cartesian = TRUE
    ]

    # these intermediary data tables might be big, so save a little memory and
    # get rid of them
    rm("long.data.var", "long.data.corr.var")

    # we can't plot the text label at an arbitrary point because of the changes
    # in the coordinate systems, so we have to make a table with each location
    text.plot.dt <- coef.corr.dt[scatter.dt, on = c("var", "corr.var")]
    text.plot.dt[, var.x := (min(var.val) + max(var.val)) / 2, by = var]
    text.plot.dt[
      ,
      corr.y := (min(corr.var.val) + max(corr.var.val)) / 2,
      by = corr.var
    ]
    text.plot.dt[, corr.x := min(var.val), by = var]
    text.plot.dt[, var.y := min(corr.var.val), by = corr.var]
    text.plot.dt <- unique(text.plot.dt[
      ,
      c("var", "corr.var", "var.x", "var.y", "corr.x", "corr.y")
    ])

    # Tooltip text built from whatever id columns the caller supplied (space
    # separated), instead of hard-coded column names. Added after the join
    # above so the label table does not carry it.
    scatter.dt[
      ,
      id.label := do.call(paste, unname(as.list(.SD))),
      .SDcols = id_vars_vec
    ]
  }

  # Model rows for the correlated variables. Subsetting yields a new table, so
  # the caller's model_output_dt is never modified by reference.
  model.points.dt <- model_output_dt[var %in% var.names]

  # create a mean of each variable for plotting
  model_mean_dt <- model.points.dt[
    ,
    .(mean = mean(norm.coef), count = count),
    by = var
  ]
  model_mean_dt <- unique(model_mean_dt[, .(mean, var, count)])

  var.coef.plot <- ggplot() +
    # wide translucent band marking the currently selected corr.var
    geom_segment(
      data = coef.corr.dt,
      aes(y = corr.var, yend = corr.var, x = -Inf, xend = Inf),
      color = "black",
      showSelected = "corr.var",
      size = 13,
      alpha = 0.5
    ) +
    # band per variable colored by its correlation with the selected var
    geom_segment(
      data = coef.corr.dt,
      aes(
        y = corr.var,
        yend = corr.var,
        x = -Inf,
        xend = Inf,
        color = corr.coef,
        tooltip = corr.coef
      ),
      size = 10,
      showSelected = "var"
    ) +
    # clickable band that selects the main variable
    geom_segment(
      data = select.var.dt,
      aes(y = var, yend = var, x = -Inf, xend = Inf),
      color = "black",
      clickSelects = "var",
      size = 10,
      alpha = 0.5
    ) +
    geom_point(
      data = model.points.dt,
      aes(x = norm.coef, y = var, tooltip = norm.coef)
    ) +
    geom_point(
      data = model_mean_dt,
      aes(x = mean, y = var, tooltip = paste0("mean = ", mean)),
      color = "blue",
      size = 5,
      alpha = 0.5
    ) +
    scale_color_gradient2(
      low = "blue",
      mid = "white",
      high = "red",
      midpoint = 0,
      limit = c(-1, 1)
    ) +
    facet_grid(count ~ ., scales = "free", space = "free") +
    labs(
      title = "Variables that predict output",
      x = "Normalized linear model coefficient",
      y = "Variable"
    ) +
    theme_animint(height = 600, width = 800)

  corr.vec.plot <- ggplot() +
    # trying to use geom_raster silently breaks the entire plot system here,
    # seems like a major bug with animint2
    geom_tile(
      data = coef.corr.dt,
      aes(
        x = 1,
        y = -var.order,
        fill = corr.coef,
        key = corr.var,
        tooltip = corr.coef
      ),
      showSelected = "var",
      clickSelects = "corr.var"
    ) +
    geom_text(
      data = coef.corr.dt,
      aes(x = 2, y = -var.order - 0.3, label = corr.var),
      size = 10,
      showSelected = "var",
      clickSelects = "corr.var",
      # TODO Character alignment is not in the version of ggplot2 that animint
      # is based off, could be worth porting
      # hjust = "right"
      # TODO could also be worth making a better default text alignment in
      # animint2, see https://developer.mozilla.org/en-US/docs/Web/SVG/Attribute/alignment-baseline
      # for the SVG property that would probably be used for this
      hjust = 0
    ) +
    coord_equal() +
    scale_fill_gradient2(
      low = "blue",
      mid = "white",
      high = "red",
      midpoint = 0,
      limit = c(-1, 1)
    ) +
    labs(
      title = "Variables Most Correlated",
      y = "Correlating variables ordered by absolute value of correlation"
    ) +
    scale_x_continuous(limits = c(0, 20)) +
    theme(
      axis.text.x = element_blank(),
      axis.title.x = element_blank(),
      axis.ticks.x = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      legend.position = "none"
    ) +
    theme_animint(height = 400, width = 300)

  var.2d.plot <- ggplot()

  if (heatmap) {
    var.2d.plot <- var.2d.plot +
      # change to geom_rect and see if that fixes the update_axes
      geom_tile(
        data = heatmap.dt,
        aes(x = x, y = y, fill = log(count)),
        showSelected = c("var", "corr.var")
      ) +
      theme_animint(
        update_axes = c("x", "y"),
        height = 400,
        width = 400
      )
  } else {
    var.2d.plot <- var.2d.plot +
      geom_jitter(
        aes(x = var.val, y = corr.var.val, tooltip = id.label),
        data = scatter.dt,
        width = 0.1,
        height = 0.1,
        alpha = 0.6,
        # clicking a point selects on the first id column
        clickSelects = id_vars_vec[1],
        showSelected = c("var", "corr.var"),
        chunk_vars = c("var")
      ) +
      theme_animint(
        update_axes = c("x", "y"),
        height = 400,
        width = 400
      )
  }

  # In-panel axis labels naming the two selected variables.
  var.2d.plot <- var.2d.plot +
    geom_text(
      data = text.plot.dt,
      aes(label = var, x = var.x, y = var.y),
      color = "red",
      showSelected = c("var", "corr.var")
    ) +
    geom_text(
      data = text.plot.dt,
      aes(label = corr.var, x = corr.x, y = corr.y, angle = 90),
      color = "red",
      showSelected = c("var", "corr.var")
      # angle = 90,
    ) +
    labs(
      title = "Scatter Plot of Selected Variables",
      x = "Selected main variable",
      y = "Selected secondary variable"
    )

  # plot.list$correlationheatmap <- coaching.corr.plot
  list(
    varcoef = var.coef.plot,
    coefvec = corr.vec.plot,
    varscatter = var.2d.plot,
    duration = list("var" = 1000, "corr.var" = 1000)
  )
}
```


```{r}
library(openxlsx)
library(stringr)

# Load the combined IC / CWIS / NCES table and keep only rows without any NA.
full.joined.loc <- "./Data Sources CSV/ic.cwis.nces.computed.combined.csv"
joined.dt <- fread(full.joined.loc)
joined.dt <- joined.dt[complete.cases(joined.dt)]

# Sanitize every column name in one pass: any character that is not a letter,
# digit, dot or underscore becomes a dot.
setnames(
  joined.dt,
  str_replace_all(colnames(joined.dt), "[^[:alnum:]._]", ".")
)

# Read the GLM coefficients and keep a single method / task.
glm.coef.loc <- "./Data Sources CSV/regr.glm.coef.csv"
glm.coef.dt <- fread(glm.coef.loc)[
  method == "lambda.min" & task_id == "ic.etlp.no.cfa"
]
selected.vars <- unique(glm.coef.dt[["var"]])

# normalize the coefficients of the models: scale each coefficient by the
# standard deviation of its variable in the data
joined.sd <- apply(joined.dt[, ..selected.vars], 2, sd)
glm.coef.sd.dt <- data.table(sd = joined.sd, var = names(joined.sd))

# Right join: every coefficient row is kept; rows without a matching standard
# deviation get sd = 0 and therefore a normalized coefficient of 0.
glm.coef.dt <- glm.coef.sd.dt[glm.coef.dt, on = "var"]
glm.coef.dt[is.na(sd), sd := 0]
glm.coef.dt[, norm.coef := coef * sd]

# kinda ugly way to add the ETLP_avg to the coefficients plot, so that it shows
# up in the correlation plots
# NOTE(review): the values are matched to columns by position - confirm this
# still lines up if the coefficient file's columns change.
glm.coef.dt <- rbind(
  glm.coef.dt,
  list(NA, "CWIS_ETLP_avg", "lambda.min", NA, "ic.etlp.no.cfa", NA, NA)
)

# get rid of weird characters in the column names (here underscores are
# replaced as well, unlike for joined.dt above)
setnames(
  glm.coef.dt,
  str_replace_all(colnames(glm.coef.dt), "[^[:alnum:].]", ".")
)
```

```{r}
id.cols <- c("State.District.ID", "year")

# Model variables whose name contains "IC_"; every other non-id column of the
# data is excluded from this visualization.
ic.cols <- unique(grep("IC_", glm.coef.dt$var, value = TRUE))
ic.exclude.cols <- setdiff(colnames(joined.dt), c(ic.cols, id.cols))

ic.vis <- build_correlation_animint(
  original_dt = joined.dt,
  model_output_dt = glm.coef.dt,
  id_vars_vec = id.cols,
  exclude_vars_vec = ic.exclude.cols
)
ic.vis$title <- "Correlation of Integration Checklist Variables with GLM models"

# ic.heatmap.vis <- build_correlation_animint(ic.dt, glm.coef.dt, id.cols, ic.exclude.cols, heatmap = TRUE)

# Write the compiled visualization to disk without opening a browser.
animint2dir(ic.vis, out.dir = "./animint_out/ic", open.browser = FALSE)
# animint2dir(ic.heatmap.vis, out.dir = "./animint_out/ic_heatmap", open.browser = FALSE)
# We are using GitHub Pages to host these now instead of Gist, as the Gist files
# tend to get too large or too numerous.
# animint2gist(ic.vis)

# The plot list is no longer needed once it has been written out.
rm(ic.vis)
```

We really want a way to visualize correlation across multiple data sets, maybe
by keeping only the variables that have count > 0?

```{r}
id.cols <- c("State.District.ID", "year")

# Keep only the variables that were selected by at least one model.
glm.coef.dt <- glm.coef.dt[count > 0]
# NOTE(review): presumably the outcome row is appended again because the
# filter above removes the placeholder row added earlier - confirm. The values
# are matched to columns by position.
glm.coef.dt <- rbind(
  glm.coef.dt,
  list(NA, "CWIS_ETLP_avg", "lambda.min", NA, "etlp.no.cfa", NA, NA)
)

selected.vars <- unique(glm.coef.dt$var)
keep.cols <- c(selected.vars, id.cols)

# Exclude every column that is neither numeric nor explicitly kept (model
# variables and id columns). is.numeric() is TRUE for both double and integer
# columns; the previous test compared class() against "int", which never
# matches because integer columns report the class "integer".
is.numeric.col <- vapply(joined.dt, is.numeric, logical(1))
joined.exclude.cols <- colnames(joined.dt)[
  !(is.numeric.col | colnames(joined.dt) %in% keep.cols)
]

joined.vis <- build_correlation_animint(
  joined.dt,
  glm.coef.dt,
  id.cols,
  joined.exclude.cols
)
joined.vis$title <- "Correlation of Full Data Set Variables with GLM models"

animint2dir(joined.vis, out.dir = "./animint_out/joined", open.browser = FALSE)

# animint2gist(joined.vis)
rm("joined.vis")
```

```{r}
id.cols <- c("State.District.ID", "year")

glm.coef.loc <- "./Data Sources CSV/regr.glm.coef.csv"

# Coefficients of the coaching-log models only.
glm.coef.dt <- fread(glm.coef.loc)
glm.coef.dt <- glm.coef.dt[method == "lambda.min" & task_id == "cl.no.cfa"]

glm.vars <- glm.coef.dt$var

coaching.agg.loc <- "./Data Sources CSV/ic.cwis.nces.cl.computed.combined.csv"
coaching.agg.dt <- fread(coaching.agg.loc)

# Drop incomplete rows BEFORE computing the standard deviations, as is done
# for the joined data set. sd() returns NA as soon as its input contains an
# NA, and that NA is replaced by 0 below, which would silently zero the
# normalized coefficient of any variable with a missing value.
coaching.agg.dt <- coaching.agg.dt[complete.cases(coaching.agg.dt)]

# Model variables whose name contains "Coaching_"; every other non-id column
# is excluded from the visualization.
cl.cols <- unique(grep("Coaching_", glm.coef.dt$var, value = TRUE))
cl.exclude.cols <- colnames(coaching.agg.dt)[
  !(colnames(coaching.agg.dt) %in% c(cl.cols, id.cols))
]

# normalize the coefficients of the models
coaching.sd <- apply(coaching.agg.dt[, !..id.cols], 2, sd)
glm.coef.sd.dt <- data.table(
  sd = coaching.sd,
  var = names(coaching.sd)
)

# Right join keeps every coefficient row; coefficients without a matching
# standard deviation get sd = 0.
glm.coef.dt <- glm.coef.sd.dt[glm.coef.dt, on = c("var")]
glm.coef.dt[, sd := ifelse(is.na(sd), 0, sd)]
glm.coef.dt[, norm.coef := coef * sd]
# Append a row for ETLP_avg; the values are matched to columns by position.
glm.coef.dt <- rbind(
  glm.coef.dt,
  list(NA, "ETLP_avg", "lambda.min", NA, "etlp.no.cfa", NA, NA)
)

coaching.vis <- build_correlation_animint(
  coaching.agg.dt,
  glm.coef.dt,
  id.cols,
  cl.exclude.cols
)
coaching.vis$title <- "Correlation of Coaching Variables with GLM models"

animint2dir(coaching.vis, out.dir = "./animint_out/coaching", open.browser = FALSE)
```