cut_and_paste_demo.txt

# Logistics
# ===
# class:small-code
#
# - R allows methodology written by others to be imported.
#   - Leverage other code.
#   - Make your code available to others.  

# Load libraries
library(dplyr) # Dataframe manipulation
library(MAST) # For DE
library(Matrix) # Sparse matrices
library(vioplot) # Violin pots
library(Seurat) # Single cell Analysis
library(useful) # corner

# need to update or copy paste from website
source("src/dotplot.R")
source("src/makeNiceMAST.R")

load("/usr/local/R/data/small_bipolar.Robj")


# Representing Sparse Matrices
# ===
# class:small-code

bipolar.data <- Matrix(as.matrix(bipolar_dge), sparse=TRUE)
# Memory use as a normal matrix
object.size(bipolar_dge)

# Memory use as a sparse matrix
# ~22 X less
object.size(bipolar.data)
bipolar_dge <- NULL


# Create a Seurat Object
# ===
# class:small-code

# Expected raw counts (non-normalized data)
# Can give log transformed data but do not transform in setup method
bipolar.seurat.raw <- new("seurat", raw.data=bipolar.data)


# What is in a Seurat Object?
# ===
# class:small-code

# Display the internal pieces of the Seurat Object
slotNames(bipolar.seurat.raw)


# What is in a Seurat Object?
# ===
# class:small-code
# 
# So far, raw sparse matrix

head(bipolar.seurat.raw@raw.data)


# What is in a Seurat Object?
# ===
# class:small-code
#
# So far, raw sparse matrix

head(bipolar.seurat.raw@data)
head(bipolar.seurat.raw@ident)
head(bipolar.seurat.raw@var.genes)


# What is in a Seurat Object?
# ===
# class:small-code
#
# - __var.genes__ Variable genes across cells
# - __data.info__ Misc info including complexity (nGene)
# - __cell.names__ Column (cell) names

?seurat


# What are Our Genes?
# ===
# class:small-code

# Gene names (row names)
head(row.names(bipolar.seurat.raw@raw.data))
length(row.names(bipolar.seurat.raw@raw.data))


# What are Our Cells?
# ===
# class:small-code

# Column names # 24904 x 5000
# Sample / Cell names / Barcodes
head(colnames(bipolar.seurat.raw@raw.data),3)
length(colnames(bipolar.seurat.raw@raw.data))
dim(bipolar.seurat.raw@raw.data)


# How to Show Counts?
# ===
# class:small-code

# The full data will be too large to see
corner(as.matrix(bipolar.seurat.raw@raw.data))


# How Many Expressed Genes?
# ===
# class:small-code

# Plot genes per cell
# How many genes expressed per cells
complexity.per.cell <- apply(bipolar.seurat.raw@raw.data, 2, function(x) sum(x>0))

# Mean count per cell.
mean.count.per.cell <- apply(bipolar.seurat.raw@raw.data, 2, function(x) mean(x))

# Gene prevalence
gene.prevalence <- apply(bipolar.seurat.raw@raw.data, 1, function(x) sum(x>0))


# Store Technical Batches for Later
# ===
# class:small-code

# Get batches based on cell names
technical_batches <- sapply(colnames(bipolar.seurat.raw@raw.data),
                      FUN=function(x){strsplit(x,"_",fixed=TRUE)[[1]][1]})

# Turn to numbers and add cell names to them
technical_batches <- as.numeric(as.factor(technical_batches))
names(technical_batches) <- colnames(bipolar.seurat.raw@raw.data)


# How Many Expressed Genes?
# ===
# class:small-code

# Plot genes per cell
# How many genes expressed per cell
# Violin plot
vioplot(complexity.per.cell)

# Add points
stripchart(complexity.per.cell, add=TRUE, vertical=TRUE, method="jitter", jitter=0.3, pch=20, col="#00000033")

# Add lines for reference
abline(h=500, col="orange")
abline(h=2000, col="green")

# Add text
title("Study Complexity")
axis(side=1,at=1,labels=c("Study"))


# How Many Expressed Genes?
# ===
# class:small-code

# Plot cell complexity, sorted by complexity
plot(sort(complexity.per.cell))
# Add lines
abline(h=500, col="orange")
abline(h=2000, col="green")
title("Complexity from Less to More")

# How Many Expressed Genes By Batch?
# ===
# class:small-code

# Plot genes per cell
# Complexity per technical batch
plot(complexity.per.cell ~ jitter(technical_batches,2))

# Add line
abline(h=500, col="orange")
abline(h=2000, col="green")

# Add text
title("Study Complexity")


# Identifying Outliers?
# ===
# class:small-code
#
# - Cells that are unusually simple (or no counts)
# - Cells that are unusually complex (doublets?)
# - We will filter in a standard way, we are just describing the data.

# Complexity by mean expression
plot(complexity.per.cell, mean.count.per.cell)

# Add lines
abline(v=500, col="orange")
abline(v=2000, col="green")


# Identifying Noise?
# ===
# class:small-code
#
# - People tend to filter very conservatively.
# - Remember this is in log space.

# How often genes are seen in cells
hist(log2(gene.prevalence), breaks=100)

# add line
abline(v=log2(30), col="orange")
abline(v=log2(6), col="green")


# Filter Cells: Removing the Outlier Cells
# ===
# class:small-code
#
# - Genes must be in 6 cells.
# - Cells must have alteast 500 genes.
# - Scaled by 1000 (Total Sum Scaling)

# From 24904 x 3384
# min.cells=30 = 10215 vs 3384
# min.cells=6  = 13519 vs 3384
bipolar.seurat <- Setup(bipolar.seurat.raw,
                    min.cells=6, min.genes=500,
                    do.logNormalize=TRUE,
                    total.expr=1e4,
                    project="Tutorial")
bipolar.seurat.raw <- NULL


# Seurat: Updated Slots
# ===
# class:small-code

dim(bipolar.seurat@raw.data)


# Seurat: Updated Slots
# ===
# class:small-code

dim(bipolar.seurat@data)


# Seurat: Updated Slots
# ===
# class:small-code

head(bipolar.seurat@data.info)


# Seurat: Filtering on Metadata
# ===
# class:small-code
#
# - Filtering on mitochondrial reads in Seurat.

# Get gene names
mito.gene.names <- grep("^mt-", rownames(bipolar.seurat@data), value=TRUE)


# Seurat: Filtering on Metadata
# ===
# class:small-code
#
# - Filtering on mitochondrial reads in Seurat.

# Get TSS normalized mitochodrial counts
# Cell total expression
col.total.counts <- Matrix::colSums(expm1(bipolar.seurat@data))

# Scale each count by the cell total
mito.percent.counts <- Matrix::colSums(expm1(bipolar.seurat@data[mito.gene.names, ]))/col.total.counts


# Seurat: Filtering on Metadata
# ===
# class:small-code
#
# - Filtering on mitochondrial reads in Seurat.

# Add to seurat object as a metadata
bipolar.seurat <- AddMetaData(bipolar.seurat, mito.percent.counts, "percent.mitochodrial")

# Check
head(bipolar.seurat@data.info)


# Seurat: Filtering on Metadata
# ===
# class:small-code

VlnPlot(bipolar.seurat, "nGene")


# Seurat: Filtering on Metadata
# ===
# class:small-code

VlnPlot(bipolar.seurat, "nUMI")


# Seurat: Filtering on Metadata
# ===
# class:small-code

VlnPlot(bipolar.seurat, "percent.mitochodrial")


# Seurat: Filtering on Metadata
# ===
# class:small-code

VlnPlot(bipolar.seurat, c("nGene", "nUMI", "percent.mitochodrial"), nCol=3)

# Seurat: Filtering on Metadata
# ===
# class:small-code
#
# - Outlier percent mitochondria are very low expression.
# - Very high expression have low percent mitochondrial reads.

GenePlot(bipolar.seurat, "nUMI", "percent.mitochodrial")


# Seurat: Filtering on Metadata
# ===
# class:small-code

# Filter libraries with more than 10% mitochondrially derived transcripts
dim(bipolar.seurat@data)
# Filter max complexity
bipolar.seurat <- SubsetData(bipolar.seurat, subset.name="nGene", accept.high=2000)

# Filter percent percent mitochondrial reads
bipolar.seurat <- SubsetData(bipolar.seurat, subset.name="percent.mitochodrial", accept.high=0.1)
dim(bipolar.seurat@data)

# Seurat: Filtering on Metadata
# ===
# class:small-code

dim(bipolar.seurat@data)
table(bipolar.seurat@data.info[["orig.ident"]])

object.size(bipolar.seurat)


# Saving as an R Object
# ===
# class:small-code
#
# Saving the Seurat object
# - Contains all manipulation so far.
# - Can be loaded or shared.
# - Does not contain environment.

# save(bipolar.seurat, file="data/bipolar_seurat_demo.Robj")
# load("/usr/local/R/data/bipolar_seurat_demo.Robj")


# Saving as Text Files
# ===
# class:small-code
# 
# You may need to export data to import into other applications.

# Log-scale expression matrix
# (1.2 GB)
# write.table(as.matrix(bipolar.seurat@data), file="bipolar_data.txt")

# Study metadata (1.2 MB)
# write.table(bipolar.seurat@data.info, file="bipolar_metadata.txt")


# Seurat: Viewing Specific Genes
# ===
# class:small-code
# 
# - Notice many zeros.

VlnPlot(bipolar.seurat, "Prkca")


# Seurat: Viewing Specific Genes
# ===
# class:small-code

VlnPlot(bipolar.seurat, "Prkca", group.by="orig.ident")


# Seurat: Plotting Genes vs Genes
# ===
# class:small-code

# Plot a gene vs a gene
GenePlot(bipolar.seurat,"Isl1","Grm6",cex.use=1)


# Seurat: Batch Affect Correction
# ===
# class:small-code

# Create batch effect (5 and 6 as a group)
batch.effect <- technical_batches[row.names(bipolar.seurat@data.info)]
batch.effect[batch.effect %in% c(1,2,3,4)] <- 1
batch.effect[batch.effect %in% c(5,6)] <- 2

# Add batch effect
bipolar.seurat <- AddMetaData(bipolar.seurat, batch.effect, "batch.effect")

# Regress on metadata
bipolar.seurat <- RegressOut(bipolar.seurat, latent.vars="batch.effect")
# save(bipolar.seurat, file="data/bipolar_regress_demo.Robj")
# load("/usr/local/R/data/bipolar_regress_demo.Robj")

# Seurat: Select Variable Genes
# ===
# class:small-code
#
# - We are going to focus on highly variable genes.
#   - Average expression (X) and dispersion (SD)(Y).
#   - Bins genes (20).
#   - Z-score for dispersion.
# - fxn.x and fxn.y allow one to change the measurements.
#   - Cut.offs are high and low, X and Y.
# Update
bipolar.seurat <- MeanVarPlot(bipolar.seurat,
                              fxn.x=expMean,
                              fxn.y=logVarDivMean,
                              do.contour=FALSE,
                              do.plot=FALSE)
length(bipolar.seurat@var.genes)


# Seurat: Performing PCA
# ===
# class:small-code
#
# - Calculating PCA with the highly variable genes.

bipolar.seurat <- PCA(bipolar.seurat,
                      pc.genes=bipolar.seurat@var.genes,
                      do.print=FALSE)
# save(bipolar.seurat, file="data/bipolar_demo_seurat_pca.Robj")
# load("/usr/local/R/data/bipolar_demo_seurat_pca.Robj")

# Seurat: Performing PCA
# ===
# class:small-code
# Calculate PCA projection
bipolar.seurat <- ProjectPCA(bipolar.seurat)


# Seurat: Performing PCA
# ===
# class:small-code

# Can plot top genes for top components
PrintPCA(bipolar.seurat,
         pcs.print=1:2,
         genes.print=5,
         use.full=TRUE)


# Seurat: PCA Visualizations
# ===
# class:small-code
#
# - Top 30 genes associated with the first two components.

VizPCA(bipolar.seurat, pcs.use=1:2)


# Seurat: PCA Visualizations
# ===
# class:small-code

PCAPlot(bipolar.seurat, 1, 2)


# Seurat: PCA Visualizations
# ===
# class:small-code
# 
# - Plot top 30 genes in top 100 cells for PC1.

PCHeatmap(bipolar.seurat, pc.use=1, cells.use=100, do.balanced=TRUE)


# Seurat: Choosing Components
# ===
# class:small-code
# 
# Time Intensive
# Jackstraw
# # bipolar.seurat <- JackStraw(bipolar.seurat, num.replicate = 100, do.print = FALSE)


# Seurat: Choosing Components
# ===
# class:small-code
#
# How do we choose how many components to use?
#  - When there is diminishing returns to include it.
#  - Selection is performed more liberally in our setting.
# Time Efficient but more ad hoc
# Scree (elbow) plot
# 37 selected

PCElbowPlot(bipolar.seurat, num.pc=40)


# Seurat: Run t-SNE
# ===
# class:small-code
# 
# - Calculate and plot t-SNE on PC 1- 37.
# - Can use Barnes-hut implementation.
#
# Calculate t-SNE Ordination
bipolar.seurat <- RunTSNE(bipolar.seurat,
                          dims.use=1:37,
                          do.fast=TRUE)


# Seurat: Plot t-SNE
# ===
# class:small-code

# Plot
TSNEPlot(bipolar.seurat)


# Seurat: Store Clusters
# ===
# class:small-code
#
# - Determine subclusters for the plot.
# - Separate graph based approach.
#    - is not aware of the t-SNE projection.
# - Using PC 1-37
# 30 min - 11 clusters
# 6 min - 12 cluster
bipolar.seurat <- FindClusters(bipolar.seurat,
                            pc.use=1:37,
                            print.output=0,
                            save.SNN=TRUE)
# save(bipolar.seurat, file="data/bipolar_cluster_seurat_demo.Robj")
# load("/usr/local/R/data/bipolar_cluster_seurat_demo.Robj")

# Seurat: Plotting Genes Through Clusters
# ===
# class:small-code
# 
# Now that we have subclusters of cell populations plotting genes through subclusters is identical to before.
# - Seurat stores and groups by subclusters automatically.

VlnPlot(bipolar.seurat, "Prkca")


# Seurat: Plotting Genes Through Clusters
# ===
# class:small-code

VlnPlot(bipolar.seurat, "Glul")


# Seurat: Viewing de novo Groupings
# ===
# class:small-code

TSNEPlot(bipolar.seurat)


# Seurat: Viewing de novo Groupings
# ===
# class:small-code

TSNEPlot(bipolar.seurat, do.label=TRUE)


# Seurat: Plotting Genes on Clusters
# ===
# class:small-code
#
# You can also gene expression through out the cell ordination.
# - Marker genes can help identify cell groups.
#   - Rod Bipolar Cells (Prkca+ Scgn-)
#   - Muller Glia (Glul+ Prkca- Scgn-)
#   - Cone Bipolar Cells (Prkca- Scgn+)
#   - On Cone Bipolar Cells (Prkca- Scgn+ Grm6+)
# - Metadata can help visualize batch effects.

FeaturePlot(bipolar.seurat,
            c("Prkca","Glul", "Scgn", "Grm6"),
            cols.use = c("grey","blue"))


# QC the Clusters!
# ===
# class:small-code

FeaturePlot(bipolar.seurat,
            "nGene",
            cols.use = c("grey","blue"))


# QC the Clusters!
# ===
# class:small-code

FeaturePlot(bipolar.seurat,
            "percent.mitochodrial",
            cols.use = c("grey","blue"))


# QC the Clusters!
# ===
# class:small-code
# 
# Check for your batch effect.
# - We are going to make a fake batch effect (site) and plot this as an example of how one can visualize unwanted signal.

# Making Fake Data
fake.sites <- as.integer(bipolar.seurat@ident %in% c(0,1))
names(fake.sites) <- names(bipolar.seurat@ident)

# Add metadata
bipolar.seurat <- AddMetaData(bipolar.seurat, fake.sites, "site")

# Plot feature
FeaturePlot(bipolar.seurat, c("site"), cols.use = c("green","orange"))


# Seurat: Getting your labels
# ===
# class:small-code

cell.labels <- bipolar.seurat@ident
head(cell.labels)


# Seurat: Differential Expression
# ===
# class:small-code
#
# - Default if one cluster again many tests.
#   - Can specify an ident.2 test between clusters.
# - Adding speed by exluding tests.
#   - Min.pct - controls for sparsity
#     - Min percentage in a group
#   - Thresh.test - must have this difference in averages.

cluster.markers <- FindMarkers(bipolar.seurat, ident.1=2, min.pct = 0.25)
head(cluster.markers, 30)


# Seurat: Differential Expression
# ===
# class:small-code
# - Find all cluster against all others.

# bipolar.markers <- FindAllMarkers(bipolar.seurat, only.pos=TRUE, min.pct=0.25, thresh.use=0.25)
# bipolar.markers %>% group_by(cluster) %>% top_n(2, avg_diff)


# Seurat: Plotting DE Genes
# ===
# class:small-code

plot.genes <- head(rownames(cluster.markers),30)
DoHeatmap(bipolar.seurat,
          genes.use=plot.genes,
          order.by.ident=TRUE,
          slim.col.label=TRUE,
          remove.key=TRUE)


# Dot Plots: Plotting Genes Through Clusters
# ===
# class:small-code

dot.plot(bipolar.seurat,
         features.use=plot.genes)


# Dot Plots: Replicating Bipolar Results
# ===
# class:small-code

# Genes in tutorial
plot.genes = c("Vsx2","Otx2","Scgn","Isl1","Grm6",
               "Apoe","Pax6","Rho","Arr3","Tacr3",
               "Syt2","Neto1","Irx6","Prkar2b",
               "Grik1","Kcng4","Cabp5","Vsx1","Prkca")

# Cell groups shown
only.use.groups = c(7,9,10,8,4,3,6,5,1,0)

# Plot
dot.plot(bipolar.seurat,
         features.use=plot.genes,
         subset.group=only.use.groups)


# MAST: Creating a Test Comparison
# ===
# class:small-code

# Make a covariate to inform the test
test.2 <- as.character(bipolar.seurat@ident)
test.2[test.2 != "2"] <- "wild"
test.2[test.2 == "2"] <- "test"
names(test.2) <- names(bipolar.seurat@ident)
test.2 <- factor(test.2, levels=c("wild","test"))


# MAST: Prepping the Data
# ===
# class:small-code

# Load into Mast Object
# Mast object
# Data frame of cell-level covariates
# Data frame of feature-level covariates
bipolar.mast <- FromMatrix(as.matrix(bipolar.seurat@data),
                  cbind(bipolar.seurat@data.info,test.2),
                  data.frame(primerid=rownames(bipolar.seurat@data)))


# MAST
# ===
# class:small-code

# Run test
# 1 hr
# zlm.test.2 <- zlm.SingleCellAssay(~ test.2, bipolar.mast)
# save(zlm.test.2, file="data/mast_zlm_demo.Robj")
load("/usr/local/R/data/mast_zlm_demo.Robj")

df.test.2 <- data.frame(summary(zlm.test.2, doLRT="test.2test"))
summary.mast <- makeNice(df.test.2, val="test.2test")


# MAST
# ===
# class:small-code

summary.mast[abs(summary.mast[,5])>=1 & !is.na(summary.mast[,5]),]


# MAST
# ===
# class:small-code

plot.genes = c("Acsl3","Apoe","Clu","Glul","Rlbp1","Sparc","Dkk3",
               "Pcp2","Gng13","Calm1","Gnao1","Atp2b1","Cplx3","Snap25")
dot.plot(bipolar.seurat,
         features.use=plot.genes)