Merge branch 'quarto_roy'

NBISweden · Feb 5, 2024 · 94399b6 · 94399b6
2 parents 6293368 + 77b0d9f
commit 94399b6
Show file tree

Hide file tree

Showing 155 changed files with 2,811 additions and 4,780 deletions.
diff --git a/compiled/labs/bioc/bioc_01_qc.qmd b/compiled/labs/bioc/bioc_01_qc.qmd
@@ -1,5 +1,7 @@
 ---
 description: Quality control of single cell RNA-Seq data. Inspection of
+  QC metrics including number of UMIs, number of genes expressed,
+  mitochondrial and ribosomal expression, sex and cell cycle state.
 subtitle:  Bioconductor Toolkit
 title:  Quality Control
 ---
@@ -20,6 +22,10 @@ have been subsampled to 1500 cells per sample. We can start by defining
 our paths.
 
 ``` {r}
+# download pre-computed annotation
+fetch_annotation <- TRUE
+
+# url for source and intermediate data
 path_data <- "https://export.uppmax.uu.se/naiss2023-23-3/workshops/workshop-scrnaseq"
 
 path_covid <- "./data/covid"
@@ -103,10 +109,10 @@ overlapping barcodes between the datasets. After that we add a column
 **type** in the metadata to define covid and ctrl samples.
 
 ``` {r}
-sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.17, ctrl.5, ctrl.13, ctrl.14)))
+sce <- SingleCellExperiment(assays = list(counts = cbind(cov.1, cov.15, cov.16, cov.17, ctrl.5, ctrl.13, ctrl.14,ctrl.19)))
 dim(sce)
 # Adding metadata
-sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14"), function(x) rep(x, ncol(get(x)))))
+sce@colData$sample <- unlist(sapply(c("cov.1", "cov.15", "cov.16", "cov.17", "ctrl.5", "ctrl.13", "ctrl.14","ctrl.19"), function(x) rep(x, ncol(get(x)))))
 sce@colData$type <- ifelse(grepl("cov", sce@colData$sample), "Covid", "Control")
 ```
 
@@ -116,7 +122,7 @@ good idea to remove them and run garbage collect to free up some memory.
 
 ``` {r}
 # remove all objects that will not be used.
-rm(cov.15, cov.1, cov.17, ctrl.5, ctrl.13, ctrl.14)
+rm(cov.15, cov.1, cov.17, cov.16, ctrl.5, ctrl.13, ctrl.14, ctrl.19)
 # run garbage collect to free up memory
 gc()
 ```
@@ -133,9 +139,9 @@ head(sce@colData, 10)
 Having the data in a suitable format, we can start calculating some
 quality metrics. We can for example calculate the percentage of
 mitochondrial and ribosomal genes per cell and add to the metadata. The
-proportion hemoglobin genes can give an indication of red blood cell
+proportion of hemoglobin genes can give an indication of red blood cell
 contamination. This will be helpful to visualize them across different
-metadata parameteres (i.e. datasetID and chemistry version). There are
+metadata parameters (i.e. datasetID and chemistry version). There are
 several ways of doing this. The QC metrics are finally added to the
 metadata table.
 
@@ -152,7 +158,7 @@ mito_genes <- rownames(sce)[grep("^MT-", rownames(sce))]
 # Ribosomal genes
 ribo_genes <- rownames(sce)[grep("^RP[SL]", rownames(sce))]
 # Hemoglobin genes - includes all genes starting with HB except HBP.
-hb_genes <- rownames(sce)[grep("^HB[^(P)]", rownames(sce))]
+hb_genes <- rownames(sce)[grep("^HB[^(P|E|S)]", rownames(sce))]
 ```
 
 First, let Scran calculate some general qc-stats for genes and cells
@@ -195,10 +201,10 @@ wrap_plots(
 ) + plot_layout(guides = "collect")
 ```
 
-As you can see, there is quite some difference in quality for the 4
-datasets, with for instance the covid_15 and covid_16 samples having
-fewer cells with many detected genes and more mitochondrial content. As
-the ribosomal proteins are highly expressed they will make up a larger
+As you can see, there is quite some difference in quality for these
+samples, with for instance the covid_15 and covid_16 samples having
+cells with fewer detected genes and more mitochondrial content. As the
+ribosomal proteins are highly expressed they will make up a larger
 proportion of the transcriptional landscape when fewer of the lowly
 expressed genes are detected. We can also plot the different QC-measures
 as scatter plots.
@@ -222,8 +228,8 @@ plotColData(sce, x = "total", y = "detected", colour_by = "sample")
 
 ### Detection-based filtering
 
-A standard approach is to filter cells with low amount of reads as well
-as genes that are present in at least a certain amount of cells. Here we
+A standard approach is to filter cells with low number of reads as well
+as genes that are present in at least a given number of cells. Here we
 will only consider cells with at least 200 detected genes and genes need
 to be expressed in at least 3 cells. Please note that those values are
 highly dependent on the library preparation method used.
@@ -275,7 +281,7 @@ the gene contribution, but the function is quite slow.
 # Compute the relative expression of each gene per cell
 # Use sparse matrix operations, if your dataset is large, doing matrix devisions the regular way will take a very long time.
 C <- counts(sce)
-C@x <- C@x / rep.int(colSums(C), diff(C@p))
+C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
 most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
 boxplot(as.matrix(t(C[most_expressed, ])), cex = .1, las = 1, xlab = "% total count per cell", col = scales::hue_pal()(20)[20:1], horizontal = TRUE)
 
@@ -294,7 +300,7 @@ important for quality control and downstream filtering.
 ### Mito/Ribo filtering
 
 We also have quite a lot of cells with high proportion of mitochondrial
-and low proportion of ribosomal reads. It could be wise to remove those
+and low proportion of ribosomal reads. It would be wise to remove those
 cells, if we have enough cells left after filtering. Another option
 would be to either remove all mitochondrial reads from the dataset and
 hope that the remaining genes still have enough biological signal. A
@@ -307,7 +313,7 @@ cells are below 20% mitochondrial reads and that will be used as a
 cutoff. We will also remove cells with less than 5% ribosomal reads.
 
 ``` {r}
-selected_mito <- sce.filt$subsets_mt_percent < 30
+selected_mito <- sce.filt$subsets_mt_percent < 20
 selected_ribo <- sce.filt$subsets_ribo_percent > 5
 
 # and subset the object to only keep those cells
@@ -324,7 +330,7 @@ content, according to their function.
 
 ### Plot filtered QC
 
-Lets plot the same QC-stats another time.
+Lets plot the same QC-stats once more.
 
 ``` {r}
 #| fig-height: 6
@@ -359,61 +365,77 @@ sce.filt <- sce.filt[!grepl("^MT-", rownames(sce.filt)), ]
 # Filter Ribossomal gene (optional if that is a problem on your data)
 # sce.filt <- sce.filt[ ! grepl("^RP[SL]", rownames(sce.filt)), ]
 
-# Filter Hemoglobin gene
-sce.filt <- sce.filt[!grepl("^HB[^(PES)]", rownames(sce.filt)), ]
+# Filter Hemoglobin gene  (optional if that is a problem on your data)
+#sce.filt <- sce.filt[!grepl("^HB[^(P|E|S)]", rownames(sce.filt)), ]
 
 dim(sce.filt)
 ```
 
 ## Sample sex
 
 When working with human or animal samples, you should ideally constrain
-you experiments to a single sex to avoid including sex bias in the
+your experiments to a single sex to avoid including sex bias in the
 conclusions. However this may not always be possible. By looking at
 reads from chromosomeY (males) and XIST (X-inactive specific transcript)
 expression (mainly female) it is quite easy to determine per sample
-which sex it is. It can also bee a good way to detect if there has been
-any sample mixups, if the sample metadata sex does not agree with the
-computational predictions.
+which sex it is. It can also be a good way to detect if there has been
+any mislabelling in which case, the sample metadata sex does not agree
+with the computational predictions.
 
 To get chromosome information for all genes, you should ideally parse
 the information from the gtf file that you used in the mapping pipeline
 as it has the exact same annotation version/gene naming. However, it may
 not always be available, as in this case where we have downloaded public
 data. R package biomaRt can be used to fetch annotation information. The
-code to run biomaRt is provided. As the biomart instances quite often
-are unresponsive, we will download and use a file that was created in
+code to run biomaRt is provided. As the biomart instances are quite
+often unresponsive, we will download and use a file that was created in
 advance.
 
-``` {r}
-#| eval: false
+<div>
 
-# this code chunk is not executed
-suppressMessages(library(biomaRt))
+> **Tip**
+>
+> Here is the code to download annotation data from Ensembl using
+> biomaRt. We will not run this now and instead use a pre-computed file
+> in the step below.
+>
+> ``` {r}
+> # fetch_annotation is defined at the top of this document
+> if (!fetch_annotation) {
+>   suppressMessages(library(biomaRt))
+>
+>   # initialize connection to mart, may take some time if the sites are unresponsive.
+>   mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
+>
+>   # fetch chromosome info plus some other annotations
+>   genes_table <- try(biomaRt::getBM(attributes = c(
+>     "ensembl_gene_id", "external_gene_name",
+>     "description", "gene_biotype", "chromosome_name", "start_position"
+>   ), mart = mart, useCache = F))
+>
+>   write.csv(genes_table, file = "data/covid/results/genes_table.csv")
+> }
+> ```
 
-# initialize connection to mart, may take some time if the sites are unresponsive.
-mart <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
+</div>
 
-# fetch chromosome info plus some other annotations
-genes_table <- try(biomaRt::getBM(attributes = c(
-    "ensembl_gene_id", "external_gene_name",
-    "description", "gene_biotype", "chromosome_name", "start_position"
-), mart = mart, useCache = F))
+Download precomputed data.
 
-write.csv(genes_table, file = "data/covid/results/genes_table.csv")
+``` {r}
+# fetch_annotation is defined at the top of this document
+if (fetch_annotation) {
+  genes_file <- file.path(path_results, "genes_table.csv")
+  if (!file.exists(genes_file)) download.file(file.path(path_data, "covid/results/genes_table.csv"), destfile = genes_file)
+}
 ```
 
 ``` {r}
-genes_file <- file.path(path_results, "genes_table.csv")
-
-if (!file.exists(genes_file)) download.file(file.path(path_data, "covid/results/genes_table.csv"), destfile = genes_file)
 genes.table <- read.csv(genes_file)
-
 genes.table <- genes.table[genes.table$external_gene_name %in% rownames(sce.filt), ]
 ```
 
-Now that we have the chromosome information, we can calculate per cell
-the proportion of reads that comes from chromosome Y.
+Now that we have the chromosome information, we can calculate the
+proportion of reads that comes from chromosome Y per cell.
 
 ``` {r}
 chrY.gene <- genes.table$external_gene_name[genes.table$chromosome_name == "Y"]
@@ -463,9 +485,9 @@ We here perform cell cycle scoring. To score a gene list, the algorithm
 calculates the difference of mean expression of the given list and the
 mean expression of reference genes. To build the reference, the function
 randomly chooses a bunch of genes matching the distribution of the
-expression of the given list. Cell cycle scoring adds three slots in
-data, a score for S phase, a score for G2M phase and the predicted cell
-cycle phase.
+expression of the given list. Cell cycle scoring adds three slots in the
+metadata, a score for S phase, a score for G2M phase and the predicted
+cell cycle phase.
 
 ``` {r}
 hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package = "scran"))
@@ -487,16 +509,17 @@ assignments <- cyclone(sce.filt[ensembl %in% cc.ensembl, ], hs.pairs, gene.names
 sce.filt$G1.score <- assignments$scores$G1
 sce.filt$G2M.score <- assignments$scores$G2M
 sce.filt$S.score <- assignments$scores$S
+sce.filt$phase <- assignments$phases
 ```
 
-We can now plot a violin plot for the cell cycle scores as well.
+We can now create a violin plot for the cell cycle scores as well.
 
 ``` {r}
 #| fig-height: 4
 #| fig-width: 14
 
 wrap_plots(
-    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "sample"),
+    plotColData(sce.filt, y = "G2M.score", x = "G1.score", colour_by = "phase"),
     plotColData(sce.filt, y = "G2M.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "G1.score", x = "sample", colour_by = "sample"),
     plotColData(sce.filt, y = "S.score", x = "sample", colour_by = "sample"),
@@ -508,6 +531,11 @@ Cyclone predicts most cells as G1, but also quite a lot of cells with
 high S-Phase scores. Compare to results with Seurat and Scanpy and see
 how different predictors will give clearly different results.
 
+Cyclone does an automatic prediction of cell cycle phase with a default
+cutoff of the scores at 0.5 As you can see this does not fit this data
+very well, so be cautious with using these predictions. Instead we
+suggest that you look at the scores.
+
 ## Predict doublets
 
 Doublets/Multiples of cells in the same well/droplet is a common issue
@@ -530,10 +558,10 @@ and should have a doublet rate at about 4%.
 > **Caution**
 >
 > Ideally doublet prediction should be run on each sample separately,
-> especially if your different samples have different proportions of
-> cell types. In this case, the data is subsampled so we have very few
-> cells per sample and all samples are sorted PBMCs so it is okay to run
-> them together.
+> especially if your samples have different proportions of cell types.
+> In this case, the data is subsampled so we have very few cells per
+> sample and all samples are sorted PBMCs, so it is okay to run them
+> together.
 
 </div>
 
@@ -583,8 +611,8 @@ dim(sce.filt)
 ## Save data
 
 Finally, lets save the QC-filtered data for further analysis. Create
-output directory `results` and save data to that folder. This will be
-used in downstream labs.
+output directory `data/covid/results` and save data to that folder. This
+will be used in downstream labs.
 
 ``` {r}
 saveRDS(sce.filt, file.path(path_results, "bioc_covid_qc.rds"))