Merge pull request #825 from JingxuanChen7/JingxuanChen7/wilms14_cnv

Exploratory CNV notebook for Wilms tumor annotation (SCPCP000014)
AlexsLemonade · Oct 18, 2024 · ce4d8e3 · ce4d8e3
2 parents bc12b9f + 7b019fd
commit ce4d8e3
Show file tree

Hide file tree

Showing 11 changed files with 2,539 additions and 2 deletions.
diff --git a/analyses/cell-type-wilms-tumor-14/README.md b/analyses/cell-type-wilms-tumor-14/README.md
@@ -31,7 +31,7 @@ This would include:
 
 ## Usage
 
-* Run Rscripts with command line
+* Run main pipeline with command line
 ```bash
 cd /path/to/OpenScPCA-analysis
 cd analyses/cell-type-wilms-tumor-14
@@ -74,4 +74,7 @@ sudo apt install -y libglpk40 \
 
 ## Computational resources
 
-Analysis could be executed on a virtual computer ([Standard-4XL](https://openscpca.readthedocs.io/en/latest/aws/lsfr/creating-vcs/)) via AWS Lightsail for Research.
+Analysis could be executed on a virtual computer ([Standard-4XL](https://openscpca.readthedocs.io/en/latest/aws/lsfr/creating-vcs/)) via AWS Lightsail for Research.
+
+## Exploratory analysis
+In addition to the main pipeline, some exploratory analysis in R notebooks are added into the `./exploratory_analysis` folder, including CNV analysis. Check `./exploratory_analysis/README.md` for more details.
diff --git a/analyses/cell-type-wilms-tumor-14/components/dependencies.R b/analyses/cell-type-wilms-tumor-14/components/dependencies.R
@@ -1,2 +1,3 @@
 # R dependencies not captured by `renv`
 # library("missing_package")
+library(rtracklayer)
diff --git a/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_copykat_SCPCL000850.Rmd b/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_copykat_SCPCL000850.Rmd
@@ -0,0 +1,124 @@
+---
+title: "Exploratory analysis using Copykat"
+author: "Jingxuan Chen"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+## Introduction
+
+Here we test `CopyKat` to analyze the CNV profile for Wilms tumor sample (SCPCL000850), aiming to identify potential tumor cells from normal cells.
+
+Before running this notebook, one should first generate `CopyKat` outputs as in `03_runCopyKat.R`.
+
+## Setup
+
+### Packages
+
+```{r packages}
+suppressPackageStartupMessages({
+  library(dplyr)
+  library(Seurat)
+  library(copykat)
+  library(ggplot2)
+})
+```
+
+### Paths
+
+
+#### Base directories
+
+```{r base paths}
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+path_repo <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The path to this module
+path_anal <- file.path(path_repo,"analyses","cell-type-wilms-tumor-14") 
+```
+
+#### Input and output files
+
+Only run for one sample
+
+```{r paths}
+library_id <- "SCPCL000850"
+scratch_out_dir <- file.path(path_anal, "scratch", "03_cnv","copykat")
+dir.create(scratch_out_dir, showWarnings = FALSE, recursive = TRUE)
+library_out_dir <- file.path(scratch_out_dir, library_id)
+dir.create(library_out_dir, showWarnings = FALSE, recursive = TRUE)
+
+## Input files
+# load pre-processed sample objs & anchor transfer results
+obj <- SeuratObject::LoadSeuratRds( file.path(path_anal,"scratch","00_preprocessing_rds",paste0(library_id,".rdsSeurat")) )
+level <- "compartment"
+predictions <- read.csv( file.path(path_anal, "results", "01_anchor_transfer_seurat", paste0(library_id, "_", level,".csv")) ) 
+obj <- AddMetaData(object = obj, metadata = predictions)
+copykat_result <- readr::read_rds( file = file.path(library_out_dir, paste0(library_id, "_copykat_resultobj.rds")) )
+copykat_result_noref <- readr::read_rds( file = file.path(library_out_dir, paste0(library_id, "_noref_copykat_resultobj.rds")) )
+
+```
+
+```{r images}
+heatmap_wref <- file.path(library_out_dir, paste0(library_id, "_copykat_heatmap.jpeg"))
+heatmap_noref <- file.path(library_out_dir, paste0(library_id, "_noref_copykat_heatmap.jpeg"))
+```
+
+## Analysis content
+
+#### Run CopyKat with normal cells
+
+Here, "immune" cells annotated with anchor transfer were used as reference normal cells. Here is the heatmap generated by the software:
+
+```{r, echo=FALSE, out.width = '100%'}
+knitr::include_graphics(heatmap_wref)
+```
+
+Frequency of predicted `aneuploid`, `diploid`, and `not.defined` categories.
+
+```{r}
+copykat_df <- copykat_result$prediction %>%
+  select(copykat.pred) %>% 
+  rename(c("ref.copykat" = "copykat.pred"))
+table(copykat_df)
+```
+
+Plot CopyKat identifications onto UMAP.
+
+```{r}
+obj <- AddMetaData(object = obj, metadata = copykat_df)
+DimPlot(obj, group.by = "ref.copykat", alpha = 0.3)
+```
+
+#### Run CopyKat without normal cells
+
+Here, no normal cells were provided to CopyKat.
+
+```{r, echo=FALSE, out.width = '100%'}
+knitr::include_graphics(heatmap_noref)
+```
+
+Frequency of predicted `aneuploid`, `diploid`, and `not.defined` categories.
+
+```{r}
+copykat_noref_df <- copykat_result_noref$prediction %>%
+  select(copykat.pred) %>% 
+  rename(c("noref.copykat" = "copykat.pred"))
+table(copykat_noref_df)
+```
+
+Plot CopyKat identifications onto UMAP.
+
+```{r}
+obj <- AddMetaData(object = obj, metadata = copykat_noref_df)
+DimPlot(obj, group.by = "noref.copykat", alpha = 0.3)
+```
+
+In this Wilms tumor sample (SCPCL000850), it's likely that CopyKat is not working well: (i) The result with or without specifying normal cells are not consistent; (ii) From the heatmap, no much differences in CNV profile could be observed.
+
+## Session Info
+
+```{r session info}
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+```
diff --git a/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_copykat_SCPCL000850.html b/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_copykat_SCPCL000850.html
diff --git a/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_infercnv_SCPCL000850.Rmd b/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_infercnv_SCPCL000850.Rmd
@@ -0,0 +1,171 @@
+---
+title: "Exploratory CNV analysis using inferCNV"
+author: "Jingxuan Chen"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+## Introduction
+
+Here we test `inferCNV` to analyze the CNV profile for Wilms tumor sample (SCPCL000850), aiming to identify potential tumor cells from normal cells.
+
+Before running this notebook, one should first generate `inferCNV` outputs as in `03_runInferCNV.R`.
+
+## Setup
+
+### Packages
+
+```{r packages}
+suppressPackageStartupMessages({
+  library(dplyr)
+  library(Seurat)
+  library(infercnv)
+  library(ggplot2)
+})
+```
+
+### Paths
+
+#### Base directories
+
+```{r base paths}
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+path_repo <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The path to this module
+path_anal <- file.path(path_repo,"analyses","cell-type-wilms-tumor-14") 
+```
+
+#### Input and output files
+
+Only run for one sample
+
+```{r paths}
+library_id <- "SCPCL000850"
+scratch_out_dir <- file.path(path_anal, "scratch", "03_cnv","infercnv")
+dir.create(scratch_out_dir, showWarnings = FALSE, recursive = TRUE)
+library_out_dir <- file.path(scratch_out_dir, library_id)
+dir.create(library_out_dir, showWarnings = FALSE, recursive = TRUE)
+
+## Input files
+# load pre-processed sample objs & anchor transfer results
+obj <- SeuratObject::LoadSeuratRds( file.path(path_anal,"scratch","00_preprocessing_rds",paste0(library_id,".rdsSeurat")) )
+level <- "compartment"
+predictions <- read.csv( file.path(path_anal, "results", "01_anchor_transfer_seurat", paste0(library_id, "_", level,".csv")) ) 
+obj <- AddMetaData(object = obj, metadata = predictions)
+infercnv_result <- readr::read_rds( file = file.path(library_out_dir, "run.final.infercnv_obj") )
+# add infercnv output to seurat object
+obj_cnv <- infercnv::add_to_seurat(
+  seurat_obj = obj,
+  infercnv_output_path = library_out_dir
+)
+```
+
+```{r images}
+heatmap_wref <- file.path(library_out_dir, "infercnv.png")
+normal_level <- "immune"
+```
+
+## Analysis content
+
+Here, "immune" cells annotated with anchor transfer were used as reference normal cells. Here is the heatmap generated by the software:
+
+```{r, echo=FALSE, out.width = '100%'}
+knitr::include_graphics(heatmap_wref)
+```
+
+We could observe some different CNV profiles between "fetal_nephron" and "stroma", especially chr1 and chr11. However, it's hard to observe any different CNV profiles within the "fetal_nephron" subgroups. 
+
+`inferCNV` outputs results for each Chromosome. Thus, I applied several ways to summarize the overall CNV profile.
+
+#### CNV score - proportion of genes with CNV
+
+Based on [this](https://www.biostars.org/p/9573777/) Biostars discussion, I calculated a `cnv_score`, which basically indicates the proportion of CNVs across all genes.
+
+```{r}
+## summary strategy 1 https://www.biostars.org/p/9573777/
+scores <- apply(infercnv_result@expr.data, 2 ,function(x){ sum(x < 0.95 | x > 1.05)/length(x) }) %>% 
+  as.data.frame() %>%
+  mutate(pred = obj$predicted.id) %>%
+  mutate(cnv_score = ifelse(
+    pred == normal_level,
+    NA,
+    .
+  ))
+```
+
+Ideally, we should observe bimodal distribution for this score, indicating the CNV difference between normal and tumor cells. However, here we could get a distribution close to normal.
+```{r}
+hist(scores$cnv_score, breaks = 100)
+```
+
+By plotting the CNV score onto UMAP, no clear pattern is shown.
+```{r}
+obj <- AddMetaData(object = obj, metadata = scores)
+FeaturePlot(obj, features = "cnv_score", alpha = 0.3) +
+  scale_color_viridis_c()
+```
+
+#### Summary based upon number of chromosomes that have CNV
+This strategy to summary CNV results is based on [`cell-type-ewings`](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-ewings/template_notebooks/cnv-workflow) module.
+
+```{r}
+## summary strategy 2 ewings analysis, based on number of chr that have cnv
+cnv_df <- obj_cnv@meta.data %>%
+  select(matches("predicted.id") | starts_with("has_cnv_chr") & !matches("has_cnv_chrMT")) %>%
+  mutate(count_cnv_chr = ifelse(predicted.id == normal_level,
+                            NA,
+                            rowSums(across(starts_with("has_cnv_chr")))
+                            ) )
+```
+
+```{r}
+hist(cnv_df$count_cnv_chr)
+```
+
+```{r}
+obj <- AddMetaData(object = obj, metadata = cnv_df)
+FeaturePlot(obj, features = "count_cnv_chr", alpha = 0.3) +
+  scale_color_viridis_c()
+```
+
+#### Summary based upon weighted means of CNV proportion
+This strategy to summary CNV results is based on [`cell-type-ewings`](https://github.com/AlexsLemonade/OpenScPCA-analysis/tree/main/analyses/cell-type-ewings/template_notebooks/cnv-workflow) module.
+
+```{r}
+## summary strategy 3 ewings analysis, based on number of chr that have cn
+chr_weights <- infercnv_result@gene_order |> 
+  as.data.frame() |> 
+  dplyr::count(chr) |> 
+  # only keep chr 1-22, drops MT and GL chr
+  dplyr::filter(chr %in% glue::glue("chr{seq(1,22)}")) |> 
+  dplyr::pull(n)
+
+cnv_df <- obj_cnv@meta.data %>%
+  select(matches("predicted.id") | starts_with("proportion_scaled_cnv_chr") & !ends_with("chrMT")) %>%
+  rowwise() %>%
+  mutate(weight_mean = ifelse(
+    predicted.id == normal_level,
+    NA,
+    weighted.mean(across(starts_with("proportion_scaled_cnv_chr")), chr_weights/sum(chr_weights))
+  ) )
+```
+
+```{r}
+hist(cnv_df$weight_mean)
+```
+
+```{r}
+obj <- AddMetaData(object = obj, metadata = cnv_df)
+FeaturePlot(obj, features = "weight_mean", alpha = 0.3) +
+  scale_color_viridis_c()
+```
+
+In conclusion, the heatmap generated by `inferCNV` indicates different CNV profiles between `fetal_nephron` and `stroma`, especially in Chr1. However, none of the three summary strategies seem working. 
+
+## Session Info
+
+```{r session info}
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+```
diff --git a/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_infercnv_SCPCL000850.html b/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_infercnv_SCPCL000850.html
diff --git a/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_runCopyKat.R b/analyses/cell-type-wilms-tumor-14/exploratory_analysis/03_cnv/03_runCopyKat.R
@@ -0,0 +1,63 @@
+library(dplyr)
+library(Seurat)
+library(copykat)
+library(ggplot2)
+
+path_repo <- rprojroot::find_root(rprojroot::is_git_root)
+path_anal <- file.path(path_repo,"analyses","cell-type-wilms-tumor-14") 
+
+library <- "SCPCL000850"
+
+# create output dirs
+scratch_out_dir <- file.path(path_anal, "scratch", "03_cnv","copykat")
+dir.create(scratch_out_dir, showWarnings = FALSE, recursive = TRUE)
+library_out_dir <- file.path(scratch_out_dir, library)
+dir.create(library_out_dir, showWarnings = FALSE, recursive = TRUE)
+
+# results_out_dir <- file.path(path_anal, "results", "03_cnv")
+# dir.create(results_out_dir, showWarnings = FALSE, recursive = TRUE)
+# plots_out_dir <- file.path(path_anal, "plots", "03_cnv")
+# dir.create(results_out_dir, showWarnings = FALSE, recursive = TRUE)
+
+
+# load pre-processed sample objs & anchor transfer results
+obj <- SeuratObject::LoadSeuratRds( file.path(path_anal,"scratch","00_preprocessing_rds",paste0(library,".rdsSeurat")) )
+level <- "compartment"
+predictions <- read.csv( file.path(path_anal, "results", "01_anchor_transfer_seurat", paste0(library, "_", level,".csv")) ) 
+obj <- AddMetaData(object = obj, metadata = predictions)
+
+# prepare copykat input matrix & normal cell list
+count_mat <- SeuratObject::GetAssayData(obj, assay = "RNA", layer = "count")
+normal_cells <- predictions %>%
+  tibble::column_to_rownames(var = "X") %>%
+  filter(predicted.id == "immune")
+
+# save copykat intermediate results by setting directory
+setwd(library_out_dir)
+
+# run copykat with reference normal cells
+copykat_result <- copykat(
+  rawmat = count_mat,
+  id.type = "E",
+  sam.name = library,
+  norm.cell.names = rownames(normal_cells),
+  ngene.chr = 2,
+  plot.genes = FALSE,
+  output.seg = FALSE,
+  n.cores = 8
+)
+readr::write_rds(copykat_result, file = file.path(library_out_dir, paste0(library, "_copykat_resultobj.rds")))
+
+# run copykat without normal cells
+copykat_result_noref <- copykat(
+  rawmat = count_mat,
+  id.type = "E",
+  sam.name = paste0(library,"_noref"),
+  norm.cell.names = NULL,
+  ngene.chr = 2,
+  plot.genes = FALSE,
+  output.seg = FALSE,
+  n.cores = 8
+)
+readr::write_rds(copykat_result_noref, file = file.path(library_out_dir, paste0(library, "_noref_copykat_resultobj.rds")))
+