AlexsLemonade · sjspielman · Feb 6, 2025 · Feb 5, 2025 · Feb 5, 2025 · Feb 5, 2025
@@ -32,6 +32,14 @@ RUN_EXPLORATORY=${RUN_EXPLORATORY:-0}
 THREADS=${THREADS:-32}
 project_id="SCPCP000006"
 
+# Define the `predicted_celltype_threshold` which is used to identify cells with a high confident label transfer from the fetal kidney reference.
+# This score is used in the script `06_infercnv.R` and in the notebook `07_combined_annotation_across_samples.Rmd`
+predicted_celltype_threshold=0.75
+
+# Define the `cnv_threshold_low/high` which are used to identify cells with no CNV (`cnv_score` <= `cnv_threshold_low`) or with CNV (`cnv_score` > `cnv_threshold_high`)
+cnv_threshold_low=0
+cnv_threshold_high=2
+
 # Ensure script is being run from its directory
 module_dir=$(dirname "${BASH_SOURCE[0]}")
 cd ${module_dir}
@@ -147,12 +155,13 @@ for sample_dir in ${data_dir}/${project_id}/SCPCS*; do
     fi
 
     # Run inferCNV
-    Rscript scripts/06_infercnv.R --sample_id $sample_id --reference $reference --HMM i3 ${test_string}
+    Rscript scripts/06_infercnv.R --sample_id $sample_id --reference $reference --HMM i3 ${test_string} --score_threshold ${predicted_celltype_threshold}
 done
 
+
 # Render notebook to make draft annotations
 Rscript -e "rmarkdown::render('${notebook_dir}/07_combined_annotation_across_samples_exploration.Rmd',
-                        params = list(predicted.celltype.threshold = 0.85, cnv_threshold = 0, testing = ${TESTING}),
+                        params = list(predicted.celltype.threshold = ${predicted_celltype_threshold}, cnv_threshold = 0, testing = ${TESTING}),
                         output_format = 'html_document',
                         output_file = '07_combined_annotation_across_samples_exploration.html',
                         output_dir = '${notebook_dir}')"
@@ -30,7 +30,7 @@ option_list <- list(
   make_option(
     opt_str = c("-s", "--sample_id"),
     type = "character",
-    default = "SCPCS000179",
+    default = "SCPCS000194",
     help = "The sample_id of the sample to be used for inference of genomic copy number using infercnv "
   ),
   make_option(
@@ -179,7 +179,7 @@ if (opts$HMM == "no") {
 dir.create(output_dir, recursive = TRUE)
 
 # retrieve the gene order file created in `06a_build-geneposition.R`
-gene_order_file <- file.path(module_base, "results", "references", "gencode_v19_gene_pos.txt")
+gene_arm_order_file <- file.path(module_base, "results","references", 'gencode_v38_gene_pos_arm.txt')
 
 # Run infercnv ------------------------------------------------------------------
 # create inferCNV object and run method
@@ -191,7 +191,7 @@ infercnv_obj <- infercnv::CreateInfercnvObject(
   raw_counts_matrix = as.matrix(counts),
   annotations_file = annot_df,
   ref_group_names = normal_cells,
-  gene_order_file = gene_order_file,
+  gene_order_file = gene_arm_order_file,
   # ensure all cells are included
   min_max_counts_per_cell = c(-Inf, +Inf)
 )

@@ -1,25 +1,99 @@
-# This script downloads and adapt the genome position file for use with infercnv
+# load library
+library(tidyverse)
+
+# Setup ------------------
 
 # The base path for the OpenScPCA repository, found by its (hidden) .git directory
 repository_base <- rprojroot::find_root(rprojroot::is_git_root)
 # The path to this module
 module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
+# The path to the reference directory
+reference_dir <- file.path(module_base, "results", "references")
+
+# URLs for data to download
+gtf_file_uri <- "s3://scpca-references/homo_sapiens/ensembl-104/annotation/Homo_sapiens.GRCh38.104.gtf.gz"
+arm_order_file_url <- "http://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz"
 
-# load library
-library(tidyverse)
 # Define the gene order file
-  gene_order_file <- file.path(module_base, "results","references", 'gencode_v19_gene_pos.txt')
-
-# Download the file if ot existing
-if (!file.exists(gene_order_file)) {
-  download.file(url = 'https://data.broadinstitute.org/Trinity/CTAT/cnv/gencode_v19_gen_pos.complete.txt',
-                destfile = gene_order_file)
-  gene_order <- read_tsv(gene_order_file, col_names =  FALSE)
-  tmp <- gsub("\\..*","",gene_order$X1) 
-  tmp <- gsub(".*\\|","",tmp) 
-  gene_order$X1 <- tmp
-  gene_order <- gene_order[grepl("ENSG", x = gene_order$X1),]
-
-  write_tsv(col_names = FALSE, gene_order, gene_order_file, append = FALSE)
+gtf_file <- file.path(reference_dir, "Homo_sapiens.GRCh38.104.gtf.gz")
+
+# Define the arm order file
+arm_order_file <- file.path(reference_dir, "hg38_cytoBand.txt")
+
+# Define the gene/arm combined information file for output
+gene_arm_order_file <- file.path(reference_dir, "gencode_v38_gene_pos_arm.txt")
+
+# Download files if they don't exist --------------------
+if (!file.exists(gtf_file)) {
+  system(
+    glue::glue("aws s3 cp {gtf_file_uri} {gtf_file} --no-sign-request")
+  )
+}
+if (!file.exists(arm_order_file)) {
+  download.file(url = arm_order_file_url, destfile = arm_order_file)
 }
-
+
+# Read and prepare input files -----------------
+
+gtf <- rtracklayer::readGFF(gtf_file)
+
+gene_order_df <- gtf |>
+  # keep only genes on standard chromosomes
+  filter(
+    grepl("^[0-9XY]+$", seqid),
+    type == "gene"
+  ) |>
+  select(
+    ensembl_id = gene_id,
+    chrom = seqid,
+    gene_start = start,
+    gene_end = end
+  ) |>
+  mutate(chrom = glue::glue("chr{chrom}"))
+
+# Load cytoBand file into R and assign column names
+cytoBand <- read_tsv(arm_order_file, col_names = FALSE)
+colnames(cytoBand) <- c("chrom", "chrom_arm_start", "chrom_arm_end", "band", "stain")
+
+# Add a column for the chromosome arm (p or q) and group by chromosome and arm
+#  to calculate arm start and end positions
+chromosome_arms_df <- cytoBand |>
+  mutate(arm = substr(band, 1, 1)) |>
+  group_by(chrom, arm) %>%
+  summarize(
+    chrom_arm_start = min(chrom_arm_start),
+    chrom_arm_end = max(chrom_arm_end),
+    .groups = "drop"
+  ) |>
+  # this will remove non-standard chromosomes which have NA arms
+  tidyr::drop_na()
+
+# Before continuing, we should have 48 rows in chromosome_arms:
+stopifnot("Could not get all chromosome arm bounds" = nrow(chromosome_arms_df) == 48)
+
+
+# Combine data frames to get the chromosome and arm for each gene --------------
+combined_df <- gene_order_df |>
+  # combine gene coordinates with chromosome arm coordinates
+  left_join(
+    chromosome_arms_df,
+    by = "chrom",
+    relationship = "many-to-many"
+  ) |>
+  # keep only rows where gene is on the chromosome arm
+  filter(gene_start >= chrom_arm_start & gene_end <= chrom_arm_end) |>
+  # create chrom_arm column
+  mutate(chrom_arm = glue::glue("{chrom}{arm}")) %>%
+  # Define chromosome arm order
+  mutate(chrom_arm = factor(chrom_arm, levels = c(paste0("chr", rep(1:22, each = 2), c("p", "q")),
+                                                  "chrXp", "chrXq", "chrYp", "chrYq"))) %>%
+  # Sort genes by Chromosome arm and Start position
+  arrange(chrom_arm, gene_start)  %>%
+  # Select only relevant column for infercnv
+  select(ensembl_id, chrom_arm, gene_start, gene_end) %>%
+  # Remove ENSG duplicated (genes that are both on X and Y chromosome need to be remove before infercnv)
+  distinct(ensembl_id, .keep_all = TRUE)
+
+
+# Export --------------
+write_tsv(combined_df, gene_arm_order_file, col_names = FALSE, append = FALSE)