merge flexiplex updates

mritchielab · Jul 13, 2023 · 8174885 · 8174885
2 parents 8e1dbb3 + 1970dfe
commit 8174885
Show file tree

Hide file tree

Showing 46 changed files with 2,742 additions and 13,015 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
 export(annotation_to_fasta)
-export(barcode_info_plots)
 export(bulk_long_pipeline)
 export(combine_sce)
 export(create_config)
@@ -11,6 +10,7 @@ export(demultiplex_sockeye)
 export(filter_annotation)
 export(find_barcode)
 export(find_isoform)
+export(flexiplex)
 export(get_GRangesList)
 export(locate_minimap2_dir)
 export(minimap2_align)
@@ -25,7 +25,6 @@ export(sc_long_multisample_pipeline)
 export(sc_long_pipeline)
 export(sc_mutations)
 export(sc_umap_expression)
-import(zlibbioc)
 importFrom(BiocGenerics,cbind)
 importFrom(BiocGenerics,colnames)
 importFrom(BiocGenerics,end)

diff --git a/R/FLAMES.R b/R/FLAMES.R
@@ -0,0 +1,4 @@
+#' FLAMES: full-length analysis of mutations and splicing
+#' @name FLAMES
+#' @useDynLib FLAMES, .registration=TRUE
+NULL
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,9 +1,24 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-#' @import zlibbioc
-#' @useDynLib FLAMES, .registration=TRUE
-match_cell_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range) {
- .Call(`_FLAMES_match_cell_barcode`, fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range)
+#' Rcpp port of flexiplex
+#'
+#' @description demultiplex reads with flexiplex, for detailed description, see
+#' documentation for the original flexiplex: https://davidsongroup.github.io/flexiplex
+#'
+#' @param reads_in Input FASTQ or FASTA file
+#' @param barcodes_file barcode allow-list file
+#' @param bc_as_readid bool, whether to add the demultiplexed barcode to the
+#' read ID field 
+#' @param max_bc_editdistance max edit distance for barcode '
+#' @param max_flank_editdistance max edit distance for the flanking sequences '
+#' @param pattern StringVector defining the barcode structure, see [find_barcode]
+#' @param reads_out output file for demultiplexed reads
+#' @param stats_out output file for demultiplexed stats
+#' @param n_threads number of threads to be used during demultiplexing
+#' @param bc_out WIP
+#' @export
+flexiplex <- function(reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads) {
+ .Call(`_FLAMES_flexiplex`, reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads)
 }
 
diff --git a/R/basilisk.R b/R/basilisk.R
@@ -1,66 +1,13 @@
-# flames_nopysam_env <- BasiliskEnvironment(
-# envname = "flames_nopysam_env", pkgname = "FLAMES",
-# packages = c(
-# "python==2.7.15.0",
-# # "minimap2==2.17",
-# "numpy==1.16.5",
-# "editdistance==0.5.3",
-# "bamnostic==1.1.7"
-# ),
-# channels = c("bioconda", "conda-forge")
-# )
-
 #' @importFrom basilisk BasiliskEnvironment
 flames_env <- BasiliskEnvironment(
  envname = "flames_env", pkgname = "FLAMES",
  packages = c(
- "python==3.7",
+ "python==3.10.12",
  # "minimap2==2.17",
- "numpy==1.16.5",
- "editdistance==0.5.3",
- "scipy==1.2.0",
- "pysam==0.18.0"
+ "numpy==1.25.0",
+ "editdistance==0.6.2",
+ "scipy==1.11.1",
+ "pysam==0.21.0"
  ),
  channels = c("conda-forge", "bioconda")
 )
-# flames_env <- BasiliskEnvironment(envname="full_env",
-# pkgname="FLAMES",
-# packages=c("python==2.7.15.0",
-# "pysam==0.16.0.1",
-# "minimap2==2.17",
-# "numpy==1.16.5",
-# "editdistance==0.5.3",
-# "bzip2==1.0.8",
-# "c-ares==1.11.0",
-# "ca-certificates==2020.11.8",
-# "certifi==2019.11.28",
-# "htslib==1.11",
-# "k8==0.2.5",
-# "krb5==1.17.1",
-# "libblas==3.9.0",
-# "libcblas==3.9.0",
-# "libcurl==7.71.1",
-# "libcxx==11.0.0",
-# "libdeflate==1.6",
-# "libedit==3.1.20191231",
-# "libev==4.33",
-# "libffi==3.2.1",
-# "libgfortran==3.0.0",
-# "libgfortran5==9.3.0",
-# "liblapack==3.9.0",
-# "libnghttp2==1.41.0",
-# "libopenblas==0.3.12",
-# "libssh2==1.9.0",
-# "llvm-openmp==11.0.0",
-# "ncurses==6.2",
-# "openssl==1.1",
-# #"pip==20.1.1",
-# "python_abi==2.7",
-# "readline==8.0",
-# "setuptools==44.0.0",
-# "sqlite==3.33.0",
-# "tk==8.6.10",
-# "wheel==0.35.1",
-# "xz==5.2.5",
-# "zlib==1.2.11"),
-# channels=c("bioconda", "conda-forge"))
diff --git a/R/bulk_long_pipeline.R b/R/bulk_long_pipeline.R
@@ -107,7 +107,7 @@ bulk_long_pipeline <-
  outdir,
  minimap2_dir,
  prefix = samples[i],
- threads = NULL
+ threads = config$pipeline_parameters$threads
  )
  }
  } else {
@@ -124,7 +124,8 @@ bulk_long_pipeline <-
  cat("#### Realign to transcript using minimap2\n")
  for (i in 1:length(samples)) {
  cat(paste0(c("\tRealigning sample ", samples[i], "...\n")))
- minimap2_realign(config, fastq_files[i], outdir, minimap2_dir, prefix = samples[i], threads = 12)
+ minimap2_realign(config, fastq_files[i], outdir, minimap2_dir, prefix = samples[i], 
+ threads = config$pipeline_parameters$threads)
  }
  } else {
  cat("#### Skip read realignment\n")

diff --git a/R/create_config.R b/R/create_config.R
@@ -134,6 +134,11 @@ check_arguments <-
  stop("Bambu requires GTF format for annotation file.\n")
  }
  }
+
+ n_cores <- parallel::detectCores()
+ if (!is.na(n_cores) && config$pipeline_parameters$threads > n_cores) {
+ cat("Configured to use", config$pipeline_parameters$threads, "cores, detected", n_cores, "\n")
+ }
 
  return(list(config = config, minimap2_dir = minimap2_dir))
  }
diff --git a/R/find_barcode.R b/R/find_barcode.R
@@ -1,90 +1,45 @@
 #' Match Cell Barcodes
-#'
-#' @description Match cell barcodes in the given fastq directory with the given barcode allow-list. For each read, the left flanking
-#' sequence is located, FLAMES then takes the next 16 characters and match it to barcodes in the allow-list. If there is an unambigious match
-#' within the given edit distance (\code{MAX_DIST}), the barcode and following \code{UMI_LEN} characters are tirmmed, along with potential
-#' polyT tail. The trimmed read is then saved to \code{out_fastq}, with the identier field formatted as \[barcode\]_\[UMI\]#\[original read ID\].
 #' 
-#' @param fastq_dir directory containing fastq files to match
-#' @param stats_file NEEDED
-#' @param out_fastq output filename for matched barcodes
-#' @param ref_csv NEEDED
-#' @param MAX_DIST int; maximum edit distance
-#' @param UMI_LEN int; length of UMI sequences
-#' @param left_seq String; sequence that appears at the left of the barcode
-#' @param min_length int; minimum read length to be filtered after timming barcodes
-#' @param reverse_complement boolean; whether to check the reverse complement of the reads
-#' @param fixed_range boolean; deprecated, whether to skip finding flanking sequence by infering
-#' its position from previous reads. Setting to \code{TRUE} may decrease performance and accuracy.
+#' @description demultiplex reads with flexiplex
 #'
-#' @return returns a list containing statistics of the reads demultiplexed.
+#' @param fastq input FASTQ file path
+#' @param barcodes_file path to file containing barcode allow-list, with one barcode in each line
+#' @param max_bc_editdistance max edit distances for the barcode sequence
+#' @param max_flank_editdistance max edit distances for the flanking sequences (primer and polyT)
+#' @param reads_out path of output FASTQ file
+#' @param stats_out path of output stats file
+#' @param threads number of threads to be used
+#' @param pattern named character vector defining the barcode pattern
 #' @examples
 #' outdir <- tempfile()
 #' dir.create(outdir)
 #' bc_allow <- file.path(outdir, 'bc_allow.tsv')
 #' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE)
 #' find_barcode(
-#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'),
-#' stats_file = file.path(outdir, 'bc_stat'),
-#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'),
-#' ref_csv = bc_allow,
-#' MAX_DIST = 2,
-#' UMI_LEN = 10
-#')
-#' @md
-#' @export
-find_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN = 10L,
- left_seq = "CTACACGACGCTCTTCCGATCT", min_length = 20L, reverse_complement = TRUE,
- fixed_range = FALSE) {
- stats <- match_cell_barcode(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST,
- UMI_LEN, left_seq, min_length, reverse_complement, fixed_range)
- demultiplexed_counts <- read.csv(stats_file, row.names = "cell_barcode")
- return(list(demultiplex_stats = as.data.frame(stats), column_data = demultiplexed_counts))
-}
-
-#' Barcode demultiplexing QC plots
-#' @description Plot the barcode demultiplexing statistics
-#' @importFrom tidyr pivot_longer
-#' @importFrom magrittr '%>%'
-#' @importFrom dplyr filter
-#' @importFrom SummarizedExperiment colData colData<-
-#' @importFrom ggplot2 ggplot aes geom_bar ggtitle coord_polar element_blank theme position_stack theme_bw geom_histogram ggtitle ylab xlab geom_text
-#' @param sce The \code{SingleCellExperiment} object from FLAMES pipeline, or the returned list from \code{find_barcode}
-#' @return a list of QC plots for the barcode demultiplexing step (\code{find_barcode})
-#' @examples
-#' outdir <- tempfile()
-#' dir.create(outdir)
-#' bc_allow <- file.path(outdir, 'bc_allow.tsv')
-#' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE)
-#' barcode_info <- find_barcode(
-#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'),
-#' stats_file = file.path(outdir, 'bc_stat'),
-#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'),
-#' ref_csv = bc_allow,
-#' MAX_DIST = 2,
-#' UMI_LEN = 10
+#' fastq = system.file('extdata/fastq', package = 'FLAMES'),
+#' stats_out = file.path(outdir, 'bc_stat'),
+#' reads_out = file.path(outdir, 'demultiplexed.fq.gz'),
+#' barcodes_file = bc_allow
 #')
-#' barcode_info_plots(barcode_info)
 #' @md
 #' @export
-barcode_info_plots <- function(sce) {
- if (is.list(sce)) {
- demultiplex_stats <- sce$demultiplex_stats
- demultiplexed_counts <- sce$column_data[, "reads_demultiplexed", drop = F]
+find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flank_editdistance = 8,
+ reads_out, stats_out, threads = 1, pattern = c(primer = "CTACACGACGCTCTTCCGATCT",
+ polyT = paste0(rep("T", 9), collapse = ""), umi_seq = paste0(rep("?", 12),
+ collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = ""))) {
+ if (file_test("-f", fastq)) {
+ flexiplex(reads_in = fastq, barcodes_file = barcodes_file, bc_as_readid = TRUE,
+ max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
+ pattern = pattern, reads_out = reads_out, stats_out = stats_out, n_threads = threads,
+ bc_out = tempfile())
+ } else if (file_test("-d", fastq)) {
+ sapply(list.files(fastq, "\\.(fastq)|(fq)|(fasta)|(fa)$", full.names=TRUE), function(x) {
+ flexiplex(reads_in = x, barcodes_file = barcodes_file, bc_as_readid = TRUE,
+ max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
+ pattern = pattern, reads_out = reads_out, stats_out = stats_out,
+ n_threads = threads, bc_out = tempfile())
+ })
  } else {
- demultiplex_stats <- sce@metadata$demultiplex_stats
- demultiplexed_counts <- colData(sce)[, "reads_demultiplexed", drop = F]
+ stop("The specified path does not exist: ", fastq)
  }
- summary_pie <- tidyr::pivot_longer(demultiplex_stats, everything()) %>%
- dplyr::filter(!name %in% c("total")) %>%
- ggplot(aes(x = "", y = value, label = value, fill = name)) + geom_bar(stat = "identity") +
- geom_text(position = position_stack(vjust = 0.5)) + coord_polar("y") + labs(x = NULL,
- y = NULL) + ggtitle("Reads demultiplexed summary") + theme_bw() + theme(panel.grid.major = element_blank(),
- panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_blank(),
- axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(),
- axis.ticks.y = element_blank())
- distribution_hist <- ggplot(demultiplexed_counts, aes(x = reads_demultiplexed)) +
- geom_histogram(bins = ifelse(length(table(demultiplexed_counts)) < 20, length(table(demultiplexed_counts)),
- 20)) + xlab("Reads demultiplexed") + ylab("Cell barcodes") + ggtitle("Reads demultiplexed distribution")
- return(list(summary_pie = summary_pie, distribution_hist = distribution_hist))
 }
diff --git a/R/find_isoform.R b/R/find_isoform.R
@@ -122,6 +122,11 @@ annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir) {
 
  tr_string_set <- GenomicFeatures::extractTranscriptSeqs(dna_string_set, txdb,
  use.names = TRUE)
+ if (length(names(tr_string_set)) > length(unique(names(tr_string_set)))) {
+ cat("Duplicated transcript IDs present, removing ...")
+ tr_string_set <- tr_string_set[unique(names(tr_string_set))]
+ }
+
  Biostrings::writeXStringSet(tr_string_set, out_file)
  Rsamtools::indexFa(out_file)