Skip to content

Commit

Permalink
merge flexiplex updates
Browse files Browse the repository at this point in the history
  • Loading branch information
ChangqingW committed Jul 13, 2023
2 parents 8e1dbb3 + 1970dfe commit 8174885
Show file tree
Hide file tree
Showing 46 changed files with 2,742 additions and 13,015 deletions.
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(annotation_to_fasta)
export(barcode_info_plots)
export(bulk_long_pipeline)
export(combine_sce)
export(create_config)
Expand All @@ -11,6 +10,7 @@ export(demultiplex_sockeye)
export(filter_annotation)
export(find_barcode)
export(find_isoform)
export(flexiplex)
export(get_GRangesList)
export(locate_minimap2_dir)
export(minimap2_align)
Expand All @@ -25,7 +25,6 @@ export(sc_long_multisample_pipeline)
export(sc_long_pipeline)
export(sc_mutations)
export(sc_umap_expression)
import(zlibbioc)
importFrom(BiocGenerics,cbind)
importFrom(BiocGenerics,colnames)
importFrom(BiocGenerics,end)
Expand Down
4 changes: 4 additions & 0 deletions R/FLAMES.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#' FLAMES: full-length analysis of mutations and splicing
#' @name FLAMES
#' @useDynLib FLAMES, .registration=TRUE
NULL
23 changes: 19 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' @import zlibbioc
#' @useDynLib FLAMES, .registration=TRUE
match_cell_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range) {
.Call(`_FLAMES_match_cell_barcode`, fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range)
#' Rcpp port of flexiplex
#'
#' @description demultiplex reads with flexiplex, for detailed description, see
#' documentation for the original flexiplex: https://davidsongroup.github.io/flexiplex
#'
#' @param reads_in Input FASTQ or FASTA file
#' @param barcodes_file barcode allow-list file
#' @param bc_as_readid bool, whether to add the demultiplexed barcode to the
#' read ID field
#' @param max_bc_editdistance max edit distance for barcode '
#' @param max_flank_editdistance max edit distance for the flanking sequences '
#' @param pattern StringVector defining the barcode structure, see [find_barcode]
#' @param reads_out output file for demultiplexed reads
#' @param stats_out output file for demultiplexed stats
#' @param n_threads number of threads to be used during demultiplexing
#' @param bc_out WIP
#' @export
flexiplex <- function(reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads) {
.Call(`_FLAMES_flexiplex`, reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads)
}

63 changes: 5 additions & 58 deletions R/basilisk.R
Original file line number Diff line number Diff line change
@@ -1,66 +1,13 @@
# flames_nopysam_env <- BasiliskEnvironment(
# envname = "flames_nopysam_env", pkgname = "FLAMES",
# packages = c(
# "python==2.7.15.0",
# # "minimap2==2.17",
# "numpy==1.16.5",
# "editdistance==0.5.3",
# "bamnostic==1.1.7"
# ),
# channels = c("bioconda", "conda-forge")
# )

#' @importFrom basilisk BasiliskEnvironment
flames_env <- BasiliskEnvironment(
envname = "flames_env", pkgname = "FLAMES",
packages = c(
"python==3.7",
"python==3.10.12",
# "minimap2==2.17",
"numpy==1.16.5",
"editdistance==0.5.3",
"scipy==1.2.0",
"pysam==0.18.0"
"numpy==1.25.0",
"editdistance==0.6.2",
"scipy==1.11.1",
"pysam==0.21.0"
),
channels = c("conda-forge", "bioconda")
)
# flames_env <- BasiliskEnvironment(envname="full_env",
# pkgname="FLAMES",
# packages=c("python==2.7.15.0",
# "pysam==0.16.0.1",
# "minimap2==2.17",
# "numpy==1.16.5",
# "editdistance==0.5.3",
# "bzip2==1.0.8",
# "c-ares==1.11.0",
# "ca-certificates==2020.11.8",
# "certifi==2019.11.28",
# "htslib==1.11",
# "k8==0.2.5",
# "krb5==1.17.1",
# "libblas==3.9.0",
# "libcblas==3.9.0",
# "libcurl==7.71.1",
# "libcxx==11.0.0",
# "libdeflate==1.6",
# "libedit==3.1.20191231",
# "libev==4.33",
# "libffi==3.2.1",
# "libgfortran==3.0.0",
# "libgfortran5==9.3.0",
# "liblapack==3.9.0",
# "libnghttp2==1.41.0",
# "libopenblas==0.3.12",
# "libssh2==1.9.0",
# "llvm-openmp==11.0.0",
# "ncurses==6.2",
# "openssl==1.1",
# #"pip==20.1.1",
# "python_abi==2.7",
# "readline==8.0",
# "setuptools==44.0.0",
# "sqlite==3.33.0",
# "tk==8.6.10",
# "wheel==0.35.1",
# "xz==5.2.5",
# "zlib==1.2.11"),
# channels=c("bioconda", "conda-forge"))
5 changes: 3 additions & 2 deletions R/bulk_long_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ bulk_long_pipeline <-
outdir,
minimap2_dir,
prefix = samples[i],
threads = NULL
threads = config$pipeline_parameters$threads
)
}
} else {
Expand All @@ -124,7 +124,8 @@ bulk_long_pipeline <-
cat("#### Realign to transcript using minimap2\n")
for (i in 1:length(samples)) {
cat(paste0(c("\tRealigning sample ", samples[i], "...\n")))
minimap2_realign(config, fastq_files[i], outdir, minimap2_dir, prefix = samples[i], threads = 12)
minimap2_realign(config, fastq_files[i], outdir, minimap2_dir, prefix = samples[i],
threads = config$pipeline_parameters$threads)
}
} else {
cat("#### Skip read realignment\n")
Expand Down
5 changes: 5 additions & 0 deletions R/create_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,11 @@ check_arguments <-
stop("Bambu requires GTF format for annotation file.\n")
}
}

n_cores <- parallel::detectCores()
if (!is.na(n_cores) && config$pipeline_parameters$threads > n_cores) {
cat("Configured to use", config$pipeline_parameters$threads, "cores, detected", n_cores, "\n")
}

return(list(config = config, minimap2_dir = minimap2_dir))
}
105 changes: 30 additions & 75 deletions R/find_barcode.R
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,90 +1,45 @@
#' Match Cell Barcodes
#'
#' @description Match cell barcodes in the given fastq directory with the given barcode allow-list. For each read, the left flanking
#' sequence is located, FLAMES then takes the next 16 characters and match it to barcodes in the allow-list. If there is an unambigious match
#' within the given edit distance (\code{MAX_DIST}), the barcode and following \code{UMI_LEN} characters are tirmmed, along with potential
#' polyT tail. The trimmed read is then saved to \code{out_fastq}, with the identier field formatted as \[barcode\]_\[UMI\]#\[original read ID\].
#'
#' @param fastq_dir directory containing fastq files to match
#' @param stats_file NEEDED
#' @param out_fastq output filename for matched barcodes
#' @param ref_csv NEEDED
#' @param MAX_DIST int; maximum edit distance
#' @param UMI_LEN int; length of UMI sequences
#' @param left_seq String; sequence that appears at the left of the barcode
#' @param min_length int; minimum read length to be filtered after timming barcodes
#' @param reverse_complement boolean; whether to check the reverse complement of the reads
#' @param fixed_range boolean; deprecated, whether to skip finding flanking sequence by infering
#' its position from previous reads. Setting to \code{TRUE} may decrease performance and accuracy.
#' @description demultiplex reads with flexiplex
#'
#' @return returns a list containing statistics of the reads demultiplexed.
#' @param fastq input FASTQ file path
#' @param barcodes_file path to file containing barcode allow-list, with one barcode in each line
#' @param max_bc_editdistance max edit distances for the barcode sequence
#' @param max_flank_editdistance max edit distances for the flanking sequences (primer and polyT)
#' @param reads_out path of output FASTQ file
#' @param stats_out path of output stats file
#' @param threads number of threads to be used
#' @param pattern named character vector defining the barcode pattern
#' @examples
#' outdir <- tempfile()
#' dir.create(outdir)
#' bc_allow <- file.path(outdir, 'bc_allow.tsv')
#' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE)
#' find_barcode(
#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'),
#' stats_file = file.path(outdir, 'bc_stat'),
#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'),
#' ref_csv = bc_allow,
#' MAX_DIST = 2,
#' UMI_LEN = 10
#')
#' @md
#' @export
find_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN = 10L,
left_seq = "CTACACGACGCTCTTCCGATCT", min_length = 20L, reverse_complement = TRUE,
fixed_range = FALSE) {
stats <- match_cell_barcode(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST,
UMI_LEN, left_seq, min_length, reverse_complement, fixed_range)
demultiplexed_counts <- read.csv(stats_file, row.names = "cell_barcode")
return(list(demultiplex_stats = as.data.frame(stats), column_data = demultiplexed_counts))
}

#' Barcode demultiplexing QC plots
#' @description Plot the barcode demultiplexing statistics
#' @importFrom tidyr pivot_longer
#' @importFrom magrittr '%>%'
#' @importFrom dplyr filter
#' @importFrom SummarizedExperiment colData colData<-
#' @importFrom ggplot2 ggplot aes geom_bar ggtitle coord_polar element_blank theme position_stack theme_bw geom_histogram ggtitle ylab xlab geom_text
#' @param sce The \code{SingleCellExperiment} object from FLAMES pipeline, or the returned list from \code{find_barcode}
#' @return a list of QC plots for the barcode demultiplexing step (\code{find_barcode})
#' @examples
#' outdir <- tempfile()
#' dir.create(outdir)
#' bc_allow <- file.path(outdir, 'bc_allow.tsv')
#' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE)
#' barcode_info <- find_barcode(
#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'),
#' stats_file = file.path(outdir, 'bc_stat'),
#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'),
#' ref_csv = bc_allow,
#' MAX_DIST = 2,
#' UMI_LEN = 10
#' fastq = system.file('extdata/fastq', package = 'FLAMES'),
#' stats_out = file.path(outdir, 'bc_stat'),
#' reads_out = file.path(outdir, 'demultiplexed.fq.gz'),
#' barcodes_file = bc_allow
#')
#' barcode_info_plots(barcode_info)
#' @md
#' @export
barcode_info_plots <- function(sce) {
if (is.list(sce)) {
demultiplex_stats <- sce$demultiplex_stats
demultiplexed_counts <- sce$column_data[, "reads_demultiplexed", drop = F]
find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flank_editdistance = 8,
reads_out, stats_out, threads = 1, pattern = c(primer = "CTACACGACGCTCTTCCGATCT",
polyT = paste0(rep("T", 9), collapse = ""), umi_seq = paste0(rep("?", 12),
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = ""))) {
if (file_test("-f", fastq)) {
flexiplex(reads_in = fastq, barcodes_file = barcodes_file, bc_as_readid = TRUE,
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
pattern = pattern, reads_out = reads_out, stats_out = stats_out, n_threads = threads,
bc_out = tempfile())
} else if (file_test("-d", fastq)) {
sapply(list.files(fastq, "\\.(fastq)|(fq)|(fasta)|(fa)$", full.names=TRUE), function(x) {
flexiplex(reads_in = x, barcodes_file = barcodes_file, bc_as_readid = TRUE,
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
pattern = pattern, reads_out = reads_out, stats_out = stats_out,
n_threads = threads, bc_out = tempfile())
})
} else {
demultiplex_stats <- sce@metadata$demultiplex_stats
demultiplexed_counts <- colData(sce)[, "reads_demultiplexed", drop = F]
stop("The specified path does not exist: ", fastq)
}
summary_pie <- tidyr::pivot_longer(demultiplex_stats, everything()) %>%
dplyr::filter(!name %in% c("total")) %>%
ggplot(aes(x = "", y = value, label = value, fill = name)) + geom_bar(stat = "identity") +
geom_text(position = position_stack(vjust = 0.5)) + coord_polar("y") + labs(x = NULL,
y = NULL) + ggtitle("Reads demultiplexed summary") + theme_bw() + theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_blank(),
axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(),
axis.ticks.y = element_blank())
distribution_hist <- ggplot(demultiplexed_counts, aes(x = reads_demultiplexed)) +
geom_histogram(bins = ifelse(length(table(demultiplexed_counts)) < 20, length(table(demultiplexed_counts)),
20)) + xlab("Reads demultiplexed") + ylab("Cell barcodes") + ggtitle("Reads demultiplexed distribution")
return(list(summary_pie = summary_pie, distribution_hist = distribution_hist))
}
5 changes: 5 additions & 0 deletions R/find_isoform.R
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir) {

tr_string_set <- GenomicFeatures::extractTranscriptSeqs(dna_string_set, txdb,
use.names = TRUE)
if (length(names(tr_string_set)) > length(unique(names(tr_string_set)))) {
cat("Duplicated transcript IDs present, removing ...")
tr_string_set <- tr_string_set[unique(names(tr_string_set))]
}

Biostrings::writeXStringSet(tr_string_set, out_file)
Rsamtools::indexFa(out_file)

Expand Down
Loading

0 comments on commit 8174885

Please sign in to comment.