-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
46 changed files
with
2,742 additions
and
13,015 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#' FLAMES: full-length analysis of mutations and splicing | ||
#' @name FLAMES | ||
#' @useDynLib FLAMES, .registration=TRUE | ||
NULL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,24 @@ | ||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#' @import zlibbioc | ||
#' @useDynLib FLAMES, .registration=TRUE | ||
match_cell_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range) { | ||
.Call(`_FLAMES_match_cell_barcode`, fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN, left_seq, min_length, reverse_complement, fixed_range) | ||
#' Rcpp port of flexiplex | ||
#' | ||
#' @description demultiplex reads with flexiplex, for detailed description, see | ||
#' documentation for the original flexiplex: https://davidsongroup.github.io/flexiplex | ||
#' | ||
#' @param reads_in Input FASTQ or FASTA file | ||
#' @param barcodes_file barcode allow-list file | ||
#' @param bc_as_readid bool, whether to add the demultiplexed barcode to the | ||
#' read ID field | ||
#' @param max_bc_editdistance max edit distance for barcode ' | ||
#' @param max_flank_editdistance max edit distance for the flanking sequences ' | ||
#' @param pattern StringVector defining the barcode structure, see [find_barcode] | ||
#' @param reads_out output file for demultiplexed reads | ||
#' @param stats_out output file for demultiplexed stats | ||
#' @param n_threads number of threads to be used during demultiplexing | ||
#' @param bc_out WIP | ||
#' @export | ||
flexiplex <- function(reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads) { | ||
.Call(`_FLAMES_flexiplex`, reads_in, barcodes_file, bc_as_readid, max_bc_editdistance, max_flank_editdistance, pattern, reads_out, stats_out, bc_out, n_threads) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,66 +1,13 @@ | ||
# flames_nopysam_env <- BasiliskEnvironment( | ||
# envname = "flames_nopysam_env", pkgname = "FLAMES", | ||
# packages = c( | ||
# "python==2.7.15.0", | ||
# # "minimap2==2.17", | ||
# "numpy==1.16.5", | ||
# "editdistance==0.5.3", | ||
# "bamnostic==1.1.7" | ||
# ), | ||
# channels = c("bioconda", "conda-forge") | ||
# ) | ||
|
||
#' @importFrom basilisk BasiliskEnvironment | ||
flames_env <- BasiliskEnvironment( | ||
envname = "flames_env", pkgname = "FLAMES", | ||
packages = c( | ||
"python==3.7", | ||
"python==3.10.12", | ||
# "minimap2==2.17", | ||
"numpy==1.16.5", | ||
"editdistance==0.5.3", | ||
"scipy==1.2.0", | ||
"pysam==0.18.0" | ||
"numpy==1.25.0", | ||
"editdistance==0.6.2", | ||
"scipy==1.11.1", | ||
"pysam==0.21.0" | ||
), | ||
channels = c("conda-forge", "bioconda") | ||
) | ||
# flames_env <- BasiliskEnvironment(envname="full_env", | ||
# pkgname="FLAMES", | ||
# packages=c("python==2.7.15.0", | ||
# "pysam==0.16.0.1", | ||
# "minimap2==2.17", | ||
# "numpy==1.16.5", | ||
# "editdistance==0.5.3", | ||
# "bzip2==1.0.8", | ||
# "c-ares==1.11.0", | ||
# "ca-certificates==2020.11.8", | ||
# "certifi==2019.11.28", | ||
# "htslib==1.11", | ||
# "k8==0.2.5", | ||
# "krb5==1.17.1", | ||
# "libblas==3.9.0", | ||
# "libcblas==3.9.0", | ||
# "libcurl==7.71.1", | ||
# "libcxx==11.0.0", | ||
# "libdeflate==1.6", | ||
# "libedit==3.1.20191231", | ||
# "libev==4.33", | ||
# "libffi==3.2.1", | ||
# "libgfortran==3.0.0", | ||
# "libgfortran5==9.3.0", | ||
# "liblapack==3.9.0", | ||
# "libnghttp2==1.41.0", | ||
# "libopenblas==0.3.12", | ||
# "libssh2==1.9.0", | ||
# "llvm-openmp==11.0.0", | ||
# "ncurses==6.2", | ||
# "openssl==1.1", | ||
# #"pip==20.1.1", | ||
# "python_abi==2.7", | ||
# "readline==8.0", | ||
# "setuptools==44.0.0", | ||
# "sqlite==3.33.0", | ||
# "tk==8.6.10", | ||
# "wheel==0.35.1", | ||
# "xz==5.2.5", | ||
# "zlib==1.2.11"), | ||
# channels=c("bioconda", "conda-forge")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,90 +1,45 @@ | ||
#' Match Cell Barcodes | ||
#' | ||
#' @description Match cell barcodes in the given fastq directory with the given barcode allow-list. For each read, the left flanking | ||
#' sequence is located, FLAMES then takes the next 16 characters and match it to barcodes in the allow-list. If there is an unambigious match | ||
#' within the given edit distance (\code{MAX_DIST}), the barcode and following \code{UMI_LEN} characters are tirmmed, along with potential | ||
#' polyT tail. The trimmed read is then saved to \code{out_fastq}, with the identier field formatted as \[barcode\]_\[UMI\]#\[original read ID\]. | ||
#' | ||
#' @param fastq_dir directory containing fastq files to match | ||
#' @param stats_file NEEDED | ||
#' @param out_fastq output filename for matched barcodes | ||
#' @param ref_csv NEEDED | ||
#' @param MAX_DIST int; maximum edit distance | ||
#' @param UMI_LEN int; length of UMI sequences | ||
#' @param left_seq String; sequence that appears at the left of the barcode | ||
#' @param min_length int; minimum read length to be filtered after timming barcodes | ||
#' @param reverse_complement boolean; whether to check the reverse complement of the reads | ||
#' @param fixed_range boolean; deprecated, whether to skip finding flanking sequence by infering | ||
#' its position from previous reads. Setting to \code{TRUE} may decrease performance and accuracy. | ||
#' @description demultiplex reads with flexiplex | ||
#' | ||
#' @return returns a list containing statistics of the reads demultiplexed. | ||
#' @param fastq input FASTQ file path | ||
#' @param barcodes_file path to file containing barcode allow-list, with one barcode in each line | ||
#' @param max_bc_editdistance max edit distances for the barcode sequence | ||
#' @param max_flank_editdistance max edit distances for the flanking sequences (primer and polyT) | ||
#' @param reads_out path of output FASTQ file | ||
#' @param stats_out path of output stats file | ||
#' @param threads number of threads to be used | ||
#' @param pattern named character vector defining the barcode pattern | ||
#' @examples | ||
#' outdir <- tempfile() | ||
#' dir.create(outdir) | ||
#' bc_allow <- file.path(outdir, 'bc_allow.tsv') | ||
#' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE) | ||
#' find_barcode( | ||
#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'), | ||
#' stats_file = file.path(outdir, 'bc_stat'), | ||
#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'), | ||
#' ref_csv = bc_allow, | ||
#' MAX_DIST = 2, | ||
#' UMI_LEN = 10 | ||
#') | ||
#' @md | ||
#' @export | ||
find_barcode <- function(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, UMI_LEN = 10L, | ||
left_seq = "CTACACGACGCTCTTCCGATCT", min_length = 20L, reverse_complement = TRUE, | ||
fixed_range = FALSE) { | ||
stats <- match_cell_barcode(fastq_dir, stats_file, out_fastq, ref_csv, MAX_DIST, | ||
UMI_LEN, left_seq, min_length, reverse_complement, fixed_range) | ||
demultiplexed_counts <- read.csv(stats_file, row.names = "cell_barcode") | ||
return(list(demultiplex_stats = as.data.frame(stats), column_data = demultiplexed_counts)) | ||
} | ||
|
||
#' Barcode demultiplexing QC plots | ||
#' @description Plot the barcode demultiplexing statistics | ||
#' @importFrom tidyr pivot_longer | ||
#' @importFrom magrittr '%>%' | ||
#' @importFrom dplyr filter | ||
#' @importFrom SummarizedExperiment colData colData<- | ||
#' @importFrom ggplot2 ggplot aes geom_bar ggtitle coord_polar element_blank theme position_stack theme_bw geom_histogram ggtitle ylab xlab geom_text | ||
#' @param sce The \code{SingleCellExperiment} object from FLAMES pipeline, or the returned list from \code{find_barcode} | ||
#' @return a list of QC plots for the barcode demultiplexing step (\code{find_barcode}) | ||
#' @examples | ||
#' outdir <- tempfile() | ||
#' dir.create(outdir) | ||
#' bc_allow <- file.path(outdir, 'bc_allow.tsv') | ||
#' R.utils::gunzip(filename = system.file('extdata/bc_allow.tsv.gz', package = 'FLAMES'), destname = bc_allow, remove = FALSE) | ||
#' barcode_info <- find_barcode( | ||
#' fastq_dir = system.file('extdata/fastq', package = 'FLAMES'), | ||
#' stats_file = file.path(outdir, 'bc_stat'), | ||
#' out_fastq = file.path(outdir, 'demultiplexed.fq.gz'), | ||
#' ref_csv = bc_allow, | ||
#' MAX_DIST = 2, | ||
#' UMI_LEN = 10 | ||
#' fastq = system.file('extdata/fastq', package = 'FLAMES'), | ||
#' stats_out = file.path(outdir, 'bc_stat'), | ||
#' reads_out = file.path(outdir, 'demultiplexed.fq.gz'), | ||
#' barcodes_file = bc_allow | ||
#') | ||
#' barcode_info_plots(barcode_info) | ||
#' @md | ||
#' @export | ||
barcode_info_plots <- function(sce) { | ||
if (is.list(sce)) { | ||
demultiplex_stats <- sce$demultiplex_stats | ||
demultiplexed_counts <- sce$column_data[, "reads_demultiplexed", drop = F] | ||
find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flank_editdistance = 8, | ||
reads_out, stats_out, threads = 1, pattern = c(primer = "CTACACGACGCTCTTCCGATCT", | ||
polyT = paste0(rep("T", 9), collapse = ""), umi_seq = paste0(rep("?", 12), | ||
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = ""))) { | ||
if (file_test("-f", fastq)) { | ||
flexiplex(reads_in = fastq, barcodes_file = barcodes_file, bc_as_readid = TRUE, | ||
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance, | ||
pattern = pattern, reads_out = reads_out, stats_out = stats_out, n_threads = threads, | ||
bc_out = tempfile()) | ||
} else if (file_test("-d", fastq)) { | ||
sapply(list.files(fastq, "\\.(fastq)|(fq)|(fasta)|(fa)$", full.names=TRUE), function(x) { | ||
flexiplex(reads_in = x, barcodes_file = barcodes_file, bc_as_readid = TRUE, | ||
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance, | ||
pattern = pattern, reads_out = reads_out, stats_out = stats_out, | ||
n_threads = threads, bc_out = tempfile()) | ||
}) | ||
} else { | ||
demultiplex_stats <- sce@metadata$demultiplex_stats | ||
demultiplexed_counts <- colData(sce)[, "reads_demultiplexed", drop = F] | ||
stop("The specified path does not exist: ", fastq) | ||
} | ||
summary_pie <- tidyr::pivot_longer(demultiplex_stats, everything()) %>% | ||
dplyr::filter(!name %in% c("total")) %>% | ||
ggplot(aes(x = "", y = value, label = value, fill = name)) + geom_bar(stat = "identity") + | ||
geom_text(position = position_stack(vjust = 0.5)) + coord_polar("y") + labs(x = NULL, | ||
y = NULL) + ggtitle("Reads demultiplexed summary") + theme_bw() + theme(panel.grid.major = element_blank(), | ||
panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_blank(), | ||
axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text.y = element_blank(), | ||
axis.ticks.y = element_blank()) | ||
distribution_hist <- ggplot(demultiplexed_counts, aes(x = reads_demultiplexed)) + | ||
geom_histogram(bins = ifelse(length(table(demultiplexed_counts)) < 20, length(table(demultiplexed_counts)), | ||
20)) + xlab("Reads demultiplexed") + ylab("Cell barcodes") + ggtitle("Reads demultiplexed distribution") | ||
return(list(summary_pie = summary_pie, distribution_hist = distribution_hist)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.