Skip to content

Commit

Permalink
Merge remote-tracking branch 'oliver/devel'
Browse files Browse the repository at this point in the history
  • Loading branch information
youyupei committed Jul 22, 2023
2 parents 02cf278 + 61f8c42 commit 612b4ae
Show file tree
Hide file tree
Showing 25 changed files with 139 additions and 69 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export(combine_sce)
export(create_config)
export(create_sce_from_dir)
export(create_se_from_dir)
export(cutadapt)
export(demultiplex_sockeye)
export(filter_annotation)
export(find_barcode)
Expand Down
1 change: 1 addition & 0 deletions R/basilisk.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ flames_env <- BasiliskEnvironment(
"editdistance==0.6.2",
"scipy==1.11.1",
"pysam==0.21.0",
"cutadapt==4.4",
"tqdm==4.64.1",
"matplotlib==3.5.3",
"pandas==1.3.5",
Expand Down
4 changes: 2 additions & 2 deletions R/create_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#'
#' @details Create a list object containing the arguments supplied in a format usable for the FLAMES pipeline.
#' Also writes the object to a JSON file, which is located with the prefix 'config_' in the supplied \code{outdir}.
#' Default values from \code{extdata/config_sclr_nanopore_5end.json} will be used for unprovided parameters.
#' Default values from \code{extdata/config_sclr_nanopore_3end.json} will be used for unprovided parameters.
#'
#' @param outdir the destination directory for the configuratio nfile
#' @param type use an example config, available values:
Expand Down Expand Up @@ -47,7 +47,7 @@
#' @export
create_config <- function(outdir, type = "sc_5end", ...) {
if (type == "sc_5end") {
config <- jsonlite::fromJSON(system.file("extdata/config_sclr_nanopore_5end.json", package = "FLAMES"))
config <- jsonlite::fromJSON(system.file("extdata/config_sclr_nanopore_3end.json", package = "FLAMES"))
} else if (type == "SIRV") {
config <- jsonlite::fromJSON(system.file("extdata/SIRV_config_default.json", package = "FLAMES"))
} else {
Expand Down
18 changes: 18 additions & 0 deletions R/cutadapt.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' cutadapt wrapper
#' @description trim TSO adaptor with cutadapt
#' @param args arguments to be passed to cutadapt
#' @return Exit code of cutadapt
#'
#' @examples
#' cutadapt("-h")
#'
#' @export
cutadapt <- function(args) {
callBasilisk(FLAMES:::flames_env, function(x) {
python_path <- system.file("python", package = "FLAMES")
mod <- reticulate::import_from_path("cutadapt_wrapper", python_path)
return(mod$wrapper(as.list(x)))
}, x = args)
}

#cutadapt -a 'CCCATGTACTCTGCGTTGATACCACTGCTT' -o cutadapt.fq --untrimmed-output untrimmed.fq ../main/1k.out.fq
40 changes: 39 additions & 1 deletion R/find_barcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#' @param reads_out path of output FASTQ file
#' @param stats_out path of output stats file
#' @param threads number of threads to be used
#' @param full_length_only boolean, when TSO sequence is provided, whether reads without TSO
#' are to be discarded
#' @param pattern named character vector defining the barcode pattern
#' @examples
#' outdir <- tempfile()
Expand All @@ -26,7 +28,7 @@
find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flank_editdistance = 8,
reads_out, stats_out, threads = 1, pattern = c(primer = "CTACACGACGCTCTTCCGATCT",
polyT = paste0(rep("T", 9), collapse = ""), umi_seq = paste0(rep("?", 12),
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = ""))) {
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = "")), full_length_only = FALSE) {
if (file_test("-f", fastq)) {
flexiplex(reads_in = fastq, barcodes_file = barcodes_file, bc_as_readid = TRUE,
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
Expand All @@ -42,4 +44,40 @@ find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flan
} else {
stop("The specified path does not exist: ", fastq)
}

if (sum(grepl("^[35]'TSO$", names(pattern))) == 1) {
untrimmed_reads <- file.path(
dirname(reads_out),
paste0("untrimmed_", basename(reads_out))
)
noTSO_reads <- file.path(
dirname(reads_out),
paste0("noTSO_", basename(reads_out))
)
stopifnot(file.rename(reads_out, untrimmed_reads))
#cutadapt -a 'TSO' -o reads_out --untrimmed-output noTSO_out in_fq(untrimmed.fq)
cutadapt(
c(
ifelse("3'TSO" %in% names(pattern), "-a", "-g"),
pattern[[which(grepl("^[35]'TSO$", names(pattern)))]],
"-o",
reads_out,
untrimmed_reads,
if (full_length_only) {
c("--untrimmed-output", noTSO_reads)
}
)
)
} else {
cat("Skipping TSO trimming...\n")
}
}

convert_cellranger_bc <- function(bc_allow, bc_from, bc_to) {
from <- read.delim(bc_from, header = FALSE)$V1
to <- read.delim(bc_to, header = FALSE)$V1
allowed <- read.delim(bc_allow, header = FALSE)$V1
stopifnot(length(from) == length(to))
stopifnot(all(allowed %in% from))
return(to[from %in% allowed])
}
1 change: 0 additions & 1 deletion R/sc_DTU_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' group_anno <- data.frame(barcode_seq = colnames(sce), groups = SingleCellExperiment::counts(sce)["ENSMUST00000169826.2", ] > 1)
Expand Down
14 changes: 7 additions & 7 deletions R/sc_long_multisample_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ sc_long_multisample_pipeline <-
genome_fa,
minimap2_dir = NULL,
barcodes_file = NULL,
match_barcode = TRUE,
config_file = NULL) {
checked_args <- check_arguments(
annotation,
Expand Down Expand Up @@ -119,21 +118,21 @@ sc_long_multisample_pipeline <-
stop(length(fastqs), " .fq or .fastq file(s) found\n")
}

if (match_barcode && !is.null(barcodes_file)) {
stop("If \"match_barcode\" set to TRUE and \"barcodes_file\" is specified, argument \"fastqs\" must be a list of fastq files, with the same order in \"barcodes_file\"\nYou can also demultiplex the reads with \"FLAMES::find_barcode\"")
if (config$pipeline_parameters$do_barcode_demultiplex && !is.null(barcodes_file)) {
stop("If \"do_barcode_demultiplex\" set to TRUE and \"barcodes_file\" is specified, argument \"fastqs\" must be a list of fastq files, with the same order in \"barcodes_file\"\nYou can also demultiplex the reads with \"FLAMES::find_barcode\"")
}
} else if (any(!file.exists(fastqs))) {
stop("Please make sure all fastq files exist.")
}

samples <- gsub("\\.(fastq|fq)(\\.gz)?$", "", basename(fastqs))

if (match_barcode && is.null(barcodes_file)) {
if (config$pipeline_parameters$do_barcode_demultiplex && is.null(barcodes_file)) {
cat("No barcodes_file provided, running BLAZE to generate it from long reads...")
# config the blaze run
config$blaze_parameters['output-fastq'] <- 'matched_reads.fastq.gz'
config$blaze_parameters['threads'] <- config$blaze_parameters$threads
warning("BLAZE is running with default with --expect-cell ",config$blaze_parameters$expect_cell,
warning("BLAZE is running with default with --expect-cells ",config$blaze_parameters['expect-cells'],
",\n which meant to be the expected number of cells. If it is very different from your actuall\n"
," expectation, please modify it in the config file.")

Expand All @@ -142,7 +141,7 @@ sc_long_multisample_pipeline <-
blaze(config$blaze_parameters, fastqs[i])
}
infqs <- file.path(outdir, paste(samples, "matched_reads.fastq.gz", sep = "_"))
} else if (match_barcode && length(barcodes_file) >= 1) {
} else if (config$pipeline_parameters$do_barcode_demultiplex && length(barcodes_file) >= 1) {

if (!all(file.exists(barcodes_file))) {
stop("Please make sure all barcodes_file file exists.\n")
Expand All @@ -153,7 +152,7 @@ sc_long_multisample_pipeline <-
if (length(barcodes_file) != length(fastqs)) {
stop(length(barcodes_file), " barcode allow-lists provided while there are ", length(fastqs), "fastq file. Please either provide one allow-list per sample, or one allow-list for all samples.")
}
infqs <- file.path(outdir, paste(samples, "matched_reads.fastq.gz", sep = "_"))
infqs <- file.path(outdir, paste(samples, "matched_reads.fastq", sep = "_"))
bc_stats <- file.path(outdir, paste(samples, "matched_barcode_stat", sep = "_"))
for (i in 1:length(fastqs)) {
find_barcode(
Expand All @@ -165,6 +164,7 @@ sc_long_multisample_pipeline <-
names(config$barcode_parameters$pattern)),
max_bc_editdistance = config$barcode_parameters$max_bc_editdistance,
max_flank_editdistance = config$barcode_parameters$max_flank_editdistance,
full_length_only = config$barcode_parameters$full_length_only,
threads = config$pipeline_parameters$threads
)
}
Expand Down
20 changes: 6 additions & 14 deletions R/sc_long_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#' @param config_file File path to the JSON configuration file. If specified, \code{config_file} overrides
#' all configuration parameters
#' @param barcodes_file The file path to the reference csv used for demultiplexing
#' @param match_barcode Boolean; specifies if demultiplexing should be performed using `FLAMES::find_barcode`
#' @return if \code{do_transcript_quantification} set to true, \code{sc_long_pipeline} returns a \code{SingleCellExperiment} object, containing a count
#' matrix as an assay, gene annotations under metadata, as well as a list of the other
#' output files generated by the pipeline. The pipeline also outputs a number of output
Expand Down Expand Up @@ -94,7 +93,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' }
Expand All @@ -107,7 +105,6 @@ sc_long_pipeline <-
genome_fa,
minimap2_dir = NULL,
barcodes_file=NULL,
match_barcode,
config_file = NULL) {
checked_args <- check_arguments(
annotation,
Expand All @@ -123,14 +120,7 @@ sc_long_pipeline <-
minimap2_dir <- checked_args$minimap2_dir

infq <- NULL
if (missing("match_barcode")) {
if (basename(fastq) == "matched_reads.fastq.gz") {
match_barcode <- FALSE
} else {
match_barcode <- TRUE
}
}
if (match_barcode) {
if (config$pipeline_parameters$do_barcode_demultiplex) {

if (is.null(barcodes_file)){
cat("No barcodes_file provided, running BLAZE to generate it from long reads...")
Expand All @@ -139,7 +129,9 @@ sc_long_pipeline <-
config$blaze_parameters['output-prefix'] <- paste0(outdir, '/')
config$blaze_parameters['output-fastq'] <- 'matched_reads.fastq.gz'
config$blaze_parameters['threads'] <- config$blaze_parameters$threads

warning("BLAZE is running with default with --expect-cells ",config$blaze_parameters['expect-cells'],
",\n which meant to be the expected number of cells. If it is very different from your actuall\n"
," expectation, please modify it in the config file.")

blaze(config$blaze_parameters, fastq)
infq <- file.path(outdir, "matched_reads.fastq.gz")
Expand All @@ -149,7 +141,7 @@ sc_long_pipeline <-
if (!file.exists(barcodes_file)) {
stop("barcodes_file must exists.")
}
infq <- file.path(outdir, "matched_reads.fastq.gz")
infq <- file.path(outdir, "matched_reads.fastq")
bc_stat <- file.path(outdir, "matched_barcode_stat")
find_barcode(
fastq = fastq,
Expand All @@ -160,6 +152,7 @@ sc_long_pipeline <-
names(config$barcode_parameters$pattern)),
max_bc_editdistance = config$barcode_parameters$max_bc_editdistance,
max_flank_editdistance = config$barcode_parameters$max_flank_editdistance,
full_length_only = config$barcode_parameters$full_length_only,
threads = config$pipeline_parameters$threads
)
}
Expand Down Expand Up @@ -346,7 +339,6 @@ generate_bulk_summarized <- function(out_files, load_genome_anno = NULL) {
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = annotation,
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' sce_2 <- create_sce_from_dir(outdir, annotation)
Expand Down
1 change: 0 additions & 1 deletion R/sc_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' reference_csv = bc_allow
#' )
#' sc_mutations(sce, barcode_tsv = file.path(outdir, "bc_allow.tsv"), min_cov = 2, report_pct = c(0, 1))
Expand Down
2 changes: 1 addition & 1 deletion inst/blaze/README.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The scripts in this folder was downloaded from https://github.com/shimlab/BLAZE/tree/dev/bin

Version: 659c281 - Fri, 21 Jul 2023 13:53:4 AEST
Version: 0d51d5e - Sat, 22 Jul 2023 17:46:58 +1000

Reference:
You, Y., Prawer, Y. D., De Paoli-Iseppi, R., Hunt, C. P., Parish, C. L., Shim, H., & Clark, M. B. (2023) Identification of cell barcodes from long-read single-cell RNA-seq with BLAZE. Genome Biol 24, 66.
6 changes: 4 additions & 2 deletions inst/blaze/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,16 @@ def multiprocessing_submit(func, iterator, n_process=mp.cpu_count()-1 ,
if job_to_yield in job_completed.keys():
n_job_in_queue -= 1
# update pregress bar based on batch size
pbar.update(job_completed[job_to_yield][1])
if pbar:
pbar.update(job_completed[job_to_yield][1])
yield job_completed[job_to_yield][0]
del job_completed[job_to_yield]
job_to_yield += 1
# all jobs finished: yield complelted job in the submit order
else:
while len(job_completed):
pbar.update(job_completed[job_to_yield][1])
if pbar:
pbar.update(job_completed[job_to_yield][1])
yield job_completed[job_to_yield][0]
del job_completed[job_to_yield]
job_to_yield += 1
Expand Down
7 changes: 5 additions & 2 deletions inst/extdata/SIRV_config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"pipeline_parameters": {
"seed": 2022,
"threads" : 1,
"do_barcode_demultiplex": false,
"do_genome_alignment": true,
"do_isoform_identification": true,
"bambu_isoform_identification": false,
Expand All @@ -16,8 +17,10 @@
"primer": "CTACACGACGCTCTTCCGATCT",
"polyT": "TTTTTTTTT",
"umi_seq": "????????????",
"barcode_seq": "????????????????"
}
"barcode_seq": "????????????????",
"3'TSO" : "CCCATGTACTCTGCGTTGATACCACTGCTT"
},
"full_length_only" : false
},
"isoform_parameters": {
"generate_raw_isoform": true,
Expand Down
2 changes: 1 addition & 1 deletion inst/extdata/blaze_flames.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"comment": "this is the default config for nanopore single cell long read data using 10x VDJ kit (5'end). use splice annotation in alignment. ",
"pipeline_parameters": {
"seed": 2023,
"do_barcode_identification": true,
"do_barcode_demultiplex": true,
"do_genome_alignment": true,
"do_isoform_identification": true,
"bambu_isoform_identification": true,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
{
"comment": "this is the default config for nanopore single cell long read data using 10x VDJ kit (5'end). use splice annotation in alignment. ",
"comment": "this is the default config for nanopore single cell long read data using 10x 3'end kit. use splice annotation in alignment. ",
"pipeline_parameters": {
"seed": 2022,
"threads" : 1,
"do_barcode_demultiplex": true,
"do_genome_alignment": true,
"do_isoform_identification": true,
"bambu_isoform_identification": false,
Expand All @@ -16,8 +17,10 @@
"primer": "CTACACGACGCTCTTCCGATCT",
"polyT": "TTTTTTTTT",
"umi_seq": "????????????",
"barcode_seq": "????????????????"
}
"barcode_seq": "????????????????",
"3'TSO" : "CCCATGTACTCTGCGTTGATACCACTGCTT"
},
"full_length_only" : false
},
"isoform_parameters": {
"generate_raw_isoform": false,
Expand Down
16 changes: 9 additions & 7 deletions inst/python/count_tr.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def process_trans(batch_id, total_batches, bam_in, fa_idx_f, min_sup_reads, min_
alignment_batch: a list of pysam AlignedSegment object
"""
bamfile = ps.AlignmentFile(bam_in, "rb")
print("process start time:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# print("process start time:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
fa_idx = dict((it.strip().split()[0], int(
it.strip().split()[1])) for it in open(fa_idx_f))
bc_tr_count_dict = {}
Expand Down Expand Up @@ -238,8 +238,8 @@ def process_trans(batch_id, total_batches, bam_in, fa_idx_f, min_sup_reads, min_
bc_tr_count_dict[bc] = {}
bc_tr_count_dict[bc].setdefault(hit[0], []).append(umi)
cnt_stat["counted_reads"] += 1
print(("\t" + str(cnt_stat)))
print("process end time:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# print(("\t" + str(cnt_stat)))
# print("process end time:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
return bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept


Expand All @@ -254,8 +254,10 @@ def parse_realigned_bam(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage,
Per transcript read count.
"""
rst_futures = helper.multiprocessing_submit(process_trans,
(x for x in range(n_process)), total_batches=n_process ,n_process=n_process, pbar = True,
pbar_tot=None, pbar_update=1, bam_in=bam_in,
(x for x in range(n_process)), total_batches=n_process ,n_process=n_process, pbar = False,
pbar_func = lambda x: 1,
pbar_unit='Batch',
bam_in=bam_in,
fa_idx_f=fa_idx_f, min_sup_reads=min_sup_reads,
min_tr_coverage=min_tr_coverage,
min_read_coverage=min_read_coverage, bc_file=bc_file)
Expand All @@ -265,8 +267,8 @@ def parse_realigned_bam(bam_in, fa_idx_f, min_sup_reads, min_tr_coverage,
tr_kept_rst_mp_rst = None # this doesn't seem to be used in the downstream

for idx, f in enumerate(rst_futures):
print(f"collecting result of batch {idx} " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
flush = True)
#print(f"collecting result of batch {idx} " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"),flush = True)

d1, d2, _ = f.result()
for k1 in d1.keys():
for k2 in d1[k1].keys():
Expand Down
2 changes: 1 addition & 1 deletion man/create_config.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 612b4ae

Please sign in to comment.