Skip to content

Commit

Permalink
add cutadapt
Browse files Browse the repository at this point in the history
  • Loading branch information
ChangqingW committed Jul 21, 2023
1 parent 58c97be commit cb526a7
Show file tree
Hide file tree
Showing 22 changed files with 120 additions and 56 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(combine_sce)
export(create_config)
export(create_sce_from_dir)
export(create_se_from_dir)
export(cutadapt)
export(demultiplex_sockeye)
export(filter_annotation)
export(find_barcode)
Expand Down
3 changes: 2 additions & 1 deletion R/basilisk.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ flames_env <- BasiliskEnvironment(
"numpy==1.25.0",
"editdistance==0.6.2",
"scipy==1.11.1",
"pysam==0.21.0"
"pysam==0.21.0",
"cutadapt==4.4"
),
channels = c("conda-forge", "bioconda")
)
4 changes: 2 additions & 2 deletions R/create_config.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#'
#' @details Create a list object containing the arguments supplied in a format usable for the FLAMES pipeline.
#' Also writes the object to a JSON file, which is located with the prefix 'config_' in the supplied \code{outdir}.
#' Default values from \code{extdata/config_sclr_nanopore_5end.json} will be used for unprovided parameters.
#' Default values from \code{extdata/config_sclr_nanopore_3end.json} will be used for unprovided parameters.
#'
#' @param outdir the destination directory for the configuratio nfile
#' @param type use an example config, available values:
Expand Down Expand Up @@ -47,7 +47,7 @@
#' @export
create_config <- function(outdir, type = "sc_5end", ...) {
if (type == "sc_5end") {
config <- jsonlite::fromJSON(system.file("extdata/config_sclr_nanopore_5end.json", package = "FLAMES"))
config <- jsonlite::fromJSON(system.file("extdata/config_sclr_nanopore_3end.json", package = "FLAMES"))
} else if (type == "SIRV") {
config <- jsonlite::fromJSON(system.file("extdata/SIRV_config_default.json", package = "FLAMES"))
} else {
Expand Down
18 changes: 18 additions & 0 deletions R/cutadapt.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' cutadapt wrapper
#' @description trim TSO adaptor with cutadapt
#' @param args arguments to be passed to cutadapt
#' @return Exit code of cutadapt
#'
#' @examples
#' cutadapt("-h")
#'
#' @export
cutadapt <- function(args) {
callBasilisk(FLAMES:::flames_env, function(x) {
python_path <- system.file("python", package = "FLAMES")
mod <- reticulate::import_from_path("cutadapt_wrapper", python_path)
return(mod$wrapper(as.list(x)))
}, x = args)
}

#cutadapt -a 'CCCATGTACTCTGCGTTGATACCACTGCTT' -o cutadapt.fq --untrimmed-output untrimmed.fq ../main/1k.out.fq
40 changes: 39 additions & 1 deletion R/find_barcode.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#' @param reads_out path of output FASTQ file
#' @param stats_out path of output stats file
#' @param threads number of threads to be used
#' @param full_length_only boolean, when TSO sequence is provided, whether reads without TSO
#' are to be discarded
#' @param pattern named character vector defining the barcode pattern
#' @examples
#' outdir <- tempfile()
Expand All @@ -26,7 +28,7 @@
find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flank_editdistance = 8,
reads_out, stats_out, threads = 1, pattern = c(primer = "CTACACGACGCTCTTCCGATCT",
polyT = paste0(rep("T", 9), collapse = ""), umi_seq = paste0(rep("?", 12),
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = ""))) {
collapse = ""), barcode_seq = paste0(rep("?", 16), collapse = "")), full_length_only = FALSE) {
if (file_test("-f", fastq)) {
flexiplex(reads_in = fastq, barcodes_file = barcodes_file, bc_as_readid = TRUE,
max_bc_editdistance = max_bc_editdistance, max_flank_editdistance = max_flank_editdistance,
Expand All @@ -42,4 +44,40 @@ find_barcode <- function(fastq, barcodes_file, max_bc_editdistance = 2, max_flan
} else {
stop("The specified path does not exist: ", fastq)
}

if (sum(grepl("^[35]'TSO$", names(pattern))) == 1) {
untrimmed_reads <- file.path(
dirname(reads_out),
paste0("untrimmed_", basename(reads_out))
)
noTSO_reads <- file.path(
dirname(reads_out),
paste0("noTSO_", basename(reads_out))
)
stopifnot(file.rename(reads_out, untrimmed_reads))
#cutadapt -a 'TSO' -o reads_out --untrimmed-output noTSO_out in_fq(untrimmed.fq)
cutadapt(
c(
ifelse("3'TSO" %in% names(pattern), "-a", "-g"),
pattern[[which(grepl("^[35]'TSO$", names(pattern)))]],
"-o",
reads_out,
untrimmed_reads,
if (full_length_only) {
c("--untrimmed-output", noTSO_reads)
}
)
)
} else {
cat("Skipping TSO trimming...\n")
}
}

convert_cellranger_bc <- function(bc_allow, bc_from, bc_to) {
from <- read.delim(bc_from, header = FALSE)$V1
to <- read.delim(bc_to, header = FALSE)$V1
allowed <- read.delim(bc_allow, header = FALSE)$V1
stopifnot(length(from) == length(to))
stopifnot(all(allowed %in% from))
return(to[from %in% allowed])
}
1 change: 0 additions & 1 deletion R/sc_DTU_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' group_anno <- data.frame(barcode_seq = colnames(sce), groups = SingleCellExperiment::counts(sce)["ENSMUST00000169826.2", ] > 1)
Expand Down
8 changes: 4 additions & 4 deletions R/sc_long_multisample_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ sc_long_multisample_pipeline <-
genome_fa,
minimap2_dir = NULL,
barcodes_file,
match_barcode = TRUE,
config_file = NULL) {
checked_args <- check_arguments(
annotation,
Expand Down Expand Up @@ -119,16 +118,16 @@ sc_long_multisample_pipeline <-
stop(length(fastqs), " .fq or .fastq file(s) found\n")
}

if (match_barcode) {
stop("If \"match_barcode\" set to TRUE, argument \"fastqs\" must be a list of fastq files, with the same order in \"barcodes_file\"\nYou can also demultiplex the reads with \"FLAMES::find_barcode\"")
if (config$pipeline_parameters$do_barcode_demultiplex) {
stop("If \"do_barcode_demultiplex\" set to TRUE, argument \"fastqs\" must be a list of fastq files, with the same order in \"barcodes_file\"\nYou can also demultiplex the reads with \"FLAMES::find_barcode\"")
}
} else if (any(!file.exists(fastqs))) {
stop("Please make sure all fastq files exist.")
}

samples <- gsub("\\.(fastq|fq)(\\.gz)?$", "", basename(fastqs))

if (match_barcode && length(barcodes_file) >= 1) {
if (config$pipeline_parameters$do_barcode_demultiplex && length(barcodes_file) >= 1) {
if (!all(file.exists(barcodes_file))) {
stop("Please make sure all barcodes_file file exists.\n")
}
Expand All @@ -150,6 +149,7 @@ sc_long_multisample_pipeline <-
names(config$barcode_parameters$pattern)),
max_bc_editdistance = config$barcode_parameters$max_bc_editdistance,
max_flank_editdistance = config$barcode_parameters$max_flank_editdistance,
full_length_only = config$barcode_parameters$full_length_only,
threads = config$pipeline_parameters$threads
)
}
Expand Down
14 changes: 2 additions & 12 deletions R/sc_long_pipeline.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#' @param config_file File path to the JSON configuration file. If specified, \code{config_file} overrides
#' all configuration parameters
#' @param barcodes_file The file path to the reference csv used for demultiplexing
#' @param match_barcode Boolean; specifies if demultiplexing should be performed using `FLAMES::find_barcode`
#' @return if \code{do_transcript_quantification} set to true, \code{sc_long_pipeline} returns a \code{SingleCellExperiment} object, containing a count
#' matrix as an assay, gene annotations under metadata, as well as a list of the other
#' output files generated by the pipeline. The pipeline also outputs a number of output
Expand Down Expand Up @@ -94,7 +93,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' }
Expand All @@ -107,7 +105,6 @@ sc_long_pipeline <-
genome_fa,
minimap2_dir = NULL,
barcodes_file,
match_barcode,
config_file = NULL) {
checked_args <- check_arguments(
annotation,
Expand All @@ -123,14 +120,7 @@ sc_long_pipeline <-
minimap2_dir <- checked_args$minimap2_dir

infq <- NULL
if (missing("match_barcode")) {
if (basename(fastq) == "matched_reads.fastq.gz") {
match_barcode <- FALSE
} else {
match_barcode <- TRUE
}
}
if (match_barcode) {
if (config$pipeline_parameters$do_barcode_demultiplex) {
cat("Matching cell barcodes...\n")
if (!file.exists(barcodes_file)) {
stop("barcodes_file must exists.")
Expand All @@ -146,6 +136,7 @@ sc_long_pipeline <-
names(config$barcode_parameters$pattern)),
max_bc_editdistance = config$barcode_parameters$max_bc_editdistance,
max_flank_editdistance = config$barcode_parameters$max_flank_editdistance,
full_length_only = config$barcode_parameters$full_length_only,
threads = config$pipeline_parameters$threads
)
} else {
Expand Down Expand Up @@ -328,7 +319,6 @@ generate_bulk_summarized <- function(out_files, load_genome_anno = NULL) {
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = annotation,
#' outdir = outdir,
#' match_barcode = TRUE,
#' barcodes_file = bc_allow
#' )
#' sce_2 <- create_sce_from_dir(outdir, annotation)
Expand Down
1 change: 0 additions & 1 deletion R/sc_mutations.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@
#' fastq = system.file("extdata/fastq", package = "FLAMES"),
#' annotation = system.file("extdata/rps24.gtf.gz", package = "FLAMES"),
#' outdir = outdir,
#' match_barcode = TRUE,
#' reference_csv = bc_allow
#' )
#' sc_mutations(sce, barcode_tsv = file.path(outdir, "bc_allow.tsv"), min_cov = 2, report_pct = c(0, 1))
Expand Down
7 changes: 5 additions & 2 deletions inst/extdata/SIRV_config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"pipeline_parameters": {
"seed": 2022,
"threads" : 1,
"do_barcode_demultiplex": false,
"do_genome_alignment": true,
"do_isoform_identification": true,
"bambu_isoform_identification": false,
Expand All @@ -16,8 +17,10 @@
"primer": "CTACACGACGCTCTTCCGATCT",
"polyT": "TTTTTTTTT",
"umi_seq": "????????????",
"barcode_seq": "????????????????"
}
"barcode_seq": "????????????????",
"3'TSO" : "CCCATGTACTCTGCGTTGATACCACTGCTT"
},
"full_length_only" : false
},
"isoform_parameters": {
"generate_raw_isoform": true,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
{
"comment": "this is the default config for nanopore single cell long read data using 10x VDJ kit (5'end). use splice annotation in alignment. ",
"comment": "this is the default config for nanopore single cell long read data using 10x 3'end kit. use splice annotation in alignment. ",
"pipeline_parameters": {
"seed": 2022,
"threads" : 1,
"do_barcode_demultiplex": true,
"do_genome_alignment": true,
"do_isoform_identification": true,
"bambu_isoform_identification": false,
Expand All @@ -16,8 +17,10 @@
"primer": "CTACACGACGCTCTTCCGATCT",
"polyT": "TTTTTTTTT",
"umi_seq": "????????????",
"barcode_seq": "????????????????"
}
"barcode_seq": "????????????????",
"3'TSO" : "CCCATGTACTCTGCGTTGATACCACTGCTT"
},
"full_length_only" : false
},
"isoform_parameters": {
"generate_raw_isoform": false,
Expand Down
4 changes: 2 additions & 2 deletions man/FLAMES.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/create_config.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/create_sce_from_dir.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/cutadapt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion man/find_barcode.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/sc_DTU_analysis.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 0 additions & 3 deletions man/sc_long_multisample_pipeline.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions man/sc_long_pipeline.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion man/sc_mutations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit cb526a7

Please sign in to comment.