Skip to content

Commit

Permalink
wip: update agrep_demultiplex
Browse files Browse the repository at this point in the history
  • Loading branch information
ChangqingW committed May 31, 2023
1 parent a5a4692 commit 8446269
Showing 1 changed file with 65 additions and 72 deletions.
137 changes: 65 additions & 72 deletions R/agrep_demultiplex.R
Original file line number Diff line number Diff line change
@@ -1,43 +1,47 @@
# work in progress
library("ShortRead")
library("Biostrings")
agrep_demultiplex_ <- function(
fastq_file, bc.list, out.file = "out.fq.gz", structure = list(
adaptor.p5 = "CTACAC",
barcode = ".{16}", adaptor.read1nspacer = "CGCGTC.{28}GAGACAG", seq = ".*"
),
trim.after = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC", strands = c("-", "+"), pattern.max.distance = 3,
bc.max.distance = 2) {
agrep_demultiplex_ <- function(fastq_file, bc.list, out.file, structure, trim.after,
strands, pattern.max.distance, bc.max.distance) {
cat(fastq_file, "\n")
stat.names <- c("matched", "found.2N", "bc", "bc.dist")

strm <- FastqStreamer(fastq_file)
while (nreads <- length(res <- yield(strm))) {
strm <- ShortRead::FastqStreamer(fastq_file)
while (nreads <- length(res <- ShortRead::yield(strm))) {
match.idx <- sapply(strands, function(x) {
data.frame(matrix(ncol = length(stat.names), nrow = nreads, dimnames = list(
NULL,
stat.names
)))
data.frame(matrix(
ncol = length(stat.names),
nrow = nreads,
dimnames = list(NULL,stat.names)
))
}, simplify = FALSE)

for (strand in sort(strands, decreasing = TRUE)) {
if (strand == "-") {
res <- reverseComplement(res)
res <- Biostrings::reverseComplement(res)
}

# save valid match results to m
pattern <- paste0(c("(", paste0(structure, collapse = ")("), ")"), collapse = "")
m <- aregexec(pattern, sread(res), max.distance = pattern.max.distance)
m <- aregexec(pattern, ShortRead::sread(res), max.distance = pattern.max.distance)
match.idx[[strand]]$matched <- sapply(m, function(x) {
x[1] != -1
x[1] != -1 # valid match
})
res.matched <- res[match.idx[[strand]]$matched]
m <- m[match.idx[[strand]]$matched]
if (!length(m)) {
next
}

if ("umi" %in% names(structure)) {
umi <- sapply(regmatches(ShortRead::sread(res.matched), m), function(x) {
x[which("umi" == names(structure)) + 1]
})
stopifnot(length(m) == length(umi)) # delete
} else {
umi <- rep("", length(m))
}

if ("barcode" %in% names(structure)) {
bc <- sapply(regmatches(sread(res.matched), m), function(x) {
bc <- sapply(regmatches(ShortRead::sread(res.matched), m), function(x) {
x[which("barcode" == names(structure)) + 1]
})
bc.exact <- bc %in% bc.list
Expand All @@ -57,86 +61,77 @@ agrep_demultiplex_ <- function(
} else {
bc.matched <- bc.exact
}
match.idx[[strand]][match.idx[[strand]]][!bc.matched] <- FALSE
readseq <- DNAStringSet(sapply(
regmatches(sread(res.matched), m),
match.idx[[strand]]$matched[match.idx[[strand]]$matched][!bc.matched] <- FALSE
readseq <- Biostrings::DNAStringSet(sapply(
regmatches(ShortRead::sread(res.matched), m),
function(x) {
x[which("seq" == names(structure)) + 1]
}
))[bc.matched]
readid <- BStringSet(paste(bc, id(res.matched), sep = "#"))[bc.matched]
readqual <- sapply(regmatches(quality(res.matched), m), function(x) {
readid <- Biostrings::BStringSet(paste0(bc, "_", umi, "#", ShortRead::id(res.matched)))[bc.matched]
readqual <- sapply(regmatches(Biostrings::quality(res.matched), m), function(x) {
x[which("seq" == names(structure)) + 1]
})[bc.matched]
readqual <- ShortRead::SFastqQuality(readqual)
} else {
readseq <- DNAStringSet(sapply(
regmatches(sread(res.matched), m),
readseq <- Biostrings::DNAStringSet(sapply(
regmatches(ShortRead::sread(res.matched), m),
function(x) {
x[which("seq" == names(structure)) + 1]
}
))
readid <- id(res.matched)
readqual <- sapply(regmatches(quality(res.matched), m), function(x) {
readid <- Biostrings::BStringSet(paste0(umi, "#", ShortRead::id(res.matched)))
readqual <- sapply(regmatches(Biostrings::quality(res.matched), m), function(x) {
x[which("seq" == names(structure)) + 1]
})
readqual <- ShortRead::SFastqQuality(readqual)
}

if (!missing(trim.after) && !is.null(trim.after)) {
alig <- pairwiseAlignment(
subject = trim.after, pattern = readseq,
type = "overlap"
)
readseq <- subseq(readseq, start = ifelse(score(alig) > 10, end(pattern(alig)),
1
))
readqual <- SFastqQuality(subseq(readqual, start = ifelse(score(alig) >
alig <- Biostrings::pairwiseAlignment(subject = trim.after, pattern = readseq,
type = "overlap")
readseq <- XVector::subseq(readseq, start = ifelse(BiocGenerics::score(alig) >
10, end(pattern(alig)), 1))
readqual <- ShortRead::SFastqQuality(XVector::subseq(readqual, start = ifelse(BiocGenerics::score(alig) >
10, end(pattern(alig)), 1)))
}

demultiplexed_res <- new("ShortReadQ",
sread = readseq, quality = readqual,
id = readid
)
writeFastq(
object = demultiplexed_res[width(demultiplexed_res) > 10],
mode = "a", file = out.file
)
}
if (length(strands) > 1) {
cat(as.character(id(res[match.idx[["+"]] & match.idx[["-"]]])), sep = "\n")
cat("\n")
demultiplexed_res <- new("ShortReadQ", sread = readseq, quality = readqual,
id = readid)
ShortRead::writeFastq(object = demultiplexed_res[BiocGenerics::width(demultiplexed_res) >
10], mode = "a", file = out.file)
}
# if (length(strands) > 1) {
# cat(as.character(ShortRead::id(res[match.idx[["+"]] & match.idx[["-"]]])),
# sep = "\n")
# cat("\n")
# }
}
close(strm)

if (length(strands) > 1) {
stats["n.mixed.strands"] <- sum(match.idx[["+"]] & match.idx[["-"]])
}
for (strand in strands) {
stats[paste0("n.demultiplexed.", strand)] <- sum(match.idx[[strand]])
}
return(stats)
# if (length(strands) > 1) {
# stats["n.mixed.strands"] <- sum(match.idx[["+"]] & match.idx[["-"]])
# }
# for (strand in strands) {
# stats[paste0("n.demultiplexed.", strand)] <- sum(match.idx[[strand]])
# }
# return(stats)
}

agrep_demultiplex <- function(
fastq_file, bc.list, out.file = "out.fq.gz", structure = list(
adaptor.p5 = "CTACAC",
barcode = ".{16}", adaptor.read1nspacer = "CGCGTC.{28}GAGACAG", seq = ".*"
),
trim.after = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC", strands = c("-", "+"), pattern.max.distance = 3,
bc.max.distance = 2) {
# structure = list( adaptor.p5 = 'CTACAC', barcode = '.{16}',
# adaptor.read1nspacer = 'CGCGTC.{28}GAGACAG', seq = '.*' ),

agrep_demultiplex <- function(fastq_file, bc.list, out.file = "out.fq.gz", structure = list(partial.truseqread1 = "CTACACGACGCTCTTCCGATCT",
barcode = ".{16}", umi = ".{10}", polyT = "T{3,20}", seq = ".*"), trim.after = NULL,
strands = c("-", "+"), pattern.max.distance = 3, bc.max.distance = 2) {
if (length(fastq_file) == 1) {
return(agrep_demultiplex_(fastq_file, bc.list, out.file, structure, trim.after,
strands, pattern.max.distance,
bc.max.distance = 2
))
strands, pattern.max.distance, bc.max.distance))
} else {
bcs <- data.frame()
for (i in fastq_file) {
bcs <- rbind(bcs, agrep_demultiplex_(i, bc.list, out.file, structure,
trim.after, strands, pattern.max.distance,
bc.max.distance = 2
))
trim.after, strands, pattern.max.distance, bc.max.distance))
}
return(bcs)
}
Expand All @@ -145,10 +140,8 @@ agrep_demultiplex <- function(
if (FALSE) {
barcodes.atac.tsv <- read.csv("./barcodes.atac.tsv", header = F)$V1
cat(format(Sys.time(), "%a %b %d %X"), "\n")
x <- agrep_demultiplex(list.files(c("../fastq_pass", "../fastq_fail"),
full.names = TRUE,
pattern = "*.fastq.gz"
), barcodes.atac.tsv, bc.max.distance = 1, pattern.max.distance = 3)
x <- agrep_demultiplex(list.files(c("../fastq_pass", "../fastq_fail"), full.names = TRUE,
pattern = "*.fastq.gz"), barcodes.atac.tsv, bc.max.distance = 1, pattern.max.distance = 3)
cat(format(Sys.time(), "%a %b %d %X"), "\n")

saveRDS(x, "stats.rds")
Expand Down

0 comments on commit 8446269

Please sign in to comment.