wip: update agrep_demultiplex

mritchielab · May 31, 2023 · 8446269 · 8446269
1 parent a5a4692
commit 8446269
Showing 1 changed file with 65 additions and 72 deletions.
diff --git a/R/agrep_demultiplex.R b/R/agrep_demultiplex.R
@@ -1,43 +1,47 @@
 # work in progress
-library("ShortRead")
-library("Biostrings")
-agrep_demultiplex_ <- function(
-    fastq_file, bc.list, out.file = "out.fq.gz", structure = list(
-      adaptor.p5 = "CTACAC",
-      barcode = ".{16}", adaptor.read1nspacer = "CGCGTC.{28}GAGACAG", seq = ".*"
-    ),
-    trim.after = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC", strands = c("-", "+"), pattern.max.distance = 3,
-    bc.max.distance = 2) {
+agrep_demultiplex_ <- function(fastq_file, bc.list, out.file, structure, trim.after,
+  strands, pattern.max.distance, bc.max.distance) {
   cat(fastq_file, "\n")
   stat.names <- c("matched", "found.2N", "bc", "bc.dist")
 
-  strm <- FastqStreamer(fastq_file)
-  while (nreads <- length(res <- yield(strm))) {
+  strm <- ShortRead::FastqStreamer(fastq_file)
+  while (nreads <- length(res <- ShortRead::yield(strm))) {
     match.idx <- sapply(strands, function(x) {
-      data.frame(matrix(ncol = length(stat.names), nrow = nreads, dimnames = list(
-        NULL,
-        stat.names
-      )))
+      data.frame(matrix(
+        ncol = length(stat.names), 
+        nrow = nreads, 
+        dimnames = list(NULL,stat.names)
+      ))
     }, simplify = FALSE)
 
     for (strand in sort(strands, decreasing = TRUE)) {
       if (strand == "-") {
-        res <- reverseComplement(res)
+        res <- Biostrings::reverseComplement(res)
       }
 
+      # save valid match results to m
       pattern <- paste0(c("(", paste0(structure, collapse = ")("), ")"), collapse = "")
-      m <- aregexec(pattern, sread(res), max.distance = pattern.max.distance)
+      m <- aregexec(pattern, ShortRead::sread(res), max.distance = pattern.max.distance)
       match.idx[[strand]]$matched <- sapply(m, function(x) {
-        x[1] != -1
+        x[1] != -1 # valid match
       })
       res.matched <- res[match.idx[[strand]]$matched]
       m <- m[match.idx[[strand]]$matched]
       if (!length(m)) {
         next
       }
 
+      if ("umi" %in% names(structure)) {
+        umi <- sapply(regmatches(ShortRead::sread(res.matched), m), function(x) {
+          x[which("umi" == names(structure)) + 1]
+        })
+        stopifnot(length(m) == length(umi)) # delete
+      } else {
+        umi <- rep("", length(m))
+      }
+
       if ("barcode" %in% names(structure)) {
-        bc <- sapply(regmatches(sread(res.matched), m), function(x) {
+        bc <- sapply(regmatches(ShortRead::sread(res.matched), m), function(x) {
           x[which("barcode" == names(structure)) + 1]
         })
         bc.exact <- bc %in% bc.list
@@ -57,86 +61,77 @@ agrep_demultiplex_ <- function(
         } else {
           bc.matched <- bc.exact
         }
-        match.idx[[strand]][match.idx[[strand]]][!bc.matched] <- FALSE
-        readseq <- DNAStringSet(sapply(
-          regmatches(sread(res.matched), m),
+         match.idx[[strand]]$matched[match.idx[[strand]]$matched][!bc.matched] <- FALSE
+        readseq <- Biostrings::DNAStringSet(sapply(
+          regmatches(ShortRead::sread(res.matched), m),
           function(x) {
             x[which("seq" == names(structure)) + 1]
           }
         ))[bc.matched]
-        readid <- BStringSet(paste(bc, id(res.matched), sep = "#"))[bc.matched]
-        readqual <- sapply(regmatches(quality(res.matched), m), function(x) {
+        readid <- Biostrings::BStringSet(paste0(bc, "_", umi, "#", ShortRead::id(res.matched)))[bc.matched]
+        readqual <- sapply(regmatches(Biostrings::quality(res.matched), m), function(x) {
           x[which("seq" == names(structure)) + 1]
         })[bc.matched]
+        readqual <- ShortRead::SFastqQuality(readqual)
       } else {
-        readseq <- DNAStringSet(sapply(
-          regmatches(sread(res.matched), m),
+        readseq <- Biostrings::DNAStringSet(sapply(
+          regmatches(ShortRead::sread(res.matched), m),
           function(x) {
             x[which("seq" == names(structure)) + 1]
           }
         ))
-        readid <- id(res.matched)
-        readqual <- sapply(regmatches(quality(res.matched), m), function(x) {
+        readid <- Biostrings::BStringSet(paste0(umi, "#", ShortRead::id(res.matched)))
+        readqual <- sapply(regmatches(Biostrings::quality(res.matched), m), function(x) {
           x[which("seq" == names(structure)) + 1]
         })
+        readqual <- ShortRead::SFastqQuality(readqual)
       }
 
       if (!missing(trim.after) && !is.null(trim.after)) {
-        alig <- pairwiseAlignment(
-          subject = trim.after, pattern = readseq,
-          type = "overlap"
-        )
-        readseq <- subseq(readseq, start = ifelse(score(alig) > 10, end(pattern(alig)),
-          1
-        ))
-        readqual <- SFastqQuality(subseq(readqual, start = ifelse(score(alig) >
+        alig <- Biostrings::pairwiseAlignment(subject = trim.after, pattern = readseq,
+          type = "overlap")
+        readseq <- XVector::subseq(readseq, start = ifelse(BiocGenerics::score(alig) >
+          10, end(pattern(alig)), 1))
+        readqual <- ShortRead::SFastqQuality(XVector::subseq(readqual, start = ifelse(BiocGenerics::score(alig) >
           10, end(pattern(alig)), 1)))
       }
 
-      demultiplexed_res <- new("ShortReadQ",
-        sread = readseq, quality = readqual,
-        id = readid
-      )
-      writeFastq(
-        object = demultiplexed_res[width(demultiplexed_res) > 10],
-        mode = "a", file = out.file
-      )
-    }
-    if (length(strands) > 1) {
-      cat(as.character(id(res[match.idx[["+"]] & match.idx[["-"]]])), sep = "\n")
-      cat("\n")
+      demultiplexed_res <- new("ShortReadQ", sread = readseq, quality = readqual,
+        id = readid)
+      ShortRead::writeFastq(object = demultiplexed_res[BiocGenerics::width(demultiplexed_res) >
+        10], mode = "a", file = out.file)
     }
+#    if (length(strands) > 1) {
+#      cat(as.character(ShortRead::id(res[match.idx[["+"]] & match.idx[["-"]]])),
+#        sep = "\n")
+#      cat("\n")
+#    }
   }
   close(strm)
 
-  if (length(strands) > 1) {
-    stats["n.mixed.strands"] <- sum(match.idx[["+"]] & match.idx[["-"]])
-  }
-  for (strand in strands) {
-    stats[paste0("n.demultiplexed.", strand)] <- sum(match.idx[[strand]])
-  }
-  return(stats)
+#  if (length(strands) > 1) {
+#    stats["n.mixed.strands"] <- sum(match.idx[["+"]] & match.idx[["-"]])
+#  }
+#  for (strand in strands) {
+#    stats[paste0("n.demultiplexed.", strand)] <- sum(match.idx[[strand]])
+#  }
+#  return(stats)
 }
 
-agrep_demultiplex <- function(
-    fastq_file, bc.list, out.file = "out.fq.gz", structure = list(
-      adaptor.p5 = "CTACAC",
-      barcode = ".{16}", adaptor.read1nspacer = "CGCGTC.{28}GAGACAG", seq = ".*"
-    ),
-    trim.after = "CTGTCTCTTATACACATCTCCGAGCCCACGAGAC", strands = c("-", "+"), pattern.max.distance = 3,
-    bc.max.distance = 2) {
+# structure = list( adaptor.p5 = 'CTACAC', barcode = '.{16}',
+# adaptor.read1nspacer = 'CGCGTC.{28}GAGACAG', seq = '.*' ),
+
+agrep_demultiplex <- function(fastq_file, bc.list, out.file = "out.fq.gz", structure = list(partial.truseqread1 = "CTACACGACGCTCTTCCGATCT",
+  barcode = ".{16}", umi = ".{10}", polyT = "T{3,20}", seq = ".*"), trim.after = NULL,
+  strands = c("-", "+"), pattern.max.distance = 3, bc.max.distance = 2) {
   if (length(fastq_file) == 1) {
     return(agrep_demultiplex_(fastq_file, bc.list, out.file, structure, trim.after,
-      strands, pattern.max.distance,
-      bc.max.distance = 2
-    ))
+      strands, pattern.max.distance, bc.max.distance))
   } else {
     bcs <- data.frame()
     for (i in fastq_file) {
       bcs <- rbind(bcs, agrep_demultiplex_(i, bc.list, out.file, structure,
-        trim.after, strands, pattern.max.distance,
-        bc.max.distance = 2
-      ))
+        trim.after, strands, pattern.max.distance, bc.max.distance))
     }
     return(bcs)
   }
@@ -145,10 +140,8 @@ agrep_demultiplex <- function(
 if (FALSE) {
   barcodes.atac.tsv <- read.csv("./barcodes.atac.tsv", header = F)$V1
   cat(format(Sys.time(), "%a %b %d %X"), "\n")
-  x <- agrep_demultiplex(list.files(c("../fastq_pass", "../fastq_fail"),
-    full.names = TRUE,
-    pattern = "*.fastq.gz"
-  ), barcodes.atac.tsv, bc.max.distance = 1, pattern.max.distance = 3)
+  x <- agrep_demultiplex(list.files(c("../fastq_pass", "../fastq_fail"), full.names = TRUE,
+    pattern = "*.fastq.gz"), barcodes.atac.tsv, bc.max.distance = 1, pattern.max.distance = 3)
   cat(format(Sys.time(), "%a %b %d %X"), "\n")
 
   saveRDS(x, "stats.rds")