Skip to content

Commit

Permalink
[feature] TrimPrimers can trim only R1s (#681)
Browse files Browse the repository at this point in the history
* [feature] TrimPrimers can trim only R1s
  • Loading branch information
nh13 authored Aug 10, 2021
1 parent b9277e5 commit e91698c
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 15 deletions.
44 changes: 38 additions & 6 deletions src/main/scala/com/fulcrumgenomics/bam/TrimPrimers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,19 @@ import scala.collection.BufferedIterator
|
|If the input BAM is not `queryname` sorted it will be sorted internally so that mate
|information between paired-end reads can be corrected before writing the output file.
|
|The `--first-of-pair` option will cause only the first of pair (R1) reads to be trimmed
|based solely on the primer location of R1. This is useful when there is a target
|specific primer on the 5' end of R1 but no primer sequenced on R2 (eg. single gene-specific
|primer target enrichment). In this case, the location of each target specific primer should
|be specified in an amplicons left or right primer exclusively. The coordinates of the
|non-specific-target primer should be `-1` for both start and end, e.g:
|
|```
|chrom left_start left_end right_start right_end
|chr1 1010873 1010894 -1 -1
|chr2 -1 -1 1011118 1011137
|```
""")
class TrimPrimers
( @arg(flag='i', doc="Input BAM file.") val input: PathToBam,
Expand All @@ -68,7 +81,9 @@ class TrimPrimers
@arg(flag='S', doc="Match to primer locations +/- this many bases.") val slop: Int = 5,
@arg(flag='s', doc="Sort order of output BAM file (defaults to input sort order).") val sortOrder: Option[SamOrder] = None,
@arg(flag='r', doc="Optional reference fasta for recalculating NM, MD and UQ tags.") val ref: Option[PathToFasta] = None,
@arg(flag='a', doc="Automatically trim extended attributes that are the same length as bases.") val autoTrimAttributes: Boolean = false
@arg(flag='a', doc="Automatically trim extended attributes that are the same length as bases.") val autoTrimAttributes: Boolean = false,
@arg(doc="Trim only first of pair reads (R1s), otherwise both ends of a pair") val firstOfPair: Boolean = false

)extends FgBioTool with LazyLogging {
private val clipper = new SamRecordClipper(mode=if (hardClip) ClippingMode.Hard else ClippingMode.Soft, autoClipAttributes=autoTrimAttributes)

Expand Down Expand Up @@ -107,6 +122,21 @@ class TrimPrimers
unclippedCoordinates = true
)

// Validate that if we are trimming the first of pair, all amplicons have -1 coordinates for either the left or the
// right primer. Otherwise, coordinates must be > 0.
detector.detector.getAll.iterator().foreach { amplicon =>
if (firstOfPair) {
require(amplicon.leftStart == -1 || amplicon.rightStart == -1,
f"Either the left or right amplicon coordinates must be -1 when using --first-of-pair. Found ${amplicon.mkString("\t")}"
)
}
else {
require(amplicon.leftStart > 0 && amplicon.rightStart > 0,
f"Both the left and right amplicon coordinates must be > 0. Did you forget to set '--first-of-pair'? Found ${amplicon.mkString("\t")}"
)
}
}

// Main processing loop
val iterator = queryNameOrderIterator(in)
val trimProgress = ProgressLogger(this.logger, "Trimmed")
Expand Down Expand Up @@ -152,28 +182,30 @@ class TrimPrimers
val rec1 = reads.find(r => r.paired && r.firstOfPair && !r.secondary && !r.supplementary)
val rec2 = reads.find(r => r.paired && r.secondOfPair && !r.secondary && !r.supplementary)

val readsToClip = if (firstOfPair) reads.filter(_.firstOfPair) else reads

(rec1, rec2) match {
case (Some(r1), Some(r2)) =>
// FR mapped pairs get the full treatment
if (r1.isFrPair) {
detector.find(r1=r1, r2=r2) match {
(if (firstOfPair) detector.findPrimer(rec=r1) else detector.find(r1=r1, r2=r2)) match {
case Some(amplicon) =>
val leftClip = amplicon.leftPrimerLength
val rightClip = amplicon.rightPrimerLength
reads.foreach { rec =>
readsToClip.foreach { rec =>
// Note: r1.positiveStrand means that r1 is the "left" read, so we should clip on the left
val toClip = if (rec.firstOfPair == r1.positiveStrand) leftClip else rightClip
this.clipper.clip5PrimeEndOfRead(rec, toClip)
}
case None =>
reads.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
readsToClip.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
}

clipFullyOverlappedFrReads(r1, r2)
}
// Pairs without both reads mapped in FR orientation are just maximally clipped
else {
reads.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
readsToClip.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
}

SamPairUtil.setMateInfo(r1.asSam, r2.asSam, true)
Expand All @@ -183,7 +215,7 @@ class TrimPrimers
}
case _ =>
// Just trim each read independently
reads.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
readsToClip.foreach(r => this.clipper.clip5PrimeEndOfRead(r, detector.maxPrimerLength))
}
}

Expand Down
17 changes: 10 additions & 7 deletions src/main/scala/com/fulcrumgenomics/util/Amplicon.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,20 +59,23 @@ case class Amplicon
private val left_start: Int,
private val left_end: Int,
private val right_start: Int,
private val right_end: Int, private val
id: Option[String] = None
private val right_end: Int,
private val id: Option[String] = None
) extends GenomicSpan with Metric {
require(leftStart <= leftEnd, f"leftStart is > leftEnd: $this")
require(rightStart <= rightEnd, f"rightStart is > rightEnd: $this")
require(leftStart <= rightStart, f"leftStart is > rightStart: $this")
require((leftStart == -1) == (leftEnd == -1))
require((rightStart == -1) == (rightEnd == -1))
require(leftStart != -1 || rightStart != -1)
require(leftStart == -1 || leftStart <= leftEnd, f"leftStart is > leftEnd: $this")
require(rightStart == -1 || rightStart <= rightEnd, f"rightStart is > rightEnd: $this")
require(leftStart == -1 || rightStart == -1 || leftStart <= rightStart, f"leftStart is > rightStart: $this")

@inline def leftStart: Int = left_start
@inline def leftEnd: Int = left_end
@inline def rightStart: Int = right_start
@inline def rightEnd: Int = right_end
@inline def contig: String = chrom
@inline def start: Int = leftStart
@inline def end: Int = rightEnd
@inline def start: Int = if (leftStart == -1) rightStart else leftStart
@inline def end: Int = if (rightEnd == -1) leftEnd else rightEnd

def leftPrimerLength: Int = CoordMath.getLength(leftStart, leftEnd)
def rightPrimerLength: Int = CoordMath.getLength(rightStart, rightEnd)
Expand Down
65 changes: 63 additions & 2 deletions src/test/scala/com/fulcrumgenomics/bam/TrimPrimersTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@
package com.fulcrumgenomics.bam

import java.nio.file.Paths

import com.fulcrumgenomics.FgBioDef._
import com.fulcrumgenomics.bam.api.SamOrder
import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord}
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.util.{Amplicon, Io, Metric}
Expand Down Expand Up @@ -199,4 +198,66 @@ class TrimPrimersTest extends UnitSpec {
new TrimPrimers(input=bam, output=bam, primers=primerFile, hardClip=false).execute()
}
}

it should "trim only R1s when --first-of-pair is given" in {
val amplicons: Seq[Amplicon] = Seq(
Amplicon("chr1", 100, 119, -1, -1),
Amplicon("chr1", -1, -1, 300, 320),
Amplicon("chr1", -1, -1, 601, 619),
)
val primers: FilePath = {
val tmp = makeTempFile("primers.", ".txt")
Metric.write(path=tmp, amplicons)
tmp
}

val builder = new SamBuilder(readLength=readLength, sort=Some(SamOrder.Coordinate))

// amplicon 1 - R1 should be trimmed (matches left primer)
builder.addPair("q1", start1=100, start2=CoordMath.getStart(219, readLength), strand1=Plus, strand2=Minus)
// amplicon 1 - R1 should be trimmed the maximum, since it matches the right primer (0-length)
builder.addPair("q2", start2=100, start1=CoordMath.getStart(200, readLength), strand1=Minus, strand2=Plus)
// amplicon 1 - R1 should be trimmed (matches left primer), since R2's coordinate is not considered
builder.addPair("q3", start1=100, start2=500, strand1=Plus, strand2=Minus)

// amplicon 2 - R1 should be trimmed (matches right primer)
builder.addPair("q4", start1=CoordMath.getStart(619, readLength), start2=500, strand1=Minus, strand2=Plus)
// amplicon 2 - R1 should be trimmed the maximum, since it matches the left primer (0-length)
builder.addPair("q5", start2=CoordMath.getStart(619, readLength), start1=500, strand1=Plus, strand2=Minus)
// amplicon 2 - R1 should be trimmed (matches right primer), since R2's coordinate is not considered
builder.addPair("q6", start1=CoordMath.getStart(619, readLength), start2=400, strand1=Minus, strand2=Plus)

val bam = builder.toTempFile()
val newBam = makeTempFile("trimmed.", ".bam")
new TrimPrimers(input=bam, output=newBam, primers=primers, hardClip=false, firstOfPair=true).execute()

val reads = readBamRecs(newBam).sortBy(rec => (rec.name, rec.secondOfPair))
reads should have size 12

def validate(rec: SamRecord, name: String, firstOfPair: Boolean, fivePrimeSoftClipLength: Int = 0): Unit = {
rec.name shouldBe name
rec.firstOfPair shouldBe firstOfPair
val elem = if (rec.negativeStrand) rec.cigar.elems.last else rec.cigar.elems.head
if (rec.firstOfPair) {
elem.operator shouldBe Op.SOFT_CLIP
elem.length shouldBe fivePrimeSoftClipLength
}
else {
elem.operator should not be Op.SOFT_CLIP
}
}

validate(rec=reads(0), name="q1", firstOfPair=true, fivePrimeSoftClipLength=20) // amplicon 1
validate(rec=reads(1), name="q1", firstOfPair=false)
validate(rec=reads(2), name="q2", firstOfPair=true, fivePrimeSoftClipLength=21) // maximum
validate(rec=reads(3), name="q2", firstOfPair=false)
validate(rec=reads(4), name="q3", firstOfPair=true, fivePrimeSoftClipLength=20) // amplicon 1
validate(rec=reads(5), name="q3", firstOfPair=false)
validate(rec=reads(6), name="q4", firstOfPair=true, fivePrimeSoftClipLength=19) // amplicon 2
validate(rec=reads(7), name="q4", firstOfPair=false)
validate(rec=reads(8), name="q5", firstOfPair=true, fivePrimeSoftClipLength=21) // maximum
validate(rec=reads(9), name="q5", firstOfPair=false)
validate(rec=reads(10), name="q6", firstOfPair=true, fivePrimeSoftClipLength=19) // amplicon 2
validate(rec=reads(11), name="q6", firstOfPair=false)
}
}

0 comments on commit e91698c

Please sign in to comment.