From 27b140d54226ef73dec3084015bc6f73dfddb79c Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 10:52:47 -0800 Subject: [PATCH 1/5] feat: output umi grouping metrics --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 29 +++++++++++++++++++ .../umi/GroupReadsByUmiTest.scala | 5 +++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index aece16860..ddbcc9c71 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -466,6 +466,20 @@ case class TagFamilySizeMetric(family_size: Int, var fraction: Proportion = 0, var fraction_gt_or_eq_family_size: Proportion = 0) extends Metric +/** + * Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping + * @param reads The number of reads accepted for grouping + * @param filteredNonPf The number of non-PF reads + * @param filteredPoorAlignment The number of templates discarded for poor + * @param filteredNsInUmi The number of templates discarded due to one or more Ns in the UMI + * @param filterUmisTooShort The number of templates discarded due to a shorter than expected UMI + */ +case class UmiGroupingMetric(reads: Long, + filteredNonPf: Long, + filteredPoorAlignment: Long, + filteredNsInUmi: Long, + filterUmisTooShort: Long) extends Metric + /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/ sealed trait Strategy extends EnumEntry { def newStrategy(edits: Int, threads: Int): UmiAssigner @@ -568,6 +582,7 @@ class GroupReadsByUmi (@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn, @arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut, @arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None, + @arg(flag='g', doc="Optional output of UMI grouping metrics.") val groupingMetrics: Option[FilePath] = None, @arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = ConsensusTags.UmiBases, @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = ConsensusTags.MolecularId, @arg(flag='d', doc="Turn on duplicate marking mode.") val markDuplicates: Boolean = false, @@ -742,6 +757,20 @@ class GroupReadsByUmi ms.tails.foreach { tail => tail.headOption.foreach(m => m.fraction_gt_or_eq_family_size = tail.map(_.fraction).sum) } Metric.write(p, ms) } + + // Write out UMI grouping metrics + this.groupingMetrics match { + case None => () + case Some(p) => + val groupingMetrics = UmiGroupingMetric( + reads = kept, + filteredNonPf = filteredNonPf, + filteredPoorAlignment = filteredPoorAlignment, + filteredNsInUmi = filteredNsInUmi, + filterUmisTooShort = filterUmisTooShort, + ) + Metric.write(p, groupingMetrics) + } } private def logStats(): Unit = { diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 09c8ce0b6..4d38217ed 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -247,7 +247,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT val in = builder.toTempFile() val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) + val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt") + val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30)) val logs = executeFgbioTool(tool) val groups = readBamRecs(out).groupBy(_.name.charAt(0)) @@ -267,6 +268,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true + metrics.toFile.exists() shouldBe true + // Make sure that we skip sorting for TemplateCoordinate val sortMessage = "Sorting the input to TemplateCoordinate order" logs.exists(_.contains(sortMessage)) shouldBe (sortOrder != TemplateCoordinate) From ca81af8ad38ce04847af194ca77210978acb5040 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 11:26:19 -0800 Subject: [PATCH 2/5] refactor: variable names --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index ddbcc9c71..615983593 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -468,17 +468,17 @@ case class TagFamilySizeMetric(family_size: Int, /** * Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping - * @param reads The number of reads accepted for grouping - * @param filteredNonPf The number of non-PF reads - * @param filteredPoorAlignment The number of templates discarded for poor - * @param filteredNsInUmi The number of templates discarded due to one or more Ns in the UMI - * @param filterUmisTooShort The number of templates discarded due to a shorter than expected UMI + * @param sam_records The number of SAM records accepted for grouping. + * @param filtered_non_pf The number of non-PF SAM records. + * @param filtered_poor_alignment The number of SAM records discarded for poor alignment. + * @param filtered_ns_in_umi The number of SAM records discarded due to one or more Ns in the UMI. + * @param filtered_umis_to_short The number of SAM records discarded due to a shorter than expected UMI. */ -case class UmiGroupingMetric(reads: Long, - filteredNonPf: Long, - filteredPoorAlignment: Long, - filteredNsInUmi: Long, - filterUmisTooShort: Long) extends Metric +case class UmiGroupingMetric(sam_records: Long, + filtered_non_pf: Long, + filtered_poor_alignment: Long, + filtered_ns_in_umi: Long, + filtered_umis_to_short: Long) extends Metric /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/ sealed trait Strategy extends EnumEntry { @@ -758,27 +758,25 @@ class GroupReadsByUmi Metric.write(p, ms) } - // Write out UMI grouping metrics - this.groupingMetrics match { - case None => () - case Some(p) => - val groupingMetrics = UmiGroupingMetric( - reads = kept, - filteredNonPf = filteredNonPf, - filteredPoorAlignment = filteredPoorAlignment, - filteredNsInUmi = filteredNsInUmi, - filterUmisTooShort = filterUmisTooShort, - ) - Metric.write(p, groupingMetrics) + // Write out UMI grouping metrics + this.groupingMetrics.foreach { path => + val groupingMetrics = UmiGroupingMetric( + sam_records = kept, + filtered_non_pf = filteredNonPf, + filtered_poor_alignment = filteredPoorAlignment, + filtered_ns_in_umi = filteredNsInUmi, + filtered_umis_to_short = filterUmisTooShort, + ) + Metric.write(path, groupingMetrics) } } private def logStats(): Unit = { - logger.info(f"Accepted $kept%,d reads for grouping.") - if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") - logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") - logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") - this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } + logger.info(f"Accepted $kept%,d SAM records for grouping.") + if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF SAM records.") + logger.info(f"Filtered out $filteredPoorAlignment%,d SAM records due to mapping issues.") + logger.info(f"Filtered out $filteredNsInUmi%,d SAM records that contained one or more Ns in their UMIs.") + this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d SAM records that contained UMIs that were too short.") } } /** Consumes the next group of templates with all matching end positions and returns them. */ From e1dcbabb6ad8dadc825537ef22914b1872e85337 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 11:53:00 -0800 Subject: [PATCH 3/5] test: check metric values --- .../scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 4d38217ed..19ddc76d5 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -33,6 +33,7 @@ import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ +import com.fulcrumgenomics.util.Metric import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -268,7 +269,9 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true - metrics.toFile.exists() shouldBe true + // TODO: Create a more comprehensive test case + val expectedMetric = UmiGroupingMetric(sam_records = 10, filtered_non_pf = 0, filtered_poor_alignment = 2, filtered_ns_in_umi = 0, filtered_umis_to_short = 0) + Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) // Make sure that we skip sorting for TemplateCoordinate val sortMessage = "Sorting the input to TemplateCoordinate order" From a8e84791bf17500c6c23339ee2fa372f8c0ad728 Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 12:45:31 -0800 Subject: [PATCH 4/5] refactor: rename attributes --- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 34 +++++++++---------- .../umi/GroupReadsByUmiTest.scala | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 615983593..faa563844 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -468,17 +468,17 @@ case class TagFamilySizeMetric(family_size: Int, /** * Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping - * @param sam_records The number of SAM records accepted for grouping. - * @param filtered_non_pf The number of non-PF SAM records. - * @param filtered_poor_alignment The number of SAM records discarded for poor alignment. - * @param filtered_ns_in_umi The number of SAM records discarded due to one or more Ns in the UMI. - * @param filtered_umis_to_short The number of SAM records discarded due to a shorter than expected UMI. + * @param accepted_sam_records The number of SAM records accepted for grouping. + * @param discarded_non_pf The number of discarded non-PF SAM records. + * @param discarded_poor_alignment The number of SAM records discarded for poor alignment. + * @param discarded_ns_in_umi The number of SAM records discarded due to one or more Ns in the UMI. + * @param discarded_umis_to_short The number of SAM records discarded due to a shorter than expected UMI. */ -case class UmiGroupingMetric(sam_records: Long, - filtered_non_pf: Long, - filtered_poor_alignment: Long, - filtered_ns_in_umi: Long, - filtered_umis_to_short: Long) extends Metric +case class UmiGroupingMetric(accepted_sam_records: Long, + discarded_non_pf: Long, + discarded_poor_alignment: Long, + discarded_ns_in_umi: Long, + discarded_umis_to_short: Long) extends Metric /** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/ sealed trait Strategy extends EnumEntry { @@ -760,13 +760,13 @@ class GroupReadsByUmi // Write out UMI grouping metrics this.groupingMetrics.foreach { path => - val groupingMetrics = UmiGroupingMetric( - sam_records = kept, - filtered_non_pf = filteredNonPf, - filtered_poor_alignment = filteredPoorAlignment, - filtered_ns_in_umi = filteredNsInUmi, - filtered_umis_to_short = filterUmisTooShort, - ) + val groupingMetrics = UmiGroupingMetric( + accepted_sam_records = kept, + discarded_non_pf = filteredNonPf, + discarded_poor_alignment = filteredPoorAlignment, + discarded_ns_in_umi = filteredNsInUmi, + discarded_umis_to_short = filterUmisTooShort, + ) Metric.write(path, groupingMetrics) } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 19ddc76d5..dd4dcedb6 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -270,7 +270,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true // TODO: Create a more comprehensive test case - val expectedMetric = UmiGroupingMetric(sam_records = 10, filtered_non_pf = 0, filtered_poor_alignment = 2, filtered_ns_in_umi = 0, filtered_umis_to_short = 0) + val expectedMetric = UmiGroupingMetric(accepted_sam_records = 10, discarded_non_pf = 0, discarded_poor_alignment = 2, discarded_ns_in_umi = 0, discarded_umis_to_short = 0) Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) // Make sure that we skip sorting for TemplateCoordinate From 6b427fe43a9bd1d5b28b3e350b45be8ace555d9b Mon Sep 17 00:00:00 2001 From: znorgaard Date: Mon, 16 Dec 2024 12:50:49 -0800 Subject: [PATCH 5/5] test: check metric in additional tests --- .../com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index dd4dcedb6..2fb0526bf 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -269,7 +269,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT hist.toFile.exists() shouldBe true - // TODO: Create a more comprehensive test case + // TODO: Consider creating more unit tests that vary the following metric fields val expectedMetric = UmiGroupingMetric(accepted_sam_records = 10, discarded_non_pf = 0, discarded_poor_alignment = 2, discarded_ns_in_umi = 0, discarded_umis_to_short = 0) Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) @@ -430,8 +430,12 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT builder.addPair(name="a03", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ANN")) val in = builder.toTempFile() + val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt") val out = Files.createTempFile("umi_grouped.", ".bam") - new GroupReadsByUmi(input=in, output=out, rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute() + new GroupReadsByUmi(input=in, output=out, groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute() + + val expectedMetric = UmiGroupingMetric(accepted_sam_records = 4, discarded_non_pf = 0, discarded_poor_alignment = 0, discarded_ns_in_umi = 2, discarded_umis_to_short = 0) + Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric) readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02") }