From cd1460b5f96c59b46de2804b4fd4fc60beb30df5 Mon Sep 17 00:00:00 2001
From: Nils Homer <nilshomer@gmail.com>
Date: Mon, 29 Jan 2024 17:31:01 -0700
Subject: [PATCH] fix: properly template-coordinate sort and duplicate mark
 secondary and supplementary reads

Secondary and supplementary reads must use the coordinates of the
primary alignments within the template, otherwise they will not
guaranteed to be next the primary alignments in the file.  Therefore,
we've added the "rp" and "mp" tags to store the SA-tag equivalent
information for the primary alignment.  This keeps information about the
primary alignments with the secondary and supplementary alignments.
---
 .../scala/com/fulcrumgenomics/bam/Bams.scala  | 49 ++++++++++++--
 .../fulcrumgenomics/bam/api/SamOrder.scala    | 65 ++++++++++++++-----
 .../fulcrumgenomics/umi/GroupReadsByUmi.scala |  2 +-
 .../bam/api/SamOrderTest.scala                | 17 +++--
 .../umi/GroupReadsByUmiTest.scala             | 33 +++++++---
 5 files changed, 130 insertions(+), 36 deletions(-)

diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala
index 61d6d52aa..d50f946d8 100644
--- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala
+++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala
@@ -25,6 +25,7 @@
 package com.fulcrumgenomics.bam
 
 import com.fulcrumgenomics.FgBioDef._
+import com.fulcrumgenomics.alignment.Cigar
 import com.fulcrumgenomics.bam.api.SamOrder.Queryname
 import com.fulcrumgenomics.bam.api._
 import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator}
@@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil}
 import java.io.Closeable
 import scala.math.{max, min}
 
+
+
+case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) {
+  def negativeStrand: Boolean = !positiveStrand
+  def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex
+
+  def end: Int = start + cigar.lengthOnTarget - 1
+  def unclippedStart: Int = {
+    SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar)
+  }
+
+  def unclippedEnd: Int = {
+    SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar)
+  }
+}
+
+object Supplementary {
+  /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */
+  def toString(rec: SamRecord): String = {
+    val strand = if (rec.positiveStrand) '+' else '-'
+    f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}"
+  }
+
+
+  def apply(sa: String): Supplementary = {
+    val parts = sa.split(",")
+    Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt)
+  }
+}
+
 /**
   * Class that represents all reads from a template within a BAM file.
   */
@@ -107,11 +138,21 @@ case class Template(r1: Option[SamRecord],
 
   /** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */
   def fixMateInfo(): Unit = {
-    for (primary <- r1; supp <- r2Supplementals) {
-      SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
+    // Set all mate info on BOTH secondary and supplementary records, not just supplementary records.  We also need to
+    // add the "pa" and "pm" tags with information about the primary alignments.  Finally, we need the MQ tag!
+    val r1NonPrimary = r1Supplementals ++ r1Secondaries
+    val r2NonPrimary = r2Supplementals ++ r2Secondaries
+    for (primary <- r1; nonPrimary <- r2NonPrimary) {
+      SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
+      nonPrimary(SAMTag.MQ.name()) = primary.mapq
+      nonPrimary("mp") = Supplementary.toString(primary)
+      r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
     }
-    for (primary <- r2; supp <- r1Supplementals) {
-      SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true)
+    for (primary <- r2; nonPrimary <- r1NonPrimary) {
+      SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true)
+      nonPrimary(SAMTag.MQ.name()) = primary.mapq
+      nonPrimary("mp") = Supplementary.toString(primary)
+      r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r))
     }
     for (first <- r1; second <- r2) {
       SamPairUtil.setMateInfo(first.asSam, second.asSam, true)
diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
index 6aff8d844..2f147618d 100644
--- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
+++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala
@@ -24,12 +24,15 @@
 
 package com.fulcrumgenomics.bam.api
 
+import com.fulcrumgenomics.bam.{Bams, Supplementary}
 import com.fulcrumgenomics.umi.ConsensusTags
 import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
 import htsjdk.samtools.util.Murmur3
 import htsjdk.samtools.{SAMFileHeader, SAMUtils}
 import org.apache.commons.math3.genetics.RandomKey
 
+import scala.reflect.runtime.universe.Template
+
 
 /** Trait for specifying BAM orderings. */
 sealed trait SamOrder extends Product {
@@ -175,24 +178,50 @@ object SamOrder {
     override val groupOrder: GroupOrder     = GroupOrder.query
     override val subSort:    Option[String] = Some("template-coordinate")
     override val sortkey: SamRecord => A = rec => {
-      val readChrom = if (rec.unmapped)     Int.MaxValue else rec.refIndex
-      val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
-      val readNeg   = rec.negativeStrand
-      val mateNeg   = if (rec.paired) rec.mateNegativeStrand else false
-      val readPos   = if (rec.unmapped)     Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
-      val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
-      val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
-      val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
-        val index: Int = m.lastIndexOf('/')
-        if (index >= 0) m.substring(0, index) else m
-      }.getOrElse("")
-
-      if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
-           (readChrom == mateChrom && readPos == matePos && !readNeg)) {
-        TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
-      }
-      else {
-        TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+      // For non-secondary/non-supplementary alignments, use the info in the record.  For secondary and supplementary
+      // alignments, use the info in the pa/pm tags.
+      if (!rec.secondary && !rec.supplementary) {
+        val readChrom = if (rec.unmapped)     Int.MaxValue else rec.refIndex
+        val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex
+        val readNeg   = rec.negativeStrand
+        val mateNeg   = if (rec.paired) rec.mateNegativeStrand else false
+        val readPos   = if (rec.unmapped)     Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart
+        val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam)
+        val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
+        val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
+          val index: Int = m.lastIndexOf('/')
+          if (index >= 0) m.substring(0, index) else m
+        }.getOrElse("")
+
+        if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
+             (readChrom == mateChrom && readPos == matePos && !readNeg)) {
+          TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
+        }
+        else {
+          TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+        }
+      } else {
+        val primary   = Supplementary(rec[String]("rp"))
+        val mate      = Supplementary(rec[String]("mp"))
+        val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header)
+        val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header)
+        val readNeg   = primary.negativeStrand
+        val mateNeg   = if (rec.paired) mate.negativeStrand else false
+        val readPos   = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart
+        val matePos   = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart
+        val lib       = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown")
+        val mid       = rec.get[String](ConsensusTags.MolecularId).map { m =>
+          val index: Int = m.lastIndexOf('/')
+          if (index >= 0) m.substring(0, index) else m
+        }.getOrElse("")
+
+        if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) ||
+          (readChrom == mateChrom && readPos == matePos && !readNeg)) {
+          TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false)
+        }
+        else {
+          TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true)
+        }
       }
     }
   }
diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
index aece16860..498245e6b 100644
--- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
+++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
@@ -719,7 +719,7 @@ class GroupReadsByUmi
 
       // Then output the records in the right order (assigned tag, read name, r1, r2)
       templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => {
-        templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => {
+        templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => {
           out += rec
         })
       })
diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
index 2545f2ee0..b5eda09e2 100644
--- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala
@@ -25,11 +25,13 @@
 package com.fulcrumgenomics.bam.api
 
 import java.util.Random
-
 import com.fulcrumgenomics.FgBioDef._
+import com.fulcrumgenomics.bam.Bams
+import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator}
 import com.fulcrumgenomics.commons.util.SimpleCounter
 import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
 import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
+import com.fulcrumgenomics.util.Io
 import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator}
 import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder}
 import htsjdk.samtools.util.Murmur3
@@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec {
     s2.supplementary = true
     s2.properlyPaired = true
     exp += s2
-    // primary read pairs for q1, that map to different contigs, but earlier that q1
+    // primary read pairs for q2, that map to different contigs, but earlier that q1
     exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S")
 
-    val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2")
-    val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
+    // Fix the mate information.  Note: sorting here to get a template-iterator will write the records to disk first,
+    // so we cannot use the records in builder/exp.
+    val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template =>
+      template.fixMateInfo()
+      template.allReads
+    }.toList
+
+    val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup")
+    val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id)
 
     actual should contain theSameElementsInOrderAs expected
   }
diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
index 4c0e27748..003ead8bb 100644
--- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
+++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
@@ -26,13 +26,14 @@
  */
 package com.fulcrumgenomics.umi
 
-import com.fulcrumgenomics.bam.Template
-import com.fulcrumgenomics.bam.api.SamOrder
+import com.fulcrumgenomics.bam.{Bams, Template}
+import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter}
 import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate
 import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
 import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
 import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
 import com.fulcrumgenomics.umi.GroupReadsByUmi._
+import com.fulcrumgenomics.util.Io
 import org.scalatest.{OptionValues, PrivateMethodTester}
 
 import java.nio.file.Files
@@ -335,7 +336,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
   }
 
   it should "mark duplicates on supplementary reads" in {
-    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate))
+    val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate))
     // primary read pairs for q1, that map to different contigs
     builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT"))
     // supplementary R2, which maps to the same chromosome as the primary R1
@@ -343,21 +344,35 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
     s1.supplementary = true
     s2.supplementary = true
     s2.properlyPaired = true
-    // primary read pairs for q1, that map to different contigs, but earlier that q1
+    // primary read pairs for q2, that map to different contigs, but earlier that q1
     builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
 
-    val in = builder.toTempFile()
-    val out = Files.createTempFile("umi_grouped.", ".sam")
+    // primary read pairs for q3, that are duplicates of q2
+    builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT"))
+
+    // Fix the mate information and write the input.  Note: sorting here to get a template-iterator will write the
+    // records to disk first, so we cannot use the records in builder and therefore must write to the input file
+    // directly.
+    val in = Files.createTempFile("raw_reads", ".sam")
+    val writer = SamWriter(in, builder.header, sort = builder.sort)
+    Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template =>
+      template.fixMateInfo()
+      writer ++= template.allReads
+    }
+    writer.close()
+
+    val out  = Files.createTempFile("umi_grouped.", ".sam")
     val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
-    val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true))
+    val gr   = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true)
 
     gr.markDuplicates shouldBe true
     gr.execute()
 
     val recs = readBamRecs(out)
-    recs.length shouldBe 4
-    recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true
+    recs.length shouldBe 8
+    recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true
     recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true
+    recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true
   }
 
   it should "correctly group reads with the paired assigner when the two UMIs are the same" in {