From f4f445c80b8fad83f6a14ef986e787c3991437f7 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Mon, 29 Jan 2024 15:15:59 -0700 Subject: [PATCH 1/6] add a test to show issue with SamOrder# --- .../bam/api/SamOrderTest.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala index 2170ec351..2545f2ee0 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala @@ -217,4 +217,23 @@ class SamOrderTest extends UnitSpec { actual should contain theSameElementsInOrderAs expected } + + it should "handle supplementary alignments" in { + val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Queryname)) + val exp = ListBuffer[SamRecord]() + // primary read pairs for q1, that map to different contigs + exp ++= builder.addPair("q1", contig=1, contig2=Some(2), start1=66, start2=47, cigar1="60M40S", cigar2="55M45S", strand2=Plus) + // supplementary R2 (ignore R1), which maps to the same chromosome as the primary R1 + val Seq(_, s2) = builder.addPair("q1", contig=1, contig2=Some(1), start1=66, start2=66, cigar1="60M40S", strand2=Minus) + s2.supplementary = true + s2.properlyPaired = true + exp += s2 + // primary read pairs for q1, that map to different contigs, but earlier that q1 + exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S") + + val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2") + val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) + + actual should contain theSameElementsInOrderAs expected + } } From 86378987b1101ad46102829089f4fd071ae1325d Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Mon, 29 Jan 2024 15:41:20 -0700 Subject: [PATCH 2/6] commit to show how GroupReadsByUmi fails --- .../umi/GroupReadsByUmiTest.scala | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 2fb0526bf..13759307c 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -344,6 +344,32 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT recs.filter(_.name.equals("a04")).forall(_.duplicate == true) shouldBe true } + it should "mark duplicates on supplementary reads" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate)) + // primary read pairs for q1, that map to different contigs + builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT")) + // supplementary R2, which maps to the same chromosome as the primary R1 + val Seq(s1, s2) = builder.addPair("q1", contig = 1, contig2 = Some(1), start1 = 66, start2 = 66, cigar1 = "60M40S", strand2 = Minus, attrs = Map("RX" -> "ACT")) + s1.supplementary = true + s2.supplementary = true + s2.properlyPaired = true + // primary read pairs for q1, that map to different contigs, but earlier that q1 + builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true)) + + gr.markDuplicates shouldBe true + gr.execute() + + val recs = readBamRecs(out) + recs.length shouldBe 4 + recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true + recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true + } + it should "correctly group reads with the paired assigner when the two UMIs are the same" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT")) From 75ac4bd8c25caa17cea1cd13e392212881d17ad3 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 4 Feb 2025 19:09:35 -0700 Subject: [PATCH 3/6] fix: properly template-coordinate sort and duplicate mark secondary and supplementary reads Secondary and supplementary reads must use the coordinates of the primary alignments within the template, otherwise they will not guaranteed to be next the primary alignments in the file. Therefore, we've added the "rp" and "mp" tags to store the SA-tag equivalent information for the primary alignment. This keeps information about the primary alignments with the secondary and supplementary alignments. --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 53 +++++++++++++-- .../fulcrumgenomics/bam/api/SamOrder.scala | 65 ++++++++++++++----- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 2 +- .../bam/api/SamOrderTest.scala | 17 +++-- .../umi/GroupReadsByUmiTest.scala | 33 +++++++--- 5 files changed, 132 insertions(+), 38 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 623b97d76..099af6f42 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -25,6 +25,7 @@ package com.fulcrumgenomics.bam import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.bam.api.SamOrder.Queryname import com.fulcrumgenomics.bam.api._ import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator} @@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil} import java.io.Closeable import scala.math.{max, min} + + +case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) { + def negativeStrand: Boolean = !positiveStrand + def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex + + def end: Int = start + cigar.lengthOnTarget - 1 + def unclippedStart: Int = { + SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar) + } + + def unclippedEnd: Int = { + SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar) + } +} + +object Supplementary { + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toString(rec: SamRecord): String = { + val strand = if (rec.positiveStrand) '+' else '-' + f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}" + } + + + def apply(sa: String): Supplementary = { + val parts = sa.split(",") + Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt) + } +} + /** * Class that represents all reads from a template within a BAM file. */ @@ -108,13 +139,23 @@ case class Template(r1: Option[SamRecord], /** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */ def fixMateInfo(): Unit = { // Developer note: the mate score ("ms") tag is used by samtools markdup - for (primary <- r1; supp <- r2Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) - primary.get[Int]("AS").foreach(supp("ms") = _) + // Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to + // add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag! + val r1NonPrimary = r1Supplementals ++ r1Secondaries + val r2NonPrimary = r2Supplementals ++ r2Secondaries + for (primary <- r1; nonPrimary <- r2NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(nonPrimary("ms") = _) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } - for (primary <- r2; supp <- r1Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) - primary.get[Int]("AS").foreach(supp("ms") = _) + for (primary <- r2; nonPrimary <- r1NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + primary.get[Int]("AS").foreach(nonPrimary("ms") = _) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala index 6aff8d844..2f147618d 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala @@ -24,12 +24,15 @@ package com.fulcrumgenomics.bam.api +import com.fulcrumgenomics.bam.{Bams, Supplementary} import com.fulcrumgenomics.umi.ConsensusTags import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 import htsjdk.samtools.{SAMFileHeader, SAMUtils} import org.apache.commons.math3.genetics.RandomKey +import scala.reflect.runtime.universe.Template + /** Trait for specifying BAM orderings. */ sealed trait SamOrder extends Product { @@ -175,24 +178,50 @@ object SamOrder { override val groupOrder: GroupOrder = GroupOrder.query override val subSort: Option[String] = Some("template-coordinate") override val sortkey: SamRecord => A = rec => { - val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex - val readNeg = rec.negativeStrand - val mateNeg = if (rec.paired) rec.mateNegativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => - val index: Int = m.lastIndexOf('/') - if (index >= 0) m.substring(0, index) else m - }.getOrElse("") - - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) - } - else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + // For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary + // alignments, use the info in the pa/pm tags. + if (!rec.secondary && !rec.supplementary) { + val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex + val readNeg = rec.negativeStrand + val mateNeg = if (rec.paired) rec.mateNegativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } + } else { + val primary = Supplementary(rec[String]("rp")) + val mate = Supplementary(rec[String]("mp")) + val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header) + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header) + val readNeg = primary.negativeStrand + val mateNeg = if (rec.paired) mate.negativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } } } } diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index faa563844..dabd3b73a 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -734,7 +734,7 @@ class GroupReadsByUmi // Then output the records in the right order (assigned tag, read name, r1, r2) templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => { - templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => { + templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => { out += rec }) }) diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala index 2545f2ee0..b5eda09e2 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala @@ -25,11 +25,13 @@ package com.fulcrumgenomics.bam.api import java.util.Random - import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator} import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.util.Io import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator} import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 @@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec { s2.supplementary = true s2.properlyPaired = true exp += s2 - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S") - val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2") - val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) + // Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first, + // so we cannot use the records in builder/exp. + val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template => + template.fixMateInfo() + template.allReads + }.toList + + val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup") + val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) actual should contain theSameElementsInOrderAs expected } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 13759307c..fbf668aa0 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -26,14 +26,15 @@ */ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.Template -import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter} import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ import com.fulcrumgenomics.util.Metric +import com.fulcrumgenomics.util.Io import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -345,7 +346,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT } it should "mark duplicates on supplementary reads" in { - val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate)) + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) // primary read pairs for q1, that map to different contigs builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT")) // supplementary R2, which maps to the same chromosome as the primary R1 @@ -353,21 +354,35 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT s1.supplementary = true s2.supplementary = true s2.properlyPaired = true - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) - val in = builder.toTempFile() - val out = Files.createTempFile("umi_grouped.", ".sam") + // primary read pairs for q3, that are duplicates of q2 + builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + // Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the + // records to disk first, so we cannot use the records in builder and therefore must write to the input file + // directly. + val in = Files.createTempFile("raw_reads", ".sam") + val writer = SamWriter(in, builder.header, sort = builder.sort) + Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template => + template.fixMateInfo() + writer ++= template.allReads + } + writer.close() + + val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true)) + val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true) gr.markDuplicates shouldBe true gr.execute() val recs = readBamRecs(out) - recs.length shouldBe 4 - recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true + recs.length shouldBe 8 + recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true + recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true } it should "correctly group reads with the paired assigner when the two UMIs are the same" in { From f9390ab407aa14b58fc34be1fc3bf20fde594694 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 4 Feb 2025 19:10:06 -0700 Subject: [PATCH 4/6] review fixups --- .../fulcrumgenomics/alignment/Alignment.scala | 14 +++- .../scala/com/fulcrumgenomics/bam/Bams.scala | 74 +++++++++++++------ .../fulcrumgenomics/bam/api/SamOrder.scala | 74 ++++++++----------- .../fulcrumgenomics/bam/api/SamRecord.scala | 2 +- .../alignment/AlignmentTest.scala | 35 +++++++++ 5 files changed, 128 insertions(+), 71 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala index 201951ef4..83b172d15 100644 --- a/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala +++ b/src/main/scala/com/fulcrumgenomics/alignment/Alignment.scala @@ -208,7 +208,7 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { def trailingSoftClippedBases: Int = stats.trailingSoftClippedBases /** Returns the number of bases that are hard-clipped at the start of the sequence. */ - def leadingHardClippedBases = this.headOption.map { elem => + def leadingHardClippedBases: Int = this.headOption.map { elem => if (elem.operator == CigarOperator.H) elem.length else 0 }.getOrElse(0) @@ -216,13 +216,23 @@ case class Cigar(elems: IndexedSeq[CigarElem]) extends Iterable[CigarElem] { def leadingClippedBases: Int = leadingHardClippedBases + leadingSoftClippedBases /** Returns the number of bases that are hard-clipped at the end of the sequence. */ - def trailingHardClippedBases = this.lastOption.map { elem => + def trailingHardClippedBases: Int = this.lastOption.map { elem => if (elem.operator == CigarOperator.H) elem.length else 0 }.getOrElse(0) /** Returns the number of bases that are clipped at the end of the sequence. */ def trailingClippedBases: Int = trailingHardClippedBases + trailingSoftClippedBases + /** Returns the alignment start position adjusted for leading soft and hard clipped bases. */ + def unclippedStart(start: Int): Int = { + start - this.iterator.takeWhile(_.operator.isClipping).map(_.length).sum + } + + /** Returns the alignment end position adjusted for trailing soft and hard clipped bases. */ + def unclippedEnd(end: Int): Int = { + end + this.reverseIterator.takeWhile(_.operator.isClipping).map(_.length).sum + } + /** Truncates the cigar based on either query or target length cutoff. */ private def truncate(len: Int, shouldCount: CigarElem => Boolean): Cigar = { var pos = 1 diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 099af6f42..4eee3c5f8 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -43,33 +43,54 @@ import java.io.Closeable import scala.math.{max, min} - -case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) { +/** Class to store information about an alignment, as described in the SAM SA tag. */ +private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: Boolean, cigar: Option[Cigar], mapq: Int, nm: Int) { def negativeStrand: Boolean = !positiveStrand - def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex - - def end: Int = start + cigar.lengthOnTarget - 1 - def unclippedStart: Int = { - SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar) + def refName(header: SAMFileHeader): String = header.getSequence(refIndex).getSequenceName + def mapped: Boolean = cigar.isDefined + def unmapped: Boolean = !mapped + private def _cigar: Cigar = cigar.getOrElse { + throw new IllegalStateException(s"Cannot get cigar for AlignmentInfo: ${this}") } - - def unclippedEnd: Int = { - SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar) + def end: Int = start + _cigar.lengthOnTarget - 1 + def unclippedStart: Int = _cigar.unclippedStart(start) + def unclippedEnd: Int = _cigar.unclippedEnd(end) + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toSA(header: SAMFileHeader): String = { + val strand = if (positiveStrand) '+' else '-' + val refName = header.getSequence(refIndex).getSequenceName + val cigar = this.cigar.getOrElse("*") + f"${refName},${start},${strand},${cigar},${mapq},${nm}" } } -object Supplementary { - /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ - def toString(rec: SamRecord): String = { - val strand = if (rec.positiveStrand) '+' else '-' - f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}" +private[bam] object AlignmentInfo { + def apply(rec: SamRecord, mate: Boolean = false): AlignmentInfo = { + if (mate) { + val mateRefIndex = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex + val mateCigar = if (rec.unpaired || rec.mateUnmapped) None else Some(rec.mateCigar.getOrElse { + throw new IllegalStateException(s"Mate CIGAR (Tag 'MC') not found for $rec, consider using SetMateInformation.") + }) + // NB: mateCigar has already checked for the existence of the MC tag, so using .get here is fine + val mateStart = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateStart + val mateStrand = if (rec.unpaired || rec.mateUnmapped) true else rec.matePositiveStrand + AlignmentInfo(mateRefIndex, mateStart, mateStrand, mateCigar, rec.mapq, 0) + } else { + val refIndex = if (rec.unmapped) Int.MaxValue else rec.refIndex + val positiveStrand = rec.positiveStrand + val start = if (rec.unmapped) Int.MaxValue else rec.start + AlignmentInfo(refIndex, start, positiveStrand, Some(rec.cigar), rec.mapq, rec.getOrElse(SAMTag.NM.name(), 0)) + } } - - def apply(sa: String): Supplementary = { - val parts = sa.split(",") - Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt) + def apply(sa: String, header: SAMFileHeader): AlignmentInfo = { + val parts = sa.split(",") + require(parts.length == 6, f"Could not parse SA tag: ${sa}") + val refIndex = header.getSequenceIndex(parts(0)) + AlignmentInfo(refIndex, parts(1).toInt, parts(2) == "+", Some(Cigar(parts(3))), parts(4).toInt, parts(5).toInt) } + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toSA(rec: SamRecord): String = AlignmentInfo(rec).toSA(rec.header) } /** @@ -136,7 +157,8 @@ case class Template(r1: Option[SamRecord], Template(x1, x2) } - /** Fixes mate information and sets mate cigar and mate score on all primary and supplementary (but not secondary) records. */ + + /** Fixes mate information and sets mate cigar and mate score on all primary, secondary, and supplementary records. */ def fixMateInfo(): Unit = { // Developer note: the mate score ("ms") tag is used by samtools markdup // Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to @@ -147,15 +169,15 @@ case class Template(r1: Option[SamRecord], SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) primary.get[Int]("AS").foreach(nonPrimary("ms") = _) nonPrimary(SAMTag.MQ.name()) = primary.mapq - nonPrimary("mp") = Supplementary.toString(primary) - r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) + nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary) + r2.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r)) } for (primary <- r2; nonPrimary <- r1NonPrimary) { SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) primary.get[Int]("AS").foreach(nonPrimary("ms") = _) nonPrimary(SAMTag.MQ.name()) = primary.mapq - nonPrimary("mp") = Supplementary.toString(primary) - r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) + nonPrimary(Template.MatePrimarySamTag) = AlignmentInfo.toSA(primary) + r1.foreach(r => nonPrimary(Template.ReadPrimarySamTag) = AlignmentInfo.toSA(r)) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) @@ -169,6 +191,10 @@ case class Template(r1: Option[SamRecord], } object Template { + /** The local SAM tag to store the alignment information of the primary alignment (in the same format as the SA tag) */ + val ReadPrimarySamTag: String = "rp" + /** The local SAM tag to store the alignment information of the mate's primary alignment (in the same format as the SA tag) */ + val MatePrimarySamTag: String = "mp" /** * Generates a Template for the next template in the buffered iterator. Assumes that the * iterator is queryname sorted or grouped. diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala index 2f147618d..9414e6f12 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala @@ -24,15 +24,13 @@ package com.fulcrumgenomics.bam.api -import com.fulcrumgenomics.bam.{Bams, Supplementary} +import com.fulcrumgenomics.bam.{Bams, AlignmentInfo, Template} import com.fulcrumgenomics.umi.ConsensusTags import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 import htsjdk.samtools.{SAMFileHeader, SAMUtils} import org.apache.commons.math3.genetics.RandomKey -import scala.reflect.runtime.universe.Template - /** Trait for specifying BAM orderings. */ sealed trait SamOrder extends Product { @@ -178,50 +176,38 @@ object SamOrder { override val groupOrder: GroupOrder = GroupOrder.query override val subSort: Option[String] = Some("template-coordinate") override val sortkey: SamRecord => A = rec => { - // For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary - // alignments, use the info in the pa/pm tags. - if (!rec.secondary && !rec.supplementary) { - val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex - val readNeg = rec.negativeStrand - val mateNeg = if (rec.paired) rec.mateNegativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => - val index: Int = m.lastIndexOf('/') - if (index >= 0) m.substring(0, index) else m - }.getOrElse("") - - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + // For non-secondary/non-supplementary alignments, or unpaired or mate unmapped reads, use the info in the record. + // Otherwise, use the info in the pa/pm tags. + val primary = if (!rec.secondary && !rec.supplementary) AlignmentInfo(rec) else { + val readPrimaryTag = rec.get[String](Template.ReadPrimarySamTag).getOrElse { + throw new IllegalStateException( + s"Missing '${Template.ReadPrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation" + ) } - else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + AlignmentInfo(readPrimaryTag, rec.header) + } + val mate = if ((!rec.secondary && !rec.supplementary) || rec.unpaired || rec.mateUnmapped) AlignmentInfo(rec, mate=true) else { + val matePrimaryTag = rec.get[String](Template.MatePrimarySamTag).getOrElse { + throw new IllegalStateException( + s"Missing '${Template.MatePrimarySamTag}' tag required for TemplateCoordinate; try using ZipperBams or SetMateInformation" + ) } - } else { - val primary = Supplementary(rec[String]("rp")) - val mate = Supplementary(rec[String]("mp")) - val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header) - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header) - val readNeg = primary.negativeStrand - val mateNeg = if (rec.paired) mate.negativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => - val index: Int = m.lastIndexOf('/') - if (index >= 0) m.substring(0, index) else m - }.getOrElse("") + AlignmentInfo(matePrimaryTag, rec.header) + } + val readPos = if (primary.unmapped) Int.MaxValue else if (primary.negativeStrand) primary.unclippedEnd else primary.unclippedStart + val matePos = if (mate.unmapped) Int.MaxValue else if (mate.negativeStrand) mate.unclippedEnd else mate.unclippedStart + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) - } - else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) - } + if (primary.refIndex < mate.refIndex || (primary.refIndex == mate.refIndex && readPos < matePos) || + (primary.refIndex == mate.refIndex && readPos == matePos && primary.positiveStrand)) { + TemplateCoordinateKey(primary.refIndex, mate.refIndex, readPos, matePos, primary.negativeStrand, mate.negativeStrand, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mate.refIndex, primary.refIndex, matePos, readPos, mate.negativeStrand, primary.negativeStrand, lib, mid, rec.name, true) } } } diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala index cf05b54c6..0002c54bc 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamRecord.scala @@ -76,7 +76,7 @@ class TransientAttrs(private val rec: SamRecord) { /** * A trait that fgbio uses as a replacement for [[SAMRecord]]. The trait is self-typed as a - * [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this wasy so that + * [[SamRecordIntermediate]] which is a sub-class of SAMRecord. It is done this ways so that * a) we can access superclass methods via [[SamRecordIntermediate]] but that self-typing here * instead of extending hides the [[SAMRecord]] API from users of the class. The result is * always a [[SAMRecord]] but isn't seen as such without casting. diff --git a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala index 63c809fc3..9d000f130 100644 --- a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala +++ b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala @@ -121,6 +121,41 @@ class AlignmentTest extends UnitSpec { Cigar("10M2D20M").reverse.toString shouldBe "20M2D10M" } + private case class ClippingTestCase + (cigar: String, clippedBases: Int, + leadingSoftClippedBases: Int, trailingSoftClippedBases: Int, + leadingHardClippedBases: Int, trailingHardClippedBases: Int, + leadingClippedBases: Int, trailingClippedBases: Int, + unclippedStart: Int, unclippedEnd: Int) + + Seq( + ClippingTestCase("50M", 0, 0, 0, 0, 0, 0, 0, 0, 0), // no clipping + ClippingTestCase("20S50M", 20, 20, 0, 0, 0, 20, 0, -20, 0), // leading soft clipping + ClippingTestCase("50M10S", 10, 0, 10, 0, 0, 0, 10, 0, 10), // trailing soft clipping + ClippingTestCase("20S50M10S", 30, 20, 10, 0, 0, 20, 10, -20, 10), // leading and trailing soft clipping + ClippingTestCase("20H50M", 20, 0, 0, 20, 0, 20, 0, -20, 0), // leading hard clipping + ClippingTestCase("50M10H", 10, 0, 0, 0, 10, 0, 10, 0, 10), // trailing hard clipping + ClippingTestCase("20H50M10H", 30, 0, 0, 20, 10, 20, 10, -20, 10), // leading and trailing hard clipping + ClippingTestCase("15H5S50M", 20, 5, 0, 15, 0, 20, 0, -20, 0), // leading hard-the-soft clipping + ClippingTestCase("50M5S15H", 20, 0, 5, 0, 15, 0, 20, 0, 20), // trailing hard-the-soft clipping + ClippingTestCase("20H15S50M10S5H", 50, 15, 10, 20, 5, 35, 15, -35, 15), // leading and trailing hard-the-soft clipping + ).foreach { testCase => + "Cigar clipping methods" should s"account for leading and trailing clipping in ${testCase.cigar}" in { + val cigar = Cigar(testCase.cigar) + cigar.clippedBases shouldBe testCase.clippedBases + cigar.leadingSoftClippedBases shouldBe testCase.leadingSoftClippedBases + cigar.trailingSoftClippedBases shouldBe testCase.trailingSoftClippedBases + cigar.leadingHardClippedBases shouldBe testCase.leadingHardClippedBases + cigar.trailingHardClippedBases shouldBe testCase.trailingHardClippedBases + cigar.leadingClippedBases shouldBe testCase.leadingClippedBases + cigar.trailingClippedBases shouldBe testCase.trailingClippedBases + cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100 + cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200 + } + + } + + ///////////////////////////////////////////////////////////////////////////// // Tests for the Alignment class ///////////////////////////////////////////////////////////////////////////// From 28134459f02fed804b50e759392674d2972952bf Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 4 Feb 2025 19:11:26 -0700 Subject: [PATCH 5/6] format fix --- .../scala/com/fulcrumgenomics/alignment/AlignmentTest.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala index 9d000f130..a64cafc95 100644 --- a/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala +++ b/src/test/scala/com/fulcrumgenomics/alignment/AlignmentTest.scala @@ -152,10 +152,8 @@ class AlignmentTest extends UnitSpec { cigar.unclippedStart(100) shouldBe testCase.unclippedStart + 100 cigar.unclippedEnd(200) shouldBe testCase.unclippedEnd + 200 } - } - ///////////////////////////////////////////////////////////////////////////// // Tests for the Alignment class ///////////////////////////////////////////////////////////////////////////// From bf2dba34239b80527510bf15ceb6d4c53465af03 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Tue, 4 Feb 2025 19:18:06 -0700 Subject: [PATCH 6/6] one more --- src/main/scala/com/fulcrumgenomics/bam/Bams.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 4eee3c5f8..3546353ec 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -58,9 +58,8 @@ private[bam] case class AlignmentInfo(refIndex: Int, start: Int, positiveStrand: /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ def toSA(header: SAMFileHeader): String = { val strand = if (positiveStrand) '+' else '-' - val refName = header.getSequence(refIndex).getSequenceName val cigar = this.cigar.getOrElse("*") - f"${refName},${start},${strand},${cigar},${mapq},${nm}" + f"${refName(header)},${start},${strand},${cigar},${mapq},${nm}" } }