From cd1460b5f96c59b46de2804b4fd4fc60beb30df5 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Mon, 29 Jan 2024 17:31:01 -0700 Subject: [PATCH] fix: properly template-coordinate sort and duplicate mark secondary and supplementary reads Secondary and supplementary reads must use the coordinates of the primary alignments within the template, otherwise they will not guaranteed to be next the primary alignments in the file. Therefore, we've added the "rp" and "mp" tags to store the SA-tag equivalent information for the primary alignment. This keeps information about the primary alignments with the secondary and supplementary alignments. --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 49 ++++++++++++-- .../fulcrumgenomics/bam/api/SamOrder.scala | 65 ++++++++++++++----- .../fulcrumgenomics/umi/GroupReadsByUmi.scala | 2 +- .../bam/api/SamOrderTest.scala | 17 +++-- .../umi/GroupReadsByUmiTest.scala | 33 +++++++--- 5 files changed, 130 insertions(+), 36 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 61d6d52aa..d50f946d8 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -25,6 +25,7 @@ package com.fulcrumgenomics.bam import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.bam.api.SamOrder.Queryname import com.fulcrumgenomics.bam.api._ import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator} @@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil} import java.io.Closeable import scala.math.{max, min} + + +case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) { + def negativeStrand: Boolean = !positiveStrand + def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex + + def end: Int = start + cigar.lengthOnTarget - 1 + def unclippedStart: Int = { + SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar) + } + + def unclippedEnd: Int = { + SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar) + } +} + +object Supplementary { + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toString(rec: SamRecord): String = { + val strand = if (rec.positiveStrand) '+' else '-' + f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}" + } + + + def apply(sa: String): Supplementary = { + val parts = sa.split(",") + Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt) + } +} + /** * Class that represents all reads from a template within a BAM file. */ @@ -107,11 +138,21 @@ case class Template(r1: Option[SamRecord], /** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */ def fixMateInfo(): Unit = { - for (primary <- r1; supp <- r2Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + // Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to + // add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag! + val r1NonPrimary = r1Supplementals ++ r1Secondaries + val r2NonPrimary = r2Supplementals ++ r2Secondaries + for (primary <- r1; nonPrimary <- r2NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } - for (primary <- r2; supp <- r1Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + for (primary <- r2; nonPrimary <- r1NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala index 6aff8d844..2f147618d 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala @@ -24,12 +24,15 @@ package com.fulcrumgenomics.bam.api +import com.fulcrumgenomics.bam.{Bams, Supplementary} import com.fulcrumgenomics.umi.ConsensusTags import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 import htsjdk.samtools.{SAMFileHeader, SAMUtils} import org.apache.commons.math3.genetics.RandomKey +import scala.reflect.runtime.universe.Template + /** Trait for specifying BAM orderings. */ sealed trait SamOrder extends Product { @@ -175,24 +178,50 @@ object SamOrder { override val groupOrder: GroupOrder = GroupOrder.query override val subSort: Option[String] = Some("template-coordinate") override val sortkey: SamRecord => A = rec => { - val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex - val readNeg = rec.negativeStrand - val mateNeg = if (rec.paired) rec.mateNegativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => - val index: Int = m.lastIndexOf('/') - if (index >= 0) m.substring(0, index) else m - }.getOrElse("") - - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) - } - else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + // For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary + // alignments, use the info in the pa/pm tags. + if (!rec.secondary && !rec.supplementary) { + val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex + val readNeg = rec.negativeStrand + val mateNeg = if (rec.paired) rec.mateNegativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } + } else { + val primary = Supplementary(rec[String]("rp")) + val mate = Supplementary(rec[String]("mp")) + val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header) + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header) + val readNeg = primary.negativeStrand + val mateNeg = if (rec.paired) mate.negativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } } } } diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index aece16860..498245e6b 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -719,7 +719,7 @@ class GroupReadsByUmi // Then output the records in the right order (assigned tag, read name, r1, r2) templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => { - templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => { + templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => { out += rec }) }) diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala index 2545f2ee0..b5eda09e2 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala @@ -25,11 +25,13 @@ package com.fulcrumgenomics.bam.api import java.util.Random - import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator} import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.util.Io import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator} import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 @@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec { s2.supplementary = true s2.properlyPaired = true exp += s2 - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S") - val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2") - val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) + // Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first, + // so we cannot use the records in builder/exp. + val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template => + template.fixMateInfo() + template.allReads + }.toList + + val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup") + val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) actual should contain theSameElementsInOrderAs expected } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 4c0e27748..003ead8bb 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -26,13 +26,14 @@ */ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.Template -import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter} import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ +import com.fulcrumgenomics.util.Io import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -335,7 +336,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT } it should "mark duplicates on supplementary reads" in { - val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate)) + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) // primary read pairs for q1, that map to different contigs builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT")) // supplementary R2, which maps to the same chromosome as the primary R1 @@ -343,21 +344,35 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT s1.supplementary = true s2.supplementary = true s2.properlyPaired = true - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) - val in = builder.toTempFile() - val out = Files.createTempFile("umi_grouped.", ".sam") + // primary read pairs for q3, that are duplicates of q2 + builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + // Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the + // records to disk first, so we cannot use the records in builder and therefore must write to the input file + // directly. + val in = Files.createTempFile("raw_reads", ".sam") + val writer = SamWriter(in, builder.header, sort = builder.sort) + Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template => + template.fixMateInfo() + writer ++= template.allReads + } + writer.close() + + val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true)) + val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true) gr.markDuplicates shouldBe true gr.execute() val recs = readBamRecs(out) - recs.length shouldBe 4 - recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true + recs.length shouldBe 8 + recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true + recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true } it should "correctly group reads with the paired assigner when the two UMIs are the same" in {