diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 61d6d52aa..d50f946d8 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -25,6 +25,7 @@ package com.fulcrumgenomics.bam import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.alignment.Cigar import com.fulcrumgenomics.bam.api.SamOrder.Queryname import com.fulcrumgenomics.bam.api._ import com.fulcrumgenomics.commons.collection.{BetterBufferedIterator, SelfClosingIterator} @@ -41,6 +42,36 @@ import htsjdk.samtools.util.{CloserUtil, CoordMath, Murmur3, SequenceUtil} import java.io.Closeable import scala.math.{max, min} + + +case class Supplementary(refName: String, start: Int, positiveStrand: Boolean, cigar: Cigar, mapq: Int, nm: Int) { + def negativeStrand: Boolean = !positiveStrand + def refIndex(header: SAMFileHeader): Int = header.getSequence(refName).getSequenceIndex + + def end: Int = start + cigar.lengthOnTarget - 1 + def unclippedStart: Int = { + SAMUtils.getUnclippedStart(start, cigar.toHtsjdkCigar) + } + + def unclippedEnd: Int = { + SAMUtils.getUnclippedEnd(end, cigar.toHtsjdkCigar) + } +} + +object Supplementary { + /** Returns a formatted alignment as per the SA tag: `(rname ,pos ,strand ,CIGAR ,mapQ ,NM ;)+` */ + def toString(rec: SamRecord): String = { + val strand = if (rec.positiveStrand) '+' else '-' + f"${rec.refName},${rec.start},${strand},${rec.cigar},${rec.mapq},${rec.getOrElse(SAMTag.NM.name(),0)}" + } + + + def apply(sa: String): Supplementary = { + val parts = sa.split(",") + Supplementary(parts(0), parts(1).toInt, parts(2) == "+", Cigar(parts(3)), parts(4).toInt, parts(5).toInt) + } +} + /** * Class that represents all reads from a template within a BAM file. */ @@ -107,11 +138,21 @@ case class Template(r1: Option[SamRecord], /** Fixes mate information and sets mate cigar on all primary and supplementary (but not secondary) records. */ def fixMateInfo(): Unit = { - for (primary <- r1; supp <- r2Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + // Set all mate info on BOTH secondary and supplementary records, not just supplementary records. We also need to + // add the "pa" and "pm" tags with information about the primary alignments. Finally, we need the MQ tag! + val r1NonPrimary = r1Supplementals ++ r1Secondaries + val r2NonPrimary = r2Supplementals ++ r2Secondaries + for (primary <- r1; nonPrimary <- r2NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r2.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } - for (primary <- r2; supp <- r1Supplementals) { - SamPairUtil.setMateInformationOnSupplementalAlignment(supp.asSam, primary.asSam, true) + for (primary <- r2; nonPrimary <- r1NonPrimary) { + SamPairUtil.setMateInformationOnSupplementalAlignment(nonPrimary.asSam, primary.asSam, true) + nonPrimary(SAMTag.MQ.name()) = primary.mapq + nonPrimary("mp") = Supplementary.toString(primary) + r1.foreach(r => nonPrimary("rp") = Supplementary.toString(r)) } for (first <- r1; second <- r2) { SamPairUtil.setMateInfo(first.asSam, second.asSam, true) diff --git a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala index 6aff8d844..2f147618d 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/api/SamOrder.scala @@ -24,12 +24,15 @@ package com.fulcrumgenomics.bam.api +import com.fulcrumgenomics.bam.{Bams, Supplementary} import com.fulcrumgenomics.umi.ConsensusTags import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 import htsjdk.samtools.{SAMFileHeader, SAMUtils} import org.apache.commons.math3.genetics.RandomKey +import scala.reflect.runtime.universe.Template + /** Trait for specifying BAM orderings. */ sealed trait SamOrder extends Product { @@ -175,24 +178,50 @@ object SamOrder { override val groupOrder: GroupOrder = GroupOrder.query override val subSort: Option[String] = Some("template-coordinate") override val sortkey: SamRecord => A = rec => { - val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex - val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex - val readNeg = rec.negativeStrand - val mateNeg = if (rec.paired) rec.mateNegativeStrand else false - val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart - val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) - val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") - val mid = rec.get[String](ConsensusTags.MolecularId).map { m => - val index: Int = m.lastIndexOf('/') - if (index >= 0) m.substring(0, index) else m - }.getOrElse("") - - if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || - (readChrom == mateChrom && readPos == matePos && !readNeg)) { - TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) - } - else { - TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + // For non-secondary/non-supplementary alignments, use the info in the record. For secondary and supplementary + // alignments, use the info in the pa/pm tags. + if (!rec.secondary && !rec.supplementary) { + val readChrom = if (rec.unmapped) Int.MaxValue else rec.refIndex + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else rec.mateRefIndex + val readNeg = rec.negativeStrand + val mateNeg = if (rec.paired) rec.mateNegativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) rec.unclippedEnd else rec.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) SAMUtils.getMateUnclippedEnd(rec.asSam) else SAMUtils.getMateUnclippedStart(rec.asSam) + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } + } else { + val primary = Supplementary(rec[String]("rp")) + val mate = Supplementary(rec[String]("mp")) + val readChrom = if (rec.unmapped) Int.MaxValue else primary.refIndex(rec.header) + val mateChrom = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else mate.refIndex(rec.header) + val readNeg = primary.negativeStrand + val mateNeg = if (rec.paired) mate.negativeStrand else false + val readPos = if (rec.unmapped) Int.MaxValue else if (readNeg) primary.unclippedEnd else primary.unclippedStart + val matePos = if (rec.unpaired || rec.mateUnmapped) Int.MaxValue else if (mateNeg) mate.unclippedEnd else mate.unclippedStart + val lib = Option(rec.readGroup).flatMap(rg => Option(rg.getLibrary)).getOrElse("Unknown") + val mid = rec.get[String](ConsensusTags.MolecularId).map { m => + val index: Int = m.lastIndexOf('/') + if (index >= 0) m.substring(0, index) else m + }.getOrElse("") + + if (readChrom < mateChrom || (readChrom == mateChrom && readPos < matePos) || + (readChrom == mateChrom && readPos == matePos && !readNeg)) { + TemplateCoordinateKey(readChrom, mateChrom, readPos, matePos, readNeg, mateNeg, lib, mid, rec.name, false) + } + else { + TemplateCoordinateKey(mateChrom, readChrom, matePos, readPos, mateNeg, readNeg, lib, mid, rec.name, true) + } } } } diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index aece16860..498245e6b 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -719,7 +719,7 @@ class GroupReadsByUmi // Then output the records in the right order (assigned tag, read name, r1, r2) templatesByMi.keys.toSeq.sortBy(id => (id.length, id)).foreach(tag => { - templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.primaryReads).foreach(rec => { + templatesByMi(tag).sortBy(t => t.name).flatMap(t => t.allReads).foreach(rec => { out += rec }) }) diff --git a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala index 2545f2ee0..b5eda09e2 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/api/SamOrderTest.scala @@ -25,11 +25,13 @@ package com.fulcrumgenomics.bam.api import java.util.Random - import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.Bams.{MaxInMemory, templateIterator} import com.fulcrumgenomics.commons.util.SimpleCounter import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.util.Io import htsjdk.samtools.{SAMFileHeader, SAMRecordCoordinateComparator, SAMRecordQueryNameComparator} import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} import htsjdk.samtools.util.Murmur3 @@ -228,11 +230,18 @@ class SamOrderTest extends UnitSpec { s2.supplementary = true s2.properlyPaired = true exp += s2 - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 exp ++= builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S") - val expected = List("q1/2:sup", "q1/1", "q1/2", "q2/1", "q2/2") - val actual = exp.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) + // Fix the mate information. Note: sorting here to get a template-iterator will write the records to disk first, + // so we cannot use the records in builder/exp. + val records = Bams.templateIterator(iterator=exp.iterator, header=builder.header, maxInMemory=MaxInMemory, tmpDir=Io.tmpDir).flatMap { template => + template.fixMateInfo() + template.allReads + }.toList + + val expected = List("q2/1", "q2/2", "q1/1", "q1/2", "q1/2:sup") + val actual = records.sortBy(r => SamOrder.TemplateCoordinate.sortkey(r)).map(_.id) actual should contain theSameElementsInOrderAs expected } diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index 4c0e27748..003ead8bb 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -26,13 +26,14 @@ */ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.Template -import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.bam.api.{SamOrder, SamWriter} import com.fulcrumgenomics.bam.api.SamOrder.TemplateCoordinate import com.fulcrumgenomics.cmdline.FgBioMain.FailureException import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import com.fulcrumgenomics.umi.GroupReadsByUmi._ +import com.fulcrumgenomics.util.Io import org.scalatest.{OptionValues, PrivateMethodTester} import java.nio.file.Files @@ -335,7 +336,7 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT } it should "mark duplicates on supplementary reads" in { - val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.TemplateCoordinate)) + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) // primary read pairs for q1, that map to different contigs builder.addPair("q1", contig = 1, contig2 = Some(2), start1 = 66, start2 = 47, cigar1 = "60M40S", cigar2 = "55M45S", strand2 = Plus, attrs = Map("RX" -> "ACT")) // supplementary R2, which maps to the same chromosome as the primary R1 @@ -343,21 +344,35 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT s1.supplementary = true s2.supplementary = true s2.properlyPaired = true - // primary read pairs for q1, that map to different contigs, but earlier that q1 + // primary read pairs for q2, that map to different contigs, but earlier that q1 builder.addPair("q2", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) - val in = builder.toTempFile() - val out = Files.createTempFile("umi_grouped.", ".sam") + // primary read pairs for q3, that are duplicates of q2 + builder.addPair("q3", contig = 1, contig2 = Some(2), start1 = 50, start2 = 30, cigar1 = "60M40S", cigar2 = "55M45S", attrs = Map("RX" -> "ACT")) + + // Fix the mate information and write the input. Note: sorting here to get a template-iterator will write the + // records to disk first, so we cannot use the records in builder and therefore must write to the input file + // directly. + val in = Files.createTempFile("raw_reads", ".sam") + val writer = SamWriter(in, builder.header, sort = builder.sort) + Bams.templateIterator(iterator = builder.iterator, header = builder.header, maxInMemory = Bams.MaxInMemory, tmpDir = Io.tmpDir).foreach { template => + template.fixMateInfo() + writer ++= template.allReads + } + writer.close() + + val out = Files.createTempFile("umi_grouped.", ".sam") val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") - val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true, includeSupplementary = Some(true)) + val gr = new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), strategy = Strategy.Adjacency, edits = 1, markDuplicates = true) gr.markDuplicates shouldBe true gr.execute() val recs = readBamRecs(out) - recs.length shouldBe 4 - recs.filter(_.name.equals("q1")).forall(_.duplicate == true) shouldBe true + recs.length shouldBe 8 + recs.filter(_.name.equals("q1")).forall(_.duplicate == false) shouldBe true recs.filter(_.name.equals("q2")).forall(_.duplicate == false) shouldBe true + recs.filter(_.name.equals("q3")).forall(_.duplicate == true) shouldBe true } it should "correctly group reads with the paired assigner when the two UMIs are the same" in {