diff --git a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala index 7c335fd9a..044cbdb69 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala @@ -466,8 +466,9 @@ object Strategy extends FgBioEnum[Strategy] { | |By default, all UMIs must be the same length. If `--min-umi-length=len` is specified then reads that have a UMI |shorter than `len` will be discarded, and when comparing UMIs of different lengths, the first len bases will be - |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] bases in the UMI - |(i.e. does not count dashes and other non-ACGT characters). This option is not implemented for reads with UMI pairs + |compared, where `len` is the length of the shortest UMI. The UMI length is the number of [ACGT] + |([ACTGN], if the N flag is passed) bases in the UMI + |(i.e. does not count dashes and other non-ACGT(N) characters). This option is not implemented for reads with UMI pairs |(i.e. using the paired assigner). | |If the input is not template-coordinate sorted (i.e. `SO:unsorted GO:query SS:unsorted:template-coordinate`), then @@ -482,6 +483,8 @@ class GroupReadsByUmi @arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = "MI", @arg(flag='m', doc="Minimum mapping quality for mapped reads.") val minMapQ: Int = 1, @arg(flag='n', doc="Include non-PF reads.") val includeNonPfReads: Boolean = false, + @arg(flag = 'N', doc = "Allow UMIs which contain Ns. Ns will be treated as mismatches to all other non-Ns") + val includeNs: Boolean = false, @arg(flag='s', doc="The UMI assignment strategy.") val strategy: Strategy, @arg(flag='e', doc="The allowable number of edits between UMIs.") val edits: Int = 1, @arg(flag='l', doc= """The minimum UMI length. If not specified then all UMIs must have the same length, @@ -543,6 +546,7 @@ class GroupReadsByUmi (this.edits == 0 || this.strategy == Strategy.Identity) } + // Filter and sort the input BAM file logger.info("Filtering the input.") val filteredIterator = in.iterator @@ -551,7 +555,11 @@ class GroupReadsByUmi .filter(r => (r.mapped || (r.paired && r.mateMapped)) || { filteredPoorAlignment += 1; false }) .filter(r => (allowInterContig || r.unpaired || r.refIndex == r.mateRefIndex) || { filteredPoorAlignment += 1; false }) .filter(r => mapqOk(r, this.minMapQ) || { filteredPoorAlignment += 1; false }) - .filter(r => !r.get[String](rawTag).exists(_.contains('N')) || { filteredNsInUmi += 1; false }) + .filter(r => + (this.includeNs && !r.get[String](rawTag).exists { umi => + umi.forall(c => c == 'N' || c == '-') + } || !r.get[String](rawTag).exists(_.contains('N'))) + || { filteredNsInUmi += 1; false }) .filter { r => this.minUmiLength.forall { l => r.get[String](this.rawTag).forall { umi => @@ -644,7 +652,8 @@ class GroupReadsByUmi logger.info(f"Accepted $kept%,d reads for grouping.") if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.") logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.") - logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") + if (!this.includeNs) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.") + if (this.includeNs && filteredNsInUmi > 0) logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained only Ns in their UMIs.") this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") } } diff --git a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala index 81412a859..3e53378b9 100644 --- a/src/main/scala/com/fulcrumgenomics/util/Sequences.scala +++ b/src/main/scala/com/fulcrumgenomics/util/Sequences.scala @@ -94,7 +94,9 @@ object Sequences { */ def gcContent(s: String): Double = if (s.isEmpty) 0 else SequenceUtil.calculateGc(s.getBytes) - /** Counts the number of mismatches between two sequences of the same length. */ + /** Counts the number of mismatches between two sequences of the same length. + * N's always count as mismatches + */ def countMismatches(s1: String, s2: String): Int = { require(s1.length == s2.length, s"Cannot count mismatches in strings of differing lengths: $s1 $s2") @@ -102,7 +104,7 @@ object Sequences { forloop (from=0, until=s1.length) { i => val a = Character.toUpperCase(s1.charAt(i)) val b = Character.toUpperCase(s2.charAt(i)) - if (a != b) count += 1 + if (a != b || a == 'N' || b == 'N') count += 1 } count diff --git a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala index beb9917ab..b77f1b6db 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala @@ -334,6 +334,124 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) } + it should "correctly group together single-end reads with UMIs containing N's, if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA")) + builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC")) + builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 8 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 4 + groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) + } + + it should "correctly filter UMIs which are only N's, if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "NNNNNNNN")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 1 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 1 + groups should contain theSameElementsAs Seq(Set("a02")) + } + + it should "exclude reads that contain an only N's in the UMI, handle - also." in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "NNN-NNN")) + builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".bam") + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute() + + val recs = readBamRecs(out) + recs should have size 2 + + readBamRecs(out).map(_.name).distinct shouldBe Seq("a02") + } + + + it should "correctly filter and not group together single-end reads with UMIs containing N's" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAN")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "CACACACA")) + builder.addFrag(name = "a04", start = 100, attrs = Map("RX" -> "CACACACC")) + builder.addFrag(name = "a05", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a06", start = 105, attrs = Map("RX" -> "GTAGTAGG")) + builder.addFrag(name = "a07", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a08", start = 107, attrs = Map("RX" -> "AAAAAAAA")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = false, strategy = Strategy.Edit, edits = 1).execute() + + val recs = readBamRecs(out) + recs should have size 7 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 4 + groups should contain theSameElementsAs Seq(Set("a01"), Set("a03", "a04"), Set("a05", "a06"), Set("a07", "a08")) + } + + + it should "include reads that contain an N in the UMI, if option is set." in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addPair(name = "a01", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + builder.addPair(name = "a02", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACT")) + builder.addPair(name = "a03", start1 = 100, start2 = 300, strand1 = Plus, strand2 = Minus, attrs = Map("RX" -> "ACT-ACN")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".bam") + new GroupReadsByUmi(input = in, output = out, rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Paired, edits = 2).execute() + + readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02", "a03") + } + + it should "correctly group UMIs within the edit distance, if option is set" in { + val builder = new SamBuilder(readLength = 100, sort = Some(SamOrder.Coordinate)) + builder.addFrag(name = "a01", start = 100, attrs = Map("RX" -> "AAAAAANN")) + builder.addFrag(name = "a02", start = 100, attrs = Map("RX" -> "AAAAAAAA")) + builder.addFrag(name = "a03", start = 100, attrs = Map("RX" -> "AAAAANNN")) + + val in = builder.toTempFile() + val out = Files.createTempFile("umi_grouped.", ".sam") + val hist = Files.createTempFile("umi_grouped.", ".histogram.txt") + new GroupReadsByUmi(input = in, output = out, familySizeHistogram = Some(hist), rawTag = "RX", assignTag = "MI", includeNs = true, strategy = Strategy.Edit, edits = 2).execute() + + val recs = readBamRecs(out) + recs should have size 3 + + val groups = recs.groupBy(r => r[String]("MI")).values.map(rs => rs.map(_.name).toSet) + groups should have size 2 + groups should contain theSameElementsAs Seq(Set("a01", "a02"), Set("a03")) + + } + + + it should "exclude reads that contain an N in the UMI" in { val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.Coordinate)) builder.addPair(name="a01", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ACT"))