diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs
index 5c1b428d..fc07460d 100644
--- a/mzLib/Proteomics/Protein/Protein.cs
+++ b/mzLib/Proteomics/Protein/Protein.cs
@@ -836,6 +836,7 @@ public static Protein ScrambleDecoyProteinSequence(
             string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
             // Clone the original protein's modifications
             var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
+            Random rng = new Random(42);
 
             // Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
             // Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
@@ -843,14 +844,14 @@ public static Protein ScrambleDecoyProteinSequence(
             {
                 if(scrambledProteinSequence.Contains(peptideSequence))
                 {
-                    string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
+                    string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
                         out var swappedArray);
                     int scrambleAttempts = 1;
 
                     // Try five times to scramble the peptide sequence without creating a forbidden sequence
                     while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
                     {
-                        scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
+                        scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
                             out swappedArray);
                         scrambleAttempts++;
                     }
@@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
             return newProtein;
         }
 
-        private static Random rng = new Random(42);
-
         /// <summary>
         /// Scrambles a peptide sequence, preserving the position of any cleavage sites.
         /// </summary>
         /// <returns>An array that maps the previous position (index) to the new position (value)</returns>
-        public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
+        public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
        {
             // First, find the location of every cleavage motif. These sites shouldn't be scrambled.
             HashSet<int> zeroBasedCleavageSitesLocations = new();
diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
index 82d7ce71..8925661c 100644
--- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
+++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
@@ -1,4 +1,4 @@
-// opyright 2016 Stefan Solntsev
+// Copyright 2016 Stefan Solntsev
 //
 // This file (ChemicalFormula.cs) is part of Chemistry Library.
 //
@@ -28,6 +28,7 @@
 using Omics.Modifications;
 using UsefulProteomicsDatabases;
 using Stopwatch = System.Diagnostics.Stopwatch;
+using NUnit.Framework.Legacy;
 
 namespace Test.DatabaseTests
 {
@@ -81,6 +82,39 @@ public static void LoadIsoforms()
             Assert.AreEqual("Q14103-4", proteinXml[9].Accession);
         }
 
+        [Test]
+        [TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)]
+        [TestCase("uniprot_aifm1.fasta", DecoyType.None)]
+        [TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)]
+        [TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)]
+        public void LoadingIsReproducible(string fileName, DecoyType decoyType)
+        {
+            // Load in proteins
+            var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+            List<Protein> proteins1 = null;
+            List<Protein> proteins2 = null;
+            if(fileName.Contains(".xml"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
+            }
+            else if (fileName.Contains(".fasta"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
+            }
+            else
+            {
+                Assert.Fail("Unknown file type");
+            }
+
+            // Check that the two loads produce equivalent lists of proteins
+            Assert.AreEqual(proteins1.Count, proteins2.Count);
+            // Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same
+            CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession));
+            CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence));
+        }
+
         [Test]
         public static void LoadModWithNl()
         {
diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs
index 975db7dd..02cc3aed 100644
--- a/mzLib/Test/TestProteinDigestion.cs
+++ b/mzLib/Test/TestProteinDigestion.cs
@@ -361,6 +361,73 @@ public static void Test_ProteinDigest()
             Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
         }
 
+        [Test]
+        [TestCase("cRAP_databaseGPTMD.xml")]
+        [TestCase("uniprot_aifm1.fasta")]
+        public static void TestDecoyScramblingIsReproducible(string fileName)
+        {
+            // Load in proteins
+            var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+            DecoyType decoyType = DecoyType.Reverse;
+            List<Protein> proteins1 = null;
+            List<Protein> proteins2 = null;
+            if (fileName.Contains(".xml"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
+            }
+            else if (fileName.Contains(".fasta"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
+            }
+            else
+            {
+                Assert.Fail("Unknown file type");
+            }
+
+            DigestionParams d = new DigestionParams(
+                maxMissedCleavages: 1,
+                minPeptideLength: 5,
+                initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
+            // Digest target proteins
+            var pepsToReplace = proteins1.Where(p => !p.IsDecoy)
+                .SelectMany(p => p.Digest(d, new List<Modification>(), new List<Modification>()).ToList())
+                .Select(pep => pep.BaseSequence)
+                .ToHashSet();
+
+            // Ensure at least one decoy peptide from each protein is problematic and must be replaced
+            var singleDecoyPeptides = proteins1
+                .Where(p => p.IsDecoy)
+                .Select(p => p.Digest(d, new List<Modification>(), new List<Modification>()).Skip(2).Take(1))
+                .Select(pwsm => pwsm.First().BaseSequence)
+                .ToHashSet();
+
+            // Add those decoy peptides to the forbidden set (modifies pepsToReplace in place)
+            pepsToReplace.UnionWith(singleDecoyPeptides);
+
+            // Scramble every decoy from db1
+            List<Protein> decoys1 = new();
+            foreach (var protein in proteins1.Where(p => p.IsDecoy))
+            {
+                decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
+            }
+            // Scramble every decoy from db2
+            List<Protein> decoys2 = new();
+            foreach (var protein in proteins2.Where(p => p.IsDecoy))
+            {
+                decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
+            }
+
+            // Check that both databases yield equivalent scrambled decoys
+            Assert.AreEqual(decoys1.Count, decoys2.Count);
+            foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession))
+            {
+                Assert.AreEqual(2, decoyPair.Count());
+                Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence);
+            }
+        }
+
         [Test]
         public static void TestDecoyScramblerReplacesPeptides()
         {
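
For illustration only, not part of the patch above: a minimal sketch of the reproducibility property the refactor relies on. Because ScrambleSequence now receives its Random from the caller instead of a shared static field, two calls driven by identically seeded generators should produce the same scrambled sequence and position map. The class name, example sequence, empty motif list, and the Omics.Digestion namespace for DigestionMotif are assumptions made for this sketch.

using System;
using System.Collections.Generic;
using System.Linq;
using Omics.Digestion; // assumed location of DigestionMotif in current mzLib
using Proteomics;

public static class ScrambleReproducibilitySketch
{
    public static void Main()
    {
        // An empty motif list means no cleavage-site positions are pinned during scrambling.
        var motifs = new List<DigestionMotif>();

        // Identically seeded generators should drive ScrambleSequence to the same result,
        // which is the behavior TestDecoyScramblingIsReproducible exercises at the protein level.
        string first = Protein.ScrambleSequence("PEPTIDESAMPLEK", motifs, new Random(42), out int[] map1);
        string second = Protein.ScrambleSequence("PEPTIDESAMPLEK", motifs, new Random(42), out int[] map2);

        Console.WriteLine(first == second && map1.SequenceEqual(map2)); // expected: True
    }
}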