diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs
index 5c1b428d..fc07460d 100644
--- a/mzLib/Proteomics/Protein/Protein.cs
+++ b/mzLib/Proteomics/Protein/Protein.cs
@@ -836,6 +836,7 @@ public static Protein ScrambleDecoyProteinSequence(
string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
// Clone the original protein's modifications
var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
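+ // Use a locally scoped Random with a fixed seed so that decoy scrambling is reproducible across runs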
+ Random rng = new Random(42);
// Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
// Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
@@ -843,14 +844,14 @@ public static Protein ScrambleDecoyProteinSequence(
{
if(scrambledProteinSequence.Contains(peptideSequence))
{
- string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
+ string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out var swappedArray);
int scrambleAttempts = 1;
// Try five times to scramble the peptide sequence without creating a forbidden sequence
while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
{
- scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
+ scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out swappedArray);
scrambleAttempts++;
}
@@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
return newProtein;
}
- private static Random rng = new Random(42);
-
/// <summary>
/// Scrambles a peptide sequence, preserving the position of any cleavage sites.
/// </summary>
/// <param name="swappedPositionArray">An array that maps the previous position (index) to the new position (value)</param>
- public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
+ public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
{
// First, find the location of every cleavage motif. These sites shouldn't be scrambled.
HashSet<int> zeroBasedCleavageSitesLocations = new();
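
// Usage sketch (illustrative only, not part of the patch): the Random instance is now supplied by the caller,
// so the same seed always yields the same scrambled peptide. "peptideSequence" and "digestionParams" are
// hypothetical stand-ins for the values used inside ScrambleDecoyProteinSequence.
Random rng = new Random(42);
string scrambled = Protein.ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
    out int[] swappedPositions);
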
diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
index 82d7ce71..8925661c 100644
--- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
+++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
@@ -1,4 +1,4 @@
-// opyright 2016 Stefan Solntsev
+// Copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormula.cs) is part of Chemistry Library.
//
@@ -28,6 +28,7 @@
using Omics.Modifications;
using UsefulProteomicsDatabases;
using Stopwatch = System.Diagnostics.Stopwatch;
+using NUnit.Framework.Legacy;
namespace Test.DatabaseTests
{
@@ -81,6 +82,39 @@ public static void LoadIsoforms()
Assert.AreEqual("Q14103-4", proteinXml[9].Accession);
}
+ [Test]
+ [TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)]
+ [TestCase("uniprot_aifm1.fasta", DecoyType.None)]
+ [TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)]
+ [TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)]
+ public void LoadingIsReproducible(string fileName, DecoyType decoyType)
+ {
+ // Load in proteins
+ var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+ List<Protein> proteins1 = null;
+ List<Protein> proteins2 = null;
+ if(fileName.Contains(".xml"))
+ {
+ proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
+ proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
+ }
+ else if (fileName.Contains(".fasta"))
+ {
+ proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
+ proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
+ }
+ else
+ {
+ Assert.Fail("Unknown file type");
+ }
+
+ // Check that the two loads produced equivalent lists of proteins
+ Assert.AreEqual(proteins1.Count, proteins2.Count);
+ // Decoys are generated in parallel, so the two lists are not guaranteed to be in the same order
+ CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession));
+ CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence));
+ }
+
[Test]
public static void LoadModWithNl()
{
diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs
index 975db7dd..02cc3aed 100644
--- a/mzLib/Test/TestProteinDigestion.cs
+++ b/mzLib/Test/TestProteinDigestion.cs
@@ -361,6 +361,73 @@ public static void Test_ProteinDigest()
Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
}
+ [Test]
+ [TestCase("cRAP_databaseGPTMD.xml")]
+ [TestCase("uniprot_aifm1.fasta")]
+ public static void TestDecoyScramblingIsReproducible(string fileName)
+ {
+ // Load in proteins
+ var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+ DecoyType decoyType = DecoyType.Reverse;
+ List<Protein> proteins1 = null;
+ List<Protein> proteins2 = null;
+ if (fileName.Contains(".xml"))
+ {
+ proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
+ proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
+ }
+ else if (fileName.Contains(".fasta"))
+ {
+ proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
+ proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
+ }
+ else
+ {
+ Assert.Fail("Unknown file type");
+ }
+
+ DigestionParams d = new DigestionParams(
+ maxMissedCleavages: 1,
+ minPeptideLength: 5,
+ initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
+ // Digest target proteins
+ var pepsToReplace = proteins1.Where(p => !p.IsDecoy)
+ .SelectMany(p => p.Digest(d, new List<Modification>(), new List<Modification>()).ToList())
+ .Select(pep => pep.BaseSequence)
+ .ToHashSet();
+
+ // Ensure at least one decoy peptide from each protein is problematic and must be replaced
+ var singleDecoyPeptides = proteins1
+ .Where(p => p.IsDecoy)
+ .Select(p => p.Digest(d, new List<Modification>(), new List<Modification>()).Skip(2).Take(1))
+ .Select(pwsm => pwsm.First().BaseSequence)
+ .ToHashSet();
+
+ // Add these decoy peptides to the forbidden set so that every decoy protein has at least one peptide that must be replaced
+ pepsToReplace.UnionWith(singleDecoyPeptides);
+
+ // Scramble every decoy from db1
+ List<Protein> decoys1 = new();
+ foreach (var protein in proteins1.Where(p => p.IsDecoy))
+ {
+ decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
+ }
+ // Scramble every decoy from db2
+ List<Protein> decoys2 = new();
+ foreach (var protein in proteins2.Where(p => p.IsDecoy))
+ {
+ decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
+ }
+
+ // Check that the two sets of scrambled decoys are equivalent
+ Assert.AreEqual(decoys1.Count, decoys2.Count);
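+ // Pair each decoy from db1 with its counterpart from db2 by accession; their scrambled sequences must match exactly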
+ foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession))
+ {
+ Assert.AreEqual(2, decoyPair.Count());
+ Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence);
+ }
+ }
+
[Test]
public static void TestDecoyScramblerReplacesPeptides()
{