Skip to content

Commit

Permalink
Modified decoy scrambler to no longer use static Random generator (#798)
Browse files Browse the repository at this point in the history
* Modified decoy scrambler to no longer use static Random generator

* Added additional tests

* Better tests
  • Loading branch information
Alexander-Sol authored Sep 13, 2024
1 parent 7055d84 commit 983c3b0
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 6 deletions.
9 changes: 4 additions & 5 deletions mzLib/Proteomics/Protein/Protein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -836,21 +836,22 @@ public static Protein ScrambleDecoyProteinSequence(
string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
// Clone the original protein's modifications
var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
Random rng = new Random(42);

// Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
// Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
foreach(string peptideSequence in sequencesToScramble.OrderBy(seq => seq.Length))
{
if(scrambledProteinSequence.Contains(peptideSequence))
{
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out var swappedArray);
int scrambleAttempts = 1;

// Try five times to scramble the peptide sequence without creating a forbidden sequence
while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
{
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out swappedArray);
scrambleAttempts++;
}
Expand Down Expand Up @@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
return newProtein;
}

private static Random rng = new Random(42);

/// <summary>
/// Scrambles a peptide sequence, preserving the position of any cleavage sites.
/// </summary>
/// <param name="swappedPositionArray">An array that maps the previous position (index) to the new position (value)</param>
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
{
// First, find the location of every cleavage motif. These sites shouldn't be scrambled.
HashSet<int> zeroBasedCleavageSitesLocations = new();
Expand Down
36 changes: 35 additions & 1 deletion mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// opyright 2016 Stefan Solntsev
// Copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormula.cs) is part of Chemistry Library.
//
Expand Down Expand Up @@ -28,6 +28,7 @@
using Omics.Modifications;
using UsefulProteomicsDatabases;
using Stopwatch = System.Diagnostics.Stopwatch;
using NUnit.Framework.Legacy;

namespace Test.DatabaseTests
{
Expand Down Expand Up @@ -81,6 +82,39 @@ public static void LoadIsoforms()
Assert.AreEqual("Q14103-4", proteinXml[9].Accession);
}

[Test]
[TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)]
[TestCase("uniprot_aifm1.fasta", DecoyType.None)]
[TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)]
[TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)]
public void LoadingIsReproducible(string fileName, DecoyType decoyType)
{
// Load in proteins
var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
List<Protein> proteins1 = null;
List<Protein> proteins2 = null;
if(fileName.Contains(".xml"))
{
proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
}
else if (fileName.Contains(".fasta"))
{
proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
}
else
{
Assert.Fail("Unknown file type");
}

// check are equivalent lists of proteins
Assert.AreEqual(proteins1.Count, proteins2.Count);
// Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same
CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession));
CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence));
}

[Test]
public static void LoadModWithNl()
{
Expand Down
67 changes: 67 additions & 0 deletions mzLib/Test/TestProteinDigestion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,73 @@ public static void Test_ProteinDigest()
Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
}

[Test]
[TestCase("cRAP_databaseGPTMD.xml")]
[TestCase("uniprot_aifm1.fasta")]
public static void TestDecoyScramblingIsReproducible(string fileName)
{
// Load in proteins
var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
DecoyType decoyType = DecoyType.Reverse;
List<Protein> proteins1 = null;
List<Protein> proteins2 = null;
if (fileName.Contains(".xml"))
{
proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
}
else if (fileName.Contains(".fasta"))
{
proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
}
else
{
Assert.Fail("Unknown file type");
}

DigestionParams d = new DigestionParams(
maxMissedCleavages: 1,
minPeptideLength: 5,
initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
// Digest target proteins
var pepsToReplace = proteins1.Where(p => !p.IsDecoy)
.SelectMany(p => p.Digest(d, new List<Modification>(), new List<Modification>()).ToList())
.Select(pep => pep.BaseSequence)
.ToHashSet();

// Ensure at least one decoy peptide from each protein is problematic and must be replaced
var singleDecoyPeptides = proteins1
.Where(p => p.IsDecoy)
.Select(p => p.Digest(d, new List<Modification>(), new List<Modification>()).Skip(2).Take(1))
.Select(pwsm => pwsm.First().BaseSequence)
.ToHashSet();

//modify targetpeptides in place
pepsToReplace.UnionWith(singleDecoyPeptides);

// Scramble every decoy from db1
List<Protein> decoys1 = new();
foreach (var protein in proteins1.Where(p => p.IsDecoy))
{
decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
}
// Scramble every decoy from db2
List<Protein> decoys2 = new();
foreach (var protein in proteins2.Where(p => p.IsDecoy))
{
decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace));
}

// check are equivalent lists of proteins
Assert.AreEqual(decoys1.Count, decoys2.Count);
foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession))
{
Assert.AreEqual(2, decoyPair.Count());
Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence);
}
}

[Test]
public static void TestDecoyScramblerReplacesPeptides()
{
Expand Down

0 comments on commit 983c3b0

Please sign in to comment.