Skip to content

Commit

Permalink
Added scramble method in protein, associated tests (#789)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander-Sol authored Aug 3, 2024
1 parent 712ac72 commit a78eea2
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 5 deletions.
108 changes: 103 additions & 5 deletions mzLib/Proteomics/Protein/Protein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,14 @@ public Protein(string sequence, string accession, string organism = null, List<T

/// <summary>
/// Protein construction that clones a protein but assigns a different base sequence
/// For use in SILAC experiments
/// For use in SILAC experiments and in decoy construction
/// </summary>
/// <param name="originalProtein"></param>
/// <param name="silacSequence"></param>
/// <param name="newBaseSequence"></param>
/// <param name="silacAccession"></param>
public Protein(Protein originalProtein, string silacSequence)
public Protein(Protein originalProtein, string newBaseSequence)
{
BaseSequence = silacSequence;
BaseSequence = newBaseSequence;
Accession = originalProtein.Accession;
NonVariantProtein = originalProtein.NonVariantProtein;
Name = originalProtein.Name;
Expand Down Expand Up @@ -156,7 +156,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable<Sequence
/// <summary>
/// Base sequence, which may contain applied sequence variations.
/// </summary>
public string BaseSequence { get; }
public string BaseSequence { get; private set; }

public string Organism { get; }
public bool IsDecoy { get; }
Expand Down Expand Up @@ -206,6 +206,7 @@ public string GetUniProtFastaHeader()
{
var n = GeneNames.FirstOrDefault();
string geneName = n == null ? "" : n.Item2;

return string.Format("mz|{0}|{1} {2} OS={3} GN={4}", Accession, Name, FullName, Organism, geneName);
}

Expand Down Expand Up @@ -801,6 +802,103 @@ private static string GetName(IEnumerable<SequenceVariation> appliedVariations,
}
}

/// <summary>
/// This function takes in a decoy protein and a list of forbidden sequences that the decoy
/// protein should not contain. Optionally, a list of the peptides within the base sequence
/// of the decoy protein that need to be scrambled can be passed as well. It will scramble the required sequences,
/// leaving cleavage sites intact.
/// </summary>
/// <param name="originalDecoyProtein"> A Decoy protein to be cloned </param>
/// <param name="digestionParams"> Digestion parameters </param>
/// <param name="forbiddenSequences"> A HashSet of forbidden sequences that the decoy protein should not contain. Typically, a set of target base sequences </param>
/// <param name="sequencesToScramble"> Optional IEnumberable of sequences within the decoy protein that need to be replaced.
/// If this is passed, only sequences within the IEnumerable will be replaced!!! </param>
/// <returns> A cloned copy of the decoy protein with a scrambled sequence </returns>
public static Protein ScrambleDecoyProteinSequence(
Protein originalDecoyProtein,
DigestionParams digestionParams,
HashSet<string> forbiddenSequences,
IEnumerable<string> sequencesToScramble = null)
{
// If no sequencesToScramble are passed in, we check to see if any
// peptides in the decoy are forbidden sequences
sequencesToScramble = sequencesToScramble ?? originalDecoyProtein
.Digest(digestionParams, new List<Modification>(), new List<Modification>())
.Select(pep => pep.FullSequence)
.Where(forbiddenSequences.Contains);
if(sequencesToScramble.Count() == 0)
{
return originalDecoyProtein;
}

string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
// Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
// Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
foreach(string peptideSequence in sequencesToScramble.OrderBy(seq => seq.Length))
{
if(scrambledProteinSequence.Contains(peptideSequence))
{
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs);
int scrambleAttempts = 1;
// Try five times to scramble the peptide sequence without creating a forbidden sequence
while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
{
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs);
scrambleAttempts++;
}
scrambledProteinSequence = scrambledProteinSequence.Replace(peptideSequence, scrambledPeptideSequence);
}
}

return new Protein(originalDecoyProtein, scrambledProteinSequence);
}

private static Random rng = new Random(42);

/// <summary>
/// Scrambles a peptide sequence, preserving the position of any cleavage sites.
/// </summary>
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs)
{
// First, find the location of every cleavage motif. These sites shouldn't be scrambled.
HashSet<int> zeroBasedCleavageSitesLocations = new();
foreach (var motif in motifs)
{
for (int i = 0; i < sequence.Length; i++)
{
(bool fits, bool prevents) = motif.Fits(sequence, i);
if (fits && !prevents)
{
zeroBasedCleavageSitesLocations.Add(i);
}
}
}

// Next, scramble the sequence using the Fisher-Yates shuffle algorithm.
char[] sequenceArray = sequence.ToCharArray();
int n = sequenceArray.Length;
while(n > 1)
{
n--;
if(zeroBasedCleavageSitesLocations.Contains(n))
{
// Leave the cleavage site in place
continue;
}
int k = rng.Next(n + 1);
// don't swap the position of a cleavage site
while(zeroBasedCleavageSitesLocations.Contains(k))
{
k = rng.Next(n + 1);
}
char temp = sequenceArray[k];
sequenceArray[k] = sequenceArray[n];
sequenceArray[n] = temp;
}

return new string(sequenceArray);
}

public int CompareTo(Protein other)
{
//permits sorting of proteins
Expand Down
73 changes: 73 additions & 0 deletions mzLib/Test/TestProteinDigestion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
using UsefulProteomicsDatabases;
using static Chemistry.PeriodicTable;
using Stopwatch = System.Diagnostics.Stopwatch;
using System.Runtime.CompilerServices;

namespace Test
{
Expand Down Expand Up @@ -359,6 +360,78 @@ public static void Test_ProteinDigest()
Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
}

[Test]
public static void TestDecoyScramblerReplacesPeptides()
{
DigestionParams d = new DigestionParams(
maxMissedCleavages: 1,
minPeptideLength: 5,
initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);

Protein target = new Protein("MEDEEKFVGYKYGVFK", "target");
Protein decoy = new Protein("EEDEMKYGVFKFVGYK", "decoy");

var targetPep = target.Digest(d, new List<Modification>(), new List<Modification>());
var decoyPep = decoy.Digest(d, new List<Modification>(), new List<Modification>());

HashSet<string> targetPepSeqs = targetPep.Select(p => p.FullSequence).ToHashSet();
var offendingDecoys = decoyPep.Where(p => targetPepSeqs.Contains(p.FullSequence)).Select(d => d.FullSequence).ToList();

Assert.AreEqual(2, offendingDecoys.Count);

Protein scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs, offendingDecoys);
var scrambledPep = scrambledDecoy.Digest(d, new List<Modification>(), new List<Modification>());

Assert.AreEqual(decoyPep.Count(), scrambledPep.Count());
Assert.IsFalse(scrambledPep.Any(p => offendingDecoys.Contains(p.FullSequence)));

// Check to make sure that decoy generation also works in no offending sequences are passed in
scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs);
scrambledPep = scrambledDecoy.Digest(d, new List<Modification>(), new List<Modification>());

Assert.AreEqual(decoyPep.Count(), scrambledPep.Count());
Assert.IsFalse(scrambledPep.Any(p => offendingDecoys.Contains(p.FullSequence)));
}

[Test, Timeout(5000)]
public static void TestDecoyScramblerNoInfiniteLoops()
{
DigestionParams d = new DigestionParams(
maxMissedCleavages: 0,
minPeptideLength: 3,
initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);

Protein target = new Protein("MEK", "target");
Protein decoy = new Protein("EMK", "decoy");

var targetPep = target.Digest(d, new List<Modification>(), new List<Modification>());
var decoyPep = decoy.Digest(d, new List<Modification>(), new List<Modification>());

HashSet<string> targetPepSeqs = targetPep.Select(p => p.FullSequence).ToHashSet();

// We'll pretend that this is also a target sequence and can't be used as a decoy
HashSet<string> offendingDecoys = new HashSet<string> { "EMK" };

// You can't win in this scenario, there's no way to scramble that results in a different decoy
Protein scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs.Union(offendingDecoys).ToHashSet(), offendingDecoys);
var scrambledPep = scrambledDecoy.Digest(d, new List<Modification>(), new List<Modification>());

Assert.AreEqual(decoyPep.Count(), scrambledPep.Count());

d = new DigestionParams(
maxMissedCleavages: 1,
minPeptideLength: 3,
initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);

offendingDecoys = new HashSet<string> { "KEK" };

var impossibleDecoy = new Protein("KEK", "target"); // This guy could crash the shuffling algorithm
scrambledDecoy = Protein.ScrambleDecoyProteinSequence(impossibleDecoy, d, offendingDecoys, offendingDecoys);

Assert.AreEqual("KEK", scrambledDecoy.BaseSequence);

}

[Test]
/// <summary>
/// Tests that a PeptideWithSetModifications object can be parsed correctly from a string, with mod info
Expand Down

0 comments on commit a78eea2

Please sign in to comment.