Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 65 additions & 1 deletion mzLib/Omics/Digestion/DigestionProduct.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one
_baseSequence = baseSequence;
}


[field: NonSerialized] public IBioPolymer Parent { get; protected set; } // BioPolymer that this lysis product is a digestion product of
public string Description { get; protected set; } //unstructured explanation of source
public int OneBasedStartResidue { get; }// the residue number at which the peptide begins (the first residue in a protein is 1)
Expand All @@ -37,6 +38,7 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one
OneBasedEndResidue - OneBasedStartResidue + 1);
public CleavageSpecificity CleavageSpecificityForFdrCategory { get; set; } //structured explanation of source
public int Length => BaseSequence.Length; //how many residues long the peptide is

public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex];

#region Digestion Helper Methods
Expand Down Expand Up @@ -85,6 +87,68 @@ protected static IEnumerable<Dictionary<int, Modification>> GetVariableModificat
}
}
}
/// <summary>
/// Helper method to call Piset() with standard parameters to get the number of proteoforms for a given peptide.
/// </summary>
/// <param name="sitesConsidered"></param>
/// <param name="modsAsList"></param>
/// <returns></returns>
public static long PossibleProteoformRecursive(int sitesConsidered, List<int> modsAsList)
{
long result = 0;
List<int> subset = new();
Piset(sitesConsidered, ref result, subset, modsAsList);
return result;
}

/// <summary>
/// Recursive backtracking method to calculate the number of all possible proteoforms for a given peptide.
/// </summary>
/// <param name="sitesConsidered"> Restriction on the number of sites. For all possible proteoforms, this is equal to the number of total residues </param>
/// <param name="result">The product to be returned</param>
/// <param name="subset">Consider each subset in a powerset of modsAsList less or equal to the length sitesConsidered</param>
/// <param name="modsAsList">Mods given as {#mods at residue 1, #mods at residue 2, ... #mods at residue n} where n is the number of sites with any mods</param>
/// <returns></returns>
internal static long Piset(int sitesConsidered, ref long result, List<int> subset, List<int> modsAsList)
{
if (modsAsList.Count == 0)
{ // collect subset if valid and not already in result. Essentially, what conditions must exist if the subset is valid?
if (subset.Count <= sitesConsidered)
{ result = result + PI(subset); }
return result;
}
int currentSite = modsAsList[0]; //where modsAsList[0] is the current residue. To add or not to add?
//case 1: exclude current
List<int> remaining = new(modsAsList);
remaining.RemoveAt(0);
//recursively call the function w/o current integer.
Piset(sitesConsidered, ref result, subset, remaining);
//case 2: include current, once we consider if adding current to set keeps length less or equal to sitesConsidered.
if (subset.Count < sitesConsidered) // Check if we can add one more element
{
List<int> include = new(subset);
include.Add(currentSite);
Piset(sitesConsidered, ref result, include, remaining);
}
return result;
}
/// <summary>
/// Calculates the product of all elements in a list of integers.
/// </summary>
/// <param name="subset"> List to get product from </param>
/// <returns></returns>
internal static int PI(List<int> subset)
{
int product = 1;
for (int i = 0; i < subset.Count; i++)
{
//suppose no numbers are 0, given the context that 0 would not be a possible residue.
product = product * subset[i];
}

return product;
}


/// <summary>
/// Sets the fixed modifications for the peptide, considering the N-terminal and C-terminal positions, by populating the <paramref name="fixedModsOneIsNterminus"/> dictionary.
Expand Down Expand Up @@ -187,7 +251,7 @@ protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length,
/// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide.
/// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide.
/// </remarks>
protected void PopulateVariableModifications(List<Modification> allVariableMods, in Dictionary<int, SortedSet<Modification>> twoBasedDictToPopulate)
protected void PopulateVariableModifications(List<Modification> allVariableMods, in Dictionary<int, SortedSet<Modification>> twoBasedDictToPopulate) //{1, Oxy, 3, acetylation}
{
int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1;
var pepNTermVariableMods = new SortedSet<Modification>();
Expand Down
2 changes: 1 addition & 1 deletion mzLib/Proteomics/Protein/Protein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ public IEnumerable<IBioPolymerWithSetMods> Digest(IDigestionParams digestionPara

IEnumerable<PeptideWithSetModifications> modifiedPeptides = unmodifiedPeptides.SelectMany(peptide =>
peptide.GetModifiedPeptides(allKnownFixedModifications, digestionParameters, variableModifications));

//Remove terminal modifications (if needed)
if (searchModeType == CleavageSpecificity.SingleN ||
searchModeType == CleavageSpecificity.SingleC ||
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public PeptideWithSetModifications(string sequence, Dictionary<string, Modificat
{
throw new MzLibUtil.MzLibException("Ambiguous peptide cannot be parsed from string: " + sequence);
}

FullSequence = sequence;
_baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence);
_allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(sequence, allKnownMods);
Expand Down
26 changes: 25 additions & 1 deletion mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Omics.Digestion;
using Omics.Modifications;

Expand Down Expand Up @@ -38,6 +39,7 @@ public string PeptideDescription
set => Description = value;
}


#endregion

/// <summary>
Expand All @@ -48,7 +50,7 @@ public string PeptideDescription
/// <param name="digestionParams"></param>
/// <param name="variableModifications"></param>
/// <returns></returns>
internal IEnumerable<PeptideWithSetModifications> GetModifiedPeptides(List<Modification> allKnownFixedModifications,
internal IEnumerable<PeptideWithSetModifications> GetModifiedPeptides(List<Modification> allKnownFixedModifications, //for each peptide, peptide3.getvariablemods?
DigestionParams digestionParams, List<Modification> variableModifications)
{
int variable_modification_isoforms = 0;
Expand Down Expand Up @@ -83,5 +85,27 @@ internal IEnumerable<PeptideWithSetModifications> GetModifiedPeptides(List<Modif
DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications);
}
}
/// <summary>
/// Calculates the number of possible proteoforms for a given peptide sequence. Both fixed and variable modifications are considered.
/// </summary>
/// <param name="allKnownFixedModifications">Number of Fixed Modifications</param>
/// <param name="digestionParams">Used to ascertain the user chosen max mods per peptide</param>
/// <param name="variableModifications">Number of Variable Modifications</param>
/// <returns></returns>
public long GetNumberOfPossibleProteoforms(List<Modification> allKnownFixedModifications,
DigestionParams digestionParams, List<Modification> variableModifications)
{
int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1;
var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get();
PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications);
List<int> modsPerResidue = new List<int>(); //list of possible mods at residue, without attention to the order. {2,6,3,4,5}
foreach (var x in twoBasedPossibleVariableAndLocalizeableModifications)
{
//iterate through each residue, add in the number of possible modifications for that residue.
modsPerResidue.Add(x.Value.Count);
}
//given a maximum number of mods, the upper bound on proteoforms we can have with a base sequence.
return DigestionProduct.PossibleProteoformRecursive(digestionParams.MaxMods, modsPerResidue);
}
}
}
25 changes: 25 additions & 0 deletions mzLib/Test/TestPeptideWithSetMods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Windows.Documents;
using MzLibUtil;
using Omics;
using Omics.BioPolymer;
Expand Down Expand Up @@ -762,6 +763,30 @@ public static void BreakDeserializationMethod()
Assert.Throws<MzLibUtil.MzLibException>(() => new PeptideWithSetModifications("A[:mod]", new Dictionary<string, Modification>())); // nonexistent mod
}

[Test]
public static void TestNumRestrictedProteoforms()
{
Protein myProtein = new Protein("PEPTIDKKK", "accession");

DigestionParams digest1 = new DigestionParams(protease: "top-down", maxMissedCleavages: 1, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain, maxModsForPeptides:2);
var variableMods = new List<Modification>();
ModificationMotif.TryGetMotif("P", out ModificationMotif motif_p);
Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_p, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1"));
Modification phospho2 = new Modification(_originalId: "phosphorylation", _modificationType: "CommonBiological", _target: motif_p, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H20")); //random mod on P
ModificationMotif.TryGetMotif("E", out ModificationMotif motif_k);
Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1"));

variableMods.Add(phosphorylation);
variableMods.Add(phospho2);
variableMods.Add(acetylation);

PeptideWithSetModifications pep1 = myProtein.Digest(digest1, new List<Modification>(), variableMods).First();
Console.WriteLine(pep1.GetNumberOfPossibleProteoforms(new List<Modification>(), digest1, variableMods));
Console.WriteLine();
Assert.That(pep1.GetNumberOfPossibleProteoforms(new List<Modification>(), digest1, variableMods).Equals(14)); //restricted to 2 residue sites maximum -> length less or equal to 2 subsets.
Assert.That(pep1.GetNumberOfPossibleProteoforms(new List<Modification>(), digest1, new List<Modification>()).Equals(1)); // no variable mods
}

[Test]
public static void TestReverseDecoyFromTarget()
{
Expand Down
Loading