diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index b43980e23..0da2c3761 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -22,6 +22,7 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one _baseSequence = baseSequence; } + [field: NonSerialized] public IBioPolymer Parent { get; protected set; } // BioPolymer that this lysis product is a digestion product of public string Description { get; protected set; } //unstructured explanation of source public int OneBasedStartResidue { get; }// the residue number at which the peptide begins (the first residue in a protein is 1) @@ -37,6 +38,7 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one OneBasedEndResidue - OneBasedStartResidue + 1); public CleavageSpecificity CleavageSpecificityForFdrCategory { get; set; } //structured explanation of source public int Length => BaseSequence.Length; //how many residues long the peptide is + public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; #region Digestion Helper Methods @@ -85,6 +87,68 @@ protected static IEnumerable> GetVariableModificat } } } + /// + /// Helper method to call Piset() with standard parameters to get the number of proteoforms for a given peptide. + /// + /// + /// + /// + public static long PossibleProteoformRecursive(int sitesConsidered, List modsAsList) + { + long result = 0; + List subset = new(); + Piset(sitesConsidered, ref result, subset, modsAsList); + return result; + } + + /// + /// Recursive backtracking method to calculate the number of all possible proteoforms for a given peptide. + /// + /// Restriction on the number of sites. For all possible proteoforms, this is equal to the number of total residues + /// The product to be returned + /// Consider each subset in a powerset of modsAsList less or equal to the length sitesConsidered + /// Mods given as {#mods at residue 1, #mods at residue 2, ... #mods at residue n} where n is the number of sites with any mods + /// + internal static long Piset(int sitesConsidered, ref long result, List subset, List modsAsList) + { + if (modsAsList.Count == 0) + { // collect subset if valid and not already in result. Essentially, what conditions must exist if the subset is valid? + if (subset.Count <= sitesConsidered) + { result = result + PI(subset); } + return result; + } + int currentSite = modsAsList[0]; //where modsAsList[0] is the current residue. To add or not to add? + //case 1: exclude current + List remaining = new(modsAsList); + remaining.RemoveAt(0); + //recursively call the function w/o current integer. + Piset(sitesConsidered, ref result, subset, remaining); + //case 2: include current, once we consider if adding current to set keeps length less or equal to sitesConsidered. + if (subset.Count < sitesConsidered) // Check if we can add one more element + { + List include = new(subset); + include.Add(currentSite); + Piset(sitesConsidered, ref result, include, remaining); + } + return result; + } + /// + /// Calculates the product of all elements in a list of integers. + /// + /// List to get product from + /// + internal static int PI(List subset) + { + int product = 1; + for (int i = 0; i < subset.Count; i++) + { + //suppose no numbers are 0, given the context that 0 would not be a possible residue. + product = product * subset[i]; + } + + return product; + } + /// /// Sets the fixed modifications for the peptide, considering the N-terminal and C-terminal positions, by populating the dictionary. @@ -187,7 +251,7 @@ protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, /// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide. /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. /// - protected void PopulateVariableModifications(List allVariableMods, in Dictionary> twoBasedDictToPopulate) + protected void PopulateVariableModifications(List allVariableMods, in Dictionary> twoBasedDictToPopulate) //{1, Oxy, 3, acetylation} { int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; var pepNTermVariableMods = new SortedSet(); diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 88a74bfff..0abee8a5c 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -268,7 +268,7 @@ public IEnumerable Digest(IDigestionParams digestionPara IEnumerable modifiedPeptides = unmodifiedPeptides.SelectMany(peptide => peptide.GetModifiedPeptides(allKnownFixedModifications, digestionParameters, variableModifications)); - + //Remove terminal modifications (if needed) if (searchModeType == CleavageSpecificity.SingleN || searchModeType == CleavageSpecificity.SingleC || diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 8e6705f36..edf9b5baf 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -67,7 +67,7 @@ public PeptideWithSetModifications(string sequence, Dictionary Description = value; } + #endregion /// @@ -48,7 +50,7 @@ public string PeptideDescription /// /// /// - internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, + internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, //for each peptide, peptide3.getvariablemods? DigestionParams digestionParams, List variableModifications) { int variable_modification_isoforms = 0; @@ -83,5 +85,27 @@ internal IEnumerable GetModifiedPeptides(List + /// Calculates the number of possible proteoforms for a given peptide sequence. Both fixed and variable modifications are considered. + /// + /// Number of Fixed Modifications + /// Used to ascertain the user chosen max mods per peptide + /// Number of Variable Modifications + /// + public long GetNumberOfPossibleProteoforms(List allKnownFixedModifications, + DigestionParams digestionParams, List variableModifications) + { + int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + List modsPerResidue = new List(); //list of possible mods at residue, without attention to the order. {2,6,3,4,5} + foreach (var x in twoBasedPossibleVariableAndLocalizeableModifications) + { + //iterate through each residue, add in the number of possible modifications for that residue. + modsPerResidue.Add(x.Value.Count); + } + //given a maximum number of mods, the upper bound on proteoforms we can have with a base sequence. + return DigestionProduct.PossibleProteoformRecursive(digestionParams.MaxMods, modsPerResidue); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index 5090fb354..b16a02724 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -8,6 +8,7 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using System.Windows.Documents; using MzLibUtil; using Omics; using Omics.BioPolymer; @@ -762,6 +763,30 @@ public static void BreakDeserializationMethod() Assert.Throws(() => new PeptideWithSetModifications("A[:mod]", new Dictionary())); // nonexistent mod } + [Test] + public static void TestNumRestrictedProteoforms() + { + Protein myProtein = new Protein("PEPTIDKKK", "accession"); + + DigestionParams digest1 = new DigestionParams(protease: "top-down", maxMissedCleavages: 1, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain, maxModsForPeptides:2); + var variableMods = new List(); + ModificationMotif.TryGetMotif("P", out ModificationMotif motif_p); + Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_p, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1")); + Modification phospho2 = new Modification(_originalId: "phosphorylation", _modificationType: "CommonBiological", _target: motif_p, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H20")); //random mod on P + ModificationMotif.TryGetMotif("E", out ModificationMotif motif_k); + Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1")); + + variableMods.Add(phosphorylation); + variableMods.Add(phospho2); + variableMods.Add(acetylation); + + PeptideWithSetModifications pep1 = myProtein.Digest(digest1, new List(), variableMods).First(); + Console.WriteLine(pep1.GetNumberOfPossibleProteoforms(new List(), digest1, variableMods)); + Console.WriteLine(); + Assert.That(pep1.GetNumberOfPossibleProteoforms(new List(), digest1, variableMods).Equals(14)); //restricted to 2 residue sites maximum -> length less or equal to 2 subsets. + Assert.That(pep1.GetNumberOfPossibleProteoforms(new List(), digest1, new List()).Equals(1)); // no variable mods + } + [Test] public static void TestReverseDecoyFromTarget() {