diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs index a79c4a29a..1787b9537 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs @@ -96,21 +96,23 @@ public PepAnalysisEngine(List psms, string searchType, List<(stri SearchType = searchType; SetFileSpecificParameters(fileSpecificParameters); BuildFileSpecificDictionaries(psms, TrainingVariables); - QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005); - + double minQ = searchType == "top-down" ? 0.025 : 0.005; // Less stringent FDR cut-off for top-down + QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), minQ); // If we have more than 100 peptides, we will train on the peptide level. Otherwise, we will train on the PSM level UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100; } public string ComputePEPValuesForAllPSMs() { - List peptideGroups = UsePeptideLevelQValueForTraining - ? PeptideMatchGroup.GroupByBaseSequence(AllPsms) - : PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + List peptideGroups = UsePeptideLevelQValueForTraining + ? SpectralMatchGroup.GroupByBaseSequence(AllPsms) + : SpectralMatchGroup.GroupByIndividualPsm(AllPsms); if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4)) { - peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms); + // If we don't have enough peptides to train at the peptide level, we will train at the PSM level + peptideGroups = SpectralMatchGroup.GroupByIndividualPsm(AllPsms); + UsePeptideLevelQValueForTraining = false; } int numGroups = 4; @@ -185,7 +187,7 @@ public void BuildFileSpecificDictionaries(List trainingData, stri } } - public static List[] GetPeptideGroupIndices(List peptides, int numGroups) + public static List[] GetPeptideGroupIndices(List peptides, int numGroups) { List[] groupsOfIndices = new List[numGroups]; @@ -244,7 +246,7 @@ static List> DivideListIntoGroups(List list, int numGroups) } public IEnumerable CreatePsmData(string searchType, - List peptideGroups, List peptideGroupIndices) + List peptideGroups, List peptideGroupIndices) { object psmDataListLock = new object(); List psmDataList = new List(); @@ -264,7 +266,7 @@ public IEnumerable CreatePsmData(string searchType, if (GlobalVariables.StopLoops) { return; } int modCount = 0; - foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null)) + foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatches().Where(psm => psm != null)) { PsmData newPsmData = new PsmData(); if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null) @@ -335,7 +337,8 @@ public IEnumerable CreatePsmData(string searchType, } public static string AggregateMetricsForOutput(List allMetrics, int sumOfAllAmbiguousPeptidesResolved, - int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff) + int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff) + { List accuracy = allMetrics.Select(m => m.Accuracy).ToList(); List areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList(); @@ -392,7 +395,7 @@ public static string AggregateMetricsForOutput(List peptideGroups, + public int Compute_PSM_PEP(List peptideGroups, List peptideGroupIndices, MLContext mLContext, TransformerChain>> trainedModel, string searchType, string outputFolder) { diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/SpectralMatchGroup.cs similarity index 60% rename from MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs rename to MetaMorpheus/EngineLayer/FdrAnalysis/SpectralMatchGroup.cs index b88faa9d1..b9ba0f171 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/PeptideMatchGroup.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/SpectralMatchGroup.cs @@ -9,37 +9,41 @@ namespace EngineLayer { - public class PeptideMatchGroup : IEnumerable + public class SpectralMatchGroup : IEnumerable { - public string PeptideFullSequence { get; } + public string BaseSequence { get; } public List SpectralMatches { get; } /// - /// This class groups all spectral matches associated with a given peptide together, + /// This class groups all spectral matches associated with a given sequence /// to facilitate the calculation of PEP values. /// - /// The full sequence to be used for grouping - /// Every spectral match that matches the full sequence - public PeptideMatchGroup(string fullPeptideSeq, List spectralMatches) + /// The sequence to be used for grouping + /// Every spectral match that matches the sequence + public SpectralMatchGroup(string baseSequence, List spectralMatches) { - PeptideFullSequence = fullPeptideSeq; + BaseSequence = baseSequence; SpectralMatches = spectralMatches; } - public static List GroupByBaseSequence(List spectralMatches) + public static List GroupByBaseSequence(List spectralMatches) { // This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training. // TODO: Determine if it's better to group PSMs by base sequence or by full sequence. return spectralMatches.GroupBy(p => p.BaseSequence) - .Select(group => new PeptideMatchGroup(group.Key, group.ToList())) + .Select(group => new SpectralMatchGroup(group.Key, group.ToList())) .OrderByDescending(matchGroup => matchGroup.Count()) .ThenByDescending(matchGroup => matchGroup.BestMatch.Score) .ToList(); } - public IEnumerable GetBestMatchByMod() - { + /// + /// Returns the top-scoring PSM for each full sequence in the spectral match group. + /// i.e., if the same base sequence has multiple modifications, this function will return the PSM with the highest score for each modification. + /// + public IEnumerable GetBestMatches() + { return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p)); } @@ -48,9 +52,9 @@ public IEnumerable GetBestMatchByMod() /// /// /// - public static List GroupByIndividualPsm(List spectralMatches) + public static List GroupByIndividualPsm(List spectralMatches) { - return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List { psm })) + return spectralMatches.Select(psm => new SpectralMatchGroup(psm.FullSequence, new List { psm })) .ToList(); }