Skip to content

Commit

Permalink
Increased the Q-value threshold for Top-Down PEP positive training examples (#2427)
Browse files Browse the repository at this point in the history

* Buncha changes to PEP

* Increase Q-Value cutoff for top-down PEP positive training examples

* removed unused RNG
  • Loading branch information
Alexander-Sol authored Oct 17, 2024
1 parent 2962329 commit b753e3c
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 24 deletions.
25 changes: 14 additions & 11 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,23 @@ public PepAnalysisEngine(List<SpectralMatch> psms, string searchType, List<(stri
SearchType = searchType;
SetFileSpecificParameters(fileSpecificParameters);
BuildFileSpecificDictionaries(psms, TrainingVariables);
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005);

double minQ = searchType == "top-down" ? 0.025 : 0.005; // Less stringent FDR cut-off for top-down
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), minQ);
// If we have at least 100 distinct, non-empty full sequences, we will train on the peptide level. Otherwise, we will train on the PSM level
UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100;
}

public string ComputePEPValuesForAllPSMs()
{
List<PeptideMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? PeptideMatchGroup.GroupByBaseSequence(AllPsms)
: PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
List<SpectralMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? SpectralMatchGroup.GroupByBaseSequence(AllPsms)
: SpectralMatchGroup.GroupByIndividualPsm(AllPsms);

if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4))
{
peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
// If we don't have enough peptides to train at the peptide level, we will train at the PSM level
peptideGroups = SpectralMatchGroup.GroupByIndividualPsm(AllPsms);
UsePeptideLevelQValueForTraining = false;
}

int numGroups = 4;
Expand Down Expand Up @@ -185,7 +187,7 @@ public void BuildFileSpecificDictionaries(List<SpectralMatch> trainingData, stri
}
}

public static List<int>[] GetPeptideGroupIndices(List<PeptideMatchGroup> peptides, int numGroups)
public static List<int>[] GetPeptideGroupIndices(List<SpectralMatchGroup> peptides, int numGroups)
{
List<int>[] groupsOfIndices = new List<int>[numGroups];

Expand Down Expand Up @@ -244,7 +246,7 @@ static List<List<int>> DivideListIntoGroups(List<int> list, int numGroups)
}

public IEnumerable<PsmData> CreatePsmData(string searchType,
List<PeptideMatchGroup> peptideGroups, List<int> peptideGroupIndices)
List<SpectralMatchGroup> peptideGroups, List<int> peptideGroupIndices)
{
object psmDataListLock = new object();
List<PsmData> psmDataList = new List<PsmData>();
Expand All @@ -264,7 +266,7 @@ public IEnumerable<PsmData> CreatePsmData(string searchType,
if (GlobalVariables.StopLoops) { return; }

int modCount = 0;
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null))
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatches().Where(psm => psm != null))
{
PsmData newPsmData = new PsmData();
if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null)
Expand Down Expand Up @@ -335,7 +337,8 @@ public IEnumerable<PsmData> CreatePsmData(string searchType,
}

public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificationMetrics> allMetrics, int sumOfAllAmbiguousPeptidesResolved,
int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff)
int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff)

{
List<double> accuracy = allMetrics.Select(m => m.Accuracy).ToList();
List<double> areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList();
Expand Down Expand Up @@ -392,7 +395,7 @@ public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificati
return s.ToString();
}

public int Compute_PSM_PEP(List<PeptideMatchGroup> peptideGroups,
public int Compute_PSM_PEP(List<SpectralMatchGroup> peptideGroups,
List<int> peptideGroupIndices,
MLContext mLContext, TransformerChain<BinaryPredictionTransformer<Microsoft.ML.Calibrators.CalibratedModelParametersBase<Microsoft.ML.Trainers.FastTree.FastTreeBinaryModelParameters, Microsoft.ML.Calibrators.PlattCalibrator>>> trainedModel, string searchType, string outputFolder)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,41 @@

namespace EngineLayer
{
public class PeptideMatchGroup : IEnumerable<SpectralMatch>
public class SpectralMatchGroup : IEnumerable<SpectralMatch>
{
public string PeptideFullSequence { get; }
public string BaseSequence { get; }
public List<SpectralMatch> SpectralMatches { get; }

/// <summary>
/// This class groups all spectral matches associated with a given peptide together,
/// This class groups all spectral matches associated with a given sequence
/// to facilitate the calculation of PEP values.
/// </summary>
/// <param name="fullPeptideSeq"> The full sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the full sequence</param>
public PeptideMatchGroup(string fullPeptideSeq, List<SpectralMatch> spectralMatches)
/// <param name="baseSequence"> The sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the sequence</param>
public SpectralMatchGroup(string baseSequence, List<SpectralMatch> spectralMatches)
{
PeptideFullSequence = fullPeptideSeq;
BaseSequence = baseSequence;
SpectralMatches = spectralMatches;
}

public static List<PeptideMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
{
// This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training.

// TODO: Determine if it's better to group PSMs by base sequence or by full sequence.
return spectralMatches.GroupBy(p => p.BaseSequence)
.Select(group => new PeptideMatchGroup(group.Key, group.ToList()))
.Select(group => new SpectralMatchGroup(group.Key, group.ToList()))
.OrderByDescending(matchGroup => matchGroup.Count())
.ThenByDescending(matchGroup => matchGroup.BestMatch.Score)
.ToList();
}

public IEnumerable<SpectralMatch> GetBestMatchByMod()
{
/// <summary>
/// Returns the top-scoring PSM for each full sequence in the spectral match group.
/// i.e., if the same base sequence has multiple modifications, this function will return the PSM with the highest score for each modification.
/// </summary>
public IEnumerable<SpectralMatch> GetBestMatches()
{
return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p));
}

Expand All @@ -48,9 +52,9 @@ public IEnumerable<SpectralMatch> GetBestMatchByMod()
/// </summary>
/// <param name="spectralMatches"></param>
/// <returns></returns>
public static List<PeptideMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
{
return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
return spectralMatches.Select(psm => new SpectralMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
.ToList();
}

Expand Down

0 comments on commit b753e3c

Please sign in to comment.