Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Increased the Q-value threshold for Top-Down PEP positive training examples #2427

Merged
merged 5 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,23 @@
SearchType = searchType;
SetFileSpecificParameters(fileSpecificParameters);
BuildFileSpecificDictionaries(psms, TrainingVariables);
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005);

double minQ = searchType == "top-down" ? 0.025 : 0.005; // Less stringent FDR cut-off for top-down
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), minQ);
// If we have at least 100 distinct, non-empty full sequences, we will train on the peptide level. Otherwise, we will train on the PSM level
UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100;
}

public string ComputePEPValuesForAllPSMs()
{
List<PeptideMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? PeptideMatchGroup.GroupByBaseSequence(AllPsms)
: PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
List<SpectralMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? SpectralMatchGroup.GroupByBaseSequence(AllPsms)
: SpectralMatchGroup.GroupByIndividualPsm(AllPsms);

if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4))
{
peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
// If there are fewer than 4 decoy groups or fewer than 4 target groups, we don't have enough peptides to train at the peptide level, so we will train at the PSM level
peptideGroups = SpectralMatchGroup.GroupByIndividualPsm(AllPsms);
UsePeptideLevelQValueForTraining = false;

Check warning on line 115 in MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs

View check run for this annotation

Codecov / codecov/patch

MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs#L114-L115

Added lines #L114 - L115 were not covered by tests
}

int numGroups = 4;
Expand Down Expand Up @@ -185,7 +187,7 @@
}
}

public static List<int>[] GetPeptideGroupIndices(List<PeptideMatchGroup> peptides, int numGroups)
public static List<int>[] GetPeptideGroupIndices(List<SpectralMatchGroup> peptides, int numGroups)
{
List<int>[] groupsOfIndices = new List<int>[numGroups];

Expand Down Expand Up @@ -244,7 +246,7 @@
}

public IEnumerable<PsmData> CreatePsmData(string searchType,
List<PeptideMatchGroup> peptideGroups, List<int> peptideGroupIndices)
List<SpectralMatchGroup> peptideGroups, List<int> peptideGroupIndices)
{
object psmDataListLock = new object();
List<PsmData> psmDataList = new List<PsmData>();
Expand All @@ -264,7 +266,7 @@
if (GlobalVariables.StopLoops) { return; }

int modCount = 0;
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null))
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatches().Where(psm => psm != null))
{
PsmData newPsmData = new PsmData();
if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null)
Expand Down Expand Up @@ -335,7 +337,8 @@
}

public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificationMetrics> allMetrics, int sumOfAllAmbiguousPeptidesResolved,
int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff)
int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff)

{
List<double> accuracy = allMetrics.Select(m => m.Accuracy).ToList();
List<double> areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList();
Expand Down Expand Up @@ -392,7 +395,7 @@
return s.ToString();
}

public int Compute_PSM_PEP(List<PeptideMatchGroup> peptideGroups,
public int Compute_PSM_PEP(List<SpectralMatchGroup> peptideGroups,
List<int> peptideGroupIndices,
MLContext mLContext, TransformerChain<BinaryPredictionTransformer<Microsoft.ML.Calibrators.CalibratedModelParametersBase<Microsoft.ML.Trainers.FastTree.FastTreeBinaryModelParameters, Microsoft.ML.Calibrators.PlattCalibrator>>> trainedModel, string searchType, string outputFolder)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,41 @@

namespace EngineLayer
{
public class PeptideMatchGroup : IEnumerable<SpectralMatch>
public class SpectralMatchGroup : IEnumerable<SpectralMatch>
{
public string PeptideFullSequence { get; }
public string BaseSequence { get; }

Check warning on line 14 in MetaMorpheus/EngineLayer/FdrAnalysis/SpectralMatchGroup.cs

View check run for this annotation

Codecov / codecov/patch

MetaMorpheus/EngineLayer/FdrAnalysis/SpectralMatchGroup.cs#L14

Added line #L14 was not covered by tests
public List<SpectralMatch> SpectralMatches { get; }

/// <summary>
/// This class groups all spectral matches associated with a given peptide together,
/// This class groups all spectral matches associated with a given sequence
/// to facilitate the calculation of PEP values.
/// </summary>
/// <param name="fullPeptideSeq"> The full sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the full sequence</param>
public PeptideMatchGroup(string fullPeptideSeq, List<SpectralMatch> spectralMatches)
/// <param name="baseSequence"> The sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the sequence</param>
public SpectralMatchGroup(string baseSequence, List<SpectralMatch> spectralMatches)
{
PeptideFullSequence = fullPeptideSeq;
BaseSequence = baseSequence;
SpectralMatches = spectralMatches;
}

public static List<PeptideMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
{
// This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training.

// TODO: Determine if it's better to group PSMs by base sequence or by full sequence.
return spectralMatches.GroupBy(p => p.BaseSequence)
.Select(group => new PeptideMatchGroup(group.Key, group.ToList()))
.Select(group => new SpectralMatchGroup(group.Key, group.ToList()))
.OrderByDescending(matchGroup => matchGroup.Count())
.ThenByDescending(matchGroup => matchGroup.BestMatch.Score)
.ToList();
}

public IEnumerable<SpectralMatch> GetBestMatchByMod()
{
/// <summary>
/// Returns the top-scoring PSM for each full sequence in the spectral match group.
/// i.e., if the same base sequence has multiple modifications, this function will return the PSM with the highest score for each modification.
/// </summary>
public IEnumerable<SpectralMatch> GetBestMatches()
{
return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p));
}

Expand All @@ -48,9 +52,9 @@
/// </summary>
/// <param name="spectralMatches"></param>
/// <returns></returns>
public static List<PeptideMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
{
return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
return spectralMatches.Select(psm => new SpectralMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
.ToList();
}

Expand Down