Skip to content

Commit

Permalink
Merge branch 'master' into softenCalibrationTolerance
Browse files Browse the repository at this point in the history
  • Loading branch information
trishorts authored Oct 30, 2024
2 parents 0b9865d + 32658b1 commit a505e55
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 63 deletions.
34 changes: 22 additions & 12 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,23 @@ public PepAnalysisEngine(List<SpectralMatch> psms, string searchType, List<(stri
SearchType = searchType;
SetFileSpecificParameters(fileSpecificParameters);
BuildFileSpecificDictionaries(psms, TrainingVariables);
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), 0.005);

double minQ = searchType == "top-down" ? 0.025 : 0.005; // Less stringent FDR cut-off for top-down
QValueCutoff = Math.Max(fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(), minQ);
// If we have at least 100 unique peptides (distinct, non-empty full sequences), we will train on the peptide level. Otherwise, we will train on the PSM level
UsePeptideLevelQValueForTraining = psms.Select(psm => psm.FullSequence).Distinct().Count(seq => seq.IsNotNullOrEmpty()) >= 100;
}

public string ComputePEPValuesForAllPSMs()
{
List<PeptideMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? PeptideMatchGroup.GroupByBaseSequence(AllPsms)
: PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
List<SpectralMatchGroup> peptideGroups = UsePeptideLevelQValueForTraining
? SpectralMatchGroup.GroupByBaseSequence(AllPsms)
: SpectralMatchGroup.GroupByIndividualPsm(AllPsms);

if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4))
{
peptideGroups = PeptideMatchGroup.GroupByIndividualPsm(AllPsms);
// If we don't have enough peptides to train at the peptide level, we will train at the PSM level
peptideGroups = SpectralMatchGroup.GroupByIndividualPsm(AllPsms);
UsePeptideLevelQValueForTraining = false;
}

int numGroups = 4;
Expand Down Expand Up @@ -161,7 +163,10 @@ public string ComputePEPValuesForAllPSMs()
sumOfAllAmbiguousPeptidesResolved += ambiguousPeptidesResolved;
}

return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved);
int positiveTrainingCount = PSMDataGroups.SelectMany(p => p).Count(p => p.Label);
int negativeTrainingcount = PSMDataGroups.SelectMany(p => p).Count(p => !p.Label);

return AggregateMetricsForOutput(allMetrics, sumOfAllAmbiguousPeptidesResolved, positiveTrainingCount, negativeTrainingcount, QValueCutoff);
}

/// <summary>
Expand All @@ -182,7 +187,7 @@ public void BuildFileSpecificDictionaries(List<SpectralMatch> trainingData, stri
}
}

public static List<int>[] GetPeptideGroupIndices(List<PeptideMatchGroup> peptides, int numGroups)
public static List<int>[] GetPeptideGroupIndices(List<SpectralMatchGroup> peptides, int numGroups)
{
List<int>[] groupsOfIndices = new List<int>[numGroups];

Expand Down Expand Up @@ -241,7 +246,7 @@ static List<List<int>> DivideListIntoGroups(List<int> list, int numGroups)
}

public IEnumerable<PsmData> CreatePsmData(string searchType,
List<PeptideMatchGroup> peptideGroups, List<int> peptideGroupIndices)
List<SpectralMatchGroup> peptideGroups, List<int> peptideGroupIndices)
{
object psmDataListLock = new object();
List<PsmData> psmDataList = new List<PsmData>();
Expand All @@ -261,7 +266,7 @@ public IEnumerable<PsmData> CreatePsmData(string searchType,
if (GlobalVariables.StopLoops) { return; }

int modCount = 0;
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatchByMod().Where(psm => psm != null))
foreach (var psm in peptideGroups[peptideGroupIndices[i]].GetBestMatches().Where(psm => psm != null))
{
PsmData newPsmData = new PsmData();
if (searchType == "crosslink" && ((CrosslinkSpectralMatch)psm)?.BetaPeptide != null)
Expand Down Expand Up @@ -331,7 +336,9 @@ public IEnumerable<PsmData> CreatePsmData(string searchType,
return pda.AsEnumerable();
}

public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificationMetrics> allMetrics, int sumOfAllAmbiguousPeptidesResolved)
public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificationMetrics> allMetrics, int sumOfAllAmbiguousPeptidesResolved,
int positiveTrainingCount, int negativeTrainingCount, double qValueCutoff)

{
List<double> accuracy = allMetrics.Select(m => m.Accuracy).ToList();
List<double> areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList();
Expand Down Expand Up @@ -381,11 +388,14 @@ public static string AggregateMetricsForOutput(List<CalibratedBinaryClassificati
s.AppendLine("* NegativePrecision: " + negativePrecision.Average().ToString());
s.AppendLine("* NegativeRecall: " + negativeRecall.Average().ToString());
s.AppendLine("* Count of Ambiguous Peptides Removed: " + sumOfAllAmbiguousPeptidesResolved.ToString());
s.AppendLine("* Q-Value Cutoff for Training Targets: " + qValueCutoff);
s.AppendLine("* Targets Used for Training: " + positiveTrainingCount.ToString());
s.AppendLine("* Decoys Used for Training: " + negativeTrainingCount.ToString());
s.AppendLine("************************************************************");
return s.ToString();
}

public int Compute_PSM_PEP(List<PeptideMatchGroup> peptideGroups,
public int Compute_PSM_PEP(List<SpectralMatchGroup> peptideGroups,
List<int> peptideGroupIndices,
MLContext mLContext, TransformerChain<BinaryPredictionTransformer<Microsoft.ML.Calibrators.CalibratedModelParametersBase<Microsoft.ML.Trainers.FastTree.FastTreeBinaryModelParameters, Microsoft.ML.Calibrators.PlattCalibrator>>> trainedModel, string searchType, string outputFolder)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,41 @@

namespace EngineLayer
{
public class PeptideMatchGroup : IEnumerable<SpectralMatch>
public class SpectralMatchGroup : IEnumerable<SpectralMatch>
{
public string PeptideFullSequence { get; }
public string BaseSequence { get; }
public List<SpectralMatch> SpectralMatches { get; }

/// <summary>
/// This class groups all spectral matches associated with a given peptide together,
/// This class groups all spectral matches associated with a given sequence
/// to facilitate the calculation of PEP values.
/// </summary>
/// <param name="fullPeptideSeq"> The full sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the full sequence</param>
public PeptideMatchGroup(string fullPeptideSeq, List<SpectralMatch> spectralMatches)
/// <param name="baseSequence"> The sequence to be used for grouping</param>
/// <param name="spectralMatches"> Every spectral match that matches the sequence</param>
public SpectralMatchGroup(string baseSequence, List<SpectralMatch> spectralMatches)
{
PeptideFullSequence = fullPeptideSeq;
BaseSequence = baseSequence;
SpectralMatches = spectralMatches;
}

public static List<PeptideMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByBaseSequence(List<SpectralMatch> spectralMatches)
{
// This groups psms by base sequence, ensuring that PSMs with the same base sequence but different modifications are grouped together when training.

// TODO: Determine if it's better to group PSMs by base sequence or by full sequence.
return spectralMatches.GroupBy(p => p.BaseSequence)
.Select(group => new PeptideMatchGroup(group.Key, group.ToList()))
.Select(group => new SpectralMatchGroup(group.Key, group.ToList()))
.OrderByDescending(matchGroup => matchGroup.Count())
.ThenByDescending(matchGroup => matchGroup.BestMatch.Score)
.ToList();
}

public IEnumerable<SpectralMatch> GetBestMatchByMod()
{
/// <summary>
/// Returns the top-scoring PSM for each full sequence in the spectral match group.
/// i.e., if the same base sequence has multiple modifications, this function will return the PSM with the highest score for each modification.
/// </summary>
public IEnumerable<SpectralMatch> GetBestMatches()
{
return SpectralMatches.GroupBy(p => p.FullSequence).Select(g => g.MaxBy(p => p));
}

Expand All @@ -48,9 +52,9 @@ public IEnumerable<SpectralMatch> GetBestMatchByMod()
/// </summary>
/// <param name="spectralMatches"></param>
/// <returns></returns>
public static List<PeptideMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
public static List<SpectralMatchGroup> GroupByIndividualPsm(List<SpectralMatch> spectralMatches)
{
return spectralMatches.Select(psm => new PeptideMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
return spectralMatches.Select(psm => new SpectralMatchGroup(psm.FullSequence, new List<SpectralMatch> { psm }))
.ToList();
}

Expand Down
2 changes: 1 addition & 1 deletion MetaMorpheus/TaskLayer/CalibrationTask/CalibrationTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ protected override MyTaskResults RunSpecific(string OutputFolder, List<DbForTask
// we use a multiplier of 4 for the tolerance for files that are not calibrated
fileSpecificParams.PrecursorMassTolerance = new PpmTolerance((4 * acquisitionResults.PsmPrecursorIqrPpmError) + Math.Abs(acquisitionResults.PsmPrecursorMedianPpmError));
fileSpecificParams.ProductMassTolerance = new PpmTolerance((4 * acquisitionResults.PsmProductIqrPpmError) + Math.Abs(acquisitionResults.PsmProductMedianPpmError));

// generate calibration function and shift data points
Status("Calibrating...", new List<string> { taskId, "Individual Spectra Files" });
CalibrationEngine engine = new(myMsDataFile, acquisitionResults, combinedParams, FileSpecificParameters, new List<string> { taskId, "Individual Spectra Files", originalUncalibratedFilenameWithoutExtension });
Expand Down Expand Up @@ -200,7 +201,6 @@ protected override MyTaskResults RunSpecific(string OutputFolder, List<DbForTask
return MyTaskResults;
}


private void CalibrationWarnMessage(DataPointAquisitionResults acquisitionResults)
{
// provide a message indicating why we couldn't calibrate
Expand Down
88 changes: 51 additions & 37 deletions MetaMorpheus/TaskLayer/FilteredPsms.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,43 +112,14 @@ public static FilteredPsms Filter(IEnumerable<SpectralMatch> psms,
}
}

if (!includeHighQValuePsms)
{
filteredPsms = filterType.Equals(FilterType.QValue)
? psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null
&& p.GetFdrInfo(filterAtPeptideLevel).QValue <= filterThreshold
&& p.GetFdrInfo(filterAtPeptideLevel).QValueNotch <= filterThreshold).ToList()
: psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null && p.GetFdrInfo(filterAtPeptideLevel).PEP_QValue <= filterThreshold).ToList();
}
else
{
filteredPsms = psms.ToList();
}

if (!includeDecoys)
{
filteredPsms.RemoveAll(p => p.IsDecoy);
}
if (!includeContaminants)
{
filteredPsms.RemoveAll(p => p.IsContaminant);
}
if (!includeAmbiguous)
{
filteredPsms.RemoveAll(p => p.BaseSequence.IsNullOrEmpty());
}
if (!includeAmbiguousMods)
{
filteredPsms.RemoveAll(p => p.FullSequence.IsNullOrEmpty());
}
if (filterAtPeptideLevel)
{
//Choose the top scoring PSM for each peptide
filteredPsms = filteredPsms
.OrderByDescending(p => p)
.GroupBy(b => b.FullSequence)
.Select(b => b.FirstOrDefault()).ToList();
}
filteredPsms = psms.Where(psm =>
(includeDecoys || !psm.IsDecoy)
&& (includeContaminants || !psm.IsContaminant)
&& (includeAmbiguous || !psm.BaseSequence.IsNullOrEmpty())
&& (includeAmbiguousMods || !psm.FullSequence.IsNullOrEmpty()))
.FilterByQValue(includeHighQValuePsms, filterThreshold, filterAtPeptideLevel, filterType)
.CollapseToPeptides(filterAtPeptideLevel)
.ToList();

return new FilteredPsms(filteredPsms, filterType, filterThreshold, filteringNotPerformed, filterAtPeptideLevel);
}
Expand All @@ -163,4 +134,47 @@ System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
return FilteredPsmsList.GetEnumerator();
}
}

/// <summary>
/// Extension methods that implement the q-value filtering and peptide-level
/// collapsing steps as composable, lazily evaluated LINQ-style operators.
/// </summary>
public static class FilteredPsmsExtensions
{
    /// <summary>
    /// Optionally reduces a sequence of spectral matches to a single match per full sequence.
    /// </summary>
    /// <param name="psms">The spectral matches to collapse.</param>
    /// <param name="filterAtPeptideLevel">
    /// When false, <paramref name="psms"/> is returned unchanged. When true, the matches are
    /// sorted in descending order using the default comparison for <see cref="SpectralMatch"/>,
    /// grouped by <c>FullSequence</c>, and only the first (highest-ranked) match of each group is kept.
    /// </param>
    /// <returns>Either the input sequence or one representative match per full sequence.</returns>
    public static IEnumerable<SpectralMatch> CollapseToPeptides(this IEnumerable<SpectralMatch> psms, bool filterAtPeptideLevel)
    {
        if (!filterAtPeptideLevel)
        {
            return psms;
        }

        // OrderByDescending is a stable sort and GroupBy preserves the source order inside each group,
        // so the first element of every group is the top-ranked match for that full sequence.
        return psms
            .OrderByDescending(p => p)
            .GroupBy(b => b.FullSequence)
            .Select(b => b.FirstOrDefault());
    }

    /// <summary>
    /// Lazily yields the spectral matches that pass the requested q-value filter.
    /// </summary>
    /// <param name="psms">The spectral matches to filter.</param>
    /// <param name="includeHighQValuePsms">When true, no q-value filtering is applied and every match is yielded.</param>
    /// <param name="qValueThreshold">Inclusive upper bound applied to the selected q-value(s).</param>
    /// <param name="filterAtPeptideLevel">Passed to <c>GetFdrInfo</c> to select which FDR information is consulted.</param>
    /// <param name="filterType">
    /// <c>FilterType.PepQValue</c> compares <c>PEP_QValue</c> against the threshold; any other value
    /// requires both <c>QValue</c> and <c>QValueNotch</c> to be at or below the threshold.
    /// </param>
    /// <returns>The matches that satisfy the filter, in their original order.</returns>
    public static IEnumerable<SpectralMatch> FilterByQValue(this IEnumerable<SpectralMatch> psms, bool includeHighQValuePsms, double qValueThreshold, bool filterAtPeptideLevel, FilterType filterType)
    {
        foreach (var psm in psms)
        {
            if (includeHighQValuePsms)
            {
                yield return psm;
                continue;
            }

            // Fetch the FDR info once per match. A match without FDR info cannot be shown to pass
            // the threshold, so it is dropped instead of causing a NullReferenceException.
            var fdrInfo = psm.GetFdrInfo(filterAtPeptideLevel);
            if (fdrInfo == null)
            {
                continue;
            }

            bool passesThreshold = filterType == FilterType.PepQValue
                ? fdrInfo.PEP_QValue <= qValueThreshold
                : fdrInfo.QValue <= qValueThreshold && fdrInfo.QValueNotch <= qValueThreshold;

            if (passesThreshold)
            {
                yield return psm;
            }
        }
    }
}
}

0 comments on commit a505e55

Please sign in to comment.