Skip to content

Commit

Permalink
FlashLFQ now enables selection of peptides to be quantified (#779)
Browse files Browse the repository at this point in the history
* Added optional arg to FlashLFQ that allows users to select which peptides will be quantified/reported

* added some comments

* increasing test coverage

* Increased test coverage

* Renamed, added comment

---------

Co-authored-by: Edwin Laboy <[email protected]>
  • Loading branch information
Alexander-Sol and elaboy authored May 20, 2024
1 parent a93dfa2 commit 84b2e9d
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 23 deletions.
16 changes: 13 additions & 3 deletions mzLib/FlashLFQ/FlashLFQResults.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using MathNet.Numerics.Statistics;
using Easy.Common.Extensions;
using MathNet.Numerics.Statistics;
using System;
using System.Collections.Generic;
using System.IO;
Expand All @@ -13,20 +14,28 @@ public class FlashLfqResults
public readonly Dictionary<string, Peptide> PeptideModifiedSequences;
public readonly Dictionary<string, ProteinGroup> ProteinGroups;
public readonly Dictionary<SpectraFileInfo, List<ChromatographicPeak>> Peaks;
private readonly HashSet<string> _peptideModifiedSequencesToQuantify;

public FlashLfqResults(List<SpectraFileInfo> spectraFiles, List<Identification> identifications)
public FlashLfqResults(List<SpectraFileInfo> spectraFiles, List<Identification> identifications, HashSet<string> peptides = null)
{
SpectraFiles = spectraFiles;
PeptideModifiedSequences = new Dictionary<string, Peptide>();
ProteinGroups = new Dictionary<string, ProteinGroup>();
Peaks = new Dictionary<SpectraFileInfo, List<ChromatographicPeak>>();
if(peptides == null || !peptides.Any())
{
peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet();
}
_peptideModifiedSequencesToQuantify = peptides;

foreach (SpectraFileInfo file in spectraFiles)
{
Peaks.Add(file, new List<ChromatographicPeak>());
}

foreach (Identification id in identifications)

// Only quantify peptides within the set of valid peptide modified (full) sequences. This is done to enable pepitde-level FDR control of reported results
foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence)))
{
if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide))
{
Expand Down Expand Up @@ -120,6 +129,7 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides)
var groupedPeaks = filePeaks.Value
.Where(p => p.NumIdentificationsByFullSeq == 1)
.GroupBy(p => p.Identifications.First().ModifiedSequence)
.Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key))
.ToList();

foreach (var sequenceWithPeaks in groupedPeaks)
Expand Down
51 changes: 47 additions & 4 deletions mzLib/FlashLFQ/FlashLfqEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using System.Threading.Tasks;
using UsefulProteomicsDatabases;
using System.Runtime.CompilerServices;
using Easy.Common.Extensions;

[assembly: InternalsVisibleTo("TestFlashLFQ")]

Expand Down Expand Up @@ -57,6 +58,12 @@ public class FlashLfqEngine
private Stopwatch _globalStopwatch;
private List<Identification> _allIdentifications;
/// <summary>
/// These peptides will be reported in the QuantifiedPeptides output and used for protein quant.
/// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable
/// peptide-level FDR filtering
/// </summary>
public HashSet<string> PeptidesModifiedSequencesToQuantify { get; init; }
/// <summary>
/// Dictionary linking a modified sequence to a List of tuples containing
/// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the
/// isotopes for a given peptide
Expand All @@ -67,6 +74,13 @@ public class FlashLfqEngine
internal Dictionary<SpectraFileInfo, Ms1ScanInfo[]> _ms1Scans;
internal PeakIndexingEngine _peakIndexingEngine;

/// <summary>
/// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra
/// </summary>
/// <param name="allIdentifications">A list of identifications corresponding to MS2 peptide detections. One ID per peptide per file</param>
/// <param name="integrate">Optional. Bool indicating whether peaks should be integrated before quantification. It is HIGHLY recommended this is set to FALSE</param>
/// <param name="peptideSequencesToUse">Optional. A list of strings corresponding to the modified sequences of peptides that should be quantified/used for
/// protein level quant. Reccommended use is to pass in the full sequence of every peptide at 1% peptide-level FDR</param>
public FlashLfqEngine(
List<Identification> allIdentifications,
bool normalize = false,
Expand All @@ -93,6 +107,7 @@ public FlashLfqEngine(
int mcmcBurninSteps = 1000,
bool useSharedPeptidesForProteinQuant = false,
bool pairedSamples = false,
List<string> peptideSequencesToUse = null,
int? randomSeed = null)
{
Loaders.LoadElements();
Expand Down Expand Up @@ -128,6 +143,8 @@ public FlashLfqEngine(
McmcSteps = mcmcSteps;
McmcBurninSteps = mcmcBurninSteps;
UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant;
PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? new HashSet<string>(peptideSequencesToUse)
: allIdentifications.Select(id => id.ModifiedSequence).ToHashSet();
RandomSeed = randomSeed;

if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount)
Expand All @@ -149,7 +166,7 @@ public FlashLfqResults Run()
{
_globalStopwatch.Start();
_ms1Scans = new Dictionary<SpectraFileInfo, Ms1ScanInfo[]>();
_results = new FlashLfqResults(_spectraFileInfo, _allIdentifications);
_results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify);

// build m/z index keys
CalculateTheoreticalIsotopeDistributions();
Expand Down Expand Up @@ -588,7 +605,8 @@ private MbrScorer BuildMbrScorer(List<ChromatographicPeak> acceptorFileIdentifie
// Construct a distribution of ppm errors for all MSMS peaks in the acceptor file
var apexToAcceptorFilePeakDict = new Dictionary<IndexedMassSpectralPeak, ChromatographicPeak>();
List<double> ppmErrors = new List<double>();
foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null))
foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null
&& PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence)))
{
if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak))
{
Expand Down Expand Up @@ -677,6 +695,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile)
!p.IsMbrPeak
&& p.NumIdentificationsByFullSeq == 1
&& p.IsotopicEnvelopes.Any()
&& PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify
&& !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence)
&& (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList();

Expand Down Expand Up @@ -1060,11 +1079,35 @@ private void RunErrorChecking(SpectraFileInfo spectraFile)

if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak)
{
storedPeak.MergeFeatureWith(tryPeak, Integrate);
if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence))
{
if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence))
{
storedPeak.MergeFeatureWith(tryPeak, Integrate);
}
else
{
// If the stored peak id isn't in the list of peptides to quantify, overwrite it
errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak;
}
}
else
{
continue; // if the tryPeak id isn't in the list of peptides to quantify, it is discarded
}

}
else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak)
{
continue;
if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence))
{
continue;
}
else
{
// If the stored peak id isn't in the list of peptides to quantify, overwrite it
errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak;
}
}
else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak)
{
Expand Down
80 changes: 64 additions & 16 deletions mzLib/TestFlashLFQ/TestFlashLFQ.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
using UsefulProteomicsDatabases;
using ChromatographicPeak = FlashLFQ.ChromatographicPeak;
using Stopwatch = System.Diagnostics.Stopwatch;
using TopDownProteomics;

namespace Test
{
Expand Down Expand Up @@ -475,23 +476,39 @@ public static void TestFlashLfqMergeResults()
Assert.AreEqual(4, resultsA.SpectraFiles.Count);
}


/// <summary>
/// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs.
/// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing
/// MBR is tested by ensuring that IDs are transferred between runs
/// </summary>
[Test]
public static void TestFlashLfqMatchBetweenRuns()
{
List<string> filesToWrite = new List<string> { "mzml_1", "mzml_2" };
List<string> pepSequences = new List<string> { "PEPTIDE", "PEPTIDEV", "PEPTIDEVV", "PEPTIDEVVV", "PEPTIDEVVVV", "PEPTIDEVVVVA", "PEPTIDEVVVVAA" };
List<string> pepSequences = new List<string>
{
"PEPTIDE",
"PEPTIDEV",
"PEPTIDEVV",
"TARGETPEP",
"PEPTIDEVVV",
"PEPTIDEVVVV",
"PEPTIDEVVVVA",
"PEPTIDEVVVVAA"
};
double intensity = 1e6;

double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.035, 1.04, 1.045, 1.05 };
double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.035, 1.04, 1.055, 1.07 };
double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 };
double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 };

Loaders.LoadElements();

// generate mzml files (5 peptides each)
for (int f = 0; f < filesToWrite.Count; f++)
{
// 1 MS1 scan per peptide
MsDataScan[] scans = new MsDataScan[7];
MsDataScan[] scans = new MsDataScan[8];

for (int p = 0; p < pepSequences.Count; p++)
{
Expand Down Expand Up @@ -533,39 +550,53 @@ public static void TestFlashLfqMatchBetweenRuns()
Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });

Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List<ProteinGroup> { pg });
// missing ID 8 - MBR feature
// missing ID 8 - MBR feature - "PEPTIDEVV"

Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });

// Adding additional peaks to check interquartile range
Identification id11 = new Identification(file1, "PEPTIDEVVVVA", "PEPTIDEVVVVA",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id12 = new Identification(file1, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });

new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[7] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id13 = new Identification(file2, "PEPTIDEVVVVA", "PEPTIDEVVVVA",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id14 = new Identification(file2, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA",
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List<ProteinGroup> { pg });
new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[7] + 0.001, 1, new List<ProteinGroup> { pg });

// Additional peaks, check that a non-confident ID (i.e., one that isn't included in the peptides to quantify list) is overwritten
// by a MBR transfer of a more confident ID
Identification id15 = new Identification(file1, "TARGETPEP", "TARGETPEP",
new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id16 = new Identification(file2, "DECOYPEP", "DECOYPEP",
new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });

// Additional peaks to ensure that non-confident IDs aren't merged with confident IDs at the peak level
Identification id17 = new Identification(file1, "DECOYPEP", "DECOYPEP",
new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });
Identification id18 = new Identification(file1, "DECOYPEP2", "DECOYPEP2",
new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List<ProteinGroup> { pg });

// create the FlashLFQ engine
FlashLfqEngine engine = new FlashLfqEngine(new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true);
FlashLfqEngine interquartileEngine = new FlashLfqEngine(
new List<Identification> { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true);
FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List<Identification> { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true,
peptideSequencesToUse: pepSequences);


// run the engine
//run the engine
var results = engine.Run();

Assert.That(results.Peaks[file2].Count == 5);
Expand Down Expand Up @@ -606,6 +637,15 @@ public static void TestFlashLfqMatchBetweenRuns()
Assert.That(!peak.RtStdDev.HasValue);
Assert.That(peak.RtInterquartileRange.HasValue);
Assert.That(peak.RtInterquartileRange, Is.EqualTo(rtDiffs.InterquartileRange()).Within(0.01));

// The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff)
// gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids
results = engineAmbiguous.Run();
Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP"));
Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP"))));
Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP"));
Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2);

}

[Test]
Expand Down Expand Up @@ -1382,6 +1422,13 @@ public static void RealDataMbrTest()
{
Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0);
}

List<string> peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList();
engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse);
results = engine.Run();
var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList();

CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse);
}

[Test]
Expand Down Expand Up @@ -1620,7 +1667,8 @@ public static void TestAmbiguousFraction()
peak1.CalculateIntensityForThisFeature(false);
peak2.CalculateIntensityForThisFeature(false);

FlashLfqResults res = new FlashLfqResults(new List<SpectraFileInfo> { fraction1, fraction2 }, new List<Identification> { id1, id2, id3 });
FlashLfqResults res = new FlashLfqResults(new List<SpectraFileInfo> { fraction1, fraction2 }, new List<Identification> { id1, id2, id3 },
new HashSet<string> { "peptide1", "peptide2"});
res.Peaks[fraction1].Add(peak1);
res.Peaks[fraction2].Add(peak2);
res.CalculatePeptideResults(quantifyAmbiguousPeptides: false);
Expand Down

0 comments on commit 84b2e9d

Please sign in to comment.