diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index ac9c36189..6261f1f29 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -1,4 +1,5 @@ -using MathNet.Numerics.Statistics; +using Easy.Common.Extensions; +using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; using System.IO; @@ -13,20 +14,28 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; + private readonly HashSet _peptideModifiedSequencesToQuantify; - public FlashLfqResults(List spectraFiles, List identifications) + public FlashLfqResults(List spectraFiles, List identifications, HashSet peptides = null) { SpectraFiles = spectraFiles; PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); + if(peptides == null || !peptides.Any()) + { + peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet(); + } + _peptideModifiedSequencesToQuantify = peptides; foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); } - foreach (Identification id in identifications) + + // Only quantify peptides within the set of valid peptide modified (full) sequences. This is done to enable pepitde-level FDR control of reported results + foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence))) { if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide)) { @@ -120,6 +129,7 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) var groupedPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq == 1) .GroupBy(p => p.Identifications.First().ModifiedSequence) + .Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key)) .ToList(); foreach (var sequenceWithPeaks in groupedPeaks) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 0efff5f32..2e877d1f7 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -11,6 +11,7 @@ using System.Threading.Tasks; using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; +using Easy.Common.Extensions; [assembly: InternalsVisibleTo("TestFlashLFQ")] @@ -57,6 +58,12 @@ public class FlashLfqEngine private Stopwatch _globalStopwatch; private List _allIdentifications; /// + /// These peptides will be reported in the QuantifiedPeptides output and used for protein quant. + /// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable + /// peptide-level FDR filtering + /// + public HashSet PeptidesModifiedSequencesToQuantify { get; init; } + /// /// Dictionary linking a modified sequence to a List of tuples containing /// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the /// isotopes for a given peptide @@ -67,6 +74,13 @@ public class FlashLfqEngine internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + /// + /// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra + /// + /// A list of identifications corresponding to MS2 peptide detections. One ID per peptide per file + /// Optional. Bool indicating whether peaks should be integrated before quantification. It is HIGHLY recommended this is set to FALSE + /// Optional. A list of strings corresponding to the modified sequences of peptides that should be quantified/used for + /// protein level quant. Reccommended use is to pass in the full sequence of every peptide at 1% peptide-level FDR public FlashLfqEngine( List allIdentifications, bool normalize = false, @@ -93,6 +107,7 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = false, bool pairedSamples = false, + List peptideSequencesToUse = null, int? randomSeed = null) { Loaders.LoadElements(); @@ -128,6 +143,8 @@ public FlashLfqEngine( McmcSteps = mcmcSteps; McmcBurninSteps = mcmcBurninSteps; UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant; + PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? new HashSet(peptideSequencesToUse) + : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); RandomSeed = randomSeed; if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount) @@ -149,7 +166,7 @@ public FlashLfqResults Run() { _globalStopwatch.Start(); _ms1Scans = new Dictionary(); - _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications); + _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify); // build m/z index keys CalculateTheoreticalIsotopeDistributions(); @@ -588,7 +605,8 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie // Construct a distribution of ppm errors for all MSMS peaks in the acceptor file var apexToAcceptorFilePeakDict = new Dictionary(); List ppmErrors = new List(); - foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null)) + foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null + && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence))) { if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak)) { @@ -677,6 +695,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1 && p.IsotopicEnvelopes.Any() + && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); @@ -1060,11 +1079,35 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - storedPeak.MergeFeatureWith(tryPeak, Integrate); + if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) + { + if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + { + storedPeak.MergeFeatureWith(tryPeak, Integrate); + } + else + { + // If the stored peak id isn't in the list of peptides to quantify, overwrite it + errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; + } + } + else + { + continue; // if the tryPeak id isn't in the list of peptides to quantify, it is discarded + } + } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - continue; + if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + { + continue; + } + else + { + // If the stored peak id isn't in the list of peptides to quantify, overwrite it + errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; + } } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) { diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 787b2800b..b94e32cf0 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -15,6 +15,7 @@ using UsefulProteomicsDatabases; using ChromatographicPeak = FlashLFQ.ChromatographicPeak; using Stopwatch = System.Diagnostics.Stopwatch; +using TopDownProteomics; namespace Test { @@ -475,15 +476,31 @@ public static void TestFlashLfqMergeResults() Assert.AreEqual(4, resultsA.SpectraFiles.Count); } + + /// + /// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs. + /// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing + /// MBR is tested by ensuring that IDs are transferred between runs + /// [Test] public static void TestFlashLfqMatchBetweenRuns() { List filesToWrite = new List { "mzml_1", "mzml_2" }; - List pepSequences = new List { "PEPTIDE", "PEPTIDEV", "PEPTIDEVV", "PEPTIDEVVV", "PEPTIDEVVVV", "PEPTIDEVVVVA", "PEPTIDEVVVVAA" }; + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", + "TARGETPEP", + "PEPTIDEVVV", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", + "PEPTIDEVVVVAA" + }; double intensity = 1e6; - double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.035, 1.04, 1.045, 1.05 }; - double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.035, 1.04, 1.055, 1.07 }; + double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 }; + double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 }; Loaders.LoadElements(); @@ -491,7 +508,7 @@ public static void TestFlashLfqMatchBetweenRuns() for (int f = 0; f < filesToWrite.Count; f++) { // 1 MS1 scan per peptide - MsDataScan[] scans = new MsDataScan[7]; + MsDataScan[] scans = new MsDataScan[8]; for (int p = 0; p < pepSequences.Count; p++) { @@ -533,39 +550,53 @@ public static void TestFlashLfqMatchBetweenRuns() Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV", new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List { pg }); Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE", new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List { pg }); Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV", new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List { pg }); - // missing ID 8 - MBR feature + // missing ID 8 - MBR feature - "PEPTIDEVV" + Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List { pg }); Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List { pg }); // Adding additional peaks to check interquartile range Identification id11 = new Identification(file1, "PEPTIDEVVVVA", "PEPTIDEVVVVA", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List { pg }); Identification id12 = new Identification(file1, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[6] + 0.001, 1, new List { pg }); - + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file1Rt[7] + 0.001, 1, new List { pg }); Identification id13 = new Identification(file2, "PEPTIDEVVVVA", "PEPTIDEVVVVA", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List { pg }); Identification id14 = new Identification(file2, "PEPTIDEVVVVAA", "PEPTIDEVVVVAA", - new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[6] + 0.001, 1, new List { pg }); + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVVAA").MonoisotopicMass, file2Rt[7] + 0.001, 1, new List { pg }); + // Additional peaks, check that a non-confident ID (i.e., one that isn't included in the peptides to quantify list) is overwritten + // by a MBR transfer of a more confident ID + Identification id15 = new Identification(file1, "TARGETPEP", "TARGETPEP", + new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List { pg }); + Identification id16 = new Identification(file2, "DECOYPEP", "DECOYPEP", + new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file2Rt[3] + 0.001, 1, new List { pg }); + + // Additional peaks to ensure that non-confident IDs aren't merged with confident IDs at the peak level + Identification id17 = new Identification(file1, "DECOYPEP", "DECOYPEP", + new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List { pg }); + Identification id18 = new Identification(file1, "DECOYPEP2", "DECOYPEP2", + new Proteomics.AminoAcidPolymer.Peptide("TARGETPEP").MonoisotopicMass, file1Rt[3] + 0.001, 1, new List { pg }); // create the FlashLFQ engine FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); FlashLfqEngine interquartileEngine = new FlashLfqEngine( new List { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true); + FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, + peptideSequencesToUse: pepSequences); - // run the engine + //run the engine var results = engine.Run(); Assert.That(results.Peaks[file2].Count == 5); @@ -606,6 +637,15 @@ public static void TestFlashLfqMatchBetweenRuns() Assert.That(!peak.RtStdDev.HasValue); Assert.That(peak.RtInterquartileRange.HasValue); Assert.That(peak.RtInterquartileRange, Is.EqualTo(rtDiffs.InterquartileRange()).Within(0.01)); + + // The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff) + // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids + results = engineAmbiguous.Run(); + Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP")); + Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP")))); + Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP")); + Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2); + } [Test] @@ -1382,6 +1422,13 @@ public static void RealDataMbrTest() { Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0); } + + List peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList(); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse); + results = engine.Run(); + var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList(); + + CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse); } [Test] @@ -1620,7 +1667,8 @@ public static void TestAmbiguousFraction() peak1.CalculateIntensityForThisFeature(false); peak2.CalculateIntensityForThisFeature(false); - FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }); + FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }, + new HashSet { "peptide1", "peptide2"}); res.Peaks[fraction1].Add(peak1); res.Peaks[fraction2].Add(peak2); res.CalculatePeptideResults(quantifyAmbiguousPeptides: false);