From 7055d844681af53fc940a0db57e27f7cf362e371 Mon Sep 17 00:00:00 2001 From: mzhastings Date: Fri, 6 Sep 2024 11:08:44 -0500 Subject: [PATCH 1/2] Quantifiable interfaces for results reading in FlashLFQ (#793) * created interface IQuantifiable * created IQuantifiableRecord and IQuantifiable interfaces, deleted origional IQuantifiable in FlashLFQ * created IdentificationAdapter class and edited IquantifiableRecord interface * added implementation of IQuantifiableRecord to MSFraggerPSM abd wrote tests * created test for IdentificationAdapter method * created Dictionary linking MSFragger file names to their corresponding full file paths * edited FileNameToFilePath method in MSFraggerPsm to account for MSFragger file name additions and wrote tests * edited MsFraggerPsm to account for differences in result files (probabilty v peptideprophetprobability) * removed unneeded test from TestQuantifiedPeaks * made recommended changes by reviewers * made recommended changes * made recommended changes --------- Co-authored-by: trishorts Co-authored-by: Nic Bollis --- .../ResultsReading/MzLibExtensions.cs | 64 +++++++++++++ .../BaseClasses/IQuantifiableRecord.cs | 55 ++++++++++++ .../BaseClasses/IQuantifiableResultFile.cs | 29 ++++++ .../IndividualResultRecords/MsFraggerPsm.cs | 89 +++++++++++++++++- .../ResultFiles/MsFraggerPsmFile.cs | 40 ++++++++- .../SmallCalibratibleYeastFragger_psm.tsv | 6 ++ .../TestMsFraggerResultFiles.cs | 15 ++++ mzLib/Test/Test.csproj | 3 + .../TestFlashLFQ/TestIdentificationAdapter.cs | 90 +++++++++++++++++++ 9 files changed, 385 insertions(+), 6 deletions(-) create mode 100644 mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs create mode 100644 mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs create mode 100644 mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs create mode 100644 mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv create mode 100644 
mzLib/TestFlashLFQ/TestIdentificationAdapter.cs
diff --git a/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
new file mode 100644
index 000000000..396e76a9a
--- /dev/null
+++ b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
@@ -0,0 +1,54 @@
+using Readers.ExternalResults.BaseClasses;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace FlashLFQ
+{
+    public static class MzLibExtensions
+    {
+        /// <summary>
+        /// Makes a list of identification objects usable by FlashLFQ from an IQuantifiableResultFile
+        /// </summary>
+        /// <param name="quantifiable">Result file whose quantifiable records are converted</param>
+        /// <returns>One Identification per record, in the order the records are enumerated</returns>
+        public static List<Identification> MakeIdentifications(this IQuantifiableResultFile quantifiable)
+        {
+            IEnumerable<IQuantifiableRecord> quantifiableRecords = quantifiable.GetQuantifiableResults();
+            List<Identification> identifications = new List<Identification>();
+            Dictionary<string, ProteinGroup> allProteinGroups = new Dictionary<string, ProteinGroup>();
+            Dictionary<string, SpectraFileInfo> allFiles = new Dictionary<string, SpectraFileInfo>();
+
+            foreach (var record in quantifiableRecords)
+            {
+                // One SpectraFileInfo is created per distinct file name, cached, and reused by later records
+                if (!allFiles.TryGetValue(record.FileName, out SpectraFileInfo file))
+                {
+                    // placeholder values for SpectraFileInfo that will be edited later
+                    file = new SpectraFileInfo(record.FileName, "", 1, 1, 1);
+                    allFiles.Add(record.FileName, file);
+                }
+
+                List<ProteinGroup> proteinGroups = new();
+                foreach (var info in record.ProteinGroupInfos)
+                {
+                    // Protein groups are cached by accession; a single lookup either finds or creates the group
+                    if (!allProteinGroups.TryGetValue(info.proteinAccessions, out ProteinGroup proteinGroup))
+                    {
+                        proteinGroup = new ProteinGroup(info.proteinAccessions, info.geneName, info.organism);
+                        allProteinGroups.Add(info.proteinAccessions, proteinGroup);
+                    }
+                    proteinGroups.Add(proteinGroup);
+                }
+
+                identifications.Add(new Identification(file, record.BaseSequence, record.ModifiedSequence,
+                    record.MonoisotopicMass, record.RetentionTime, record.ChargeState, proteinGroups));
+            }
+
+            return identifications;
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
new file mode 100644
index 000000000..87100cfa5
--- /dev/null
+++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Readers.ExternalResults.BaseClasses
+{
+    ///
+    /// Defines the information needed to create the identification object usable by FlashLFQ
+    ///
+    public interface IQuantifiableRecord
+    {
+        ///
+        /// The file name of the MS Data file in which the identification was made
+        ///
+        public string FileName { get; }
+
+        ///
+        /// A list of tuples, each of which represent a protein.
+        /// Each tuple contains the accession number, gene name, and organism associated with the given result.
+ /// + public List<(string proteinAccessions, string geneName, string organism)> ProteinGroupInfos { get; } + + /// + /// The amino acid sequence of the identified peptide + /// + public string BaseSequence { get; } + + /// + /// The amino acid sequence and the associated post-translation modifications of the identified peptide + /// + public string ModifiedSequence { get; } + + /// + /// The retention time (in minutes) associated with the result + /// + public double RetentionTime { get; } + + /// + /// The charge state associated with the result + /// + public int ChargeState { get; } + + /// + /// Defines whether or not the result is a decoy identification + /// + public bool IsDecoy { get; } + + /// + /// The mass of the monoisotopic peptide (i.e., no c13 or n15 atoms are present, the lowest possible mass) + /// + public double MonoisotopicMass { get; } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs new file mode 100644 index 000000000..9e1a1a54e --- /dev/null +++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers.ExternalResults.BaseClasses +{ + /// + /// Outlines behavior to turn results into an IEnumerable of IQuantifiableRecords + /// and to create the dictionary linking file names from the external result files + /// to their local file paths which are used to make the identification object + /// + public interface IQuantifiableResultFile : IResultFile + { + /// + /// Returns every result in the result file as an IQuantifiableRecord + /// + /// Enumerable that contains identifications for a peptide + public IEnumerable GetQuantifiableResults(); + + /// + /// Links the file name associated with the protein to the raw file path of MassSpec 
data + /// + /// list of file paths associated with each distinct record + /// Dictionary of file names and their associted full paths + public Dictionary FileNametoFilePath(List fullFilePath); + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs index 3dd5b1976..54dc19987 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs @@ -14,10 +14,14 @@ using Proteomics; using static System.Net.Mime.MediaTypeNames; using ThermoFisher.CommonCore.Data.Interfaces; +using Readers.ExternalResults.BaseClasses; +using System.Reflection.Metadata.Ecma335; +using System.Runtime.CompilerServices; +using Easy.Common.Extensions; namespace Readers { - public class MsFraggerPsm + public class MsFraggerPsm : IQuantifiableRecord { public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) { @@ -59,7 +63,7 @@ public class MsFraggerPsm [Name("Retention")] public double RetentionTime { get; set; } - + [Name("Observed Mass")] public double ObservedMass { get; set; } @@ -90,7 +94,11 @@ public class MsFraggerPsm [Name("Nextscore")] public double NextScore { get; set; } - [Name("PeptideProphet Probability")] + /// + /// MsFragger v22.0 output renames the header "PeptideProphet Probability" as just "Probability". + /// Headers are mutually exclusive, will not both occur in the same file. 
+ /// + [Name("PeptideProphet Probability", "Probability")] public double PeptideProphetProbability { get; set; } [Name("Number of Enzymatic Termini")] @@ -155,5 +163,78 @@ public class MsFraggerPsm public int OneBasedScanNumber => _oneBasedScanNumber ??= int.Parse(Spectrum.Split('.')[1]); #endregion + + #region IQuantifiableRecord Implementation + + [Ignore] public string FileName => SpectrumFilePath; + + [Ignore] public List<(string, string, string)> ProteinGroupInfos + { + get + { + _proteinGroupInfos ??= AddProteinGroupInfos(); + return _proteinGroupInfos; + } + } + + /// + /// Creates a list of tuples, each of which represents a protein. + /// Each tuple contains the accession number, gene name, and organism. + /// These parameters are used to create a ProteinGroup object, + /// which is needed to make an identification. + /// + /// + private List<(string, string, string)> AddProteinGroupInfos () + { + _proteinGroupInfos = new List<(string, string, string)> (); + string protein = Protein; + + char[] delimiterChars = { '|', '_'}; + string[] proteinInfo = protein.Split(delimiterChars); + + string proteinAccessions; + string geneName; + string organism; + + // Fasta header is parsed to separate the accession number, gene name, and organism. + // If the protein does not have this information, it will be assigned an empty string. + // Ideally, a future refactor would create a method for parsing fasta headers + // that is shared by Readers and UsefulProteomicsDatabases. + proteinAccessions = proteinInfo.Length >= 2 ? proteinInfo[1] : ""; + geneName = proteinInfo.Length >= 3 ? proteinInfo[2] : ""; + organism = proteinInfo.Length >= 4 ? 
proteinInfo[3] : ""; ; + + _proteinGroupInfos.Add((proteinAccessions, geneName, organism)); + + if (MappedProteins.IsNullOrEmpty()) return _proteinGroupInfos; + + string mappedProteins = MappedProteins; + string[] allMappedProteinInfo = mappedProteins.Split(','); + foreach (var singleMappedProteinInfo in allMappedProteinInfo) + { + string[] mappedProteinInfo = singleMappedProteinInfo.Split(delimiterChars); + + proteinAccessions = mappedProteinInfo.Length >= 2 ? mappedProteinInfo[1] : ""; + geneName = mappedProteinInfo.Length >= 3 ? mappedProteinInfo[2] : ""; + organism = mappedProteinInfo.Length >= 4 ? mappedProteinInfo[3] : ""; + + _proteinGroupInfos.Add((proteinAccessions, geneName, organism)); + } + + return _proteinGroupInfos; + } + + [Ignore] private List<(string, string, string)> _proteinGroupInfos; + + [Ignore] public string ModifiedSequence => FullSequence.IsNullOrEmpty() ? BaseSequence : FullSequence; + + [Ignore] public int ChargeState => Charge; + + // decoy reading isn't currently supported for MsFragger psms, this will be revisited later + [Ignore] public bool IsDecoy => false; + + [Ignore] public double MonoisotopicMass => CalculatedPeptideMass; + + #endregion } -} +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs index 165d1c8d3..1aa80f885 100644 --- a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs +++ b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs @@ -4,10 +4,12 @@ using System.Text; using System.Threading.Tasks; using CsvHelper; +using MassSpectrometry; +using Readers.ExternalResults.BaseClasses; namespace Readers { - public class MsFraggerPsmFile : ResultFile, IResultFile + public class MsFraggerPsmFile : ResultFile, IQuantifiableResultFile { public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm; public override Software Software { get; set; } @@ -38,5 +40,39 @@ public 
override void WriteResults(string outputPath) csv.WriteRecord(result); } } + + public IEnumerable GetQuantifiableResults() => Results; + + /// + /// Creates a dictionary linking a shortened file name to its corresponding full file path + /// + /// list of all full file paths associted with a given result + /// dictionary with key fileName and value fullFilePath + public Dictionary FileNametoFilePath (List fullFilePath) + { + List rawFileNames = Results.Select(psm => psm.FileName).Distinct().ToList(); + fullFilePath = fullFilePath.Distinct().ToList(); + Dictionary allFiles = new Dictionary(); + + foreach(var fileName in rawFileNames) + { + string shortFileName = Path.GetFileName(fileName); + + // MSFragger results append the raw file with "interact-" and replace .raw with .pep.xml + // In order to correctly match the file names, these changes must be removed + shortFileName = shortFileName.Replace("interact-", "").Replace(".pep.xml", ""); + + foreach(var file in fullFilePath) + { + if (file.Contains(shortFileName) && !allFiles.ContainsKey(fileName)) + { + allFiles.Add(fileName, file); + break; + } + } + } + + return allFiles; + } } -} +} \ No newline at end of file diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv new file mode 100644 index 000000000..a87d8493f --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv @@ -0,0 +1,6 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry 
Name Gene Protein Description Mapped Genes Mapped Proteins +SmallCalibratibleYeast.00002.00002.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml RGNVCGDAK RGNVCGDAK NVSVKEIR.RGNVCGDAK.NDPPKGCA R N 9 2 1443.6643 975.4455 975.4455 488.7300 488.7300 975.4556 488.7351 -0.0101 0.00047835460000 25.7810 11.6900 0.9997 2 1 320 328 1.81151584E8 5C(57.0215) 0.00 true sp|P02994|EF1A_YEAST P02994 EF1A_YEAST TEF1 Elongation factor 1-alpha +SmallCalibratibleYeast.00004.00004.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml EKAEAEAEK GIREKRAR.EKAEAEAEK.KK R K 9 2 1444.1241 1003.4855 1003.4855 502.7500 502.7500 1003.4821 502.7483 0.0033 0.02974116000000 18.6380 12.6310 0.9269 2 1 189 197 3.6544424E7 0.00 false sp|P40212|RL13B_YEAST P40212 RL13B_YEAST RPL13B Large ribosomal subunit protein eL13B RPL13A sp|Q12690|RL13A_YEAST +SmallCalibratibleYeast.00008.00008.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml KITSNQR FNVPIDGK.KITSNQR.IVAAIPTI K I 7 2 1446.0237 845.4655 845.4655 423.7400 423.7400 845.4719 423.7432 -0.0064 0.13056640000000 17.5940 12.5170 0.9646 2 1 33 39 2.5555878E7 0.00 true sp|P00560|PGK_YEAST P00560 PGK_YEAST PGK1 Phosphoglycerate kinase +SmallCalibratibleYeast.00009.00009.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml GIDHTSK .GIDHTSK.QHKRSGHR M Q 7 2 1446.2545 756.3855 756.3855 379.2000 379.2000 756.3766 379.1956 0.0089 0.01669167000000 17.6100 9.8270 0.9965 2 0 2 8 6.739196E7 0.00 false sp|P0CX49|RL18A_YEAST P0CX49 RL18A_YEAST RPL18A Large ribosomal subunit protein eL18A RPL18B sp|P0CX50|RL18B_YEAST +SmallCalibratibleYeast.00010.00010.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml EKAEAEAEK GIREKRAR.EKAEAEAEK.KK R K 9 2 1446.4789 1003.4855 1003.4855 502.7500 502.7500 1003.4821 502.7483 0.0033 0.02315876000000 18.5780 12.4510 0.9875 2 1 189 197 3.6544424E7 0.00 false sp|P40212|RL13B_YEAST P40212 
RL13B_YEAST RPL13B Large ribosomal subunit protein eL13B RPL13A sp|Q12690|RL13A_YEAST diff --git a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs index 06f63372e..765bd5c1c 100644 --- a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs +++ b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs @@ -45,6 +45,21 @@ public void TestMsFraggerPsmLoadsAndCountCorrect(string path, int count) Assert.That(file.CanRead(path)); } + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestAddProteinGroupInfoCountCorrect (string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + var allResults = file.ToList(); + + // one protein associated with given results, list should only contain this one element + Assert.That(allResults[0].ProteinGroupInfos.Count, Is.EqualTo(1)); + // two proteins associated with given results, list should contain two elements + Assert.That(allResults[2].ProteinGroupInfos.Count, Is.EqualTo(2)); + + } + [Test] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1individual_peptide.tsv", 7)] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1combined_peptide.tsv", 6)] diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index f6d44a6cf..b58d87522 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -318,6 +318,9 @@ Always + + Always + Always diff --git a/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs new file mode 100644 index 000000000..72461fa9c --- /dev/null +++ b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs @@ -0,0 +1,90 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using System.Linq; +using FlashLFQ; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; + 
+namespace TestFlashLFQ +{ + internal class TestIdentificationAdapter + { + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestAddProteinGroupInfoCorrect(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + + List identifications = new List(); + identifications = MzLibExtensions.MakeIdentifications(file); + + // list should contain five elements + Assert.That(identifications.Count, Is.EqualTo(5)); + // one protein associated with given results, list should only contain this one element + Assert.That(identifications[0].ProteinGroups.Count, Is.EqualTo(1)); + // two proteins associated with given results, list should contain two elements + Assert.That(identifications[2].ProteinGroups.Count, Is.EqualTo(2)); + + Identification identification1= identifications[0]; + Assert.That(identification1.BaseSequence, Is.EqualTo("KPVGAAK")); + Assert.That(identification1.ModifiedSequence, Is.EqualTo("KPVGAAK")); + Assert.That(identification1.Ms2RetentionTimeInMinutes, Is.EqualTo(1.9398)); + Assert.That(identification1.MonoisotopicMass, Is.EqualTo(669.4173)); + Assert.That(identification1.PrecursorChargeState, Is.EqualTo(2)); + + HashSet proteinGroups = identification1.ProteinGroups; + ProteinGroup proteinGroup1 = proteinGroups.First(); + Assert.That(proteinGroup1.ProteinGroupName, Is.EqualTo("P16403")); + Assert.That(proteinGroup1.GeneName, Is.EqualTo("H12")); + Assert.That(proteinGroup1.Organism, Is.EqualTo("HUMAN")); + + Identification identification5 = identifications[4]; + Assert.That(identification5.BaseSequence, Is.EqualTo("VVTHGGR")); + Assert.That(identification5.ModifiedSequence, Is.EqualTo("VVTHGGR")); + Assert.That(identification5.Ms2RetentionTimeInMinutes, Is.EqualTo(19.114)); + Assert.That(identification5.MonoisotopicMass, Is.EqualTo(724.398)); + Assert.That(identification5.PrecursorChargeState, 
Is.EqualTo(2)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestFileNametoFilePath(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + string fileName = file.First().FileName; + + List fullFilePath = new List(); + string fullFilePath1 = @"D:\Projects\Chimeras\Mann_11cell_analysis\RawData\interact-20100611_Velos1_TaGe_SA_Hela_1.raw"; + string fullFilePath2 = @"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv"; + fullFilePath.Add(fullFilePath1); + fullFilePath.Add(fullFilePath2); + + Dictionary allFiles = file.FileNametoFilePath(fullFilePath); + + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.AreEqual(output, fullFilePath1); + Assert.That(!allFiles.ContainsValue(fullFilePath2)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\SmallCalibratibleYeastFragger_psm.tsv")] + public void TestFileNametoFilePathLocalPath(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + string fileName = file.First().FileName; + + List fullFilePath = new List(); + string rawFilePath = @"DataFiles\SmallCalibratibleYeast.mzml"; + fullFilePath.Add(rawFilePath); + + Dictionary allFiles = file.FileNametoFilePath(fullFilePath); + + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.AreEqual(output, rawFilePath); + } + } +} \ No newline at end of file From 983c3b01b059646c8da77eec79978cae86c1c358 Mon Sep 17 00:00:00 2001 From: Alexander-Sol <41119316+Alexander-Sol@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:25:35 -0500 Subject: [PATCH 2/2] Modified decoy scrambler to no longer use static Random generator (#798) * Modified decoy scrambler to no longer use static Random generator * Added additional tests * Better tests --- 
mzLib/Proteomics/Protein/Protein.cs | 9 ++- .../Test/DatabaseTests/TestDatabaseLoaders.cs | 36 +++++++++- mzLib/Test/TestProteinDigestion.cs | 67 +++++++++++++++++++ 3 files changed, 106 insertions(+), 6 deletions(-) diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 5c1b428d6..fc07460d2 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -836,6 +836,7 @@ public static Protein ScrambleDecoyProteinSequence( string scrambledProteinSequence = originalDecoyProtein.BaseSequence; // Clone the original protein's modifications var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value); + Random rng = new Random(42); // Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide // Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide @@ -843,14 +844,14 @@ public static Protein ScrambleDecoyProteinSequence( { if(scrambledProteinSequence.Contains(peptideSequence)) { - string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, + string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng, out var swappedArray); int scrambleAttempts = 1; // Try five times to scramble the peptide sequence without creating a forbidden sequence while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5) { - scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, + scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng, out swappedArray); scrambleAttempts++; } @@ -896,13 +897,11 @@ public static Protein 
ScrambleDecoyProteinSequence( return newProtein; } - private static Random rng = new Random(42); - /// /// Scrambles a peptide sequence, preserving the position of any cleavage sites. /// /// An array that maps the previous position (index) to the new position (value) - public static string ScrambleSequence(string sequence, List motifs, out int[] swappedPositionArray) + public static string ScrambleSequence(string sequence, List motifs, Random rng, out int[] swappedPositionArray) { // First, find the location of every cleavage motif. These sites shouldn't be scrambled. HashSet zeroBasedCleavageSitesLocations = new(); diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs index 82d7ce715..8925661cb 100644 --- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs +++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs @@ -1,4 +1,4 @@ -// opyright 2016 Stefan Solntsev +// Copyright 2016 Stefan Solntsev // // This file (ChemicalFormula.cs) is part of Chemistry Library. 
// @@ -28,6 +28,7 @@ using Omics.Modifications; using UsefulProteomicsDatabases; using Stopwatch = System.Diagnostics.Stopwatch; +using NUnit.Framework.Legacy; namespace Test.DatabaseTests { @@ -81,6 +82,39 @@ public static void LoadIsoforms() Assert.AreEqual("Q14103-4", proteinXml[9].Accession); } + [Test] + [TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)] + [TestCase("uniprot_aifm1.fasta", DecoyType.None)] + [TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)] + [TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)] + public void LoadingIsReproducible(string fileName, DecoyType decoyType) + { + // Load in proteins + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName); + List proteins1 = null; + List proteins2 = null; + if(fileName.Contains(".xml")) + { + proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications); + } + else if (fileName.Contains(".fasta")) + { + proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications); + } + else + { + Assert.Fail("Unknown file type"); + } + + // check are equivalent lists of proteins + Assert.AreEqual(proteins1.Count, proteins2.Count); + // Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same + CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession)); + CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence)); + } + [Test] public static void LoadModWithNl() { diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 975db7dd1..02cc3aed5 100644 --- 
a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -361,6 +361,73 @@ public static void Test_ProteinDigest() Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence); } + [Test] + [TestCase("cRAP_databaseGPTMD.xml")] + [TestCase("uniprot_aifm1.fasta")] + public static void TestDecoyScramblingIsReproducible(string fileName) + { + // Load in proteins + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName); + DecoyType decoyType = DecoyType.Reverse; + List proteins1 = null; + List proteins2 = null; + if (fileName.Contains(".xml")) + { + proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications); + } + else if (fileName.Contains(".fasta")) + { + proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications); + } + else + { + Assert.Fail("Unknown file type"); + } + + DigestionParams d = new DigestionParams( + maxMissedCleavages: 1, + minPeptideLength: 5, + initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + // Digest target proteins + var pepsToReplace = proteins1.Where(p => !p.IsDecoy) + .SelectMany(p => p.Digest(d, new List(), new List()).ToList()) + .Select(pep => pep.BaseSequence) + .ToHashSet(); + + // Ensure at least one decoy peptide from each protein is problematic and must be replaced + var singleDecoyPeptides = proteins1 + .Where(p => p.IsDecoy) + .Select(p => p.Digest(d, new List(), new List()).Skip(2).Take(1)) + .Select(pwsm => pwsm.First().BaseSequence) + .ToHashSet(); + + //modify targetpeptides in place + pepsToReplace.UnionWith(singleDecoyPeptides); + + // Scramble every decoy from db1 + List decoys1 = new(); + foreach (var protein in 
proteins1.Where(p => p.IsDecoy)) + { + decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace)); + } + // Scramble every decoy from db2 + List decoys2 = new(); + foreach (var protein in proteins2.Where(p => p.IsDecoy)) + { + decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace)); + } + + // check are equivalent lists of proteins + Assert.AreEqual(decoys1.Count, decoys2.Count); + foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession)) + { + Assert.AreEqual(2, decoyPair.Count()); + Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence); + } + } + [Test] public static void TestDecoyScramblerReplacesPeptides() {