From 7055d844681af53fc940a0db57e27f7cf362e371 Mon Sep 17 00:00:00 2001
From: mzhastings <maddyh225@gmail.com>
Date: Fri, 6 Sep 2024 11:08:44 -0500
Subject: [PATCH 1/2] Quantifiable interfaces for results reading in FlashLFQ
 (#793)

* created interface IQuantifiable

* created IQuantifiableRecord and IQuantifiable interfaces, deleted original IQuantifiable in FlashLFQ

* created IdentificationAdapter class and edited IQuantifiableRecord interface

* added implementation of IQuantifiableRecord to MSFraggerPSM and wrote tests

* created test for IdentificationAdapter method

* created Dictionary linking MSFragger file names to their corresponding full file paths

* edited FileNameToFilePath method in MSFraggerPsm to account for MSFragger file name additions and wrote tests

* edited MsFraggerPsm to account for differences in result files (Probability vs. PeptideProphet Probability)

* removed unneeded test from TestQuantifiedPeaks

* made recommended changes by reviewers

* made recommended changes

* made recommended changes

---------

Co-authored-by: trishorts <mshort@chem.wisc.edu>
Co-authored-by: Nic Bollis <nbollis@comcast.net>
---
 .../ResultsReading/MzLibExtensions.cs         | 64 +++++++++++++
 .../BaseClasses/IQuantifiableRecord.cs        | 55 ++++++++++++
 .../BaseClasses/IQuantifiableResultFile.cs    | 29 ++++++
 .../IndividualResultRecords/MsFraggerPsm.cs   | 89 +++++++++++++++++-
 .../ResultFiles/MsFraggerPsmFile.cs           | 40 ++++++++-
 .../SmallCalibratibleYeastFragger_psm.tsv     |  6 ++
 .../TestMsFraggerResultFiles.cs               | 15 ++++
 mzLib/Test/Test.csproj                        |  3 +
 .../TestFlashLFQ/TestIdentificationAdapter.cs | 90 +++++++++++++++++++
 9 files changed, 385 insertions(+), 6 deletions(-)
 create mode 100644 mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
 create mode 100644 mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
 create mode 100644 mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs
 create mode 100644 mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv
 create mode 100644 mzLib/TestFlashLFQ/TestIdentificationAdapter.cs
diff --git a/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
new file mode 100644
index 000000000..396e76a9a
--- /dev/null
+++ b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
@@ -0,0 +1,64 @@
+﻿using Readers.ExternalResults.BaseClasses;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace FlashLFQ
+{
+    public static class MzLibExtensions
+    {
+        /// <summary>
+        /// Makes a list of identification objects usable by FlashLFQ from an IQuantifiableResultFile
+        /// </summary>
+        /// <param name="quantifiable"> result file whose quantifiable records are converted </param>
+        /// <returns> one Identification per record, in the order the records are enumerated </returns>
+        public static List<Identification> MakeIdentifications(this IQuantifiableResultFile quantifiable)
+        {
+            IEnumerable<IQuantifiableRecord> quantifiableRecords = quantifiable.GetQuantifiableResults();
+            List<Identification> identifications = new List<Identification>();
+            Dictionary<string, ProteinGroup> allProteinGroups = new Dictionary<string, ProteinGroup>();
+            Dictionary<string, SpectraFileInfo> allFiles = new Dictionary<string, SpectraFileInfo>();
+
+            foreach (var record in quantifiableRecords)
+            {
+                string baseSequence = record.BaseSequence;
+                string modifiedSequence = record.ModifiedSequence;
+                double ms2RetentionTimeInMinutes = record.RetentionTime;
+                double monoisotopicMass = record.MonoisotopicMass;
+                int precursorChargeState = record.ChargeState;
+
+                // Records that come from the same file share a single SpectraFileInfo instance
+                if (!allFiles.TryGetValue(record.FileName, out SpectraFileInfo file))
+                {
+                    // placeholder values for SpectraFileInfo that will be edited later
+                    file = new SpectraFileInfo(record.FileName, "", 1, 1, 1);
+                    allFiles.Add(record.FileName, file);
+                }
+
+                // Protein groups are cached by accession so each one is only constructed once
+                List<ProteinGroup> proteinGroups = new();
+                foreach (var info in record.ProteinGroupInfos)
+                {
+                    if (allProteinGroups.TryGetValue(info.proteinAccessions, out var proteinGroup))
+                    {
+                        proteinGroups.Add(proteinGroup);
+                    }
+                    else
+                    {
+                        allProteinGroups.Add(info.proteinAccessions, new ProteinGroup(info.proteinAccessions, info.geneName, info.organism));
+                        proteinGroups.Add(allProteinGroups[info.proteinAccessions]);
+                    }
+                }
+
+                Identification id = new Identification(file, baseSequence, modifiedSequence, monoisotopicMass, ms2RetentionTimeInMinutes, precursorChargeState, proteinGroups);
+                identifications.Add(id);
+
+            }
+
+            return identifications;
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
new file mode 100644
index 000000000..87100cfa5
--- /dev/null
+++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
@@ -0,0 +1,55 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Readers.ExternalResults.BaseClasses
+{
+    /// <summary>
+    /// Defines the information needed to create the identification object usable by FlashLFQ
+    /// </summary>
+    public interface IQuantifiableRecord
+    {
+        /// <summary>
+        /// The file name of the MS Data file in which the identification was made
+        /// </summary>
+        public string FileName { get; }
+
+        /// <summary>
+        /// A list of tuples, each of which represents a protein.
+        /// Each tuple contains the accession number, gene name, and organism associated with the given result.
+        /// </summary>
+        public List<(string proteinAccessions, string geneName, string organism)> ProteinGroupInfos { get; }
+
+        /// <summary>
+        /// The amino acid sequence of the identified peptide
+        /// </summary>
+        public string BaseSequence { get; }
+
+        /// <summary>
+        /// The amino acid sequence and the associated post-translational modifications of the identified peptide
+        /// </summary>
+        public string ModifiedSequence { get; }
+
+        /// <summary>
+        /// The retention time (in minutes) associated with the result
+        /// </summary>
+        public double RetentionTime { get; }
+
+        /// <summary>
+        /// The charge state associated with the result
+        /// </summary>
+        public int ChargeState { get; }
+
+        /// <summary>
+        /// Defines whether or not the result is a decoy identification
+        /// </summary>
+        public bool IsDecoy { get; }
+
+        /// <summary>
+        /// The monoisotopic mass of the peptide (i.e., no C13 or N15 atoms are present, the lowest possible mass)
+        /// </summary>
+        public double MonoisotopicMass { get; }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs
new file mode 100644
index 000000000..9e1a1a54e
--- /dev/null
+++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs
@@ -0,0 +1,29 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Readers.ExternalResults.BaseClasses
+{
+    /// <summary>
+    /// Outlines behavior to turn results into an IEnumerable of IQuantifiableRecords
+    /// and to create the dictionary linking file names from the external result files
+    /// to their local file paths which are used to make the identification object
+    /// </summary>
+    public interface IQuantifiableResultFile : IResultFile
+    {
+        /// <summary>
+        /// Returns every result in the result file as an IQuantifiableRecord
+        /// </summary>
+        /// <returns> Enumerable containing one IQuantifiableRecord per result in the file </returns>
+        public IEnumerable<IQuantifiableRecord> GetQuantifiableResults();
+
+        /// <summary>
+        /// Links the file names found in the external result file to the local file paths of the mass spec data
+        /// </summary>
+        /// <param name="fullFilePath"> list of file paths associated with each distinct record </param>
+        /// <returns> Dictionary of file names and their associated full paths </returns>
+        public Dictionary<string, string> FileNametoFilePath(List<string> fullFilePath);
+    }
+}
\ No newline at end of file
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
index 3dd5b1976..54dc19987 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
@@ -14,10 +14,14 @@
 using Proteomics;
 using static System.Net.Mime.MediaTypeNames;
 using ThermoFisher.CommonCore.Data.Interfaces;
+using Readers.ExternalResults.BaseClasses;
+using System.Reflection.Metadata.Ecma335;
+using System.Runtime.CompilerServices;
+using Easy.Common.Extensions;
 
 namespace Readers
 {
-    public class MsFraggerPsm
+    public class MsFraggerPsm : IQuantifiableRecord
     {
         public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture)
         {
@@ -59,7 +63,7 @@ public class MsFraggerPsm
 
         [Name("Retention")]
         public double RetentionTime { get; set; }
-        
+
         [Name("Observed Mass")]
         public double ObservedMass { get; set; }
 
@@ -90,7 +94,11 @@ public class MsFraggerPsm
         [Name("Nextscore")]
         public double NextScore { get; set; }
 
-        [Name("PeptideProphet Probability")]
+        /// <summary>
+        /// MsFragger v22.0 output renames the header "PeptideProphet Probability" as just "Probability".
+        /// Headers are mutually exclusive, will not both occur in the same file. 
+        /// </summary>
+        [Name("PeptideProphet Probability", "Probability")]
         public double PeptideProphetProbability { get; set; }
 
         [Name("Number of Enzymatic Termini")]
@@ -155,5 +163,78 @@ public class MsFraggerPsm
         public int OneBasedScanNumber => _oneBasedScanNumber ??= int.Parse(Spectrum.Split('.')[1]);
 
         #endregion
+
+        #region IQuantifiableRecord Implementation
+
+        [Ignore] public string FileName => SpectrumFilePath;
+
+        [Ignore] public List<(string, string, string)> ProteinGroupInfos
+        {
+            get 
+            {
+                _proteinGroupInfos ??= AddProteinGroupInfos();
+                return _proteinGroupInfos;
+            }
+        }
+
+        /// <summary>
+        /// Creates a list of tuples, each of which represents a protein.
+        /// Each tuple contains the accession number, gene name, and organism.
+        /// These parameters are used to create a ProteinGroup object,
+        /// which is needed to make an identification.
+        /// </summary>
+        /// <remarks>
+        /// Headers are split on '|' and '_', so "sp|P02994|EF1A_YEAST" yields
+        /// accession "P02994", gene name "EF1A", and organism "YEAST".
+        /// </remarks>
+        /// <returns>
+        /// One tuple for Protein, followed by one tuple for each comma-separated
+        /// entry in MappedProteins (when that column is populated)
+        /// </returns>
+        private List<(string, string, string)> AddProteinGroupInfos ()
+        {
+            char[] delimiterChars = { '|', '_' };
+
+            // Fasta header is parsed to separate the accession number, gene name, and organism.
+            // If the protein does not have this information, it will be assigned an empty string.
+            // Ideally, a future refactor would create a method for parsing fasta headers
+            // that is shared by Readers and UsefulProteomicsDatabases.
+            (string, string, string) ParseFastaHeader(string header)
+            {
+                string[] headerFields = header.Split(delimiterChars);
+
+                string proteinAccessions = headerFields.Length >= 2 ? headerFields[1] : "";
+                string geneName = headerFields.Length >= 3 ? headerFields[2] : "";
+                string organism = headerFields.Length >= 4 ? headerFields[3] : "";
+
+                return (proteinAccessions, geneName, organism);
+            }
+
+            // The primary protein always comes first in the list
+            _proteinGroupInfos = new List<(string, string, string)> { ParseFastaHeader(Protein) };
+
+            if (MappedProteins.IsNullOrEmpty()) return _proteinGroupInfos;
+
+            // Every additional protein the peptide maps to is parsed with the same rules
+            foreach (var singleMappedProteinInfo in MappedProteins.Split(','))
+            {
+                _proteinGroupInfos.Add(ParseFastaHeader(singleMappedProteinInfo));
+            }
+
+            return _proteinGroupInfos;
+        }
+
+        [Ignore] private List<(string, string, string)> _proteinGroupInfos;
+
+        [Ignore] public string ModifiedSequence => FullSequence.IsNullOrEmpty() ? BaseSequence : FullSequence;
+
+        [Ignore] public int ChargeState => Charge;
+
+        // decoy reading isn't currently supported for MsFragger psms, this will be revisited later
+        [Ignore] public bool IsDecoy => false;
+
+        [Ignore] public double MonoisotopicMass => CalculatedPeptideMass;
+
+        #endregion
     }
-}
+}
\ No newline at end of file
diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs
index 165d1c8d3..1aa80f885 100644
--- a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs
+++ b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs
@@ -4,10 +4,12 @@
 using System.Text;
 using System.Threading.Tasks;
 using CsvHelper;
+using MassSpectrometry;
+using Readers.ExternalResults.BaseClasses;
 
 namespace Readers
 {
-    public class MsFraggerPsmFile : ResultFile<MsFraggerPsm>, IResultFile
+    public class MsFraggerPsmFile : ResultFile<MsFraggerPsm>, IQuantifiableResultFile
     {
         public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm;
         public override Software Software { get; set; }
@@ -38,5 +40,39 @@ public override void WriteResults(string outputPath)
                 csv.WriteRecord(result);
             }
         }
+
+        public IEnumerable<IQuantifiableRecord> GetQuantifiableResults() => Results;
+
+        /// <summary>
+        /// Creates a dictionary linking each spectrum file name reported in the results
+        /// to the first supplied full file path that contains its shortened name
+        /// </summary>
+        /// <param name="fullFilePath"> list of all full file paths associated with a given result </param>
+        /// <returns> dictionary with key fileName and value fullFilePath </returns>
+        public Dictionary<string, string> FileNametoFilePath (List<string> fullFilePath)
+        {
+            List<string> rawFileNames = Results.Select(psm => psm.FileName).Distinct().ToList();
+            fullFilePath = fullFilePath.Distinct().ToList();
+            Dictionary<string, string> allFiles = new Dictionary<string, string>();
+
+            foreach(var fileName in rawFileNames)
+            {
+                // MSFragger results prepend "interact-" to the raw file name and replace .raw with .pep.xml
+                // In order to correctly match the file names, these changes must be removed
+                string shortFileName = Path.GetFileName(fileName)?.Replace("interact-", "").Replace(".pep.xml", "");
+
+                // A missing or empty name cannot identify a file (an empty string is contained in every path)
+                if (string.IsNullOrEmpty(shortFileName)) continue;
+
+                // rawFileNames is distinct, so each key is added at most once
+                string matchingPath = fullFilePath.FirstOrDefault(file => file.Contains(shortFileName));
+                if (matchingPath != null)
+                {
+                    allFiles.Add(fileName, matchingPath);
+                }
+            }
+
+            return allFiles;
+        }
     }
-}
+}
\ No newline at end of file
diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv
new file mode 100644
index 000000000..a87d8493f
--- /dev/null
+++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv
@@ -0,0 +1,6 @@
+Spectrum	Spectrum File	Peptide	Modified Peptide	Extended Peptide	Prev AA	Next AA	Peptide Length	Charge	Retention	Observed Mass	Calibrated Observed Mass	Observed M/Z	Calibrated Observed M/Z	Calculated Peptide Mass	Calculated M/Z	Delta Mass	Expectation	Hyperscore	Nextscore	Probability	Number of Enzymatic Termini	Number of Missed Cleavages	Protein Start	Protein End	Intensity	Assigned Modifications	Observed Modifications	Purity	Is Unique	Protein	Protein ID	Entry Name	Gene	Protein Description	Mapped Genes	Mapped Proteins
+SmallCalibratibleYeast.00002.00002.2	E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml	RGNVCGDAK	RGNVCGDAK	NVSVKEIR.RGNVCGDAK.NDPPKGCA	R	N	9	2	1443.6643	975.4455	975.4455	488.7300	488.7300	975.4556	488.7351	-0.0101	0.00047835460000	25.7810	11.6900	0.9997	2	1	320	328	1.81151584E8	5C(57.0215)		0.00	true	sp|P02994|EF1A_YEAST	P02994	EF1A_YEAST	TEF1	Elongation factor 1-alpha		
+SmallCalibratibleYeast.00004.00004.2	E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml	EKAEAEAEK		GIREKRAR.EKAEAEAEK.KK	R	K	9	2	1444.1241	1003.4855	1003.4855	502.7500	502.7500	1003.4821	502.7483	0.0033	0.02974116000000	18.6380	12.6310	0.9269	2	1	189	197	3.6544424E7			0.00	false	sp|P40212|RL13B_YEAST	P40212	RL13B_YEAST	RPL13B	Large ribosomal subunit protein eL13B	RPL13A	sp|Q12690|RL13A_YEAST
+SmallCalibratibleYeast.00008.00008.2	E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml	KITSNQR		FNVPIDGK.KITSNQR.IVAAIPTI	K	I	7	2	1446.0237	845.4655	845.4655	423.7400	423.7400	845.4719	423.7432	-0.0064	0.13056640000000	17.5940	12.5170	0.9646	2	1	33	39	2.5555878E7			0.00	true	sp|P00560|PGK_YEAST	P00560	PGK_YEAST	PGK1	Phosphoglycerate kinase		
+SmallCalibratibleYeast.00009.00009.2	E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml	GIDHTSK		.GIDHTSK.QHKRSGHR	M	Q	7	2	1446.2545	756.3855	756.3855	379.2000	379.2000	756.3766	379.1956	0.0089	0.01669167000000	17.6100	9.8270	0.9965	2	0	2	8	6.739196E7			0.00	false	sp|P0CX49|RL18A_YEAST	P0CX49	RL18A_YEAST	RPL18A	Large ribosomal subunit protein eL18A	RPL18B	sp|P0CX50|RL18B_YEAST
+SmallCalibratibleYeast.00010.00010.2	E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml	EKAEAEAEK		GIREKRAR.EKAEAEAEK.KK	R	K	9	2	1446.4789	1003.4855	1003.4855	502.7500	502.7500	1003.4821	502.7483	0.0033	0.02315876000000	18.5780	12.4510	0.9875	2	1	189	197	3.6544424E7			0.00	false	sp|P40212|RL13B_YEAST	P40212	RL13B_YEAST	RPL13B	Large ribosomal subunit protein eL13B	RPL13A	sp|Q12690|RL13A_YEAST
diff --git a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs
index 06f63372e..765bd5c1c 100644
--- a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs
+++ b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs
@@ -45,6 +45,21 @@ public void TestMsFraggerPsmLoadsAndCountCorrect(string path, int count)
             Assert.That(file.CanRead(path));
         }
 
+        [Test]
+        [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")]
+        public void TestAddProteinGroupInfoCountCorrect (string path)
+        {
+            string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path);
+            MsFraggerPsmFile file = new MsFraggerPsmFile(filePath);
+            var allResults = file.ToList();
+
+            // one protein associated with given results, list should only contain this one element 
+            Assert.That(allResults[0].ProteinGroupInfos.Count, Is.EqualTo(1));
+            // two proteins associated with given results, list should contain two elements
+            Assert.That(allResults[2].ProteinGroupInfos.Count, Is.EqualTo(2));
+
+        }
+
         [Test]
         [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1individual_peptide.tsv", 7)]
         [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1combined_peptide.tsv", 6)]
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index f6d44a6cf..b58d87522 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -318,6 +318,9 @@
     <None Update="FileReadingTests\ExternalFileTypes\MsPathFinderT_TargetResults_IcTarget.tsv">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
+    <None Update="FileReadingTests\ExternalFileTypes\SmallCalibratibleYeastFragger_psm.tsv">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
     <None Update="FileReadingTests\ExternalFileTypes\ToppicProteoformSingle_TopPICv1.5.3_proteoform_single.tsv">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
diff --git a/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs
new file mode 100644
index 000000000..72461fa9c
--- /dev/null
+++ b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs
@@ -0,0 +1,90 @@
+﻿using NUnit.Framework;
+using Readers;
+using System.Collections.Generic;
+using System.Linq;
+using FlashLFQ;
+using Assert = NUnit.Framework.Legacy.ClassicAssert;
+using System.IO;
+
+namespace TestFlashLFQ
+{
+    internal class TestIdentificationAdapter
+    {
+        [Test]
+        [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")]
+        public void TestAddProteinGroupInfoCorrect(string path)
+        {
+            string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path);
+            MsFraggerPsmFile file = new MsFraggerPsmFile(filePath);
+
+            // Convert every PSM in the file into a FlashLFQ identification
+            List<Identification> identifications = MzLibExtensions.MakeIdentifications(file);
+
+            // list should contain five elements
+            Assert.That(identifications.Count, Is.EqualTo(5));
+            // one protein associated with given results, list should only contain this one element 
+            Assert.That(identifications[0].ProteinGroups.Count, Is.EqualTo(1));
+            // two proteins associated with given results, list should contain two elements
+            Assert.That(identifications[2].ProteinGroups.Count, Is.EqualTo(2));
+
+            Identification identification1 = identifications[0];
+            Assert.That(identification1.BaseSequence, Is.EqualTo("KPVGAAK"));
+            Assert.That(identification1.ModifiedSequence, Is.EqualTo("KPVGAAK"));
+            Assert.That(identification1.Ms2RetentionTimeInMinutes, Is.EqualTo(1.9398));
+            Assert.That(identification1.MonoisotopicMass, Is.EqualTo(669.4173));
+            Assert.That(identification1.PrecursorChargeState, Is.EqualTo(2));
+
+            HashSet<ProteinGroup> proteinGroups = identification1.ProteinGroups;
+            ProteinGroup proteinGroup1 = proteinGroups.First();
+            Assert.That(proteinGroup1.ProteinGroupName, Is.EqualTo("P16403"));
+            Assert.That(proteinGroup1.GeneName, Is.EqualTo("H12"));
+            Assert.That(proteinGroup1.Organism, Is.EqualTo("HUMAN"));
+
+            Identification identification5 = identifications[4];
+            Assert.That(identification5.BaseSequence, Is.EqualTo("VVTHGGR"));
+            Assert.That(identification5.ModifiedSequence, Is.EqualTo("VVTHGGR"));
+            Assert.That(identification5.Ms2RetentionTimeInMinutes, Is.EqualTo(19.114));
+            Assert.That(identification5.MonoisotopicMass, Is.EqualTo(724.398));
+            Assert.That(identification5.PrecursorChargeState, Is.EqualTo(2));
+        }
+
+        [Test]
+        [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")]
+        public void TestFileNametoFilePath(string path)
+        {
+            string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path);
+            MsFraggerPsmFile file = new MsFraggerPsmFile(filePath);
+            string fileName = file.First().FileName;
+
+            // The spectrum file name in the results is expected to match only the first path;
+            // the second path is a decoy entry that must not appear in the dictionary
+            string fullFilePath1 = @"D:\Projects\Chimeras\Mann_11cell_analysis\RawData\interact-20100611_Velos1_TaGe_SA_Hela_1.raw";
+            string fullFilePath2 = @"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv";
+            List<string> fullFilePath = new List<string> { fullFilePath1, fullFilePath2 };
+
+            Dictionary<string, string> allFiles = file.FileNametoFilePath(fullFilePath);
+
+            Assert.That(allFiles.TryGetValue(fileName, out var output));
+            Assert.AreEqual(fullFilePath1, output);
+            Assert.That(!allFiles.ContainsValue(fullFilePath2));
+        }
+
+        [Test]
+        [TestCase(@"FileReadingTests\ExternalFileTypes\SmallCalibratibleYeastFragger_psm.tsv")]
+        public void TestFileNametoFilePathLocalPath(string path)
+        {
+            string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path);
+            MsFraggerPsmFile file = new MsFraggerPsmFile(filePath);
+            string fileName = file.First().FileName;
+
+            // A relative .mzml path should match once "interact-" and ".pep.xml" are stripped
+            string rawFilePath = @"DataFiles\SmallCalibratibleYeast.mzml";
+            List<string> fullFilePath = new List<string> { rawFilePath };
+
+            Dictionary<string, string> allFiles = file.FileNametoFilePath(fullFilePath);
+
+            Assert.That(allFiles.TryGetValue(fileName, out var output));
+            Assert.AreEqual(rawFilePath, output);
+        }
+    }
+}
\ No newline at end of file

From 983c3b01b059646c8da77eec79978cae86c1c358 Mon Sep 17 00:00:00 2001
From: Alexander-Sol <41119316+Alexander-Sol@users.noreply.github.com>
Date: Fri, 13 Sep 2024 15:25:35 -0500
Subject: [PATCH 2/2] Modified decoy scrambler to no longer use static Random
 generator (#798)

* Modified decoy scrambler to no longer use static Random generator

* Added additional tests

* Better tests
---
 mzLib/Proteomics/Protein/Protein.cs           |  9 ++-
 .../Test/DatabaseTests/TestDatabaseLoaders.cs | 36 +++++++++-
 mzLib/Test/TestProteinDigestion.cs            | 67 +++++++++++++++++++
 3 files changed, 106 insertions(+), 6 deletions(-)

diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs
index 5c1b428d6..fc07460d2 100644
--- a/mzLib/Proteomics/Protein/Protein.cs
+++ b/mzLib/Proteomics/Protein/Protein.cs
@@ -836,6 +836,7 @@ public static Protein ScrambleDecoyProteinSequence(
             string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
             // Clone the original protein's modifications
             var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
+            Random rng = new Random(42);
 
             // Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
             // Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
@@ -843,14 +844,14 @@ public static Protein ScrambleDecoyProteinSequence(
             {
                 if(scrambledProteinSequence.Contains(peptideSequence))
                 {
-                    string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, 
+                    string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
                         out var swappedArray);
                     int scrambleAttempts = 1;
 
                     // Try five times to scramble the peptide sequence without creating a forbidden sequence
                     while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
                     {
-                        scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
+                        scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
                             out swappedArray);
                         scrambleAttempts++;
                     }
@@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
             return newProtein;
         }
 
-        private static Random rng = new Random(42);
-
         /// <summary>
         /// Scrambles a peptide sequence, preserving the position of any cleavage sites.
         /// </summary>
         /// <param name="swappedPositionArray">An array that maps the previous position (index) to the new position (value)</param>
-        public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
+        public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
         {
             // First, find the location of every cleavage motif. These sites shouldn't be scrambled.
             HashSet<int> zeroBasedCleavageSitesLocations = new();
diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
index 82d7ce715..8925661cb 100644
--- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
+++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
@@ -1,4 +1,4 @@
-﻿// opyright 2016 Stefan Solntsev
+﻿// Copyright 2016 Stefan Solntsev
 //
 // This file (ChemicalFormula.cs) is part of Chemistry Library.
 //
@@ -28,6 +28,7 @@
 using Omics.Modifications;
 using UsefulProteomicsDatabases;
 using Stopwatch = System.Diagnostics.Stopwatch;
+using NUnit.Framework.Legacy;
 
 namespace Test.DatabaseTests
 {
@@ -81,6 +82,39 @@ public static void LoadIsoforms()
             Assert.AreEqual("Q14103-4", proteinXml[9].Accession);
         }
 
+        [Test]
+        [TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)]
+        [TestCase("uniprot_aifm1.fasta", DecoyType.None)]
+        [TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)]
+        [TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)]
+        public void LoadingIsReproducible(string fileName, DecoyType decoyType)
+        {
+            // Load in proteins
+            var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+            List<Protein> proteins1 = null;
+            List<Protein> proteins2 = null;
+            if(fileName.Contains(".xml"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications);
+            }
+            else if (fileName.Contains(".fasta"))
+            {
+                proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications);
+                proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications);
+            }
+            else
+            {
+                Assert.Fail("Unknown file type");
+            }
+
+            // check are equivalent lists of proteins
+            Assert.AreEqual(proteins1.Count, proteins2.Count);
+            // Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same
+            CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession));
+            CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence));
+        }
+
         [Test]
         public static void LoadModWithNl()
         {
diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs
index 975db7dd1..02cc3aed5 100644
--- a/mzLib/Test/TestProteinDigestion.cs
+++ b/mzLib/Test/TestProteinDigestion.cs
@@ -361,6 +361,73 @@ public static void Test_ProteinDigest()
             Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence);
         }
 
+        /// <summary>
+        /// Loads the same database twice requesting DecoyType.Reverse, scrambles the decoys of each load against one shared
+        /// set of forbidden sequences, and checks that decoys sharing an accession end up with the same base sequence.
+        /// </summary>
+        [Test]
+        [TestCase("cRAP_databaseGPTMD.xml")]
+        [TestCase("uniprot_aifm1.fasta")]
+        public static void TestDecoyScramblingIsReproducible(string fileName)
+        {
+            var databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName);
+            DecoyType decoyType = DecoyType.Reverse;
+
+            // Local loader so that both loads go through exactly the same call
+            List<Protein> LoadProteins()
+            {
+                if (fileName.Contains(".xml"))
+                {
+                    return ProteinDbLoader.LoadProteinXML(databasePath, true, decoyType, null, false, null, out var ignored);
+                }
+                if (fileName.Contains(".fasta"))
+                {
+                    return ProteinDbLoader.LoadProteinFasta(databasePath, true, decoyType, false, out var ignored);
+                }
+                Assert.Fail("Unknown file type");
+                return null;
+            }
+
+            List<Protein> proteins1 = LoadProteins();
+            List<Protein> proteins2 = LoadProteins();
+
+            var digestionParams = new DigestionParams(
+                        maxMissedCleavages: 1,
+                        minPeptideLength: 5,
+                        initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
+
+            // Base sequences of every peptide obtained by digesting the target proteins of the first load
+            var forbiddenSequences = proteins1
+                .Where(p => !p.IsDecoy)
+                .SelectMany(p => p.Digest(digestionParams, new List<Modification>(), new List<Modification>()).ToList())
+                .Select(pep => pep.BaseSequence)
+                .ToHashSet();
+
+            // Third digestion product of each first-load decoy; First() throws if a decoy yields fewer than three peptides
+            var thirdDecoyPeptides = proteins1
+                .Where(p => p.IsDecoy)
+                .Select(p => p.Digest(digestionParams, new List<Modification>(), new List<Modification>()).Skip(2).First().BaseSequence)
+                .ToHashSet();
+            // Adding them in place ensures at least one peptide of every decoy is problematic and must be replaced
+            forbiddenSequences.UnionWith(thirdDecoyPeptides);
+
+            // Scramble the decoys of each load against the same forbidden set, first load first
+            List<Protein> decoys1 = proteins1.Where(p => p.IsDecoy)
+                .Select(decoy => Protein.ScrambleDecoyProteinSequence(decoy, digestionParams, forbiddenSequences))
+                .ToList();
+            List<Protein> decoys2 = proteins2.Where(p => p.IsDecoy)
+                .Select(decoy => Protein.ScrambleDecoyProteinSequence(decoy, digestionParams, forbiddenSequences))
+                .ToList();
+
+            // Decoys are paired by accession: each accession must occur exactly twice with identical base sequences
+            Assert.AreEqual(decoys1.Count, decoys2.Count);
+            foreach (var sameAccession in decoys1.Concat(decoys2).GroupBy(p => p.Accession))
+            {
+                Assert.AreEqual(2, sameAccession.Count());
+                Assert.AreEqual(sameAccession.First().BaseSequence, sameAccession.Last().BaseSequence);
+            }
+        }
+
         [Test]
         public static void TestDecoyScramblerReplacesPeptides()
         {