Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gptmd approach update #2419

Merged
merged 39 commits into from
Nov 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
bd5aefc
update mzlib nuget package to 551
trishorts Aug 26, 2024
d60e32e
Merge remote-tracking branch 'upstream/master'
trishorts Sep 5, 2024
5e65835
use psms score to revise what ptms are added in gptmd
trishorts Sep 18, 2024
dd2cbfa
now deals w/ variants
trishorts Sep 19, 2024
7a7dad6
fix unit tests
trishorts Sep 19, 2024
22fb336
Merge remote-tracking branch 'upstream/master' into gptmdApproachUpdate
trishorts Sep 19, 2024
8245a0e
unused lines
trishorts Sep 19, 2024
ce6c4e1
load files in background
trishorts Sep 19, 2024
ef44e03
add parallelization and eliminate pep
trishorts Sep 20, 2024
f33313b
fix unit tests
trishorts Sep 20, 2024
5a3445b
yo
trishorts Sep 20, 2024
2dcc22c
fix those unit tests
trishorts Sep 20, 2024
b7371d0
fix unit test
trishorts Sep 20, 2024
67aa1d2
test gptmd DissociationType Autodetect
trishorts Sep 23, 2024
bf0401d
gptmd test mod before variant
trishorts Sep 24, 2024
c44a719
new test
trishorts Sep 24, 2024
40abab6
Merge branch 'master' into gptmdApproachUpdate
trishorts Sep 24, 2024
0646bb4
Merge branch 'master' into gptmdApproachUpdate
trishorts Sep 24, 2024
96268d0
delete unreachable code
trishorts Sep 27, 2024
63c6fff
unit test for xcorr process spectra and matched ion with unknown mass
trishorts Sep 27, 2024
71c2274
Test MatchFragmentIons when scan has no peaks
trishorts Sep 27, 2024
46f902e
test gptmd task with contaminant database
trishorts Sep 27, 2024
cf83389
return missing comments
trishorts Sep 27, 2024
aed2aba
Merge branch 'gptmdApproachUpdate' of https://github.com/trishorts/Me…
trishorts Sep 27, 2024
388cc0b
Merge branch 'master' into gptmdApproachUpdate
trishorts Sep 27, 2024
04c3ed8
more unit test coverage for modification analysis engine
trishorts Sep 30, 2024
8d48e53
Merge branch 'gptmdApproachUpdate' of https://github.com/trishorts/Me…
trishorts Sep 30, 2024
51b1c58
Merge branch 'master' into gptmdApproachUpdate
trishorts Oct 3, 2024
d716476
Merge branch 'master' into gptmdApproachUpdate
trishorts Oct 30, 2024
7077e45
Merge branch 'master' into gptmdApproachUpdate
trishorts Nov 4, 2024
ba4f69b
Merge branch 'master' into gptmdApproachUpdate
elaboy Nov 4, 2024
9b398a4
Merge remote-tracking branch 'upstream/master' into gptmdApproachUpdate
trishorts Nov 8, 2024
b26f50d
fix unit test
trishorts Nov 8, 2024
c8f2432
Merge branch 'gptmdApproachUpdate' of https://github.com/trishorts/Me…
trishorts Nov 8, 2024
a1a98d4
works on my machine
trishorts Nov 11, 2024
976570a
fix missing prose line
trishorts Nov 12, 2024
d821dde
eliminate unused code
trishorts Nov 12, 2024
688b112
merge upstream
trishorts Nov 13, 2024
f3d1ca3
fix merge problems
trishorts Nov 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 130 additions & 66 deletions MetaMorpheus/EngineLayer/Gptmd/GptmdEngine.cs

Large diffs are not rendered by default.

7 changes: 1 addition & 6 deletions MetaMorpheus/EngineLayer/MetaMorpheusEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,6 @@ public static List<MatchedFragmentIon> MatchFragmentIons(Ms2ScanWithSpecificMass

if (scan.TheScan.MassSpectrum.XcorrProcessed && scan.TheScan.MassSpectrum.XArray.Length != 0)
{
// if the spectrum has no peaks
if (scan.TheScan.MassSpectrum.XArray.Length == 0)
{
return matchedFragmentIons;
}

for (int i = 0; i < theoreticalProducts.Count; i++)
{
Expand Down Expand Up @@ -225,7 +220,7 @@ public static List<MatchedFragmentIon> MatchFragmentIons(Ms2ScanWithSpecificMass

return matchedFragmentIons;
}

//Used only when user wants to generate spectral library.
//Normal search only looks for one matched ion per fragment, and if it accepts it then it doesn't try to look for different charge states of that same fragment.
//But for library generation, we need to find all the matched peaks with all the different charges.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ protected override MetaMorpheusEngineResults RunSpecific()
if (unlocalizedFormulas.ContainsKey(representativePsm.ModsChemicalFormula))
unlocalizedFormulas[representativePsm.ModsChemicalFormula] += 1;
else
unlocalizedFormulas.Add(representativePsm.ModsChemicalFormula, 1);
unlocalizedFormulas.Add(representativePsm.ModsChemicalFormula, 1);
}

myAnalysisResults.CountOfEachModSeenOnProteins = modsOnProteins.GroupBy(b => b.Item2).ToDictionary(b => b.Key, b => b.Count());
Expand Down
68 changes: 50 additions & 18 deletions MetaMorpheus/TaskLayer/GPTMDTask/GPTMDTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
using Proteomics.ProteolyticDigestion;
using System.Globalization;
using Omics.Modifications;
using System.Threading.Tasks;

namespace TaskLayer
{
Expand All @@ -30,14 +31,30 @@

protected override MyTaskResults RunSpecific(string OutputFolder, List<DbForTask> dbFilenameList, List<string> currentRawFileList, string taskId, FileSpecificParameters[] fileSettingsList)
{
MyFileManager myFileManager = new MyFileManager(true);
var fileSpecificCommonParams = fileSettingsList.Select(b => SetAllFileSpecificCommonParams(CommonParameters, b));

// start loading first spectra file in the background
Task<MsDataFile> nextFileLoadingTask = new(() => myFileManager.LoadFile(currentRawFileList[0], SetAllFileSpecificCommonParams(CommonParameters, fileSettingsList[0])));
nextFileLoadingTask.Start();
LoadModifications(taskId, out var variableModifications, out var fixedModifications, out var localizeableModificationTypes);

// start loading proteins in the background
List<Protein> proteinList = null;
Task<List<Protein>> proteinLoadingTask = new(() =>
{
var proteins = LoadProteins(taskId, dbFilenameList, true, DecoyType.Reverse,
localizeableModificationTypes,
CommonParameters);
SanitizeProteinDatabase(proteins, TargetContaminantAmbiguity.RemoveContaminant);
return proteins;
});
proteinLoadingTask.Start();

// TODO: print error messages loading GPTMD mods
List<Modification> gptmdModifications = GlobalVariables.AllModsKnown.OfType<Modification>().Where(b => GptmdParameters.ListOfModsGptmd.Contains((b.ModificationType, b.IdWithMotif))).ToList();
IEnumerable<Tuple<double, double>> combos = LoadCombos(gptmdModifications).ToList();

// load proteins
List<Protein> proteinList = LoadProteins(taskId, dbFilenameList, true, DecoyType.Reverse, localizeableModificationTypes, CommonParameters);

List<SpectralMatch> allPsms = new List<SpectralMatch>();

Expand All @@ -62,18 +79,13 @@
ProseCreatedWhileRunning.Append("precursor mass tolerance(s) = {" + tempSearchMode.ToProseString() + "}; ");

ProseCreatedWhileRunning.Append("product mass tolerance = " + CommonParameters.ProductMassTolerance + ". ");
ProseCreatedWhileRunning.Append("The combined search database contained " + proteinList.Count(p => !p.IsDecoy) + " non-decoy protein entries including " + proteinList.Where(p => p.IsContaminant).Count() + " contaminant sequences. ");
trishorts marked this conversation as resolved.
Show resolved Hide resolved

// start the G-PTM-D task
Status("Running G-PTM-D...", new List<string> { taskId });
MyTaskResults = new MyTaskResults(this)
{
NewDatabases = new List<DbForTask>()
};
var fileSpecificCommonParams = fileSettingsList.Select(b => SetAllFileSpecificCommonParams(CommonParameters, b));
HashSet<DigestionParams> ListOfDigestionParams = new HashSet<DigestionParams>(fileSpecificCommonParams.Select(p => p.DigestionParams));

MyFileManager myFileManager = new MyFileManager(true);

object lock1 = new object();
object lock2 = new object();
Expand All @@ -94,14 +106,39 @@
NewCollection(Path.GetFileName(origDataFile), new List<string> { taskId, "Individual Spectra Files", origDataFile });

Status("Loading spectra file...", new List<string> { taskId, "Individual Spectra Files", origDataFile });
MsDataFile myMsDataFile = myFileManager.LoadFile(origDataFile, combinedParams);

// ensure that the next file has finished loading from the async method
nextFileLoadingTask.Wait();
var myMsDataFile = nextFileLoadingTask.Result;
// if another file exists, then begin loading it in while the previous is being searched
if (origDataFile != currentRawFileList.Last())
{
int nextFileIndex = spectraFileIndex + 1;
nextFileLoadingTask = new Task<MsDataFile>(() => myFileManager.LoadFile(currentRawFileList[nextFileIndex], SetAllFileSpecificCommonParams(CommonParameters, fileSettingsList[nextFileIndex])));
nextFileLoadingTask.Start();
}
Status("Getting ms2 scans...", new List<string> { taskId, "Individual Spectra Files", origDataFile });
Ms2ScanWithSpecificMass[] arrayOfMs2ScansSortedByMass = GetMs2Scans(myMsDataFile, origDataFile, combinedParams).OrderBy(b => b.PrecursorMass).ToArray();
myFileManager.DoneWithFile(origDataFile);
SpectralMatch[] allPsmsArray = new PeptideSpectralMatch[arrayOfMs2ScansSortedByMass.Length];

//spectral library search and library generation haven't been applied to GPTMD yet
bool writeSpctralLibrary = false;

// ensure proteins are loaded in before proceeding with search
switch (proteinLoadingTask.IsCompleted)
{
case true when proteinList is null: // has finished loading but not been set
proteinList = proteinLoadingTask.Result;
break;

Check warning on line 133 in MetaMorpheus/TaskLayer/GPTMDTask/GPTMDTask.cs

View check run for this annotation

Codecov / codecov/patch

MetaMorpheus/TaskLayer/GPTMDTask/GPTMDTask.cs#L132-L133

Added lines #L132 - L133 were not covered by tests
case true when proteinList.Any(): // has finished loading and already been set
break;
case false: // has not finished loading
proteinLoadingTask.Wait();
proteinList = proteinLoadingTask.Result;
break;
}

new ClassicSearchEngine(allPsmsArray, arrayOfMs2ScansSortedByMass, variableModifications, fixedModifications, null, null, null,
proteinList, searchMode, combinedParams, this.FileSpecificParameters, null, new List<string> { taskId, "Individual Spectra Files", origDataFile }, writeSpctralLibrary).Run();
allPsms.AddRange(allPsmsArray.Where(p => p != null));
Expand All @@ -110,18 +147,11 @@
}
ReportProgress(new ProgressEventArgs(100, "Done!", new List<string> { taskId, "Individual Spectra Files" }));

allPsms = allPsms.OrderByDescending(b => b.Score)
.ThenBy(b => b.BioPolymerWithSetModsMonoisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.BioPolymerWithSetModsMonoisotopicMass.Value) : double.MaxValue)
.GroupBy(b => new Tuple<string, int, double?>(b.FullFilePath, b.ScanNumber, b.BioPolymerWithSetModsMonoisotopicMass))
.Select(b => b.First()).ToList();

new FdrAnalysisEngine(allPsms, tempSearchMode.NumNotches, CommonParameters, this.FileSpecificParameters, new List<string> { taskId }).Run();
//Move this text after search because proteins don't get loaded until search begins.
ProseCreatedWhileRunning.Append("The combined search database contained " + proteinList.Count(p => !p.IsDecoy) + " non-decoy protein entries including " + proteinList.Where(p => p.IsContaminant).Count() + " contaminant sequences. ");

var writtenFile = Path.Combine(OutputFolder, "GPTMD_Candidates.psmtsv");
WritePsmsToTsv(allPsms, writtenFile, new Dictionary<string, int>());
FinishedWritingFile(writtenFile, new List<string> { taskId });
new FdrAnalysisEngine(allPsms.OrderBy(p=>p).ToList(), tempSearchMode.NumNotches, CommonParameters, this.FileSpecificParameters, new List<string> { taskId }, doPEP: false).Run();

// get file-specific precursor mass tolerances for the GPTMD engine
var filePathToPrecursorMassTolerance = new Dictionary<string, Tolerance>();
for (int i = 0; i < currentRawFileList.Count; i++)
{
Expand All @@ -135,6 +165,7 @@
}

// run GPTMD engine
Status("Creating the GPTMD Database", new List<string> { taskId });
var gptmdResults = (GptmdResults)new GptmdEngine(allPsms, gptmdModifications, combos, filePathToPrecursorMassTolerance, CommonParameters, this.FileSpecificParameters, new List<string> { taskId }).Run();

// Stop if canceled
Expand Down Expand Up @@ -188,6 +219,7 @@
MyTaskResults.AddTaskSummaryText("Mods types and counts:");
MyTaskResults.AddTaskSummaryText(string.Join(Environment.NewLine, newModsActuallyWritten.OrderByDescending(b => b.Value).Select(b => "\t" + b.Key + "\t" + b.Value)));
}
Status("Done", new List<string> { taskId });
return MyTaskResults;
}

Expand Down
4 changes: 2 additions & 2 deletions MetaMorpheus/TaskLayer/MetaMorpheusTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -572,10 +572,10 @@ public MyTaskResults RunTask(string output_folder, List<DbForTask> currentProtei
using (StreamWriter file = new StreamWriter(proseFilePath))
{
file.WriteLine("The data analysis was performed using MetaMorpheus version " + GlobalVariables.MetaMorpheusVersion + ", available at " + "https://github.com/smith-chem-wisc/MetaMorpheus.");
file.WriteLine();
file.WriteLine();
file.Write(ProseCreatedWhileRunning.ToString());
file.WriteLine(SystemInfo.SystemProse().Replace(Environment.NewLine, "") + " ");
file.WriteLine();
file.WriteLine();
file.WriteLine("The total time to perform the " + TaskType + " task on " + currentRawDataFilepathList.Count + " spectra file(s) was " + String.Format("{0:0.00}", MyTaskResults.Time.TotalMinutes) + " minutes.");
file.WriteLine();
file.WriteLine("Published works using MetaMorpheus software are encouraged to cite the appropriate publications listed in the reference guide, found here: https://github.com/smith-chem-wisc/MetaMorpheus/blob/master/README.md.");
Expand Down
38 changes: 4 additions & 34 deletions MetaMorpheus/Test/CustomFragmentationTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,9 @@ public static void CustomFragmentationManyTasks()
CollectionAssert.AreEquivalent(customIons, loadedSearchTask.CommonParameters.CustomIons);
Assert.That(loadedSearchTask.CommonParameters.DissociationType, Is.EqualTo(DissociationType.Custom));

// read gptmd and search results to ensure matched ions are correct
var gptmdResults = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "GPTMD", "GPTMD_Candidates.psmtsv"), out List<string> warnings);
var searchResults = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "Search", "AllPSMs.psmtsv"), out List<string> warnings);
Assert.That(!warnings.Any());
var productIons = gptmdResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons, productIons);

var searchResults = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "Search", "AllPSMs.psmtsv"), out warnings);
Assert.That(!warnings.Any());
productIons = searchResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
var productIons = searchResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons, productIons);

Expand All @@ -156,13 +149,6 @@ public static void CustomFragmentationManyTasks()
CollectionAssert.AreEquivalent(customIons, loadedSearchTask.CommonParameters.CustomIons);
Assert.That(loadedSearchTask.CommonParameters.DissociationType, Is.EqualTo(DissociationType.Custom));

// read gptmd and search results to ensure matched ions are correct
gptmdResults = PsmTsvReader.ReadTsv(Path.Combine(newOutputFolder, "GPTMD", "GPTMD_Candidates.psmtsv"), out warnings);
Assert.That(!warnings.Any());
productIons = gptmdResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons, productIons);

searchResults = PsmTsvReader.ReadTsv(Path.Combine(newOutputFolder, "Search", "AllPSMs.psmtsv"), out warnings);
Assert.That(!warnings.Any());
productIons = searchResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
Expand Down Expand Up @@ -298,16 +284,9 @@ public static void CustomFragmentIonsManySearchTasksContainingDifferentIons()
CollectionAssert.AreEquivalent(customIons3, loadedSearchTask3.CommonParameters.CustomIons);
Assert.That(loadedSearchTask3.CommonParameters.DissociationType == DissociationType.Custom);

// read gptmd and search results to ensure matched ions are correct
var gptmdResults = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "GPTMD", "GPTMD_Candidates.psmtsv"), out List<string> warnings);
var searchResults1 = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "Search1", "AllPSMs.psmtsv"), out List<string> warnings);
Assert.That(!warnings.Any());
var productIons = gptmdResults.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons2, productIons);

var searchResults1 = PsmTsvReader.ReadTsv(Path.Combine(outputFolder, "Search1", "AllPSMs.psmtsv"), out warnings);
Assert.That(!warnings.Any());
productIons = searchResults1.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
var productIons = searchResults1.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons1, productIons);

Expand Down Expand Up @@ -365,15 +344,6 @@ public static void CustomFragmentIonsManySearchTasksContainingDifferentIons()
CollectionAssert.AreEquivalent(customIons3, loadedSearchTask3.CommonParameters.CustomIons);
Assert.That(loadedSearchTask3.CommonParameters.DissociationType == DissociationType.Custom);

// read gptmd and search results to ensure matched ions are correct
gptmdResults = PsmTsvReader.ReadTsv(Path.Combine(newOutputFolder, "GPTMD", "GPTMD_Candidates.psmtsv"),
out warnings);
Assert.That(!warnings.Any());
productIons = gptmdResults
.SelectMany(p => p.MatchedIons.Select(m => m.NeutralTheoreticalProduct.ProductType))
.Distinct();
CollectionAssert.AreEquivalent(customIons2, productIons);

searchResults1 =
PsmTsvReader.ReadTsv(Path.Combine(newOutputFolder, "Search1", "AllPSMs.psmtsv"), out warnings);
Assert.That(!warnings.Any());
Expand Down
Loading