Skip to content

Commit

Permalink
Add option and capability to re-create Peptide/PeptideEvidence IDs, t…
Browse files Browse the repository at this point in the history
…o permit merging old MS-GF+ split-fasta searches
  • Loading branch information
FarmGeek4Life committed Oct 9, 2018
1 parent 56e21bd commit 103de10
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 15 deletions.
7 changes: 7 additions & 0 deletions MzidMerger/MzidMerger.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,18 @@
<RepositoryUrl>https://github.com/PNNL-Comp-Mass-Spec/MzidMerger</RepositoryUrl>
<RepositoryType>GitHub</RepositoryType>
<Copyright>Copyright @ PNNL 2018</Copyright>
<PlatformTarget>AnyCPU</PlatformTarget>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="PRISM-Library" Version="2.4.93" />
<PackageReference Include="PSI_Interface" Version="1.3.22" />
</ItemGroup>

<!-- Disabled for now; causes an infinite build loop without the -\-no-build flag, which isn't available for .NET Core 2.0 (requires 2.1)
<Target Name="PostBuild" AfterTargets="PostBuildEvent">
<Exec Command="dotnet publish $(ProjectPath) -c $(ConfigurationName) -f netcoreapp2.0 -/-no-build" />
</Target>
-->

</Project>
143 changes: 128 additions & 15 deletions MzidMerger/MzidMerging.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,18 @@ public static void MergeMzids(Options options)
{
var sw = System.Diagnostics.Stopwatch.StartNew();
var targetFile = options.FilesToMerge.First();
var toMerge = options.FilesToMerge.Skip(1).ParallelPreprocess(x => IdentDataReaderWriter.Read(x), 2);
IEnumerable<IdentDataObj> toMerge = null;
if (!options.MultiThread)
{
toMerge = options.FilesToMerge.Skip(1).Select(x => ReadAndPreprocessFile(x, options));
}
else
{
toMerge = options.FilesToMerge.Skip(1).ParallelPreprocess(x => ReadAndPreprocessFile(x, options), 2);
}

Console.WriteLine("Reading first file (the merge target)...");
var targetObj = IdentDataReaderWriter.Read(targetFile);
var targetObj = ReadAndPreprocessFile(targetFile, options);

var merger = new MzidMerging(targetObj);
merger.MergeIdentData(toMerge, options.MaxSpecEValue, options.KeepOnlyBestResults, true);
Expand Down Expand Up @@ -64,6 +72,116 @@ public static void MergeMzids(Options options)
Console.WriteLine("Total time to merge {0} files: {1}", options.FilesToMerge.Count, sw.Elapsed);
}

/// <summary>
/// Reads an mzid file and, when <c>options.FixIDs</c> is set, re-creates the IDs of every
/// Peptide and PeptideEvidence in its sequence collection so that they are derived from
/// the peptide sequence, its non-fixed modifications and the protein position.
/// </summary>
/// <remarks>
/// Peptide IDs become <c>Pep_</c> followed by the sequence with each non-fixed modification
/// mass inserted after the residue it applies to (N-terminal mods are prefixed with <c>[</c>,
/// C-terminal mods are appended after <c>}</c>). PeptideEvidence IDs become
/// <c>PepEv_{db}_{peptide}_{start}</c>, where <c>{db}</c> is the DBSequence ID; if that ID has
/// the form <c>DBSeq&lt;number&gt;</c> it is replaced by <c>number + start - 1</c>.
/// Re-creating the IDs is best-effort: any exception is reported on the console and the data
/// is returned as-is (possibly with only part of the IDs rewritten).
/// </remarks>
/// <param name="filePath">Path of the mzid file to read.</param>
/// <param name="options">Merge options; only <c>FixIDs</c> is consulted here.</param>
/// <returns>The data read from <paramref name="filePath"/>.</returns>
private static IdentDataObj ReadAndPreprocessFile(string filePath, Options options)
{
    var identData = IdentDataReaderWriter.Read(filePath);

    if (!options.FixIDs)
    {
        return identData;
    }

    // IDs must not depend on the machine's locale (e.g. ',' as the decimal separator).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    // Sign is always written; two decimals. The previous format ("+F2") is a custom format
    // string without digit placeholders, so it rendered the literal text "+F2" for every mass.
    const string massTagFormat = "+0.00;-0.00";

    try
    {
        // Index the fixed modifications of the first identification protocol by mass (4 decimals).
        var fixedModDict = new Dictionary<string, List<SearchModificationObj>>();
        var searchMods = identData.AnalysisProtocolCollection.SpectrumIdentificationProtocols[0].ModificationParams.Where(x => x.FixedMod);
        foreach (var searchMod in searchMods)
        {
            var massKey = searchMod.MassDelta.ToString("F4", invariant);
            if (!fixedModDict.TryGetValue(massKey, out var sameMassMods))
            {
                sameMassMods = new List<SearchModificationObj>();
                fixedModDict.Add(massKey, sameMassMods);
            }

            sameMassMods.Add(searchMod);
        }

        foreach (var peptide in identData.SequenceCollection.Peptides)
        {
            // Re-ID peptides...
            var seq = peptide.PeptideSequence;

            // Work from the highest location down, so that text inserted for one modification
            // never shifts the insert position of the modifications still to be processed.
            foreach (var mod in peptide.Modifications.OrderByDescending(x => x.Location).ThenByDescending(x => x.MonoisotopicMassDelta))
            {
                var isNTerm = mod.Location == 0;
                var isCTerm = mod.Location == peptide.PeptideSequence.Length + 1;

                var massStr = mod.MonoisotopicMassDelta.ToString("F4", invariant);
                // match to mass and residue (backward lookup from location)
                if (fixedModDict.TryGetValue(massStr, out var fixedMods))
                {
                    var residue = isNTerm || isCTerm
                        ? "."
                        : peptide.PeptideSequence[mod.Location - 1].ToString();

                    // TODO: should also check specificity rules
                    if (fixedMods.Any(x => x.Residues.Contains(residue)))
                    {
                        // Fixed modification: it is the same for every peptide, so it is left out of the ID.
                        continue;
                    }
                }

                var massTag = mod.MonoisotopicMassDelta.ToString(massTagFormat, invariant);
                if (isNTerm)
                {
                    seq = "[" + massTag + seq;
                }
                else if (isCTerm)
                {
                    seq += "}" + massTag;
                }
                else
                {
                    // Location is 1-based, so this inserts the mass right after the modified residue.
                    seq = seq.Substring(0, mod.Location) + massTag + seq.Substring(mod.Location);
                }
            }

            peptide.Id = "Pep_" + seq;
        }

        foreach (var pepEv in identData.SequenceCollection.PeptideEvidences)
        {
            // Re-Id PeptideEvidences
            var dbseq = pepEv.DBSequence.Id;
            if (dbseq.StartsWith("DBSeq", StringComparison.OrdinalIgnoreCase))
            {
                dbseq = dbseq.Substring(5);
                if (int.TryParse(dbseq, out var offset))
                {
                    dbseq = (offset + pepEv.Start - 1).ToString(invariant);
                }
            }

            // Strip the "Pep_" prefix that was assigned to every peptide above.
            var pepId = pepEv.Peptide.Id.Substring(4);

            pepEv.Id = $"PepEv_{dbseq}_{pepId}_{pepEv.Start}";
        }
    }
    catch (Exception e)
    {
        // Best-effort: report the problem and continue with whatever IDs were already rewritten.
        Console.WriteLine("WARNING: Could not re-create all Peptide/PeptideEvidence IDs for \"{0}\"", filePath);
        Console.WriteLine(e);
    }

    return identData;
}

private void MergeIdentData(IEnumerable<IdentDataObj> toMerge, double maxSpecEValue, bool keepOnlyBestResult, bool remapPostMerge)
{
var mergedCount = 2; // start at 2, since we are merging into the first file.
Expand Down Expand Up @@ -129,7 +247,7 @@ public static void MergeMzidsDivideAndConquer(Options options)

// Semaphore: initialCount, is the number initially available, maximumCount is the max allowed
var threadLimiter = new Semaphore(options.MaxThreads, options.MaxThreads);
var mergedData = DivideAndConquerMergeIdentData(options.FilesToMerge, threadLimiter, options.MaxSpecEValue, options.KeepOnlyBestResults, true).targetIdentDataObj;
var mergedData = DivideAndConquerMergeIdentData(options.FilesToMerge, threadLimiter, options.MaxSpecEValue, options.KeepOnlyBestResults, true, options).targetIdentDataObj;

sw.Stop();
Console.WriteLine("Mzid read time: {0}", readTime);
Expand All @@ -150,23 +268,18 @@ public static void MergeMzidsDivideAndConquer(Options options)
private static readonly object ReadTimeWriteLock = new object();
private static readonly object MergeTimeWriteLock = new object();

private MzidMerging(string filePath)
private MzidMerging(string filePath, Options options)
{
var sw = System.Diagnostics.Stopwatch.StartNew();
var mzid = MzIdentMlReaderWriter.Read(filePath);
sw.Stop();
var myReadTime = sw.Elapsed;
sw.Restart();
targetIdentDataObj = new IdentDataObj(mzid);
targetIdentDataObj = ReadAndPreprocessFile(filePath, options);
sw.Stop();
lock (ReadTimeWriteLock)
{
readTime += myReadTime;
readConvertTime += sw.Elapsed;
readTime += sw.Elapsed;
}
}

private static MzidMerging DivideAndConquerMergeIdentData(List<string> filePaths, Semaphore threadLimiter, double maxSpecEValue, bool keepOnlyBestResult, bool finalize)
private static MzidMerging DivideAndConquerMergeIdentData(List<string> filePaths, Semaphore threadLimiter, double maxSpecEValue, bool keepOnlyBestResult, bool finalize, Options options)
{
if (filePaths.Count >= 2)
{
Expand All @@ -177,8 +290,8 @@ private static MzidMerging DivideAndConquerMergeIdentData(List<string> filePaths

/**/
// run in parallel
var merged1Task = Task.Run(() => DivideAndConquerMergeIdentData(firstHalf, threadLimiter, maxSpecEValue, keepOnlyBestResult, false));
var merged2Task = Task.Run(() => DivideAndConquerMergeIdentData(secondHalf, threadLimiter, maxSpecEValue, keepOnlyBestResult, false));
var merged1Task = Task.Run(() => DivideAndConquerMergeIdentData(firstHalf, threadLimiter, maxSpecEValue, keepOnlyBestResult, false, options));
var merged2Task = Task.Run(() => DivideAndConquerMergeIdentData(secondHalf, threadLimiter, maxSpecEValue, keepOnlyBestResult, false, options));

// wait for them to complete
Task.WaitAll(merged1Task, merged2Task);
Expand Down Expand Up @@ -222,7 +335,7 @@ private static MzidMerging DivideAndConquerMergeIdentData(List<string> filePaths
if (filePaths.Count == 1)
{
threadLimiter.WaitOne();
var merger = new MzidMerging(filePaths[0]);
var merger = new MzidMerging(filePaths[0], options);
threadLimiter.Release();
return merger;
}
Expand Down
12 changes: 12 additions & 0 deletions MzidMerger/Options.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ public class Options
[Option("threads", Hidden = true, HelpText = "Max number of threads to use.")]
public int MaxThreads { get; set; }

[Option("fixIds", HelpText = "Fix the peptide and peptideEvidence IDs. Only use for e.g. older MS-GF+ results, that output many errors about duplicate IDs. Only fixes Peptide and PeptideEvidence IDs.")]
public bool FixIDs { get; set; }

[Option("multithread", HelpText = "If supplied, program will attempt to decrease merge time by reading multiple files in parallel. Will also require more memory, and is more likely to crash.")]
public bool MultiThread { get; set; }

public List<string> FilesToMerge { get; } = new List<string>();

public Options()
Expand All @@ -39,6 +45,8 @@ public Options()
KeepOnlyBestResults = false;
AllowHighResourceUsage = false;
MaxThreads = GetOptimalMaxThreads();
FixIDs = false;
MultiThread = false;
}

public bool Validate()
Expand Down Expand Up @@ -184,6 +192,10 @@ public static string GetLeftIdentical(List<string> input)
private int GetOptimalMaxThreads()
{
var cores = SystemInfo.GetCoreCount();
if (cores == -1)
{
Console.WriteLine("NOTE: Above error about the CPU info can be ignored.");
}
var threads = SystemInfo.GetLogicalCoreCount();
if (cores == threads)
{
Expand Down

0 comments on commit 103de10

Please sign in to comment.