Skip to content

Commit

Permalink
Update NuGet dependencies, add exception handler, and add option to ignore duplicate IDs when parsing the mzid
Browse files Browse the repository at this point in the history
  • Loading branch information
FarmGeek4Life committed Mar 27, 2018
1 parent cdd050a commit f5d3308
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 121 deletions.
7 changes: 7 additions & 0 deletions MzidToTsvConverter/ConverterOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ public ConverterOptions()
[Option("singleResult", "1", HelpText = "Only output one result per spectrum", HelpShowsDefault = true)]
public bool SingleResultPerSpectrum { get; set; }

/// <summary>
/// Command-line switch "skipDupIds": when set, duplicate IDs found while parsing the mzid file
/// are ignored rather than treated as an error (per the help text, at the likely cost of some correctness).
/// </summary>
[Option("skipDupIds", HelpText = "If there are issues converting a file due to \"duplicate ID\" errors, specifying this will cause the duplicate IDs to be ignored, at the likely cost of some correctness.", HelpShowsDefault = true)]
public bool SkipDuplicateIds { get; set; }

public string AutoNameTsvFromMzid(string mzidPath)
{
var path = mzidPath;
Expand Down Expand Up @@ -80,6 +83,10 @@ public void OutputSetOptions()
Console.WriteLine("unroll results: {0}", UnrollResults);
Console.WriteLine("show decoy: {0}", ShowDecoy);
Console.WriteLine("single result per spectrum: {0}", SingleResultPerSpectrum);
if (SkipDuplicateIds)
{
Console.WriteLine("skipping duplicate IDs");
}
}

}
Expand Down
261 changes: 143 additions & 118 deletions MzidToTsvConverter/MzidToTsvConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,168 +36,193 @@ public void ConvertToTsv(ConverterOptions options)
tsvPath = options.TsvPath;
}

ConvertToTsv(mzidFile.FullName, tsvPath, options.ShowDecoy, options.UnrollResults, options.SingleResultPerSpectrum);
ConvertToTsv(mzidFile.FullName, tsvPath, options.ShowDecoy, options.UnrollResults, options.SingleResultPerSpectrum, options.SkipDuplicateIds);
}

}
else
{
ConvertToTsv(options.MzidPath, options.TsvPath, options.ShowDecoy, options.UnrollResults, options.SingleResultPerSpectrum);
ConvertToTsv(options.MzidPath, options.TsvPath, options.ShowDecoy, options.UnrollResults, options.SingleResultPerSpectrum, options.SkipDuplicateIds);
}
}

public void ConvertToTsv(string mzidPath, string tsvPath, bool showDecoy = true, bool unrollResults = true, bool singleResult = false)
public void ConvertToTsv(string mzidPath, string tsvPath, bool showDecoy = true, bool unrollResults = true, bool singleResult = false, bool skipDuplicateIds = false)
{
var reader = new SimpleMZIdentMLReader();
using (var data = reader.ReadLowMem(mzidPath))
using (var stream = new StreamWriter(new FileStream(tsvPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite)))
var reader = new SimpleMZIdentMLReader(skipDuplicateIds, s => Console.WriteLine("MZID PARSE ERROR: {0}", s));
try
{
var headers = new List<string>
using (var data = reader.ReadLowMem(mzidPath))
using (var stream = new StreamWriter(new FileStream(tsvPath, FileMode.Create, FileAccess.Write, FileShare.ReadWrite)))
{
"#SpecFile", "SpecID", "ScanNum", "FragMethod", "Precursor", "IsotopeError", "PrecursorError(ppm)", "Charge", "Peptide",
"Protein", "DeNovoScore", "MSGFScore", "SpecEValue", "EValue", "QValue", "PepQValue"
};

// SPECIAL CASE:
// Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
// SpectrumIdentificationItems was correct, but if there was a modification in the first 3 residues there was at
// least a 50% chance of the PeptideEvidenceRefs within the SpectrumIdentificationItem being incorrect. So, for
// those bad versions, use the peptide_ref rather than the PeptideEvidenceRefs to get the sequence.
var isBadMsGfMzid = false;
if (data.AnalysisSoftwareCvAccession.ToUpper().Contains("MS:1002048") && !string.IsNullOrWhiteSpace(data.AnalysisSoftwareVersion))
{
// bad versions: v10280 (introduced), v10282, v2016.01.20, v2016.01.21, v2016.01.29, v2016.02.12, v2016.05.25, v2016.0.13, v2016.06.13, v2016.06.14, v2016.06.15, v2016.06.29, v2016.07.26, v2016.08.31, v2016.09.07, v2016.09.22, v2016.09.23 (fixed with version v2016.10.10)
var badVersions = new[]
var headers = new List<string>
{
"v10280", "v10282", "v2016.01.20", "v2016.01.21", "v2016.01.29", "v2016.02.12", "v2016.05.25", "v2016.0.13", "v2016.06.13",
"v2016.06.14", "v2016.06.15", "v2016.06.29", "v2016.07.26", "v2016.08.31", "v2016.09.07", "v2016.09.22", "v2016.09.23"
"#SpecFile",
"SpecID",
"ScanNum",
"FragMethod",
"Precursor",
"IsotopeError",
"PrecursorError(ppm)",
"Charge",
"Peptide",
"Protein",
"DeNovoScore",
"MSGFScore",
"SpecEValue",
"EValue",
"QValue",
"PepQValue"
};
foreach (var version in badVersions)

// SPECIAL CASE:
// Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
// SpectrumIdentificationItems was correct, but if there was a modification in the first 3 residues there was at
// least a 50% chance of the PeptideEvidenceRefs within the SpectrumIdentificationItem being incorrect. So, for
// those bad versions, use the peptide_ref rather than the PeptideEvidenceRefs to get the sequence.
var isBadMsGfMzid = false;
if (data.AnalysisSoftwareCvAccession.ToUpper().Contains("MS:1002048") && !string.IsNullOrWhiteSpace(data.AnalysisSoftwareVersion))
{
if (data.AnalysisSoftwareVersion.Contains(version))
// bad versions: v10280 (introduced), v10282, v2016.01.20, v2016.01.21, v2016.01.29, v2016.02.12, v2016.05.25, v2016.0.13, v2016.06.13, v2016.06.14, v2016.06.15, v2016.06.29, v2016.07.26, v2016.08.31, v2016.09.07, v2016.09.22, v2016.09.23 (fixed with version v2016.10.10)
var badVersions = new[]
{
isBadMsGfMzid = true;
"v10280", "v10282", "v2016.01.20", "v2016.01.21", "v2016.01.29", "v2016.02.12", "v2016.05.25", "v2016.0.13",
"v2016.06.13",
"v2016.06.14", "v2016.06.15", "v2016.06.29", "v2016.07.26", "v2016.08.31", "v2016.09.07", "v2016.09.22", "v2016.09.23"
};
foreach (var version in badVersions)
{
if (data.AnalysisSoftwareVersion.Contains(version))
{
isBadMsGfMzid = true;
}
}
}
}

if (isBadMsGfMzid)
{
ShowWarning(string.Format(
"Warning: file \"{0}\" was created with a version of MS-GF+ that had some erroneous output in the mzid file." +
" Using sequences from the peptide_ref attribute instead of the PeptideEvidenceRef element to try to bypass the issue.",
mzidPath));
}

stream.WriteLine(string.Join("\t", headers));

var lastScanNum = 0;
var resultsWritten = 0;
var writtenCount = 0;

foreach (var id in data.Identifications)
{
if (singleResult && id.ScanNum == lastScanNum)
if (isBadMsGfMzid)
{
continue;
ShowWarning(string.Format(
"Warning: file \"{0}\" was created with a version of MS-GF+ that had some erroneous output in the mzid file." +
" Using sequences from the peptide_ref attribute instead of the PeptideEvidenceRef element to try to bypass the issue.",
mzidPath));
}

writtenCount++;
stream.WriteLine(string.Join("\t", headers));

lastScanNum = id.ScanNum;
var specFile = data.SpectrumFile;
var specId = id.NativeId;
var scanNum = id.ScanNum;
var fragMethod = "CID";
if (id.AllParamsDict.ContainsKey("AssumedDissociationMethod"))
{
fragMethod = id.AllParamsDict["AssumedDissociationMethod"];
}
var lastScanNum = 0;
var resultsWritten = 0;
var writtenCount = 0;

var precursor = id.ExperimentalMz;
var isotopeError = "0";
if (id.AllParamsDict.ContainsKey("IsotopeError"))
foreach (var id in data.Identifications)
{
isotopeError = id.AllParamsDict["IsotopeError"];
}

var adjExpMz = id.ExperimentalMz - IsotopeMass * int.Parse(isotopeError) / id.Charge;
//var precursorError = (id.CalMz - id.ExperimentalMz) / id.CalMz * 1e6;
var precursorError = (adjExpMz - id.CalMz) / id.CalMz * 1e6;

var charge = id.Charge;
var deNovoScore = id.DeNovoScore;
var msgfScore = id.RawScore;
var specEValue = id.SpecEv;
var eValue = id.EValue;
var qValue = id.QValue;
var pepQValue = id.PepQValue;

var dedup = new HashSet<string>();

foreach (var pepEv in id.PepEvidence)
{
if (!showDecoy && pepEv.IsDecoy)
if (singleResult && id.ScanNum == lastScanNum)
{
continue;
}

var peptideWithModsAndContext = pepEv.SequenceWithNumericMods;
// Produce correct output with bad MS-GF+ mzid
if (isBadMsGfMzid)
{
// Add the prefix and suffix residues for this protein
// Do not use pepEv.SequenceWithNumericMods; it isn't necessarily correct for this spectrum
peptideWithModsAndContext = pepEv.Pre + "." + id.Peptide.SequenceWithNumericMods + "." + pepEv.Post;
}
writtenCount++;

var protein = pepEv.DbSeq.Accession;
if (!dedup.Add(peptideWithModsAndContext + protein))
lastScanNum = id.ScanNum;
var specFile = data.SpectrumFile;
var specId = id.NativeId;
var scanNum = id.ScanNum;
var fragMethod = "CID";
if (id.AllParamsDict.ContainsKey("AssumedDissociationMethod"))
{
continue;
fragMethod = id.AllParamsDict["AssumedDissociationMethod"];
}

// Write out EValues to 5 sig figs, using scientific notation below 0.0001
var specEValueString = StringUtilities.ValueToString(specEValue, 5, 1000);
var eValueString = StringUtilities.ValueToString(eValue, 5, 1000);

// Write out QValue using 5 digits after the decimal, though use scientific notation below 0.00005
var qValueString = StringUtilities.DblToString(qValue, 5, 0.00005);
var pepQValueString = StringUtilities.DblToString(pepQValue, 5, 0.00005);

if (resultsWritten == 0)
var precursor = id.ExperimentalMz;
var isotopeError = "0";
if (id.AllParamsDict.ContainsKey("IsotopeError"))
{
// Assure that the first row has 0.0 for score fields (helps in loading data into Access or SQL server)
if (specEValueString == "0") specEValueString = "0.0";
if (eValueString == "0") eValueString = "0.0";
if (qValueString == "0") qValueString = "0.0";
if (pepQValueString == "0") pepQValueString = "0.0";
isotopeError = id.AllParamsDict["IsotopeError"];
}

var line = string.Format(CultureInfo.InvariantCulture,
"{0}\t{1}\t{2}\t{3}\t{4:0.0####}\t{5}\t{6:0.0###}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14:0.0####}\t{15:0.0####}",
specFile, specId, scanNum, fragMethod, precursor, isotopeError, precursorError, charge, peptideWithModsAndContext, protein,
deNovoScore, msgfScore, specEValueString, eValueString, qValueString, pepQValueString);

stream.WriteLine(line);
var adjExpMz = id.ExperimentalMz - IsotopeMass * int.Parse(isotopeError) / id.Charge;
//var precursorError = (id.CalMz - id.ExperimentalMz) / id.CalMz * 1e6;
var precursorError = (adjExpMz - id.CalMz) / id.CalMz * 1e6;

resultsWritten += 1;
var charge = id.Charge;
var deNovoScore = id.DeNovoScore;
var msgfScore = id.RawScore;
var specEValue = id.SpecEv;
var eValue = id.EValue;
var qValue = id.QValue;
var pepQValue = id.PepQValue;

var dedup = new HashSet<string>();

if (!unrollResults)
foreach (var pepEv in id.PepEvidence)
{
break;
if (!showDecoy && pepEv.IsDecoy)
{
continue;
}

var peptideWithModsAndContext = pepEv.SequenceWithNumericMods;
// Produce correct output with bad MS-GF+ mzid
if (isBadMsGfMzid)
{
// Add the prefix and suffix residues for this protein
// Do not use pepEv.SequenceWithNumericMods; it isn't necessarily correct for this spectrum
peptideWithModsAndContext = pepEv.Pre + "." + id.Peptide.SequenceWithNumericMods + "." + pepEv.Post;
}

var protein = pepEv.DbSeq.Accession;
if (!dedup.Add(peptideWithModsAndContext + protein))
{
continue;
}

// Write out EValues to 5 sig figs, using scientific notation below 0.0001
var specEValueString = StringUtilities.ValueToString(specEValue, 5, 1000);
var eValueString = StringUtilities.ValueToString(eValue, 5, 1000);

// Write out QValue using 5 digits after the decimal, though use scientific notation below 0.00005
var qValueString = StringUtilities.DblToString(qValue, 5, 0.00005);
var pepQValueString = StringUtilities.DblToString(pepQValue, 5, 0.00005);

if (resultsWritten == 0)
{
// Assure that the first row has 0.0 for score fields (helps in loading data into Access or SQL server)
if (specEValueString == "0") specEValueString = "0.0";
if (eValueString == "0") eValueString = "0.0";
if (qValueString == "0") qValueString = "0.0";
if (pepQValueString == "0") pepQValueString = "0.0";
}

var line = string.Format(CultureInfo.InvariantCulture,
"{0}\t{1}\t{2}\t{3}\t{4:0.0####}\t{5}\t{6:0.0###}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14:0.0####}\t{15:0.0####}",
specFile, specId, scanNum, fragMethod, precursor, isotopeError, precursorError, charge, peptideWithModsAndContext,
protein,
deNovoScore, msgfScore, specEValueString, eValueString, qValueString, pepQValueString);

stream.WriteLine(line);

resultsWritten += 1;


if (!unrollResults)
{
break;
}
}
}
}

if (writtenCount == 0)
{
ShowWarning("Warning: .mzID file does not have any results");
System.Threading.Thread.Sleep(1500);
return;
if (writtenCount == 0)
{
ShowWarning("Warning: .mzID file does not have any results");
System.Threading.Thread.Sleep(1500);
return;
}
}
}
catch (SimpleMZIdentMLReader.DuplicateKeyException e)
{
    // Duplicate-key failures are reported on the console instead of propagating to the caller.
    Console.WriteLine("MZID PARSE ERROR: {0}", e.Message);
    Console.WriteLine("This type of error is usually caused by an error in the MZID output.");
    // Only one format argument is supplied, so the placeholder must be {0};
    // the previous "{1}" made Console.WriteLine throw a FormatException inside this handler.
    Console.WriteLine("StackTrace: {0}", e.StackTrace);
}
}

public const double C = 12.0f;
Expand Down
6 changes: 3 additions & 3 deletions MzidToTsvConverter/MzidToTsvConverter.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<RootNamespace>MzidToTsvConverter</RootNamespace>
<AssemblyName>MzidToTsvConverter</AssemblyName>
<PackageId>MzidToTsvConverter</PackageId>
<Version>1.2.0</Version>
<Version>1.2.1</Version>
<AssemblyVersion>$(Version)</AssemblyVersion>
<FileVersion>$(Version)</FileVersion>
<Description>Converts mzid[.gz] files to the MS-GF+ tsv format. Designed for MS-GF+ mzid files (looks for EValue and SpecEValue scores)</Description>
Expand All @@ -19,8 +19,8 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="PRISM-Library" Version="2.4.58" />
<PackageReference Include="PSI_Interface" Version="1.3.5" />
<PackageReference Include="PRISM-Library" Version="2.4.82" />
<PackageReference Include="PSI_Interface" Version="1.3.8" />
</ItemGroup>

</Project>

0 comments on commit f5d3308

Please sign in to comment.