diff --git a/MzidToTsvConverter/ConverterOptions.cs b/MzidToTsvConverter/ConverterOptions.cs
index 85f3a93..566a712 100644
--- a/MzidToTsvConverter/ConverterOptions.cs
+++ b/MzidToTsvConverter/ConverterOptions.cs
@@ -1,12 +1,17 @@
using System;
using System.Collections.Generic;
using System.IO;
+using System.Text.RegularExpressions;
using PRISM;
namespace MzidToTsvConverter
{
public class ConverterOptions
{
+ // Default regex: supports UniProt SwissProt files, and any other source that uses the same identifier format
+ // Pattern description: match "sp|[protein ID: 6+ alphanumeric or '-']|[CAPTURE gene ID: 2+ uppercase alphanumeric]_[species code: 2+ uppercase alphanumeric]"
+ // The lookbehind and lookahead are zero-width, so the "sp|...|" prefix and "_SPECIES" suffix are required but excluded from Match.Value
+ public const string DefaultGeneIdRegexPattern = @"(?<=sp\|[0-9a-zA-Z\-]{6,}\|)([A-Z0-9]{2,})(?=_[A-Z0-9]{2,})";
+
public ConverterOptions()
{
MzidPath = string.Empty;
@@ -19,6 +24,10 @@ public ConverterOptions()
MaxEValue = 0;
MaxQValue = 0;
NoExtendedFields = false;
+ AddGeneId = false;
+
+ GeneIdRegexPattern = DefaultGeneIdRegexPattern;
+ GeneIdRegex = null;
}
[Option("mzid", Required = true, ArgPosition = 1,
@@ -71,6 +80,14 @@ public ConverterOptions()
[Option("ne", "noExtended", HelpText = "If specified, does not add extended fields to the TSV output (e.g., scan time).")]
public bool NoExtendedFields { get; set; }
+ /// <summary>
+ /// Regular expression pattern used to extract the gene ID that is written to the 'GeneID' output column
+ /// </summary>
+ /// <remarks>
+ /// Initialized to DefaultGeneIdRegexPattern by the constructor.
+ /// ValidateArgs replaces a blank value with the default and compiles the pattern into GeneIdRegex, but only when AddGeneId is true.
+ /// </remarks>
+ [Option("geneid",
+ HelpText = "If specified, adds a 'GeneID' column to the output for non-decoy identification. " +
+ "Can supply a regular expression to extract it from the protein identifier/description. " +
+ "Default expression supports the UniProt SwissProt format.", ArgExistsProperty = nameof(AddGeneId))]
+ public string GeneIdRegexPattern { get; set; }
+
+ /// <summary>
+ /// When true, a 'GeneID' column is added to the TSV output
+ /// </summary>
+ /// <remarks>
+ /// Named as the ArgExistsProperty of the "geneid" option; presumably set by the argument parser
+ /// whenever that switch is present, with or without a pattern (confirm against the PRISM command line parser).
+ /// </remarks>
+ public bool AddGeneId { get; set; }
+
///
/// True if we are processing all .mzid or .mzid.gz files in a directory
///
@@ -78,6 +95,8 @@ public ConverterOptions()
public List MzidPaths { get; } = new List();
+ /// <summary>
+ /// Compiled form of GeneIdRegexPattern; null unless ValidateArgs has built it, which it only does when AddGeneId is true
+ /// </summary>
+ public Regex GeneIdRegex { get; private set; }
+
public string AutoNameTsvFromMzid(string mzidPath)
{
var path = mzidPath;
@@ -117,13 +136,31 @@ public bool ValidateArgs(out string errorMessage)
return false;
}
+ if (AddGeneId)
+ {
+ // The gene ID switch may be given without an expression; fall back to the default pattern
+ if (string.IsNullOrWhiteSpace(GeneIdRegexPattern))
+ {
+ GeneIdRegexPattern = DefaultGeneIdRegexPattern;
+ }
+
+ try
+ {
+ // Build the expression once here so that an invalid pattern is reported before any conversion starts
+ GeneIdRegex = new Regex(GeneIdRegexPattern, RegexOptions.Compiled);
+ }
+ catch (ArgumentException ex)
+ {
+ // The Regex constructor reports pattern parsing errors as ArgumentException;
+ // include its message so the user can see what is wrong with the expression
+ errorMessage = "ERROR: GeneID Regex is not a valid regular expression: " + ex.Message;
+ return false;
+ }
+ }
+
string mzidFileDirectory;
if (HasWildcard(MzidPath))
{
mzidFileDirectory = GetParentDirectoryPath(MzidPath);
- } else
-
+ }
+ else
{
var mzidFile = new FileInfo(MzidPath);
if (mzidFile.Exists)
@@ -292,6 +329,11 @@ public void OutputSetOptions()
Console.WriteLine("Show decoy: {0}", ShowDecoy);
Console.WriteLine("Single result per spectrum: {0}", SingleResultPerSpectrum);
+ if (AddGeneId)
+ {
+ Console.WriteLine("Adding gene IDs to the output using regular expression \"{0}\"", GeneIdRegexPattern);
+ }
+
if (SkipDuplicateIds)
{
Console.WriteLine("Skipping duplicate IDs");
diff --git a/MzidToTsvConverter/MzidToTsvConverter.cs b/MzidToTsvConverter/MzidToTsvConverter.cs
index d80e589..da1f75f 100644
--- a/MzidToTsvConverter/MzidToTsvConverter.cs
+++ b/MzidToTsvConverter/MzidToTsvConverter.cs
@@ -98,7 +98,7 @@ public void ConvertToTsv(
{
csv.Configuration.AllowComments = false;
csv.Configuration.Delimiter = "\t";
- csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields));
+ csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId));
// SPECIAL CASE:
// Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
@@ -200,6 +200,17 @@ public void ConvertToTsv(
}
match.Protein = pepEv.DbSeq.Accession;
+
+ // GeneId stays empty unless gene IDs were requested, the hit is not a decoy,
+ // a protein description is available, and the expression matches it
+ match.GeneId = "";
+ if (options.AddGeneId && !pepEv.IsDecoy && !string.IsNullOrEmpty(pepEv.DbSeq.ProteinDescription))
+ {
+ // Regex.Match throws ArgumentNullException for null input, hence the description check above
+ var geneMatch = options.GeneIdRegex.Match(pepEv.DbSeq.ProteinDescription);
+ if (geneMatch.Success)
+ {
+ // The entire match is used as the gene ID, not a specific capture group
+ match.GeneId = geneMatch.Value;
+ }
+ }
+
if (!uniquePepProteinList.Add(match.Peptide + match.Protein))
{
continue;
diff --git a/MzidToTsvConverter/MzidToTsvConverter.csproj b/MzidToTsvConverter/MzidToTsvConverter.csproj
index 295e300..28f8650 100644
--- a/MzidToTsvConverter/MzidToTsvConverter.csproj
+++ b/MzidToTsvConverter/MzidToTsvConverter.csproj
@@ -6,7 +6,7 @@
MzidToTsvConverter
MzidToTsvConverter
MzidToTsvConverter
- 1.4.1
+ 1.4.2
$(Version)
$(Version)
Converts mzid[.gz] files to the MS-GF+ tsv format. Designed for MS-GF+ mzid files (looks for EValue and SpecEValue scores)
diff --git a/MzidToTsvConverter/PeptideMatch.cs b/MzidToTsvConverter/PeptideMatch.cs
index 0e67ca7..f313e81 100644
--- a/MzidToTsvConverter/PeptideMatch.cs
+++ b/MzidToTsvConverter/PeptideMatch.cs
@@ -43,6 +43,7 @@ public double PrecursorErrorPpm
public int Charge => Identification.Charge;
public string Peptide { get; set; }
public string Protein { get; set; }
+ // Gene ID written to the optional 'GeneID' column; the converter assigns an empty string
+ // when gene IDs are not requested, for decoy hits, or when the expression does not match
+ public string GeneId { get; set; }
public int DeNovoScore => Identification.DeNovoScore;
public double MSGFScore => Identification.RawScore;
public double SpecEValue => Identification.SpecEv;
@@ -53,7 +54,7 @@ public double PrecursorErrorPpm
public class PeptideMatchMap : ClassMap
{
- public PeptideMatchMap(bool noExtendedFields = false)
+ public PeptideMatchMap(bool noExtendedFields = false, bool addGeneId = false)
{
var index = 0;
Map(x => x.SpecFile).Name("#SpecFile", "SpecFile").Index(index++);
@@ -70,6 +71,10 @@ public PeptideMatchMap(bool noExtendedFields = false)
Map(x => x.Charge).Name("Charge").Index(index++);
Map(x => x.Peptide).Name("Peptide").Index(index++);
Map(x => x.Protein).Name("Protein").Index(index++);
+ if (addGeneId)
+ {
+ Map(x => x.GeneId).Name("GeneID").Index(index++);
+ }
Map(x => x.DeNovoScore).Name("DeNovoScore").Index(index++);
Map(x => x.MSGFScore).Name("MSGFScore").Index(index++);