From b6bacf3a99c6542ef06594545a4c9684a5912e27 Mon Sep 17 00:00:00 2001
From: Bryson Gibbons <bryson.gibbons@pnnl.gov>
Date: Fri, 26 Jun 2020 00:40:34 -0700
Subject: [PATCH] Allow parsing Gene IDs from the protein descriptions stored
 with the DBSeq entries.

A default regular expression will generally handle the SwissProt format, but the user can also override the regular expression.
---
 MzidToTsvConverter/ConverterOptions.cs       | 46 +++++++++++++++++++-
 MzidToTsvConverter/MzidToTsvConverter.cs     | 13 +++++-
 MzidToTsvConverter/MzidToTsvConverter.csproj |  2 +-
 MzidToTsvConverter/PeptideMatch.cs           |  7 ++-
 4 files changed, 63 insertions(+), 5 deletions(-)
diff --git a/MzidToTsvConverter/ConverterOptions.cs b/MzidToTsvConverter/ConverterOptions.cs
index 85f3a93..566a712 100644
--- a/MzidToTsvConverter/ConverterOptions.cs
+++ b/MzidToTsvConverter/ConverterOptions.cs
@@ -1,12 +1,17 @@
 ﻿using System;
 using System.Collections.Generic;
 using System.IO;
+using System.Text.RegularExpressions;
 using PRISM;
 
 namespace MzidToTsvConverter
 {
     public class ConverterOptions
     {
+        // Default regex: Should support UniProt SwissProt files, and would support others that use the same format
+        // Pattern description: match "sp|[protein ID: 6+ alphanumeric or '-']|[CAPTURE gene ID: 2+ uppercase alphanumeric]_[species code: 2+ uppercase alphanumeric]"
+        public const string DefaultGeneIdRegexPattern = @"(?<=sp\|[0-9a-zA-Z\-]{6,}\|)([A-Z0-9]{2,})(?=_[A-Z0-9]{2,})";
+
         public ConverterOptions()
         {
             MzidPath = string.Empty;
@@ -19,6 +24,10 @@ public ConverterOptions()
             MaxEValue = 0;
             MaxQValue = 0;
             NoExtendedFields = false;
+            AddGeneId = false;
+
+            GeneIdRegexPattern = DefaultGeneIdRegexPattern;
+            GeneIdRegex = null;
         }
 
         [Option("mzid", Required = true, ArgPosition = 1,
@@ -71,6 +80,14 @@ public ConverterOptions()
         [Option("ne", "noExtended", HelpText = "If specified, does not add extended fields to the TSV output (e.g., scan time).")]
         public bool NoExtendedFields { get; set; }
 
+        [Option("geneid",
+            HelpText = "If specified, adds a 'GeneID' column to the output for non-decoy identification. " +
+                       "Can supply a regular expression to extract it from the protein identifier/description. " +
+                       "Default expression supports the UniProt SwissProt format.", ArgExistsProperty = nameof(AddGeneId))]
+        public string GeneIdRegexPattern { get; set; }
+
+        public bool AddGeneId { get; set; }
+
         /// <summary>
         /// True if we are processing all .mzid or .mzid.gz files in a directory
         /// </summary>
@@ -78,6 +95,8 @@ public ConverterOptions()
 
         public List<string> MzidPaths { get; } = new List<string>();
 
+        public Regex GeneIdRegex { get; private set; }
+
         public string AutoNameTsvFromMzid(string mzidPath)
         {
             var path = mzidPath;
@@ -117,13 +136,31 @@ public bool ValidateArgs(out string errorMessage)
                 return false;
             }
 
+            if (AddGeneId)
+            {
+                if (string.IsNullOrWhiteSpace(GeneIdRegexPattern))
+                {
+                    GeneIdRegexPattern = DefaultGeneIdRegexPattern; // switch given without a usable pattern
+                }
+                // Build the regex once here; the Regex constructor throws ArgumentException for an invalid pattern.
+                try
+                {
+                    GeneIdRegex = new Regex(GeneIdRegexPattern, RegexOptions.Compiled);
+                }
+                catch (ArgumentException ex)
+                {
+                    errorMessage = "ERROR: GeneID Regex is not a valid regular expression: " + ex.Message;
+                    return false;
+                }
+            }
+
             string mzidFileDirectory;
 
             if (HasWildcard(MzidPath))
             {
                 mzidFileDirectory = GetParentDirectoryPath(MzidPath);
-            } else
-
+            }
+            else
             {
                 var mzidFile = new FileInfo(MzidPath);
                 if (mzidFile.Exists)
@@ -292,6 +329,11 @@ public void OutputSetOptions()
             Console.WriteLine("Show decoy: {0}", ShowDecoy);
             Console.WriteLine("Single result per spectrum: {0}", SingleResultPerSpectrum);
 
+            if (AddGeneId)
+            {
+                Console.WriteLine("Adding gene IDs to the output using regular expression \"{0}\"", GeneIdRegexPattern);
+            }
+
             if (SkipDuplicateIds)
             {
                 Console.WriteLine("Skipping duplicate IDs");
diff --git a/MzidToTsvConverter/MzidToTsvConverter.cs b/MzidToTsvConverter/MzidToTsvConverter.cs
index d80e589..da1f75f 100644
--- a/MzidToTsvConverter/MzidToTsvConverter.cs
+++ b/MzidToTsvConverter/MzidToTsvConverter.cs
@@ -98,7 +98,7 @@ public void ConvertToTsv(
                 {
                     csv.Configuration.AllowComments = false;
                     csv.Configuration.Delimiter = "\t";
-                    csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields));
+                    csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId));
 
                     // SPECIAL CASE:
                     // Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
@@ -200,6 +200,17 @@ public void ConvertToTsv(
                             }
 
                             match.Protein = pepEv.DbSeq.Accession;
+
+                            match.GeneId = "";
+                            if (options.AddGeneId && !pepEv.IsDecoy)
+                            {
+                                // Regex.Match throws ArgumentNullException on null input; guard in case the description is missing (TODO confirm it can be null).
+                                var description = pepEv.DbSeq.ProteinDescription ?? string.Empty;
+                                var geneMatch = options.GeneIdRegex.Match(description);
+                                // Success alone is sufficient: a successful Match always carries exactly one capture.
+                                match.GeneId = geneMatch.Success ? geneMatch.Value : "";
+                            }
+
                             if (!uniquePepProteinList.Add(match.Peptide + match.Protein))
                             {
                                 continue;
diff --git a/MzidToTsvConverter/MzidToTsvConverter.csproj b/MzidToTsvConverter/MzidToTsvConverter.csproj
index 295e300..28f8650 100644
--- a/MzidToTsvConverter/MzidToTsvConverter.csproj
+++ b/MzidToTsvConverter/MzidToTsvConverter.csproj
@@ -6,7 +6,7 @@
     <RootNamespace>MzidToTsvConverter</RootNamespace>
     <AssemblyName>MzidToTsvConverter</AssemblyName>
     <PackageId>MzidToTsvConverter</PackageId>
-    <Version>1.4.1</Version>
+    <Version>1.4.2</Version>
     <AssemblyVersion>$(Version)</AssemblyVersion>
     <FileVersion>$(Version)</FileVersion>
     <Description>Converts mzid[.gz] files to the MS-GF+ tsv format. Designed for MS-GF+ mzid files (looks for EValue and SpecEValue scores)</Description>
diff --git a/MzidToTsvConverter/PeptideMatch.cs b/MzidToTsvConverter/PeptideMatch.cs
index 0e67ca7..f313e81 100644
--- a/MzidToTsvConverter/PeptideMatch.cs
+++ b/MzidToTsvConverter/PeptideMatch.cs
@@ -43,6 +43,7 @@ public double PrecursorErrorPpm
         public int Charge => Identification.Charge;
         public string Peptide { get; set; }
         public string Protein { get; set; }
+        public string GeneId { get; set; }
         public int DeNovoScore => Identification.DeNovoScore;
         public double MSGFScore => Identification.RawScore;
         public double SpecEValue => Identification.SpecEv;
@@ -53,7 +54,7 @@ public double PrecursorErrorPpm
 
     public class PeptideMatchMap : ClassMap<PeptideMatch>
     {
-        public PeptideMatchMap(bool noExtendedFields = false)
+        public PeptideMatchMap(bool noExtendedFields = false, bool addGeneId = false)
         {
             var index = 0;
             Map(x => x.SpecFile).Name("#SpecFile", "SpecFile").Index(index++);
@@ -70,6 +71,10 @@ public PeptideMatchMap(bool noExtendedFields = false)
             Map(x => x.Charge).Name("Charge").Index(index++);
             Map(x => x.Peptide).Name("Peptide").Index(index++);
             Map(x => x.Protein).Name("Protein").Index(index++);
+            if (addGeneId)
+            {
+                Map(x => x.GeneId).Name("GeneID").Index(index++);
+            }
             Map(x => x.DeNovoScore).Name("DeNovoScore").Index(index++);
             Map(x => x.MSGFScore).Name("MSGFScore").Index(index++);