From b6bacf3a99c6542ef06594545a4c9684a5912e27 Mon Sep 17 00:00:00 2001 From: Bryson Gibbons Date: Fri, 26 Jun 2020 00:40:34 -0700 Subject: [PATCH] Allow parsing Gene IDs from the protein descriptions stored with the DBSeq entries. A default regular expression will generally handle the SwissProt format, but the user can also override the regular expression. --- MzidToTsvConverter/ConverterOptions.cs | 46 +++++++++++++++++++- MzidToTsvConverter/MzidToTsvConverter.cs | 13 +++++- MzidToTsvConverter/MzidToTsvConverter.csproj | 2 +- MzidToTsvConverter/PeptideMatch.cs | 7 ++- 4 files changed, 63 insertions(+), 5 deletions(-) diff --git a/MzidToTsvConverter/ConverterOptions.cs b/MzidToTsvConverter/ConverterOptions.cs index 85f3a93..566a712 100644 --- a/MzidToTsvConverter/ConverterOptions.cs +++ b/MzidToTsvConverter/ConverterOptions.cs @@ -1,12 +1,17 @@ using System; using System.Collections.Generic; using System.IO; +using System.Text.RegularExpressions; using PRISM; namespace MzidToTsvConverter { public class ConverterOptions { + // Default regex: Should support UniProt SwissProt files, and would support others that use the same format + // Pattern description: match "sp|[protein ID: 6+ alphanumeric]|[CAPTURE gene ID: 2+ alphanumeric]_[species code: 2+ alphanumeric]" NOTE(review): the species-code class in the pattern below is written [A-Z0=9], which matches A-Z, '0', '=' and '9'; [A-Z0-9] was presumably intended - confirm. + public const string DefaultGeneIdRegexPattern = @"(?<=sp\|[0-9a-zA-Z\-]{6,}\|)([A-Z0-9]{2,})(?=_[A-Z0=9]{2,})"; + public ConverterOptions() { MzidPath = string.Empty; @@ -19,6 +24,10 @@ public ConverterOptions() MaxEValue = 0; MaxQValue = 0; NoExtendedFields = false; + AddGeneId = false; + + GeneIdRegexPattern = DefaultGeneIdRegexPattern; + GeneIdRegex = null; } [Option("mzid", Required = true, ArgPosition = 1, @@ -71,6 +80,14 @@ public ConverterOptions() [Option("ne", "noExtended", HelpText = "If specified, does not add extended fields to the TSV output (e.g., scan time).")] public bool NoExtendedFields { get; set; } + [Option("geneid", + HelpText = "If specified, adds a 'GeneID' column to the output for 
non-decoy identification. " + + "Can supply a regular expression to extract it from the protein identifier/description. " + + "Default expression supports the UniProt SwissProt format.", ArgExistsProperty = nameof(AddGeneId))] + public string GeneIdRegexPattern { get; set; } + + public bool AddGeneId { get; set; } + /// <summary> /// True if we are processing all .mzid or .mzid.gz files in a directory /// </summary> @@ -78,6 +95,8 @@ public ConverterOptions() public List MzidPaths { get; } = new List(); + public Regex GeneIdRegex { get; private set; } + public string AutoNameTsvFromMzid(string mzidPath) { var path = mzidPath; @@ -117,13 +136,31 @@ public bool ValidateArgs(out string errorMessage) return false; } + if (AddGeneId) + { + if (string.IsNullOrWhiteSpace(GeneIdRegexPattern)) + { + GeneIdRegexPattern = DefaultGeneIdRegexPattern; + } + + try + { + GeneIdRegex = new Regex(GeneIdRegexPattern, RegexOptions.Compiled); + } + catch + { + errorMessage = "ERROR: GeneID Regex is not a valid regular expression."; + return false; + } + } + string mzidFileDirectory; if (HasWildcard(MzidPath)) { mzidFileDirectory = GetParentDirectoryPath(MzidPath); - } else - + } + else { var mzidFile = new FileInfo(MzidPath); if (mzidFile.Exists) @@ -292,6 +329,11 @@ public void OutputSetOptions() Console.WriteLine("Show decoy: {0}", ShowDecoy); Console.WriteLine("Single result per spectrum: {0}", SingleResultPerSpectrum); + if (AddGeneId) + { + Console.WriteLine("Adding gene IDs to the output using regular expression \"{0}\"", GeneIdRegexPattern); + } + if (SkipDuplicateIds) { Console.WriteLine("Skipping duplicate IDs"); diff --git a/MzidToTsvConverter/MzidToTsvConverter.cs b/MzidToTsvConverter/MzidToTsvConverter.cs index d80e589..da1f75f 100644 --- a/MzidToTsvConverter/MzidToTsvConverter.cs +++ b/MzidToTsvConverter/MzidToTsvConverter.cs @@ -98,7 +98,7 @@ public void ConvertToTsv( { csv.Configuration.AllowComments = false; csv.Configuration.Delimiter = "\t"; - csv.Configuration.RegisterClassMap(new 
PeptideMatchMap(options.NoExtendedFields)); + csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId)); // SPECIAL CASE: // Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in @@ -200,6 +200,17 @@ public void ConvertToTsv( } match.Protein = pepEv.DbSeq.Accession; + + match.GeneId = ""; + if (options.AddGeneId && !pepEv.IsDecoy) + { + var geneMatch = options.GeneIdRegex.Match(pepEv.DbSeq.ProteinDescription); + if (geneMatch.Success && geneMatch.Captures.Count > 0) + { + match.GeneId = geneMatch.Value; + } + } + if (!uniquePepProteinList.Add(match.Peptide + match.Protein)) { continue; diff --git a/MzidToTsvConverter/MzidToTsvConverter.csproj b/MzidToTsvConverter/MzidToTsvConverter.csproj index 295e300..28f8650 100644 --- a/MzidToTsvConverter/MzidToTsvConverter.csproj +++ b/MzidToTsvConverter/MzidToTsvConverter.csproj @@ -6,7 +6,7 @@ MzidToTsvConverter MzidToTsvConverter MzidToTsvConverter - 1.4.1 + 1.4.2 $(Version) $(Version) Converts mzid[.gz] files to the MS-GF+ tsv format. 
Designed for MS-GF+ mzid files (looks for EValue and SpecEValue scores) diff --git a/MzidToTsvConverter/PeptideMatch.cs b/MzidToTsvConverter/PeptideMatch.cs index 0e67ca7..f313e81 100644 --- a/MzidToTsvConverter/PeptideMatch.cs +++ b/MzidToTsvConverter/PeptideMatch.cs @@ -43,6 +43,7 @@ public double PrecursorErrorPpm public int Charge => Identification.Charge; public string Peptide { get; set; } public string Protein { get; set; } + public string GeneId { get; set; } public int DeNovoScore => Identification.DeNovoScore; public double MSGFScore => Identification.RawScore; public double SpecEValue => Identification.SpecEv; @@ -53,7 +54,7 @@ public double PrecursorErrorPpm public class PeptideMatchMap : ClassMap { - public PeptideMatchMap(bool noExtendedFields = false) + public PeptideMatchMap(bool noExtendedFields = false, bool addGeneId = false) { var index = 0; Map(x => x.SpecFile).Name("#SpecFile", "SpecFile").Index(index++); @@ -70,6 +71,10 @@ public PeptideMatchMap(bool noExtendedFields = false) Map(x => x.Charge).Name("Charge").Index(index++); Map(x => x.Peptide).Name("Peptide").Index(index++); Map(x => x.Protein).Name("Protein").Index(index++); + if (addGeneId) + { + Map(x => x.GeneId).Name("GeneID").Index(index++); + } Map(x => x.DeNovoScore).Name("DeNovoScore").Index(index++); Map(x => x.MSGFScore).Name("MSGFScore").Index(index++);