Skip to content

Commit

Permalink
Allow parsing Gene IDs from the protein descriptions stored with the …
Browse files Browse the repository at this point in the history
…DBSeq entries.

A default regular expression will generally handle the SwissProt format, but the user can also override the regular expression.
  • Loading branch information
FarmGeek4Life committed Jun 26, 2020
1 parent 25aa2cc commit b6bacf3
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 5 deletions.
46 changes: 44 additions & 2 deletions MzidToTsvConverter/ConverterOptions.cs
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using PRISM;

namespace MzidToTsvConverter
{
public class ConverterOptions
{
// Default regex: Should support UniProt SwissProt files, and would support others that use the same format
// Pattern description: match "sp|[protein ID: 6+ alphanumeric or '-']|[CAPTURE gene ID: 2+ uppercase alphanumeric]_[species code: 2+ uppercase alphanumeric]"
// The look-behind and look-ahead keep the "sp|...|" prefix and the "_SPECIES" suffix out of the match,
// so the value of the overall match is the gene ID by itself
// Example: "sp|P03452|HEMA_I34A1" yields "HEMA"
public const string DefaultGeneIdRegexPattern = @"(?<=sp\|[0-9a-zA-Z\-]{6,}\|)([A-Z0-9]{2,})(?=_[A-Z0-9]{2,})";

public ConverterOptions()
{
MzidPath = string.Empty;
Expand All @@ -19,6 +24,10 @@ public ConverterOptions()
MaxEValue = 0;
MaxQValue = 0;
NoExtendedFields = false;
AddGeneId = false;

GeneIdRegexPattern = DefaultGeneIdRegexPattern;
GeneIdRegex = null;
}

[Option("mzid", Required = true, ArgPosition = 1,
Expand Down Expand Up @@ -71,13 +80,23 @@ public ConverterOptions()
[Option("ne", "noExtended", HelpText = "If specified, does not add extended fields to the TSV output (e.g., scan time).")]
public bool NoExtendedFields { get; set; }

/// <summary>
/// Regular expression pattern used to extract the gene ID from the protein identifier/description
/// </summary>
/// <remarks>
/// The constructor initializes this to DefaultGeneIdRegexPattern, and ValidateArgs resets it to that default
/// when AddGeneId is true and this value is null or whitespace.
/// NOTE(review): AddGeneId is presumably set by the command line parser when the "geneid" switch is present
/// (via ArgExistsProperty) - confirm against the PRISM command line parser documentation.
/// </remarks>
[Option("geneid",
HelpText = "If specified, adds a 'GeneID' column to the output for non-decoy identification. " +
"Can supply a regular expression to extract it from the protein identifier/description. " +
"Default expression supports the UniProt SwissProt format.", ArgExistsProperty = nameof(AddGeneId))]
public string GeneIdRegexPattern { get; set; }

/// <summary>
/// When true, a GeneID column is added to the output and ValidateArgs builds GeneIdRegex from GeneIdRegexPattern
/// </summary>
/// <remarks>
/// Set to false by the constructor.
/// NOTE(review): presumably set to true by the command line parser when the "geneid" switch is present
/// (it is named as the ArgExistsProperty of GeneIdRegexPattern) - confirm against the PRISM documentation.
/// </remarks>
public bool AddGeneId { get; set; }

/// <summary>
/// True if we are processing all .mzid or .mzid.gz files in a directory
/// </summary>
public bool IsDirectory { get; private set; }

public List<string> MzidPaths { get; } = new List<string>();

/// <summary>
/// Regular expression (created with RegexOptions.Compiled) built from GeneIdRegexPattern
/// </summary>
/// <remarks>
/// Set to null by the constructor; ValidateArgs assigns it when AddGeneId is true,
/// so callers should only use it after a successful call to ValidateArgs with AddGeneId enabled
/// </remarks>
public Regex GeneIdRegex { get; private set; }

public string AutoNameTsvFromMzid(string mzidPath)
{
var path = mzidPath;
Expand Down Expand Up @@ -117,13 +136,31 @@ public bool ValidateArgs(out string errorMessage)
return false;
}

if (AddGeneId)
{
if (string.IsNullOrWhiteSpace(GeneIdRegexPattern))
{
GeneIdRegexPattern = DefaultGeneIdRegexPattern;
}

try
{
GeneIdRegex = new Regex(GeneIdRegexPattern, RegexOptions.Compiled);
}
catch
{
errorMessage = "ERROR: GeneID Regex is not a valid regular expression.";
return false;
}
}

string mzidFileDirectory;

if (HasWildcard(MzidPath))
{
mzidFileDirectory = GetParentDirectoryPath(MzidPath);
} else

}
else
{
var mzidFile = new FileInfo(MzidPath);
if (mzidFile.Exists)
Expand Down Expand Up @@ -292,6 +329,11 @@ public void OutputSetOptions()
Console.WriteLine("Show decoy: {0}", ShowDecoy);
Console.WriteLine("Single result per spectrum: {0}", SingleResultPerSpectrum);

if (AddGeneId)
{
Console.WriteLine("Adding gene IDs to the output using regular expression \"{0}\"", GeneIdRegexPattern);
}

if (SkipDuplicateIds)
{
Console.WriteLine("Skipping duplicate IDs");
Expand Down
13 changes: 12 additions & 1 deletion MzidToTsvConverter/MzidToTsvConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public void ConvertToTsv(ConverterOptions options)
{
csv.Configuration.AllowComments = false;
csv.Configuration.Delimiter = "\t";
csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields));
csv.Configuration.RegisterClassMap(new PeptideMatchMap(options.NoExtendedFields, options.AddGeneId));

// SPECIAL CASE:
// Certain versions of MS-GF+ output incorrect mzid files - the peptides referenced in the peptide_ref attribute in
Expand Down Expand Up @@ -200,6 +200,17 @@ public void ConvertToTsv(ConverterOptions options)
}

match.Protein = pepEv.DbSeq.Accession;

match.GeneId = "";
if (options.AddGeneId && !pepEv.IsDecoy)
{
var geneMatch = options.GeneIdRegex.Match(pepEv.DbSeq.ProteinDescription);
if (geneMatch.Success && geneMatch.Captures.Count > 0)
{
match.GeneId = geneMatch.Value;
}
}

if (!uniquePepProteinList.Add(match.Peptide + match.Protein))
{
continue;
Expand Down
2 changes: 1 addition & 1 deletion MzidToTsvConverter/MzidToTsvConverter.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<RootNamespace>MzidToTsvConverter</RootNamespace>
<AssemblyName>MzidToTsvConverter</AssemblyName>
<PackageId>MzidToTsvConverter</PackageId>
<Version>1.4.1</Version>
<Version>1.4.2</Version>
<AssemblyVersion>$(Version)</AssemblyVersion>
<FileVersion>$(Version)</FileVersion>
<Description>Converts mzid[.gz] files to the MS-GF+ tsv format. Designed for MS-GF+ mzid files (looks for EValue and SpecEValue scores)</Description>
Expand Down
7 changes: 6 additions & 1 deletion MzidToTsvConverter/PeptideMatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public double PrecursorErrorPpm
public int Charge => Identification.Charge;
public string Peptide { get; set; }
public string Protein { get; set; }
/// <summary>
/// Gene ID extracted from the protein description
/// </summary>
/// <remarks>
/// ConvertToTsv sets this to an empty string when gene IDs are not requested, for decoy matches,
/// or when the gene ID regular expression does not match the protein description
/// </remarks>
public string GeneId { get; set; }
public int DeNovoScore => Identification.DeNovoScore;
public double MSGFScore => Identification.RawScore;
public double SpecEValue => Identification.SpecEv;
Expand All @@ -53,7 +54,7 @@ public double PrecursorErrorPpm

public class PeptideMatchMap : ClassMap<PeptideMatch>
{
public PeptideMatchMap(bool noExtendedFields = false)
public PeptideMatchMap(bool noExtendedFields = false, bool addGeneId = false)
{
var index = 0;
Map(x => x.SpecFile).Name("#SpecFile", "SpecFile").Index(index++);
Expand All @@ -70,6 +71,10 @@ public PeptideMatchMap(bool noExtendedFields = false)
Map(x => x.Charge).Name("Charge").Index(index++);
Map(x => x.Peptide).Name("Peptide").Index(index++);
Map(x => x.Protein).Name("Protein").Index(index++);
if (addGeneId)
{
Map(x => x.GeneId).Name("GeneID").Index(index++);
}
Map(x => x.DeNovoScore).Name("DeNovoScore").Index(index++);
Map(x => x.MSGFScore).Name("MSGFScore").Index(index++);

Expand Down

0 comments on commit b6bacf3

Please sign in to comment.