Skip to content

Commit

Permalink
Merge branch 'smith-chem-wisc:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
elaboy authored Oct 14, 2024
2 parents bb551d7 + 983c3b0 commit 81ffd00
Show file tree
Hide file tree
Showing 12 changed files with 491 additions and 12 deletions.
64 changes: 64 additions & 0 deletions mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
using Readers.ExternalResults.BaseClasses;
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace FlashLFQ
{
public static class MzLibExtensions
{
    /// <summary>
    /// Makes a list of identification objects usable by FlashLFQ from an IQuantifiableResultFile.
    /// </summary>
    /// <remarks>
    /// One <see cref="SpectraFileInfo"/> is created per distinct record file name and one
    /// <see cref="ProteinGroup"/> per distinct accession string. Both are cached, so every
    /// identification that refers to the same file name or accession string receives the same instance.
    /// </remarks>
    /// <param name="quantifiable">Result file whose quantifiable records are converted.</param>
    /// <returns>One <see cref="Identification"/> per record, in enumeration order.</returns>
    public static List<Identification> MakeIdentifications(this IQuantifiableResultFile quantifiable)
    {
        List<Identification> identifications = new List<Identification>();
        Dictionary<string, ProteinGroup> allProteinGroups = new Dictionary<string, ProteinGroup>();
        Dictionary<string, SpectraFileInfo> allFiles = new Dictionary<string, SpectraFileInfo>();

        foreach (IQuantifiableRecord record in quantifiable.GetQuantifiableResults())
        {
            // Reuse the SpectraFileInfo already created for this file name; only build (and cache)
            // a new one the first time the name is seen. The cache must hold the created object,
            // never the default value of the out variable.
            if (!allFiles.TryGetValue(record.FileName, out SpectraFileInfo file))
            {
                // placeholder values for SpectraFileInfo that will be edited later
                file = new SpectraFileInfo(record.FileName, "", 1, 1, 1);
                allFiles.Add(record.FileName, file);
            }

            // Collect the protein groups of this record, creating each group once per accession string.
            List<ProteinGroup> proteinGroups = new();
            foreach (var info in record.ProteinGroupInfos)
            {
                if (!allProteinGroups.TryGetValue(info.proteinAccessions, out ProteinGroup proteinGroup))
                {
                    proteinGroup = new ProteinGroup(info.proteinAccessions, info.geneName, info.organism);
                    allProteinGroups.Add(info.proteinAccessions, proteinGroup);
                }

                proteinGroups.Add(proteinGroup);
            }

            identifications.Add(new Identification(
                file,
                record.BaseSequence,
                record.ModifiedSequence,
                record.MonoisotopicMass,
                record.RetentionTime,
                record.ChargeState,
                proteinGroups));
        }

        return identifications;
    }
}
}
9 changes: 4 additions & 5 deletions mzLib/Proteomics/Protein/Protein.cs
Original file line number Diff line number Diff line change
Expand Up @@ -836,21 +836,22 @@ public static Protein ScrambleDecoyProteinSequence(
string scrambledProteinSequence = originalDecoyProtein.BaseSequence;
// Clone the original protein's modifications
var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
Random rng = new Random(42);

// Start small and then go big. If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide
// Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide
foreach(string peptideSequence in sequencesToScramble.OrderBy(seq => seq.Length))
{
if(scrambledProteinSequence.Contains(peptideSequence))
{
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out var swappedArray);
int scrambleAttempts = 1;

// Try five times to scramble the peptide sequence without creating a forbidden sequence
while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5)
{
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs,
scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng,
out swappedArray);
scrambleAttempts++;
}
Expand Down Expand Up @@ -896,13 +897,11 @@ public static Protein ScrambleDecoyProteinSequence(
return newProtein;
}

private static Random rng = new Random(42);

/// <summary>
/// Scrambles a peptide sequence, preserving the position of any cleavage sites.
/// </summary>
/// <param name="swappedPositionArray">An array that maps the previous position (index) to the new position (value)</param>
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, out int[] swappedPositionArray)
public static string ScrambleSequence(string sequence, List<DigestionMotif> motifs, Random rng, out int[] swappedPositionArray)
{
// First, find the location of every cleavage motif. These sites shouldn't be scrambled.
HashSet<int> zeroBasedCleavageSitesLocations = new();
Expand Down
55 changes: 55 additions & 0 deletions mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Readers.ExternalResults.BaseClasses
{
/// <summary>
/// Describes a single identification result with the members required
/// to build the identification object that FlashLFQ works with.
/// </summary>
public interface IQuantifiableRecord
{
    /// <summary>
    /// Name of the MS data file in which this identification was made.
    /// </summary>
    string FileName { get; }

    /// <summary>
    /// Proteins associated with this result, one tuple per protein.
    /// A tuple carries the accession number, the gene name, and the organism.
    /// </summary>
    List<(string proteinAccessions, string geneName, string organism)> ProteinGroupInfos { get; }

    /// <summary>
    /// Amino acid sequence of the identified peptide.
    /// </summary>
    string BaseSequence { get; }

    /// <summary>
    /// Amino acid sequence of the identified peptide together with its
    /// post-translational modifications.
    /// </summary>
    string ModifiedSequence { get; }

    /// <summary>
    /// Retention time of the result, in minutes.
    /// </summary>
    double RetentionTime { get; }

    /// <summary>
    /// Charge state of the result.
    /// </summary>
    int ChargeState { get; }

    /// <summary>
    /// True when the result is a decoy identification.
    /// </summary>
    bool IsDecoy { get; }

    /// <summary>
    /// Monoisotopic mass of the peptide, i.e. the lowest possible mass,
    /// with no c13 or n15 atoms present.
    /// </summary>
    double MonoisotopicMass { get; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Readers.ExternalResults.BaseClasses
{
/// <summary>
/// A result file whose contents can be exposed as IQuantifiableRecords.
/// Implementations also build the dictionary that links the file names found in the
/// external result file to local file paths, which are used to make the identification object.
/// </summary>
public interface IQuantifiableResultFile : IResultFile
{
    /// <summary>
    /// Returns every result in the result file as an IQuantifiableRecord.
    /// </summary>
    /// <returns> Enumerable of the records held by this result file </returns>
    IEnumerable<IQuantifiableRecord> GetQuantifiableResults();

    /// <summary>
    /// Links the file names that appear in the results to the raw file paths of the MassSpec data.
    /// </summary>
    /// <param name="fullFilePath"> List of full file paths to be linked to the file names of the records </param>
    /// <returns> Dictionary of file names and their associated full paths </returns>
    Dictionary<string, string> FileNametoFilePath(List<string> fullFilePath);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
using Proteomics;
using static System.Net.Mime.MediaTypeNames;
using ThermoFisher.CommonCore.Data.Interfaces;
using Readers.ExternalResults.BaseClasses;
using System.Reflection.Metadata.Ecma335;
using System.Runtime.CompilerServices;
using Easy.Common.Extensions;

namespace Readers
{
public class MsFraggerPsm
public class MsFraggerPsm : IQuantifiableRecord
{
public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture)
{
Expand Down Expand Up @@ -59,7 +63,7 @@ public class MsFraggerPsm

[Name("Retention")]
public double RetentionTime { get; set; }

[Name("Observed Mass")]
public double ObservedMass { get; set; }

Expand Down Expand Up @@ -90,7 +94,11 @@ public class MsFraggerPsm
[Name("Nextscore")]
public double NextScore { get; set; }

[Name("PeptideProphet Probability")]
/// <summary>
/// MsFragger v22.0 output renames the header "PeptideProphet Probability" as just "Probability".
/// Headers are mutually exclusive, will not both occur in the same file.
/// </summary>
[Name("PeptideProphet Probability", "Probability")]
public double PeptideProphetProbability { get; set; }

[Name("Number of Enzymatic Termini")]
Expand Down Expand Up @@ -155,5 +163,78 @@ public class MsFraggerPsm
// One-based scan number, taken from the second '.'-separated token of Spectrum.
// Parsed once on first access and cached in _oneBasedScanNumber. The token is machine-generated
// text, so it is parsed with the invariant culture rather than the current thread culture.
public int OneBasedScanNumber => _oneBasedScanNumber ??= int.Parse(Spectrum.Split('.')[1], CultureInfo.InvariantCulture);

#endregion

#region IQuantifiableRecord Implementation

/// <summary>
/// File name of this PSM as required by IQuantifiableRecord; forwards SpectrumFilePath.
/// </summary>
[Ignore]
public string FileName
{
    get { return SpectrumFilePath; }
}

/// <summary>
/// Protein tuples for this PSM. The list is built by AddProteinGroupInfos on first access
/// and served from the backing field on every later access.
/// </summary>
[Ignore]
public List<(string, string, string)> ProteinGroupInfos => _proteinGroupInfos ??= AddProteinGroupInfos();

/// <summary>
/// Creates a list of tuples, each of which represents a protein.
/// Each tuple contains the accession number, gene name, and organism.
/// These parameters are used to create a ProteinGroup object,
/// which is needed to make an identification.
/// The first tuple comes from the Protein header; one more tuple is added for every
/// comma-separated entry of MappedProteins, when that value is not null or empty.
/// </summary>
/// <returns>The completed list, which is also stored in the _proteinGroupInfos backing field.</returns>
private List<(string, string, string)> AddProteinGroupInfos()
{
    // Fasta header is parsed to separate the accession number, gene name, and organism.
    // The header is split on '|' and '_'; fields 1, 2 and 3 of the split are used.
    // If the protein does not have this information, it will be assigned an empty string.
    // Ideally, a future refactor would create a method for parsing fasta headers
    // that is shared by Readers and UsefulProteomicsDatabases.
    static (string, string, string) ParseHeader(string header)
    {
        // A missing header is treated like an empty one and yields three empty strings.
        string[] fields = (header ?? string.Empty).Split('|', '_');

        string proteinAccessions = fields.Length >= 2 ? fields[1] : "";
        string geneName = fields.Length >= 3 ? fields[2] : "";
        string organism = fields.Length >= 4 ? fields[3] : "";

        return (proteinAccessions, geneName, organism);
    }

    var infos = new List<(string, string, string)> { ParseHeader(Protein) };

    if (!string.IsNullOrEmpty(MappedProteins))
    {
        foreach (string mappedHeader in MappedProteins.Split(','))
        {
            infos.Add(ParseHeader(mappedHeader));
        }
    }

    // Publish the list only once it is complete, so the cache never holds a half-built result.
    _proteinGroupInfos = infos;
    return infos;
}

// Backing field for ProteinGroupInfos; it has no initializer and is filled on first access.
[Ignore]
private List<(string, string, string)> _proteinGroupInfos;

/// <summary>
/// FullSequence when it has content; otherwise BaseSequence.
/// </summary>
[Ignore]
public string ModifiedSequence
{
    get
    {
        if (FullSequence.IsNullOrEmpty())
        {
            return BaseSequence;
        }

        return FullSequence;
    }
}

/// <summary>
/// Charge state of this PSM; forwards Charge.
/// </summary>
[Ignore]
public int ChargeState
{
    get { return Charge; }
}

// decoy reading isn't currently supported for MsFragger psms, this will be revisited later
[Ignore]
public bool IsDecoy
{
    get { return false; }
}

/// <summary>
/// Monoisotopic mass of this PSM; forwards CalculatedPeptideMass.
/// </summary>
[Ignore]
public double MonoisotopicMass
{
    get { return CalculatedPeptideMass; }
}

#endregion
}
}
}
40 changes: 38 additions & 2 deletions mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
using System.Text;
using System.Threading.Tasks;
using CsvHelper;
using MassSpectrometry;
using Readers.ExternalResults.BaseClasses;

namespace Readers
{
public class MsFraggerPsmFile : ResultFile<MsFraggerPsm>, IResultFile
public class MsFraggerPsmFile : ResultFile<MsFraggerPsm>, IQuantifiableResultFile
{
public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm;
public override Software Software { get; set; }
Expand Down Expand Up @@ -38,5 +40,39 @@ public override void WriteResults(string outputPath)
csv.WriteRecord(result);
}
}

/// <summary>
/// Exposes the entries of Results as IQuantifiableRecords.
/// </summary>
/// <returns> The Results collection itself, viewed as quantifiable records </returns>
public IEnumerable<IQuantifiableRecord> GetQuantifiableResults()
{
    return Results;
}

/// <summary>
/// Creates a dictionary linking each distinct file name found in the results to its corresponding full file path.
/// </summary>
/// <remarks>
/// The file name from the results is reduced to a short name by taking its file-name part and
/// removing "interact-" and ".pep.xml". A path whose file name without extension equals the short
/// name is preferred; if none exists, the first path that merely contains the short name is used.
/// Preferring the exact match keeps e.g. "sample1" from being linked to ".../sample10.raw".
/// </remarks>
/// <param name="fullFilePath"> list of all full file paths associated with a given result </param>
/// <returns>
/// dictionary with key fileName (as it appears in the results) and value fullFilePath;
/// file names without a matching path, and null or empty file names, are omitted
/// </returns>
public Dictionary<string, string> FileNametoFilePath(List<string> fullFilePath)
{
    List<string> candidatePaths = fullFilePath
        .Where(path => !string.IsNullOrEmpty(path))
        .Distinct()
        .ToList();
    Dictionary<string, string> allFiles = new Dictionary<string, string>();

    foreach (string fileName in Results.Select(psm => psm.FileName).Distinct())
    {
        // A null name cannot be a dictionary key and an empty one would "match" every path.
        if (string.IsNullOrEmpty(fileName))
        {
            continue;
        }

        // MSFragger results add "interact-" to the raw file name and replace .raw with .pep.xml
        // In order to correctly match the file names, these changes must be removed
        string shortFileName = Path.GetFileName(fileName)
            .Replace("interact-", "")
            .Replace(".pep.xml", "");

        if (shortFileName.Length == 0)
        {
            continue;
        }

        // Exact file-name match first, substring match on the whole path as the fallback.
        string matchingPath =
            candidatePaths.FirstOrDefault(path => Path.GetFileNameWithoutExtension(path) == shortFileName)
            ?? candidatePaths.FirstOrDefault(path => path.Contains(shortFileName));

        if (matchingPath != null)
        {
            allFiles.Add(fileName, matchingPath);
        }
    }

    return allFiles;
}
}
}
}
Loading

0 comments on commit 81ffd00

Please sign in to comment.