Fixed MSstats bug (#38)

* Fixed MSstats bug * Bump changelog * Added suggested error * Add create dir * Update dataProcess params
TalusBio · Apr 12, 2023 · fdf2570 · fdf2570
1 parent ac2f912
commit fdf2570
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,9 +4,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-
 ## [Unreleased]
 
+## [1.1.0] - 2023-04-12
+### Fixed
+- Fixed our conversion from EncyclopeDIA outputs to an MSstats input, which was dropping proteins due to mismatched protein accessions.
+
 ## [1.0.1] - 2022-12-14
 ### Fixed
 - Fixed issue where the docker image would not build in M1 macs.

diff --git a/bin/msstats.R b/bin/msstats.R
@@ -10,18 +10,28 @@ library(MSstats, warn.conflicts = FALSE)
 # Convert EncyclopeDIA results to a format for MSstats
 encyclopediaToMsstats <- function(peptides_txt, proteins_txt) {
   id_vars <- c("Peptide", "Protein", "numFragments")
-  proteins <- read.table(proteins_txt,
+  prot2pep <- read.table(proteins_txt,
                          sep = "\t",
                          header = TRUE,
                          stringsAsFactors = TRUE,
-                         check.names = FALSE)
+                         check.names = FALSE) %>%
+    select(Protein, PeptideSequences) %>%
+    mutate(Peptide = str_split(PeptideSequences, ";")) %>%
+    select(Protein, Peptide) %>%
+    unnest(Peptide)
 
   df <- read.table(peptides_txt,
                    sep = "\t",
                    header = TRUE,
                    stringsAsFactors = FALSE,
                    check.names = FALSE) %>%
-    inner_join(proteins["Protein"], by = "Protein") %>%
+    select(-Protein)
+
+  orig_rows <- nrow(df)
+
+  df <- df %>%
+    inner_join(prot2pep, by = "Peptide") %>%
+    check_join(orig_rows, 0.75) %>%
     pivot_longer(names(.)[!names(.) %in% id_vars],
                  names_to = "run",
                  values_to = "intensity") %>%
@@ -87,6 +97,21 @@ fill_column <- function(df, column, backup) {
 }
 
 
+# Check for too many missing values after the join.
+# Args:
+# - df (data.frame): the joined data
+# - orig_rows (int): the number or rows in the original data.
+# - thold (float): The threshold for thowing an error.
+check_join <- function(df, orig_rows, thold) {
+  present <- nrow(df) / orig_rows
+  if (present < thold) {
+    thold <- as.integer(thold*100)
+    stop(paste0("<",thold, "% of peptides have associated protein."))
+  }
+  return(df)
+}
+
+
 # The main function
 main <- function() {
   args <- commandArgs(trailingOnly = TRUE)
@@ -96,6 +121,10 @@ main <- function() {
   contrasts <- args[4] # 'NO_FILE' if missing.
   normalization <- args[5]
   reports <- as.logical(args[6])
+  print(reports)
+  # Create the result folders if needed:
+  dir.create("msstats", showWarnings = FALSE)
+  dir.create("results", showWarnings = FALSE)
 
   # Parse the normalization:
   if(tolower(normalization) == "none") normalization <- FALSE
@@ -116,6 +145,7 @@ main <- function() {
   # Read into an MSstats format:
   raw <- SkylinetoMSstatsFormat(peptide_df,
                                 filter_with_Qvalue = FALSE,
+                                removeProtein_with1Peptide = FALSE,
                                 censoredInt = "0",
                                 use_log_file = FALSE)
 

diff --git a/tests/msstats_utils.py b/tests/msstats_utils.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+from collections import defaultdict
 
 import numpy as np
 import pandas as pd
@@ -10,19 +11,26 @@ def _msstats_input(tmp_path, peps: list[str], prots, stems, conditions):
     """
     rng = np.random.default_rng(42)
     quants = rng.normal(0, 1, size=(len(peps), len(stems))) ** 2 * 1e5
-
     mzml = [s + ".mzML" for s in stems]
     raw = ["s3://stuff/blah/" + s + ".raw" for s in stems]
 
+    # Prepare protein-to-peptide mapping:
+    prot2pep = defaultdict(list)
+    for prot, pep in zip(prots, peps):
+        prot2pep[prot].append(pep)
+
+    prot2pep = {prot: ";".join(pep) for prot, pep in prot2pep.items()}
+
     # The peptides.txt file:
     quant_df = pd.DataFrame(quants, columns=mzml)
-    meta_df = pd.DataFrame({"Peptide": peps, "Protein": prots, "numFragments": 1})
-
+    meta_df = pd.DataFrame({"Peptide": peps, "Protein": "X", "numFragments": 1})
     peptide_df = pd.concat([quant_df, meta_df], axis=1)
     peptide_file = tmp_path / "encyclopedia.peptides.txt"
     peptide_df.to_csv(peptide_file, sep="\t", index=False)
 
+    # The proteins.txt file:
     protein_df = pd.DataFrame({"Protein": list(set(prots))})
+    protein_df["PeptideSequences"] = protein_df["Protein"].replace(prot2pep)
     protein_file = tmp_path / "encyclopedia.proteins.txt"
     protein_df.to_csv(protein_file, sep="\t", index=False)
 

diff --git a/tests/unit_tests/test_msstats.py b/tests/unit_tests/test_msstats.py
@@ -16,6 +16,40 @@
 ]
 
 
+def test_joins(msstats_input, script):
+    """Test that the joins are made correctly"""
+    peptide_file, protein_file, annot_file, _ = msstats_input
+    args = [
+        script,
+        peptide_file,
+        protein_file,
+        annot_file,
+        "NO_FILE",
+        "equalizeMedians",
+        "false",
+    ]
+
+    subprocess.run(args, check=True)
+    df = pd.read_table(OUTPUTS[2])
+    assert set(df["Protein"]) == set("AB")
+
+    # Test the failure case.
+    rows_to_add = {"Peptide": list("QRSTUVWXYZ"), "Protein": "Y"}
+    (
+        pd.read_table(peptide_file)
+        .merge(pd.DataFrame(rows_to_add), how="outer")
+        .fillna(0)
+        .to_csv(peptide_file, sep="\t", index=False)
+    )
+
+    err = subprocess.run(
+        args,
+        capture_output=True,
+        text=True,
+    )
+    assert "% of peptides have associated protein." in err.stderr
+
+
 def test_reports(msstats_input, script):
     """Test without reports"""
     peptide_file, protein_file, annot_file, contrast_file = msstats_input