Skip to content

Commit

Permalink
Fixed MSstats bug (#38)
Browse files Browse the repository at this point in the history
* Fixed MSstats bug

* Bump changelog

* Added suggested error

* Add create dir

* Update dataProcess params
  • Loading branch information
wfondrie committed Apr 12, 2023
1 parent ac2f912 commit fdf2570
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 7 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [Unreleased]

## [1.1.0] - 2023-04-12
### Fixed
- Fixed our conversion from EncyclopeDIA outputs to an MSstats input, which was dropping proteins due to mismatched protein accessions.

## [1.0.1] - 2022-12-14
### Fixed
- Fixed issue where the docker image would not build in M1 macs.
Expand Down
36 changes: 33 additions & 3 deletions bin/msstats.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,28 @@ library(MSstats, warn.conflicts = FALSE)
# Convert EncyclopeDIA results to a format for MSstats
encyclopediaToMsstats <- function(peptides_txt, proteins_txt) {
id_vars <- c("Peptide", "Protein", "numFragments")
proteins <- read.table(proteins_txt,
prot2pep <- read.table(proteins_txt,
sep = "\t",
header = TRUE,
stringsAsFactors = TRUE,
check.names = FALSE)
check.names = FALSE) %>%
select(Protein, PeptideSequences) %>%
mutate(Peptide = str_split(PeptideSequences, ";")) %>%
select(Protein, Peptide) %>%
unnest(Peptide)

df <- read.table(peptides_txt,
sep = "\t",
header = TRUE,
stringsAsFactors = FALSE,
check.names = FALSE) %>%
inner_join(proteins["Protein"], by = "Protein") %>%
select(-Protein)

orig_rows <- nrow(df)

df <- df %>%
inner_join(prot2pep, by = "Peptide") %>%
check_join(orig_rows, 0.75) %>%
pivot_longer(names(.)[!names(.) %in% id_vars],
names_to = "run",
values_to = "intensity") %>%
Expand Down Expand Up @@ -87,6 +97,21 @@ fill_column <- function(df, column, backup) {
}


# Check for too many missing values after the join.
# Args:
# - df (data.frame): the joined data
# - orig_rows (int): the number or rows in the original data.
# - thold (float): The threshold for thowing an error.
check_join <- function(df, orig_rows, thold) {
present <- nrow(df) / orig_rows
if (present < thold) {
thold <- as.integer(thold*100)
stop(paste0("<",thold, "% of peptides have associated protein."))
}
return(df)
}


# The main function
main <- function() {
args <- commandArgs(trailingOnly = TRUE)
Expand All @@ -96,6 +121,10 @@ main <- function() {
contrasts <- args[4] # 'NO_FILE' if missing.
normalization <- args[5]
reports <- as.logical(args[6])
print(reports)
# Create the result folders if needed:
dir.create("msstats", showWarnings = FALSE)
dir.create("results", showWarnings = FALSE)

# Parse the normalization:
if(tolower(normalization) == "none") normalization <- FALSE
Expand All @@ -116,6 +145,7 @@ main <- function() {
# Read into an MSstats format:
raw <- SkylinetoMSstatsFormat(peptide_df,
filter_with_Qvalue = FALSE,
removeProtein_with1Peptide = FALSE,
censoredInt = "0",
use_log_file = FALSE)

Expand Down
14 changes: 11 additions & 3 deletions tests/msstats_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from __future__ import annotations
from collections import defaultdict

import numpy as np
import pandas as pd
Expand All @@ -10,19 +11,26 @@ def _msstats_input(tmp_path, peps: list[str], prots, stems, conditions):
"""
rng = np.random.default_rng(42)
quants = rng.normal(0, 1, size=(len(peps), len(stems))) ** 2 * 1e5

mzml = [s + ".mzML" for s in stems]
raw = ["s3://stuff/blah/" + s + ".raw" for s in stems]

# Prepare protein-to-peptide mapping:
prot2pep = defaultdict(list)
for prot, pep in zip(prots, peps):
prot2pep[prot].append(pep)

prot2pep = {prot: ";".join(pep) for prot, pep in prot2pep.items()}

# The peptides.txt file:
quant_df = pd.DataFrame(quants, columns=mzml)
meta_df = pd.DataFrame({"Peptide": peps, "Protein": prots, "numFragments": 1})

meta_df = pd.DataFrame({"Peptide": peps, "Protein": "X", "numFragments": 1})
peptide_df = pd.concat([quant_df, meta_df], axis=1)
peptide_file = tmp_path / "encyclopedia.peptides.txt"
peptide_df.to_csv(peptide_file, sep="\t", index=False)

# The proteins.txt file:
protein_df = pd.DataFrame({"Protein": list(set(prots))})
protein_df["PeptideSequences"] = protein_df["Protein"].replace(prot2pep)
protein_file = tmp_path / "encyclopedia.proteins.txt"
protein_df.to_csv(protein_file, sep="\t", index=False)

Expand Down
34 changes: 34 additions & 0 deletions tests/unit_tests/test_msstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,40 @@
]


def test_joins(msstats_input, script):
"""Test that the joins are made correctly"""
peptide_file, protein_file, annot_file, _ = msstats_input
args = [
script,
peptide_file,
protein_file,
annot_file,
"NO_FILE",
"equalizeMedians",
"false",
]

subprocess.run(args, check=True)
df = pd.read_table(OUTPUTS[2])
assert set(df["Protein"]) == set("AB")

# Test the failure case.
rows_to_add = {"Peptide": list("QRSTUVWXYZ"), "Protein": "Y"}
(
pd.read_table(peptide_file)
.merge(pd.DataFrame(rows_to_add), how="outer")
.fillna(0)
.to_csv(peptide_file, sep="\t", index=False)
)

err = subprocess.run(
args,
capture_output=True,
text=True,
)
assert "% of peptides have associated protein." in err.stderr


def test_reports(msstats_input, script):
"""Test without reports"""
peptide_file, protein_file, annot_file, contrast_file = msstats_input
Expand Down

0 comments on commit fdf2570

Please sign in to comment.