Merge pull request #235 from theGreatHerrLebert/feature/sage-output-p…

…arser Feature/sage output parser
theGreatHerrLebert · Jul 20, 2024 · 6abbda2 · 6abbda2
2 parents d174d49 + b383231
commit 6abbda2
Show file tree

Hide file tree

Showing 5 changed files with 446 additions and 4 deletions.
diff --git a/imspy/imspy/simulation/timsim/simulator.py b/imspy/imspy/simulation/timsim/simulator.py
@@ -143,8 +143,8 @@ def main():
 
     # charge state probabilities
     parser.add_argument("--p_charge", type=float, default=0.5, help="Probability of being charged (default: 0.5)")
-    parser.add_argument("--min_charge_contrib", type=float, default=0.15,
-                        help="Minimum charge contribution (default: 0.15)")
+    parser.add_argument("--min_charge_contrib", type=float, default=0.25,
+                        help="Minimum charge contribution (default: 0.25)")
 
     # Noise settings
     # -- 1. RT and IM noise

diff --git a/imspy/imspy/timstof/dbsearch/imspy_dda.py b/imspy/imspy/timstof/dbsearch/imspy_dda.py
@@ -15,9 +15,7 @@
 from sagepy.qfdr.tdc import target_decoy_competition_pandas
 
 from imspy.algorithm import DeepPeptideIonMobilityApex, load_deep_ccs_predictor, load_tokenizer_from_resources
-
 from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
-
 from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper
 
 from imspy.timstof import TimsDatasetDDA

diff --git a/imspy/imspy/timstof/dbsearch/imspy_rescore_sage.py b/imspy/imspy/timstof/dbsearch/imspy_rescore_sage.py
@@ -0,0 +1,228 @@
+# start by importing logging and cmd argument parser
+import logging
+import argparse
+import sys
+import os
+
+from imspy.algorithm import DeepPeptideIonMobilityApex, load_deep_ccs_predictor, load_tokenizer_from_resources
+from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
+from imspy.algorithm.intensity.predictors import Prosit2023TimsTofWrapper
+from imspy.chemistry.utility import calculate_mz
+from imspy.timstof.dbsearch.utility import linear_map
+from sagepy.core.scoring import prosit_intensities_to_fragments_par
+
+from .sage_output_utility import *
+
+# supress pandas warnings about column assignment
+pd.options.mode.chained_assignment = None
+
+
+def main():
+    # argument parser
+    parser = argparse.ArgumentParser(description='🦀💻 IMSPY - SAGE Parser DDA 🔬🐍 - PROTEOMICS IMS DDA '
+                                                 'data re-scoring using imspy and sagepy.')
+
+    # add the arguments
+    parser.add_argument("sage_results", help="The path to the SAGE results file")
+    parser.add_argument("sage_fragments", help="The path to the SAGE fragments file")
+    parser.add_argument("output", help="The path to where the output files should be created")
+
+    # add target decoy competition method
+    parser.add_argument("--tdc_method", default="peptide_psm_peptide",
+                        help="The target decoy competition method, default is peptide_psm_peptide",
+                        choices=["psm", "peptide_psm_only", "peptide_peptide_only", "peptide_psm_peptide"])
+    # re-scoring parameters
+    parser.add_argument("--num_splits", default=10, type=int,
+                        help="The number of splits for the target decoy competition cross-validation, default is 10")
+    parser.add_argument("--no_balanced_split",
+                        action="store_false", dest="balance",
+                        help="Whether to balance the training dataset, sampling same amount "
+                             "of target and decoy examples, default is True")
+    parser.set_defaults(balance=True)
+
+    # if hyper score results should be stored
+    parser.add_argument("--no_store_hyperscore", action="store_false",
+                        help="Store the results with the hyperscore as score")
+    parser.set_defaults(store_hyperscore=True)
+
+    parser.add_argument("--positive_example_q_max", default=0.01, type=float,
+                        help="Maximum q-value allowed for positive examples, default is 0.01 (1% FDR)")
+
+    # parse the arguments
+    args = parser.parse_args()
+
+    # check if the SAGE results file exists
+    if not os.path.exists(args.sage_results):
+        logging.error(f"The SAGE results file {args.sage_results} does not exist.")
+        sys.exit(1)
+
+    # check if the SAGE fragments file exists
+    if not os.path.exists(args.sage_fragments):
+        logging.error(f"The SAGE fragments file {args.sage_fragments} does not exist.")
+        sys.exit(1)
+
+    # read the SAGE results file, check if .tsv or .parquet
+    if args.sage_results.endswith(".tsv"):
+        results = pd.read_csv(args.sage_results, sep="\t")
+    elif args.sage_results.endswith(".parquet"):
+        results = pd.read_parquet(args.sage_results)
+    else:
+        logging.error(f"Unknown file format for SAGE results file {args.sage_results}.")
+        sys.exit(1)
+
+    # read the SAGE fragments file, check if .tsv or .parquet
+    if args.sage_fragments.endswith(".tsv"):
+        fragments = pd.read_csv(args.sage_fragments, sep="\t")
+    elif args.sage_fragments.endswith(".parquet"):
+        fragments = pd.read_parquet(args.sage_fragments)
+    else:
+        logging.error(f"Unknown file format for SAGE fragments file {args.sage_fragments}.")
+        sys.exit(1)
+
+    # set up the logging
+    logging.basicConfig(level=logging.INFO)
+
+    # the intensity predictor model
+    prosit_model = Prosit2023TimsTofWrapper(verbose=False)
+
+    # the ion mobility predictor model
+    im_predictor = DeepPeptideIonMobilityApex(load_deep_ccs_predictor(),
+                                              load_tokenizer_from_resources("tokenizer-ptm"))
+    # the retention time predictor model
+    rt_predictor = DeepChromatographyApex(load_deep_retention_time_predictor(),
+                                          load_tokenizer_from_resources("tokenizer-ptm"), verbose=True)
+
+    # add sequence length to results table to filter out sequences that are too long for intensity prediction
+    results["sequence_length"] = results.apply(lambda s: len(remove_substrings(s.peptide)), axis=1)
+    results_filtered = results[results.sequence_length <= 30]
+
+    # add decoy True or False column
+    results_filtered["decoy"] = results_filtered.apply(lambda r: r.label == -1, axis=1)
+
+    # create the token replacer
+    token_replacer = PatternReplacer(replace_tokens)
+    results_filtered["peptide"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)
+
+    # calculate missing mz value for PSMs
+    results_filtered["mz"] = results_filtered.apply(lambda r: calculate_mz(r.calcmass, r.charge), axis=1)
+    results_filtered["rt_projected"] = results_filtered.apply(lambda r:
+                                                              linear_map(r.rt, old_min=results_filtered.rt.min(),
+                                                                         old_max=results_filtered.rt.max()), axis=1)
+
+    if len(results_filtered) < len(results):
+        s = len(results) - len(results_filtered)
+        logging.info(f"Removed {s} sequences with sequence length > 30.")
+
+    # create filtered set of ids
+    S = set(results_filtered.psm_id)
+
+    # filter fragments to remove sequences not in the results
+    fragments = fragments[[f in S for f in fragments.psm_id.values]]
+
+    # logg how many psms are going to be processed
+    logging.info(f"Processing {len(results_filtered)} PSMs.")
+
+    # group the fragments by PSM id, create a Fragments object for each PSM
+    fragments_grouped = fragments.groupby("psm_id").agg({
+        "fragment_type": list,
+        "fragment_ordinals": list,
+        "fragment_charge": list,
+        "fragment_mz_calculated": list,
+        "fragment_mz_experimental": list,
+        "fragment_intensity": list,
+    }).reset_index()
+
+    fragments_grouped["fragments_observed"] = fragments_grouped.apply(lambda r: row_to_fragment(r), axis=1)
+
+    # select only the psm_id and fragments_observed columns
+    fragments_grouped = fragments_grouped[["psm_id", "fragments_observed"]]
+
+    # log that intensity prediction is starting
+    logging.info("Predicting intensities...")
+
+    # use prosit to predict intensities
+    intensity_pred = prosit_model.predict_intensities(
+        results_filtered.peptide.values,
+        results_filtered.charge.values,
+        collision_energies=np.zeros_like(results_filtered.charge.values) + 30,
+        batch_size=2048,
+        flatten=True,
+    )
+
+    # log that ion mobility prediction is starting
+    logging.info("Predicting peptide retention times...")
+
+    # predict retention times
+    rt_pred = rt_predictor.simulate_separation_times(
+        sequences=results_filtered.peptide.values,
+    )
+
+    # log that ion mobility prediction is starting
+    logging.info("Predicting ion mobilities...")
+
+    # predict ion mobilities
+    inv_mob = im_predictor.simulate_ion_mobilities(
+        sequences=results_filtered.peptide.values,
+        charges=results_filtered.charge.values,
+        mz=results_filtered.mz.values,
+    )
+
+    results_filtered["inv_mob_predicted"] = inv_mob
+    results_filtered["rt_predicted"] = rt_pred
+    results_filtered["fragments_predicted"] = prosit_intensities_to_fragments_par(intensity_pred)
+
+    # merge the results and fragments
+    PSMS = pd.merge(results_filtered, fragments_grouped, on="psm_id")
+
+    # features for re-scoring
+    PSMS["observed_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_observed), axis=1)
+    PSMS["predicted_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_predicted), axis=1)
+    PSMS["cosine_similarity"] = PSMS.apply(lambda s: cosim_from_dict(s.observed_dict, s.predicted_dict), axis=1)
+    PSMS["delta_rt"] = PSMS.rt_projected - PSMS.rt_predicted
+    PSMS["delta_ims"] = PSMS.ion_mobility - PSMS.inv_mob_predicted
+    PSMS["intensity_ms1"] = 0.0
+    PSMS["collision_energy"] = 0.0
+    PSMS = PSMS.rename(columns={"ms2_intensity": "intensity_ms2",
+                                "fragment_ppm": "average_ppm", "precursor_ppm": "delta_mass"})
+
+    # IDs for TDC
+    PSMS["match_idx"] = PSMS.peptide
+    PSMS["spec_idx"] = [str(x) for x in PSMS.psm_id]
+    PSMS["score"] = PSMS.hyperscore
+    PSMS["q_value"] = None
+
+    # log that re-scoring is starting
+    logging.info("Re-scoring PSMs...")
+
+    # re-score the PSMs
+    PSMS["re_score"] = re_score_psms(PSMS, num_splits=args.num_splits, balance=args.balance,
+                                     positive_example_q_max=args.positive_example_q_max)
+
+    # run the target decoy competition
+    TDC = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="hyperscore")
+    TDC_rescore = target_decoy_competition_pandas(PSMS, method=args.tdc_method, score="re_score")
+
+    # rename the columns to match the output
+    TDC = TDC.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})
+    TDC_rescore = TDC_rescore.rename(columns={"match_idx": "peptide", "spec_idx": "psm_id"})
+
+    # log the number of PSMs with q-value <= 0.01 before and after re-scoring
+    before, after = len(TDC[TDC.q_value <= 0.01]), len(TDC_rescore[TDC_rescore.q_value <= 0.01])
+    logging.info(f"Before re-scoring: {before} PSMs with q-value <= 0.01")
+    logging.info(f"After re-scoring: {after} PSMs with q-value <= 0.01")
+
+    # create output path for both files
+    output_path = os.path.dirname(args.output)
+    if not os.path.exists(output_path):
+        os.makedirs(output_path)
+
+    file_name = os.path.join(output_path, "imspy_sage_hyperscore.tsv")
+    file_name_rescore = os.path.join(output_path, "imspy_sage_rescore.tsv")
+
+    # save the results
+    if args.store_hyperscore:
+        TDC.to_csv(file_name, sep="\t", index=False)
+        logging.info(f"Output file {file_name} saved.")
+
+    TDC_rescore.to_csv(file_name_rescore, sep="\t", index=False)
+    logging.info(f"Output file {file_name_rescore} saved.")