Merge pull request #240 from theGreatHerrLebert/feature/sage-output-p…

…arser Feature/sage output parser
theGreatHerrLebert · Jul 25, 2024 · c6f3235 · c6f3235
2 parents e774d77 + 598eee0
commit c6f3235
Show file tree

Hide file tree

Showing 2 changed files with 120 additions and 13 deletions.
diff --git a/imspy/imspy/timstof/dbsearch/imspy_rescore_sage.py b/imspy/imspy/timstof/dbsearch/imspy_rescore_sage.py
@@ -47,11 +47,27 @@ def main():
                         dest="store_hyperscore",
                         help="Store the results with the hyperscore as score")
 
+    parser.add_argument("--fine_tune_predictors",
+                        action="store_true",
+                        help="Fine tune the rt and inv-mob predictors, default is False")
+    parser.set_defaults(fine_tune_predictors=False)
+
     parser.set_defaults(store_hyperscore=True)
 
     parser.add_argument("--positive_example_q_max", default=0.01, type=float,
                         help="Maximum q-value allowed for positive examples, default is 0.01 (1 percent FDR)")
 
+    parser.add_argument("--verbose",
+                        action="store_true",
+                        help="Print verbose output, default is False")
+    parser.set_defaults(verbose=False)
+
+    parser.add_argument("--no_summary_plot",
+                        action="store_false",
+                        dest="summary_plot",
+                        help="Whether to create a summary plot, default is True")
+    parser.set_defaults(summary_plot=True)
+
     # parse the arguments
     args = parser.parse_args()
 
@@ -105,21 +121,34 @@ def main():
 
     # create the token replacer
     token_replacer = PatternReplacer(replace_tokens)
-    results_filtered["peptide"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)
+    results_filtered["sequence"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)
 
     # calculate missing mz value for PSMs
-    results_filtered["mz"] = results_filtered.apply(lambda r: calculate_mz(r.calcmass, r.charge), axis=1)
-    results_filtered["rt_projected"] = results_filtered.apply(lambda r:
+    results_filtered["mono_mz_calculated"] = results_filtered.apply(lambda r: calculate_mz(r.calcmass, r.charge), axis=1)
+    results_filtered["inverse_mobility_observed"] = results.ion_mobility
+    results_filtered["projected_rt"] = results_filtered.apply(lambda r:
                                                               linear_map(r.rt, old_min=results_filtered.rt.min(),
                                                                          old_max=results_filtered.rt.max()), axis=1)
 
+    results_filtered["match_idx"] = results_filtered.sequence
+    results_filtered["spec_idx"] = [str(x) for x in results_filtered.psm_id]
+    results_filtered["score"] = results_filtered.hyperscore
+    results_filtered["q_value"] = None
+
     if len(results_filtered) < len(results):
         s = len(results) - len(results_filtered)
         logging.info(f"Removed {s} sequences with sequence length > 30.")
 
     # create filtered set of ids
     S = set(results_filtered.psm_id)
 
+    # create fine-tuning data
+    if args.fine_tune_predictors:
+        TDC_train = target_decoy_competition_pandas(results_filtered, method="psm")
+        TDC_train_f = TDC_train[(TDC_train.decoy == False) & (TDC_train.q_value <= 0.01)]
+        TDC_train_f["spec_idxi"] = [int(x) for x in TDC_train_f.spec_idx]
+        FT_data = pd.merge(TDC_train_f, results_filtered, left_on=["spec_idxi"], right_on=["psm_id"])
+
     # filter fragments to remove sequences not in the results
     fragments = fragments[[f in S for f in fragments.psm_id.values]]
 
@@ -146,7 +175,7 @@ def main():
 
     # use prosit to predict intensities
     intensity_pred = prosit_model.predict_intensities(
-        results_filtered.peptide.values,
+        results_filtered.sequence.values,
         results_filtered.charge.values,
         collision_energies=np.zeros_like(results_filtered.charge.values) + 30,
         batch_size=2048,
@@ -156,19 +185,31 @@ def main():
     # log that ion mobility prediction is starting
     logging.info("Predicting peptide retention times...")
 
+    if args.fine_tune_predictors:
+        rt_predictor.fine_tune_model(
+            data=FT_data,
+            verbose=args.verbose,
+        )
+
     # predict retention times
     rt_pred = rt_predictor.simulate_separation_times(
-        sequences=results_filtered.peptide.values,
+        sequences=results_filtered.sequence.values,
     )
 
     # log that ion mobility prediction is starting
     logging.info("Predicting ion mobilities...")
 
+    if args.fine_tune_predictors:
+        im_predictor.fine_tune_model(
+            data=FT_data,
+            verbose=args.verbose,
+        )
+
     # predict ion mobilities
     inv_mob = im_predictor.simulate_ion_mobilities(
-        sequences=results_filtered.peptide.values,
+        sequences=results_filtered.sequence.values,
         charges=results_filtered.charge.values,
-        mz=results_filtered.mz.values,
+        mz=results_filtered.mono_mz_calculated.values,
     )
 
     results_filtered["inv_mob_predicted"] = inv_mob
@@ -182,18 +223,13 @@ def main():
     PSMS["observed_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_observed), axis=1)
     PSMS["predicted_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_predicted), axis=1)
     PSMS["cosine_similarity"] = PSMS.apply(lambda s: cosim_from_dict(s.observed_dict, s.predicted_dict), axis=1)
-    PSMS["delta_rt"] = PSMS.rt_projected - PSMS.rt_predicted
+    PSMS["delta_rt"] = PSMS.projected_rt - PSMS.rt_predicted
     PSMS["delta_ims"] = PSMS.ion_mobility - PSMS.inv_mob_predicted
     PSMS["intensity_ms1"] = 0.0
     PSMS["collision_energy"] = 0.0
     PSMS = PSMS.rename(columns={"ms2_intensity": "intensity_ms2",
                                 "fragment_ppm": "average_ppm", "precursor_ppm": "delta_mass"})
 
-    # IDs for TDC
-    PSMS["match_idx"] = PSMS.peptide
-    PSMS["spec_idx"] = [str(x) for x in PSMS.psm_id]
-    PSMS["score"] = PSMS.hyperscore
-    PSMS["q_value"] = None
 
     # log that re-scoring is starting
     logging.info("Re-scoring PSMs...")
@@ -219,6 +255,15 @@ def main():
     logging.info(f"Before re-scoring: {before} PSMs with q-value <= 0.01")
     logging.info(f"After re-scoring: {after} PSMs with q-value <= 0.01")
 
+    if args.summary_plot:
+        TARGET = PSMS[PSMS.decoy == False]
+        DECOY = PSMS[PSMS.decoy]
+
+        logging.info("Creating summary plot...")
+
+        output_path = os.path.join(args.output, "summary_plot.png")
+        plot_summary(TARGET, DECOY, output_path, dpi=300)
+
     # create output path for both files
     output_path = os.path.dirname(args.output)
     if not os.path.exists(output_path):

diff --git a/imspy/imspy/timstof/dbsearch/sage_output_utility.py b/imspy/imspy/timstof/dbsearch/sage_output_utility.py
@@ -12,6 +12,8 @@
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 from sklearn.preprocessing import StandardScaler
 
+from matplotlib import pyplot as plt
+
 from numpy.typing import NDArray
 from typing import Tuple
 
@@ -252,3 +254,63 @@ def fragments_to_dict(fragments: Fragments):
         dict[(charge, ion_type, ordinal)] = intensity
 
     return dict
+
+
+def plot_summary(TARGET, DECOY, save_path, dpi=300, file_format='png'):
+    fig, axs = plt.subplots(3, 2, figsize=(15, 18))
+
+    # Plot 1
+    axs[0, 0].scatter(TARGET.projected_rt, TARGET.rt_predicted, s=1, alpha=.1, c="darkblue", label="Target")
+    axs[0, 0].scatter(DECOY.projected_rt, DECOY.rt_predicted, s=1, alpha=.1, c="orange", label="Decoy")
+    axs[0, 0].set_xlabel("Retention time observed")
+    axs[0, 0].set_ylabel("Retention time predicted")
+    axs[0, 0].legend()
+    axs[0, 0].set_title("Retention Time Prediction")
+
+    # Plot 2
+    axs[0, 1].scatter(TARGET.ion_mobility, TARGET.inv_mob_predicted, s=1, alpha=.1, c="darkblue", label="Target")
+    axs[0, 1].scatter(DECOY.ion_mobility, DECOY.inv_mob_predicted, s=1, alpha=.1, c="orange", label="Decoy")
+    axs[0, 1].set_xlabel("Ion mobility observed")
+    axs[0, 1].set_ylabel("Ion mobility predicted")
+    axs[0, 1].legend()
+    axs[0, 1].set_title("Ion Mobility Prediction")
+
+    # Plot 3
+    axs[1, 0].hist(TARGET.projected_rt - TARGET.rt_predicted, alpha=.8, bins="auto", density=True, color="darkblue",
+                   label="Target")
+    axs[1, 0].hist(DECOY.projected_rt - DECOY.rt_predicted, alpha=.5, bins="auto", density=True, color="orange",
+                   label="Decoy")
+    axs[1, 0].set_xlabel("Retention time delta")
+    axs[1, 0].set_ylabel("Density")
+    axs[1, 0].legend()
+    axs[1, 0].set_title("Retention Time Delta")
+
+    # Plot 4
+    axs[1, 1].hist(TARGET.ion_mobility - TARGET.inv_mob_predicted, alpha=.8, bins="auto", density=True,
+                   color="darkblue", label="Target")
+    axs[1, 1].hist(DECOY.ion_mobility - DECOY.inv_mob_predicted, alpha=.5, bins="auto", density=True, color="orange",
+                   label="Decoy")
+    axs[1, 1].set_xlim((-0.4, 0.4))
+    axs[1, 1].set_xlabel("Ion mobility delta")
+    axs[1, 1].set_ylabel("Density")
+    axs[1, 1].legend()
+    axs[1, 1].set_title("Ion Mobility Delta")
+
+    # Plot 5
+    axs[2, 0].hist(TARGET.cosine_similarity, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
+    axs[2, 0].hist(DECOY.cosine_similarity, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
+    axs[2, 0].set_xlabel("Cosine similarity")
+    axs[2, 0].set_ylabel("Density")
+    axs[2, 0].legend()
+    axs[2, 0].set_title("Cosine Similarity")
+
+    # Plot 6
+    axs[2, 1].hist(TARGET.re_score, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
+    axs[2, 1].hist(DECOY.re_score, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
+    axs[2, 1].set_xlabel("Score")
+    axs[2, 1].set_ylabel("Density")
+    axs[2, 1].legend()
+    axs[2, 1].set_title("Score Information")
+
+    plt.tight_layout()
+    plt.savefig(save_path, dpi=dpi, format=file_format)