Skip to content

Commit

Permalink
Merge pull request #240 from theGreatHerrLebert/feature/sage-output-p…
Browse files Browse the repository at this point in the history
…arser

Feature/sage output parser
  • Loading branch information
theGreatHerrLebert authored Jul 25, 2024
2 parents e774d77 + 598eee0 commit c6f3235
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 13 deletions.
71 changes: 58 additions & 13 deletions imspy/imspy/timstof/dbsearch/imspy_rescore_sage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,27 @@ def main():
dest="store_hyperscore",
help="Store the results with the hyperscore as score")

parser.add_argument("--fine_tune_predictors",
action="store_true",
help="Fine tune the rt and inv-mob predictors, default is False")
parser.set_defaults(fine_tune_predictors=False)

parser.set_defaults(store_hyperscore=True)

parser.add_argument("--positive_example_q_max", default=0.01, type=float,
help="Maximum q-value allowed for positive examples, default is 0.01 (1 percent FDR)")

parser.add_argument("--verbose",
action="store_true",
help="Print verbose output, default is False")
parser.set_defaults(verbose=False)

parser.add_argument("--no_summary_plot",
action="store_false",
dest="summary_plot",
help="Whether to create a summary plot, default is True")
parser.set_defaults(summary_plot=True)

# parse the arguments
args = parser.parse_args()

Expand Down Expand Up @@ -105,21 +121,34 @@ def main():

# create the token replacer
token_replacer = PatternReplacer(replace_tokens)
results_filtered["peptide"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)
results_filtered["sequence"] = results_filtered.apply(lambda r: token_replacer.apply(r.peptide), axis=1)

# calculate missing mz value for PSMs
results_filtered["mz"] = results_filtered.apply(lambda r: calculate_mz(r.calcmass, r.charge), axis=1)
results_filtered["rt_projected"] = results_filtered.apply(lambda r:
results_filtered["mono_mz_calculated"] = results_filtered.apply(lambda r: calculate_mz(r.calcmass, r.charge), axis=1)
results_filtered["inverse_mobility_observed"] = results.ion_mobility
results_filtered["projected_rt"] = results_filtered.apply(lambda r:
linear_map(r.rt, old_min=results_filtered.rt.min(),
old_max=results_filtered.rt.max()), axis=1)

results_filtered["match_idx"] = results_filtered.sequence
results_filtered["spec_idx"] = [str(x) for x in results_filtered.psm_id]
results_filtered["score"] = results_filtered.hyperscore
results_filtered["q_value"] = None

if len(results_filtered) < len(results):
s = len(results) - len(results_filtered)
logging.info(f"Removed {s} sequences with sequence length > 30.")

# create filtered set of ids
S = set(results_filtered.psm_id)

# create fine-tuning data
if args.fine_tune_predictors:
TDC_train = target_decoy_competition_pandas(results_filtered, method="psm")
TDC_train_f = TDC_train[(TDC_train.decoy == False) & (TDC_train.q_value <= 0.01)]
TDC_train_f["spec_idxi"] = [int(x) for x in TDC_train_f.spec_idx]
FT_data = pd.merge(TDC_train_f, results_filtered, left_on=["spec_idxi"], right_on=["psm_id"])

# filter fragments to remove sequences not in the results
fragments = fragments[[f in S for f in fragments.psm_id.values]]

Expand All @@ -146,7 +175,7 @@ def main():

# use prosit to predict intensities
intensity_pred = prosit_model.predict_intensities(
results_filtered.peptide.values,
results_filtered.sequence.values,
results_filtered.charge.values,
collision_energies=np.zeros_like(results_filtered.charge.values) + 30,
batch_size=2048,
Expand All @@ -156,19 +185,31 @@ def main():
# log that ion mobility prediction is starting
logging.info("Predicting peptide retention times...")

if args.fine_tune_predictors:
rt_predictor.fine_tune_model(
data=FT_data,
verbose=args.verbose,
)

# predict retention times
rt_pred = rt_predictor.simulate_separation_times(
sequences=results_filtered.peptide.values,
sequences=results_filtered.sequence.values,
)

# log that ion mobility prediction is starting
logging.info("Predicting ion mobilities...")

if args.fine_tune_predictors:
im_predictor.fine_tune_model(
data=FT_data,
verbose=args.verbose,
)

# predict ion mobilities
inv_mob = im_predictor.simulate_ion_mobilities(
sequences=results_filtered.peptide.values,
sequences=results_filtered.sequence.values,
charges=results_filtered.charge.values,
mz=results_filtered.mz.values,
mz=results_filtered.mono_mz_calculated.values,
)

results_filtered["inv_mob_predicted"] = inv_mob
Expand All @@ -182,18 +223,13 @@ def main():
PSMS["observed_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_observed), axis=1)
PSMS["predicted_dict"] = PSMS.apply(lambda r: fragments_to_dict(r.fragments_predicted), axis=1)
PSMS["cosine_similarity"] = PSMS.apply(lambda s: cosim_from_dict(s.observed_dict, s.predicted_dict), axis=1)
PSMS["delta_rt"] = PSMS.rt_projected - PSMS.rt_predicted
PSMS["delta_rt"] = PSMS.projected_rt - PSMS.rt_predicted
PSMS["delta_ims"] = PSMS.ion_mobility - PSMS.inv_mob_predicted
PSMS["intensity_ms1"] = 0.0
PSMS["collision_energy"] = 0.0
PSMS = PSMS.rename(columns={"ms2_intensity": "intensity_ms2",
"fragment_ppm": "average_ppm", "precursor_ppm": "delta_mass"})

# IDs for TDC
PSMS["match_idx"] = PSMS.peptide
PSMS["spec_idx"] = [str(x) for x in PSMS.psm_id]
PSMS["score"] = PSMS.hyperscore
PSMS["q_value"] = None

# log that re-scoring is starting
logging.info("Re-scoring PSMs...")
Expand All @@ -219,6 +255,15 @@ def main():
logging.info(f"Before re-scoring: {before} PSMs with q-value <= 0.01")
logging.info(f"After re-scoring: {after} PSMs with q-value <= 0.01")

if args.summary_plot:
TARGET = PSMS[PSMS.decoy == False]
DECOY = PSMS[PSMS.decoy]

logging.info("Creating summary plot...")

output_path = os.path.join(args.output, "summary_plot.png")
plot_summary(TARGET, DECOY, output_path, dpi=300)

# create output path for both files
output_path = os.path.dirname(args.output)
if not os.path.exists(output_path):
Expand Down
62 changes: 62 additions & 0 deletions imspy/imspy/timstof/dbsearch/sage_output_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt

from numpy.typing import NDArray
from typing import Tuple

Expand Down Expand Up @@ -252,3 +254,63 @@ def fragments_to_dict(fragments: Fragments):
dict[(charge, ion_type, ordinal)] = intensity

return dict


def plot_summary(TARGET, DECOY, save_path, dpi=300, file_format='png'):
fig, axs = plt.subplots(3, 2, figsize=(15, 18))

# Plot 1
axs[0, 0].scatter(TARGET.projected_rt, TARGET.rt_predicted, s=1, alpha=.1, c="darkblue", label="Target")
axs[0, 0].scatter(DECOY.projected_rt, DECOY.rt_predicted, s=1, alpha=.1, c="orange", label="Decoy")
axs[0, 0].set_xlabel("Retention time observed")
axs[0, 0].set_ylabel("Retention time predicted")
axs[0, 0].legend()
axs[0, 0].set_title("Retention Time Prediction")

# Plot 2
axs[0, 1].scatter(TARGET.ion_mobility, TARGET.inv_mob_predicted, s=1, alpha=.1, c="darkblue", label="Target")
axs[0, 1].scatter(DECOY.ion_mobility, DECOY.inv_mob_predicted, s=1, alpha=.1, c="orange", label="Decoy")
axs[0, 1].set_xlabel("Ion mobility observed")
axs[0, 1].set_ylabel("Ion mobility predicted")
axs[0, 1].legend()
axs[0, 1].set_title("Ion Mobility Prediction")

# Plot 3
axs[1, 0].hist(TARGET.projected_rt - TARGET.rt_predicted, alpha=.8, bins="auto", density=True, color="darkblue",
label="Target")
axs[1, 0].hist(DECOY.projected_rt - DECOY.rt_predicted, alpha=.5, bins="auto", density=True, color="orange",
label="Decoy")
axs[1, 0].set_xlabel("Retention time delta")
axs[1, 0].set_ylabel("Density")
axs[1, 0].legend()
axs[1, 0].set_title("Retention Time Delta")

# Plot 4
axs[1, 1].hist(TARGET.ion_mobility - TARGET.inv_mob_predicted, alpha=.8, bins="auto", density=True,
color="darkblue", label="Target")
axs[1, 1].hist(DECOY.ion_mobility - DECOY.inv_mob_predicted, alpha=.5, bins="auto", density=True, color="orange",
label="Decoy")
axs[1, 1].set_xlim((-0.4, 0.4))
axs[1, 1].set_xlabel("Ion mobility delta")
axs[1, 1].set_ylabel("Density")
axs[1, 1].legend()
axs[1, 1].set_title("Ion Mobility Delta")

# Plot 5
axs[2, 0].hist(TARGET.cosine_similarity, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
axs[2, 0].hist(DECOY.cosine_similarity, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
axs[2, 0].set_xlabel("Cosine similarity")
axs[2, 0].set_ylabel("Density")
axs[2, 0].legend()
axs[2, 0].set_title("Cosine Similarity")

# Plot 6
axs[2, 1].hist(TARGET.re_score, bins="auto", density=True, alpha=.8, color="darkblue", label="Target")
axs[2, 1].hist(DECOY.re_score, bins="auto", density=True, alpha=.5, color="orange", label="Decoy")
axs[2, 1].set_xlabel("Score")
axs[2, 1].set_ylabel("Density")
axs[2, 1].legend()
axs[2, 1].set_title("Score Information")

plt.tight_layout()
plt.savefig(save_path, dpi=dpi, format=file_format)

0 comments on commit c6f3235

Please sign in to comment.