diff --git a/enviroment.yml b/enviroment.yml index b6715a3..852f9ea 100644 --- a/enviroment.yml +++ b/enviroment.yml @@ -8,7 +8,6 @@ channels: dependencies: - bioconda::thermorawfileparser=1.4.2 - bioconda::crux-toolkit - - bioconda::sage-proteomics - pandas - matplotlib - scikit-learn @@ -16,5 +15,9 @@ dependencies: - seaborn - numpy - pyarrow + - sagepy=0.2.22 + - sagepy-connector=0.2.22 + - pyteomics + - tqdm diff --git a/opt-params/general-sage.json b/opt-params/general-sage.json index 204344b..58b59e9 100644 --- a/opt-params/general-sage.json +++ b/opt-params/general-sage.json @@ -19,11 +19,10 @@ "min_ion_index": 2, "max_variable_mods": 3, "static_mods": { - "C": 57.0215 + "C": "[UNIMOD:4]" }, "variable_mods": { - "M": [15.994], - "N": [0.984] + "M": ["[UNIMOD:35]"] }, "generate_decoys": false, "decoy_tag": "DECOY_", @@ -59,6 +58,8 @@ "max_peaks": 300, "max_fragment_charge": 1, "min_matched_peaks": 4, + "annotate_matches": true, + "search_thread": 4, "mzml_paths": [ "./20150901_QEp1_LC7_PhGe_SA_Plate1E_45_6.mzML", "./20150618_QEp1_LC7_PhGe_SA_Plate1B_45_1.mzML", diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index 25540a6..af3cf26 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -7,13 +7,20 @@ from typing import Tuple, List, Dict import json import os +from sagepy.core import EnzymeBuilder, SageSearchConfiguration, validate_mods, validate_var_mods, \ + Scorer, RawSpectrum, SpectrumProcessor, Precursor, Tolerance, IonType +from sagepy.qfdr.tdc import target_decoy_competition_pandas +from sagepy.utility import mean_ppm, median_ppm +from sagepy.core.ml.pep import calculate_pep +from sagepy.rescore.lda import rescore_lda import numpy as np import pandas as pd -import subprocess import matplotlib.pyplot as plt from pandas import DataFrame import click +from pyteomics import mzml, mass +import concurrent.futures as cf logging.basicConfig(level=logging.INFO) @@ -22,60 +29,401 @@ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +indexed_db = None + + +class SageRunner: + def __init__(self): + self.queries = [] + + def generate_indexed_db(self, fasta_path, param_data): + # if sage_config_file is None: + # raise ValueError("The sage config file is required.") + # + # with open(sage_config_file) as f: + # param_data = json.load(f) + cleave_at = param_data["database"]["enzyme"]["cleave_at"] + missed_cleavages = param_data["database"]["enzyme"]["missed_cleavages"] + min_len = param_data["database"]["enzyme"]["min_len"] + max_len = param_data["database"]["enzyme"]["max_len"] + restrict = param_data["database"]["enzyme"]["restrict"] + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + peptide_min_mass = param_data["database"]["peptide_min_mass"] + peptide_max_mass = param_data["database"]["peptide_max_mass"] + ion_kinds = param_data["database"]["ion_kinds"] + min_ion_index = param_data["database"]["min_ion_index"] + max_variable_mods = param_data["database"]["max_variable_mods"] + generate_decoys = param_data["database"]["generate_decoys"] + decoy_tag = param_data["database"]["decoy_tag"] + + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=missed_cleavages, + min_len=min_len, + max_len=max_len, + cleave_at=cleave_at, + restrict=restrict, + c_terminal=True + ) + + with open(fasta_path, 'r') as infile: + fasta = infile.read() + + # generate IonType class + ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] + + # set-up a config for a sage-database + sage_config = SageSearchConfiguration( + fasta=fasta, + static_mods=static_mods, + variable_mods=variable_mods, + enzyme_builder=enzyme_builder, + fragment_min_mz=fragment_min_mz, + fragment_max_mz=fragment_max_mz, + peptide_min_mass=peptide_min_mass, + peptide_max_mass=peptide_max_mass, + ion_kinds=ion_kinds, + generate_decoys=generate_decoys, + decoy_tag=decoy_tag, + min_ion_index=min_ion_index, + max_variable_mods=max_variable_mods, + bucket_size=int(np.power(2, 14)) + ) + + # generate the database for searching against + indexed_db = sage_config.generate_indexed_database() + return indexed_db + + def peptide_spectrum_match_list_to_pandas(self, + psms, indexed_db, + re_score: bool = False, + use_sequence_as_match_idx: bool = True) -> pd.DataFrame: + """Convert a list of peptide spectrum matches to a pandas dataframe + + Args: + psms (List[PeptideSpectrumMatch]): The peptide spectrum matches + re_score (bool, optional): Should re-score be used. Defaults to False. + use_sequence_as_match_idx (bool, optional): Should the sequence be used as the match index. Defaults to True. + + Returns: + pd.DataFrame: The pandas dataframe + """ + row_list = [] + + for match in psms: + if match.retention_time_predicted is not None and match.projected_rt is not None: + delta_rt = match.retention_time_predicted - match.projected_rt + else: + delta_rt = None -def run_sage(fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", - mzml_files: list = [], fasta_path: str = "", sage_config_file: str = None, - use_file_values: bool = True) -> DataFrame: - if sage_config_file is None: - raise ValueError("The sage config file is required.") - - with open(sage_config_file) as f: - data = json.load(f) - - if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: - data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] - if fragment_type == "ppm": - data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] - else: - data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] - else: - logging.info("Using the values from the file.") - if "ppm" in data["precursor_tol"]: - precursor_tolerance = data["precursor_tol"]["ppm"][1] - else: - precursor_tolerance = data["precursor_tol"]["da"][1] - - if "ppm" in data["fragment_tol"]: - fragment_tolerance = data["fragment_tol"]["ppm"][1] - else: - fragment_tolerance = data["fragment_tol"]["da"][1] - - data["mzml_paths"] = mzml_files - - if os.path.exists(fasta_path): - data["database"]["fasta"] = fasta_path - else: - logging.error(f"File {fasta_path} does not exist.") - raise FileNotFoundError(f"File {fasta_path} does not exist.") + if re_score: + score = match.re_score + else: + score = match.hyper_score - temp_sage_file = str(uuid.uuid4()) + ".json" - with open(temp_sage_file, "w") as f: - json.dump(data, f, indent=4) + if use_sequence_as_match_idx: + match_idx = match.sequence + else: + match_idx = str(match.peptide_idx) - logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, - precursor_tolerance)) + if match.inverse_mobility_predicted is not None: + delta_ims = match.inverse_mobility_predicted - match.inverse_mobility_observed + else: + delta_ims = None - result = subprocess.run(["sage", temp_sage_file, "--write-pin"], capture_output=True, text=True) - os.remove(temp_sage_file) + if match.beta_score is not None: + beta_score = match.beta_score + else: + beta_score = None + + peptide = indexed_db[match.peptide_idx] + position = peptide.position + modifications = peptide.modifications + precursor_ppm = abs(match.mono_mass_observed - match.mono_mass_calculated - match.isotope_error) * 2e6 / ( + match.mono_mass_observed + match.mono_mass_calculated - match.isotope_error) + + row_list.append({ + "spec_idx": match.spec_idx, + "match_idx": match_idx, + "proteins": ";".join(match.proteins), + "decoy": match.decoy, + "score": score, + "re_score": match.re_score, + "hyperscore": match.hyper_score, + "rank": match.rank, + "mono_mz_calculated": match.mono_mz_calculated, + "mono_mass_observed": match.mono_mass_observed, + "mono_mass_calculated": match.mono_mass_calculated, + "delta_mass": match.mono_mass_calculated - match.mono_mass_observed, + "isotope_error": match.isotope_error, + "fragment_ppm": match.average_ppm, + "precursor_ppm": precursor_ppm, + "delta_next": match.delta_next, + "delta_best": match.delta_best, + "matched_peaks": match.matched_peaks, + "longest_b": match.longest_b, + "longest_y": match.longest_y, + "longest_y_pct": match.longest_y_pct, + "missed_cleavages": match.missed_cleavages, + "matched_intensity_pct": match.matched_intensity_pct, + "scored_candidates": match.scored_candidates, + "poisson": match.poisson, + "modifications": modifications, + "charge": match.charge, + "retention_time_observed": match.retention_time_observed, + "retention_time_predicted": match.retention_time_predicted, + "delta_rt": delta_rt, + "inverse_mobility_observed": match.inverse_mobility_observed, + "inverse_mobility_predicted": match.inverse_mobility_predicted, + "delta_ims": delta_ims, + "intensity_ms1": match.intensity_ms1, + "intensity_ms2": match.intensity_ms2, + "q_value": match.q_value, + "collision_energy": match.collision_energy, + "cosine_similarity": match.cosine_similarity, + "projected_rt": match.projected_rt, + "beta_score": beta_score + }) + + return pd.DataFrame(row_list) + + def parse_mzml(self, spec_file, file_id, fragment_min_mz, fragment_max_mz, deisotope_bool): + raw_spectrums = mzml.MzML(spec_file) + + for spectrum in raw_spectrums: + if spectrum["ms level"] == 1: + continue + precursor = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0] + precursor_mz = precursor["selected ion m/z"] + precursor_charge = precursor["charge state"] + precursor_intensity = precursor.get('peak intensity', None) + collision_energy = spectrum['precursorList']['precursor'][0]['activation']['collision energy'] + injection_time = spectrum['scanList']['scan'][0]['ion injection time'] + retention_time = spectrum['scanList']['scan'][0]['scan start time'] + + # Extract the relevant metadata + isolation_window = spectrum['precursorList']['precursor'][0]['isolationWindow'] + + lower = isolation_window['isolation window target m/z'] - isolation_window[ + 'isolation window lower offset'] + upper = isolation_window['isolation window target m/z'] + isolation_window[ + 'isolation window upper offset'] + + mz = spectrum["m/z array"].astype(np.float32) + intensity = spectrum["intensity array"].astype(np.float32) + # set selection window bounds + tolerance = Tolerance(da=(lower, upper)) + precursor = Precursor( + charge=precursor_charge, + mz=precursor_mz, + intensity=precursor_intensity, + isolation_window=tolerance, + collision_energy=collision_energy + ) + scan = re.findall(r"scan=(\d+)", spectrum["id"])[0] + raw_spectrum = RawSpectrum( + file_id=file_id, + spec_id=os.path.splitext(os.path.basename(spec_file))[0] + "." + str(scan) + "." + str( + scan) + "." + str(precursor_charge), + total_ion_current=spectrum["total ion current"], + ion_injection_time=injection_time, + scan_start_time=retention_time, + precursors=[precursor], + mz=mz, + intensity=intensity + ) + + spec_processor = SpectrumProcessor(take_top_n=150, min_fragment_mz=fragment_min_mz, + max_fragment_mz=fragment_max_mz, + deisotope=deisotope_bool) + + query = spec_processor.process(raw_spectrum) + self.queries.append(query) + + # return querys + + def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", + mzml_files: list = [], sage_config_file: str = None, fasta_file: str = None, + use_file_values: bool = True, skip_spectra_preprocess: bool = False) -> DataFrame: + if sage_config_file is None: + raise ValueError("The sage config file is required.") + + with open(sage_config_file) as f: + param_data = json.load(f) + + # loading and set default parameters + if "fragment_min_mz" in param_data["database"]: + fragment_min_mz = param_data["database"]["fragment_min_mz"] + else: + param_data["database"]["fragment_min_mz"] = 150 + fragment_min_mz = 150 + if "fragment_max_mz" in param_data["database"]: + fragment_max_mz = param_data["database"]["fragment_min_mz"] + else: + param_data["database"]["fragment_max_mz"] = 2000 + fragment_max_mz = 2000 + if "deisotope" in param_data: + deisotope_bool = param_data["deisotope"] + else: + param_data["deisotope"] = True + deisotope_bool = True + if "min_matched_peaks" in param_data: + min_matched_peaks = param_data["min_matched_peaks"] + else: + param_data["min_matched_peaks"] = 6 + min_matched_peaks = 6 + if "min_isotope_err" in param_data: + min_isotope_err = param_data["min_isotope_err"] + else: + param_data["min_isotope_err"] = -1 + min_isotope_err = -1 + if "max_isotope_err" in param_data: + max_isotope_err = param_data["max_isotope_err"] + else: + param_data["max_isotope_err"] = -1 + max_isotope_err = 3 + if "min_precursor_charge" in param_data: + min_precursor_charge = param_data["min_precursor_charge"] + else: + min_precursor_charge = 2 + param_data["min_precursor_charge"] = 2 + if "max_precursor_charge" in param_data: + max_precursor_charge = param_data["max_precursor_charge"] + else: + max_precursor_charge = 4 + param_data["max_precursor_charge"] = 4 + if "chimera" in param_data: + chimera = param_data["chimera"] + else: + param_data["chimera"] = False + chimera = False + if "report_psms" in param_data: + report_psms = param_data["report_psms"] + else: + param_data["report_psms"] = 1 + report_psms = 1 + if "annotate_matches" in param_data: + annotate_matches = param_data["annotate_matches"] + else: + param_data["annotate_matches"] = False + annotate_matches = False + if "max_fragment_charge" in param_data: + max_fragment_charge = param_data["max_fragment_charge"] + else: + param_data["max_fragment_charge"] = 1 + max_fragment_charge = 1 + if "search_thread" in param_data: + search_thread = param_data["search_thread"] + else: + param_data["search_thread"] = 4 + search_thread = 4 + + if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: + param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) + if fragment_type == "ppm": + param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) + else: + param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) - if result.returncode != 0: - logging.error("Error running SAGE.") - logging.error(result.stderr) - raise ValueError("Error running SAGE.") + else: + logging.info("Using the values from the file.") + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = param_data["precursor_tol"]["ppm"][1] + else: + precursor_tolerance = param_data["precursor_tol"]["da"][1] - sage_table = pd.read_csv("results.sage.tsv", sep="\t") - sage_table = compute_entrapment_qvalues(sage_table) - return sage_table + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = param_data["fragment_tol"]["ppm"][1] + else: + fragment_tolerance = param_data["fragment_tol"]["da"][1] + + param_data["mzml_paths"] = mzml_files + + temp_sage_file = str(uuid.uuid4()) + ".json" + with open(temp_sage_file, "w") as f: + json.dump(param_data, f, indent=4) + + logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, + precursor_tolerance)) + if not skip_spectra_preprocess: + name_id_map = dict(zip(mzml_files, range(len(mzml_files)))) + with cf.ThreadPoolExecutor(search_thread) as executor: + for m in mzml_files: + executor.submit(self.parse_mzml, m, name_id_map[m], param_data["database"]["fragment_min_mz"], + param_data["database"]["fragment_max_mz"], deisotope_bool) + + # for future in cf.as_completed(tasks): + # all_queries.append(future.result()) + + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + + scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, + precursor_tolerance=precursor_tolerance, + fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, + max_isotope_err=max_isotope_err, + min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, + min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, + annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge, + static_mods=static_mods, variable_mods=variable_mods + ) + + indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data) + results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=self.queries, + num_threads=search_thread) + + psm_list = [] + for _, values in results.items(): + psm_list.extend(values) + + # after intensity a + psm_list_rescore = rescore_lda(psm_list, verbose=False) + psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db, re_score=True) + + # TARGET = psm_list_df[psm_list_df.decoy == False] + # DECOY = psm_list_df[psm_list_df.decoy] + + # plt.figure(figsize=(6, 4), dpi=150) + # + # plt.hist(TARGET.hyper_score, bins="auto", label="target") + # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy") + # plt.legend() + # plt.title("Target and decoy score distributions") + # plt.xlabel("Score") + # plt.ylabel("Count") + # plt.show() + + TDC = target_decoy_competition_pandas( + df=psm_list_df, + # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level) + method="peptide_psm_peptide", + score="re_score" + ) + + # add PEP to the PSMs + TDC["pep"] = calculate_pep( + scores=TDC.score.values, + decoys=TDC.decoy.values, + ) + + # after target decoy competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01 + # TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)] + RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC, + on=["spec_idx", "decoy", "match_idx"]) + + # RESULT.insert(loc=0, column="file_name", value=os.path.splitext(os.path.basename(m))[0]) + RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True) + + sage_table = pd.DataFrame(RESULT, index=None) + + sage_table = compute_entrapment_qvalues(sage_table) + return sage_table def extract_ptms(sage_table_target): @@ -151,8 +499,7 @@ def compute_entrapment_qvalues(sage_data_frame: pd.DataFrame) -> pd.DataFrame: :param sage_data_frame: panda data frame with the sage results """ - # Use inplace=True for sorting - sage_data_frame.sort_values(by='sage_discriminant_score', ascending=False, inplace=True) + sage_data_frame.sort_values(by='re_score', ascending=False, inplace=True) # Use a more descriptive name for columns sage_data_frame['is_entrap'] = sage_data_frame['proteins'].apply( @@ -191,13 +538,15 @@ def compute_best_combination(sage_table: pd.DataFrame) -> int: return len(sage_table_target['peptide']) -def combined_search(results: list = [], start_fragment_tolerance: int = 0, start_precursor_tolerance: int = 0, +def combined_search(sagerunner, results: list = [], start_fragment_tolerance: int = 0, + start_precursor_tolerance: int = 0, min_fragment_tolerance: int = 1, max_fragment_tolerance: int = 50, min_precursor_tolerance: int = 10, max_precursor_tolerance: int = 100, num_psms: int = 0, fragment_type: str = "ppm", mzml_files: list = [], grid_frag_steps: int = 10, - grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file="", initial_temp=100, + grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file: str = None, + initial_temp=100, cooling_rate=0.95, sage_config_file=None) -> Tuple[int, int, List[Dict]]: def acceptance_probability(old_value, new_value, temperature): if new_value > old_value: @@ -213,9 +562,10 @@ def acceptance_probability(old_value, new_value, temperature): grid_best_value = 1 for ft in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=False, + skip_spectra_preprocess=True) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value)) @@ -238,9 +588,10 @@ def acceptance_probability(old_value, new_value, temperature): for ft in fragment_tolerances: for pt in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=ft, precursor_tolerance=pt, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=ft, precursor_tolerance=pt, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, skip_spectra_preprocess=True, + use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, ft, pt, new_value)) @@ -272,10 +623,10 @@ def acceptance_probability(old_value, new_value, temperature): max(min_precursor_tolerance, current_precursor_tolerance - search_radius), min(max_precursor_tolerance, current_precursor_tolerance + search_radius)) - sage_table = run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, - use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, skip_spectra_preprocess=True, + use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, fragment_tolerance, precursor_tolerance, new_value)) @@ -304,7 +655,7 @@ def acceptance_probability(old_value, new_value, temperature): @click.option("--max-precursor-tolerance", default=50, help="The maximum precursor tolerance to consider.", type=int) @click.option("--fasta-file", default="Homo-sapiens-uniprot-reviewed-contaminants-entrap-decoy-20240615.fasta", help="The path to the fasta file to use for the SAGE analysis.") -@click.option("--sage-config-file", default="sage-general.json", +@click.option("--sage-config-file", default="general-sage.json", help="The path to the Sage config file to use for the SAGE analysis.") @click.option("--max-iterations", default=10, help="The maximum number of iterations to run the optimization.", type=int) @@ -330,9 +681,18 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i sage_config_file = "general-sage.json" if initial_fragment_tolerance is not None and initial_precursor_tolerance is not None: - sage_table = run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), fragment_type, - mzml_files, fasta_path=fasta_file, sage_config_file=sage_config_file, - use_file_values=False) + + if os.path.exists(fasta_file): + sagerunner = SageRunner() + else: + logging.error(f"File {fasta_file} does not exist.") + raise FileNotFoundError(f"File {fasta_file} does not exist.") + + sage_table = sagerunner.run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), + fragment_type, + mzml_files, sage_config_file=sage_config_file, fasta_file=fasta_file, + use_file_values=False) + sage_table.to_csv("sage_table.csv", index=False) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms) results.append(stats) @@ -340,7 +700,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i best_precursor_tolerance = initial_precursor_tolerance best_fragment_tolerance = initial_fragment_tolerance - best_fragment_tolerance, best_precursor_tolerance, results = combined_search(results=results, + best_fragment_tolerance, best_precursor_tolerance, results = combined_search(sagerunner, results=results, start_fragment_tolerance=best_fragment_tolerance, start_precursor_tolerance=best_precursor_tolerance, min_fragment_tolerance=initial_fragment_tolerance, @@ -425,9 +785,9 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): global folder_plots_uui folder_plots_uui = str(uuid.uuid4()) os.makedirs(folder_plots_uui) - - sage_table = run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=True) + sagerunner = SageRunner() + sage_table = sagerunner.run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=True) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, fragment_tolerance=fragment_tolerance, @@ -445,6 +805,79 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): f.write(f"{key}\t{value}\n") +def sage2PTMShepherd(sage_table): + ptmsherd_schema = ["Spectrum", "Spectrum File", "Peptide", "Modified Peptide", "Peptide Length", "Charge", + "Retention", "Observed Mass", "Calibrated Observed Mass", + "Observed M/Z", "Calibrated Observed M/Z", "Calculated Peptide Mass", "Calculated M/Z", + "Delta Mass", "Expectation", "Hyperscore", "Nextscore", "Probability", + "Number of Missed Cleavages", "Intensity", + "Assigned Modifications", "Observed Modifications", + "Purity", "Protein", "Protein ID"] + psm_df = pd.DataFrame(columns=ptmsherd_schema) + unimod_pattern = re.compile(r"\[UNIMOD:\d+]") + db = mass.Unimod() + new_rows = [] + for _, row in sage_table.iterrows(): + Spectrum = row["spec_idx"] + SpectrumFile = row["spec_idx"].split(".")[0] + peptide = unimod_pattern.sub(repl="", string=row['peptide']) + modifications = unimod_pattern.finditer(row["peptide"]) + modified_peptide = row["peptide"] + AssignedModifications = [] + previous_idx = 0 + for modification in modifications: + position = modification.span()[0] - 1 - previous_idx + unimod_term = modification.group(0).replace("[UNIMOD:", "").replace("]", "") + mass_da = db.by_id(unimod_term)["mono_mass"] + AssignedModifications.append(str(position + 1) + str(peptide[position]) + "(" + str(mass_da) + ")") + MassResiduePlusMod = int(mass.fast_mass(peptide[position]) + mass_da - 18.001) + modified_peptide = modified_peptide.replace(peptide[position] + modification.group(0), + peptide[position] + "[" + str(MassResiduePlusMod) + "]") + previous_idx += len(modification.group(0)) + if modified_peptide == peptide: + modified_peptide = "" + AssignedModifications = ", ".join(AssignedModifications) + + PeptideLength = len(peptide) + Charge = row["charge"] + Retention = row["retention_time_observed"] * 60 + ObservedMass = row["mono_mass_observed"] + ObservedMZ = row["mono_mass_observed"] / row["charge"] + 1.007276 + CalibratedObservedMZ = ObservedMZ + CalculatedPeptideMass = row["mono_mass_calculated"] + CalibratedObservedMass = ObservedMass + CalculatedMZ = row["mono_mz_calculated"] + DeltaMass = row["delta_mass"] + Hyperscore = row["hyperscore"] + Probability = 1 - row["spectrum_q"] + Nextscore = row["delta_next"] + NumberofMissedCleavages = row["missed_cleavages"] + Intensity = row["intensity_ms1"] + Protein = row["proteins"] + ProteinID = row["proteins"].split("|")[1] + new_rows.append({"Spectrum": Spectrum, "Spectrum File": SpectrumFile, + "Peptide": peptide, + "Modified Peptide": modified_peptide, + "Peptide Length": PeptideLength, "Charge": Charge, + "Retention": Retention, "Observed Mass": ObservedMass, + "Calibrated Observed Mass": CalibratedObservedMass, + "Observed M/Z": ObservedMZ, + "Calibrated Observed M/Z": CalibratedObservedMZ, + "Calculated Peptide Mass": CalculatedPeptideMass, + "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass, + "Hyperscore": Hyperscore, "Nextscore": Nextscore, + "Probability": Probability, + "Number of Missed Cleavages": NumberofMissedCleavages, + "Intensity": Intensity, + "Assigned Modifications": AssignedModifications, + "Protein": Protein, "Protein ID": ProteinID}) + + psm_df = pd.concat([psm_df, pd.DataFrame.from_records(new_rows)], + ignore_index=True) + + psm_df.to_csv("inputPTMShepherd.tsv", index=False, sep="\t") + + @click.group(context_settings=CONTEXT_SETTINGS) def cli(): """