From 37943b91a492f8e76b8d8c8602a1914d27ad62d6 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:51:10 +0800 Subject: [PATCH 1/3] replace sage command with sagepy --- enviroment.yml | 4 +- opt-params/opt_params.py | 323 ++++++++++++++++++++++++++++++++++----- 2 files changed, 290 insertions(+), 37 deletions(-) diff --git a/enviroment.yml b/enviroment.yml index b6715a3..94eecf9 100644 --- a/enviroment.yml +++ b/enviroment.yml @@ -8,7 +8,6 @@ channels: dependencies: - bioconda::thermorawfileparser=1.4.2 - bioconda::crux-toolkit - - bioconda::sage-proteomics - pandas - matplotlib - scikit-learn @@ -16,5 +15,8 @@ dependencies: - seaborn - numpy - pyarrow + - sagepy=0.2.20 + - pyteomics + - tqdm diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index 25540a6..eb92b15 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -7,13 +7,17 @@ from typing import Tuple, List, Dict import json import os - +from sagepy.core import EnzymeBuilder, SageSearchConfiguration, validate_mods, validate_var_mods, \ + Scorer, RawSpectrum, SpectrumProcessor, Precursor, Tolerance, IonType import numpy as np import pandas as pd -import subprocess import matplotlib.pyplot as plt from pandas import DataFrame import click +from pyteomics import mzml +import concurrent.futures as cf + +from tqdm import tqdm logging.basicConfig(level=logging.INFO) @@ -22,58 +26,296 @@ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +indexed_db = None + + +def generate_indexed_db(fasta_path, sage_config_file): + if sage_config_file is None: + raise ValueError("The sage config file is required.") + + with open(sage_config_file) as f: + param_data = json.load(f) + cleave_at = param_data["database"]["enzyme"]["cleave_at"] + missed_cleavages = param_data["database"]["enzyme"]["missed_cleavages"] + min_len = param_data["database"]["enzyme"]["min_len"] + max_len = param_data["database"]["enzyme"]["max_len"] + restrict = param_data["database"]["enzyme"]["restrict"] + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + peptide_min_mass = param_data["database"]["peptide_min_mass"] + peptide_max_mass = param_data["database"]["peptide_max_mass"] + ion_kinds = param_data["database"]["ion_kinds"] + min_ion_index = param_data["database"]["min_ion_index"] + max_variable_mods = param_data["database"]["max_variable_mods"] + generate_decoys = param_data["database"]["generate_decoys"] + decoy_tag = param_data["database"]["decoy_tag"] + + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=missed_cleavages, + min_len=min_len, + max_len=max_len, + cleave_at=cleave_at, + restrict=restrict, + c_terminal=True + ) + + # # generate static cysteine modification + # static_mods = {k: v for k, v in [SAGE_KNOWN_MODS.cysteine_static()]} + # + # # generate variable methionine modification + # variable_mods = {k: v for k, v in [SAGE_KNOWN_MODS.methionine_variable()]} + + # generate SAGE compatible mod representations + static = validate_mods(static_mods) + variab = validate_var_mods(variable_mods) + + with open(fasta_path, 'r') as infile: + fasta = infile.read() + + # generate IonType class + ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] + + # set-up a config for a sage-database + sage_config = SageSearchConfiguration( 
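# A few notes on the configuration being assembled here:
# - fasta is the raw FASTA text read from fasta_path above, not the path itself.
# - static and variab are the Sage-compatible modification maps returned by
#   validate_mods / validate_var_mods.
# - ion_kinds has already been converted to sagepy IonType handles via
#   IonType(ion).get_py_ptr().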
+ fasta=fasta, + static_mods=static, + variable_mods=variab, + enzyme_builder=enzyme_builder, + fragment_min_mz=fragment_min_mz, + fragment_max_mz=fragment_max_mz, + peptide_min_mass=peptide_min_mass, + peptide_max_mass=peptide_max_mass, + ion_kinds=ion_kinds, + generate_decoys=generate_decoys, + decoy_tag=decoy_tag, + min_ion_index=min_ion_index, + max_variable_mods=max_variable_mods, + bucket_size=int(np.power(2, 14)) + ) + + # generate the database for searching against + global indexed_db + indexed_db = sage_config.generate_indexed_database() + + +def mass_to_mod(mass: float) -> str: + """ Convert a mass to a UNIMOD modification annotation. + + Args: + mass: a mass in Da + + Returns: + a UNIMOD modification annotation + """ + maybe_key = int(np.round(mass)) + mod_dict = { + 1: '[UNIMOD:7]', + 42: '[UNIMOD:1]', + 57: '[UNIMOD:4]', + 80: '[UNIMOD:21]', + 16: '[UNIMOD:35]', + 119: '[UNIMOD:312]', + } + # try to translate to UNIMOD annotation + try: + return mod_dict[maybe_key] + except KeyError: + raise KeyError(f"Rounded mass not in dict: {maybe_key}") + + +# Adapted from https://github.com/theGreatHerrLebert/sagepy/blob/v0.2.20-alpha/sagepy/sagepy/core/peptide.py#L125C1 +# Will update in next release https://github.com/theGreatHerrLebert/sagepy/pull/12/files +def to_unimod_sequence(modifications, sequence) -> str: + """ Get Peptide sequence with UNIMOD modification annotations. + + Returns: + str: Peptide sequence with UNIMOD modification annotations. + """ + + seq = '' + + for i, (s, m) in enumerate(zip(sequence, modifications)): + if m != 0: + if i == 0: + if mass_to_mod(m) == '[UNIMOD:1]': + seq += f'{mass_to_mod(m)}{s}' + else: + seq += f'{s}{mass_to_mod(m)}' + else: + seq += f'{s}{mass_to_mod(m)}' + else: + seq += s + + return seq + + +def run_one_raw(spec_file, param_data): + global indexed_db + + raw_spectrums = mzml.MzML(spec_file) + res = [] + + # loading parameters + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + deisotope_bool = param_data["deisotope"] + min_matched_peaks = param_data["min_matched_peaks"] + min_isotope_err = param_data["min_isotope_err"] + max_isotope_err = param_data["max_isotope_err"] + min_precursor_charge = param_data["min_precursor_charge"] + max_precursor_charge = param_data["max_precursor_charge"] + chimera = param_data["chimera"] + report_psms = param_data["report_psms"] + annotate_matches = param_data["annotate_matches"] + max_fragment_charge = param_data["max_fragment_charge"] + + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) + else: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) + + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) + else: + fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) + + # Begin search and score + for spectrum in tqdm(raw_spectrums): + if spectrum["ms level"] == 1: + continue + selected_ion = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0][ + "selected ion m/z"] + charge = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0]["charge state"] + mz = spectrum["m/z array"].astype(np.float32) + intensity = spectrum["intensity 
array"].astype(np.float32) + precursor = Precursor( + charge=charge, + mz=selected_ion, + ) + raw_spectrum = RawSpectrum( + file_id=1, + spec_id=spectrum["id"], + total_ion_current=spectrum["total ion current"], + precursors=[precursor], + mz=mz, + intensity=intensity + ) + + spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + max_fragment_mz=fragment_max_mz, + deisotope=deisotope_bool) + query = spec_processor.process(raw_spectrum) + scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, + precursor_tolerance=precursor_tolerance, + fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, + max_isotope_err=max_isotope_err, + min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, + min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, + annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge) + + results = scorer.score(db=indexed_db, spectrum=query) + for feature in results: + peptide = indexed_db[feature.peptide_idx] + sequence = peptide.sequence + modifications = peptide.modifications + unimod_sequence = to_unimod_sequence(modifications, sequence) + precursor_ppm = abs(feature.expmass - feature.calcmass - feature.isotope_error) * 2e6 / (feature.expmass + feature.calcmass- feature.isotope_error) + res.append({"file_id": feature.file_id, "spec_id": feature.spec_id, "peptide": unimod_sequence, + "proteins": ";".join(peptide.proteins), "rank": feature.rank, + "label": feature.label, "exp.mass": feature.expmass, "cal.mass": feature.calcmass, + "charge": feature.charge, "retention time": feature.rt, + "precursor_ppm": precursor_ppm, "fragment_ppm": feature.average_ppm, + "delta mass": feature.delta_mass, "isotope error": feature.isotope_error, + "hyperscore": feature.hyperscore, "delta_next": feature.delta_next, + "delta_best": feature.delta_best, + "matched peaks": feature.matched_peaks, "missed cleavages": feature.missed_cleavages, + "scored candidates": feature.scored_candidates, + "sage_discriminant_score": feature.discriminant_score, + "posterior error": feature.posterior_error, + "spectrum_q": feature.spectrum_q, "peptide q": feature.peptide_q, + "protein q": feature.protein_q, + "ms2 intensity": feature.ms2_intensity}) + + res = pd.DataFrame(res, index=None) + return res + def run_sage(fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", - mzml_files: list = [], fasta_path: str = "", sage_config_file: str = None, - use_file_values: bool = True) -> DataFrame: + mzml_files: list = [], sage_config_file: str = None, + use_file_values: bool = True, n_process=4) -> DataFrame: if sage_config_file is None: raise ValueError("The sage config file is required.") with open(sage_config_file) as f: - data = json.load(f) + param_data = json.load(f) + + # loading and set default parameters + param_data["database"]["fragment_min_mz"] = param_data["database"]["fragment_min_mz"] if "fragment_min_mz" in \ + param_data[ + "database"] else 150 + param_data["database"]["fragment_max_mz"] = param_data["database"]["fragment_max_mz"] if "fragment_max_mz" in \ + param_data[ + "database"] else 2000 + param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True + param_data["min_matched_peaks"] = param_data["min_matched_peaks"] if "min_matched_peaks" in param_data else 6 + param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 + 
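The precursor_ppm expression in run_one_raw above references the mass error to the mean of the observed and calculated masses, which is where the factor of 2e6 comes from: err / ((m_exp + m_calc) / 2) * 1e6 equals err * 2e6 / (m_exp + m_calc). A self-contained check with toy masses (the helper name is illustrative, not part of the patch):

def precursor_ppm(expmass: float, calcmass: float, isotope_error: float = 0.0) -> float:
    # Mirrors the expression used in run_one_raw: ppm error referenced to the
    # mean of the observed and (isotope-corrected) calculated mass.
    err = abs(expmass - calcmass - isotope_error)
    return err * 2e6 / (expmass + calcmass - isotope_error)

# A 5 mDa error on a roughly 1000 Da peptide is about 5 ppm:
print(round(precursor_ppm(1000.005, 1000.000), 3))  # ~5.0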
param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 + param_data["min_precursor_charge"] = param_data[ + "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 + param_data["max_precursor_charge"] = param_data[ + "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 + param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False + param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 + param_data["annotate_matches"] = param_data["annotate_matches"] if "annotate_matches" in param_data else False + param_data["max_fragment_charge"] = param_data[ + "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: - data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) if fragment_type == "ppm": - data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) else: - data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) + else: logging.info("Using the values from the file.") - if "ppm" in data["precursor_tol"]: - precursor_tolerance = data["precursor_tol"]["ppm"][1] + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = param_data["precursor_tol"]["ppm"][1] else: - precursor_tolerance = data["precursor_tol"]["da"][1] + precursor_tolerance = param_data["precursor_tol"]["da"][1] - if "ppm" in data["fragment_tol"]: - fragment_tolerance = data["fragment_tol"]["ppm"][1] + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = param_data["fragment_tol"]["ppm"][1] else: - fragment_tolerance = data["fragment_tol"]["da"][1] + fragment_tolerance = param_data["fragment_tol"]["da"][1] - data["mzml_paths"] = mzml_files - - if os.path.exists(fasta_path): - data["database"]["fasta"] = fasta_path - else: - logging.error(f"File {fasta_path} does not exist.") - raise FileNotFoundError(f"File {fasta_path} does not exist.") + param_data["mzml_paths"] = mzml_files temp_sage_file = str(uuid.uuid4()) + ".json" with open(temp_sage_file, "w") as f: - json.dump(data, f, indent=4) + json.dump(param_data, f, indent=4) logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, precursor_tolerance)) - result = subprocess.run(["sage", temp_sage_file, "--write-pin"], capture_output=True, text=True) - os.remove(temp_sage_file) + sage_table = pd.DataFrame() + tasks = [] - if result.returncode != 0: - logging.error("Error running SAGE.") - logging.error(result.stderr) - raise ValueError("Error running SAGE.") + with cf.ProcessPoolExecutor(n_process) as pp: + for m in mzml_files: + tasks.append(pp.submit(run_one_raw, m, param_data)) + + for future in cf.as_completed(tasks): + sage_table = pd.concat([sage_table, future.result()], ignore_index=True) - sage_table = 
pd.read_csv("results.sage.tsv", sep="\t") sage_table = compute_entrapment_qvalues(sage_table) return sage_table @@ -214,8 +456,8 @@ def acceptance_probability(old_value, new_value, temperature): grid_best_value = 1 for ft in precursor_tolerances: sage_table = run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=False) + fragment_type=fragment_type, mzml_files=mzml_files, + sage_config_file=sage_config_file, use_file_values=False, n_process=4) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value)) @@ -239,7 +481,7 @@ def acceptance_probability(old_value, new_value, temperature): for ft in fragment_tolerances: for pt in precursor_tolerances: sage_table = run_sage(fragment_tolerance=ft, precursor_tolerance=pt, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, + fragment_type=fragment_type, mzml_files=mzml_files, sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, ft, pt, new_value)) @@ -273,7 +515,7 @@ def acceptance_probability(old_value, new_value, temperature): min(max_precursor_tolerance, current_precursor_tolerance + search_radius)) sage_table = run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, + fragment_type=fragment_type, mzml_files=mzml_files, sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) @@ -304,7 +546,7 @@ def acceptance_probability(old_value, new_value, temperature): @click.option("--max-precursor-tolerance", default=50, help="The maximum precursor tolerance to consider.", type=int) @click.option("--fasta-file", default="Homo-sapiens-uniprot-reviewed-contaminants-entrap-decoy-20240615.fasta", help="The path to the fasta file to use for the SAGE analysis.") -@click.option("--sage-config-file", default="sage-general.json", +@click.option("--sage-config-file", default="general-sage.json", help="The path to the Sage config file to use for the SAGE analysis.") @click.option("--max-iterations", default=10, help="The maximum number of iterations to run the optimization.", type=int) @@ -330,9 +572,18 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i sage_config_file = "general-sage.json" if initial_fragment_tolerance is not None and initial_precursor_tolerance is not None: + + if os.path.exists(fasta_file): + logging.info(f"Generating indexed database from {fasta_file}.") + generate_indexed_db(fasta_file, sage_config_file) + else: + logging.error(f"File {fasta_file} does not exist.") + raise FileNotFoundError(f"File {fasta_file} does not exist.") + sage_table = run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), fragment_type, - mzml_files, fasta_path=fasta_file, sage_config_file=sage_config_file, - use_file_values=False) + mzml_files, sage_config_file=sage_config_file, + use_file_values=False, n_process=4) + sage_table.to_csv("sage_table.csv", index=False) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms) results.append(stats) From 31084a54350b658655c04ca42821daa83aaa04a0 Mon Sep 17 00:00:00 
2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sat, 14 Sep 2024 10:45:22 +0800 Subject: [PATCH 2/3] update --- opt-params/general-sage.json | 7 +- opt-params/opt_params.py | 721 ++++++++++++++++++++++------------- 2 files changed, 455 insertions(+), 273 deletions(-) diff --git a/opt-params/general-sage.json b/opt-params/general-sage.json index 204344b..58b59e9 100644 --- a/opt-params/general-sage.json +++ b/opt-params/general-sage.json @@ -19,11 +19,10 @@ "min_ion_index": 2, "max_variable_mods": 3, "static_mods": { - "C": 57.0215 + "C": "[UNIMOD:4]" }, "variable_mods": { - "M": [15.994], - "N": [0.984] + "M": ["[UNIMOD:35]"] }, "generate_decoys": false, "decoy_tag": "DECOY_", @@ -59,6 +58,8 @@ "max_peaks": 300, "max_fragment_charge": 1, "min_matched_peaks": 4, + "annotate_matches": true, + "search_thread": 4, "mzml_paths": [ "./20150901_QEp1_LC7_PhGe_SA_Plate1E_45_6.mzML", "./20150618_QEp1_LC7_PhGe_SA_Plate1B_45_1.mzML", diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index eb92b15..cc5926c 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -9,16 +9,19 @@ import os from sagepy.core import EnzymeBuilder, SageSearchConfiguration, validate_mods, validate_var_mods, \ Scorer, RawSpectrum, SpectrumProcessor, Precursor, Tolerance, IonType +from sagepy.qfdr.tdc import target_decoy_competition_pandas +from sagepy.utility import mean_ppm, median_ppm +from sagepy.core.ml.pep import calculate_pep +from sagepy.rescore.lda import rescore_lda + import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import DataFrame import click -from pyteomics import mzml +from pyteomics import mzml, mass import concurrent.futures as cf -from tqdm import tqdm - logging.basicConfig(level=logging.INFO) folder_plots_uui = "" @@ -29,12 +32,16 @@ indexed_db = None -def generate_indexed_db(fasta_path, sage_config_file): - if sage_config_file is None: - raise ValueError("The sage config file is required.") +class SageRunner: + def __init__(self): + self.indexed_db = None - with open(sage_config_file) as f: - param_data = json.load(f) + def generate_indexed_db(self, fasta_path, param_data): + # if sage_config_file is None: + # raise ValueError("The sage config file is required.") + # + # with open(sage_config_file) as f: + # param_data = json.load(f) cleave_at = param_data["database"]["enzyme"]["cleave_at"] missed_cleavages = param_data["database"]["enzyme"]["missed_cleavages"] min_len = param_data["database"]["enzyme"]["min_len"] @@ -52,272 +59,366 @@ def generate_indexed_db(fasta_path, sage_config_file): generate_decoys = param_data["database"]["generate_decoys"] decoy_tag = param_data["database"]["decoy_tag"] - # configure a trypsin-like digestor of fasta files - enzyme_builder = EnzymeBuilder( - missed_cleavages=missed_cleavages, - min_len=min_len, - max_len=max_len, - cleave_at=cleave_at, - restrict=restrict, - c_terminal=True - ) - - # # generate static cysteine modification - # static_mods = {k: v for k, v in [SAGE_KNOWN_MODS.cysteine_static()]} - # - # # generate variable methionine modification - # variable_mods = {k: v for k, v in [SAGE_KNOWN_MODS.methionine_variable()]} - - # generate SAGE compatible mod representations - static = validate_mods(static_mods) - variab = validate_var_mods(variable_mods) - - with open(fasta_path, 'r') as infile: - fasta = infile.read() - - # generate IonType class - ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] - - # set-up a config for a sage-database - sage_config = 
SageSearchConfiguration( - fasta=fasta, - static_mods=static, - variable_mods=variab, - enzyme_builder=enzyme_builder, - fragment_min_mz=fragment_min_mz, - fragment_max_mz=fragment_max_mz, - peptide_min_mass=peptide_min_mass, - peptide_max_mass=peptide_max_mass, - ion_kinds=ion_kinds, - generate_decoys=generate_decoys, - decoy_tag=decoy_tag, - min_ion_index=min_ion_index, - max_variable_mods=max_variable_mods, - bucket_size=int(np.power(2, 14)) - ) - - # generate the database for searching against - global indexed_db - indexed_db = sage_config.generate_indexed_database() - - -def mass_to_mod(mass: float) -> str: - """ Convert a mass to a UNIMOD modification annotation. - - Args: - mass: a mass in Da - - Returns: - a UNIMOD modification annotation - """ - maybe_key = int(np.round(mass)) - mod_dict = { - 1: '[UNIMOD:7]', - 42: '[UNIMOD:1]', - 57: '[UNIMOD:4]', - 80: '[UNIMOD:21]', - 16: '[UNIMOD:35]', - 119: '[UNIMOD:312]', - } - # try to translate to UNIMOD annotation - try: - return mod_dict[maybe_key] - except KeyError: - raise KeyError(f"Rounded mass not in dict: {maybe_key}") + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=missed_cleavages, + min_len=min_len, + max_len=max_len, + cleave_at=cleave_at, + restrict=restrict, + c_terminal=True + ) + with open(fasta_path, 'r') as infile: + fasta = infile.read() + + # generate IonType class + ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] + + # set-up a config for a sage-database + sage_config = SageSearchConfiguration( + fasta=fasta, + static_mods=static_mods, + variable_mods=variable_mods, + enzyme_builder=enzyme_builder, + fragment_min_mz=fragment_min_mz, + fragment_max_mz=fragment_max_mz, + peptide_min_mass=peptide_min_mass, + peptide_max_mass=peptide_max_mass, + ion_kinds=ion_kinds, + generate_decoys=generate_decoys, + decoy_tag=decoy_tag, + min_ion_index=min_ion_index, + max_variable_mods=max_variable_mods, + bucket_size=int(np.power(2, 14)) + ) -# Adapted from https://github.com/theGreatHerrLebert/sagepy/blob/v0.2.20-alpha/sagepy/sagepy/core/peptide.py#L125C1 -# Will update in next release https://github.com/theGreatHerrLebert/sagepy/pull/12/files -def to_unimod_sequence(modifications, sequence) -> str: - """ Get Peptide sequence with UNIMOD modification annotations. + # generate the database for searching against + indexed_db = sage_config.generate_indexed_database() + return indexed_db + + def peptide_spectrum_match_list_to_pandas(self, + psms, indexed_db, + re_score: bool = False, + use_sequence_as_match_idx: bool = True) -> pd.DataFrame: + """Convert a list of peptide spectrum matches to a pandas dataframe + + Args: + psms (List[PeptideSpectrumMatch]): The peptide spectrum matches + re_score (bool, optional): Should re-score be used. Defaults to False. + use_sequence_as_match_idx (bool, optional): Should the sequence be used as the match index. Defaults to True. + + Returns: + pd.DataFrame: The pandas dataframe + """ + row_list = [] + + for match in psms: + if match.retention_time_predicted is not None and match.projected_rt is not None: + delta_rt = match.retention_time_predicted - match.projected_rt + else: + delta_rt = None - Returns: - str: Peptide sequence with UNIMOD modification annotations. 
- """ + if re_score: + score = match.re_score + else: + score = match.hyper_score + + if use_sequence_as_match_idx: + match_idx = match.sequence + else: + match_idx = str(match.peptide_idx) - seq = '' + if match.inverse_mobility_predicted is not None: + delta_ims = match.inverse_mobility_predicted - match.inverse_mobility_observed + else: + delta_ims = None - for i, (s, m) in enumerate(zip(sequence, modifications)): - if m != 0: - if i == 0: - if mass_to_mod(m) == '[UNIMOD:1]': - seq += f'{mass_to_mod(m)}{s}' - else: - seq += f'{s}{mass_to_mod(m)}' + if match.beta_score is not None: + beta_score = match.beta_score else: - seq += f'{s}{mass_to_mod(m)}' + beta_score = None + + peptide = indexed_db[match.peptide_idx] + position = peptide.position + modifications = peptide.modifications + precursor_ppm = abs(match.mono_mass_observed - match.mono_mass_calculated - match.isotope_error) * 2e6 / ( + match.mono_mass_observed + match.mono_mass_calculated - match.isotope_error) + + row_list.append({ + "spec_idx": match.spec_idx, + "match_idx": match_idx, + "proteins": ";".join(match.proteins), + "decoy": match.decoy, + "score": score, + "re_score": match.re_score, + "hyperscore": match.hyper_score, + "rank": match.rank, + "mono_mz_calculated": match.mono_mz_calculated, + "mono_mass_observed": match.mono_mass_observed, + "mono_mass_calculated": match.mono_mass_calculated, + "delta_mass": match.mono_mass_calculated - match.mono_mass_observed, + "isotope_error": match.isotope_error, + "fragment_ppm": match.average_ppm, + "precursor_ppm": precursor_ppm, + "delta_next": match.delta_next, + "delta_best": match.delta_best, + "matched_peaks": match.matched_peaks, + "longest_b": match.longest_b, + "longest_y": match.longest_y, + "longest_y_pct": match.longest_y_pct, + "missed_cleavages": match.missed_cleavages, + "matched_intensity_pct": match.matched_intensity_pct, + "scored_candidates": match.scored_candidates, + "poisson": match.poisson, + "modifications": modifications, + "charge": match.charge, + "retention_time_observed": match.retention_time_observed, + "retention_time_predicted": match.retention_time_predicted, + "delta_rt": delta_rt, + "inverse_mobility_observed": match.inverse_mobility_observed, + "inverse_mobility_predicted": match.inverse_mobility_predicted, + "delta_ims": delta_ims, + "intensity_ms1": match.intensity_ms1, + "intensity_ms2": match.intensity_ms2, + "q_value": match.q_value, + "collision_energy": match.collision_energy, + "cosine_similarity": match.cosine_similarity, + "projected_rt": match.projected_rt, + "beta_score": beta_score + }) + + return pd.DataFrame(row_list) + + def run_one_raw(self, spec_file, param_data, fasta_file): + + raw_spectrums = mzml.MzML(spec_file) + + # loading parameters + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + deisotope_bool = param_data["deisotope"] + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + min_matched_peaks = param_data["min_matched_peaks"] + min_isotope_err = param_data["min_isotope_err"] + max_isotope_err = param_data["max_isotope_err"] + min_precursor_charge = param_data["min_precursor_charge"] + max_precursor_charge = param_data["max_precursor_charge"] + chimera = param_data["chimera"] + report_psms = param_data["report_psms"] + annotate_matches = param_data["annotate_matches"] + max_fragment_charge = param_data["max_fragment_charge"] + num_threads = param_data["search_thread"] + + if "ppm" in 
param_data["precursor_tol"]: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) else: - seq += s - - return seq - - -def run_one_raw(spec_file, param_data): - global indexed_db - - raw_spectrums = mzml.MzML(spec_file) - res = [] - - # loading parameters - fragment_min_mz = param_data["database"]["fragment_min_mz"] - fragment_max_mz = param_data["database"]["fragment_max_mz"] - deisotope_bool = param_data["deisotope"] - min_matched_peaks = param_data["min_matched_peaks"] - min_isotope_err = param_data["min_isotope_err"] - max_isotope_err = param_data["max_isotope_err"] - min_precursor_charge = param_data["min_precursor_charge"] - max_precursor_charge = param_data["max_precursor_charge"] - chimera = param_data["chimera"] - report_psms = param_data["report_psms"] - annotate_matches = param_data["annotate_matches"] - max_fragment_charge = param_data["max_fragment_charge"] - - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) - else: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) - else: - fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - - # Begin search and score - for spectrum in tqdm(raw_spectrums): - if spectrum["ms level"] == 1: - continue - selected_ion = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0][ - "selected ion m/z"] - charge = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0]["charge state"] - mz = spectrum["m/z array"].astype(np.float32) - intensity = spectrum["intensity array"].astype(np.float32) - precursor = Precursor( - charge=charge, - mz=selected_ion, - ) - raw_spectrum = RawSpectrum( - file_id=1, - spec_id=spectrum["id"], - total_ion_current=spectrum["total ion current"], - precursors=[precursor], - mz=mz, - intensity=intensity - ) + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = Tolerance( + ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) + else: + fragment_tolerance = Tolerance( + ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, - max_fragment_mz=fragment_max_mz, - deisotope=deisotope_bool) - query = spec_processor.process(raw_spectrum) scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, precursor_tolerance=precursor_tolerance, fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, max_isotope_err=max_isotope_err, min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, - annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge) + annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge, + static_mods=static_mods, variable_mods=variable_mods + ) + + # Begin search and score + querys = [] + + for spectrum in raw_spectrums: + if spectrum["ms level"] == 1: + continue + 
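# Note on the tolerance branches above: both the removed and the re-indented Da
# branches build the window with Tolerance(ppm=...) even though the bounds come
# from the "da" entry. Judging from the Tolerance(da=(...)) form used in
# run_sage, the Da branches are presumably meant to read:
#
#     precursor_tolerance = Tolerance(
#         da=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1]))
#     fragment_tolerance = Tolerance(
#         da=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1]))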
precursor = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0] + precursor_mz = precursor["selected ion m/z"] + precursor_charge = precursor["charge state"] + precursor_intensity = precursor.get('peak intensity', None) + collision_energy = spectrum['precursorList']['precursor'][0]['activation']['collision energy'] + injection_time = spectrum['scanList']['scan'][0]['ion injection time'] + retention_time = spectrum['scanList']['scan'][0]['scan start time'] + + # Extract the relevant metadata + isolation_window = spectrum['precursorList']['precursor'][0]['isolationWindow'] + + lower = isolation_window['isolation window target m/z'] - isolation_window[ + 'isolation window lower offset'] + upper = isolation_window['isolation window target m/z'] + isolation_window[ + 'isolation window upper offset'] + + mz = spectrum["m/z array"].astype(np.float32) + intensity = spectrum["intensity array"].astype(np.float32) + # set selection window bounds + tolerance = Tolerance(da=(lower, upper)) + precursor = Precursor( + charge=precursor_charge, + mz=precursor_mz, + intensity=precursor_intensity, + isolation_window=tolerance, + collision_energy=collision_energy + ) + raw_spectrum = RawSpectrum( + file_id=1, + spec_id=spectrum["id"], + total_ion_current=spectrum["total ion current"], + ion_injection_time=injection_time, + scan_start_time=retention_time, + precursors=[precursor], + mz=mz, + intensity=intensity + ) + + spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + max_fragment_mz=fragment_max_mz, + deisotope=deisotope_bool) + + query = spec_processor.process(raw_spectrum) + querys.append(query) + + indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data) + results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=querys, num_threads=num_threads) + # Convert the list of dictionaries to a pandas DataFrame + # exp_data = pd.DataFrame(d) + + psm_list = [] + for _, values in results.items(): + psm_list.extend(values) + + psm_list_rescore = rescore_lda(psm_list, verbose=False) + # print(psm_list_rescore[0]) + psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db) + + # TARGET = psm_list_df[psm_list_df.decoy == False] + # DECOY = psm_list_df[psm_list_df.decoy] + + # plt.figure(figsize=(6, 4), dpi=150) + # + # plt.hist(TARGET.hyper_score, bins="auto", label="target") + # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy") + # plt.legend() + # plt.title("Target and decoy score distributions") + # plt.xlabel("Score") + # plt.ylabel("Count") + # plt.show() + + TDC = target_decoy_competition_pandas( + df=psm_list_df, + # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level) + method="psm", + score="hyperscore" + ) - results = scorer.score(db=indexed_db, spectrum=query) - for feature in results: - peptide = indexed_db[feature.peptide_idx] - sequence = peptide.sequence - modifications = peptide.modifications - unimod_sequence = to_unimod_sequence(modifications, sequence) - precursor_ppm = abs(feature.expmass - feature.calcmass - feature.isotope_error) * 2e6 / (feature.expmass + feature.calcmass- feature.isotope_error) - res.append({"file_id": feature.file_id, "spec_id": feature.spec_id, "peptide": unimod_sequence, - "proteins": ";".join(peptide.proteins), "rank": feature.rank, - "label": feature.label, "exp.mass": feature.expmass, "cal.mass": feature.calcmass, - "charge": feature.charge, "retention time": 
feature.rt, - "precursor_ppm": precursor_ppm, "fragment_ppm": feature.average_ppm, - "delta mass": feature.delta_mass, "isotope error": feature.isotope_error, - "hyperscore": feature.hyperscore, "delta_next": feature.delta_next, - "delta_best": feature.delta_best, - "matched peaks": feature.matched_peaks, "missed cleavages": feature.missed_cleavages, - "scored candidates": feature.scored_candidates, - "sage_discriminant_score": feature.discriminant_score, - "posterior error": feature.posterior_error, - "spectrum_q": feature.spectrum_q, "peptide q": feature.peptide_q, - "protein q": feature.protein_q, - "ms2 intensity": feature.ms2_intensity}) - - res = pd.DataFrame(res, index=None) - return res - - -def run_sage(fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", - mzml_files: list = [], sage_config_file: str = None, - use_file_values: bool = True, n_process=4) -> DataFrame: - if sage_config_file is None: - raise ValueError("The sage config file is required.") - - with open(sage_config_file) as f: - param_data = json.load(f) - - # loading and set default parameters - param_data["database"]["fragment_min_mz"] = param_data["database"]["fragment_min_mz"] if "fragment_min_mz" in \ - param_data[ - "database"] else 150 - param_data["database"]["fragment_max_mz"] = param_data["database"]["fragment_max_mz"] if "fragment_max_mz" in \ - param_data[ - "database"] else 2000 - param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True - param_data["min_matched_peaks"] = param_data["min_matched_peaks"] if "min_matched_peaks" in param_data else 6 - param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 - param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 - param_data["min_precursor_charge"] = param_data[ - "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 - param_data["max_precursor_charge"] = param_data[ - "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 - param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False - param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 - param_data["annotate_matches"] = param_data["annotate_matches"] if "annotate_matches" in param_data else False - param_data["max_fragment_charge"] = param_data[ - "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 - - if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: - param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] - precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) - if fragment_type == "ppm": - param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] - fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) - else: - param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] - fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) + # add PEP to the PSMs + TDC["pep"] = calculate_pep( + scores=TDC.score.values, + decoys=TDC.decoy.values, + ) - else: - logging.info("Using the values from the file.") - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = param_data["precursor_tol"]["ppm"][1] - else: - precursor_tolerance = param_data["precursor_tol"]["da"][1] + # after target decoy 
competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01 + TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)] + RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC_filtered, + on=["spec_idx", "decoy", "match_idx"]) + RESULT.insert(loc=0, column="file_name", value=os.path.splitext(os.path.basename(spec_file))[0]) + RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True) + # RESULT.to_csv("./test_results.csv", index=False) + + res = pd.DataFrame(RESULT, index=None) + return res + + def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", + mzml_files: list = [], sage_config_file: str = None, fasta_file: str = None, + use_file_values: bool = True, n_process=4) -> DataFrame: + if sage_config_file is None: + raise ValueError("The sage config file is required.") + + with open(sage_config_file) as f: + param_data = json.load(f) + + # loading and set default parameters + param_data["database"]["fragment_min_mz"] = param_data["database"][ + "fragment_min_mz"] if "fragment_min_mz" in \ + param_data[ + "database"] else 150 + param_data["database"]["fragment_max_mz"] = param_data["database"][ + "fragment_max_mz"] if "fragment_max_mz" in \ + param_data[ + "database"] else 2000 + param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True + param_data["min_matched_peaks"] = param_data[ + "min_matched_peaks"] if "min_matched_peaks" in param_data else 6 + param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 + param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 + param_data["min_precursor_charge"] = param_data[ + "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 + param_data["max_precursor_charge"] = param_data[ + "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 + param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False + param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 + param_data["annotate_matches"] = param_data[ + "annotate_matches"] if "annotate_matches" in param_data else False + param_data["max_fragment_charge"] = param_data[ + "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 + param_data["search_thread"] = param_data["search_thread"] if "search_thread" in param_data else 4 + + if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: + param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) + if fragment_type == "ppm": + param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) + else: + param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = param_data["fragment_tol"]["ppm"][1] else: - fragment_tolerance = param_data["fragment_tol"]["da"][1] + logging.info("Using the values from the file.") + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = param_data["precursor_tol"]["ppm"][1] + else: + precursor_tolerance = 
param_data["precursor_tol"]["da"][1] - param_data["mzml_paths"] = mzml_files + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = param_data["fragment_tol"]["ppm"][1] + else: + fragment_tolerance = param_data["fragment_tol"]["da"][1] - temp_sage_file = str(uuid.uuid4()) + ".json" - with open(temp_sage_file, "w") as f: - json.dump(param_data, f, indent=4) + param_data["mzml_paths"] = mzml_files - logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, - precursor_tolerance)) + temp_sage_file = str(uuid.uuid4()) + ".json" + with open(temp_sage_file, "w") as f: + json.dump(param_data, f, indent=4) - sage_table = pd.DataFrame() - tasks = [] + logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, + precursor_tolerance)) - with cf.ProcessPoolExecutor(n_process) as pp: - for m in mzml_files: - tasks.append(pp.submit(run_one_raw, m, param_data)) + sage_table = pd.DataFrame() + tasks = [] + # for m in mzml_files: + # res = run_one_raw(m, param_data) + # sage_table = pd.concat([sage_table, res], ignore_index=True) - for future in cf.as_completed(tasks): - sage_table = pd.concat([sage_table, future.result()], ignore_index=True) + # multiple threads don't supported indexed database class. so currently generating indexed database separately + # for each file. But it only takes about 12s. + with cf.ProcessPoolExecutor(n_process) as pp: + for m in mzml_files: + tasks.append(pp.submit(self.run_one_raw, m, param_data, fasta_file)) - sage_table = compute_entrapment_qvalues(sage_table) - return sage_table + for future in cf.as_completed(tasks): + sage_table = pd.concat([sage_table, future.result()], ignore_index=True) + + sage_table = compute_entrapment_qvalues(sage_table) + return sage_table def extract_ptms(sage_table_target): @@ -393,8 +494,9 @@ def compute_entrapment_qvalues(sage_data_frame: pd.DataFrame) -> pd.DataFrame: :param sage_data_frame: panda data frame with the sage results """ - # Use inplace=True for sorting - sage_data_frame.sort_values(by='sage_discriminant_score', ascending=False, inplace=True) + # Use inplace=True for sorting, Currently using hyper score due to unavailable LDA + # https://github.com/theGreatHerrLebert/sagepy/issues/13 + sage_data_frame.sort_values(by='hyperscore', ascending=False, inplace=True) # Use a more descriptive name for columns sage_data_frame['is_entrap'] = sage_data_frame['proteins'].apply( @@ -433,13 +535,15 @@ def compute_best_combination(sage_table: pd.DataFrame) -> int: return len(sage_table_target['peptide']) -def combined_search(results: list = [], start_fragment_tolerance: int = 0, start_precursor_tolerance: int = 0, +def combined_search(sagerunner, results: list = [], start_fragment_tolerance: int = 0, + start_precursor_tolerance: int = 0, min_fragment_tolerance: int = 1, max_fragment_tolerance: int = 50, min_precursor_tolerance: int = 10, max_precursor_tolerance: int = 100, num_psms: int = 0, fragment_type: str = "ppm", mzml_files: list = [], grid_frag_steps: int = 10, - grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file="", initial_temp=100, + grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file: str = None, + initial_temp=100, cooling_rate=0.95, sage_config_file=None) -> Tuple[int, int, List[Dict]]: def acceptance_probability(old_value, new_value, temperature): if new_value > old_value: @@ -455,9 +559,9 @@ def acceptance_probability(old_value, new_value, temperature): grid_best_value = 1 
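combined_search drives the tolerance optimisation with a simulated-annealing style loop: initial_temp=100, cooling_rate=0.95, and the acceptance_probability helper, whose body beyond the "return 1" for improvements is elided by the hunk context. A minimal self-contained sketch of the standard Metropolis rule this pattern suggests, with a toy objective standing in for the PSM count at 1% FDR:

import math
import random

def acceptance_probability(old_value, new_value, temperature):
    # Always accept improvements; accept worse moves with a probability that
    # decays as the temperature cools (standard Metropolis rule, assumed here).
    if new_value > old_value:
        return 1.0
    return math.exp((new_value - old_value) / temperature)

def anneal(objective, start, initial_temp=100.0, cooling_rate=0.95,
           max_iterations=10, search_radius=1):
    current, current_value = start, objective(start)
    temperature = initial_temp
    for _ in range(max_iterations):
        candidate = current + random.randint(-search_radius, search_radius)
        candidate_value = objective(candidate)
        if acceptance_probability(current_value, candidate_value, temperature) > random.random():
            current, current_value = candidate, candidate_value
        temperature *= cooling_rate
    return current, current_value

# Toy objective peaking at a tolerance of 20:
print(anneal(lambda t: -(t - 20) ** 2, start=10))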
for ft in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, use_file_values=False, n_process=4) + sage_table = sagerunner.run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=False, n_process=4) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value)) @@ -480,9 +584,9 @@ def acceptance_probability(old_value, new_value, temperature): for ft in fragment_tolerances: for pt in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=ft, precursor_tolerance=pt, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=ft, precursor_tolerance=pt, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, ft, pt, new_value)) @@ -514,10 +618,10 @@ def acceptance_probability(old_value, new_value, temperature): max(min_precursor_tolerance, current_precursor_tolerance - search_radius), min(max_precursor_tolerance, current_precursor_tolerance + search_radius)) - sage_table = run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, - use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, + use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, fragment_tolerance, precursor_tolerance, new_value)) @@ -575,14 +679,16 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i if os.path.exists(fasta_file): logging.info(f"Generating indexed database from {fasta_file}.") - generate_indexed_db(fasta_file, sage_config_file) + sagerunner = SageRunner() + # sagerunner.generate_indexed_db(fasta_file, sage_config_file) else: logging.error(f"File {fasta_file} does not exist.") raise FileNotFoundError(f"File {fasta_file} does not exist.") - sage_table = run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), fragment_type, - mzml_files, sage_config_file=sage_config_file, - use_file_values=False, n_process=4) + sage_table = sagerunner.run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), + fragment_type, + mzml_files, sage_config_file=sage_config_file, fasta_file=fasta_file, + use_file_values=False, n_process=4) sage_table.to_csv("sage_table.csv", index=False) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms) @@ -591,7 +697,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i best_precursor_tolerance = initial_precursor_tolerance best_fragment_tolerance = initial_fragment_tolerance - best_fragment_tolerance, best_precursor_tolerance, results = 
combined_search(results=results, + best_fragment_tolerance, best_precursor_tolerance, results = combined_search(sagerunner, results=results, start_fragment_tolerance=best_fragment_tolerance, start_precursor_tolerance=best_precursor_tolerance, min_fragment_tolerance=initial_fragment_tolerance, @@ -676,9 +782,9 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): global folder_plots_uui folder_plots_uui = str(uuid.uuid4()) os.makedirs(folder_plots_uui) - - sage_table = run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=True) + sagerunner = SageRunner() + sage_table = sagerunner.run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=True) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, fragment_tolerance=fragment_tolerance, @@ -696,6 +802,80 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): f.write(f"{key}\t{value}\n") +def sage2PTMShepherd(sage_table): + ptmsherd_schema = ["Spectrum", "Spectrum File", "Peptide", "Modified Peptide", "Peptide Length", "Charge", + "Retention", "Observed Mass", "Calibrated Observed Mass", + "Observed M/Z", "Calibrated Observed M/Z", "Calculated Peptide Mass", "Calculated M/Z", + "Delta Mass", "Expectation", "Hyperscore", "Nextscore", "Probability", + "Number of Missed Cleavages", "Intensity", + "Assigned Modifications", "Observed Modifications", + "Purity", "Protein", "Protein ID"] + psm_df = pd.DataFrame(columns=ptmsherd_schema) + unimod_pattern = re.compile(r"\[UNIMOD:\d+]") + db = mass.Unimod() + new_rows = [] + for _, row in sage_table.iterrows(): + Spectrum = ".".join([os.path.splitext(row['file_name'])[0], row["spec_idx"].split("=")[-1], + row["spec_idx"].split("=")[-1], str(row["charge"])]) + SpectrumFile = row['file_name'] + peptide = unimod_pattern.sub(repl="", string=row['peptide']) + modifications = unimod_pattern.finditer(row["peptide"]) + modified_peptide = row["peptide"] + AssignedModifications = [] + previous_idx = 0 + for modification in modifications: + position = modification.span()[0] - 1 - previous_idx + unimod_term = modification.group(0).replace("[UNIMOD:", "").replace("]", "") + mass_da = db.by_id(unimod_term)["mono_mass"] + AssignedModifications.append(str(position + 1) + str(peptide[position]) + "(" + str(mass_da) + ")") + MassResiduePlusMod = int(mass.fast_mass(peptide[position]) + mass_da - 18.001) + modified_peptide = modified_peptide.replace(peptide[position] + modification.group(0), + peptide[position] + "[" + str(MassResiduePlusMod) + "]") + previous_idx += len(modification.group(0)) + if modified_peptide == peptide: + modified_peptide = "" + AssignedModifications = ", ".join(AssignedModifications) + + PeptideLength = len(peptide) + Charge = row["charge"] + Retention = row["retention_time_observed"] * 60 + ObservedMass = row["mono_mass_observed"] + ObservedMZ = row["mono_mass_observed"] / row["charge"] + 1.007276 + CalibratedObservedMZ = ObservedMZ + CalculatedPeptideMass = row["mono_mass_calculated"] + CalibratedObservedMass = ObservedMass + CalculatedMZ = row["mono_mz_calculated"] + DeltaMass = row["delta_mass"] + Hyperscore = row["hyperscore"] + Probability = 1 - row["spectrum_q"] + Nextscore = row["delta_next"] + NumberofMissedCleavages = row["missed_cleavages"] + Intensity = row["intensity_ms1"] + Protein = row["proteins"] + ProteinID = row["proteins"].split("|")[1] + new_rows.append({"Spectrum": 
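# The "Spectrum" value built a few lines above follows the spectrum-ID
# convention PTM-Shepherd expects: <mzML basename>.<scan>.<scan>.<charge>,
# with the scan number taken from the "scan=" suffix of spec_idx.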
Spectrum, "Spectrum File": SpectrumFile, + "Peptide": peptide, + "Modified Peptide": modified_peptide, + "Peptide Length": PeptideLength, "Charge": Charge, + "Retention": Retention, "Observed Mass": ObservedMass, + "Calibrated Observed Mass": CalibratedObservedMass, + "Observed M/Z": ObservedMZ, + "Calibrated Observed M/Z": CalibratedObservedMZ, + "Calculated Peptide Mass": CalculatedPeptideMass, + "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass, + "Hyperscore": Hyperscore, "Nextscore": Nextscore, + "Probability": Probability, + "Number of Missed Cleavages": NumberofMissedCleavages, + "Intensity": Intensity, + "Assigned Modifications": AssignedModifications, + "Protein": Protein, "Protein ID": ProteinID}) + + psm_df = pd.concat([psm_df, pd.DataFrame.from_records(new_rows)], + ignore_index=True) + + psm_df.to_csv("inputPTMShepherd.tsv", index=False, sep="\t") + + @click.group(context_settings=CONTEXT_SETTINGS) def cli(): """ @@ -709,3 +889,4 @@ def cli(): if __name__ == "__main__": cli() + From 2f39fff96f82838d480ceed9b8e3fdf7345c1c65 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 15 Sep 2024 09:55:33 +0800 Subject: [PATCH 3/3] refine --- enviroment.yml | 3 +- opt-params/opt_params.py | 331 ++++++++++++++++++++------------------- 2 files changed, 168 insertions(+), 166 deletions(-) diff --git a/enviroment.yml b/enviroment.yml index 94eecf9..852f9ea 100644 --- a/enviroment.yml +++ b/enviroment.yml @@ -15,7 +15,8 @@ dependencies: - seaborn - numpy - pyarrow - - sagepy=0.2.20 + - sagepy=0.2.22 + - sagepy-connector=0.2.22 - pyteomics - tqdm diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index cc5926c..af3cf26 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -34,7 +34,7 @@ class SageRunner: def __init__(self): - self.indexed_db = None + self.queries = [] def generate_indexed_db(self, fasta_path, param_data): # if sage_config_file is None: @@ -190,54 +190,9 @@ def peptide_spectrum_match_list_to_pandas(self, return pd.DataFrame(row_list) - def run_one_raw(self, spec_file, param_data, fasta_file): - + def parse_mzml(self, spec_file, file_id, fragment_min_mz, fragment_max_mz, deisotope_bool): raw_spectrums = mzml.MzML(spec_file) - # loading parameters - fragment_min_mz = param_data["database"]["fragment_min_mz"] - fragment_max_mz = param_data["database"]["fragment_max_mz"] - deisotope_bool = param_data["deisotope"] - static_mods = param_data["database"]["static_mods"] - variable_mods = param_data["database"]["variable_mods"] - min_matched_peaks = param_data["min_matched_peaks"] - min_isotope_err = param_data["min_isotope_err"] - max_isotope_err = param_data["max_isotope_err"] - min_precursor_charge = param_data["min_precursor_charge"] - max_precursor_charge = param_data["max_precursor_charge"] - chimera = param_data["chimera"] - report_psms = param_data["report_psms"] - annotate_matches = param_data["annotate_matches"] - max_fragment_charge = param_data["max_fragment_charge"] - num_threads = param_data["search_thread"] - - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) - else: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) - - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = Tolerance( - ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) - else: - fragment_tolerance = 
Tolerance( - ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - - scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, - precursor_tolerance=precursor_tolerance, - fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, - max_isotope_err=max_isotope_err, - min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, - min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, - annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge, - static_mods=static_mods, variable_mods=variable_mods - ) - - # Begin search and score - querys = [] - for spectrum in raw_spectrums: if spectrum["ms level"] == 1: continue @@ -268,9 +223,11 @@ def run_one_raw(self, spec_file, param_data, fasta_file): isolation_window=tolerance, collision_energy=collision_energy ) + scan = re.findall(r"scan=(\d+)", spectrum["id"])[0] raw_spectrum = RawSpectrum( - file_id=1, - spec_id=spectrum["id"], + file_id=file_id, + spec_id=os.path.splitext(os.path.basename(spec_file))[0] + "." + str(scan) + "." + str( + scan) + "." + str(precursor_charge), total_ion_current=spectrum["total ion current"], ion_injection_time=injection_time, scan_start_time=retention_time, @@ -279,66 +236,18 @@ def run_one_raw(self, spec_file, param_data, fasta_file): intensity=intensity ) - spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + spec_processor = SpectrumProcessor(take_top_n=150, min_fragment_mz=fragment_min_mz, max_fragment_mz=fragment_max_mz, deisotope=deisotope_bool) query = spec_processor.process(raw_spectrum) - querys.append(query) - - indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data) - results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=querys, num_threads=num_threads) - # Convert the list of dictionaries to a pandas DataFrame - # exp_data = pd.DataFrame(d) - - psm_list = [] - for _, values in results.items(): - psm_list.extend(values) - - psm_list_rescore = rescore_lda(psm_list, verbose=False) - # print(psm_list_rescore[0]) - psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db) - - # TARGET = psm_list_df[psm_list_df.decoy == False] - # DECOY = psm_list_df[psm_list_df.decoy] - - # plt.figure(figsize=(6, 4), dpi=150) - # - # plt.hist(TARGET.hyper_score, bins="auto", label="target") - # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy") - # plt.legend() - # plt.title("Target and decoy score distributions") - # plt.xlabel("Score") - # plt.ylabel("Count") - # plt.show() - - TDC = target_decoy_competition_pandas( - df=psm_list_df, - # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level) - method="psm", - score="hyperscore" - ) + self.queries.append(query) - # add PEP to the PSMs - TDC["pep"] = calculate_pep( - scores=TDC.score.values, - decoys=TDC.decoy.values, - ) - - # after target decoy competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01 - TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)] - RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC_filtered, - on=["spec_idx", "decoy", "match_idx"]) - RESULT.insert(loc=0, column="file_name", value=os.path.splitext(os.path.basename(spec_file))[0]) - RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True) - # RESULT.to_csv("./test_results.csv", index=False) - - res = 
pd.DataFrame(RESULT, index=None)
-        return res
+        # processed queries accumulate on self.queries; there is nothing to return
 
     def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm",
                  mzml_files: list = [], sage_config_file: str = None, fasta_file: str = None,
-                 use_file_values: bool = True, n_process=4) -> DataFrame:
+                 use_file_values: bool = True, skip_spectra_preprocess: bool = False) -> DataFrame:
 
         if sage_config_file is None:
             raise ValueError("The sage config file is required.")
@@ -346,30 +255,71 @@ def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fr
             param_data = json.load(f)
 
         # loading and set default parameters
-        param_data["database"]["fragment_min_mz"] = param_data["database"][
-            "fragment_min_mz"] if "fragment_min_mz" in \
-                                  param_data[
-                                      "database"] else 150
-        param_data["database"]["fragment_max_mz"] = param_data["database"][
-            "fragment_max_mz"] if "fragment_max_mz" in \
-                                  param_data[
-                                      "database"] else 2000
-        param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True
-        param_data["min_matched_peaks"] = param_data[
-            "min_matched_peaks"] if "min_matched_peaks" in param_data else 6
-        param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1
-        param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3
-        param_data["min_precursor_charge"] = param_data[
-            "min_precursor_charge"] if "min_precursor_charge" in param_data else 2
-        param_data["max_precursor_charge"] = param_data[
-            "max_precursor_charge"] if "max_precursor_charge" in param_data else 4
-        param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False
-        param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1
-        param_data["annotate_matches"] = param_data[
-            "annotate_matches"] if "annotate_matches" in param_data else False
-        param_data["max_fragment_charge"] = param_data[
-            "max_fragment_charge"] if "max_fragment_charge" in param_data else 1
-        param_data["search_thread"] = param_data["search_thread"] if "search_thread" in param_data else 4
+        if "fragment_min_mz" in param_data["database"]:
+            fragment_min_mz = param_data["database"]["fragment_min_mz"]
+        else:
+            param_data["database"]["fragment_min_mz"] = 150
+            fragment_min_mz = 150
+        if "fragment_max_mz" in param_data["database"]:
+            fragment_max_mz = param_data["database"]["fragment_max_mz"]
+        else:
+            param_data["database"]["fragment_max_mz"] = 2000
+            fragment_max_mz = 2000
+        if "deisotope" in param_data:
+            deisotope_bool = param_data["deisotope"]
+        else:
+            param_data["deisotope"] = True
+            deisotope_bool = True
+        if "min_matched_peaks" in param_data:
+            min_matched_peaks = param_data["min_matched_peaks"]
+        else:
+            param_data["min_matched_peaks"] = 6
+            min_matched_peaks = 6
+        if "min_isotope_err" in param_data:
+            min_isotope_err = param_data["min_isotope_err"]
+        else:
+            param_data["min_isotope_err"] = -1
+            min_isotope_err = -1
+        if "max_isotope_err" in param_data:
+            max_isotope_err = param_data["max_isotope_err"]
+        else:
+            param_data["max_isotope_err"] = 3
+            max_isotope_err = 3
+        if "min_precursor_charge" in param_data:
+            min_precursor_charge = param_data["min_precursor_charge"]
+        else:
+            min_precursor_charge = 2
+            param_data["min_precursor_charge"] = 2
+        if "max_precursor_charge" in param_data:
+            max_precursor_charge = param_data["max_precursor_charge"]
+        else:
+            max_precursor_charge = 4
+            param_data["max_precursor_charge"] = 4
+        if "chimera" in param_data:
+            chimera = param_data["chimera"]
+        else:
+            param_data["chimera"] = False
+            chimera = False
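Each defaulting block above follows the same read-or-write-back pattern, which Python's dict.setdefault expresses in a single call. A minimal sketch of an equivalent, more compact formulation (same keys and defaults as the patch, nothing new):

    fragment_min_mz = param_data["database"].setdefault("fragment_min_mz", 150)
    fragment_max_mz = param_data["database"].setdefault("fragment_max_mz", 2000)
    deisotope_bool = param_data.setdefault("deisotope", True)
    min_matched_peaks = param_data.setdefault("min_matched_peaks", 6)
    min_isotope_err = param_data.setdefault("min_isotope_err", -1)
    max_isotope_err = param_data.setdefault("max_isotope_err", 3)
    min_precursor_charge = param_data.setdefault("min_precursor_charge", 2)
    max_precursor_charge = param_data.setdefault("max_precursor_charge", 4)
    chimera = param_data.setdefault("chimera", False)

setdefault both returns the effective value and persists the default into param_data, so generate_indexed_db later sees the same filled-in configuration.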
+        if "report_psms" in param_data:
+            report_psms = param_data["report_psms"]
+        else:
+            param_data["report_psms"] = 1
+            report_psms = 1
+        if "annotate_matches" in param_data:
+            annotate_matches = param_data["annotate_matches"]
+        else:
+            param_data["annotate_matches"] = False
+            annotate_matches = False
+        if "max_fragment_charge" in param_data:
+            max_fragment_charge = param_data["max_fragment_charge"]
+        else:
+            param_data["max_fragment_charge"] = 1
+            max_fragment_charge = 1
+        if "search_thread" in param_data:
+            search_thread = param_data["search_thread"]
+        else:
+            param_data["search_thread"] = 4
+            search_thread = 4
 
         if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values:
             param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)]
@@ -401,21 +351,76 @@ def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fr
         logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance,
                                                                                                    precursor_tolerance))
+        if not skip_spectra_preprocess:
+            name_id_map = dict(zip(mzml_files, range(len(mzml_files))))
+            with cf.ThreadPoolExecutor(search_thread) as executor:
+                # parse_mzml appends the processed spectra to self.queries directly,
+                # so the futures themselves are not collected here
+                for m in mzml_files:
+                    executor.submit(self.parse_mzml, m, name_id_map[m], param_data["database"]["fragment_min_mz"],
+                                    param_data["database"]["fragment_max_mz"], deisotope_bool)
+
+        static_mods = param_data["database"]["static_mods"]
+        variable_mods = param_data["database"]["variable_mods"]
+
+        scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks,
+                        precursor_tolerance=precursor_tolerance,
+                        fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err,
+                        max_isotope_err=max_isotope_err,
+                        min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge,
+                        min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera,
+                        annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge,
+                        static_mods=static_mods, variable_mods=variable_mods
+                        )
+
+        indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data)
+        results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=self.queries,
+                                              num_threads=search_thread)
+
+        psm_list = []
+        for _, values in results.items():
+            psm_list.extend(values)
+
+        # re-score the PSMs with LDA before target-decoy competition
+        psm_list_rescore = rescore_lda(psm_list, verbose=False)
+        psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db, re_score=True)
+
+        # TARGET = psm_list_df[psm_list_df.decoy == False]
+        # DECOY = psm_list_df[psm_list_df.decoy]
+
+        # plt.figure(figsize=(6, 4), dpi=150)
+        #
+        # plt.hist(TARGET.hyper_score, bins="auto", label="target")
+        # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy")
+        # plt.legend()
+        # plt.title("Target and decoy score distributions")
+        # plt.xlabel("Score")
+        # plt.ylabel("Count")
+        # plt.show()
+
+        TDC = target_decoy_competition_pandas(
+            df=psm_list_df,
+            # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level)
+            method="peptide_psm_peptide",
+            score="re_score"
+        )
 
-        sage_table = pd.DataFrame()
-        tasks = []
-        # for m in mzml_files:
-        #     res = run_one_raw(m, param_data)
-        #     sage_table = pd.concat([sage_table, res], ignore_index=True)
 
+        # add PEP to the PSMs
+        TDC["pep"] = calculate_pep(
+            scores=TDC.score.values,
+            decoys=TDC.decoy.values,
+        )
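One point worth double-checking here: in the removed run_one_raw, the ppm windows from param_data were wrapped in sagepy Tolerance objects before reaching the Scorer, whereas the new code appears to pass the raw integer arguments of run_sage straight through as precursor_tolerance and fragment_tolerance. If no conversion happens in the unchanged lines between the hunks, a wrapper along these lines would be needed before constructing the Scorer (a sketch, assuming the ppm windows have already been written into param_data):

    from sagepy.core import Tolerance  # already imported at the top of opt_params.py

    # wrap the ppm windows into Tolerance objects, as the removed run_one_raw did
    precursor_tolerance = Tolerance(ppm=(param_data["precursor_tol"]["ppm"][0],
                                         param_data["precursor_tol"]["ppm"][1]))
    fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0],
                                        param_data["fragment_tol"]["ppm"][1]))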
 
-        # multiple threads don't supported indexed database class. so currently generating indexed database separately
-        # for each file. But it only takes about 12s.
-        with cf.ProcessPoolExecutor(n_process) as pp:
-            for m in mzml_files:
-                tasks.append(pp.submit(self.run_one_raw, m, param_data, fasta_file))
+        # after target decoy competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01
+        # TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)]
+        RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC,
                           on=["spec_idx", "decoy", "match_idx"])
 
-        for future in cf.as_completed(tasks):
-            sage_table = pd.concat([sage_table, future.result()], ignore_index=True)
+        RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True)
+
+        sage_table = pd.DataFrame(RESULT, index=None)
 
         sage_table = compute_entrapment_qvalues(sage_table)
         return sage_table
@@ -494,9 +499,7 @@ def compute_entrapment_qvalues(sage_data_frame: pd.DataFrame) -> pd.DataFrame:
 
     :param sage_data_frame: panda data frame with the sage results
     """
-    # Use inplace=True for sorting, Currently using hyper score due to unavailable LDA
-    # https://github.com/theGreatHerrLebert/sagepy/issues/13
-    sage_data_frame.sort_values(by='hyperscore', ascending=False, inplace=True)
+    sage_data_frame.sort_values(by='re_score', ascending=False, inplace=True)
 
     # Use a more descriptive name for columns
     sage_data_frame['is_entrap'] = sage_data_frame['proteins'].apply(
@@ -561,7 +564,8 @@ def acceptance_probability(old_value, new_value, temperature):
     for ft in precursor_tolerances:
         sage_table = sagerunner.run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft,
                                          fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                         sage_config_file=sage_config_file, use_file_values=False, n_process=4)
+                                         sage_config_file=sage_config_file, use_file_values=False,
+                                         skip_spectra_preprocess=True)
         new_value = compute_best_combination(sage_table)
         results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value))
@@ -586,7 +590,8 @@ def acceptance_probability(old_value, new_value, temperature):
     for pt in precursor_tolerances:
         sage_table = sagerunner.run_sage(fragment_tolerance=ft, precursor_tolerance=pt,
                                          fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                         sage_config_file=sage_config_file, use_file_values=False)
+                                         sage_config_file=sage_config_file, skip_spectra_preprocess=True,
+                                         use_file_values=False)
         new_value = compute_best_combination(sage_table)
         results.append(get_stats_from_sage(sage_table, ft, pt, new_value))
@@ -620,7 +625,7 @@ def acceptance_probability(old_value, new_value, temperature):
     sage_table = sagerunner.run_sage(fragment_tolerance=fragment_tolerance,
                                      precursor_tolerance=precursor_tolerance,
                                      fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                     sage_config_file=sage_config_file,
+                                     sage_config_file=sage_config_file, skip_spectra_preprocess=True,
                                      use_file_values=False)
     new_value = compute_best_combination(sage_table)
     results.append(get_stats_from_sage(sage_table, fragment_tolerance, precursor_tolerance, new_value))
@@ -678,9 +683,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i
 
     if initial_fragment_tolerance is not None and initial_precursor_tolerance is not None:
         if os.path.exists(fasta_file):
-            logging.info(f"Generating indexed database from {fasta_file}.")
             sagerunner = SageRunner()
-            # sagerunner.generate_indexed_db(fasta_file, sage_config_file)
         else:
             logging.error(f"File {fasta_file} does not exist.")
             raise FileNotFoundError(f"File {fasta_file} does not exist.")
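The skip_spectra_preprocess flag added in the hunks above enables a parse-once, search-many pattern: the first run_sage call fills self.queries via parse_mzml, and every later call over the tolerance grid only re-scores the cached spectra against a fresh indexed database. A usage sketch (file names are placeholders, not from the patch):

    runner = SageRunner()
    common = dict(fragment_type="ppm", mzml_files=["sample.mzML"],
                  sage_config_file="sage.json", fasta_file="db.fasta",
                  use_file_values=False)
    # first call parses the mzML files and caches the processed spectra
    runner.run_sage(fragment_tolerance=10, precursor_tolerance=20, **common)
    # later calls skip parsing and re-score the cached spectra
    for pt in (10, 15, 30):
        sage_table = runner.run_sage(fragment_tolerance=10, precursor_tolerance=pt,
                                     skip_spectra_preprocess=True, **common)

Note that a second call without the flag would append the same spectra to self.queries again, so the list would need clearing if a fresh parse is ever intended.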
@@ -688,7 +691,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i
         sage_table = sagerunner.run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance),
                                          fragment_type, mzml_files, sage_config_file=sage_config_file,
                                          fasta_file=fasta_file,
-                                         use_file_values=False, n_process=4)
+                                         use_file_values=False)
         sage_table.to_csv("sage_table.csv", index=False)
         num_psms = compute_best_combination(sage_table)
         stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms)
@@ -815,9 +818,8 @@ def sage2PTMShepherd(sage_table):
     db = mass.Unimod()
     new_rows = []
     for _, row in sage_table.iterrows():
-        Spectrum = ".".join([os.path.splitext(row['file_name'])[0], row["spec_idx"].split("=")[-1],
-                             row["spec_idx"].split("=")[-1], str(row["charge"])])
-        SpectrumFile = row['file_name']
+        # spec_idx already follows <run>.<scan>.<scan>.<charge>; rsplit keeps dots in the run name intact
+        Spectrum = row["spec_idx"]
+        SpectrumFile = row["spec_idx"].rsplit(".", 3)[0]
         peptide = unimod_pattern.sub(repl="", string=row['peptide'])
         modifications = unimod_pattern.finditer(row["peptide"])
         modified_peptide = row["peptide"]
@@ -854,21 +856,21 @@ def sage2PTMShepherd(sage_table):
         Protein = row["proteins"]
         ProteinID = row["proteins"].split("|")[1]
         new_rows.append({"Spectrum": Spectrum, "Spectrum File": SpectrumFile,
-                        "Peptide": peptide,
-                        "Modified Peptide": modified_peptide,
-                        "Peptide Length": PeptideLength, "Charge": Charge,
-                        "Retention": Retention, "Observed Mass": ObservedMass,
-                        "Calibrated Observed Mass": CalibratedObservedMass,
-                        "Observed M/Z": ObservedMZ,
-                        "Calibrated Observed M/Z": CalibratedObservedMZ,
-                        "Calculated Peptide Mass": CalculatedPeptideMass,
-                        "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass,
-                        "Hyperscore": Hyperscore, "Nextscore": Nextscore,
-                        "Probability": Probability,
-                        "Number of Missed Cleavages": NumberofMissedCleavages,
-                        "Intensity": Intensity,
-                        "Assigned Modifications": AssignedModifications,
-                        "Protein": Protein, "Protein ID": ProteinID})
+                         "Peptide": peptide,
+                         "Modified Peptide": modified_peptide,
+                         "Peptide Length": PeptideLength, "Charge": Charge,
+                         "Retention": Retention, "Observed Mass": ObservedMass,
+                         "Calibrated Observed Mass": CalibratedObservedMass,
+                         "Observed M/Z": ObservedMZ,
+                         "Calibrated Observed M/Z": CalibratedObservedMZ,
+                         "Calculated Peptide Mass": CalculatedPeptideMass,
+                         "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass,
+                         "Hyperscore": Hyperscore, "Nextscore": Nextscore,
+                         "Probability": Probability,
+                         "Number of Missed Cleavages": NumberofMissedCleavages,
+                         "Intensity": Intensity,
+                         "Assigned Modifications": AssignedModifications,
+                         "Protein": Protein, "Protein ID": ProteinID})
 
     psm_df = pd.concat([psm_df, pd.DataFrame.from_records(new_rows)],
                        ignore_index=True)
@@ -889,4 +891,3 @@ def cli():
 
 if __name__ == "__main__":
     cli()
-
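For reference, the spec_idx built in parse_mzml follows the FragPipe-style <run>.<scan>.<scan>.<charge> convention, which is what lets sage2PTMShepherd recover both the spectrum name and the run name from a single field; rsplit keeps the parse correct even when the run name itself contains dots. A small self-contained check (the id string is made up):

    spec_idx = "20240902_HeLa.rep1.12345.12345.2"  # hypothetical id; note the dot inside the run name
    run, first_scan, last_scan, charge = spec_idx.rsplit(".", 3)
    assert run == "20240902_HeLa.rep1" and first_scan == "12345" and charge == "2"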