From 37943b91a492f8e76b8d8c8602a1914d27ad62d6 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Mon, 2 Sep 2024 20:51:10 +0800 Subject: [PATCH 1/3] replace sage command with sagepy --- enviroment.yml | 4 +- opt-params/opt_params.py | 323 ++++++++++++++++++++++++++++++++++----- 2 files changed, 290 insertions(+), 37 deletions(-) diff --git a/enviroment.yml b/enviroment.yml index b6715a3..94eecf9 100644 --- a/enviroment.yml +++ b/enviroment.yml @@ -8,7 +8,6 @@ channels: dependencies: - bioconda::thermorawfileparser=1.4.2 - bioconda::crux-toolkit - - bioconda::sage-proteomics - pandas - matplotlib - scikit-learn @@ -16,5 +15,8 @@ dependencies: - seaborn - numpy - pyarrow + - sagepy=0.2.20 + - pyteomics + - tqdm diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index 25540a6..eb92b15 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -7,13 +7,17 @@ from typing import Tuple, List, Dict import json import os - +from sagepy.core import EnzymeBuilder, SageSearchConfiguration, validate_mods, validate_var_mods, \ + Scorer, RawSpectrum, SpectrumProcessor, Precursor, Tolerance, IonType import numpy as np import pandas as pd -import subprocess import matplotlib.pyplot as plt from pandas import DataFrame import click +from pyteomics import mzml +import concurrent.futures as cf + +from tqdm import tqdm logging.basicConfig(level=logging.INFO) @@ -22,58 +26,296 @@ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) +indexed_db = None + + +def generate_indexed_db(fasta_path, sage_config_file): + if sage_config_file is None: + raise ValueError("The sage config file is required.") + + with open(sage_config_file) as f: + param_data = json.load(f) + cleave_at = param_data["database"]["enzyme"]["cleave_at"] + missed_cleavages = param_data["database"]["enzyme"]["missed_cleavages"] + min_len = param_data["database"]["enzyme"]["min_len"] + max_len = param_data["database"]["enzyme"]["max_len"] + restrict = param_data["database"]["enzyme"]["restrict"] + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + peptide_min_mass = param_data["database"]["peptide_min_mass"] + peptide_max_mass = param_data["database"]["peptide_max_mass"] + ion_kinds = param_data["database"]["ion_kinds"] + min_ion_index = param_data["database"]["min_ion_index"] + max_variable_mods = param_data["database"]["max_variable_mods"] + generate_decoys = param_data["database"]["generate_decoys"] + decoy_tag = param_data["database"]["decoy_tag"] + + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=missed_cleavages, + min_len=min_len, + max_len=max_len, + cleave_at=cleave_at, + restrict=restrict, + c_terminal=True + ) + + # # generate static cysteine modification + # static_mods = {k: v for k, v in [SAGE_KNOWN_MODS.cysteine_static()]} + # + # # generate variable methionine modification + # variable_mods = {k: v for k, v in [SAGE_KNOWN_MODS.methionine_variable()]} + + # generate SAGE compatible mod representations + static = validate_mods(static_mods) + variab = validate_var_mods(variable_mods) + + with open(fasta_path, 'r') as infile: + fasta = infile.read() + + # generate IonType class + ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] + + # set-up a config for a sage-database + sage_config = SageSearchConfiguration( 
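# A few notes on the configuration being assembled here:
# - fasta is the raw FASTA text read from fasta_path above, not the path itself.
# - static and variab are the Sage-compatible modification maps returned by
#   validate_mods / validate_var_mods.
# - ion_kinds has already been converted to sagepy IonType handles via
#   IonType(ion).get_py_ptr().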
+ fasta=fasta, + static_mods=static, + variable_mods=variab, + enzyme_builder=enzyme_builder, + fragment_min_mz=fragment_min_mz, + fragment_max_mz=fragment_max_mz, + peptide_min_mass=peptide_min_mass, + peptide_max_mass=peptide_max_mass, + ion_kinds=ion_kinds, + generate_decoys=generate_decoys, + decoy_tag=decoy_tag, + min_ion_index=min_ion_index, + max_variable_mods=max_variable_mods, + bucket_size=int(np.power(2, 14)) + ) + + # generate the database for searching against + global indexed_db + indexed_db = sage_config.generate_indexed_database() + + +def mass_to_mod(mass: float) -> str: + """ Convert a mass to a UNIMOD modification annotation. + + Args: + mass: a mass in Da + + Returns: + a UNIMOD modification annotation + """ + maybe_key = int(np.round(mass)) + mod_dict = { + 1: '[UNIMOD:7]', + 42: '[UNIMOD:1]', + 57: '[UNIMOD:4]', + 80: '[UNIMOD:21]', + 16: '[UNIMOD:35]', + 119: '[UNIMOD:312]', + } + # try to translate to UNIMOD annotation + try: + return mod_dict[maybe_key] + except KeyError: + raise KeyError(f"Rounded mass not in dict: {maybe_key}") + + +# Adapted from https://github.com/theGreatHerrLebert/sagepy/blob/v0.2.20-alpha/sagepy/sagepy/core/peptide.py#L125C1 +# Will update in next release https://github.com/theGreatHerrLebert/sagepy/pull/12/files +def to_unimod_sequence(modifications, sequence) -> str: + """ Get Peptide sequence with UNIMOD modification annotations. + + Returns: + str: Peptide sequence with UNIMOD modification annotations. + """ + + seq = '' + + for i, (s, m) in enumerate(zip(sequence, modifications)): + if m != 0: + if i == 0: + if mass_to_mod(m) == '[UNIMOD:1]': + seq += f'{mass_to_mod(m)}{s}' + else: + seq += f'{s}{mass_to_mod(m)}' + else: + seq += f'{s}{mass_to_mod(m)}' + else: + seq += s + + return seq + + +def run_one_raw(spec_file, param_data): + global indexed_db + + raw_spectrums = mzml.MzML(spec_file) + res = [] + + # loading parameters + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + deisotope_bool = param_data["deisotope"] + min_matched_peaks = param_data["min_matched_peaks"] + min_isotope_err = param_data["min_isotope_err"] + max_isotope_err = param_data["max_isotope_err"] + min_precursor_charge = param_data["min_precursor_charge"] + max_precursor_charge = param_data["max_precursor_charge"] + chimera = param_data["chimera"] + report_psms = param_data["report_psms"] + annotate_matches = param_data["annotate_matches"] + max_fragment_charge = param_data["max_fragment_charge"] + + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) + else: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) + + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) + else: + fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) + + # Begin search and score + for spectrum in tqdm(raw_spectrums): + if spectrum["ms level"] == 1: + continue + selected_ion = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0][ + "selected ion m/z"] + charge = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0]["charge state"] + mz = spectrum["m/z array"].astype(np.float32) + intensity = spectrum["intensity 
array"].astype(np.float32) + precursor = Precursor( + charge=charge, + mz=selected_ion, + ) + raw_spectrum = RawSpectrum( + file_id=1, + spec_id=spectrum["id"], + total_ion_current=spectrum["total ion current"], + precursors=[precursor], + mz=mz, + intensity=intensity + ) + + spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + max_fragment_mz=fragment_max_mz, + deisotope=deisotope_bool) + query = spec_processor.process(raw_spectrum) + scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, + precursor_tolerance=precursor_tolerance, + fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, + max_isotope_err=max_isotope_err, + min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, + min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, + annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge) + + results = scorer.score(db=indexed_db, spectrum=query) + for feature in results: + peptide = indexed_db[feature.peptide_idx] + sequence = peptide.sequence + modifications = peptide.modifications + unimod_sequence = to_unimod_sequence(modifications, sequence) + precursor_ppm = abs(feature.expmass - feature.calcmass - feature.isotope_error) * 2e6 / (feature.expmass + feature.calcmass- feature.isotope_error) + res.append({"file_id": feature.file_id, "spec_id": feature.spec_id, "peptide": unimod_sequence, + "proteins": ";".join(peptide.proteins), "rank": feature.rank, + "label": feature.label, "exp.mass": feature.expmass, "cal.mass": feature.calcmass, + "charge": feature.charge, "retention time": feature.rt, + "precursor_ppm": precursor_ppm, "fragment_ppm": feature.average_ppm, + "delta mass": feature.delta_mass, "isotope error": feature.isotope_error, + "hyperscore": feature.hyperscore, "delta_next": feature.delta_next, + "delta_best": feature.delta_best, + "matched peaks": feature.matched_peaks, "missed cleavages": feature.missed_cleavages, + "scored candidates": feature.scored_candidates, + "sage_discriminant_score": feature.discriminant_score, + "posterior error": feature.posterior_error, + "spectrum_q": feature.spectrum_q, "peptide q": feature.peptide_q, + "protein q": feature.protein_q, + "ms2 intensity": feature.ms2_intensity}) + + res = pd.DataFrame(res, index=None) + return res + def run_sage(fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", - mzml_files: list = [], fasta_path: str = "", sage_config_file: str = None, - use_file_values: bool = True) -> DataFrame: + mzml_files: list = [], sage_config_file: str = None, + use_file_values: bool = True, n_process=4) -> DataFrame: if sage_config_file is None: raise ValueError("The sage config file is required.") with open(sage_config_file) as f: - data = json.load(f) + param_data = json.load(f) + + # loading and set default parameters + param_data["database"]["fragment_min_mz"] = param_data["database"]["fragment_min_mz"] if "fragment_min_mz" in \ + param_data[ + "database"] else 150 + param_data["database"]["fragment_max_mz"] = param_data["database"]["fragment_max_mz"] if "fragment_max_mz" in \ + param_data[ + "database"] else 2000 + param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True + param_data["min_matched_peaks"] = param_data["min_matched_peaks"] if "min_matched_peaks" in param_data else 6 + param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 + 
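The precursor_ppm expression in run_one_raw above references the mass error to the mean of the observed and calculated masses, which is where the factor of 2e6 comes from: err / ((m_exp + m_calc) / 2) * 1e6 equals err * 2e6 / (m_exp + m_calc). A self-contained check with toy masses (the helper name is illustrative, not part of the patch):

def precursor_ppm(expmass: float, calcmass: float, isotope_error: float = 0.0) -> float:
    # Mirrors the expression used in run_one_raw: ppm error referenced to the
    # mean of the observed and (isotope-corrected) calculated mass.
    err = abs(expmass - calcmass - isotope_error)
    return err * 2e6 / (expmass + calcmass - isotope_error)

# A 5 mDa error on a roughly 1000 Da peptide is about 5 ppm:
print(round(precursor_ppm(1000.005, 1000.000), 3))  # ~5.0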
param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 + param_data["min_precursor_charge"] = param_data[ + "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 + param_data["max_precursor_charge"] = param_data[ + "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 + param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False + param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 + param_data["annotate_matches"] = param_data["annotate_matches"] if "annotate_matches" in param_data else False + param_data["max_fragment_charge"] = param_data[ + "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: - data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) if fragment_type == "ppm": - data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) else: - data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) + else: logging.info("Using the values from the file.") - if "ppm" in data["precursor_tol"]: - precursor_tolerance = data["precursor_tol"]["ppm"][1] + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = param_data["precursor_tol"]["ppm"][1] else: - precursor_tolerance = data["precursor_tol"]["da"][1] + precursor_tolerance = param_data["precursor_tol"]["da"][1] - if "ppm" in data["fragment_tol"]: - fragment_tolerance = data["fragment_tol"]["ppm"][1] + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = param_data["fragment_tol"]["ppm"][1] else: - fragment_tolerance = data["fragment_tol"]["da"][1] + fragment_tolerance = param_data["fragment_tol"]["da"][1] - data["mzml_paths"] = mzml_files - - if os.path.exists(fasta_path): - data["database"]["fasta"] = fasta_path - else: - logging.error(f"File {fasta_path} does not exist.") - raise FileNotFoundError(f"File {fasta_path} does not exist.") + param_data["mzml_paths"] = mzml_files temp_sage_file = str(uuid.uuid4()) + ".json" with open(temp_sage_file, "w") as f: - json.dump(data, f, indent=4) + json.dump(param_data, f, indent=4) logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, precursor_tolerance)) - result = subprocess.run(["sage", temp_sage_file, "--write-pin"], capture_output=True, text=True) - os.remove(temp_sage_file) + sage_table = pd.DataFrame() + tasks = [] - if result.returncode != 0: - logging.error("Error running SAGE.") - logging.error(result.stderr) - raise ValueError("Error running SAGE.") + with cf.ProcessPoolExecutor(n_process) as pp: + for m in mzml_files: + tasks.append(pp.submit(run_one_raw, m, param_data)) + + for future in cf.as_completed(tasks): + sage_table = pd.concat([sage_table, future.result()], ignore_index=True) - sage_table = 
pd.read_csv("results.sage.tsv", sep="\t") sage_table = compute_entrapment_qvalues(sage_table) return sage_table @@ -214,8 +456,8 @@ def acceptance_probability(old_value, new_value, temperature): grid_best_value = 1 for ft in precursor_tolerances: sage_table = run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=False) + fragment_type=fragment_type, mzml_files=mzml_files, + sage_config_file=sage_config_file, use_file_values=False, n_process=4) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value)) @@ -239,7 +481,7 @@ def acceptance_probability(old_value, new_value, temperature): for ft in fragment_tolerances: for pt in precursor_tolerances: sage_table = run_sage(fragment_tolerance=ft, precursor_tolerance=pt, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, + fragment_type=fragment_type, mzml_files=mzml_files, sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, ft, pt, new_value)) @@ -273,7 +515,7 @@ def acceptance_probability(old_value, new_value, temperature): min(max_precursor_tolerance, current_precursor_tolerance + search_radius)) sage_table = run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, - fragment_type=fragment_type, mzml_files=mzml_files, fasta_path=fasta_file, + fragment_type=fragment_type, mzml_files=mzml_files, sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) @@ -304,7 +546,7 @@ def acceptance_probability(old_value, new_value, temperature): @click.option("--max-precursor-tolerance", default=50, help="The maximum precursor tolerance to consider.", type=int) @click.option("--fasta-file", default="Homo-sapiens-uniprot-reviewed-contaminants-entrap-decoy-20240615.fasta", help="The path to the fasta file to use for the SAGE analysis.") -@click.option("--sage-config-file", default="sage-general.json", +@click.option("--sage-config-file", default="general-sage.json", help="The path to the Sage config file to use for the SAGE analysis.") @click.option("--max-iterations", default=10, help="The maximum number of iterations to run the optimization.", type=int) @@ -330,9 +572,18 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i sage_config_file = "general-sage.json" if initial_fragment_tolerance is not None and initial_precursor_tolerance is not None: + + if os.path.exists(fasta_file): + logging.info(f"Generating indexed database from {fasta_file}.") + generate_indexed_db(fasta_file, sage_config_file) + else: + logging.error(f"File {fasta_file} does not exist.") + raise FileNotFoundError(f"File {fasta_file} does not exist.") + sage_table = run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), fragment_type, - mzml_files, fasta_path=fasta_file, sage_config_file=sage_config_file, - use_file_values=False) + mzml_files, sage_config_file=sage_config_file, + use_file_values=False, n_process=4) + sage_table.to_csv("sage_table.csv", index=False) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms) results.append(stats) From 31084a54350b658655c04ca42821daa83aaa04a0 Mon Sep 17 00:00:00 
2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sat, 14 Sep 2024 10:45:22 +0800 Subject: [PATCH 2/3] update --- opt-params/general-sage.json | 7 +- opt-params/opt_params.py | 721 ++++++++++++++++++++++------------- 2 files changed, 455 insertions(+), 273 deletions(-) diff --git a/opt-params/general-sage.json b/opt-params/general-sage.json index 204344b..58b59e9 100644 --- a/opt-params/general-sage.json +++ b/opt-params/general-sage.json @@ -19,11 +19,10 @@ "min_ion_index": 2, "max_variable_mods": 3, "static_mods": { - "C": 57.0215 + "C": "[UNIMOD:4]" }, "variable_mods": { - "M": [15.994], - "N": [0.984] + "M": ["[UNIMOD:35]"] }, "generate_decoys": false, "decoy_tag": "DECOY_", @@ -59,6 +58,8 @@ "max_peaks": 300, "max_fragment_charge": 1, "min_matched_peaks": 4, + "annotate_matches": true, + "search_thread": 4, "mzml_paths": [ "./20150901_QEp1_LC7_PhGe_SA_Plate1E_45_6.mzML", "./20150618_QEp1_LC7_PhGe_SA_Plate1B_45_1.mzML", diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index eb92b15..cc5926c 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -9,16 +9,19 @@ import os from sagepy.core import EnzymeBuilder, SageSearchConfiguration, validate_mods, validate_var_mods, \ Scorer, RawSpectrum, SpectrumProcessor, Precursor, Tolerance, IonType +from sagepy.qfdr.tdc import target_decoy_competition_pandas +from sagepy.utility import mean_ppm, median_ppm +from sagepy.core.ml.pep import calculate_pep +from sagepy.rescore.lda import rescore_lda + import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas import DataFrame import click -from pyteomics import mzml +from pyteomics import mzml, mass import concurrent.futures as cf -from tqdm import tqdm - logging.basicConfig(level=logging.INFO) folder_plots_uui = "" @@ -29,12 +32,16 @@ indexed_db = None -def generate_indexed_db(fasta_path, sage_config_file): - if sage_config_file is None: - raise ValueError("The sage config file is required.") +class SageRunner: + def __init__(self): + self.indexed_db = None - with open(sage_config_file) as f: - param_data = json.load(f) + def generate_indexed_db(self, fasta_path, param_data): + # if sage_config_file is None: + # raise ValueError("The sage config file is required.") + # + # with open(sage_config_file) as f: + # param_data = json.load(f) cleave_at = param_data["database"]["enzyme"]["cleave_at"] missed_cleavages = param_data["database"]["enzyme"]["missed_cleavages"] min_len = param_data["database"]["enzyme"]["min_len"] @@ -52,272 +59,366 @@ def generate_indexed_db(fasta_path, sage_config_file): generate_decoys = param_data["database"]["generate_decoys"] decoy_tag = param_data["database"]["decoy_tag"] - # configure a trypsin-like digestor of fasta files - enzyme_builder = EnzymeBuilder( - missed_cleavages=missed_cleavages, - min_len=min_len, - max_len=max_len, - cleave_at=cleave_at, - restrict=restrict, - c_terminal=True - ) - - # # generate static cysteine modification - # static_mods = {k: v for k, v in [SAGE_KNOWN_MODS.cysteine_static()]} - # - # # generate variable methionine modification - # variable_mods = {k: v for k, v in [SAGE_KNOWN_MODS.methionine_variable()]} - - # generate SAGE compatible mod representations - static = validate_mods(static_mods) - variab = validate_var_mods(variable_mods) - - with open(fasta_path, 'r') as infile: - fasta = infile.read() - - # generate IonType class - ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] - - # set-up a config for a sage-database - sage_config = 
SageSearchConfiguration( - fasta=fasta, - static_mods=static, - variable_mods=variab, - enzyme_builder=enzyme_builder, - fragment_min_mz=fragment_min_mz, - fragment_max_mz=fragment_max_mz, - peptide_min_mass=peptide_min_mass, - peptide_max_mass=peptide_max_mass, - ion_kinds=ion_kinds, - generate_decoys=generate_decoys, - decoy_tag=decoy_tag, - min_ion_index=min_ion_index, - max_variable_mods=max_variable_mods, - bucket_size=int(np.power(2, 14)) - ) - - # generate the database for searching against - global indexed_db - indexed_db = sage_config.generate_indexed_database() - - -def mass_to_mod(mass: float) -> str: - """ Convert a mass to a UNIMOD modification annotation. - - Args: - mass: a mass in Da - - Returns: - a UNIMOD modification annotation - """ - maybe_key = int(np.round(mass)) - mod_dict = { - 1: '[UNIMOD:7]', - 42: '[UNIMOD:1]', - 57: '[UNIMOD:4]', - 80: '[UNIMOD:21]', - 16: '[UNIMOD:35]', - 119: '[UNIMOD:312]', - } - # try to translate to UNIMOD annotation - try: - return mod_dict[maybe_key] - except KeyError: - raise KeyError(f"Rounded mass not in dict: {maybe_key}") + # configure a trypsin-like digestor of fasta files + enzyme_builder = EnzymeBuilder( + missed_cleavages=missed_cleavages, + min_len=min_len, + max_len=max_len, + cleave_at=cleave_at, + restrict=restrict, + c_terminal=True + ) + with open(fasta_path, 'r') as infile: + fasta = infile.read() + + # generate IonType class + ion_kinds = [IonType(ion).get_py_ptr() for ion in ion_kinds] + + # set-up a config for a sage-database + sage_config = SageSearchConfiguration( + fasta=fasta, + static_mods=static_mods, + variable_mods=variable_mods, + enzyme_builder=enzyme_builder, + fragment_min_mz=fragment_min_mz, + fragment_max_mz=fragment_max_mz, + peptide_min_mass=peptide_min_mass, + peptide_max_mass=peptide_max_mass, + ion_kinds=ion_kinds, + generate_decoys=generate_decoys, + decoy_tag=decoy_tag, + min_ion_index=min_ion_index, + max_variable_mods=max_variable_mods, + bucket_size=int(np.power(2, 14)) + ) -# Adapted from https://github.com/theGreatHerrLebert/sagepy/blob/v0.2.20-alpha/sagepy/sagepy/core/peptide.py#L125C1 -# Will update in next release https://github.com/theGreatHerrLebert/sagepy/pull/12/files -def to_unimod_sequence(modifications, sequence) -> str: - """ Get Peptide sequence with UNIMOD modification annotations. + # generate the database for searching against + indexed_db = sage_config.generate_indexed_database() + return indexed_db + + def peptide_spectrum_match_list_to_pandas(self, + psms, indexed_db, + re_score: bool = False, + use_sequence_as_match_idx: bool = True) -> pd.DataFrame: + """Convert a list of peptide spectrum matches to a pandas dataframe + + Args: + psms (List[PeptideSpectrumMatch]): The peptide spectrum matches + re_score (bool, optional): Should re-score be used. Defaults to False. + use_sequence_as_match_idx (bool, optional): Should the sequence be used as the match index. Defaults to True. + + Returns: + pd.DataFrame: The pandas dataframe + """ + row_list = [] + + for match in psms: + if match.retention_time_predicted is not None and match.projected_rt is not None: + delta_rt = match.retention_time_predicted - match.projected_rt + else: + delta_rt = None - Returns: - str: Peptide sequence with UNIMOD modification annotations. 
- """ + if re_score: + score = match.re_score + else: + score = match.hyper_score + + if use_sequence_as_match_idx: + match_idx = match.sequence + else: + match_idx = str(match.peptide_idx) - seq = '' + if match.inverse_mobility_predicted is not None: + delta_ims = match.inverse_mobility_predicted - match.inverse_mobility_observed + else: + delta_ims = None - for i, (s, m) in enumerate(zip(sequence, modifications)): - if m != 0: - if i == 0: - if mass_to_mod(m) == '[UNIMOD:1]': - seq += f'{mass_to_mod(m)}{s}' - else: - seq += f'{s}{mass_to_mod(m)}' + if match.beta_score is not None: + beta_score = match.beta_score else: - seq += f'{s}{mass_to_mod(m)}' + beta_score = None + + peptide = indexed_db[match.peptide_idx] + position = peptide.position + modifications = peptide.modifications + precursor_ppm = abs(match.mono_mass_observed - match.mono_mass_calculated - match.isotope_error) * 2e6 / ( + match.mono_mass_observed + match.mono_mass_calculated - match.isotope_error) + + row_list.append({ + "spec_idx": match.spec_idx, + "match_idx": match_idx, + "proteins": ";".join(match.proteins), + "decoy": match.decoy, + "score": score, + "re_score": match.re_score, + "hyperscore": match.hyper_score, + "rank": match.rank, + "mono_mz_calculated": match.mono_mz_calculated, + "mono_mass_observed": match.mono_mass_observed, + "mono_mass_calculated": match.mono_mass_calculated, + "delta_mass": match.mono_mass_calculated - match.mono_mass_observed, + "isotope_error": match.isotope_error, + "fragment_ppm": match.average_ppm, + "precursor_ppm": precursor_ppm, + "delta_next": match.delta_next, + "delta_best": match.delta_best, + "matched_peaks": match.matched_peaks, + "longest_b": match.longest_b, + "longest_y": match.longest_y, + "longest_y_pct": match.longest_y_pct, + "missed_cleavages": match.missed_cleavages, + "matched_intensity_pct": match.matched_intensity_pct, + "scored_candidates": match.scored_candidates, + "poisson": match.poisson, + "modifications": modifications, + "charge": match.charge, + "retention_time_observed": match.retention_time_observed, + "retention_time_predicted": match.retention_time_predicted, + "delta_rt": delta_rt, + "inverse_mobility_observed": match.inverse_mobility_observed, + "inverse_mobility_predicted": match.inverse_mobility_predicted, + "delta_ims": delta_ims, + "intensity_ms1": match.intensity_ms1, + "intensity_ms2": match.intensity_ms2, + "q_value": match.q_value, + "collision_energy": match.collision_energy, + "cosine_similarity": match.cosine_similarity, + "projected_rt": match.projected_rt, + "beta_score": beta_score + }) + + return pd.DataFrame(row_list) + + def run_one_raw(self, spec_file, param_data, fasta_file): + + raw_spectrums = mzml.MzML(spec_file) + + # loading parameters + fragment_min_mz = param_data["database"]["fragment_min_mz"] + fragment_max_mz = param_data["database"]["fragment_max_mz"] + deisotope_bool = param_data["deisotope"] + static_mods = param_data["database"]["static_mods"] + variable_mods = param_data["database"]["variable_mods"] + min_matched_peaks = param_data["min_matched_peaks"] + min_isotope_err = param_data["min_isotope_err"] + max_isotope_err = param_data["max_isotope_err"] + min_precursor_charge = param_data["min_precursor_charge"] + max_precursor_charge = param_data["max_precursor_charge"] + chimera = param_data["chimera"] + report_psms = param_data["report_psms"] + annotate_matches = param_data["annotate_matches"] + max_fragment_charge = param_data["max_fragment_charge"] + num_threads = param_data["search_thread"] + + if "ppm" in 
param_data["precursor_tol"]: + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) else: - seq += s - - return seq - - -def run_one_raw(spec_file, param_data): - global indexed_db - - raw_spectrums = mzml.MzML(spec_file) - res = [] - - # loading parameters - fragment_min_mz = param_data["database"]["fragment_min_mz"] - fragment_max_mz = param_data["database"]["fragment_max_mz"] - deisotope_bool = param_data["deisotope"] - min_matched_peaks = param_data["min_matched_peaks"] - min_isotope_err = param_data["min_isotope_err"] - max_isotope_err = param_data["max_isotope_err"] - min_precursor_charge = param_data["min_precursor_charge"] - max_precursor_charge = param_data["max_precursor_charge"] - chimera = param_data["chimera"] - report_psms = param_data["report_psms"] - annotate_matches = param_data["annotate_matches"] - max_fragment_charge = param_data["max_fragment_charge"] - - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) - else: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) + precursor_tolerance = Tolerance( + ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) - else: - fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - - # Begin search and score - for spectrum in tqdm(raw_spectrums): - if spectrum["ms level"] == 1: - continue - selected_ion = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0][ - "selected ion m/z"] - charge = spectrum["precursorList"]["precursor"][0]["selectedIonList"]["selectedIon"][0]["charge state"] - mz = spectrum["m/z array"].astype(np.float32) - intensity = spectrum["intensity array"].astype(np.float32) - precursor = Precursor( - charge=charge, - mz=selected_ion, - ) - raw_spectrum = RawSpectrum( - file_id=1, - spec_id=spectrum["id"], - total_ion_current=spectrum["total ion current"], - precursors=[precursor], - mz=mz, - intensity=intensity - ) + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = Tolerance( + ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) + else: + fragment_tolerance = Tolerance( + ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, - max_fragment_mz=fragment_max_mz, - deisotope=deisotope_bool) - query = spec_processor.process(raw_spectrum) scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, precursor_tolerance=precursor_tolerance, fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, max_isotope_err=max_isotope_err, min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, - annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge) + annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge, + static_mods=static_mods, variable_mods=variable_mods + ) + + # Begin search and score + querys = [] + + for spectrum in raw_spectrums: + if spectrum["ms level"] == 1: + continue + 
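# Note on the tolerance branches above: both the removed and the re-indented Da
# branches build the window with Tolerance(ppm=...) even though the bounds come
# from the "da" entry. Judging from the Tolerance(da=(...)) form used in
# run_sage, the Da branches are presumably meant to read:
#
#     precursor_tolerance = Tolerance(
#         da=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1]))
#     fragment_tolerance = Tolerance(
#         da=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1]))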
precursor = spectrum['precursorList']['precursor'][0]['selectedIonList']['selectedIon'][0] + precursor_mz = precursor["selected ion m/z"] + precursor_charge = precursor["charge state"] + precursor_intensity = precursor.get('peak intensity', None) + collision_energy = spectrum['precursorList']['precursor'][0]['activation']['collision energy'] + injection_time = spectrum['scanList']['scan'][0]['ion injection time'] + retention_time = spectrum['scanList']['scan'][0]['scan start time'] + + # Extract the relevant metadata + isolation_window = spectrum['precursorList']['precursor'][0]['isolationWindow'] + + lower = isolation_window['isolation window target m/z'] - isolation_window[ + 'isolation window lower offset'] + upper = isolation_window['isolation window target m/z'] + isolation_window[ + 'isolation window upper offset'] + + mz = spectrum["m/z array"].astype(np.float32) + intensity = spectrum["intensity array"].astype(np.float32) + # set selection window bounds + tolerance = Tolerance(da=(lower, upper)) + precursor = Precursor( + charge=precursor_charge, + mz=precursor_mz, + intensity=precursor_intensity, + isolation_window=tolerance, + collision_energy=collision_energy + ) + raw_spectrum = RawSpectrum( + file_id=1, + spec_id=spectrum["id"], + total_ion_current=spectrum["total ion current"], + ion_injection_time=injection_time, + scan_start_time=retention_time, + precursors=[precursor], + mz=mz, + intensity=intensity + ) + + spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + max_fragment_mz=fragment_max_mz, + deisotope=deisotope_bool) + + query = spec_processor.process(raw_spectrum) + querys.append(query) + + indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data) + results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=querys, num_threads=num_threads) + # Convert the list of dictionaries to a pandas DataFrame + # exp_data = pd.DataFrame(d) + + psm_list = [] + for _, values in results.items(): + psm_list.extend(values) + + psm_list_rescore = rescore_lda(psm_list, verbose=False) + # print(psm_list_rescore[0]) + psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db) + + # TARGET = psm_list_df[psm_list_df.decoy == False] + # DECOY = psm_list_df[psm_list_df.decoy] + + # plt.figure(figsize=(6, 4), dpi=150) + # + # plt.hist(TARGET.hyper_score, bins="auto", label="target") + # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy") + # plt.legend() + # plt.title("Target and decoy score distributions") + # plt.xlabel("Score") + # plt.ylabel("Count") + # plt.show() + + TDC = target_decoy_competition_pandas( + df=psm_list_df, + # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level) + method="psm", + score="hyperscore" + ) - results = scorer.score(db=indexed_db, spectrum=query) - for feature in results: - peptide = indexed_db[feature.peptide_idx] - sequence = peptide.sequence - modifications = peptide.modifications - unimod_sequence = to_unimod_sequence(modifications, sequence) - precursor_ppm = abs(feature.expmass - feature.calcmass - feature.isotope_error) * 2e6 / (feature.expmass + feature.calcmass- feature.isotope_error) - res.append({"file_id": feature.file_id, "spec_id": feature.spec_id, "peptide": unimod_sequence, - "proteins": ";".join(peptide.proteins), "rank": feature.rank, - "label": feature.label, "exp.mass": feature.expmass, "cal.mass": feature.calcmass, - "charge": feature.charge, "retention time": 
feature.rt, - "precursor_ppm": precursor_ppm, "fragment_ppm": feature.average_ppm, - "delta mass": feature.delta_mass, "isotope error": feature.isotope_error, - "hyperscore": feature.hyperscore, "delta_next": feature.delta_next, - "delta_best": feature.delta_best, - "matched peaks": feature.matched_peaks, "missed cleavages": feature.missed_cleavages, - "scored candidates": feature.scored_candidates, - "sage_discriminant_score": feature.discriminant_score, - "posterior error": feature.posterior_error, - "spectrum_q": feature.spectrum_q, "peptide q": feature.peptide_q, - "protein q": feature.protein_q, - "ms2 intensity": feature.ms2_intensity}) - - res = pd.DataFrame(res, index=None) - return res - - -def run_sage(fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", - mzml_files: list = [], sage_config_file: str = None, - use_file_values: bool = True, n_process=4) -> DataFrame: - if sage_config_file is None: - raise ValueError("The sage config file is required.") - - with open(sage_config_file) as f: - param_data = json.load(f) - - # loading and set default parameters - param_data["database"]["fragment_min_mz"] = param_data["database"]["fragment_min_mz"] if "fragment_min_mz" in \ - param_data[ - "database"] else 150 - param_data["database"]["fragment_max_mz"] = param_data["database"]["fragment_max_mz"] if "fragment_max_mz" in \ - param_data[ - "database"] else 2000 - param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True - param_data["min_matched_peaks"] = param_data["min_matched_peaks"] if "min_matched_peaks" in param_data else 6 - param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 - param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 - param_data["min_precursor_charge"] = param_data[ - "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 - param_data["max_precursor_charge"] = param_data[ - "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 - param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False - param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 - param_data["annotate_matches"] = param_data["annotate_matches"] if "annotate_matches" in param_data else False - param_data["max_fragment_charge"] = param_data[ - "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 - - if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: - param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] - precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) - if fragment_type == "ppm": - param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] - fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) - else: - param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] - fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) + # add PEP to the PSMs + TDC["pep"] = calculate_pep( + scores=TDC.score.values, + decoys=TDC.decoy.values, + ) - else: - logging.info("Using the values from the file.") - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = param_data["precursor_tol"]["ppm"][1] - else: - precursor_tolerance = param_data["precursor_tol"]["da"][1] + # after target decoy 
competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01 + TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)] + RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC_filtered, + on=["spec_idx", "decoy", "match_idx"]) + RESULT.insert(loc=0, column="file_name", value=os.path.splitext(os.path.basename(spec_file))[0]) + RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True) + # RESULT.to_csv("./test_results.csv", index=False) + + res = pd.DataFrame(RESULT, index=None) + return res + + def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm", + mzml_files: list = [], sage_config_file: str = None, fasta_file: str = None, + use_file_values: bool = True, n_process=4) -> DataFrame: + if sage_config_file is None: + raise ValueError("The sage config file is required.") + + with open(sage_config_file) as f: + param_data = json.load(f) + + # loading and set default parameters + param_data["database"]["fragment_min_mz"] = param_data["database"][ + "fragment_min_mz"] if "fragment_min_mz" in \ + param_data[ + "database"] else 150 + param_data["database"]["fragment_max_mz"] = param_data["database"][ + "fragment_max_mz"] if "fragment_max_mz" in \ + param_data[ + "database"] else 2000 + param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True + param_data["min_matched_peaks"] = param_data[ + "min_matched_peaks"] if "min_matched_peaks" in param_data else 6 + param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1 + param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3 + param_data["min_precursor_charge"] = param_data[ + "min_precursor_charge"] if "min_precursor_charge" in param_data else 2 + param_data["max_precursor_charge"] = param_data[ + "max_precursor_charge"] if "max_precursor_charge" in param_data else 4 + param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False + param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1 + param_data["annotate_matches"] = param_data[ + "annotate_matches"] if "annotate_matches" in param_data else False + param_data["max_fragment_charge"] = param_data[ + "max_fragment_charge"] if "max_fragment_charge" in param_data else 1 + param_data["search_thread"] = param_data["search_thread"] if "search_thread" in param_data else 4 + + if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values: + param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)] + precursor_tolerance = Tolerance(ppm=(int(-1 * precursor_tolerance), int(precursor_tolerance))) + if fragment_type == "ppm": + param_data["fragment_tol"][fragment_type] = [int(-1 * fragment_tolerance), int(fragment_tolerance)] + fragment_tolerance = Tolerance(ppm=(int(-1 * fragment_tolerance), int(fragment_tolerance))) + else: + param_data["fragment_tol"][fragment_type] = [-1 * fragment_tolerance, fragment_tolerance] + fragment_tolerance = Tolerance(da=(-1 * fragment_tolerance, fragment_tolerance)) - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = param_data["fragment_tol"]["ppm"][1] else: - fragment_tolerance = param_data["fragment_tol"]["da"][1] + logging.info("Using the values from the file.") + if "ppm" in param_data["precursor_tol"]: + precursor_tolerance = param_data["precursor_tol"]["ppm"][1] + else: + precursor_tolerance = 
param_data["precursor_tol"]["da"][1] - param_data["mzml_paths"] = mzml_files + if "ppm" in param_data["fragment_tol"]: + fragment_tolerance = param_data["fragment_tol"]["ppm"][1] + else: + fragment_tolerance = param_data["fragment_tol"]["da"][1] - temp_sage_file = str(uuid.uuid4()) + ".json" - with open(temp_sage_file, "w") as f: - json.dump(param_data, f, indent=4) + param_data["mzml_paths"] = mzml_files - logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, - precursor_tolerance)) + temp_sage_file = str(uuid.uuid4()) + ".json" + with open(temp_sage_file, "w") as f: + json.dump(param_data, f, indent=4) - sage_table = pd.DataFrame() - tasks = [] + logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance, + precursor_tolerance)) - with cf.ProcessPoolExecutor(n_process) as pp: - for m in mzml_files: - tasks.append(pp.submit(run_one_raw, m, param_data)) + sage_table = pd.DataFrame() + tasks = [] + # for m in mzml_files: + # res = run_one_raw(m, param_data) + # sage_table = pd.concat([sage_table, res], ignore_index=True) - for future in cf.as_completed(tasks): - sage_table = pd.concat([sage_table, future.result()], ignore_index=True) + # multiple threads don't supported indexed database class. so currently generating indexed database separately + # for each file. But it only takes about 12s. + with cf.ProcessPoolExecutor(n_process) as pp: + for m in mzml_files: + tasks.append(pp.submit(self.run_one_raw, m, param_data, fasta_file)) - sage_table = compute_entrapment_qvalues(sage_table) - return sage_table + for future in cf.as_completed(tasks): + sage_table = pd.concat([sage_table, future.result()], ignore_index=True) + + sage_table = compute_entrapment_qvalues(sage_table) + return sage_table def extract_ptms(sage_table_target): @@ -393,8 +494,9 @@ def compute_entrapment_qvalues(sage_data_frame: pd.DataFrame) -> pd.DataFrame: :param sage_data_frame: panda data frame with the sage results """ - # Use inplace=True for sorting - sage_data_frame.sort_values(by='sage_discriminant_score', ascending=False, inplace=True) + # Use inplace=True for sorting, Currently using hyper score due to unavailable LDA + # https://github.com/theGreatHerrLebert/sagepy/issues/13 + sage_data_frame.sort_values(by='hyperscore', ascending=False, inplace=True) # Use a more descriptive name for columns sage_data_frame['is_entrap'] = sage_data_frame['proteins'].apply( @@ -433,13 +535,15 @@ def compute_best_combination(sage_table: pd.DataFrame) -> int: return len(sage_table_target['peptide']) -def combined_search(results: list = [], start_fragment_tolerance: int = 0, start_precursor_tolerance: int = 0, +def combined_search(sagerunner, results: list = [], start_fragment_tolerance: int = 0, + start_precursor_tolerance: int = 0, min_fragment_tolerance: int = 1, max_fragment_tolerance: int = 50, min_precursor_tolerance: int = 10, max_precursor_tolerance: int = 100, num_psms: int = 0, fragment_type: str = "ppm", mzml_files: list = [], grid_frag_steps: int = 10, - grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file="", initial_temp=100, + grid_prec_steps: int = 5, max_iterations=10, search_radius=1, fasta_file: str = None, + initial_temp=100, cooling_rate=0.95, sage_config_file=None) -> Tuple[int, int, List[Dict]]: def acceptance_probability(old_value, new_value, temperature): if new_value > old_value: @@ -455,9 +559,9 @@ def acceptance_probability(old_value, new_value, temperature): grid_best_value = 1 
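combined_search drives the tolerance optimisation with a simulated-annealing style loop: initial_temp=100, cooling_rate=0.95, and the acceptance_probability helper, whose body beyond the "return 1" for improvements is elided by the hunk context. A minimal self-contained sketch of the standard Metropolis rule this pattern suggests, with a toy objective standing in for the PSM count at 1% FDR:

import math
import random

def acceptance_probability(old_value, new_value, temperature):
    # Always accept improvements; accept worse moves with a probability that
    # decays as the temperature cools (standard Metropolis rule, assumed here).
    if new_value > old_value:
        return 1.0
    return math.exp((new_value - old_value) / temperature)

def anneal(objective, start, initial_temp=100.0, cooling_rate=0.95,
           max_iterations=10, search_radius=1):
    current, current_value = start, objective(start)
    temperature = initial_temp
    for _ in range(max_iterations):
        candidate = current + random.randint(-search_radius, search_radius)
        candidate_value = objective(candidate)
        if acceptance_probability(current_value, candidate_value, temperature) > random.random():
            current, current_value = candidate, candidate_value
        temperature *= cooling_rate
    return current, current_value

# Toy objective peaking at a tolerance of 20:
print(anneal(lambda t: -(t - 20) ** 2, start=10))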
for ft in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, use_file_values=False, n_process=4) + sage_table = sagerunner.run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=False, n_process=4) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value)) @@ -480,9 +584,9 @@ def acceptance_probability(old_value, new_value, temperature): for ft in fragment_tolerances: for pt in precursor_tolerances: - sage_table = run_sage(fragment_tolerance=ft, precursor_tolerance=pt, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=ft, precursor_tolerance=pt, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, ft, pt, new_value)) @@ -514,10 +618,10 @@ def acceptance_probability(old_value, new_value, temperature): max(min_precursor_tolerance, current_precursor_tolerance - search_radius), min(max_precursor_tolerance, current_precursor_tolerance + search_radius)) - sage_table = run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, - fragment_type=fragment_type, mzml_files=mzml_files, - sage_config_file=sage_config_file, - use_file_values=False) + sage_table = sagerunner.run_sage(fragment_tolerance=fragment_tolerance, precursor_tolerance=precursor_tolerance, + fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, + use_file_values=False) new_value = compute_best_combination(sage_table) results.append(get_stats_from_sage(sage_table, fragment_tolerance, precursor_tolerance, new_value)) @@ -575,14 +679,16 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i if os.path.exists(fasta_file): logging.info(f"Generating indexed database from {fasta_file}.") - generate_indexed_db(fasta_file, sage_config_file) + sagerunner = SageRunner() + # sagerunner.generate_indexed_db(fasta_file, sage_config_file) else: logging.error(f"File {fasta_file} does not exist.") raise FileNotFoundError(f"File {fasta_file} does not exist.") - sage_table = run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), fragment_type, - mzml_files, sage_config_file=sage_config_file, - use_file_values=False, n_process=4) + sage_table = sagerunner.run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance), + fragment_type, + mzml_files, sage_config_file=sage_config_file, fasta_file=fasta_file, + use_file_values=False, n_process=4) sage_table.to_csv("sage_table.csv", index=False) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms) @@ -591,7 +697,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i best_precursor_tolerance = initial_precursor_tolerance best_fragment_tolerance = initial_fragment_tolerance - best_fragment_tolerance, best_precursor_tolerance, results = 
combined_search(results=results, + best_fragment_tolerance, best_precursor_tolerance, results = combined_search(sagerunner, results=results, start_fragment_tolerance=best_fragment_tolerance, start_precursor_tolerance=best_precursor_tolerance, min_fragment_tolerance=initial_fragment_tolerance, @@ -676,9 +782,9 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): global folder_plots_uui folder_plots_uui = str(uuid.uuid4()) os.makedirs(folder_plots_uui) - - sage_table = run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_path=fasta_file, - sage_config_file=sage_config_file, use_file_values=True) + sagerunner = SageRunner() + sage_table = sagerunner.run_sage(fragment_type="ppm", mzml_files=mzml_files, fasta_file=fasta_file, + sage_config_file=sage_config_file, use_file_values=True) num_psms = compute_best_combination(sage_table) stats = get_stats_from_sage(sage_table, fragment_tolerance=fragment_tolerance, @@ -696,6 +802,80 @@ def ptms(mzml_path: str, fasta_file: str, sage_config_file: str): f.write(f"{key}\t{value}\n") +def sage2PTMShepherd(sage_table): + ptmsherd_schema = ["Spectrum", "Spectrum File", "Peptide", "Modified Peptide", "Peptide Length", "Charge", + "Retention", "Observed Mass", "Calibrated Observed Mass", + "Observed M/Z", "Calibrated Observed M/Z", "Calculated Peptide Mass", "Calculated M/Z", + "Delta Mass", "Expectation", "Hyperscore", "Nextscore", "Probability", + "Number of Missed Cleavages", "Intensity", + "Assigned Modifications", "Observed Modifications", + "Purity", "Protein", "Protein ID"] + psm_df = pd.DataFrame(columns=ptmsherd_schema) + unimod_pattern = re.compile(r"\[UNIMOD:\d+]") + db = mass.Unimod() + new_rows = [] + for _, row in sage_table.iterrows(): + Spectrum = ".".join([os.path.splitext(row['file_name'])[0], row["spec_idx"].split("=")[-1], + row["spec_idx"].split("=")[-1], str(row["charge"])]) + SpectrumFile = row['file_name'] + peptide = unimod_pattern.sub(repl="", string=row['peptide']) + modifications = unimod_pattern.finditer(row["peptide"]) + modified_peptide = row["peptide"] + AssignedModifications = [] + previous_idx = 0 + for modification in modifications: + position = modification.span()[0] - 1 - previous_idx + unimod_term = modification.group(0).replace("[UNIMOD:", "").replace("]", "") + mass_da = db.by_id(unimod_term)["mono_mass"] + AssignedModifications.append(str(position + 1) + str(peptide[position]) + "(" + str(mass_da) + ")") + MassResiduePlusMod = int(mass.fast_mass(peptide[position]) + mass_da - 18.001) + modified_peptide = modified_peptide.replace(peptide[position] + modification.group(0), + peptide[position] + "[" + str(MassResiduePlusMod) + "]") + previous_idx += len(modification.group(0)) + if modified_peptide == peptide: + modified_peptide = "" + AssignedModifications = ", ".join(AssignedModifications) + + PeptideLength = len(peptide) + Charge = row["charge"] + Retention = row["retention_time_observed"] * 60 + ObservedMass = row["mono_mass_observed"] + ObservedMZ = row["mono_mass_observed"] / row["charge"] + 1.007276 + CalibratedObservedMZ = ObservedMZ + CalculatedPeptideMass = row["mono_mass_calculated"] + CalibratedObservedMass = ObservedMass + CalculatedMZ = row["mono_mz_calculated"] + DeltaMass = row["delta_mass"] + Hyperscore = row["hyperscore"] + Probability = 1 - row["spectrum_q"] + Nextscore = row["delta_next"] + NumberofMissedCleavages = row["missed_cleavages"] + Intensity = row["intensity_ms1"] + Protein = row["proteins"] + ProteinID = row["proteins"].split("|")[1] + new_rows.append({"Spectrum": 
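# The "Spectrum" value built a few lines above follows the spectrum-ID
# convention PTM-Shepherd expects: <mzML basename>.<scan>.<scan>.<charge>,
# with the scan number taken from the "scan=" suffix of spec_idx.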
Spectrum, "Spectrum File": SpectrumFile, + "Peptide": peptide, + "Modified Peptide": modified_peptide, + "Peptide Length": PeptideLength, "Charge": Charge, + "Retention": Retention, "Observed Mass": ObservedMass, + "Calibrated Observed Mass": CalibratedObservedMass, + "Observed M/Z": ObservedMZ, + "Calibrated Observed M/Z": CalibratedObservedMZ, + "Calculated Peptide Mass": CalculatedPeptideMass, + "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass, + "Hyperscore": Hyperscore, "Nextscore": Nextscore, + "Probability": Probability, + "Number of Missed Cleavages": NumberofMissedCleavages, + "Intensity": Intensity, + "Assigned Modifications": AssignedModifications, + "Protein": Protein, "Protein ID": ProteinID}) + + psm_df = pd.concat([psm_df, pd.DataFrame.from_records(new_rows)], + ignore_index=True) + + psm_df.to_csv("inputPTMShepherd.tsv", index=False, sep="\t") + + @click.group(context_settings=CONTEXT_SETTINGS) def cli(): """ @@ -709,3 +889,4 @@ def cli(): if __name__ == "__main__": cli() + From 2f39fff96f82838d480ceed9b8e3fdf7345c1c65 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Sun, 15 Sep 2024 09:55:33 +0800 Subject: [PATCH 3/3] refine --- enviroment.yml | 3 +- opt-params/opt_params.py | 331 ++++++++++++++++++++------------------- 2 files changed, 168 insertions(+), 166 deletions(-) diff --git a/enviroment.yml b/enviroment.yml index 94eecf9..852f9ea 100644 --- a/enviroment.yml +++ b/enviroment.yml @@ -15,7 +15,8 @@ dependencies: - seaborn - numpy - pyarrow - - sagepy=0.2.20 + - sagepy=0.2.22 + - sagepy-connector=0.2.22 - pyteomics - tqdm diff --git a/opt-params/opt_params.py b/opt-params/opt_params.py index cc5926c..af3cf26 100644 --- a/opt-params/opt_params.py +++ b/opt-params/opt_params.py @@ -34,7 +34,7 @@ class SageRunner: def __init__(self): - self.indexed_db = None + self.queries = [] def generate_indexed_db(self, fasta_path, param_data): # if sage_config_file is None: @@ -190,54 +190,9 @@ def peptide_spectrum_match_list_to_pandas(self, return pd.DataFrame(row_list) - def run_one_raw(self, spec_file, param_data, fasta_file): - + def parse_mzml(self, spec_file, file_id, fragment_min_mz, fragment_max_mz, deisotope_bool): raw_spectrums = mzml.MzML(spec_file) - # loading parameters - fragment_min_mz = param_data["database"]["fragment_min_mz"] - fragment_max_mz = param_data["database"]["fragment_max_mz"] - deisotope_bool = param_data["deisotope"] - static_mods = param_data["database"]["static_mods"] - variable_mods = param_data["database"]["variable_mods"] - min_matched_peaks = param_data["min_matched_peaks"] - min_isotope_err = param_data["min_isotope_err"] - max_isotope_err = param_data["max_isotope_err"] - min_precursor_charge = param_data["min_precursor_charge"] - max_precursor_charge = param_data["max_precursor_charge"] - chimera = param_data["chimera"] - report_psms = param_data["report_psms"] - annotate_matches = param_data["annotate_matches"] - max_fragment_charge = param_data["max_fragment_charge"] - num_threads = param_data["search_thread"] - - if "ppm" in param_data["precursor_tol"]: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["ppm"][0], param_data["precursor_tol"]["ppm"][1])) - else: - precursor_tolerance = Tolerance( - ppm=(param_data["precursor_tol"]["da"][0], param_data["precursor_tol"]["da"][1])) - - if "ppm" in param_data["fragment_tol"]: - fragment_tolerance = Tolerance( - ppm=(param_data["fragment_tol"]["ppm"][0], param_data["fragment_tol"]["ppm"][1])) - else: - fragment_tolerance = 
Tolerance( - ppm=(param_data["fragment_tol"]["da"][0], param_data["fragment_tol"]["da"][1])) - - scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks, - precursor_tolerance=precursor_tolerance, - fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err, - max_isotope_err=max_isotope_err, - min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge, - min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera, - annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge, - static_mods=static_mods, variable_mods=variable_mods - ) - - # Begin search and score - querys = [] - for spectrum in raw_spectrums: if spectrum["ms level"] == 1: continue @@ -268,9 +223,11 @@ def run_one_raw(self, spec_file, param_data, fasta_file): isolation_window=tolerance, collision_energy=collision_energy ) + scan = re.findall(r"scan=(\d+)", spectrum["id"])[0] raw_spectrum = RawSpectrum( - file_id=1, - spec_id=spectrum["id"], + file_id=file_id, + spec_id=os.path.splitext(os.path.basename(spec_file))[0] + "." + str(scan) + "." + str( + scan) + "." + str(precursor_charge), total_ion_current=spectrum["total ion current"], ion_injection_time=injection_time, scan_start_time=retention_time, @@ -279,66 +236,18 @@ def run_one_raw(self, spec_file, param_data, fasta_file): intensity=intensity ) - spec_processor = SpectrumProcessor(take_top_n=75, min_fragment_mz=fragment_min_mz, + spec_processor = SpectrumProcessor(take_top_n=150, min_fragment_mz=fragment_min_mz, max_fragment_mz=fragment_max_mz, deisotope=deisotope_bool) query = spec_processor.process(raw_spectrum) - querys.append(query) - - indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data) - results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=querys, num_threads=num_threads) - # Convert the list of dictionaries to a pandas DataFrame - # exp_data = pd.DataFrame(d) - - psm_list = [] - for _, values in results.items(): - psm_list.extend(values) - - psm_list_rescore = rescore_lda(psm_list, verbose=False) - # print(psm_list_rescore[0]) - psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db) - - # TARGET = psm_list_df[psm_list_df.decoy == False] - # DECOY = psm_list_df[psm_list_df.decoy] - - # plt.figure(figsize=(6, 4), dpi=150) - # - # plt.hist(TARGET.hyper_score, bins="auto", label="target") - # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy") - # plt.legend() - # plt.title("Target and decoy score distributions") - # plt.xlabel("Score") - # plt.ylabel("Count") - # plt.show() - - TDC = target_decoy_competition_pandas( - df=psm_list_df, - # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level) - method="psm", - score="hyperscore" - ) + self.queries.append(query) - # add PEP to the PSMs - TDC["pep"] = calculate_pep( - scores=TDC.score.values, - decoys=TDC.decoy.values, - ) - - # after target decoy competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01 - TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)] - RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC_filtered, - on=["spec_idx", "decoy", "match_idx"]) - RESULT.insert(loc=0, column="file_name", value=os.path.splitext(os.path.basename(spec_file))[0]) - RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True) - # RESULT.to_csv("./test_results.csv", index=False) - - res = 
pd.DataFrame(RESULT, index=None)
-        return res
+        # processed queries accumulate on self.queries; there is nothing to return
 
     def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fragment_type: str = "ppm",
                  mzml_files: list = [], sage_config_file: str = None, fasta_file: str = None,
-                 use_file_values: bool = True, n_process=4) -> DataFrame:
+                 use_file_values: bool = True, skip_spectra_preprocess: bool = False) -> DataFrame:
 
         if sage_config_file is None:
             raise ValueError("The sage config file is required.")
@@ -346,30 +255,71 @@ def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fr
             param_data = json.load(f)
 
         # loading and set default parameters
-        param_data["database"]["fragment_min_mz"] = param_data["database"][
-            "fragment_min_mz"] if "fragment_min_mz" in \
-                                  param_data[
-                                      "database"] else 150
-        param_data["database"]["fragment_max_mz"] = param_data["database"][
-            "fragment_max_mz"] if "fragment_max_mz" in \
-                                  param_data[
-                                      "database"] else 2000
-        param_data["deisotope"] = param_data["deisotope"] if "deisotope" in param_data else True
-        param_data["min_matched_peaks"] = param_data[
-            "min_matched_peaks"] if "min_matched_peaks" in param_data else 6
-        param_data["min_isotope_err"] = param_data["min_isotope_err"] if "min_isotope_err" in param_data else -1
-        param_data["max_isotope_err"] = param_data["max_isotope_err"] if "max_isotope_err" in param_data else 3
-        param_data["min_precursor_charge"] = param_data[
-            "min_precursor_charge"] if "min_precursor_charge" in param_data else 2
-        param_data["max_precursor_charge"] = param_data[
-            "max_precursor_charge"] if "max_precursor_charge" in param_data else 4
-        param_data["chimera"] = param_data["chimera"] if "chimera" in param_data else False
-        param_data["report_psms"] = param_data["report_psms"] if "report_psms" in param_data else 1
-        param_data["annotate_matches"] = param_data[
-            "annotate_matches"] if "annotate_matches" in param_data else False
-        param_data["max_fragment_charge"] = param_data[
-            "max_fragment_charge"] if "max_fragment_charge" in param_data else 1
-        param_data["search_thread"] = param_data["search_thread"] if "search_thread" in param_data else 4
+        if "fragment_min_mz" in param_data["database"]:
+            fragment_min_mz = param_data["database"]["fragment_min_mz"]
+        else:
+            param_data["database"]["fragment_min_mz"] = 150
+            fragment_min_mz = 150
+        if "fragment_max_mz" in param_data["database"]:
+            fragment_max_mz = param_data["database"]["fragment_max_mz"]
+        else:
+            param_data["database"]["fragment_max_mz"] = 2000
+            fragment_max_mz = 2000
+        if "deisotope" in param_data:
+            deisotope_bool = param_data["deisotope"]
+        else:
+            param_data["deisotope"] = True
+            deisotope_bool = True
+        if "min_matched_peaks" in param_data:
+            min_matched_peaks = param_data["min_matched_peaks"]
+        else:
+            param_data["min_matched_peaks"] = 6
+            min_matched_peaks = 6
+        if "min_isotope_err" in param_data:
+            min_isotope_err = param_data["min_isotope_err"]
+        else:
+            param_data["min_isotope_err"] = -1
+            min_isotope_err = -1
+        if "max_isotope_err" in param_data:
+            max_isotope_err = param_data["max_isotope_err"]
+        else:
+            param_data["max_isotope_err"] = 3
+            max_isotope_err = 3
+        if "min_precursor_charge" in param_data:
+            min_precursor_charge = param_data["min_precursor_charge"]
+        else:
+            min_precursor_charge = 2
+            param_data["min_precursor_charge"] = 2
+        if "max_precursor_charge" in param_data:
+            max_precursor_charge = param_data["max_precursor_charge"]
+        else:
+            max_precursor_charge = 4
+            param_data["max_precursor_charge"] = 4
+        if "chimera" in param_data:
+            chimera = param_data["chimera"]
+        else:
+            param_data["chimera"] = False
+            chimera = False
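Each defaulting block above follows the same read-or-write-back pattern, which Python's dict.setdefault expresses in a single call. A minimal sketch of an equivalent, more compact formulation (same keys and defaults as the patch, nothing new):

    fragment_min_mz = param_data["database"].setdefault("fragment_min_mz", 150)
    fragment_max_mz = param_data["database"].setdefault("fragment_max_mz", 2000)
    deisotope_bool = param_data.setdefault("deisotope", True)
    min_matched_peaks = param_data.setdefault("min_matched_peaks", 6)
    min_isotope_err = param_data.setdefault("min_isotope_err", -1)
    max_isotope_err = param_data.setdefault("max_isotope_err", 3)
    min_precursor_charge = param_data.setdefault("min_precursor_charge", 2)
    max_precursor_charge = param_data.setdefault("max_precursor_charge", 4)
    chimera = param_data.setdefault("chimera", False)

setdefault both returns the effective value and persists the default into param_data, so generate_indexed_db later sees the same filled-in configuration.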
+        if "report_psms" in param_data:
+            report_psms = param_data["report_psms"]
+        else:
+            param_data["report_psms"] = 1
+            report_psms = 1
+        if "annotate_matches" in param_data:
+            annotate_matches = param_data["annotate_matches"]
+        else:
+            param_data["annotate_matches"] = False
+            annotate_matches = False
+        if "max_fragment_charge" in param_data:
+            max_fragment_charge = param_data["max_fragment_charge"]
+        else:
+            param_data["max_fragment_charge"] = 1
+            max_fragment_charge = 1
+        if "search_thread" in param_data:
+            search_thread = param_data["search_thread"]
+        else:
+            param_data["search_thread"] = 4
+            search_thread = 4
 
         if fragment_tolerance != 0 and precursor_tolerance != 0 or not use_file_values:
             param_data["precursor_tol"]["ppm"] = [int(-1 * precursor_tolerance), int(precursor_tolerance)]
@@ -401,21 +351,76 @@ def run_sage(self, fragment_tolerance: int = 0, precursor_tolerance: int = 0, fr
         logging.info("Running SAGE with fragment tolerance: {} and precursor tolerance: {}".format(fragment_tolerance,
                                                                                                    precursor_tolerance))
+        if not skip_spectra_preprocess:
+            name_id_map = dict(zip(mzml_files, range(len(mzml_files))))
+            with cf.ThreadPoolExecutor(search_thread) as executor:
+                # parse_mzml appends the processed spectra to self.queries directly,
+                # so the futures themselves are not collected here
+                for m in mzml_files:
+                    executor.submit(self.parse_mzml, m, name_id_map[m], param_data["database"]["fragment_min_mz"],
+                                    param_data["database"]["fragment_max_mz"], deisotope_bool)
+
+        static_mods = param_data["database"]["static_mods"]
+        variable_mods = param_data["database"]["variable_mods"]
+
+        scorer = Scorer(report_psms=report_psms, min_matched_peaks=min_matched_peaks,
+                        precursor_tolerance=precursor_tolerance,
+                        fragment_tolerance=fragment_tolerance, min_isotope_err=min_isotope_err,
+                        max_isotope_err=max_isotope_err,
+                        min_precursor_charge=min_precursor_charge, max_precursor_charge=max_precursor_charge,
+                        min_fragment_mass=fragment_min_mz, max_fragment_mass=fragment_max_mz, chimera=chimera,
+                        annotate_matches=annotate_matches, max_fragment_charge=max_fragment_charge,
+                        static_mods=static_mods, variable_mods=variable_mods
+                        )
+
+        indexed_db = self.generate_indexed_db(fasta_path=fasta_file, param_data=param_data)
+        results = scorer.score_collection_psm(db=indexed_db, spectrum_collection=self.queries,
+                                              num_threads=search_thread)
+
+        psm_list = []
+        for _, values in results.items():
+            psm_list.extend(values)
+
+        # re-score the PSMs with LDA before target-decoy competition
+        psm_list_rescore = rescore_lda(psm_list, verbose=False)
+        psm_list_df = self.peptide_spectrum_match_list_to_pandas(psm_list_rescore, indexed_db, re_score=True)
+
+        # TARGET = psm_list_df[psm_list_df.decoy == False]
+        # DECOY = psm_list_df[psm_list_df.decoy]
+
+        # plt.figure(figsize=(6, 4), dpi=150)
+        #
+        # plt.hist(TARGET.hyper_score, bins="auto", label="target")
+        # plt.hist(DECOY.hyper_score, bins="auto", alpha=.5, label="decoy")
+        # plt.legend()
+        # plt.title("Target and decoy score distributions")
+        # plt.xlabel("Score")
+        # plt.ylabel("Count")
+        # plt.show()
+
+        TDC = target_decoy_competition_pandas(
+            df=psm_list_df,
+            # the method for TDC can be set here, e.g., PSM level, peptide level, or double competition (PSM and peptide level)
+            method="peptide_psm_peptide",
+            score="re_score"
+        )
 
-        sage_table = pd.DataFrame()
-        tasks = []
-        # for m in mzml_files:
-        #     res = run_one_raw(m, param_data)
-        #     sage_table = pd.concat([sage_table, res], ignore_index=True)
 
+        # add PEP to the PSMs
+        TDC["pep"] = calculate_pep(
+            scores=TDC.score.values,
+            decoys=TDC.decoy.values,
+        )
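One point worth double-checking here: in the removed run_one_raw, the ppm windows from param_data were wrapped in sagepy Tolerance objects before reaching the Scorer, whereas the new code appears to pass the raw integer arguments of run_sage straight through as precursor_tolerance and fragment_tolerance. If no conversion happens in the unchanged lines between the hunks, a wrapper along these lines would be needed before constructing the Scorer (a sketch, assuming the ppm windows have already been written into param_data):

    from sagepy.core import Tolerance  # already imported at the top of opt_params.py

    # wrap the ppm windows into Tolerance objects, as the removed run_one_raw did
    precursor_tolerance = Tolerance(ppm=(param_data["precursor_tol"]["ppm"][0],
                                         param_data["precursor_tol"]["ppm"][1]))
    fragment_tolerance = Tolerance(ppm=(param_data["fragment_tol"]["ppm"][0],
                                        param_data["fragment_tol"]["ppm"][1]))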
 
-        # multiple threads don't supported indexed database class. so currently generating indexed database separately
-        # for each file. But it only takes about 12s.
-        with cf.ProcessPoolExecutor(n_process) as pp:
-            for m in mzml_files:
-                tasks.append(pp.submit(self.run_one_raw, m, param_data, fasta_file))
+        # after target decoy competition, hits can be filtered to, e.g., 1 percent expected FDR == q-value <= 0.01
+        # TDC_filtered = TDC[(TDC.q_value <= 0.01) & (TDC.decoy == False)]
+        RESULT = pd.merge(psm_list_df.drop(columns=["q_value", "score"]), TDC,
                           on=["spec_idx", "decoy", "match_idx"])
 
-        for future in cf.as_completed(tasks):
-            sage_table = pd.concat([sage_table, future.result()], ignore_index=True)
+        RESULT.rename(columns={"q_value": "spectrum_q", "match_idx": "peptide"}, inplace=True)
+
+        sage_table = pd.DataFrame(RESULT, index=None)
 
         sage_table = compute_entrapment_qvalues(sage_table)
         return sage_table
@@ -494,9 +499,7 @@ def compute_entrapment_qvalues(sage_data_frame: pd.DataFrame) -> pd.DataFrame:
 
     :param sage_data_frame: panda data frame with the sage results
     """
-    # Use inplace=True for sorting, Currently using hyper score due to unavailable LDA
-    # https://github.com/theGreatHerrLebert/sagepy/issues/13
-    sage_data_frame.sort_values(by='hyperscore', ascending=False, inplace=True)
+    sage_data_frame.sort_values(by='re_score', ascending=False, inplace=True)
 
     # Use a more descriptive name for columns
     sage_data_frame['is_entrap'] = sage_data_frame['proteins'].apply(
@@ -561,7 +564,8 @@ def acceptance_probability(old_value, new_value, temperature):
     for ft in precursor_tolerances:
         sage_table = sagerunner.run_sage(fragment_tolerance=start_fragment_tolerance, precursor_tolerance=ft,
                                          fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                         sage_config_file=sage_config_file, use_file_values=False, n_process=4)
+                                         sage_config_file=sage_config_file, use_file_values=False,
+                                         skip_spectra_preprocess=True)
         new_value = compute_best_combination(sage_table)
         results.append(get_stats_from_sage(sage_table, start_fragment_tolerance, ft, new_value))
@@ -586,7 +590,8 @@ def acceptance_probability(old_value, new_value, temperature):
     for pt in precursor_tolerances:
         sage_table = sagerunner.run_sage(fragment_tolerance=ft, precursor_tolerance=pt,
                                          fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                         sage_config_file=sage_config_file, use_file_values=False)
+                                         sage_config_file=sage_config_file, skip_spectra_preprocess=True,
+                                         use_file_values=False)
         new_value = compute_best_combination(sage_table)
         results.append(get_stats_from_sage(sage_table, ft, pt, new_value))
@@ -620,7 +625,7 @@ def acceptance_probability(old_value, new_value, temperature):
     sage_table = sagerunner.run_sage(fragment_tolerance=fragment_tolerance,
                                      precursor_tolerance=precursor_tolerance,
                                      fragment_type=fragment_type, mzml_files=mzml_files, fasta_file=fasta_file,
-                                     sage_config_file=sage_config_file,
+                                     sage_config_file=sage_config_file, skip_spectra_preprocess=True,
                                      use_file_values=False)
     new_value = compute_best_combination(sage_table)
     results.append(get_stats_from_sage(sage_table, fragment_tolerance, precursor_tolerance, new_value))
@@ -678,9 +683,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i
 
     if initial_fragment_tolerance is not None and initial_precursor_tolerance is not None:
         if os.path.exists(fasta_file):
-            logging.info(f"Generating indexed database from {fasta_file}.")
             sagerunner = SageRunner()
-            # sagerunner.generate_indexed_db(fasta_file, sage_config_file)
         else:
             logging.error(f"File {fasta_file} does not exist.")
             raise FileNotFoundError(f"File {fasta_file} does not exist.")
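The skip_spectra_preprocess flag added in the hunks above enables a parse-once, search-many pattern: the first run_sage call fills self.queries via parse_mzml, and every later call over the tolerance grid only re-scores the cached spectra against a fresh indexed database. A usage sketch (file names are placeholders, not from the patch):

    runner = SageRunner()
    common = dict(fragment_type="ppm", mzml_files=["sample.mzML"],
                  sage_config_file="sage.json", fasta_file="db.fasta",
                  use_file_values=False)
    # first call parses the mzML files and caches the processed spectra
    runner.run_sage(fragment_tolerance=10, precursor_tolerance=20, **common)
    # later calls skip parsing and re-score the cached spectra
    for pt in (10, 15, 30):
        sage_table = runner.run_sage(fragment_tolerance=10, precursor_tolerance=pt,
                                     skip_spectra_preprocess=True, **common)

Note that a second call without the flag would append the same spectra to self.queries again, so the list would need clearing if a fresh parse is ever intended.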
@@ -688,7 +691,7 @@ def tolerances(fragment_type: str, mzml_path: str, initial_fragment_tolerance: i
         sage_table = sagerunner.run_sage(int(initial_fragment_tolerance), int(initial_precursor_tolerance),
                                          fragment_type, mzml_files, sage_config_file=sage_config_file,
                                          fasta_file=fasta_file,
-                                         use_file_values=False, n_process=4)
+                                         use_file_values=False)
         sage_table.to_csv("sage_table.csv", index=False)
         num_psms = compute_best_combination(sage_table)
         stats = get_stats_from_sage(sage_table, initial_fragment_tolerance, initial_precursor_tolerance, num_psms)
@@ -815,9 +818,8 @@ def sage2PTMShepherd(sage_table):
     db = mass.Unimod()
     new_rows = []
     for _, row in sage_table.iterrows():
-        Spectrum = ".".join([os.path.splitext(row['file_name'])[0], row["spec_idx"].split("=")[-1],
-                             row["spec_idx"].split("=")[-1], str(row["charge"])])
-        SpectrumFile = row['file_name']
+        # spec_idx already follows <run>.<scan>.<scan>.<charge>; rsplit keeps dots in the run name intact
+        Spectrum = row["spec_idx"]
+        SpectrumFile = row["spec_idx"].rsplit(".", 3)[0]
         peptide = unimod_pattern.sub(repl="", string=row['peptide'])
         modifications = unimod_pattern.finditer(row["peptide"])
         modified_peptide = row["peptide"]
@@ -854,21 +856,21 @@ def sage2PTMShepherd(sage_table):
         Protein = row["proteins"]
         ProteinID = row["proteins"].split("|")[1]
         new_rows.append({"Spectrum": Spectrum, "Spectrum File": SpectrumFile,
-                        "Peptide": peptide,
-                        "Modified Peptide": modified_peptide,
-                        "Peptide Length": PeptideLength, "Charge": Charge,
-                        "Retention": Retention, "Observed Mass": ObservedMass,
-                        "Calibrated Observed Mass": CalibratedObservedMass,
-                        "Observed M/Z": ObservedMZ,
-                        "Calibrated Observed M/Z": CalibratedObservedMZ,
-                        "Calculated Peptide Mass": CalculatedPeptideMass,
-                        "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass,
-                        "Hyperscore": Hyperscore, "Nextscore": Nextscore,
-                        "Probability": Probability,
-                        "Number of Missed Cleavages": NumberofMissedCleavages,
-                        "Intensity": Intensity,
-                        "Assigned Modifications": AssignedModifications,
-                        "Protein": Protein, "Protein ID": ProteinID})
+                         "Peptide": peptide,
+                         "Modified Peptide": modified_peptide,
+                         "Peptide Length": PeptideLength, "Charge": Charge,
+                         "Retention": Retention, "Observed Mass": ObservedMass,
+                         "Calibrated Observed Mass": CalibratedObservedMass,
+                         "Observed M/Z": ObservedMZ,
+                         "Calibrated Observed M/Z": CalibratedObservedMZ,
+                         "Calculated Peptide Mass": CalculatedPeptideMass,
+                         "Calculated M/Z": CalculatedMZ, "Delta Mass": DeltaMass,
+                         "Hyperscore": Hyperscore, "Nextscore": Nextscore,
+                         "Probability": Probability,
+                         "Number of Missed Cleavages": NumberofMissedCleavages,
+                         "Intensity": Intensity,
+                         "Assigned Modifications": AssignedModifications,
+                         "Protein": Protein, "Protein ID": ProteinID})
 
     psm_df = pd.concat([psm_df, pd.DataFrame.from_records(new_rows)],
                        ignore_index=True)
@@ -889,4 +891,3 @@ def cli():
 
 if __name__ == "__main__":
     cli()
-
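For reference, the spec_idx built in parse_mzml follows the FragPipe-style <run>.<scan>.<scan>.<charge> convention, which is what lets sage2PTMShepherd recover both the spectrum name and the run name from a single field; rsplit keeps the parse correct even when the run name itself contains dots. A small self-contained check (the id string is made up):

    spec_idx = "20240902_HeLa.rep1.12345.12345.2"  # hypothetical id; note the dot inside the run name
    run, first_scan, last_scan, charge = spec_idx.rsplit(".", 3)
    assert run == "20240902_HeLa.rep1" and first_scan == "12345" and charge == "2"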