ms2pip/_utils/ion_mobility.py (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-"""Module for ion mobility prediction with IM²Deep."""
+"""Module for ion mobility prediction with IM2Deep."""

 import logging

ms2pip/_utils/xgb_models.py (39 changes: 16 additions & 23 deletions)
@@ -39,22 +39,28 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1):
         Number of CPUs to use in multiprocessing

     """
-    # Init models
-    xgboost_models = _initialize_xgb_models(
-        model_params["xgboost_model_files"],
-        model_dir,
-        processes,
-    )
+    xgb.set_config(verbosity=0)
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # See issue at dmlc/xgboost#11283

+    if isinstance(features, np.ndarray):
+        features = xgb.DMatrix(features)
+    elif isinstance(features, xgb.DMatrix):
+        pass
+    else:
+        raise ValueError("Unsupported input type for features.")

     logger.debug("Predicting intensities from XGBoost model files...")
     prediction_dict = {}
-    for ion_type, xgb_model in xgboost_models.items():
+    n_models = len(model_params["xgboost_model_files"].items())
+    for i, (ion_type, model_filename) in enumerate(model_params["xgboost_model_files"].items()):
+        model_file = os.path.join(model_dir, model_filename)
+        logger.debug(f"Initializing model from file: `{model_file}`")
+        xgb_model = xgb.Booster({"nthread": processes}, model_file=model_file)

-        # Get predictions from XGBoost model
+        logger.debug(f"Predicting intensities from XGBoost model {i + 1}/{n_models}...")
         preds = xgb_model.predict(features)
         preds = preds.clip(min=np.log2(0.001))  # Clip negative intensities
-        xgb_model.__del__()
+        del(xgb_model)

         # Reshape into arrays for each peptide
         if ion_type.lower() in ["x", "y", "y2", "z"]:
@@ -113,18 +119,5 @@ def _check_model_integrity(filename, model_hash):
     if sha1_hash.hexdigest() == model_hash:
         return True
     else:
-        logger.warn("Model hash not recognized.")
+        logger.warning("Model hash not recognized.")
         return False
-
-
-def _initialize_xgb_models(xgboost_model_files, model_dir, nthread) -> dict:
-    """Initialize xgboost models and return them in a dict with ion types as keys."""
-    xgb.set_config(verbosity=0)
-    xgboost_models = {}
-    for ion_type in xgboost_model_files.keys():
-        model_file = os.path.join(model_dir, xgboost_model_files[ion_type])
-        logger.debug(f"Initializing model from file: `{model_file}`")
-        xgb_model = xgb.Booster({"nthread": nthread})
-        xgb_model.load_model(model_file)
-        xgboost_models[ion_type] = xgb_model
-    return xgboost_models
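Note on the refactor above: instead of materializing every booster up front via `_initialize_xgb_models`, each model is now loaded, used, and released inside the loop, so at most one booster lives in memory at a time. A minimal sketch of the pattern, with hypothetical model file names and a placeholder feature matrix:

```python
import os

import numpy as np
import xgboost as xgb

# Hypothetical stand-ins for model_params["xgboost_model_files"] and model_dir
model_files = {"b": "b_ions.xgboost", "y": "y_ions.xgboost"}
model_dir = "/path/to/models"

features = xgb.DMatrix(np.random.rand(10, 5))  # placeholder feature matrix

predictions = {}
for ion_type, filename in model_files.items():
    # Load one booster directly from file (replaces separate Booster() + load_model() calls)
    booster = xgb.Booster({"nthread": 1}, model_file=os.path.join(model_dir, filename))
    predictions[ion_type] = booster.predict(features).clip(min=np.log2(0.001))
    del booster  # release the model before the next one is loaded
```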
ms2pip/core.py (15 changes: 5 additions & 10 deletions)
Expand Up @@ -14,7 +14,6 @@
import numpy as np
import pandas as pd
from psm_utils import PSM, Peptidoform, PSMList
from rich.progress import track

import ms2pip.exceptions as exceptions
from ms2pip._cython_modules import ms2pip_pyx
@@ -189,14 +188,11 @@ def predict_library(
         raise ValueError("Either `fasta_file` or `config` must be provided.")

     search_space = ProteomeSearchSpace.from_any(config)
-    search_space.build()
+    search_space.build(processes)

-    for batch in track(
-        _into_batches(search_space, batch_size=batch_size),
-        description="Predicting spectra...",
-        total=ceil(len(search_space) / batch_size),
-    ):
-        logging.disable(logging.CRITICAL)
+    n_batches = len(search_space) // batch_size + 1
+    for i, batch in enumerate(_into_batches(search_space, batch_size=batch_size)):
+        logging.info(f"Processing batch {i + 1}/{n_batches}...")
         yield predict_batch(
             search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))),
             add_retention_time=add_retention_time,
@@ -205,7 +201,6 @@ def predict_library(
             model_dir=model_dir,
             processes=processes,
         )
-    logging.disable(logging.NOTSET)


 def correlate(
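The `predict_library` change swaps the `rich` progress bar for plain log messages, so consuming applications keep control of output handling. Note that `n_batches = len(search_space) // batch_size + 1` overcounts by one when the length is an exact multiple of `batch_size`. A self-contained sketch of the batching pattern (`_into_batches` is named in the diff, but this body is an assumed equivalent):

```python
import itertools
import logging
from typing import Generator, Iterable, TypeVar

T = TypeVar("T")


def _into_batches(iterable: Iterable[T], batch_size: int) -> Generator[list[T], None, None]:
    """Yield consecutive batches of at most `batch_size` items."""
    iterator = iter(iterable)
    while batch := list(itertools.islice(iterator, batch_size)):
        yield batch


logging.basicConfig(level=logging.INFO)
items = range(250)
n_batches = len(items) // 100 + 1  # 3 here; would also report 3 for 200 items (actual: 2)
for i, batch in enumerate(_into_batches(items, batch_size=100)):
    logging.info(f"Processing batch {i + 1}/{n_batches}...")
```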
@@ -553,7 +548,7 @@ def _get_pool(self):
         """Get multiprocessing pool."""
         logger.debug(f"Starting workers (processes={self.processes})...")
         if multiprocessing.current_process().daemon:
-            logger.warn(
+            logger.warning(
                 "MS²PIP is running in a daemon process. Disabling multiprocessing as daemonic "
                 "processes cannot have children."
             )
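For context on the guard above: CPython raises `AssertionError: daemonic processes are not allowed to have children` when a daemon process tries to start a worker pool, hence the serial fallback. A minimal illustration of the check (the function name and `None` fallback are illustrative, not the module's API):

```python
import logging
import multiprocessing

logger = logging.getLogger(__name__)


def get_pool(processes: int):
    """Return a multiprocessing pool, or None to signal serial execution."""
    if multiprocessing.current_process().daemon:
        logger.warning("Running in a daemon process; disabling multiprocessing.")
        return None  # caller should fall back to in-process execution
    return multiprocessing.Pool(processes=processes)
```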
ms2pip/search_space.py (28 changes: 22 additions & 6 deletions)
@@ -15,7 +15,7 @@
     {
         "fasta_file": "test.fasta",
         "min_length": 8,
-        "max_length": 3,
+        "max_length": 30,
         "cleavage_rule": "trypsin",
         "missed_cleavages": 2,
         "semi_specific": false,
@@ -204,7 +204,23 @@ def __init__(self, **data: Any):
         """

         super().__init__(**data)
-        self._peptidoform_spaces: List[_PeptidoformSearchSpace] = []
+        self._peptidoform_spaces: Optional[List[_PeptidoformSearchSpace]] = None
+
+    @field_validator("min_length")
+    @classmethod
+    def _validate_min_length(cls, v):
+        if v > 3:
+            return v
+        else:
+            raise ValueError("Minimum peptide length must be greater than 3.")
+
+    @field_validator("max_length")
+    @classmethod
+    def _validate_max_length(cls, v):
+        if v <= 100:
+            return v
+        else:
+            raise ValueError("Maximum peptide length must be less than or equal to 100.")

     @field_validator("modifications")
     @classmethod
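The two validators added in the hunk above reject out-of-range lengths at model construction time; Pydantic wraps the raised `ValueError` in a `ValidationError`. A usage sketch (field names come from the config example at the top of this module; the exact constructor signature is assumed):

```python
from pydantic import ValidationError

from ms2pip.search_space import ProteomeSearchSpace

try:
    ProteomeSearchSpace(
        fasta_file="test.fasta",
        min_length=3,  # rejected: must be greater than 3
        max_length=30,
        cleavage_rule="trypsin",
    )
except ValidationError as err:
    print(err)  # includes "Minimum peptide length must be greater than 3."
```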
@@ -229,7 +245,7 @@ def _validate_unspecific_cleavage(self):
         return self

     def __len__(self):
-        if not self._peptidoform_spaces:
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before length can be determined.")
         return sum(len(pep_space) for pep_space in self._peptidoform_spaces)

@@ -255,14 +271,14 @@ def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace:
         else:
             raise ValueError("Search space must be a dict, str, Path, or ProteomeSearchSpace.")

-    def build(self, processes: int = 1):
+    def build(self, processes: int | None = None):
         """
         Build peptide search space from FASTA file.

         Parameters
         ----------
         processes : int
-            Number of processes to use for parallelization.
+            Number of processes to use for parallelization. If None, uses all available CPUs.

         """
         processes = processes if processes else multiprocessing.cpu_count()
@@ -285,7 +301,7 @@ def __iter__(self) -> Generator[PSM, None, None]:

         """
         # Build search space if not already built
-        if not self._peptidoform_spaces:
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before PSMs can be generated.")

         spectrum_id = 0
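Taken together, the `None` sentinel set in `__init__` and the `is None` checks in `__len__` and `__iter__` make `build()` an explicit precondition: an unbuilt space now fails loudly, while a built-but-empty space no longer raises (previously `not []` was truthy, so the two cases were indistinguishable). A sketch of the intended call order (constructor arguments assumed as above):

```python
from ms2pip.search_space import ProteomeSearchSpace

space = ProteomeSearchSpace(fasta_file="test.fasta", min_length=8, max_length=30)

# len(space) or iter(space) here would raise ValueError: the space is not built yet
space.build(processes=4)  # processes=None would use all available CPUs

for psm in space:
    ...  # PSM objects with sequential spectrum IDs
```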