ms2pip/_utils/ion_mobility.py (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-"""Module for ion mobility prediction with IM²Deep."""
+"""Module for ion mobility prediction with IM2Deep."""

 import logging

ms2pip/_utils/xgb_models.py (39 changes: 16 additions & 23 deletions)
@@ -39,22 +39,28 @@ def get_predictions_xgb(features, num_ions, model_params, model_dir, processes=1):
         Number of CPUs to use in multiprocessing

     """
-    # Init models
-    xgboost_models = _initialize_xgb_models(
-        model_params["xgboost_model_files"],
-        model_dir,
-        processes,
-    )
+    xgb.set_config(verbosity=0)
+    os.environ.pop("CUDA_VISIBLE_DEVICES", None)  # See issue at dmlc/xgboost#11283

+    if isinstance(features, np.ndarray):
+        features = xgb.DMatrix(features)
+    elif isinstance(features, xgb.DMatrix):
+        pass
+    else:
+        raise ValueError("Unsupported input type for features.")

     logger.debug("Predicting intensities from XGBoost model files...")
     prediction_dict = {}
-    for ion_type, xgb_model in xgboost_models.items():
+    n_models = len(model_params["xgboost_model_files"].items())
+    for i, (ion_type, model_filename) in enumerate(model_params["xgboost_model_files"].items()):
+        model_file = os.path.join(model_dir, model_filename)
+        logger.debug(f"Initializing model from file: `{model_file}`")
+        xgb_model = xgb.Booster({"nthread": processes}, model_file=model_file)

-        # Get predictions from XGBoost model
+        logger.debug(f"Predicting intensities from XGBoost model {i + 1}/{n_models}...")
         preds = xgb_model.predict(features)
         preds = preds.clip(min=np.log2(0.001))  # Clip negative intensities
-        xgb_model.__del__()
+        del(xgb_model)

         # Reshape into arrays for each peptide
         if ion_type.lower() in ["x", "y", "y2", "z"]:
@@ -113,18 +119,5 @@ def _check_model_integrity(filename, model_hash):
     if sha1_hash.hexdigest() == model_hash:
         return True
     else:
-        logger.warn("Model hash not recognized.")
+        logger.warning("Model hash not recognized.")
         return False
-
-
-def _initialize_xgb_models(xgboost_model_files, model_dir, nthread) -> dict:
-    """Initialize xgboost models and return them in a dict with ion types as keys."""
-    xgb.set_config(verbosity=0)
-    xgboost_models = {}
-    for ion_type in xgboost_model_files.keys():
-        model_file = os.path.join(model_dir, xgboost_model_files[ion_type])
-        logger.debug(f"Initializing model from file: `{model_file}`")
-        xgb_model = xgb.Booster({"nthread": nthread})
-        xgb_model.load_model(model_file)
-        xgboost_models[ion_type] = xgb_model
-    return xgboost_models
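Note on the refactor above: instead of materializing every booster up front via `_initialize_xgb_models`, each model is now loaded, used, and released inside the loop, so at most one booster lives in memory at a time. A minimal sketch of the pattern, with hypothetical model file names and a placeholder feature matrix:

```python
import os

import numpy as np
import xgboost as xgb

# Hypothetical stand-ins for model_params["xgboost_model_files"] and model_dir
model_files = {"b": "b_ions.xgboost", "y": "y_ions.xgboost"}
model_dir = "/path/to/models"

features = xgb.DMatrix(np.random.rand(10, 5))  # placeholder feature matrix

predictions = {}
for ion_type, filename in model_files.items():
    # Load one booster directly from file (replaces separate Booster() + load_model() calls)
    booster = xgb.Booster({"nthread": 1}, model_file=os.path.join(model_dir, filename))
    predictions[ion_type] = booster.predict(features).clip(min=np.log2(0.001))
    del booster  # release the model before the next one is loaded
```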
ms2pip/core.py (15 changes: 5 additions & 10 deletions)
Expand Up @@ -14,7 +14,6 @@
import numpy as np
import pandas as pd
from psm_utils import PSM, Peptidoform, PSMList
from rich.progress import track

import ms2pip.exceptions as exceptions
from ms2pip._cython_modules import ms2pip_pyx
@@ -189,14 +188,11 @@ def predict_library(
         raise ValueError("Either `fasta_file` or `config` must be provided.")

     search_space = ProteomeSearchSpace.from_any(config)
-    search_space.build()
+    search_space.build(processes)

-    for batch in track(
-        _into_batches(search_space, batch_size=batch_size),
-        description="Predicting spectra...",
-        total=ceil(len(search_space) / batch_size),
-    ):
-        logging.disable(logging.CRITICAL)
+    n_batches = len(search_space) // batch_size + 1
+    for i, batch in enumerate(_into_batches(search_space, batch_size=batch_size)):
+        logging.info(f"Processing batch {i + 1}/{n_batches}...")
         yield predict_batch(
             search_space.filter_psms_by_mz(PSMList(psm_list=list(batch))),
             add_retention_time=add_retention_time,
@@ -205,7 +201,6 @@ def predict_library(
             model_dir=model_dir,
             processes=processes,
         )
-    logging.disable(logging.NOTSET)


 def correlate(
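The `predict_library` change swaps the `rich` progress bar for plain log messages, so consuming applications keep control of output handling. Note that `n_batches = len(search_space) // batch_size + 1` overcounts by one when the length is an exact multiple of `batch_size`. A self-contained sketch of the batching pattern (`_into_batches` is named in the diff, but this body is an assumed equivalent):

```python
import itertools
import logging
from typing import Generator, Iterable, TypeVar

T = TypeVar("T")


def _into_batches(iterable: Iterable[T], batch_size: int) -> Generator[list[T], None, None]:
    """Yield consecutive batches of at most `batch_size` items."""
    iterator = iter(iterable)
    while batch := list(itertools.islice(iterator, batch_size)):
        yield batch


logging.basicConfig(level=logging.INFO)
items = range(250)
n_batches = len(items) // 100 + 1  # 3 here; would also report 3 for 200 items (actual: 2)
for i, batch in enumerate(_into_batches(items, batch_size=100)):
    logging.info(f"Processing batch {i + 1}/{n_batches}...")
```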
@@ -553,7 +548,7 @@ def _get_pool(self):
         """Get multiprocessing pool."""
         logger.debug(f"Starting workers (processes={self.processes})...")
         if multiprocessing.current_process().daemon:
-            logger.warn(
+            logger.warning(
                 "MS²PIP is running in a daemon process. Disabling multiprocessing as daemonic "
                 "processes cannot have children."
             )
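For context on the guard above: CPython raises `AssertionError: daemonic processes are not allowed to have children` when a daemon process tries to start a worker pool, hence the serial fallback. A minimal illustration of the check (the function name and `None` fallback are illustrative, not the module's API):

```python
import logging
import multiprocessing

logger = logging.getLogger(__name__)


def get_pool(processes: int):
    """Return a multiprocessing pool, or None to signal serial execution."""
    if multiprocessing.current_process().daemon:
        logger.warning("Running in a daemon process; disabling multiprocessing.")
        return None  # caller should fall back to in-process execution
    return multiprocessing.Pool(processes=processes)
```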
ms2pip/search_space.py (28 changes: 22 additions & 6 deletions)
@@ -15,7 +15,7 @@
     {
         "fasta_file": "test.fasta",
         "min_length": 8,
-        "max_length": 3,
+        "max_length": 30,
         "cleavage_rule": "trypsin",
         "missed_cleavages": 2,
         "semi_specific": false,
@@ -204,7 +204,23 @@ def __init__(self, **data: Any):
         """

         super().__init__(**data)
-        self._peptidoform_spaces: List[_PeptidoformSearchSpace] = []
+        self._peptidoform_spaces: Optional[List[_PeptidoformSearchSpace]] = None
+
+    @field_validator("min_length")
+    @classmethod
+    def _validate_min_length(cls, v):
+        if v > 3:
+            return v
+        else:
+            raise ValueError("Minimum peptide length must be greater than 3.")
+
+    @field_validator("max_length")
+    @classmethod
+    def _validate_max_length(cls, v):
+        if v <= 100:
+            return v
+        else:
+            raise ValueError("Maximum peptide length must be less than or equal to 100.")

     @field_validator("modifications")
     @classmethod
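The two validators added in the hunk above reject out-of-range lengths at model construction time; Pydantic wraps the raised `ValueError` in a `ValidationError`. A usage sketch (field names come from the config example at the top of this module; the exact constructor signature is assumed):

```python
from pydantic import ValidationError

from ms2pip.search_space import ProteomeSearchSpace

try:
    ProteomeSearchSpace(
        fasta_file="test.fasta",
        min_length=3,  # rejected: must be greater than 3
        max_length=30,
        cleavage_rule="trypsin",
    )
except ValidationError as err:
    print(err)  # includes "Minimum peptide length must be greater than 3."
```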
@@ -229,7 +245,7 @@ def _validate_unspecific_cleavage(self):
         return self

     def __len__(self):
-        if not self._peptidoform_spaces:
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before length can be determined.")
         return sum(len(pep_space) for pep_space in self._peptidoform_spaces)

@@ -255,14 +271,14 @@ def from_any(cls, _input: Union[dict, str, Path, ProteomeSearchSpace]) -> ProteomeSearchSpace:
         else:
             raise ValueError("Search space must be a dict, str, Path, or ProteomeSearchSpace.")

-    def build(self, processes: int = 1):
+    def build(self, processes: int | None = None):
         """
         Build peptide search space from FASTA file.

         Parameters
         ----------
         processes : int
-            Number of processes to use for parallelization.
+            Number of processes to use for parallelization. If None, uses all available CPUs.

         """
         processes = processes if processes else multiprocessing.cpu_count()
@@ -285,7 +301,7 @@ def __iter__(self) -> Generator[PSM, None, None]:

         """
         # Build search space if not already built
-        if not self._peptidoform_spaces:
+        if self._peptidoform_spaces is None:
             raise ValueError("Search space must be built before PSMs can be generated.")

         spectrum_id = 0
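Taken together, the `None` sentinel set in `__init__` and the `is None` checks in `__len__` and `__iter__` make `build()` an explicit precondition: an unbuilt space now fails loudly, while a built-but-empty space no longer raises (previously `not []` was truthy, so the two cases were indistinguishable). A sketch of the intended call order (constructor arguments assumed as above):

```python
from ms2pip.search_space import ProteomeSearchSpace

space = ProteomeSearchSpace(fasta_file="test.fasta", min_length=8, max_length=30)

# len(space) or iter(space) here would raise ValueError: the space is not built yet
space.build(processes=4)  # processes=None would use all available CPUs

for psm in space:
    ...  # PSM objects with sequential spectrum IDs
```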