Noble-Lab · Lilferrit · Sep 16, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,14 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Changed
 
 - Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag.
+- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
+- The `--validation_peak_path` option is now optional when training; if it is not set, the `train_peak_path` files will also be used for validation.
 
 ### Fixed
 
 - Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification.
 
 ### Removed
 
-- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `<output_root>.best.ckpt`. 
+- Removed the `save_top_k` option from the Casanovo config; the model with the lowest validation loss during training will now be saved to a fixed filename, `<output_root>.best.ckpt`.
+- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training.
 
 ## [4.2.1] - 2024-06-25
 

diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -67,8 +67,13 @@
                 """,
             ),
             click.Option(
-                ("-o", "--output"),
-                help="The mzTab file to which results will be written.",
+                ("-f", "--output_dir"),
+                help="The destination directory for output files",
+                type=click.Path(dir_okay=True),
+            ),
+            click.Option(
+                ("-o", "--output_root"),
+                help="The root name for all output files",
                 type=click.Path(dir_okay=False),
             ),
             click.Option(
@@ -90,6 +95,15 @@
                 ),
                 default="info",
             ),
+            click.Option(
+                ("-d", "--overwrite"),
+                help="""
+                Whether to overwrite output files.
+                """,
+                is_flag=True,
+                show_default=True,
+                default=False,
+            ),
         ]
 
 
@@ -144,8 +158,10 @@
     peak_path: Tuple[str],
     model: Optional[str],
     config: Optional[str],
-    output: Optional[str],
+    output_dir: Optional[str],
+    output_root: Optional[str],
     verbosity: str,
+    overwrite: bool,
     evaluate: bool,
 ) -> None:
     """De novo sequence peptides from tandem mass spectra.
@@ -154,10 +170,18 @@
     to sequence peptides. If evaluate is set to True PEAK_PATH must be
     one or more annotated MGF file.
     """
-    output = setup_logging(output, verbosity)
+    output_dir = Path(output_dir) if output_dir is not None else Path.cwd()
+    output_base_name = None
+    if output_root is not None and not overwrite:
+        output_base_name = output_dir / output_root
+        base_pattern = re.escape(output_root)
+        patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"]
+        utils.check_dir(output_dir, patterns)
+    output = setup_logging(output_base_name, verbosity)
+
     config, model = setup_model(model, config, output, False)
     start_time = time.time()
-    with ModelRunner(config, model) as runner:
+    with ModelRunner(config, model, output_root, output_dir, False) as runner:
         logger.info(
             "Sequencing %speptides from:",
             "and evaluating " if evaluate else "",
@@ -186,31 +210,44 @@
     An annotated MGF file for validation, like from MassIVE-KB. Use this
     option multiple times to specify multiple files.
     """,
-    required=True,
+    required=False,
     multiple=True,
     type=click.Path(exists=True, dir_okay=False),
 )
 def train(
     train_peak_path: Tuple[str],
-    validation_peak_path: Tuple[str],
+    validation_peak_path: Optional[Tuple[str]],
     model: Optional[str],
     config: Optional[str],
-    output: Optional[str],
+    output_dir: Optional[str],
+    output_root: Optional[str],
     verbosity: str,
+    overwrite: bool,
 ) -> None:
     """Train a Casanovo model on your own data.
 
     TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those
     provided by MassIVE-KB, from which to train a new Casnovo model.
     """
-    output = setup_logging(output, verbosity)
+    output_dir = Path(output_dir) if output_dir is not None else Path.cwd()
+    output_base_name = None
+    if output_root is not None and not overwrite:
+        output_base_name = output_dir / output_root
+        utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"])
+    output = setup_logging(output_base_name, verbosity)
+
     config, model = setup_model(model, config, output, True)
     start_time = time.time()
-    with ModelRunner(config, model) as runner:
+    with ModelRunner(
+        config, model, output_root, output_dir, not overwrite
+    ) as runner:
         logger.info("Training a model from:")
         for peak_file in train_peak_path:
             logger.info("  %s", peak_file)
 
+        if len(validation_peak_path) == 0:
+            validation_peak_path = train_peak_path
+
         logger.info("Using the following validation files:")
         for peak_file in validation_peak_path:
             logger.info("  %s", peak_file)

diff --git a/casanovo/config.py b/casanovo/config.py
@@ -19,6 +19,7 @@
     every_n_train_steps="val_check_interval",
     max_iters="cosine_schedule_period_iters",
     save_top_k=None,
+    model_save_folder_path=None,
 )
 
 
@@ -75,7 +76,6 @@ class Config:
         top_match=int,
         max_epochs=int,
         num_sanity_val_steps=int,
-        model_save_folder_path=str,
         val_check_interval=int,
         calculate_precision=bool,
         accelerator=str,

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
@@ -42,8 +42,6 @@ random_seed: 454
 n_log: 1
 # Tensorboard directory to use for keeping track of training metrics.
 tb_summarywriter:
-# Path to saved checkpoints.
-model_save_folder_path: ""
 # Model validation and checkpointing frequency in training steps.
 val_check_interval: 50_000
 

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
@@ -4,6 +4,7 @@
 import glob
 import logging
 import os
+import re
 import tempfile
 import uuid
 import warnings
@@ -18,6 +19,7 @@
 from lightning.pytorch.strategies import DDPStrategy
 from lightning.pytorch.callbacks import ModelCheckpoint
 
+from .. import utils
 from ..config import Config
 from ..data import ms_io
 from ..denovo.dataloaders import DeNovoDataModule
@@ -47,6 +49,8 @@ def __init__(
         config: Config,
         model_filename: Optional[str] = None,
         output_rootname: Optional[str] = None,
+        output_dir: Optional[str] = None,
+        overwrite_ckpt_check: bool = True,
     ) -> None:
         """Initialize a ModelRunner"""
         self.config = config
@@ -59,20 +63,35 @@ def __init__(
         self.loaders = None
         self.writer = None
 
-        best_filename = "best"
+        filenames = ("{epoch}-{step}", "best")
         if output_rootname is not None:
-            best_filename = f"{output_rootname}.{best_filename}"
+            filenames = tuple(
+                f"{output_rootname}.{curr_name}" for curr_name in filenames
+            )
+        curr_filename, best_filename = filenames
+
+        if overwrite_ckpt_check and output_dir is not None:
+            patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"]
+            if output_rootname is not None:
+                patterns = [
+                    re.escape(output_rootname + ".") + pattern
+                    for pattern in patterns
+                ]
+            utils.check_dir(output_dir, patterns)
 
         # Configure checkpoints.
         self.callbacks = [
             ModelCheckpoint(
-                dirpath=config.model_save_folder_path,
+                dirpath=output_dir,
                 save_on_train_epoch_end=True,
+                filename=curr_filename,
+                enable_version_counter=False,
             ),
             ModelCheckpoint(
-                dirpath=config.model_save_folder_path,
+                dirpath=output_dir,
                 monitor="valid_CELoss",
                 filename=best_filename,
+                enable_version_counter=False,
             ),
         ]
 

diff --git a/casanovo/utils.py b/casanovo/utils.py
@@ -2,12 +2,13 @@
 
 import logging
 import os
+import pathlib
 import platform
 import re
 import socket
 import sys
 from datetime import datetime
-from typing import Tuple, Dict, List, Optional
+from typing import Tuple, Dict, List, Optional, Iterable
 
 import numpy as np
 import pandas as pd
@@ -203,6 +204,8 @@
     """
     Log sequencing run report
 
+    Parameters
+    ----------
     next_prediction : Tuple[
         str, Tuple[str, str], float, float, float, float, str
     ]
@@ -251,3 +254,34 @@
         logger.info(
             "Median Peptide Length: %d", run_report["median_sequence_length"]
         )
+
+
+def check_dir(
+    dir: pathlib.Path,
+    file_patterns: Iterable[str],
+) -> None:
+    """
+    Check that no file names in dir match any of file_patterns
+
+    Parameters
+    ----------
+    dir : pathlib.Path
+        The directory to check for matching file names
+    file_patterns : Iterable[str]
+        Regular expressions that a file name must fully match to conflict
+
+    Raises
+    ------
+    FileExistsError
+        If matching file name is found in dir
+    """
+    # Compile up front and list dir once instead of once per pattern.
+    patterns = [re.compile(pattern) for pattern in file_patterns]
+    names = [f.name for f in dir.iterdir() if f.is_file()] if patterns else []
+    for comp_pattern in patterns:
+        for name in names:
+            if comp_pattern.fullmatch(name) is not None:
+                raise FileExistsError(
+                    f"File {name} already exists in {dir} "
+                    "and can not be overwritten."
+                )