Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

File IO command line options revision #372

Merged
merged 25 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
1868980
file io console options
Lilferrit Aug 26, 2024
8a346e0
output console io options
Lilferrit Aug 26, 2024
6e043d1
file io options tests
Lilferrit Aug 26, 2024
ee88344
changelog entry
Lilferrit Aug 26, 2024
4da1357
revised changelog
Lilferrit Aug 27, 2024
653deed
file io console options
Lilferrit Aug 26, 2024
837b769
output console io options
Lilferrit Aug 26, 2024
2483d67
file io options tests
Lilferrit Aug 26, 2024
bf14f2b
changelog entry
Lilferrit Aug 26, 2024
1903cbc
revised changelog
Lilferrit Aug 27, 2024
90ef08b
Generate new screengrabs with rich-codex
github-actions[bot] Aug 27, 2024
970adb6
requested changes
Lilferrit Aug 29, 2024
77c6756
merge conflicts
Lilferrit Aug 29, 2024
e68858b
updated integration test
Lilferrit Aug 30, 2024
3d91f81
requested changes
Lilferrit Aug 30, 2024
645a33f
Generate new screengrabs with rich-codex
github-actions[bot] Aug 30, 2024
2d6dd00
requested changes, output setup refactor
Lilferrit Sep 3, 2024
66de213
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 3, 2024
1ee28be
ModelRunner documentation
Lilferrit Sep 3, 2024
4cb18e1
requested changes, _setup_output unit test
Lilferrit Sep 4, 2024
503fb86
ModelRunner output root bug fix, setup_model documentation, sequence …
Lilferrit Sep 12, 2024
71dc50c
Generate new screengrabs with rich-codex
github-actions[bot] Sep 12, 2024
6ba9bb3
logging format character
Lilferrit Sep 16, 2024
257a681
Merge branch 'console-file-io' of github.com:Noble-Lab/casanovo into …
Lilferrit Sep 16, 2024
e405add
Generate new screengrabs with rich-codex
github-actions[bot] Sep 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Changed

- Removed the `evaluate` sub-command, and all model evaluation functionality has been moved to the `sequence` command using the new `--evaluate` flag.
- The `--output` option has been split into two options, `--output_dir` and `--output_root`.
- The `--validation_peak_path` is now optional when training; if `--validation_peak_path` is not set then the `train_peak_path` will also be used for validation.

### Fixed

- Precursor charges are exported as integers instead of floats in the mzTab output file, in compliance with the mzTab specification.

### Removed

- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `<output_root>.best.ckpt`.
- Removed the `save_top_k` option from the Casanovo config, the model with the lowest validation loss during training will now be saved to a fixed filename `<output_root>.best.ckpt`.
- The `model_save_folder_path` config option has been eliminated; model checkpoints will now be saved to `--output_dir` during training.

## [4.2.1] - 2024-06-25

Expand Down
57 changes: 47 additions & 10 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,13 @@
""",
),
click.Option(
("-o", "--output"),
help="The mzTab file to which results will be written.",
("-f", "--output_dir"),
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
help="The destination directory for output files",
type=click.Path(dir_okay=True),
),
click.Option(
("-o", "--output_root"),
help="The root name for all output files",
type=click.Path(dir_okay=False),
),
click.Option(
Expand All @@ -90,6 +95,15 @@
),
default="info",
),
click.Option(
("-d", "--overwrite"),
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
help="""
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
Whether to overwrite output files.
""",
is_flag=True,
show_default=True,
default=False,
),
]


Expand Down Expand Up @@ -144,8 +158,10 @@
peak_path: Tuple[str],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
overwrite: bool,
evaluate: bool,
) -> None:
"""De novo sequence peptides from tandem mass spectra.
Expand All @@ -154,10 +170,18 @@
to sequence peptides. If evaluate is set to True PEAK_PATH must be
one or more annotated MGF file.
"""
output = setup_logging(output, verbosity)
output_dir = Path(output_dir) if output_dir is not None else Path.cwd()
output_base_name = None
if output_root is not None and not overwrite:
output_base_name = output_dir / output_root
base_pattern = re.escape(output_root)
patterns = [base_pattern + r"\.log", base_pattern + r"\.mztab"]
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
utils.check_dir(output_dir, patterns)
output = setup_logging(output_base_name, verbosity)
bittremieux marked this conversation as resolved.
Show resolved Hide resolved

config, model = setup_model(model, config, output, False)
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(config, model, output_root, output_dir, False) as runner:
logger.info(
"Sequencing %speptides from:",
"and evaluating " if evaluate else "",
Expand Down Expand Up @@ -186,31 +210,44 @@
An annotated MGF file for validation, like from MassIVE-KB. Use this
option multiple times to specify multiple files.
""",
required=True,
required=False,
multiple=True,
type=click.Path(exists=True, dir_okay=False),
)
def train(
train_peak_path: Tuple[str],
validation_peak_path: Tuple[str],
validation_peak_path: Optional[Tuple[str]],
model: Optional[str],
config: Optional[str],
output: Optional[str],
output_dir: Optional[str],
output_root: Optional[str],
verbosity: str,
overwrite: bool,
) -> None:
"""Train a Casanovo model on your own data.

TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those
provided by MassIVE-KB, from which to train a new Casnovo model.
"""
output = setup_logging(output, verbosity)
output_dir = Path(output_dir) if output_dir is not None else Path.cwd()
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
output_base_name = None
if output_root is not None and not overwrite:
output_base_name = output_dir / output_root
utils.check_dir(output_dir, [re.escape(output_root) + r"\.log"])
output = setup_logging(output_base_name, verbosity)

config, model = setup_model(model, config, output, True)
start_time = time.time()
with ModelRunner(config, model) as runner:
with ModelRunner(
config, model, output_root, output_dir, not overwrite
) as runner:
logger.info("Training a model from:")
for peak_file in train_peak_path:
logger.info(" %s", peak_file)

if len(validation_peak_path) == 0:
validation_peak_path = train_peak_path

Check warning on line 249 in casanovo/casanovo.py

View check run for this annotation

Codecov / codecov/patch

casanovo/casanovo.py#L249

Added line #L249 was not covered by tests

logger.info("Using the following validation files:")
for peak_file in validation_peak_path:
logger.info(" %s", peak_file)
Expand Down
2 changes: 1 addition & 1 deletion casanovo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
every_n_train_steps="val_check_interval",
max_iters="cosine_schedule_period_iters",
save_top_k=None,
model_save_folder_path=None,
)


Expand Down Expand Up @@ -75,7 +76,6 @@ class Config:
top_match=int,
max_epochs=int,
num_sanity_val_steps=int,
model_save_folder_path=str,
val_check_interval=int,
calculate_precision=bool,
accelerator=str,
Expand Down
2 changes: 0 additions & 2 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ random_seed: 454
n_log: 1
# Tensorboard directory to use for keeping track of training metrics.
tb_summarywriter:
# Path to saved checkpoints.
model_save_folder_path: ""
# Model validation and checkpointing frequency in training steps.
val_check_interval: 50_000

Expand Down
27 changes: 23 additions & 4 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import glob
import logging
import os
import re
import tempfile
import uuid
import warnings
Expand All @@ -18,6 +19,7 @@
from lightning.pytorch.strategies import DDPStrategy
from lightning.pytorch.callbacks import ModelCheckpoint

from .. import utils
from ..config import Config
from ..data import ms_io
from ..denovo.dataloaders import DeNovoDataModule
Expand Down Expand Up @@ -47,6 +49,8 @@ def __init__(
config: Config,
model_filename: Optional[str] = None,
output_rootname: Optional[str] = None,
output_dir: Optional[str] = None,
overwrite_ckpt_check: bool = True,
) -> None:
"""Initialize a ModelRunner"""
self.config = config
Expand All @@ -59,20 +63,35 @@ def __init__(
self.loaders = None
self.writer = None

best_filename = "best"
filenames = ("{epoch}-{step}", "best")
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
if output_rootname is not None:
best_filename = f"{output_rootname}.{best_filename}"
filenames = tuple(
f"{output_rootname}.{curr_name}" for curr_name in filenames
)
curr_filename, best_filename = filenames

if overwrite_ckpt_check and output_dir is not None:
patterns = [r"epoch=\d+\-step=\d+\.ckpt", r"best\.ckpt"]
if output_rootname is not None:
patterns = [
re.escape(output_rootname + ".") + pattern
for pattern in patterns
]
utils.check_dir(output_dir, patterns)

# Configure checkpoints.
self.callbacks = [
ModelCheckpoint(
dirpath=config.model_save_folder_path,
dirpath=output_dir,
bittremieux marked this conversation as resolved.
Show resolved Hide resolved
save_on_train_epoch_end=True,
filename=curr_filename,
enable_version_counter=False,
),
ModelCheckpoint(
dirpath=config.model_save_folder_path,
dirpath=output_dir,
monitor="valid_CELoss",
filename=best_filename,
enable_version_counter=False,
),
]

Expand Down
36 changes: 35 additions & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

import logging
import os
import pathlib
import platform
import re
import socket
import sys
from datetime import datetime
from typing import Tuple, Dict, List, Optional
from typing import Tuple, Dict, List, Optional, Iterable

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -203,6 +204,8 @@
"""
Log sequencing run report

Parameters
----------
next_prediction : Tuple[
str, Tuple[str, str], float, float, float, float, str
]
Expand Down Expand Up @@ -251,3 +254,34 @@
logger.info(
"Median Peptide Length: %d", run_report["median_sequence_length"]
)


def check_dir(
    dir: pathlib.Path,
    file_patterns: "Iterable[str | re.Pattern[str]]",
) -> None:
    """
    Check that no file names in dir match any of file_patterns

    Only regular files located directly inside ``dir`` are considered;
    sub-directories are ignored and the check is not recursive. A file
    matches when its *entire* name matches a pattern (``fullmatch``).

    Parameters
    ----------
    dir : pathlib.Path
        The directory to check for matching file names
    file_patterns : Iterable[str | re.Pattern[str]]
        File name re patterns to test file names against. Each entry may
        be a pattern string or an already compiled pattern. A single
        pattern (not wrapped in an iterable) is accepted as well.

    Raises
    ------
    FileExistsError
        If matching file name is found in dir
    """
    # A bare string is itself iterable, so without this guard it would be
    # treated as a sequence of one-character patterns.
    if isinstance(file_patterns, (str, re.Pattern)):
        file_patterns = [file_patterns]

    patterns = list(file_patterns)
    if not patterns:
        # Nothing to check; do not touch the file system at all.
        return

    # List the directory once instead of once per pattern.
    file_names = [path.name for path in dir.iterdir() if path.is_file()]

    for pattern in patterns:
        # re.compile returns an already compiled pattern unchanged.
        comp_pattern = re.compile(pattern)
        for file_name in file_names:
            if comp_pattern.fullmatch(file_name) is not None:
                raise FileExistsError(
                    f"File {file_name} already exists in {dir} "
                    "and can not be overwritten."
                )
Loading
Loading