Skip to content

Commit

Permalink
Fix scan reading
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Mar 11, 2024
1 parent 7813d7f commit 723d146
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 15 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@ You can install this branch (ideally, in an appropriately named Conda environmen
To use Casanovo-DB, you must also install the Crux toolkit. Given a file of spectra (for example, `spectra.mgf`) and a corresponding proteome FASTA file (for example, `proteome.fasta`), you can run a database search with the following commands:
1. Build a peptide index in the directory `my_proteome`:
- `crux tide-index proteome.fasta my_proteome`

Please note that your `.fasta` file cannot contain the amino acid 'U' because it is not in the vocabulary of Casanovo. Replace all occurrences of this character with 'X' to denote a missing amino acid.

2. Identify candidate peptides for each spectrum (be sure to set `top-match` to a very high number):
- `crux tide-search --output-dir search_results --top-match 1000000 spectra.mgf my_proteome`
3. Extract the candidate peptides from the search results into a file readable by Casanovo-DB (`annotated.mgf`):
- `casanovo --mode=annotate --peak_path spectra.mgf --tide_dir_path search_results --output annotated.mgf`

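Please note that every spectrum in `spectra.mgf` must contain a `SCANS=` field holding its integer scan number; annotation fails with an error if the field is missing.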

4. Run Casanovo-DB:
- `casanovo --mode=db --peak_path annotated.mgf --output casanovo_db_result.mztab`

Expand Down
14 changes: 8 additions & 6 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,12 +388,14 @@ def create_mgf_from_tide(
scan_map[scan] = target_candidate_list + decoy_candidate_list

all_spec = []
for idx, spec_dict in enumerate(
mgf.read(mgf_file)
): #! WILL NEED TO BE CHANGED FOR OTHER ENCODINGS OF SCAN
scan = int(
re.search(r"scan=(\d+)", spec_dict["params"]["title"]).group(1)
)
for idx, spec_dict in enumerate(mgf.read(mgf_file)):
try:
scan = int(spec_dict["params"]["scans"])
except KeyError as e:
logger.error(
"Could not find the scan number in the .mgf file. Please ensure that the .mgf file contains the scan number in the 'SCANS' field."
)
raise e
try:
spec_dict["params"]["seq"] = ",".join(list(scan_map[scan]))
all_spec.append(spec_dict)
Expand Down
21 changes: 12 additions & 9 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import csv
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
import operator
import os

import depthcharge.masses
import einops
Expand Down Expand Up @@ -1031,16 +1032,18 @@ def on_predict_epoch_end(self, results) -> None:
results = np.array(results, dtype=object).squeeze((0))
with open(self.out_writer.filename, "a") as out_f:
csv_writer = csv.writer(out_f, delimiter="\t")
# Write a header
csv_writer.writerow(
(
"index",
"peptide",
"target",
"score",
"per_aa_scores",
# Write a header IF THE FILE IS BLANK
if os.stat(self.out_writer.filename).st_size == 0:
csv_writer.writerow(
(
"index",
"peptide",
"target",
"score",
"per_aa_scores",
)
)
)
# Write rows
for group in results:
for batch in group:
for index, t_or_d, peptide, score, per_aa_scores in list(
Expand Down
1 change: 1 addition & 0 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,7 @@ def db_search(
n_beams=config["n_beams"],
n_log=config["n_log"],
out_writer=out_writer,
top_match=config["top_match"],
)
# Read the MS/MS spectra for which to predict peptide sequences.
peak_ext = (".mgf", ".h5", ".hdf5")
Expand Down

0 comments on commit 723d146

Please sign in to comment.