Skip to content

Commit

Permalink
Merge pull request #751 from uclahs-cds/czhu-fix-decoy-fasta
Browse files Browse the repository at this point in the history
Make decoyFasta to keep amino acids unchanged at cleavager sites by default
  • Loading branch information
zhuchcn authored Jun 10, 2023
2 parents f2f8737 + 91e7d39 commit d6b6092
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- Fixed `callVariant` to handle hypermutated region with a dynamic cutoff. #738

- Fixed `decoyFasta` to make it as default to keep cleavage site amino acid residues unmodified. #750

- Fixed `SeqFeature` and `GTFSeqFeature` to remove the definition of strand and use `location.strand`. #616

- Refactored `util` so all functions are accessible. #749
Expand Down
5 changes: 2 additions & 3 deletions docs/vignette.md
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,10 @@ Most search engines expect a target-decoy database as input to estimate false di
moPepGen decoyFasta \
-i split/split_gSNP_encode.fasta \
-o split/split_gSNP_decoy.fasta \
--method reverse \
--non-shuffle-pattern K,R
--method reverse
```

`--non-shuffle-pattern` specifies the amino acid residues that should be fixed in the decoy sequence. By default, the N- and C-terminal residues are also fixed, which can be turned off by setting `--keep-peptide-nterm` or `--keep-peptide-cterm` to `false`. See [here](./decoy-fasta) for a complete list of arguments.
By default, amino acid residues at cleavage sites are unmodified. Trypsin is the default enzyme and can be changed using `--enzyme`. `--non-shuffle-pattern` can be used to specify additional amino acid residues to be fixed in the decoy sequence. By default, the N- and C-terminal residues are also fixed, which can be turned off by setting `--keep-peptide-nterm` or `--keep-peptide-cterm` to `false`. See [here](./decoy-fasta) for a complete list of arguments.

### Shortening FASTA headers

Expand Down
24 changes: 21 additions & 3 deletions moPepGen/cli/decoy_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio.SeqIO import FastaIO
from moPepGen import logger
from moPepGen import aa, logger
from moPepGen.cli import common
from moPepGen.aa.expasy_rules import EXPASY_RULES


INPUT_FILE_FORMATS = ['.fa', '.fasta']
Expand Down Expand Up @@ -42,6 +43,15 @@ def add_subparser_decoy_fasta(subparser:argparse._SubParsersAction):
' sequences.',
default='reverse'
)
parser.add_argument(
'--enzyme',
type=str,
choices=[None, *EXPASY_RULES.keys()],
help='Enzymatic cleavage rule. Amino acids at cleavage sites will be'
' kept unmodified. Set it to None to turn off this behavior.',
metavar='<value>',
default=None
)
parser.add_argument(
'--non-shuffle-pattern',
type=str,
Expand Down Expand Up @@ -109,7 +119,7 @@ def log_summary(self):
class DecoyFasta():
""" Decoy Fasta """
def __init__(self, input_path:Path, output_path:Path, method:str,
keep_peptide_nterm:bool, keep_peptide_cterm:bool,
enzyme:str, keep_peptide_nterm:bool, keep_peptide_cterm:bool,
non_shuffle_pattern:List[str], shuffle_max_attempts:int, seed:int,
decoy_string:str, decoy_string_position:str, order:str, quiet:bool,
target_db:List[SeqRecord]=None, _target_pool:Set[SeqRecord]=None,
Expand All @@ -120,6 +130,7 @@ def __init__(self, input_path:Path, output_path:Path, method:str,
self.output_path = output_path
self.method = method
self.seed = seed
self.enzyme = enzyme
self.keep_peptide_nterm = keep_peptide_nterm
self.keep_peptide_cterm = keep_peptide_cterm
self.non_shuffle_pattern = non_shuffle_pattern
Expand Down Expand Up @@ -157,6 +168,7 @@ def from_args(cls, args:argparse.ArgumentParser):
input_path=input_path,
output_path=output_path,
method=args.method,
enzyme=args.enzyme,
keep_peptide_nterm=keep_peptide_nterm,
keep_peptide_cterm=keep_peptide_cterm,
non_shuffle_pattern=non_shuffle_pattern,
Expand All @@ -168,10 +180,16 @@ def from_args(cls, args:argparse.ArgumentParser):
quiet=args.quiet
)


def find_fixed_indices(self, seq:Seq) -> List[int]:
""" Find the peptide indices to be fixed """
fixed_indices = []

if self.enzyme is not None:
rule = self.enzyme
exception = 'trypsin_expection' if self.enzyme == 'trypsin' else None
fixed_indices += aa.AminoAcidSeqRecord(seq) \
.find_all_enzymatic_cleave_sites(rule, exception)

for i, it in enumerate(seq):
if i == 0 and self.keep_peptide_nterm:
fixed_indices.append(i)
Expand Down
10 changes: 7 additions & 3 deletions test/integration/test_decoy_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def generate_default_args(self):
args.decoy_string = 'DECOY_'
args.decoy_string_position = 'prefix'
args.method = 'reverse'
args.enzyme = 'trypsin'
args.shuffle_max_attempts = float('inf')
args.non_shuffle_pattern = ''
args.keep_peptide_nterm = 'true'
Expand All @@ -36,6 +37,7 @@ def test_decoy_fasta_case1(self):
self.write_test_fasta(TARGET_DB)
args = self.generate_default_args()
args.input_path = self.work_dir/'test_input.fasta'
args.enzyme = None
cli.decoy_fasta(args)

received = {str(file.name) for file in self.work_dir.glob('*')}
Expand All @@ -53,6 +55,7 @@ def test_decoy_fasta_case2(self):
args.method = 'shuffle'
args.input_path = self.work_dir/'test_input.fasta'
args.seed = 123123
args.enzyme = None
cli.decoy_fasta(args)

received = {str(file.name) for file in self.work_dir.glob('*')}
Expand All @@ -78,16 +81,16 @@ def test_decoy_fasta_case3(self):
args.method = 'shuffle'
args.input_path = self.work_dir/'test_input.fasta'
args.seed = 123123
args.non_shuffle_pattern = 'K,R'
# args.non_shuffle_pattern = 'K,R'
cli.decoy_fasta(args)

received = {str(file.name) for file in self.work_dir.glob('*')}
expected = {args.input_path.name, args.output_path.name}
self.assertEqual(received, expected)

seqs = {str(seq.seq) for seq in SeqIO.parse(args.output_path, 'fasta')}
expected = {'MELPGQWRQVGIPITPK', 'IRPSCSCCIQGVPGISR'}
self.assertTrue(expected.issubset(seqs))
expected = {'MELPGQWRQVGIPITPK', 'IRSISCCCQGGVPPISR', *[x[0] for x in TARGET_DB]}
self.assertEqual(seqs, expected)

def test_decoy_fasta_shuffle_order(self):
""" This test case ensures that the shuffled decoy sequences are
Expand All @@ -98,6 +101,7 @@ def test_decoy_fasta_shuffle_order(self):
args = self.generate_default_args()
args.method = 'shuffle'
args.seed = 123123
args.enzyme = None
args.non_shuffle_pattern = 'K,R'

args.input_path = self.work_dir/'test_input_1.fasta'
Expand Down

0 comments on commit d6b6092

Please sign in to comment.