Skip to content

Commit

Permalink
Merge pull request #463 from uclahs-cds/czhu-filter-noncoding
Browse files Browse the repository at this point in the history
fix (filterFasta): enable to keep peptides from canonical ORFs
  • Loading branch information
lydiayliu authored May 25, 2022
2 parents a760502 + c902f66 commit 607f263
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 6 deletions.
10 changes: 10 additions & 0 deletions docs/filter-fasta.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,16 @@ moPepGen filterFasta \
--denylist path/to/denylist.fasta
```

Use the `--keep-canonical` option to keep peptides that are called from canonical ORFs even if they are in the denylist. Canonical ORFs include coding transcripts with mutation(s) and fusion transcripts where the upstream transcript is coding. Peptides called from circRNAs are considered to come from noncanonical ORFs.

```bash
moPepGen filterFasta \
--input-path path/to/variant_peptides.fasta \
--output-path path/to/variant_peptides_filter.fasta \
--denylist path/to/denylist.fasta \
--keep-canonical
```

### Complex Filtering

Sometimes we want a more complex filtering strategy. In the example below, we first remove any variant peptides that overlap with any noncoding peptide, and then filter again based on the expression level.
Expand Down
17 changes: 13 additions & 4 deletions moPepGen/aa/VariantPeptidePool.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,33 +74,42 @@ def filter(self, exprs:Dict[str,int]=None, cutoff:float=None,
coding_transcripts:List[str]=None, keep_all_noncoding:bool=False,
keep_all_coding:bool=False, enzyme:str='trypsin',
miscleavage_range:Tuple[int,int]=(None, None,),
denylist:Set[Seq]=None) -> VariantPeptidePool:
denylist:Set[Seq]=None, keep_canonical:bool=False) -> VariantPeptidePool:
""" Filter variant peptides according to gene expression. """
label_delimiter = VARIANT_PEPTIDE_SOURCE_DELIMITER
filtered_pool = VariantPeptidePool()
for peptide in self.peptides:
if denylist and peptide.seq in denylist:
continue
# Filter by miscleavages
if any(x is not None for x in miscleavage_range):
exception = 'trypsin_exception' if enzyme == 'trypsin' else None
misc = peptide.find_all_enzymatic_cleave_sites(enzyme, exception)
if miscleavage_range[0] is not None and len(misc) < miscleavage_range[0]:
continue
if miscleavage_range[1] is not None and len(misc) > miscleavage_range[1]:
continue

peptide_entries = VariantPeptideInfo.from_variant_peptide_minimal(peptide)

is_in_denylist = denylist is not None and peptide.seq in denylist
keep = []
for entry in peptide_entries:
all_noncoding = not any(x in coding_transcripts
for x in entry.get_transcript_ids())
all_coding = all(x in coding_transcripts
for x in entry.get_transcript_ids())

if keep_all_noncoding and all_noncoding:
is_canonical = ((not entry.is_circ_rna()) and \
entry.get_transcript_ids()[0] in coding_transcripts)

if is_in_denylist and (not (keep_canonical and is_canonical)):
should_keep = False

elif keep_all_noncoding and all_noncoding:
should_keep = True

elif keep_all_coding and all_coding:
should_keep = True

else:
if exprs is not None:
tx_ids = entry.get_transcript_ids()
Expand Down
10 changes: 8 additions & 2 deletions moPepGen/cli/filter_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ def add_subparser_filter_fasta(subparser:argparse._SubParsersAction):
default=False,
help='Keep all noncoding genes, regardless of their expression level.'
)
p.add_argument(
'--keep-canonical',
action='store_true',
help='Keep peptides called from canonical ORFs. Only useful together'
' with denylist.'
)
p.add_argument(
'--miscleavages',
type=str,
Expand Down Expand Up @@ -188,8 +194,8 @@ def filter_fasta(args:argparse.Namespace) -> None:
exprs=exprs, cutoff=args.quant_cutoff, coding_transcripts=coding_tx,
keep_all_noncoding=args.keep_all_noncoding,
keep_all_coding=args.keep_all_coding, enzyme=args.enzyme,
miscleavage_range=miscleavage_range,
denylist=denylist
miscleavage_range=miscleavage_range, denylist=denylist,
keep_canonical=args.keep_canonical
)

filtered_pool.write(args.output_path)
Expand Down
1 change: 1 addition & 0 deletions test/integration/test_filter_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def generate_default_args(self) -> argparse.Namespace:
args.enzyme = 'trypsin'
args.miscleavages = None
args.denylist = None
args.keep_canonical = False
return args

def test_filter_fasta_cli(self):
Expand Down
55 changes: 55 additions & 0 deletions test/unit/test_variant_peptide_pool.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" Test Module for VariantPeptidePool """
import unittest
from test.unit import create_aa_record
from Bio.Seq import Seq
from moPepGen.aa import VariantPeptidePool


Expand Down Expand Up @@ -122,3 +123,57 @@ def test_filter_keep_all_noncoding(self):
keep_all_coding=False, keep_all_noncoding=True
)
self.assertEqual(len(filtered.peptides), 4)

def test_filter_denylist(self):
    """ Denylisted sequences are removed when keep_canonical is off """
    # Each record is (peptide sequence, FASTA description).
    records = [
        ('SSSSSSSSSR', 'ENST0001|SNV-100-A-T|1'),
        ('SSSSSSSSAR', 'ENST0002|SNV-100-A-T|1'),
        ('SSSSSSSSCR', 'ENST0003|SNV-100-A-T|1'),
        ('SSSSSSSSGR', 'ENST0004|SNV-100-A-T|1'),
    ]
    pool = VariantPeptidePool(
        peptides={create_aa_record(seq, desc) for seq, desc in records}
    )
    # Both denylisted sequences belong to transcripts that are absent
    # from the coding transcript list passed below.
    blocked = {Seq('SSSSSSSSAR'), Seq('SSSSSSSSGR')}
    result = pool.filter(
        exprs=None, cutoff=8,
        coding_transcripts=['ENST0001', 'ENST0003'],
        keep_all_coding=False, keep_all_noncoding=True,
        denylist=blocked, keep_canonical=False
    )
    self.assertEqual(len(result.peptides), 2)
    surviving = {str(pep.seq) for pep in result.peptides}
    self.assertEqual(surviving, {'SSSSSSSSSR', 'SSSSSSSSCR'})

def test_filter_denylist_keep_canonical(self):
    """ With keep_canonical = True, a denylisted peptide from a coding
    transcript is retained """
    # Each record is (peptide sequence, FASTA description).
    records = [
        ('SSSSSSSSSR', 'ENST0001|SNV-100-A-T|1'),
        ('SSSSSSSSAR', 'ENST0002|SNV-100-A-T|1'),
        ('SSSSSSSSCR', 'ENST0003|SNV-100-A-T|1'),
        ('SSSSSSSSGR', 'ENST0004|SNV-100-A-T|1'),
    ]
    pool = VariantPeptidePool(
        peptides={create_aa_record(seq, desc) for seq, desc in records}
    )
    blocked = {Seq('SSSSSSSSAR'), Seq('SSSSSSSSGR')}
    # ENST0002 is listed as coding here, so its denylisted peptide is
    # expected to survive; ENST0004 is not, so its peptide is dropped.
    result = pool.filter(
        exprs=None, cutoff=8,
        coding_transcripts=['ENST0001', 'ENST0002', 'ENST0003'],
        keep_all_coding=False, keep_all_noncoding=True,
        denylist=blocked, keep_canonical=True
    )
    self.assertEqual(len(result.peptides), 3)
    surviving = {str(pep.seq) for pep in result.peptides}
    self.assertEqual(
        surviving,
        {'SSSSSSSSSR', 'SSSSSSSSAR', 'SSSSSSSSCR'}
    )

0 comments on commit 607f263

Please sign in to comment.