Skip to content

Commit 607f263

Browse files
authored
Merge pull request #463 from uclahs-cds/czhu-filter-noncoding
fix (filterFasta): enable to keep peptides from canonical ORFs
2 parents a760502 + c902f66 commit 607f263

File tree

5 files changed

+87
-6
lines changed

5 files changed

+87
-6
lines changed

docs/filter-fasta.md

+10
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,16 @@ moPepGen fitlerFasta \
8686
--denylist path/to/denylist.fasta
8787
```
8888

89+
Use the `--keep-canonical` option to keep peptides that are called from canonical ORFs even if they are in the denylist. Canonical ORFs include coding transcripts with mutation(s) and fusion transcripts where the upstream transcript is coding. Peptides called from circRNAs are considered noncanonical ORFs.
90+
91+
```bash
92+
moPepGen filterFasta \
93+
--input-path path/to/variant_peptides.fasta \
94+
--output-path path/to/variant_peptides_filter.fasta \
95+
--denylist path/to/denylist.fasta \
96+
--keep-canonical
97+
```
98+
8999
### Complex Filtering
90100

91101
Sometimes we want more complex filtering strategy. In the example below, we want to first remove any variant peptides that overlap with any noncoding peptide, and next we filter again based on the expression level.

moPepGen/aa/VariantPeptidePool.py

+13-4
Original file line numberDiff line numberDiff line change
@@ -74,33 +74,42 @@ def filter(self, exprs:Dict[str,int]=None, cutoff:float=None,
7474
coding_transcripts:List[str]=None, keep_all_noncoding:bool=False,
7575
keep_all_coding:bool=False, enzyme:str='trypsin',
7676
miscleavage_range:Tuple[int,int]=(None, None,),
77-
denylist:Set[Seq]=None) -> VariantPeptidePool:
77+
denylist:Set[Seq]=None, keep_canonical:bool=False) -> VariantPeptidePool:
7878
""" Filter variant peptides according to gene expression. """
7979
label_delimiter = VARIANT_PEPTIDE_SOURCE_DELIMITER
8080
filtered_pool = VariantPeptidePool()
8181
for peptide in self.peptides:
82-
if denylist and peptide.seq in denylist:
83-
continue
82+
# Filter by miscleavages
8483
if any(x is not None for x in miscleavage_range):
8584
exception = 'trypsin_exception' if enzyme == 'trypsin' else None
8685
misc = peptide.find_all_enzymatic_cleave_sites(enzyme, exception)
8786
if miscleavage_range[0] is not None and len(misc) < miscleavage_range[0]:
8887
continue
8988
if miscleavage_range[1] is not None and len(misc) > miscleavage_range[1]:
9089
continue
90+
9191
peptide_entries = VariantPeptideInfo.from_variant_peptide_minimal(peptide)
92+
93+
is_in_denylist = denylist is not None and peptide.seq in denylist
9294
keep = []
9395
for entry in peptide_entries:
9496
all_noncoding = not any(x in coding_transcripts
9597
for x in entry.get_transcript_ids())
9698
all_coding = all(x in coding_transcripts
9799
for x in entry.get_transcript_ids())
98100

99-
if keep_all_noncoding and all_noncoding:
101+
is_canonical = ((not entry.is_circ_rna()) and \
102+
entry.get_transcript_ids()[0] in coding_transcripts)
103+
104+
if is_in_denylist and (not (keep_canonical and is_canonical)):
105+
should_keep = False
106+
107+
elif keep_all_noncoding and all_noncoding:
100108
should_keep = True
101109

102110
elif keep_all_coding and all_coding:
103111
should_keep = True
112+
104113
else:
105114
if exprs is not None:
106115
tx_ids = entry.get_transcript_ids()

moPepGen/cli/filter_fasta.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,12 @@ def add_subparser_filter_fasta(subparser:argparse._SubParsersAction):
9797
default=False,
9898
help='Keep all noncoding genes, regardless of their expression level.'
9999
)
100+
p.add_argument(
101+
'--keep-canonical',
102+
action='store_true',
103+
help='Keep peptides called from canonical ORFs. Only useful together'
104+
' with denylist.'
105+
)
100106
p.add_argument(
101107
'--miscleavages',
102108
type=str,
@@ -188,8 +194,8 @@ def filter_fasta(args:argparse.Namespace) -> None:
188194
exprs=exprs, cutoff=args.quant_cutoff, coding_transcripts=coding_tx,
189195
keep_all_noncoding=args.keep_all_noncoding,
190196
keep_all_coding=args.keep_all_coding, enzyme=args.enzyme,
191-
miscleavage_range=miscleavage_range,
192-
denylist=denylist
197+
miscleavage_range=miscleavage_range, denylist=denylist,
198+
keep_canonical=args.keep_canonical
193199
)
194200

195201
filtered_pool.write(args.output_path)

test/integration/test_filter_fasta.py

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ def generate_default_args(self) -> argparse.Namespace:
2323
args.enzyme = 'trypsin'
2424
args.miscleavages = None
2525
args.denylist = None
26+
args.keep_canonical = False
2627
return args
2728

2829
def test_filter_fasta_cli(self):

test/unit/test_variant_peptide_pool.py

+55
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
""" Test Module for VariantPeptidePool """
22
import unittest
33
from test.unit import create_aa_record
4+
from Bio.Seq import Seq
45
from moPepGen.aa import VariantPeptidePool
56

67

@@ -122,3 +123,57 @@ def test_filter_keep_all_noncoding(self):
122123
keep_all_coding=False, keep_all_noncoding=True
123124
)
124125
self.assertEqual(len(filtered.peptides), 4)
126+
127+
def test_filter_denylist(self):
128+
""" Filter with denylist """
129+
data = [
130+
['SSSSSSSSSR', 'ENST0001|SNV-100-A-T|1'],
131+
['SSSSSSSSAR', 'ENST0002|SNV-100-A-T|1'],
132+
['SSSSSSSSCR', 'ENST0003|SNV-100-A-T|1'],
133+
['SSSSSSSSGR', 'ENST0004|SNV-100-A-T|1'],
134+
]
135+
peptides = {create_aa_record(*x) for x in data}
136+
denylist_data = [
137+
'SSSSSSSSAR', 'SSSSSSSSGR'
138+
]
139+
denylist = {Seq(x) for x in denylist_data}
140+
exprs = None
141+
coding_tx = ['ENST0001', 'ENST0003']
142+
pool = VariantPeptidePool(peptides=peptides)
143+
filtered = pool.filter(
144+
exprs=exprs, cutoff=8, coding_transcripts=coding_tx,
145+
keep_all_coding=False, keep_all_noncoding=True,
146+
denylist=denylist, keep_canonical=False
147+
)
148+
self.assertEqual(len(filtered.peptides), 2)
149+
self.assertEqual(
150+
{str(x.seq) for x in filtered.peptides},
151+
{'SSSSSSSSSR', 'SSSSSSSSCR'}
152+
)
153+
154+
def test_filter_denylist_keep_canonical(self):
155+
""" Filter with denylist and keep_canonical = True """
156+
data = [
157+
['SSSSSSSSSR', 'ENST0001|SNV-100-A-T|1'],
158+
['SSSSSSSSAR', 'ENST0002|SNV-100-A-T|1'],
159+
['SSSSSSSSCR', 'ENST0003|SNV-100-A-T|1'],
160+
['SSSSSSSSGR', 'ENST0004|SNV-100-A-T|1'],
161+
]
162+
peptides = {create_aa_record(*x) for x in data}
163+
denylist_data = [
164+
'SSSSSSSSAR', 'SSSSSSSSGR'
165+
]
166+
denylist = {Seq(x) for x in denylist_data}
167+
exprs = None
168+
coding_tx = ['ENST0001', 'ENST0002', 'ENST0003']
169+
pool = VariantPeptidePool(peptides=peptides)
170+
filtered = pool.filter(
171+
exprs=exprs, cutoff=8, coding_transcripts=coding_tx,
172+
keep_all_coding=False, keep_all_noncoding=True,
173+
denylist=denylist, keep_canonical=True
174+
)
175+
self.assertEqual(len(filtered.peptides), 3)
176+
self.assertEqual(
177+
{str(x.seq) for x in filtered.peptides},
178+
{'SSSSSSSSSR', 'SSSSSSSSAR', 'SSSSSSSSCR'}
179+
)

0 commit comments

Comments
 (0)