Skip to content

Commit

Permalink
Merge pull request #485 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
Memory usage optimization for `callVariant` to convert variant records to label early
  • Loading branch information
zhuchcn authored Jun 18, 2022
2 parents 6424662 + 0aa54d2 commit badda70
Show file tree
Hide file tree
Showing 22 changed files with 424 additions and 288 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install biopython pathos pytest psutil
pip install biopython ray pytest psutil
- name: Run Unit Tests
run: |
Expand Down
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,17 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

---

## [0.6.1] - 2022-06-08
## [0.7.1] - 2022-06-18

### Changed

- `callVariant` optimized so the memory usage and run time is reduced to better handle hyper-mutated regions. #483

- Switched from [pathos](https://pathos.readthedocs.io/en/latest/index.html) to [ray](https://www.ray.io/) because the latter has better support for shared memory.

---

## [0.7.0] - 2022-06-08

### Fixed

Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '0.7.0'
__version__ = '0.7.1'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/aa/VariantPeptideLabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ def __le__(self, other:VariantPeptideInfo):
class LabelSourceMapping():
""" Helper class to handle label source mapping """
def __init__(self, data:Dict[str,Dict[str,str]]=None):
""" construnctor"""
""" constructor"""
self.data = data or {}

def add_record(self, gene_id:str, label:str, source:str):
Expand Down
12 changes: 9 additions & 3 deletions moPepGen/aa/VariantPeptidePool.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
""" Module for variant peptide pool (unique) """
from __future__ import annotations
from typing import Set, IO, Dict, List, Tuple
from typing import Set, IO, Dict, List, Tuple, TYPE_CHECKING
from pathlib import Path
from Bio import SeqUtils, SeqIO
from Bio.Seq import Seq
Expand All @@ -10,6 +10,9 @@
from .VariantPeptideLabel import VariantPeptideInfo


if TYPE_CHECKING:
from moPepGen import params

class VariantPeptidePool():
""" Varaint Peptide Pool """
def __init__(self, peptides:Set[AminoAcidSeqRecord]=None):
Expand All @@ -18,8 +21,8 @@ def __init__(self, peptides:Set[AminoAcidSeqRecord]=None):
self.peptide_delimeter = VARIANT_PEPTIDE_SOURCE_DELIMITER

def add_peptide(self, peptide:AminoAcidSeqRecord,
canonical_peptides:Set[str], min_mw:int=500, min_length:int=7,
max_length:int=25, skip_checking:bool=False):
canonical_peptides:Set[str], cleavage_params:params.CleavageParams=None,
skip_checking:bool=False):
""" Add a peptide to the pool if it does not already exist. Otherwise,
the label is appended to the existing same peptide.
Expand All @@ -31,6 +34,9 @@ def add_peptide(self, peptide:AminoAcidSeqRecord,
canonical_peptides (Set[str]): Canonical peptides.
"""
if not skip_checking:
min_mw = cleavage_params.min_mw
min_length = cleavage_params.min_length
max_length = cleavage_params.max_length
if SeqUtils.molecular_weight(peptide.seq, 'protein') < min_mw:
return
if len(peptide.seq) < min_length or len(peptide.seq) > max_length:
Expand Down
32 changes: 18 additions & 14 deletions moPepGen/cli/call_noncoding_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Set, List, Tuple, IO
from pathlib import Path
from Bio.SeqIO import FastaIO
from moPepGen import svgraph, aa, logger
from moPepGen import params, svgraph, aa, logger
from moPepGen.dna.DNASeqRecord import DNASeqRecordWithCoordinates
from moPepGen.err import ReferenceSeqnameNotFoundError, warning
from moPepGen.cli import common
Expand Down Expand Up @@ -78,12 +78,14 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
if args.output_orf:
common.validate_file_format(args.output_orf, OUTPUT_FILE_FORMATS)

rule:str = args.cleavage_rule
miscleavage:int = int(args.miscleavage)
min_mw:float = float(args.min_mw)
exception = 'trypsin_exception' if rule == 'trypsin' else None
min_length:int = args.min_length
max_length:int = args.max_length
cleavage_params = params.CleavageParams(
enzyme=args.cleavage_rule,
exception='trypsin_exception' if args.cleavage_rule == 'trypsin' else None,
miscleavage=int(args.miscleavage),
min_mw=float(args.min_mw),
min_length=args.min_length,
max_length=args.max_length
)

common.print_start_message(args)

Expand Down Expand Up @@ -111,7 +113,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:

try:
peptides, orfs = call_noncoding_peptide_main(tx_id, tx_model, genome,
rule, exception, miscleavage)
canonical_peptides, cleavage_params)

if not orfs:
continue
Expand All @@ -120,7 +122,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:

for peptide in peptides:
noncanonical_pool.add_peptide(peptide, canonical_peptides,
min_mw, min_length, max_length)
cleavage_params)
except ReferenceSeqnameNotFoundError as e:
if not ReferenceSeqnameNotFoundError.raised:
warning(e.args[0] + ' Make sure your GTF and FASTA files match.')
Expand All @@ -144,7 +146,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:


def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
genome:DNASeqDict, rule:str, exception:str, miscleavage:int
genome:DNASeqDict, canonical_peptides:Set[str],
cleavage_params:params.CleavageParams
) -> Tuple[Set[aa.AminoAcidSeqRecord],List[aa.AminoAcidSeqRecord]]:
""" Call noncoding peptides """
chrom = tx_model.transcript.location.seqname
Expand All @@ -159,15 +162,16 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
seq=tx_seq,
_id=tx_id,
cds_start_nf=True,
has_known_orf=False
has_known_orf=False,
cleavage_params=cleavage_params
)
dgraph.init_three_frames()
pgraph = dgraph.translate()
pgraph.create_cleavage_graph(rule=rule, exception=exception)
pgraph.create_cleavage_graph()
peptides = pgraph.call_variant_peptides(
miscleavage=miscleavage,
check_variants=False,
check_orf=True
check_orf=True,
blacklist=canonical_peptides
)
orfs = get_orf_sequences(pgraph, tx_id, tx_seq)
return peptides, orfs
Expand Down
Loading

0 comments on commit badda70

Please sign in to comment.