Merge pull request #485 from uclahs-cds/czhu-fix-call-variant

Memory usage optimization for `callVariant` to convert variant records to label early
uclahs-cds · Jun 18, 2022 · badda70 · badda70
2 parents 6424662 + 0aa54d2
commit badda70
Show file tree

Hide file tree

Showing 22 changed files with 424 additions and 288 deletions.
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
@@ -26,7 +26,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install biopython pathos pytest psutil
+          pip install biopython ray pytest psutil
 
       - name: Run Unit Tests
         run: |

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,17 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ---
 
-## [0.6.1] - 2022-06-08
+## [0.7.1] - 2022-06-18
+
+### Changed
+
+- `callVariant` optimized so the memory usage and run time is reduced to better handle hyper-mutated regions. #483
+
+- Switched from [pathos](https://pathos.readthedocs.io/en/latest/index.html) to [ray](https://www.ray.io/) because the latter has better support for shared memory.
+
+---
+
+## [0.7.0] - 2022-06-08
 
 ### Fixed
 

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '0.7.0'
+__version__ = '0.7.1'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

diff --git a/moPepGen/aa/VariantPeptideLabel.py b/moPepGen/aa/VariantPeptideLabel.py
@@ -288,7 +288,7 @@ def __le__(self, other:VariantPeptideInfo):
 class LabelSourceMapping():
     """ Helper class to handle label source mapping """
     def __init__(self, data:Dict[str,Dict[str,str]]=None):
-        """ construnctor"""
+        """ constructor"""
         self.data = data or {}
 
     def add_record(self, gene_id:str, label:str, source:str):

diff --git a/moPepGen/aa/VariantPeptidePool.py b/moPepGen/aa/VariantPeptidePool.py
@@ -1,6 +1,6 @@
 """ Module for variant peptide pool (unique) """
 from __future__ import annotations
-from typing import Set, IO, Dict, List, Tuple
+from typing import Set, IO, Dict, List, Tuple, TYPE_CHECKING
 from pathlib import Path
 from Bio import SeqUtils, SeqIO
 from Bio.Seq import Seq
@@ -10,6 +10,9 @@
 from .VariantPeptideLabel import VariantPeptideInfo
 
 
+if TYPE_CHECKING:
+    from moPepGen import params
+
 class VariantPeptidePool():
     """ Varaint Peptide Pool """
     def __init__(self, peptides:Set[AminoAcidSeqRecord]=None):
@@ -18,8 +21,8 @@ def __init__(self, peptides:Set[AminoAcidSeqRecord]=None):
         self.peptide_delimeter = VARIANT_PEPTIDE_SOURCE_DELIMITER
 
     def add_peptide(self, peptide:AminoAcidSeqRecord,
-            canonical_peptides:Set[str], min_mw:int=500, min_length:int=7,
-            max_length:int=25, skip_checking:bool=False):
+            canonical_peptides:Set[str], cleavage_params:params.CleavageParams=None,
+            skip_checking:bool=False):
         """ Add a peptide to the pool if it does not already exist. Otherwise,
         the label is appended to the existing same peptide.
 
@@ -31,6 +34,9 @@ def add_peptide(self, peptide:AminoAcidSeqRecord,
             canonical_peptides (Set[str]): Canonical peptides.
         """
         if not skip_checking:
+            min_mw = cleavage_params.min_mw
+            min_length = cleavage_params.min_length
+            max_length = cleavage_params.max_length
             if SeqUtils.molecular_weight(peptide.seq, 'protein') < min_mw:
                 return
             if len(peptide.seq) < min_length or len(peptide.seq) > max_length:

diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
@@ -5,7 +5,7 @@
 from typing import TYPE_CHECKING, Set, List, Tuple, IO
 from pathlib import Path
 from Bio.SeqIO import FastaIO
-from moPepGen import svgraph, aa, logger
+from moPepGen import params, svgraph, aa, logger
 from moPepGen.dna.DNASeqRecord import DNASeqRecordWithCoordinates
 from moPepGen.err import ReferenceSeqnameNotFoundError, warning
 from moPepGen.cli import common
@@ -78,12 +78,14 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
     if args.output_orf:
         common.validate_file_format(args.output_orf, OUTPUT_FILE_FORMATS)
 
-    rule:str = args.cleavage_rule
-    miscleavage:int = int(args.miscleavage)
-    min_mw:float = float(args.min_mw)
-    exception = 'trypsin_exception' if rule == 'trypsin' else None
-    min_length:int = args.min_length
-    max_length:int = args.max_length
+    cleavage_params = params.CleavageParams(
+        enzyme=args.cleavage_rule,
+        exception='trypsin_exception' if args.cleavage_rule == 'trypsin' else None,
+        miscleavage=int(args.miscleavage),
+        min_mw=float(args.min_mw),
+        min_length=args.min_length,
+        max_length=args.max_length
+    )
 
     common.print_start_message(args)
 
@@ -111,7 +113,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
 
         try:
             peptides, orfs = call_noncoding_peptide_main(tx_id, tx_model, genome,
-                rule, exception, miscleavage)
+                canonical_peptides, cleavage_params)
 
             if not orfs:
                 continue
@@ -120,7 +122,7 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
 
             for peptide in peptides:
                 noncanonical_pool.add_peptide(peptide, canonical_peptides,
-                    min_mw, min_length, max_length)
+                    cleavage_params)
         except ReferenceSeqnameNotFoundError as e:
             if not ReferenceSeqnameNotFoundError.raised:
                 warning(e.args[0] + ' Make sure your GTF and FASTA files match.')
@@ -144,7 +146,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
 
 
 def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
-        genome:DNASeqDict, rule:str, exception:str, miscleavage:int
+        genome:DNASeqDict, canonical_peptides:Set[str],
+        cleavage_params:params.CleavageParams
         ) -> Tuple[Set[aa.AminoAcidSeqRecord],List[aa.AminoAcidSeqRecord]]:
     """ Call noncoding peptides """
     chrom = tx_model.transcript.location.seqname
@@ -159,15 +162,16 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
         seq=tx_seq,
         _id=tx_id,
         cds_start_nf=True,
-        has_known_orf=False
+        has_known_orf=False,
+        cleavage_params=cleavage_params
     )
     dgraph.init_three_frames()
     pgraph = dgraph.translate()
-    pgraph.create_cleavage_graph(rule=rule, exception=exception)
+    pgraph.create_cleavage_graph()
     peptides = pgraph.call_variant_peptides(
-        miscleavage=miscleavage,
         check_variants=False,
-        check_orf=True
+        check_orf=True,
+        blacklist=canonical_peptides
     )
     orfs = get_orf_sequences(pgraph, tx_id, tx_seq)
     return peptides, orfs