Merge pull request #856 from uclahs-cds/czhu-fix-call-variant

Peptide Table saved by callVariant
uclahs-cds · Mar 7, 2024 · 73fa63d · 73fa63d
2 parents f78ca08 + 4e6f79d
commit 73fa63d
Show file tree

Hide file tree

Showing 19 changed files with 583 additions and 191 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -42,6 +42,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Added `updateIndex` #853
 
+- Output a peptide table for additional information. #833
+
 ## [1.2.1] - 2023-10-05
 
 ### Add

diff --git a/moPepGen/SeqFeature.py b/moPepGen/SeqFeature.py
@@ -76,6 +76,18 @@ def is_superset(self, other:FeatureLocation) -> bool:
         """ Find whether the location is a superset of the other """
         return self.start <= other.start and self.end >= other.end
 
+    def shift(self, i:int) -> FeatureLocation:
+        """ shift """
+        return FeatureLocation(
+            seqname=self.seqname,
+            start=self.start + i,
+            end=self.end + i,
+            reading_frame_index=self.reading_frame_index,
+            start_offset=self.start_offset,
+            end_offset=self.end_offset,
+            strand=self.strand
+        )
+
 
 class SeqFeature(BioSeqFeature):
     """ Models the annotation of a given range of the sequence. This extends

diff --git a/moPepGen/aa/VariantPeptidePool.py b/moPepGen/aa/VariantPeptidePool.py
@@ -22,7 +22,7 @@ def __init__(self, peptides:Set[AminoAcidSeqRecord]=None):
 
     def add_peptide(self, peptide:AminoAcidSeqRecord,
             canonical_peptides:Set[str], cleavage_params:params.CleavageParams=None,
-            skip_checking:bool=False):
+            skip_checking:bool=False) -> bool:
         """ Add a peptide to the pool if it does not already exist. Otherwise,
         the label is appended to the existing same peptide.
 
@@ -36,12 +36,12 @@ def add_peptide(self, peptide:AminoAcidSeqRecord,
             min_mw = cleavage_params.min_mw
             min_length = cleavage_params.min_length
             max_length = cleavage_params.max_length
-            if SeqUtils.molecular_weight(peptide.seq, 'protein') < min_mw:
-                return
-            if len(peptide.seq) < min_length or len(peptide.seq) > max_length:
-                return
+            if SeqUtils.molecular_weight(peptide, 'protein') < min_mw:
+                return False
+            if len(peptide.seq) < min_length or len(peptide) > max_length:
+                return False
             if str(peptide.seq) in canonical_peptides:
-                return
+                return False
         same_peptide = get_equivalent(self.peptides, peptide)
         if same_peptide:
             same_peptide:Seq
@@ -51,6 +51,7 @@ def add_peptide(self, peptide:AminoAcidSeqRecord,
             same_peptide.name = same_peptide.description
         else:
             self.peptides.add(peptide)
+        return True
 
     def remove_redundant_headers(self):
         """ remove redundant fasta header entries """

diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py
@@ -2,13 +2,14 @@
 harbor any alternative translation event. """
 from __future__ import annotations
 import argparse
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, Set
 from pathlib import Path
-from moPepGen import params, svgraph, aa, get_logger
+from moPepGen import params, svgraph, aa, get_logger, VARIANT_PEPTIDE_SOURCE_DELIMITER
 from moPepGen.cli import common
 
 
 if TYPE_CHECKING:
+    from Bio.Seq import Seq
     from moPepGen.gtf import TranscriptAnnotationModel, GenomicAnnotation
     from moPepGen.dna import DNASeqDict
 
@@ -125,15 +126,35 @@ def call_alt_translation_main(tx_id:str, tx_model:TranscriptAnnotationModel,
         cds_start_nf=tx_model.is_cds_start_nf(),
         has_known_orf=True,
         mrna_end_nf=tx_model.is_mrna_end_nf(),
-        cleavage_params=cleavage_params
+        cleavage_params=cleavage_params,
+        coordinate_feature_type='transcript',
+        coordinate_feature_id=tx_id
     )
     dgraph.gather_sect_variants(anno)
     dgraph.init_three_frames()
     pgraph = dgraph.translate()
     pgraph.create_cleavage_graph()
-    return pgraph.call_variant_peptides(
+    peptide_anno = pgraph.call_variant_peptides(
         check_variants=True,
         truncate_sec=sec_truncation,
         w2f=w2f_reassignment,
         check_external_variants=False
     )
+    peptide_map:Dict[Seq, Set[str]] = {}
+    for seq, annotated_labels in peptide_anno.items():
+        for label in annotated_labels:
+            if seq in peptide_map:
+                peptide_map[seq].add(label.label)
+            else:
+                peptide_map[seq] = {label.label}
+    peptides = set()
+    for seq, labels in peptide_map.items():
+        label = VARIANT_PEPTIDE_SOURCE_DELIMITER.join(labels)
+        peptides.add(
+            aa.AminoAcidSeqRecord(
+                seq=seq,
+                description=label,
+                name=label
+            )
+        )
+    return peptides
diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
@@ -2,16 +2,17 @@
 It finds all start codons of any noncoding gene. """
 from __future__ import annotations
 import argparse
-from typing import TYPE_CHECKING, Set, List, Tuple, IO
+from typing import TYPE_CHECKING, Set, List, Tuple, IO, Dict
 from pathlib import Path
 from Bio.SeqIO import FastaIO
-from moPepGen import params, svgraph, aa, get_logger
+from moPepGen import params, svgraph, aa, get_logger, VARIANT_PEPTIDE_SOURCE_DELIMITER
 from moPepGen.dna.DNASeqRecord import DNASeqRecordWithCoordinates
 from moPepGen.err import ReferenceSeqnameNotFoundError, warning
 from moPepGen.cli import common
 
 
 if TYPE_CHECKING:
+    from Bio.Seq import Seq
     from moPepGen.gtf import TranscriptAnnotationModel
     from moPepGen.dna import DNASeqDict
 
@@ -190,19 +191,38 @@ def call_noncoding_peptide_main(tx_id:str, tx_model:TranscriptAnnotationModel,
         cds_start_nf=True,
         has_known_orf=False,
         cleavage_params=cleavage_params,
-        gene_id=tx_model.gene_id
+        gene_id=tx_model.gene_id,
+        coordinate_feature_type='transcript',
+        coordinate_feature_id=tx_id
     )
     dgraph.init_three_frames()
     pgraph = dgraph.translate()
     pgraph.create_cleavage_graph()
-    peptides = pgraph.call_variant_peptides(
+    peptide_anno = pgraph.call_variant_peptides(
         check_variants=False,
         check_orf=True,
         denylist=canonical_peptides,
         orf_assignment=orf_assignment,
         w2f=w2f_reassignment,
         check_external_variants=False
     )
+    peptide_map:Dict[Seq, Set[str]] = {}
+    for seq, annotated_labels in peptide_anno.items():
+        for label in annotated_labels:
+            if seq in peptide_map:
+                peptide_map[seq].add(label.label)
+            else:
+                peptide_map[seq] = {label.label}
+    peptides = set()
+    for seq, labels in peptide_map.items():
+        label = VARIANT_PEPTIDE_SOURCE_DELIMITER.join(labels)
+        peptides.add(
+            aa.AminoAcidSeqRecord(
+                seq=seq,
+                description=label,
+                name=label
+            )
+        )
     orfs = get_orf_sequences(pgraph, tx_id, tx_model.gene_id, tx_seq)
     return peptides, orfs