Merge pull request #755 from uclahs-cds/czhu-fix-index

Map GTF to memory
uclahs-cds · Jun 29, 2023 · f107557 · f107557
2 parents 0c8489d + 2945a19
commit f107557
Show file tree

Hide file tree

Showing 21 changed files with 844 additions and 295 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.1.0] - 2023-06-28
+
+### Fixed
+
+- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371
+
 ## [1.0.0] - 2023-06-15
 
 ### Added

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '1.0.0'
+__version__ = '1.1.0'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'

diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py
@@ -1,5 +1,5 @@
 """ Module for command line interface """
-from .generate_index import add_subparser_generate_index, generate_index
+from .generate_index import add_subparser_generate_index, generate_index, index_gtf
 from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide
 from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide
 from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation

diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py
@@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None:
 
     peptide_pool = aa.VariantPeptidePool()
 
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if not tx_model.is_protein_coding:
             continue
 

diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
@@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
     orf_pool = []
 
     i = 0
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if inclusion_biotypes and \
                 tx_model.transcript.biotype not in inclusion_biotypes:
             continue

diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py
@@ -1,7 +1,6 @@
 """ Common functions for cli """
 from __future__ import annotations
 import argparse
-import lzma
 import os
 import sys
 from typing import Tuple, Set, List
@@ -204,7 +203,7 @@ def add_args_source(parser:argparse.ArgumentParser):
 def load_references(args:argparse.Namespace, load_genome:bool=True,
         load_canonical_peptides:bool=True, load_proteome:bool=False,
         invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False
-        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]:
+        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]:
     """ Load reference files. If index_dir is specified, data will be loaded
     from pickles, otherwise, will read from FASTA and GTF. """
     index_dir:Path = args.index_dir
@@ -225,11 +224,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
                 if not version.is_valid(genome.version):
                     raise err.IndexVersionNotMatchError(version, genome.version)
 
-        with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle:
-            annotation = gtf.GenomicAnnotation()
-            annotation.dump_gtf(handle)
-            if not version.is_valid(annotation.version):
-                raise err.IndexVersionNotMatchError(version, genome.version)
+        gtf_file = list(index_dir.glob('annotation.gtf*'))[0]
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.init_handle(gtf_file)
+        annotation.load_index(gtf_file)
 
         if load_proteome:
             with open(f'{index_dir}/proteome.pkl', 'rb') as handle:
@@ -249,8 +247,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
         if (check_protein_coding is True or load_canonical_peptides is True) and \
                 args.proteome_fasta is None:
             raise ValueError('--proteome-fasta was not specified.')
-        annotation = gtf.GenomicAnnotation()
-        annotation.dump_gtf(args.annotation_gtf, source=args.reference_source)
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.generate_index(args.annotation_gtf, source=args.reference_source)
+
         if not quiet:
             logger('Annotation GTF loaded.')
 

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
@@ -6,19 +6,25 @@
 moPepGen to avoid processing the reference files repeatedly and save massive
 time. """
 from __future__ import annotations
+from typing import TYPE_CHECKING
 import argparse
+import os
+import shutil
+import gzip
 from pathlib import Path
 import pickle
-import lzma
 from moPepGen import dna, aa, gtf, logger
-from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
+from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
 
 
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer
+
 # pylint: disable=W0212
 def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     """ CLI for moPepGen generateIndex """
-    p = subparsers.add_parser(
+    p:argparse.ArgumentParser = subparsers.add_parser(
         name='generateIndex',
         help='Generate genome and proteome index files for moPepGen',
         description='Generate genome and proteome index files for moPepGen'
@@ -33,6 +39,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
         dest='output_dir',
         required=True
     )
+    p.add_argument(
+        '--gtf-symlink',
+        help='Create a symlink of the GTF file instead of copying it.',
+        default=False
+    )
     common.add_args_reference(p, index=False)
     common.add_args_cleavage(p)
     common.add_args_quiet(p)
@@ -41,8 +52,57 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     return p
 
 
+def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
+        invalid_protein_as_noncoding:bool=True):
+    """ Index a GTF file. """
+    anno = gtf.GenomicAnnotationOnDisk()
+
+    with open(file, 'rb') as handle:
+        anno.generate_index(handle, source)
+
+    if proteome:
+        anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
+
+    gene_idx_file, tx_idx_file = anno.get_index_files(file)
+    metadata = GTFIndexMetadata()
+
+    with open(gene_idx_file, 'wt') as handle:
+        metadata.write(handle)
+        for gene in anno.genes.keys():
+            gene_pointer:GenePointer = anno.genes.get_pointer(gene)
+            handle.write(gene_pointer.to_line() + '\n')
+
+    with open(tx_idx_file, 'wt') as handle:
+        metadata.write(handle)
+        for tx in anno.transcripts.keys():
+            tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
+            handle.write(tx_pointer.to_line() + '\n')
+
+def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
+    """ Create copy of GTF """
+    if file.suffix.lower() == '.gz':
+        if symlink:
+            symlink = False
+            logger(
+                "--gtf-symlink was suppressed because compressed GTF file was received. "
+            )
+    elif file.suffix.lower() != '.gtf':
+        raise ValueError(f"Cannot handle gtf file {file}")
+
+    output_file = output_dir/'annotation.gtf'
+
+    if symlink:
+        os.symlink(file, output_file)
+    elif file.suffix.lower() == '.gtf':
+        shutil.copy2(file, output_file)
+    else:
+        with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle:
+            for line in ihandle:
+                ohandle.write(line)
+    return output_file
+
 def generate_index(args:argparse.Namespace):
-    """ Generate  """
+    """ Generate index  """
     path_genome:Path = args.genome_fasta
     path_gtf:Path = args.annotation_gtf
     parth_proteome:Path = args.proteome_fasta
@@ -55,11 +115,11 @@ def generate_index(args:argparse.Namespace):
     exception = 'trypsin_exception' if rule == 'trypsin' else None
     invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
     quiet:bool = args.quiet
+    symlink:bool = args.gtf_symlink
 
     output_dir:Path = args.output_dir
     output_genome = output_dir/"genome.pkl"
     output_proteome = output_dir/"proteome.pkl"
-    output_anno = output_dir/"annotation.dat"
     output_peptides = output_dir/"canonical_peptides.pkl"
     output_coding_tx = output_dir/"coding_transcripts.pkl"
 
@@ -77,28 +137,24 @@ def generate_index(args:argparse.Namespace):
         logger('Genome FASTA saved to disk.')
     del genome
 
-    anno = gtf.GenomicAnnotation()
-    anno.dump_gtf(path_gtf, source=args.reference_source)
-    if not quiet:
-        logger('Genome annotation GTF loaded.')
-
     proteome = aa.AminoAcidSeqDict()
     proteome.dump_fasta(parth_proteome, source=args.reference_source)
+    if not quiet:
+        logger('Proteome FASTA loaded.')
 
-    anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
-
-    with lzma.open(output_anno, 'wt') as handle:
-        GtfIO.write(handle, anno)
+    output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
+    index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
     if not quiet:
         logger('Genome annotation GTF saved to disk.')
 
-    if not quiet:
-        logger('Proteome FASTA loaded.')
     with open(output_proteome, 'wb') as handle:
         pickle.dump(proteome, handle)
     if not quiet:
         logger('Proteome FASTA saved to disk.')
 
+    anno = gtf.GenomicAnnotationOnDisk()
+    anno.generate_index(output_gtf)
+
     # canoincal peptide pool
     canonical_peptides = proteome.create_unique_peptide_pool(
         anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,

diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py
@@ -0,0 +1,46 @@
+""" GTFIndexMetadata """
+from typing import IO
+from moPepGen.version import MetaVersion
+
+MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##'
+
+class GTFIndexMetadata():
+    """ Class for GTF index metadata. """
+    def __init__(self, source:str=None, version:MetaVersion=None):
+        """ constructor """
+        self.source = source
+        self.version = version or MetaVersion()
+
+    def write(self, handle:IO):
+        """ Write metadata to handle. """
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        handle.write(f"{prefix}source={self.source}\n")
+        handle.write(f"{prefix}python={self.version.python}\n")
+        handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n")
+        handle.write(f"{prefix}biopython={self.version.biopython}\n")
+
+    @classmethod
+    def parse(cls, handle:IO):
+        """ Read metadata from handle """
+        pos = handle.tell()
+        it = handle.readline()
+        metadata = {}
+        version = {}
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        while it and it.startswith(prefix):
+            key, val = it.lstrip(prefix).rstrip().split('=', 1)
+            if val == '':
+                val = None
+            if key in ('python', 'moPepGen', 'biopython'):
+                version[key.lower()] = val
+            else:
+                metadata[key] = val
+
+            pos = handle.tell()
+            it = handle.readline()
+
+        handle.seek(pos)
+
+        version = MetaVersion(**version)
+
+        return cls(source=metadata['source'], version=version)