Skip to content

Commit

Permalink
Merge pull request #755 from uclahs-cds/czhu-fix-index
Browse files Browse the repository at this point in the history
Map GTF to memory
  • Loading branch information
zhuchcn committed Jun 29, 2023
2 parents 0c8489d + 2945a19 commit f107557
Show file tree
Hide file tree
Showing 21 changed files with 844 additions and 295 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [1.1.0] - 2023-06-28

### Fixed

- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371

## [1.0.0] - 2023-06-15

### Added
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Iterable, IO


__version__ = '1.0.0'
__version__ = '1.1.0'

## Error messages
ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
Expand Down
2 changes: 1 addition & 1 deletion moPepGen/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
""" Module for command line interface """
from .generate_index import add_subparser_generate_index, generate_index
from .generate_index import add_subparser_generate_index, generate_index, index_gtf
from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide
from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide
from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation
Expand Down
3 changes: 2 additions & 1 deletion moPepGen/cli/call_alt_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None:

peptide_pool = aa.VariantPeptidePool()

for tx_id, tx_model in anno.transcripts.items():
for tx_id in anno.transcripts:
tx_model = anno.transcripts[tx_id]
if not tx_model.is_protein_coding:
continue

Expand Down
3 changes: 2 additions & 1 deletion moPepGen/cli/call_noncoding_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
orf_pool = []

i = 0
for tx_id, tx_model in anno.transcripts.items():
for tx_id in anno.transcripts:
tx_model = anno.transcripts[tx_id]
if inclusion_biotypes and \
tx_model.transcript.biotype not in inclusion_biotypes:
continue
Expand Down
17 changes: 8 additions & 9 deletions moPepGen/cli/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
""" Common functions for cli """
from __future__ import annotations
import argparse
import lzma
import os
import sys
from typing import Tuple, Set, List
Expand Down Expand Up @@ -204,7 +203,7 @@ def add_args_source(parser:argparse.ArgumentParser):
def load_references(args:argparse.Namespace, load_genome:bool=True,
load_canonical_peptides:bool=True, load_proteome:bool=False,
invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False
) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]:
) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]:
""" Load reference files. If index_dir is specified, data will be loaded
from pickles, otherwise, will read from FASTA and GTF. """
index_dir:Path = args.index_dir
Expand All @@ -225,11 +224,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
if not version.is_valid(genome.version):
raise err.IndexVersionNotMatchError(version, genome.version)

with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle:
annotation = gtf.GenomicAnnotation()
annotation.dump_gtf(handle)
if not version.is_valid(annotation.version):
raise err.IndexVersionNotMatchError(version, genome.version)
gtf_file = list(index_dir.glob('annotation.gtf*'))[0]
annotation = gtf.GenomicAnnotationOnDisk()
annotation.init_handle(gtf_file)
annotation.load_index(gtf_file)

if load_proteome:
with open(f'{index_dir}/proteome.pkl', 'rb') as handle:
Expand All @@ -249,8 +247,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
if (check_protein_coding is True or load_canonical_peptides is True) and \
args.proteome_fasta is None:
raise ValueError('--proteome-fasta was not specified.')
annotation = gtf.GenomicAnnotation()
annotation.dump_gtf(args.annotation_gtf, source=args.reference_source)
annotation = gtf.GenomicAnnotationOnDisk()
annotation.generate_index(args.annotation_gtf, source=args.reference_source)

if not quiet:
logger('Annotation GTF loaded.')

Expand Down
88 changes: 72 additions & 16 deletions moPepGen/cli/generate_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,25 @@
moPepGen to avoid processing the reference files repeatedly and save massive
time. """
from __future__ import annotations
from typing import TYPE_CHECKING
import argparse
import os
import shutil
import gzip
from pathlib import Path
import pickle
import lzma
from moPepGen import dna, aa, gtf, logger
from moPepGen.gtf import GtfIO
from moPepGen.cli import common
from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata


if TYPE_CHECKING:
from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer

# pylint: disable=W0212
def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
""" CLI for moPepGen generateIndex """
p = subparsers.add_parser(
p:argparse.ArgumentParser = subparsers.add_parser(
name='generateIndex',
help='Generate genome and proteome index files for moPepGen',
description='Generate genome and proteome index files for moPepGen'
Expand All @@ -33,6 +39,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
dest='output_dir',
required=True
)
p.add_argument(
'--gtf-symlink',
help='Create a symlink of the GTF file instead of copying it.',
default=False
)
common.add_args_reference(p, index=False)
common.add_args_cleavage(p)
common.add_args_quiet(p)
Expand All @@ -41,8 +52,57 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
return p


def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
invalid_protein_as_noncoding:bool=True):
""" Index a GTF file. """
anno = gtf.GenomicAnnotationOnDisk()

with open(file, 'rb') as handle:
anno.generate_index(handle, source)

if proteome:
anno.check_protein_coding(proteome, invalid_protein_as_noncoding)

gene_idx_file, tx_idx_file = anno.get_index_files(file)
metadata = GTFIndexMetadata()

with open(gene_idx_file, 'wt') as handle:
metadata.write(handle)
for gene in anno.genes.keys():
gene_pointer:GenePointer = anno.genes.get_pointer(gene)
handle.write(gene_pointer.to_line() + '\n')

with open(tx_idx_file, 'wt') as handle:
metadata.write(handle)
for tx in anno.transcripts.keys():
tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
handle.write(tx_pointer.to_line() + '\n')

def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
""" Create copy of GTF """
if file.suffix.lower() == '.gz':
if symlink:
symlink = False
logger(
"--gtf-symlink was suppressed because compressed GTF file was received. "
)
elif file.suffix.lower() != '.gtf':
raise ValueError(f"Cannot handle gtf file {file}")

output_file = output_dir/'annotation.gtf'

if symlink:
os.symlink(file, output_file)
elif file.suffix.lower() == '.gtf':
shutil.copy2(file, output_file)
else:
with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle:
for line in ihandle:
ohandle.write(line)
return output_file

def generate_index(args:argparse.Namespace):
""" Generate """
""" Generate index """
path_genome:Path = args.genome_fasta
path_gtf:Path = args.annotation_gtf
parth_proteome:Path = args.proteome_fasta
Expand All @@ -55,11 +115,11 @@ def generate_index(args:argparse.Namespace):
exception = 'trypsin_exception' if rule == 'trypsin' else None
invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
quiet:bool = args.quiet
symlink:bool = args.gtf_symlink

output_dir:Path = args.output_dir
output_genome = output_dir/"genome.pkl"
output_proteome = output_dir/"proteome.pkl"
output_anno = output_dir/"annotation.dat"
output_peptides = output_dir/"canonical_peptides.pkl"
output_coding_tx = output_dir/"coding_transcripts.pkl"

Expand All @@ -77,28 +137,24 @@ def generate_index(args:argparse.Namespace):
logger('Genome FASTA saved to disk.')
del genome

anno = gtf.GenomicAnnotation()
anno.dump_gtf(path_gtf, source=args.reference_source)
if not quiet:
logger('Genome annotation GTF loaded.')

proteome = aa.AminoAcidSeqDict()
proteome.dump_fasta(parth_proteome, source=args.reference_source)
if not quiet:
logger('Proteome FASTA loaded.')

anno.check_protein_coding(proteome, invalid_protein_as_noncoding)

with lzma.open(output_anno, 'wt') as handle:
GtfIO.write(handle, anno)
output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
if not quiet:
logger('Genome annotation GTF saved to disk.')

if not quiet:
logger('Proteome FASTA loaded.')
with open(output_proteome, 'wb') as handle:
pickle.dump(proteome, handle)
if not quiet:
logger('Proteome FASTA saved to disk.')

anno = gtf.GenomicAnnotationOnDisk()
anno.generate_index(output_gtf)

# canoincal peptide pool
canonical_peptides = proteome.create_unique_peptide_pool(
anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,
Expand Down
46 changes: 46 additions & 0 deletions moPepGen/gtf/GTFIndexMetadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
""" GTFIndexMetadata """
from typing import IO
from moPepGen.version import MetaVersion

MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##'

class GTFIndexMetadata():
""" Class for GTF index metadata. """
def __init__(self, source:str=None, version:MetaVersion=None):
""" constructor """
self.source = source
self.version = version or MetaVersion()

def write(self, handle:IO):
""" Write metadata to handle. """
prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
handle.write(f"{prefix}source={self.source}\n")
handle.write(f"{prefix}python={self.version.python}\n")
handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n")
handle.write(f"{prefix}biopython={self.version.biopython}\n")

@classmethod
def parse(cls, handle:IO):
""" Read metadata from handle """
pos = handle.tell()
it = handle.readline()
metadata = {}
version = {}
prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
while it and it.startswith(prefix):
key, val = it.lstrip(prefix).rstrip().split('=', 1)
if val == '':
val = None
if key in ('python', 'moPepGen', 'biopython'):
version[key.lower()] = val
else:
metadata[key] = val

pos = handle.tell()
it = handle.readline()

handle.seek(pos)

version = MetaVersion(**version)

return cls(source=metadata['source'], version=version)
Loading

0 comments on commit f107557

Please sign in to comment.