diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dc26db9..0c245140 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [1.1.0] - 2023-06-28 + +### Fixed + +- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371 + ## [1.0.0] - 2023-06-15 ### Added diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py index 87b412aa..50556536 100644 --- a/moPepGen/__init__.py +++ b/moPepGen/__init__.py @@ -5,7 +5,7 @@ from typing import Iterable, IO -__version__ = '1.0.0' +__version__ = '1.1.0' ## Error messages ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron' diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py index 90f9e0a1..3881c6ea 100644 --- a/moPepGen/cli/__init__.py +++ b/moPepGen/cli/__init__.py @@ -1,5 +1,5 @@ """ Module for command line interface """ -from .generate_index import add_subparser_generate_index, generate_index +from .generate_index import add_subparser_generate_index, generate_index, index_gtf from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py index 5b8ba4de..a75f3f19 100644 --- a/moPepGen/cli/call_alt_translation.py +++ b/moPepGen/cli/call_alt_translation.py @@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None: peptide_pool = aa.VariantPeptidePool() - for tx_id, tx_model in anno.transcripts.items(): + for tx_id in anno.transcripts: + tx_model = anno.transcripts[tx_id] if not tx_model.is_protein_coding: continue diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py index 324267ed..2cc51082 100644 --- a/moPepGen/cli/call_noncoding_peptide.py +++ b/moPepGen/cli/call_noncoding_peptide.py @@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None: orf_pool = [] i = 0 - for tx_id, tx_model in anno.transcripts.items(): + for tx_id in anno.transcripts: + tx_model = anno.transcripts[tx_id] if inclusion_biotypes and \ tx_model.transcript.biotype not in inclusion_biotypes: continue diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py index 71b463c4..afdbb51d 100644 --- a/moPepGen/cli/common.py +++ b/moPepGen/cli/common.py @@ -1,7 +1,6 @@ """ Common functions for cli """ from __future__ import annotations import argparse -import lzma import os import sys from typing import Tuple, Set, List @@ -204,7 +203,7 @@ def add_args_source(parser:argparse.ArgumentParser): def load_references(args:argparse.Namespace, load_genome:bool=True, load_canonical_peptides:bool=True, load_proteome:bool=False, invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False - ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]: + ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]: """ Load reference files. If index_dir is specified, data will be loaded from pickles, otherwise, will read from FASTA and GTF. """ index_dir:Path = args.index_dir @@ -225,11 +224,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True, if not version.is_valid(genome.version): raise err.IndexVersionNotMatchError(version, genome.version) - with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle: - annotation = gtf.GenomicAnnotation() - annotation.dump_gtf(handle) - if not version.is_valid(annotation.version): - raise err.IndexVersionNotMatchError(version, genome.version) + gtf_file = list(index_dir.glob('annotation.gtf*'))[0] + annotation = gtf.GenomicAnnotationOnDisk() + annotation.init_handle(gtf_file) + annotation.load_index(gtf_file) if load_proteome: with open(f'{index_dir}/proteome.pkl', 'rb') as handle: @@ -249,8 +247,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True, if (check_protein_coding is True or load_canonical_peptides is True) and \ args.proteome_fasta is None: raise ValueError('--proteome-fasta was not specified.') - annotation = gtf.GenomicAnnotation() - annotation.dump_gtf(args.annotation_gtf, source=args.reference_source) + annotation = gtf.GenomicAnnotationOnDisk() + annotation.generate_index(args.annotation_gtf, source=args.reference_source) + if not quiet: logger('Annotation GTF loaded.') diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index fa558e6e..6f8ee0a6 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -6,19 +6,25 @@ moPepGen to avoid processing the reference files repeatedly and save massive time. """ from __future__ import annotations +from typing import TYPE_CHECKING import argparse +import os +import shutil +import gzip from pathlib import Path import pickle -import lzma from moPepGen import dna, aa, gtf, logger -from moPepGen.gtf import GtfIO from moPepGen.cli import common +from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata +if TYPE_CHECKING: + from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer + # pylint: disable=W0212 def add_subparser_generate_index(subparsers:argparse._SubParsersAction): """ CLI for moPepGen generateIndex """ - p = subparsers.add_parser( + p:argparse.ArgumentParser = subparsers.add_parser( name='generateIndex', help='Generate genome and proteome index files for moPepGen', description='Generate genome and proteome index files for moPepGen' @@ -33,6 +39,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction): dest='output_dir', required=True ) + p.add_argument( + '--gtf-symlink', + help='Create a symlink of the GTF file instead of copying it.', + default=False + ) common.add_args_reference(p, index=False) common.add_args_cleavage(p) common.add_args_quiet(p) @@ -41,8 +52,57 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction): return p +def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, + invalid_protein_as_noncoding:bool=True): + """ Index a GTF file. """ + anno = gtf.GenomicAnnotationOnDisk() + + with open(file, 'rb') as handle: + anno.generate_index(handle, source) + + if proteome: + anno.check_protein_coding(proteome, invalid_protein_as_noncoding) + + gene_idx_file, tx_idx_file = anno.get_index_files(file) + metadata = GTFIndexMetadata() + + with open(gene_idx_file, 'wt') as handle: + metadata.write(handle) + for gene in anno.genes.keys(): + gene_pointer:GenePointer = anno.genes.get_pointer(gene) + handle.write(gene_pointer.to_line() + '\n') + + with open(tx_idx_file, 'wt') as handle: + metadata.write(handle) + for tx in anno.transcripts.keys(): + tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) + handle.write(tx_pointer.to_line() + '\n') + +def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path: + """ Create copy of GTF """ + if file.suffix.lower() == '.gz': + if symlink: + symlink = False + logger( + "--gtf-symlink was suppressed because compressed GTF file was received. " + ) + elif file.suffix.lower() != '.gtf': + raise ValueError(f"Cannot handle gtf file {file}") + + output_file = output_dir/'annotation.gtf' + + if symlink: + os.symlink(file, output_file) + elif file.suffix.lower() == '.gtf': + shutil.copy2(file, output_file) + else: + with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle: + for line in ihandle: + ohandle.write(line) + return output_file + def generate_index(args:argparse.Namespace): - """ Generate """ + """ Generate index """ path_genome:Path = args.genome_fasta path_gtf:Path = args.annotation_gtf parth_proteome:Path = args.proteome_fasta @@ -55,11 +115,11 @@ def generate_index(args:argparse.Namespace): exception = 'trypsin_exception' if rule == 'trypsin' else None invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding quiet:bool = args.quiet + symlink:bool = args.gtf_symlink output_dir:Path = args.output_dir output_genome = output_dir/"genome.pkl" output_proteome = output_dir/"proteome.pkl" - output_anno = output_dir/"annotation.dat" output_peptides = output_dir/"canonical_peptides.pkl" output_coding_tx = output_dir/"coding_transcripts.pkl" @@ -77,28 +137,24 @@ def generate_index(args:argparse.Namespace): logger('Genome FASTA saved to disk.') del genome - anno = gtf.GenomicAnnotation() - anno.dump_gtf(path_gtf, source=args.reference_source) - if not quiet: - logger('Genome annotation GTF loaded.') - proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(parth_proteome, source=args.reference_source) + if not quiet: + logger('Proteome FASTA loaded.') - anno.check_protein_coding(proteome, invalid_protein_as_noncoding) - - with lzma.open(output_anno, 'wt') as handle: - GtfIO.write(handle, anno) + output_gtf = create_gtf_copy(path_gtf, output_dir, symlink) + index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding) if not quiet: logger('Genome annotation GTF saved to disk.') - if not quiet: - logger('Proteome FASTA loaded.') with open(output_proteome, 'wb') as handle: pickle.dump(proteome, handle) if not quiet: logger('Proteome FASTA saved to disk.') + anno = gtf.GenomicAnnotationOnDisk() + anno.generate_index(output_gtf) + # canoincal peptide pool canonical_peptides = proteome.create_unique_peptide_pool( anno=anno, rule=rule, exception=exception, miscleavage=miscleavage, diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py new file mode 100644 index 00000000..1f7b8c7d --- /dev/null +++ b/moPepGen/gtf/GTFIndexMetadata.py @@ -0,0 +1,46 @@ +""" GTFIndexMetadata """ +from typing import IO +from moPepGen.version import MetaVersion + +MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##' + +class GTFIndexMetadata(): + """ Class for GTF index metadata. """ + def __init__(self, source:str=None, version:MetaVersion=None): + """ constructor """ + self.source = source + self.version = version or MetaVersion() + + def write(self, handle:IO): + """ Write metadata to handle. """ + prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX + handle.write(f"{prefix}source={self.source}\n") + handle.write(f"{prefix}python={self.version.python}\n") + handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n") + handle.write(f"{prefix}biopython={self.version.biopython}\n") + + @classmethod + def parse(cls, handle:IO): + """ Read metadata from handle """ + pos = handle.tell() + it = handle.readline() + metadata = {} + version = {} + prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX + while it and it.startswith(prefix): + key, val = it.lstrip(prefix).rstrip().split('=', 1) + if val == '': + val = None + if key in ('python', 'moPepGen', 'biopython'): + version[key.lower()] = val + else: + metadata[key] = val + + pos = handle.tell() + it = handle.readline() + + handle.seek(pos) + + version = MetaVersion(**version) + + return cls(source=metadata['source'], version=version) diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py new file mode 100644 index 00000000..502a102a --- /dev/null +++ b/moPepGen/gtf/GTFPointer.py @@ -0,0 +1,265 @@ +""" GTFPointer and GTFPointerDict """ +from __future__ import annotations +from typing import Dict, Set, Iterable, Mapping, Union, IO +from collections import deque +from moPepGen.gtf import GtfIO +from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel +from moPepGen.gtf.TranscriptAnnotationModel import ( + TranscriptAnnotationModel, + GTF_FEATURE_TYPES +) +from moPepGen.gtf.GTFSourceInferrer import GTFSourceInferrer + + +GENE_DICT_CACHE_SIZE = 10 +TX_DICT_CACHE_SIZE = 10 + +class GTFPointer(): + """ GTFPointer. Represents the range of the GTF file of a particular + genome annotation entity (gene or transcript). """ + def __init__(self, handle:IO, key:str, start:int, end:int, source:str): + """ """ + self.handle = handle + self.key = key + self.start = start + self.end = end + self.source = source + + def __len__(self) -> int: + """ length """ + return self.end - self.start + +class GenePointer(GTFPointer): + """ Pointer of a GTF file for a gene. """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + transcripts:Set[str]=None): + """ Constructor """ + super().__init__(handle, key, start, end, source) + self.transcripts = transcripts or set() + + def load(self) -> GeneAnnotationModel: + """ Load gene annotation model from handle. """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').rstrip().split('\n') + if len(lines) > 1: + raise ValueError(f"Multiple lines found for gene {self.key}") + record = GtfIO.line_to_seq_feature(lines[0]) + record.__class__ = GeneAnnotationModel + record.exons = [] + record.transcripts = list(self.transcripts) + return record + + def to_line(self) -> str: + """ Convert to line """ + fields = [ + self.key, + str(self.start), + str(self.end), + ','.join(self.transcripts) + ] + return '\t'.join(fields) + +class TranscriptPointer(GTFPointer): + """ Pointer of a GTF file for a trnascript. """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + is_protein_coding:bool=None): + """ TranscriptPointer """ + super().__init__(handle, key, start, end, source) + self.is_protein_coding = is_protein_coding + + def load(self) -> TranscriptAnnotationModel: + """ Load pointer and return a transcript annotation model. """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').rstrip().split('\n') + tx_model = TranscriptAnnotationModel() + + for line in lines: + record = GtfIO.line_to_seq_feature(line) + feature = record.type.lower() + if feature not in GTF_FEATURE_TYPES: + continue + tx_id = record.transcript_id + if tx_id != self.key: + raise ValueError("Loaded transcript do not match.") + record.id = tx_id + tx_model.add_record(feature, record) + tx_model.sort_records() + return tx_model + + def to_line(self) -> str: + """ Convert to string """ + fields = [ + self.key, + str(self.start), + str(self.end), + str(self.is_protein_coding) + ] + return "\t".join(fields) + +def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: + """ Iterate over a GTF file and yield pointers. """ + if not source: + inferrer = GTFSourceInferrer() + + cur_gene_id:str = None + cur_tx_id:str = None + cur_gene_pointer = None + cur_tx_pointer = None + line_end = 0 + for line in handle: + line:bytes + line_start = line_end + line_end += len(line) + line = line.decode('utf-8') + + if line.startswith('#'): + continue + + record = GtfIO.line_to_seq_feature(line) + + if not source: + record.source = inferrer.infer(record) + else: + record.source = source + + if record.type.lower() == 'gene': + if cur_gene_pointer: + yield cur_gene_pointer + if cur_tx_pointer: + yield cur_tx_pointer + cur_gene_id = record.gene_id + cur_gene_pointer = GenePointer( + handle, cur_gene_id, + line_start, line_end, record.source + ) + cur_tx_id = None + cur_tx_pointer = None + else: + if cur_tx_id != record.transcript_id: + if cur_tx_pointer: + yield cur_tx_pointer + cur_tx_id = record.transcript_id + cur_tx_pointer = TranscriptPointer( + handle, cur_tx_id, + line_start, line_end, record.source + ) + else: + cur_tx_pointer.end = line_end + if cur_gene_pointer: + cur_gene_pointer.transcripts.add(cur_tx_id) + if cur_gene_pointer: + yield cur_gene_pointer + if cur_tx_pointer: + yield cur_tx_pointer + +class GTFPointerDict(dict, Mapping[str, GTFPointer]): + """ GTFPointerDict. This defines a special dict class that keys are str + (gene or transcript ID) and values are corresponding GTFPointer. The getter + magic function is customized so it returns a loaded GeneAnnotationModel + or TranscriptAnnotationModel instead of the pointer. + """ + def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): + """ Constructor """ + for val in kwargs.values(): + self.validate(val, GTFPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GTFPointer) + super().__init__(*args, **kwargs) + + def __iter__(self) -> Iterable[str]: + """ Iterator """ + for k in self.keys(): + yield k + + def validate(self, val, __type): + """ Ensures the values are GTFPointers """ + if not __type is GTFPointer or not issubclass(__type, GTFPointer): + raise TypeError(f"type = {__type} is not supported") + if not isinstance(val, __type): + raise TypeError( + f"{self.__class__} only accecpts {__type.__class__} objects" + ) + + def __setitem__(self, __key: str, __value: GTFPointer) -> None: + """ Setter """ + self.validate(__value, GTFPointer) + return super().__setitem__(__key, __value) + + def get_pointer(self, __key: str) -> GTFPointer: + """ Get pointer by key without loading the annotation entity. """ + return super().__getitem__(__key) + +class GenePointerDict(GTFPointerDict): + """ GenePointerDict. This defines a dict class that keys are gene IDs and + values are the corresponding gene models. The getter magic function is + customized so it returns a loaded GeneAnnotationModel object instead of the + pointer. + """ + def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]): + """ Constructor """ + for val in kwargs.values(): + self.validate(val, GenePointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GenePointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str) -> GeneAnnotationModel: + """ Getter """ + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > GENE_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:GenePointer = self.get_pointer(__key) + val = pointer.load() + self._cache[__key] = val + return val + +class TranscriptPointerDict(GTFPointerDict): + """ TranscriptPointerDict. This defines a dict class that keys are transcript + IDs and values are the corresponding transcript models. The getter magic + function is customized so it returns a loaded TranscriptAnnotationModel + object instead of the pointer. + """ + def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]): + """ Constructor """ + for val in kwargs.values(): + self.validate(val, TranscriptPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, TranscriptPointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str) -> TranscriptAnnotationModel: + """ getter """ + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > TX_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:TranscriptPointer = self.get_pointer(__key) + val = pointer.load() + val.is_protein_coding = pointer.is_protein_coding is True + val.transcript.source = pointer.source + self._cache[__key] = val + return val diff --git a/moPepGen/gtf/GTFSourceInferrer.py b/moPepGen/gtf/GTFSourceInferrer.py new file mode 100644 index 00000000..4c3c1aa3 --- /dev/null +++ b/moPepGen/gtf/GTFSourceInferrer.py @@ -0,0 +1,31 @@ +""" Infer GTF source (e.g. GENCODE/ENSEMBL) """ +from __future__ import annotations +from typing import Dict, TYPE_CHECKING + + +if TYPE_CHECKING: + from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature + +class GTFSourceInferrer(): + """ Infer GTF source (e.g. GENOCDE/ENSEMBL) """ + def __init__(self): + """ Constructor """ + self.max_iter = 100 + self.data:Dict[str,int] = {} + self.count = 0 + self.source:str = None + + def infer(self, record:GTFSeqFeature) -> str: + """ Infer the source of a GTF record """ + if self.count > self.max_iter: + if not self.source: + self.source = sorted(self.data.items(), key=lambda x:x[1])[-1][0] + return self.source + self.count += 1 + record.infer_annotation_source() + source = record.source + if source not in self.data: + self.data[source] = 1 + else: + self.data[source] += 1 + return source diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py index 948dfbf6..adbde542 100644 --- a/moPepGen/gtf/GenomicAnnotation.py +++ b/moPepGen/gtf/GenomicAnnotation.py @@ -2,7 +2,7 @@ """ from __future__ import annotations import copy -from typing import List, Dict, Tuple, TYPE_CHECKING +from typing import List, Dict, Tuple, TYPE_CHECKING, IO, Union from moPepGen.SeqFeature import FeatureLocation, SeqFeature from moPepGen import seqvar, err from moPepGen.version import MetaVersion @@ -10,6 +10,7 @@ from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES from .GeneAnnotationModel import GeneAnnotationModel from .GTFSeqFeature import GTFSeqFeature +from .GTFSourceInferrer import GTFSourceInferrer if TYPE_CHECKING: @@ -31,12 +32,8 @@ def __init__(self, genes:Dict[str,GeneAnnotationModel]=None, transcripts:Dict[str, TranscriptAnnotationModel]=None, source:str=None): """ Construct a GenomicAnnotation object """ - if genes is None: - genes = {} - if transcripts is None: - transcripts = {} - self.genes = genes - self.transcripts = transcripts + self.genes = genes or {} + self.transcripts = transcripts or {} self.source = source self.gene_id_version_mapper:Dict[str, str] = None self.version = MetaVersion() @@ -106,7 +103,7 @@ def remove_cached_tx_seq(self): for tx_id in self._cached_tx_seqs: self.transcripts[tx_id].remove_cached_seq() - def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None: + def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=None)->None: """ Dump a GTF file into a GenomicAnnotation Args: @@ -117,25 +114,14 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None: """ record:GTFSeqFeature if not source: - count = 0 - inferred = {} - for record in GtfIO.parse(path): + inferrer = GTFSourceInferrer() + + for record in GtfIO.parse(handle): if biotype is not None and record.biotype not in biotype: continue if not source: - if count > 100: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - record.source = source - else: - count += 1 - record.infer_annotation_source() - inferred_source = record.source - if inferred_source not in inferred: - inferred[inferred_source] = 1 - else: - inferred[inferred_source] += 1 + record.source = inferrer.infer(record) else: record.source = source @@ -146,11 +132,11 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None: self.add_transcript_record(record) - if not source: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - self.source = source + if not source: + source = inferrer.source + else: + self.source = source for transcript_model in self.transcripts.values(): transcript_model.sort_records() diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py new file mode 100644 index 00000000..f667891b --- /dev/null +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -0,0 +1,151 @@ +""" Module for GenomicAnnotationOnDisk """ +from __future__ import annotations +from typing import Dict, IO, Union, Tuple +import io +from pathlib import Path +from moPepGen import err +from moPepGen.version import MetaVersion +from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation +from moPepGen.gtf.GTFPointer import ( + GenePointerDict, TranscriptPointerDict, iterate_pointer, + GenePointer, TranscriptPointer +) +from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata + + +class GenomicAnnotationOnDisk(GenomicAnnotation): + """ An on disk version of GenomicAnnotation that uses minimal memory. """ + def __init__(self, genes:GenePointerDict=None, + transcripts:TranscriptPointerDict=None, + source:str=None): + """ Constructor """ + # pylint: disable=W0231 + self.genes = genes or GenePointerDict() + self.transcripts = transcripts or TranscriptPointerDict() + self.source = source + self.gene_id_version_mapper:Dict[str,str] = None + self.version = MetaVersion() + self._cached_tx_seqs = [] + self.handle = None + + def __del__(self): + """ Hook for deconstruction """ + if self.handle: + self.handle.close() + + def dump_gtf(self, *args, **kwargs) -> None: + """ Forbidden """ + raise NotImplementedError + + def infer_source(self): + """ Infer GTF source (GENCODE vs ENSEMBL) from data. """ + i = 0 + inferred = {} + for tx in self.transcripts.keys(): + i += 1 + if i > 100: + break + s = self.transcripts.get_pointer(tx).source + if s in inferred: + inferred[s] += 1 + else: + inferred[s] = 1 + self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0] + + def init_handle(self, handle:Union[str, IO, Path]): + """ Initiate file handle """ + # pylint: disable=R1732 + if isinstance(handle, str): + handle = Path(handle) + + if isinstance(handle, Path): + if handle.suffix.lower() != '.gtf': + raise ValueError(f"Unsupported file formate of {handle}") + ihandle = open(handle, 'rb') + elif isinstance(handle, io.IOBase): + ihandle = handle + else: + raise ValueError(f"handle {handle} of type {handle.__class__} is not supported.") + + self.handle = ihandle + + @staticmethod + def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: + """ Get index file paths given the GTF file path. """ + if isinstance(file, str): + file = Path(file) + basename = str(file.name).split('.gtf', maxsplit=1)[0] + gene_idx_file = file.parent/f"{basename}_gene.idx" + tx_idx_file = file.parent/f"{basename}_tx.idx" + + return gene_idx_file, tx_idx_file + + def generate_index(self, handle:Union[IO, str, Path], source:str=None): + """ Generate GTF index """ + self.init_handle(handle) + + for pointer in iterate_pointer(self.handle, source): + if isinstance(pointer, GenePointer): + self.genes[pointer.key] = pointer + else: + self.transcripts[pointer.key] = pointer + + if source: + self.source = source + else: + self.infer_source() + + def load_index(self, file:Union[str,Path]): + """ Load index from idx files """ + self.init_handle(file) + gene_idx_file, tx_idx_file = self.get_index_files(file) + + if not gene_idx_file.exists(): + raise ValueError(f"Gene index file cannot be found in {file.parent}") + if not tx_idx_file.exists(): + raise ValueError(f"Transcript index file cannot be found in {file.parent}") + + version = MetaVersion() + + with open(gene_idx_file, 'rt') as handle: + metadata = GTFIndexMetadata.parse(handle) + if not version.is_valid(metadata.version): + raise err.IndexVersionNotMatchError(version, metadata.version) + + gene_source = metadata.source + if not gene_source: + raise ValueError("Cannot find source from gene idx.") + + for line in handle: + if line.startswith('#'): + continue + fields = line.rstrip().split('\t') + pointer = GenePointer( + self.handle, key=fields[0], + start=int(fields[1]), end=int(fields[2]), + source=gene_source, transcripts=fields[3].split(',') + ) + self.genes[pointer.key] = pointer + + with open(tx_idx_file, 'rt') as handle: + metadata = GTFIndexMetadata.parse(handle) + if not version.is_valid(metadata.version): + raise err.IndexVersionNotMatchError(version, metadata.version) + + tx_source = metadata.source + if not tx_source: + raise ValueError("Cannot find source from gene idx.") + if tx_source != gene_source: + raise ValueError('tx_source does not match with gene_source') + + for line in handle: + if line.startswith('#'): + continue + fields = line.rstrip().split('\t') + is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]] + pointer = TranscriptPointer( + handle=self.handle, key=fields[0], + start=int(fields[1]), end=int(fields[2]), + source=tx_source, is_protein_coding=is_protein_coding + ) + self.transcripts[pointer.key] = pointer diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py index 45978103..41b4810f 100644 --- a/moPepGen/gtf/GtfIO.py +++ b/moPepGen/gtf/GtfIO.py @@ -25,48 +25,54 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]: if line.startswith('#'): continue line = line.rstrip() - fields = line.split('\t') - - try: - strand={'+':1, '-':-1, '?':0}[fields[6]] - except KeyError: - strand=None - - attributes = {} - attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id', - 'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding'] - attribute_list = [field.strip().split(' ', 1) for field in \ - fields[8].rstrip(';').split(';')] - - for key,val in attribute_list: - if key not in attributes_to_keep: - continue - val = val.strip('"') - if key == 'tag': - if key not in attributes: - attributes[key] = [] - attributes[key].append(val) - else: - attributes[key] = val - - location = FeatureLocation( - seqname=fields[0], - start=int(fields[3])-1, - end=int(fields[4]), - strand=strand, - ) - - frame = None if fields[7] == '.' else int(fields[7]) - - record = GTFSeqFeature( - chrom=fields[0], - attributes=attributes, - location=location, - type=fields[2], - frame=frame - ) + record = line_to_seq_feature(line) + yield record +def line_to_seq_feature(line:str) -> GTFSeqFeature: + """ Conver line to SeqFeature """ + line = line.rstrip() + fields = line.split('\t') + + try: + strand={'+':1, '-':-1, '?':0}[fields[6]] + except KeyError: + strand=None + + attributes = {} + attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id', + 'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding'] + attribute_list = [field.strip().split(' ', 1) for field in \ + fields[8].rstrip(';').split(';')] + + for key,val in attribute_list: + if key not in attributes_to_keep: + continue + val = val.strip('"') + if key == 'tag': + if key not in attributes: + attributes[key] = [] + attributes[key].append(val) + else: + attributes[key] = val + + location = FeatureLocation( + seqname=fields[0], + start=int(fields[3])-1, + end=int(fields[4]), + strand=strand, + ) + + frame = None if fields[7] == '.' else int(fields[7]) + + return GTFSeqFeature( + chrom=fields[0], + attributes=attributes, + location=location, + type=fields[2], + frame=frame + ) + def parse(handle:Union[IO[str], str]) -> GtfIterator: """ Parser for GTF files. """ diff --git a/moPepGen/gtf/__init__.py b/moPepGen/gtf/__init__.py index b8fd2342..1118af9a 100644 --- a/moPepGen/gtf/__init__.py +++ b/moPepGen/gtf/__init__.py @@ -3,3 +3,4 @@ from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES from .GenomicAnnotation import GenomicAnnotation from .GeneAnnotationModel import GeneAnnotationModel +from .GenomicAnnotationOnDisk import GenomicAnnotationOnDisk diff --git a/moPepGen/version.py b/moPepGen/version.py index 01a5b4b9..71d16bdd 100644 --- a/moPepGen/version.py +++ b/moPepGen/version.py @@ -6,15 +6,15 @@ from moPepGen import __version__ -MINIMAL_VERSION = '0.12.0' +MINIMAL_VERSION = '1.1.0' class MetaVersion(): """ Versions """ - def __init__(self): + def __init__(self, python:str=None, biopython:str=None, mopepgen:str=None): """ constructor """ - self.python = sys.version_info[:3] - self.biopython = Bio.__version__ - self.mopepgen = __version__ + self.python = python or '.'.join([str(x) for x in sys.version_info[:3]]) + self.biopython = biopython or Bio.__version__ + self.mopepgen = mopepgen or __version__ def __eq__(self, other:MetaVersion): """ equal to """ @@ -45,5 +45,4 @@ def __ne__(self, other:MetaVersion): def __repr__(self) -> str: """ str representation """ - python = '.'.join([str(x) for x in self.python]) - return f"python={python}, biopython={self.biopython}, moPepGen={self.mopepgen}" + return f"python={self.python}, biopython={self.biopython}, moPepGen={self.mopepgen}" diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx new file mode 100644 index 00000000..76011e1e --- /dev/null +++ b/test/files/annotation_gene.idx @@ -0,0 +1,8 @@ +##source=None +##python=3.8.17 +##moPepGen=1.1.0 +##biopython=1.81 +ENSG00000128408.9 0 174 ENST00000614168.2,ENST00000614167.2 +ENSG00000244486.9 12491 12694 ENST00000622235.5 +ENSG00000099949.21 24586 24955 ENST00000642151.1 +ENSG00000279973.2 27280 27432 ENST00000624155.2 diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx new file mode 100644 index 00000000..81ead4f8 --- /dev/null +++ b/test/files/annotation_tx.idx @@ -0,0 +1,9 @@ +##source=None +##python=3.8.17 +##moPepGen=1.1.0 +##biopython=1.81 +ENST00000614167.2 174 7260 True +ENST00000614168.2 7260 12491 True +ENST00000622235.5 12694 24586 True +ENST00000642151.1 24955 27280 True +ENST00000624155.2 27432 28522 False diff --git a/test/files/fusion/star_fusion.gvf b/test/files/fusion/star_fusion.gvf index b11972e9..4d14aa2d 100644 --- a/test/files/fusion/star_fusion.gvf +++ b/test/files/fusion/star_fusion.gvf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##mopepgen_version=0.12.0 +##mopepgen_version=1.0.1 ##parser=parseSTARFusion ##reference_index=/home/chenghaozhu/projects/private-moPepGen/test/files/index ##genome_fasta= @@ -13,7 +13,7 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO -ENSG00000128408.9 832 FUSION-ENST00000614167.2:831-ENST00000642151.1:29 A . . TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 -ENSG00000128408.9 832 FUSION-ENST00000614168.2:831-ENST00000642151.1:29 A . . TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 +ENSG00000128408.9 832 FUSION-ENST00000614167.2:831-ENST00000642151.1:29 T . . TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 +ENSG00000128408.9 832 FUSION-ENST00000614168.2:831-ENST00000642151.1:29 T . . TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 ENSG00000244486.9 1752 FUSION-ENST00000622235.5:1751-ENST00000642151.1:29 G . . TRANSCRIPT_ID=ENST00000622235.5;GENE_SYMBOL=SCARF2;GENOMIC_POSITION=chr22:3213:3213;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 ENSG00000279973.2 93 FUSION-ENST00000624155.2:92-ENST00000622235.5:968 T . . TRANSCRIPT_ID=ENST00000624155.2;GENE_SYMBOL=CU104787.1;GENOMIC_POSITION=chr22:5311:5311;ACCEPTER_GENE_ID=ENSG00000244486.9;ACCEPTER_TRANSCRIPT_ID=ENST00000622235.5;ACCEPTER_SYMBOL=SCARF2;ACCEPTER_POSITION=969;ACCEPTER_GENOMIC_POSITION=chr22:3995:3995 diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta index 8aa7c5ea..bee351cd 100644 --- a/test/files/peptides/variant.fasta +++ b/test/files/peptides/variant.fasta @@ -1,218 +1,170 @@ ->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 -CLTCEPGWNR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 -LPPCDEFVGARR ->CIRC-ENST00000614167.2-E2-E1|W2F-11|1 -AGASVPLRRHFEPDGK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|2 -HETLFLLTFPRER ->ENST00000614167.2|RES-101-A-G|W2F-9|3 ENST00000614167.2|RES-101-A-G|W2F-9|3 -IIGGDTEAFDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|22 -MAVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|20 -MAVALPRDLRQDANLAK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|3 -HETLFLLTFPR ->CIRC-ENST00000614167.2-E2-E1|18 -MFKFMTR ->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 -ARHETLCC ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 -NNLVVDISERAGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 -HETLWLLTFPRERER ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2 -ERLYLCGVTGSPTENCAK ->ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3 -IIGGDTEAWDVQVHGQK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5 -RSKHTVVAYK ->CIRC-ENST00000614167.2-E2-E1|1 -HWEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|16 -FKFMTRR ->ENST00000622235.5|RES-202-G-A|W2F-8|2 ENST00000622235.5|RES-202-G-A|W2F-8|2 -CLTCEPGFNR ->ENST00000614167.2|SNV-1179-G-T|1 -IQGARATLUF ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 -HETLWLLTFPR ->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 -MEGAGPRGAVPAR ->CIRC-ENST00000614167.2-E2-E1|W2F-3|2 -RHFEPDGK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9 -EATEKARHETLWLLTFPR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9 -RLPPCDEFVGARR ->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 -CLTCEPGWNRTK ->CIRC-ENST00000614167.2-E2-E1|4 -RHWEPDGK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|1 -HETLFLLTFPRERER ->CIRC-ENST00000614167.2-E2-E1|2 -HWEPDGK ->ENST00000622235.5|RES-202-G-A|W2F-19|1 ENST00000622235.5|RES-202-G-A|W2F-19|1 -GQQPCTVAEGRCLTCEPGFNRTK +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2 +HTVVAYK +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10 +RLPPCDEFVGAR +>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 +IIGGDTEAWDVQVHGQKIKEATEK >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1 HTVVAYKDAI ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-12|1 -EATEKARHETLFLLTFPR ->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 -TKCDQPCATGFYGEGCSHR ->ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6 -GQQPCTVAEGRCLTCEPGWNR ->CIRC-ENST00000614167.2-E2-E1|17 -MFKFMTRR ->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 -EGAGPRGAVPAR ->CIRC-ENST00000614167.2-E2-E1|12 -MKPLVVDISERAGASVPLR ->CIRC-ENST00000614167.2-E2-E1|23 -MAVALPR ->ENST00000614167.2|RES-101-A-G|W2F-11|1 ENST00000614167.2|RES-101-A-G|W2F-11|1 -NRIIGGDTEAFDVQVHGQKIK ->CIRC-ENST00000614167.2-E2-E1|8 -ARHETFGC ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 -NTLSGMQKFMGEDUNFHER ->CIRC-ENST00000614167.2-E2-E1|W2F-3|1 -RHFEPDGKLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5 +HETLWLLTFPRER +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 +FKFMARR +>CIRC-ENST00000614167.2-E2-E1|11 +KPLVVDISERAGASVPLR >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3 NNLVVDISERAGASVPLR ->CIRC-ENST00000642151.1-E1-E2|3 -PFRRSTSGPSK ->ENST00000614167.2|RES-101-A-G|W2F-16|1 ENST00000614167.2|RES-101-A-G|W2F-16|1 -VFNARNRIIGGDTEAFDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|3 -RHWEPDGKLR ->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 -GQQPCTVAEGRCLTCEPGWNRTK ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1 -MKEEAAA ->CIRC-ENST00000614167.2-E2-E1|W2F-2|1 -HFEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 -MFKFMAR +>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 +ARHETLCC +>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 +CLTCEPGWNR +>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 +MEGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2 +ERLYLCGVTGSPTENCAK +>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 +MEGAGPRGAVPARR +>CIRC-ENST00000614167.2-E2-E1|7 +AGASVPLR +>ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2 +IIGGDTEAWDVQVHGQKIK >ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1 GAVPARRR +>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 +GQQPCTVAEGRCLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|4 +RHWEPDGK >CIRC-ENST00000614167.2-E2-E1|5 AGASVPLRRHWEPDGK ->ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5 -NRIIGGDTEAWDVQVHGQK +>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 +NTLSGMQKFMGEDUNFHER >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1 LYLCGVTGSPTENCAK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 -ERERLYLCGVTGSPTENCAK ->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 -VFNARNRIIGGDTEAWDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|19 -AVALPRDLRQDANLAK +>ENST00000614167.2|SNV-1179-G-T|2 +RIQGARATLUF +>CIRC-ENST00000614167.2-E2-E1|14 +MKPLVVDISER +>ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2 +GAVPARR >ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1 TKCDQPCATGFYGEGCSHRCPPCR ->CIRC-ENST00000614167.2-E2-E1|7 -AGASVPLR ->ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2 -IIGGDTEAWDVQVHGQKIK ->ENST00000622235.5|RES-202-G-A|W2F-19|2 ENST00000622235.5|RES-202-G-A|W2F-19|2 -GQQPCTVAEGRCLTCEPGFNR ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 -MNNLVVDISERAGASVPLRR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 -LPPCDEFVGAR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5 -HETLWLLTFPRER ->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 -EGAGPRGAVPARR ->ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4 -NRIIGGDTEAWDVQVHGQKIK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 -ARHETLWLLTFPR +>ENST00000614167.2|SNV-1179-G-T|1 +IQGARATLUF +>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 +EATEKARHE +>CIRC-ENST00000614167.2-E2-E1|1 +HWEPDGKLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 +ERERLYLCGVTGSPTENCAK +>ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3 +IIGGDTEAWDVQVHGQK >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3 AAGAVFLALSAQLLQARLMKEEAAA ->CIRC-ENST00000614167.2-E2-E1|10 -MKPLVVDISERAGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 -FKFMARR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 -MFKFMARR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9 +EATEKARHETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 +MNNLVVDISER +>CIRC-ENST00000614167.2-E2-E1|22 +MAVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 +MNNLVVDISERAGASVPLRR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7 +ARHETLWLLTFPRER +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6 +LPPCDEFVGARRSK >CIRC-ENST00000642151.1-E1-E2|2 RSTSGPSK +>CIRC-ENST00000614167.2-E2-E1|10 +MKPLVVDISERAGASVPLRR +>CIRC-ENST00000614167.2-E2-E1|2 +HWEPDGK +>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 +VFNARNRIIGGDTEAWDVQVHGQK +>CIRC-ENST00000614167.2-E2-E1|19 +AVALPRDLRQDANLAK >CIRC-ENST00000614167.2-E2-E1|15 EATEKARHETFGC ->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 -MEGAGPRGAVPARR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|1 -ARHETLFLLTFPRER ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 -SKHTVVAYK ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 -MNNLVVDISERAGASVPLR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|W2F-1|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|W2F-1|1 -FRRLPPCDEFVGAR ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|1 -NTLSGMQKFMGED ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 -WRRLPPCDEFVGAR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9 +RLPPCDEFVGARR +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 +MFKFMARR +>ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5 +NRIIGGDTEAWDVQVHGQK +>CIRC-ENST00000614167.2-E2-E1|9 +KPLVVDISERAGASVPLRR >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12 LESTTRKVHR ->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 -IIGGDTEAWDVQVHGQKIKEATEK ->ENST00000614167.2|RES-101-A-G|W2F-9|1 ENST00000614167.2|RES-101-A-G|W2F-9|1 -IIGGDTEAFDVQVHGQKIKEATEK ->ENST00000614167.2|RES-101-A-G|W2F-9|2 ENST00000614167.2|RES-101-A-G|W2F-9|2 -IIGGDTEAFDVQVHGQKIK ->ENST00000614167.2|RES-101-A-G|W2F-11|2 ENST00000614167.2|RES-101-A-G|W2F-11|2 -NRIIGGDTEAFDVQVHGQK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 -SKHTVVAYKDAI +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 +MNNLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 +HETLWLLTFPRERER >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5 NNLVVDISER ->ENST00000622235.5|RES-202-G-A|W2F-8|1 ENST00000622235.5|RES-202-G-A|W2F-8|1 -CLTCEPGFNRTK +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 +SKHTVVAYKDAI +>CIRC-ENST00000614167.2-E2-E1|20 +MAVALPRDLRQDANLAK +>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 +CLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 +HETLWLLTFPR >CIRC-ENST00000614167.2-E2-E1|6 AGASVPLRR ->ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2 -GAVPARR ->CIRC-ENST00000614167.2-E2-E1|14 -MKPLVVDISER ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7 -ARHETLWLLTFPRER +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5 +RSKHTVVAYK +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 +LMKEEAAA +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 +LPPCDEFVGARR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 +LPPCDEFVGAR +>CIRC-ENST00000614167.2-E2-E1|18 +MFKFMTR +>CIRC-ENST00000614167.2-E2-E1|21 +AVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|16 +FKFMTRR +>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 +EGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|12 +MKPLVVDISERAGASVPLR +>ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6 +GQQPCTVAEGRCLTCEPGWNR +>CIRC-ENST00000642151.1-E1-E2|3 +PFRRSTSGPSK >CIRC-ENST00000614167.2-E2-E1|13 KPLVVDISER ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10 -RLPPCDEFVGAR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|2 -ARHETLFLLTFPR ->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 -EATEKARHETLCC +>CIRC-ENST00000614167.2-E2-E1|23 +MAVALPR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 +SKHTVVAYK +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 +MFKFMAR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 +NNLVVDISERAGASVPLRR +>ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4 +NRIIGGDTEAWDVQVHGQKIK +>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 +EGAGPRGAVPARR +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1 +MKEEAAA +>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 +TKCDQPCATGFYGEGCSHR +>CIRC-ENST00000614167.2-E2-E1|8 +ARHETFGC >CIRC-ENST00000642151.1-E1-E2|4 MPFRRSTSGPSK ->ENST00000614167.2|SNV-1179-G-T|2 -RIQGARATLUF ->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 -EATEKARHE +>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 +EATEKARHETLCC +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 +ARHETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|3 +RHWEPDGKLR >CIRC-ENST00000642151.1-E1-E2|1 STSGPSK ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 -LMKEEAAA ->CIRC-ENST00000614167.2-E2-E1|9 -KPLVVDISERAGASVPLRR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6 -LPPCDEFVGARRSK ->CIRC-ENST00000614167.2-E2-E1|W2F-2|2 -HFEPDGK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2 -HTVVAYK ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 -MNNLVVDISER ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|2 -QSDNDVRNTLSGMQKFMGED >ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 NTLSGMQKFMGEDUNFHERK ->CIRC-ENST00000614167.2-E2-E1|21 -AVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|11 -KPLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|17 +MFKFMTRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 +WRRLPPCDEFVGAR diff --git a/test/files/vep/vep_gINDEL.gvf b/test/files/vep/vep_gINDEL.gvf index d1269c9f..0a64ea98 100644 --- a/test/files/vep/vep_gINDEL.gvf +++ b/test/files/vep/vep_gINDEL.gvf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##mopepgen_version=0.12.0 +##mopepgen_version=1.0.1 ##parser=parseVEP ##reference_index= ##genome_fasta=/home/chenghaozhu/projects/private-moPepGen/test/files/genome.fasta diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 90acf801..845f035e 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -1,9 +1,11 @@ """ Test the command line interface """ import argparse import subprocess as sp +import os import sys from test.integration import TestCaseIntegration -from moPepGen import cli +from moPepGen import cli, aa +from moPepGen.gtf import GenomicAnnotation, GenomicAnnotationOnDisk class TestGenerateIndex(TestCaseIntegration): @@ -31,6 +33,7 @@ def test_generate_index_case1(self): args.genome_fasta = self.data_dir / 'genome.fasta' args.annotation_gtf = self.data_dir / 'annotation.gtf' args.proteome_fasta = self.data_dir / 'translate.fasta' + args.gtf_symlink = False args.reference_source = None args.invalid_protein_as_noncoding = False args.cleavage_rule = 'trypsin' @@ -43,6 +46,35 @@ def test_generate_index_case1(self): args.output_dir.mkdir(parents=False, exist_ok=True) cli.generate_index(args) files = {str(file.name) for file in args.output_dir.glob('*')} - expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat', + expected = {'genome.pkl', 'proteome.pkl', + 'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx', 'canonical_peptides.pkl', 'coding_transcripts.pkl'} self.assertEqual(files, expected) + +class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration): + """ Test case for GenomicAnnotationOnDisk """ + def test_generate_index(self): + """ Test generate index """ + proteome = aa.AminoAcidSeqDict() + proteome.dump_fasta(self.data_dir/'translate.fasta') + gtf_file = self.data_dir/'annotation.gtf' + gtf_file2 = self.work_dir/'annotation.gtf' + os.symlink(gtf_file, gtf_file2) + cli.index_gtf(gtf_file2, proteome=proteome) + expect = { + 'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx' + } + received = {str(file.name) for file in self.work_dir.glob('*')} + self.assertEqual(received, expect) + + + def test_load_index(self): + """ test load index """ + anno1 = GenomicAnnotation() + anno1.dump_gtf(self.data_dir/'annotation.gtf') + + anno2 = GenomicAnnotationOnDisk() + anno2.load_index(self.data_dir/'annotation.gtf') + + self.assertEqual(anno1.genes.keys(), anno2.genes.keys()) + self.assertEqual(anno1.transcripts.keys(), anno2.transcripts.keys())