From 8d24b628fc94e6a140ccf8df8b433e15c451f537 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Fri, 23 Jun 2023 18:36:41 -0700 Subject: [PATCH 01/18] fix (GenomicAnnotationOnDisk): class created to keep genomic annotation on disk --- moPepGen/gtf/GenomicAnnotation.py | 14 +- moPepGen/gtf/GenomicAnnotationOnDisk.py | 270 ++++++++++++++++++++++++ moPepGen/gtf/GtfIO.py | 86 ++++---- 3 files changed, 321 insertions(+), 49 deletions(-) create mode 100644 moPepGen/gtf/GenomicAnnotationOnDisk.py diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py index 948dfbf6..bf2142ae 100644 --- a/moPepGen/gtf/GenomicAnnotation.py +++ b/moPepGen/gtf/GenomicAnnotation.py @@ -2,7 +2,7 @@ """ from __future__ import annotations import copy -from typing import List, Dict, Tuple, TYPE_CHECKING +from typing import List, Dict, Tuple, TYPE_CHECKING, IO, Union from moPepGen.SeqFeature import FeatureLocation, SeqFeature from moPepGen import seqvar, err from moPepGen.version import MetaVersion @@ -31,12 +31,8 @@ def __init__(self, genes:Dict[str,GeneAnnotationModel]=None, transcripts:Dict[str, TranscriptAnnotationModel]=None, source:str=None): """ Construct a GenomicAnnotation object """ - if genes is None: - genes = {} - if transcripts is None: - transcripts = {} - self.genes = genes - self.transcripts = transcripts + self.genes = genes or {} + self.transcripts = transcripts or {} self.source = source self.gene_id_version_mapper:Dict[str, str] = None self.version = MetaVersion() @@ -106,7 +102,7 @@ def remove_cached_tx_seq(self): for tx_id in self._cached_tx_seqs: self.transcripts[tx_id].remove_cached_seq() - def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None: + def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=None)->None: """ Dump a GTF file into a GenomicAnnotation Args: @@ -119,7 +115,7 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None: if not source: count = 0 inferred = {} - for record in GtfIO.parse(path): + for record in GtfIO.parse(handle): if biotype is not None and record.biotype not in biotype: continue diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py new file mode 100644 index 00000000..06128e11 --- /dev/null +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -0,0 +1,270 @@ +""" Module for GenomicAnnotationOnDisk """ +from __future__ import annotations +from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING +import io +from collections import deque +from pathlib import Path +from moPepGen.gtf import GtfIO +from moPepGen.version import MetaVersion +from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation +from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel +from moPepGen.gtf.TranscriptAnnotationModel import ( + TranscriptAnnotationModel, + GTF_FEATURE_TYPES +) + + +if TYPE_CHECKING: + from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature + +GENE_DICT_CACHE_SIZE = 10 +TX_DICT_CACHE_SIZE = 10 + +class GTFPointer(): + """ """ + def __init__(self, handle:IO, key:str, start:int, end:int, source:str): + """ """ + self.handle = handle + self.key = key + self.start = start + self.end = end + self.source = source + + def __len__(self) -> int: + """ length """ + return self.end - self.start + +class GenePointer(GTFPointer): + """ """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + transcripts:List[str]=None): + super().__init__(handle, key, start, end, source) + self.transcripts = transcripts or [] + + def load(self) -> GeneAnnotationModel: + """ """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').split('\n') + if len(lines) > 0: + raise ValueError(f"Multiple lines found for gene {self.key}") + record = GtfIO.line_to_seq_feature(lines[0]) + record.__class__ = GeneAnnotationModel + record.exons = [] + record.transcripts = self.transcripts + return record + +class TranscriptPointer(GTFPointer): + """ """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + is_protein_coding:bool=None): + super().__init__(handle, key, start, end, source) + self.is_protein_coding = is_protein_coding + + def load(self) -> TranscriptAnnotationModel: + """ """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').rstrip().split('\n') + tx_model = TranscriptAnnotationModel() + + for line in lines: + record = GtfIO.line_to_seq_feature(line) + feature = record.type.lower() + if feature not in GTF_FEATURE_TYPES: + continue + tx_id = record.transcript_id + if tx_id != self.key: + raise ValueError("Loaded transcript do not match.") + record.id = tx_id + tx_model.add_record(feature, record) + return tx_model + +def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: + """ """ + if not source: + count = 0 + inferred = {} + + cur_gene_id:str = None + cur_tx_id:str = None + cur_gene_pointer = None + cur_tx_pointer = None + line_end = 0 + for line in handle: + line:bytes + line_start = line_end + line_end += len(line) + line = line.decode('utf-8') + + if line.startswith('#'): + continue + + record = GtfIO.line_to_seq_feature(line) + + if not source: + if count > 100: + inferred = sorted(inferred.items(), key=lambda x: x[1]) + source = inferred[-1][0] + record.source = source + else: + count += 1 + record.infer_annotation_source() + inferred_source = record.source + if inferred_source not in inferred: + inferred[inferred_source] = 1 + else: + inferred[inferred_source] += 1 + else: + record.source = source + + if record.type.lower() == 'gene': + if cur_gene_pointer: + yield cur_gene_pointer + if cur_tx_pointer: + yield cur_tx_pointer + cur_gene_id = record.gene_id + cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source) + cur_tx_id = None + cur_tx_pointer = None + else: + if cur_tx_id != record.transcript_id: + if cur_tx_pointer: + yield cur_tx_pointer + cur_tx_id = record.transcript_id + cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source) + else: + cur_tx_pointer.end = line_end + if cur_gene_pointer: + cur_gene_pointer.transcripts.append(cur_tx_id) + +class GTFPointerDict(dict, Mapping[str, GTFPointer]): + """ """ + def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): + """ """ + for val in kwargs.values(): + self.validate(val, GTFPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GTFPointer) + super().__init__(*args, **kwargs) + + def validate(self, val, type): + """ """ + if not type is GTFPointer or not issubclass(type, GTFPointer): + raise TypeError(f"type = {type} is not supported") + if not isinstance(val, type): + raise TypeError( + f"{self.__class__} only accecpts {type.__class__} objects" + ) + + def __setitem__(self, __key: str, __value: GTFPointer) -> None: + self.validate(__value, GTFPointer) + return super().__setitem__(__key, __value) + + def get_pointer(self, __key: str) -> GTFPointer: + """ """ + return super().__getitem__(__key) + +class GenePointerDict(GTFPointerDict): + """ """ + def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]): + for val in kwargs.values(): + self.validate(val, GenePointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GenePointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str) -> GeneAnnotationModel: + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > GENE_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:GenePointer = self.get_pointer(__key) + return pointer.load() + +class TranscriptPointerDict(GTFPointerDict): + """ """ + def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]): + for val in kwargs.values(): + self.validate(val, TranscriptPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, TranscriptPointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str + ) -> TranscriptAnnotationModel: + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > TX_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:TranscriptPointer = self.get_pointer(__key) + tx_model = pointer.load() + tx_model.is_protein_coding = pointer.is_protein_coding is True + return tx_model + +class GenomicAnnotationOnDisk(GenomicAnnotation): + """ """ + def __init__(self, genes:GenePointerDict=None, + transcripts:TranscriptPointerDict=None, + source:str=None): + """ """ + self.genes = genes or GenePointerDict() + self.transcripts = transcripts or TranscriptPointerDict() + self.source = source + self.gene_id_version_mapper:Dict[str,str] = None + self.version = MetaVersion() + self._cached_tx_seqs = [] + self.handle = None + + def __del__(self): + """ """ + if self.handle: + self.handle.close() + + def dump_gtf(self, *args, **kwargs) -> None: + raise NotImplementedError + + def generate_index(self, handle:Union[IO, str], source:str=None): + """ """ + if isinstance(handle, str): + ihandle = open(handle, 'rb') + elif isinstance(handle, io.IOBase): + ihandle = handle + else: + raise ValueError(f"handle {handle} of type {handle.__class__} is not supported.") + + self.handle = ihandle + + for pointer in iterate_pointer(self.handle): + if isinstance(pointer, GenePointer): + self.genes[pointer.key] = pointer + else: + self.transcripts[pointer.key] = pointer + + def load_index(self, handle:IO): + """ """ + pass + +def index_gtf(handle:Union[str, IO], biotype:List[str]=None, source:str=None): + """""" diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py index 45978103..b59ea88c 100644 --- a/moPepGen/gtf/GtfIO.py +++ b/moPepGen/gtf/GtfIO.py @@ -25,48 +25,54 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]: if line.startswith('#'): continue line = line.rstrip() - fields = line.split('\t') - - try: - strand={'+':1, '-':-1, '?':0}[fields[6]] - except KeyError: - strand=None - - attributes = {} - attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id', - 'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding'] - attribute_list = [field.strip().split(' ', 1) for field in \ - fields[8].rstrip(';').split(';')] - - for key,val in attribute_list: - if key not in attributes_to_keep: - continue - val = val.strip('"') - if key == 'tag': - if key not in attributes: - attributes[key] = [] - attributes[key].append(val) - else: - attributes[key] = val - - location = FeatureLocation( - seqname=fields[0], - start=int(fields[3])-1, - end=int(fields[4]), - strand=strand, - ) - - frame = None if fields[7] == '.' else int(fields[7]) - - record = GTFSeqFeature( - chrom=fields[0], - attributes=attributes, - location=location, - type=fields[2], - frame=frame - ) + record = line_to_seq_feature(line) + yield record +def line_to_seq_feature(line:str) -> GTFSeqFeature: + """ """ + line = line.rstrip() + fields = line.split('\t') + + try: + strand={'+':1, '-':-1, '?':0}[fields[6]] + except KeyError: + strand=None + + attributes = {} + attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id', + 'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding'] + attribute_list = [field.strip().split(' ', 1) for field in \ + fields[8].rstrip(';').split(';')] + + for key,val in attribute_list: + if key not in attributes_to_keep: + continue + val = val.strip('"') + if key == 'tag': + if key not in attributes: + attributes[key] = [] + attributes[key].append(val) + else: + attributes[key] = val + + location = FeatureLocation( + seqname=fields[0], + start=int(fields[3])-1, + end=int(fields[4]), + strand=strand, + ) + + frame = None if fields[7] == '.' else int(fields[7]) + + return GTFSeqFeature( + chrom=fields[0], + attributes=attributes, + location=location, + type=fields[2], + frame=frame + ) + def parse(handle:Union[IO[str], str]) -> GtfIterator: """ Parser for GTF files. """ From 53ea8ddb2c81b121b0387b062bea9d09055b9dec Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sat, 24 Jun 2023 00:20:31 -0700 Subject: [PATCH 02/18] fix (GenomicAnnotationOnDisk): added index_gtf function --- moPepGen/gtf/GenomicAnnotationOnDisk.py | 63 +++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 3 deletions(-) diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 06128e11..c6d28d09 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -2,7 +2,7 @@ from __future__ import annotations from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING import io -from collections import deque +from collections import deque, Counter from pathlib import Path from moPepGen.gtf import GtfIO from moPepGen.version import MetaVersion @@ -56,6 +56,16 @@ def load(self) -> GeneAnnotationModel: record.transcripts = self.transcripts return record + def to_line(self) -> str: + """ """ + fields = [ + self.key, + self.start, + self.end, + ','.join(self.transcripts) + ] + return '\t'.join(fields) + class TranscriptPointer(GTFPointer): """ """ def __init__(self, handle: IO, key: str, start: int, end: int, source: str, @@ -84,6 +94,16 @@ def load(self) -> TranscriptAnnotationModel: tx_model.add_record(feature, record) return tx_model + def to_line(self) -> str: + """ """ + fields = [ + self.key, + self.start, + self.end, + self.is_protein_coding + ] + return "\t".join(fields) + def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: """ """ if not source: @@ -245,6 +265,21 @@ def __del__(self): def dump_gtf(self, *args, **kwargs) -> None: raise NotImplementedError + def infer_source(self): + """ """ + i = 0 + inferred = {} + for tx in self.transcripts.keys(): + i += 1 + if i > 100: + break + s = self.transcripts.get_pointer(tx).source + if s in inferred: + inferred[s] += 1 + else: + inferred[s] = 1 + self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0] + def generate_index(self, handle:Union[IO, str], source:str=None): """ """ if isinstance(handle, str): @@ -256,15 +291,37 @@ def generate_index(self, handle:Union[IO, str], source:str=None): self.handle = ihandle - for pointer in iterate_pointer(self.handle): + for pointer in iterate_pointer(self.handle, source): if isinstance(pointer, GenePointer): self.genes[pointer.key] = pointer else: self.transcripts[pointer.key] = pointer + if source: + self.source = source + else: + self.infer_source() + def load_index(self, handle:IO): """ """ pass -def index_gtf(handle:Union[str, IO], biotype:List[str]=None, source:str=None): +def index_gtf(file:Path, biotype:List[str]=None, source:str=None): """""" + anno = GenomicAnnotationOnDisk() + with open(Path, 'rb') as handle: + anno.generate_index(handle, source) + + gene_idx_file = file.parent/f"{file.stem}_gene.idx" + with open(gene_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for gene in anno.genes.keys(): + gene_pointer:GenePointer = anno.genes.get_pointer(gene) + handle.write(gene_pointer.to_line() + '\n') + + tx_idx_file = file.parent/f"{file.stem}_tx.idx" + with open(tx_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for tx in anno.transcripts.keys(): + tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) + handle.write(tx_pointer.to_line() + '\n') From 39b9c6b8f3a026f83366f25417e8be3e2c6baad2 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sat, 24 Jun 2023 21:31:53 -0700 Subject: [PATCH 03/18] fix (GenomicAnnotationOnDisk): added loader function from gtf idx --- moPepGen/gtf/GTFPointer.py | 240 +++++++++++++++++ moPepGen/gtf/GenomicAnnotationOnDisk.py | 343 ++++++------------------ moPepGen/gtf/__init__.py | 1 + test/integration/test_generate_index.py | 13 + 4 files changed, 339 insertions(+), 258 deletions(-) create mode 100644 moPepGen/gtf/GTFPointer.py diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py new file mode 100644 index 00000000..3a7937c5 --- /dev/null +++ b/moPepGen/gtf/GTFPointer.py @@ -0,0 +1,240 @@ +""" """ +from __future__ import annotations +from typing import Dict, Set, Iterable, Mapping, Union, IO, TYPE_CHECKING +from collections import deque +from moPepGen.gtf import GtfIO +from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel +from moPepGen.gtf.TranscriptAnnotationModel import TranscriptAnnotationModel +from moPepGen.gtf.TranscriptAnnotationModel import ( + TranscriptAnnotationModel, + GTF_FEATURE_TYPES +) + +if TYPE_CHECKING: + from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature + +GENE_DICT_CACHE_SIZE = 10 +TX_DICT_CACHE_SIZE = 10 + +class GTFPointer(): + """ """ + def __init__(self, handle:IO, key:str, start:int, end:int, source:str): + """ """ + self.handle = handle + self.key = key + self.start = start + self.end = end + self.source = source + + def __len__(self) -> int: + """ length """ + return self.end - self.start + +class GenePointer(GTFPointer): + """ """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + transcripts:Set[str]=None): + super().__init__(handle, key, start, end, source) + self.transcripts = transcripts or set() + + def load(self) -> GeneAnnotationModel: + """ """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').split('\n') + if len(lines) > 0: + raise ValueError(f"Multiple lines found for gene {self.key}") + record = GtfIO.line_to_seq_feature(lines[0]) + record.__class__ = GeneAnnotationModel + record.exons = [] + record.transcripts = list(self.transcripts) + return record + + def to_line(self) -> str: + """ """ + fields = [ + self.key, + str(self.start), + str(self.end), + ','.join(self.transcripts) + ] + return '\t'.join(fields) + +class TranscriptPointer(GTFPointer): + """ """ + def __init__(self, handle: IO, key: str, start: int, end: int, source: str, + is_protein_coding:bool=None): + super().__init__(handle, key, start, end, source) + self.is_protein_coding = is_protein_coding + + def load(self) -> TranscriptAnnotationModel: + """ """ + cur = self.handle.tell() + offset = self.start - cur + self.handle.seek(offset, 1) + buffer:bytes = self.handle.read(len(self)) + lines = buffer.decode('utf-8').rstrip().split('\n') + tx_model = TranscriptAnnotationModel() + + for line in lines: + record = GtfIO.line_to_seq_feature(line) + feature = record.type.lower() + if feature not in GTF_FEATURE_TYPES: + continue + tx_id = record.transcript_id + if tx_id != self.key: + raise ValueError("Loaded transcript do not match.") + record.id = tx_id + tx_model.add_record(feature, record) + return tx_model + + def to_line(self) -> str: + """ """ + fields = [ + self.key, + str(self.start), + str(self.end), + str(self.is_protein_coding) + ] + return "\t".join(fields) + +def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: + """ """ + if not source: + count = 0 + inferred = {} + + cur_gene_id:str = None + cur_tx_id:str = None + cur_gene_pointer = None + cur_tx_pointer = None + line_end = 0 + for line in handle: + line:bytes + line_start = line_end + line_end += len(line) + line = line.decode('utf-8') + + if line.startswith('#'): + continue + + record = GtfIO.line_to_seq_feature(line) + + if not source: + if count > 100: + inferred = sorted(inferred.items(), key=lambda x: x[1]) + source = inferred[-1][0] + record.source = source + else: + count += 1 + record.infer_annotation_source() + inferred_source = record.source + if inferred_source not in inferred: + inferred[inferred_source] = 1 + else: + inferred[inferred_source] += 1 + else: + record.source = source + + if record.type.lower() == 'gene': + if cur_gene_pointer: + yield cur_gene_pointer + if cur_tx_pointer: + yield cur_tx_pointer + cur_gene_id = record.gene_id + cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source) + cur_tx_id = None + cur_tx_pointer = None + else: + if cur_tx_id != record.transcript_id: + if cur_tx_pointer: + yield cur_tx_pointer + cur_tx_id = record.transcript_id + cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source) + else: + cur_tx_pointer.end = line_end + if cur_gene_pointer: + cur_gene_pointer.transcripts.add(cur_tx_id) + +class GTFPointerDict(dict, Mapping[str, GTFPointer]): + """ """ + def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): + """ """ + for val in kwargs.values(): + self.validate(val, GTFPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GTFPointer) + super().__init__(*args, **kwargs) + + def validate(self, val, type): + """ """ + if not type is GTFPointer or not issubclass(type, GTFPointer): + raise TypeError(f"type = {type} is not supported") + if not isinstance(val, type): + raise TypeError( + f"{self.__class__} only accecpts {type.__class__} objects" + ) + + def __setitem__(self, __key: str, __value: GTFPointer) -> None: + self.validate(__value, GTFPointer) + return super().__setitem__(__key, __value) + + def get_pointer(self, __key: str) -> GTFPointer: + """ """ + return super().__getitem__(__key) + +class GenePointerDict(GTFPointerDict): + """ """ + def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]): + for val in kwargs.values(): + self.validate(val, GenePointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, GenePointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str) -> GeneAnnotationModel: + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > GENE_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:GenePointer = self.get_pointer(__key) + return pointer.load() + +class TranscriptPointerDict(GTFPointerDict): + """ """ + def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]): + for val in kwargs.values(): + self.validate(val, TranscriptPointer) + if args: + if not isinstance(args[0], dict): + raise TypeError('Data type invalid') + for val in args[0].values(): + self.validate(val, TranscriptPointer) + super().__init__(*args, **kwargs) + self._cache = {} + self._cached_keys = deque() + + def __getitem__(self, __key: str + ) -> TranscriptAnnotationModel: + if __key in self._cache: + return self._cache[__key] + self._cached_keys.appendleft(__key) + if len(self._cached_keys) > TX_DICT_CACHE_SIZE: + key_pop = self._cached_keys.pop() + self._cache.pop(key_pop) + pointer:TranscriptPointer = self.get_pointer(__key) + tx_model = pointer.load() + tx_model.is_protein_coding = pointer.is_protein_coding is True + return tx_model diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index c6d28d09..4fbdf324 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -1,248 +1,16 @@ """ Module for GenomicAnnotationOnDisk """ from __future__ import annotations -from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING +from typing import Dict, IO, Union, Tuple import io -from collections import deque, Counter from pathlib import Path -from moPepGen.gtf import GtfIO from moPepGen.version import MetaVersion from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation -from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel -from moPepGen.gtf.TranscriptAnnotationModel import ( - TranscriptAnnotationModel, - GTF_FEATURE_TYPES +from moPepGen.gtf.GTFPointer import ( + GenePointerDict, TranscriptPointerDict, iterate_pointer, + GenePointer, TranscriptPointer ) -if TYPE_CHECKING: - from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature - -GENE_DICT_CACHE_SIZE = 10 -TX_DICT_CACHE_SIZE = 10 - -class GTFPointer(): - """ """ - def __init__(self, handle:IO, key:str, start:int, end:int, source:str): - """ """ - self.handle = handle - self.key = key - self.start = start - self.end = end - self.source = source - - def __len__(self) -> int: - """ length """ - return self.end - self.start - -class GenePointer(GTFPointer): - """ """ - def __init__(self, handle: IO, key: str, start: int, end: int, source: str, - transcripts:List[str]=None): - super().__init__(handle, key, start, end, source) - self.transcripts = transcripts or [] - - def load(self) -> GeneAnnotationModel: - """ """ - cur = self.handle.tell() - offset = self.start - cur - self.handle.seek(offset, 1) - buffer:bytes = self.handle.read(len(self)) - lines = buffer.decode('utf-8').split('\n') - if len(lines) > 0: - raise ValueError(f"Multiple lines found for gene {self.key}") - record = GtfIO.line_to_seq_feature(lines[0]) - record.__class__ = GeneAnnotationModel - record.exons = [] - record.transcripts = self.transcripts - return record - - def to_line(self) -> str: - """ """ - fields = [ - self.key, - self.start, - self.end, - ','.join(self.transcripts) - ] - return '\t'.join(fields) - -class TranscriptPointer(GTFPointer): - """ """ - def __init__(self, handle: IO, key: str, start: int, end: int, source: str, - is_protein_coding:bool=None): - super().__init__(handle, key, start, end, source) - self.is_protein_coding = is_protein_coding - - def load(self) -> TranscriptAnnotationModel: - """ """ - cur = self.handle.tell() - offset = self.start - cur - self.handle.seek(offset, 1) - buffer:bytes = self.handle.read(len(self)) - lines = buffer.decode('utf-8').rstrip().split('\n') - tx_model = TranscriptAnnotationModel() - - for line in lines: - record = GtfIO.line_to_seq_feature(line) - feature = record.type.lower() - if feature not in GTF_FEATURE_TYPES: - continue - tx_id = record.transcript_id - if tx_id != self.key: - raise ValueError("Loaded transcript do not match.") - record.id = tx_id - tx_model.add_record(feature, record) - return tx_model - - def to_line(self) -> str: - """ """ - fields = [ - self.key, - self.start, - self.end, - self.is_protein_coding - ] - return "\t".join(fields) - -def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: - """ """ - if not source: - count = 0 - inferred = {} - - cur_gene_id:str = None - cur_tx_id:str = None - cur_gene_pointer = None - cur_tx_pointer = None - line_end = 0 - for line in handle: - line:bytes - line_start = line_end - line_end += len(line) - line = line.decode('utf-8') - - if line.startswith('#'): - continue - - record = GtfIO.line_to_seq_feature(line) - - if not source: - if count > 100: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - record.source = source - else: - count += 1 - record.infer_annotation_source() - inferred_source = record.source - if inferred_source not in inferred: - inferred[inferred_source] = 1 - else: - inferred[inferred_source] += 1 - else: - record.source = source - - if record.type.lower() == 'gene': - if cur_gene_pointer: - yield cur_gene_pointer - if cur_tx_pointer: - yield cur_tx_pointer - cur_gene_id = record.gene_id - cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source) - cur_tx_id = None - cur_tx_pointer = None - else: - if cur_tx_id != record.transcript_id: - if cur_tx_pointer: - yield cur_tx_pointer - cur_tx_id = record.transcript_id - cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source) - else: - cur_tx_pointer.end = line_end - if cur_gene_pointer: - cur_gene_pointer.transcripts.append(cur_tx_id) - -class GTFPointerDict(dict, Mapping[str, GTFPointer]): - """ """ - def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): - """ """ - for val in kwargs.values(): - self.validate(val, GTFPointer) - if args: - if not isinstance(args[0], dict): - raise TypeError('Data type invalid') - for val in args[0].values(): - self.validate(val, GTFPointer) - super().__init__(*args, **kwargs) - - def validate(self, val, type): - """ """ - if not type is GTFPointer or not issubclass(type, GTFPointer): - raise TypeError(f"type = {type} is not supported") - if not isinstance(val, type): - raise TypeError( - f"{self.__class__} only accecpts {type.__class__} objects" - ) - - def __setitem__(self, __key: str, __value: GTFPointer) -> None: - self.validate(__value, GTFPointer) - return super().__setitem__(__key, __value) - - def get_pointer(self, __key: str) -> GTFPointer: - """ """ - return super().__getitem__(__key) - -class GenePointerDict(GTFPointerDict): - """ """ - def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]): - for val in kwargs.values(): - self.validate(val, GenePointer) - if args: - if not isinstance(args[0], dict): - raise TypeError('Data type invalid') - for val in args[0].values(): - self.validate(val, GenePointer) - super().__init__(*args, **kwargs) - self._cache = {} - self._cached_keys = deque() - - def __getitem__(self, __key: str) -> GeneAnnotationModel: - if __key in self._cache: - return self._cache[__key] - self._cached_keys.appendleft(__key) - if len(self._cached_keys) > GENE_DICT_CACHE_SIZE: - key_pop = self._cached_keys.pop() - self._cache.pop(key_pop) - pointer:GenePointer = self.get_pointer(__key) - return pointer.load() - -class TranscriptPointerDict(GTFPointerDict): - """ """ - def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]): - for val in kwargs.values(): - self.validate(val, TranscriptPointer) - if args: - if not isinstance(args[0], dict): - raise TypeError('Data type invalid') - for val in args[0].values(): - self.validate(val, TranscriptPointer) - super().__init__(*args, **kwargs) - self._cache = {} - self._cached_keys = deque() - - def __getitem__(self, __key: str - ) -> TranscriptAnnotationModel: - if __key in self._cache: - return self._cache[__key] - self._cached_keys.appendleft(__key) - if len(self._cached_keys) > TX_DICT_CACHE_SIZE: - key_pop = self._cached_keys.pop() - self._cache.pop(key_pop) - pointer:TranscriptPointer = self.get_pointer(__key) - tx_model = pointer.load() - tx_model.is_protein_coding = pointer.is_protein_coding is True - return tx_model - class GenomicAnnotationOnDisk(GenomicAnnotation): """ """ def __init__(self, genes:GenePointerDict=None, @@ -280,9 +48,9 @@ def infer_source(self): inferred[s] = 1 self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0] - def generate_index(self, handle:Union[IO, str], source:str=None): + def init_handle(self, handle:Union[str, IO, Path]): """ """ - if isinstance(handle, str): + if isinstance(handle, str) or isinstance(handle, Path): ihandle = open(handle, 'rb') elif isinstance(handle, io.IOBase): ihandle = handle @@ -291,6 +59,19 @@ def generate_index(self, handle:Union[IO, str], source:str=None): self.handle = ihandle + @staticmethod + def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: + """ """ + if isinstance(file, str): + file = Path(file) + gene_idx_file = file.parent/f"{file.stem}_gene.idx" + tx_idx_file = file.parent/f"{file.stem}_tx.idx" + return gene_idx_file, tx_idx_file + + def generate_index(self, handle:Union[IO, str, Path], source:str=None): + """ """ + self.init_handle(handle) + for pointer in iterate_pointer(self.handle, source): if isinstance(pointer, GenePointer): self.genes[pointer.key] = pointer @@ -302,26 +83,72 @@ def generate_index(self, handle:Union[IO, str], source:str=None): else: self.infer_source() - def load_index(self, handle:IO): + def load_index(self, file:Union[str,Path]): """ """ - pass - -def index_gtf(file:Path, biotype:List[str]=None, source:str=None): - """""" - anno = GenomicAnnotationOnDisk() - with open(Path, 'rb') as handle: - anno.generate_index(handle, source) + self.init_handle(file) + gene_idx_file, tx_idx_file = self.get_index_files(file) + + gene_source = None + with open(gene_idx_file, 'rt') as handle: + for line in handle: + if line.startswith('#'): + line = line.lstrip('#').lstrip() + if line.startswith('source='): + gene_source = line.split('=')[1] + continue + + if not gene_source: + raise ValueError("Cannot find source from gene idx.") + + fields = line.rstrip().split('\t') + pointer = GenePointer( + self.handle, key=fields[0], + start=int(fields[1]), end=int(fields[2]), + source=gene_source, transcripts=fields[3].split(',') + ) + self.genes[pointer.key] = pointer - gene_idx_file = file.parent/f"{file.stem}_gene.idx" - with open(gene_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") - for gene in anno.genes.keys(): - gene_pointer:GenePointer = anno.genes.get_pointer(gene) - handle.write(gene_pointer.to_line() + '\n') + tx_source = None + with open(tx_idx_file, 'rt') as handle: + for line in handle: + if line.startswith('#'): + line = line.lstrip('#').lstrip() + if line.startswith('source='): + tx_source = line.split('=')[1] + continue + + if not tx_source: + raise ValueError("Cannot find source from gene idx.") + + if tx_source != gene_source: + raise ValueError('tx_source does not match with gene_source') + + fields = line.rstrip().split('\t') + is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]] + pointer = TranscriptPointer( + handle=self.handle, key=fields[0], + start=int(fields[1]), end=int(fields[2]), + source=tx_source, is_protein_coding=is_protein_coding + ) + self.transcripts[pointer.key] = pointer - tx_idx_file = file.parent/f"{file.stem}_tx.idx" - with open(tx_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") - for tx in anno.transcripts.keys(): - tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) - handle.write(tx_pointer.to_line() + '\n') + @classmethod + def index_gtf(cls, file:Path, source:str=None): + """""" + anno = GenomicAnnotationOnDisk() + with open(file, 'rb') as handle: + anno.generate_index(handle, source) + + gene_idx_file, tx_idx_file = cls.get_index_files(file) + + with open(gene_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for gene in anno.genes.keys(): + gene_pointer:GenePointer = anno.genes.get_pointer(gene) + handle.write(gene_pointer.to_line() + '\n') + + with open(tx_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for tx in anno.transcripts.keys(): + tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) + handle.write(tx_pointer.to_line() + '\n') diff --git a/moPepGen/gtf/__init__.py b/moPepGen/gtf/__init__.py index b8fd2342..1118af9a 100644 --- a/moPepGen/gtf/__init__.py +++ b/moPepGen/gtf/__init__.py @@ -3,3 +3,4 @@ from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES from .GenomicAnnotation import GenomicAnnotation from .GeneAnnotationModel import GeneAnnotationModel +from .GenomicAnnotationOnDisk import GenomicAnnotationOnDisk diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 90acf801..569f572c 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -4,6 +4,7 @@ import sys from test.integration import TestCaseIntegration from moPepGen import cli +from moPepGen.gtf import GenomicAnnotationOnDisk class TestGenerateIndex(TestCaseIntegration): @@ -46,3 +47,15 @@ def test_generate_index_case1(self): expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat', 'canonical_peptides.pkl', 'coding_transcripts.pkl'} self.assertEqual(files, expected) + +class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration): + """ """ + def test_generate_index(self): + """ """ + anno = GenomicAnnotationOnDisk() + anno.index_gtf(self.data_dir/'annotation.gtf') + + def test_load_index(self): + """ """ + anno = GenomicAnnotationOnDisk() + anno.load_index(self.data_dir/'annotation.gtf') From dc149caac1562eb697ba9adbc7b0333b00efaf91 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Sat, 24 Jun 2023 23:36:50 -0700 Subject: [PATCH 04/18] fix (GenomicAnnotationOnDisk): check protein coding when generating gtf index --- moPepGen/cli/__init__.py | 2 +- moPepGen/cli/generate_index.py | 30 +++++++++++++++++++++++-- moPepGen/gtf/GenomicAnnotationOnDisk.py | 21 ----------------- test/integration/test_generate_index.py | 6 +++-- 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py index 90f9e0a1..3881c6ea 100644 --- a/moPepGen/cli/__init__.py +++ b/moPepGen/cli/__init__.py @@ -1,5 +1,5 @@ """ Module for command line interface """ -from .generate_index import add_subparser_generate_index, generate_index +from .generate_index import add_subparser_generate_index, generate_index, index_gtf from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index fa558e6e..cb19dc83 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -6,6 +6,7 @@ moPepGen to avoid processing the reference files repeatedly and save massive time. """ from __future__ import annotations +from typing import TYPE_CHECKING import argparse from pathlib import Path import pickle @@ -15,6 +16,9 @@ from moPepGen.cli import common +if TYPE_CHECKING: + from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer + # pylint: disable=W0212 def add_subparser_generate_index(subparsers:argparse._SubParsersAction): """ CLI for moPepGen generateIndex """ @@ -41,6 +45,30 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction): return p +def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, + invalid_protein_as_noncoding:bool=True): + """""" + anno = gtf.GenomicAnnotationOnDisk() + with open(file, 'rb') as handle: + anno.generate_index(handle, source) + + if proteome: + anno.check_protein_coding(proteome, invalid_protein_as_noncoding) + + gene_idx_file, tx_idx_file = anno.get_index_files(file) + + with open(gene_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for gene in anno.genes.keys(): + gene_pointer:GenePointer = anno.genes.get_pointer(gene) + handle.write(gene_pointer.to_line() + '\n') + + with open(tx_idx_file, 'wt') as handle: + handle.write(f"# source={anno.source}\n") + for tx in anno.transcripts.keys(): + tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) + handle.write(tx_pointer.to_line() + '\n') + def generate_index(args:argparse.Namespace): """ Generate """ path_genome:Path = args.genome_fasta @@ -85,8 +113,6 @@ def generate_index(args:argparse.Namespace): proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(parth_proteome, source=args.reference_source) - anno.check_protein_coding(proteome, invalid_protein_as_noncoding) - with lzma.open(output_anno, 'wt') as handle: GtfIO.write(handle, anno) if not quiet: diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 4fbdf324..ad6d23de 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -131,24 +131,3 @@ def load_index(self, file:Union[str,Path]): source=tx_source, is_protein_coding=is_protein_coding ) self.transcripts[pointer.key] = pointer - - @classmethod - def index_gtf(cls, file:Path, source:str=None): - """""" - anno = GenomicAnnotationOnDisk() - with open(file, 'rb') as handle: - anno.generate_index(handle, source) - - gene_idx_file, tx_idx_file = cls.get_index_files(file) - - with open(gene_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") - for gene in anno.genes.keys(): - gene_pointer:GenePointer = anno.genes.get_pointer(gene) - handle.write(gene_pointer.to_line() + '\n') - - with open(tx_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") - for tx in anno.transcripts.keys(): - tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) - handle.write(tx_pointer.to_line() + '\n') diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 569f572c..5bdc075d 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -3,7 +3,7 @@ import subprocess as sp import sys from test.integration import TestCaseIntegration -from moPepGen import cli +from moPepGen import cli, aa from moPepGen.gtf import GenomicAnnotationOnDisk @@ -52,8 +52,10 @@ class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration): """ """ def test_generate_index(self): """ """ + proteome = aa.AminoAcidSeqDict() + proteome.dump_fasta(self.data_dir/'translate.fasta') anno = GenomicAnnotationOnDisk() - anno.index_gtf(self.data_dir/'annotation.gtf') + cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome) def test_load_index(self): """ """ From d435bca79f67900ce8919d6497b203a8cbc5e41d Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Mon, 26 Jun 2023 23:45:20 -0700 Subject: [PATCH 05/18] fix (GenerateIndex): moving to use gene and tx index --- moPepGen/cli/generate_index.py | 61 +++++++++++++++++++++---- moPepGen/gtf/GenomicAnnotationOnDisk.py | 19 ++++++-- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index cb19dc83..d93a8700 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -8,9 +8,11 @@ from __future__ import annotations from typing import TYPE_CHECKING import argparse +import os from pathlib import Path import pickle import lzma +import gzip from moPepGen import dna, aa, gtf, logger from moPepGen.gtf import GtfIO from moPepGen.cli import common @@ -49,7 +51,14 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, invalid_protein_as_noncoding:bool=True): """""" anno = gtf.GenomicAnnotationOnDisk() - with open(file, 'rb') as handle: + + if file.suffix.lower() == '.gtf': + opener = open + elif file.suffix.lower() == '.gz': + opener = gzip.open + elif file.suffix.lower() == '.lz': + opener = lzma.open + with opener(file, 'rb') as handle: anno.generate_index(handle, source) if proteome: @@ -69,6 +78,37 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) handle.write(tx_pointer.to_line() + '\n') +def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path: + """ """ + if symlink: + if file.suffix.lower() == '.gz': + output_file = output_dir/'annotation.gtf.gz' + elif file.suffix.lower() == '.gtf': + output_file = output_dir/'annotation.gtf' + else: + raise ValueError(f"File not supported {file}") + os.symlink(file, output_file) + else: + if file.suffix.lower() == '.gtf': + i_opener = open + elif file.suffix.lower() == '.gz': + i_opener = gzip.open + + if compression == None: + o_opener = open + output_file = output_dir/'annotation.gtf' + elif compression == 'lzma': + o_opener = lzma.open + output_file = output_dir/'annotation.gtf.lz' + elif compression == 'gzip': + o_opener = gzip.open + output_file = output_dir/'annotation.gtf.gz' + + with i_opener(file, 'rt') as ihandle, o_opener(output_file, 'wt') as ohandle: + for line in ihandle: + ohandle.write(line) + return output_file + def generate_index(args:argparse.Namespace): """ Generate """ path_genome:Path = args.genome_fasta @@ -84,6 +124,9 @@ def generate_index(args:argparse.Namespace): invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding quiet:bool = args.quiet + symlink:bool = False + compression:bool = 'lzma' + output_dir:Path = args.output_dir output_genome = output_dir/"genome.pkl" output_proteome = output_dir/"proteome.pkl" @@ -105,26 +148,24 @@ def generate_index(args:argparse.Namespace): logger('Genome FASTA saved to disk.') del genome - anno = gtf.GenomicAnnotation() - anno.dump_gtf(path_gtf, source=args.reference_source) - if not quiet: - logger('Genome annotation GTF loaded.') - proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(parth_proteome, source=args.reference_source) + if not quiet: + logger('Proteome FASTA loaded.') - with lzma.open(output_anno, 'wt') as handle: - GtfIO.write(handle, anno) + output_gtf = create_gtf_copy(path_gtf, output_dir, symlink, compression) + index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding) if not quiet: logger('Genome annotation GTF saved to disk.') - if not quiet: - logger('Proteome FASTA loaded.') with open(output_proteome, 'wb') as handle: pickle.dump(proteome, handle) if not quiet: logger('Proteome FASTA saved to disk.') + anno = gtf.GenomicAnnotationOnDisk() + anno.load_index(output_gtf) + # canoincal peptide pool canonical_peptides = proteome.create_unique_peptide_pool( anno=anno, rule=rule, exception=exception, miscleavage=miscleavage, diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index ad6d23de..13e78d5b 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -2,6 +2,8 @@ from __future__ import annotations from typing import Dict, IO, Union, Tuple import io +import gzip +import lzma from pathlib import Path from moPepGen.version import MetaVersion from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation @@ -50,8 +52,16 @@ def infer_source(self): def init_handle(self, handle:Union[str, IO, Path]): """ """ - if isinstance(handle, str) or isinstance(handle, Path): - ihandle = open(handle, 'rb') + if isinstance(handle, str): + handle = Path(handle) + + if isinstance(handle, Path): + if handle.suffix.lower() == '.gtf': + ihandle = open(handle, 'rb') + elif handle.suffix.lower() == '.lz': + ihandle = lzma.open(handle, 'rb') + elif handle.suffix.lower() == '.gz': + ihandle = gzip.open(handle, 'rb') elif isinstance(handle, io.IOBase): ihandle = handle else: @@ -64,8 +74,9 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: """ """ if isinstance(file, str): file = Path(file) - gene_idx_file = file.parent/f"{file.stem}_gene.idx" - tx_idx_file = file.parent/f"{file.stem}_tx.idx" + basename = str(file.name).split('.gtf')[0] + gene_idx_file = file.parent/f"{basename}_gene.idx" + tx_idx_file = file.parent/f"{basename}_tx.idx" return gene_idx_file, tx_idx_file def generate_index(self, handle:Union[IO, str, Path], source:str=None): From 81765b5b811248df96f8eb3e23149c2465fc2bcd Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 27 Jun 2023 18:13:43 -0700 Subject: [PATCH 06/18] fix (moPepGen): test cases passed --- moPepGen/__init__.py | 2 +- moPepGen/cli/call_alt_translation.py | 3 +- moPepGen/cli/call_noncoding_peptide.py | 3 +- moPepGen/cli/common.py | 16 +- moPepGen/cli/generate_index.py | 8 +- moPepGen/gtf/GTFIndexMetadata.py | 46 ++++ moPepGen/gtf/GTFPointer.py | 26 +- moPepGen/gtf/GenomicAnnotationOnDisk.py | 47 ++-- moPepGen/version.py | 13 +- test/files/fusion/star_fusion.gvf | 6 +- test/files/peptides/variant.fasta | 332 ++++++++++-------------- test/files/vep/vep_gINDEL.gvf | 2 +- test/integration/test_generate_index.py | 3 +- 13 files changed, 266 insertions(+), 241 deletions(-) create mode 100644 moPepGen/gtf/GTFIndexMetadata.py diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py index 87b412aa..7952e5dd 100644 --- a/moPepGen/__init__.py +++ b/moPepGen/__init__.py @@ -5,7 +5,7 @@ from typing import Iterable, IO -__version__ = '1.0.0' +__version__ = '1.0.1' ## Error messages ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron' diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py index 5b8ba4de..a75f3f19 100644 --- a/moPepGen/cli/call_alt_translation.py +++ b/moPepGen/cli/call_alt_translation.py @@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None: peptide_pool = aa.VariantPeptidePool() - for tx_id, tx_model in anno.transcripts.items(): + for tx_id in anno.transcripts: + tx_model = anno.transcripts[tx_id] if not tx_model.is_protein_coding: continue diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py index 324267ed..2cc51082 100644 --- a/moPepGen/cli/call_noncoding_peptide.py +++ b/moPepGen/cli/call_noncoding_peptide.py @@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None: orf_pool = [] i = 0 - for tx_id, tx_model in anno.transcripts.items(): + for tx_id in anno.transcripts: + tx_model = anno.transcripts[tx_id] if inclusion_biotypes and \ tx_model.transcript.biotype not in inclusion_biotypes: continue diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py index 71b463c4..34bbf18b 100644 --- a/moPepGen/cli/common.py +++ b/moPepGen/cli/common.py @@ -204,7 +204,7 @@ def add_args_source(parser:argparse.ArgumentParser): def load_references(args:argparse.Namespace, load_genome:bool=True, load_canonical_peptides:bool=True, load_proteome:bool=False, invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False - ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]: + ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]: """ Load reference files. If index_dir is specified, data will be loaded from pickles, otherwise, will read from FASTA and GTF. """ index_dir:Path = args.index_dir @@ -225,11 +225,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True, if not version.is_valid(genome.version): raise err.IndexVersionNotMatchError(version, genome.version) - with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle: - annotation = gtf.GenomicAnnotation() - annotation.dump_gtf(handle) - if not version.is_valid(annotation.version): - raise err.IndexVersionNotMatchError(version, genome.version) + gtf_file = list(index_dir.glob('annotation.gtf*'))[0] + annotation = gtf.GenomicAnnotationOnDisk() + annotation.init_handle(gtf_file) + annotation.load_index(gtf_file) if load_proteome: with open(f'{index_dir}/proteome.pkl', 'rb') as handle: @@ -249,8 +248,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True, if (check_protein_coding is True or load_canonical_peptides is True) and \ args.proteome_fasta is None: raise ValueError('--proteome-fasta was not specified.') - annotation = gtf.GenomicAnnotation() - annotation.dump_gtf(args.annotation_gtf, source=args.reference_source) + annotation = gtf.GenomicAnnotationOnDisk() + annotation.generate_index(args.annotation_gtf, source=args.reference_source) + if not quiet: logger('Annotation GTF loaded.') diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index d93a8700..392d0620 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -16,6 +16,7 @@ from moPepGen import dna, aa, gtf, logger from moPepGen.gtf import GtfIO from moPepGen.cli import common +from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata if TYPE_CHECKING: @@ -65,15 +66,16 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, anno.check_protein_coding(proteome, invalid_protein_as_noncoding) gene_idx_file, tx_idx_file = anno.get_index_files(file) + metadata = GTFIndexMetadata() with open(gene_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") + metadata.write(handle) for gene in anno.genes.keys(): gene_pointer:GenePointer = anno.genes.get_pointer(gene) handle.write(gene_pointer.to_line() + '\n') with open(tx_idx_file, 'wt') as handle: - handle.write(f"# source={anno.source}\n") + metadata.write(handle) for tx in anno.transcripts.keys(): tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) handle.write(tx_pointer.to_line() + '\n') @@ -164,7 +166,7 @@ def generate_index(args:argparse.Namespace): logger('Proteome FASTA saved to disk.') anno = gtf.GenomicAnnotationOnDisk() - anno.load_index(output_gtf) + anno.generate_index(output_gtf) # canoincal peptide pool canonical_peptides = proteome.create_unique_peptide_pool( diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py new file mode 100644 index 00000000..f235a099 --- /dev/null +++ b/moPepGen/gtf/GTFIndexMetadata.py @@ -0,0 +1,46 @@ +""" """ +from typing import IO +from moPepGen.version import MetaVersion + +MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##' + +class GTFIndexMetadata(): + """ """ + def __init__(self, source:str=None, version:MetaVersion=None): + """ """ + self.source = source + self.version = version or MetaVersion() + + def write(self, handle:IO): + """ """ + prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX + handle.write(f"{prefix}source={self.source}\n") + handle.write(f"{prefix}python={self.version.python}\n") + handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n") + handle.write(f"{prefix}biopython={self.version.biopython}\n") + + @classmethod + def parse(cls, handle:IO): + """ """ + pos = handle.tell() + it = handle.readline() + metadata = {} + version = {} + prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX + while it and it.startswith(prefix): + key, val = it.lstrip(prefix).rstrip().split('=', 1) + if val == '': + val = None + if key in ('python', 'moPepGen', 'biopython'): + version[key.lower()] = val + else: + metadata[key] = val + + pos = handle.tell() + it = handle.readline() + + handle.seek(pos) + + version = MetaVersion(**version) + + return cls(source=metadata['source'], version=version) diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py index 3a7937c5..bcc7b04e 100644 --- a/moPepGen/gtf/GTFPointer.py +++ b/moPepGen/gtf/GTFPointer.py @@ -43,8 +43,8 @@ def load(self) -> GeneAnnotationModel: offset = self.start - cur self.handle.seek(offset, 1) buffer:bytes = self.handle.read(len(self)) - lines = buffer.decode('utf-8').split('\n') - if len(lines) > 0: + lines = buffer.decode('utf-8').rstrip().split('\n') + if len(lines) > 1: raise ValueError(f"Multiple lines found for gene {self.key}") record = GtfIO.line_to_seq_feature(lines[0]) record.__class__ = GeneAnnotationModel @@ -88,6 +88,7 @@ def load(self) -> TranscriptAnnotationModel: raise ValueError("Loaded transcript do not match.") record.id = tx_id tx_model.add_record(feature, record) + tx_model.sort_records() return tx_model def to_line(self) -> str: @@ -157,6 +158,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T cur_tx_pointer.end = line_end if cur_gene_pointer: cur_gene_pointer.transcripts.add(cur_tx_id) + if cur_gene_pointer: + yield cur_gene_pointer + if cur_tx_pointer: + yield cur_tx_pointer class GTFPointerDict(dict, Mapping[str, GTFPointer]): """ """ @@ -171,6 +176,11 @@ def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): self.validate(val, GTFPointer) super().__init__(*args, **kwargs) + def __iter__(self) -> Iterable[str]: + """""" + for k in self.keys(): + yield k + def validate(self, val, type): """ """ if not type is GTFPointer or not issubclass(type, GTFPointer): @@ -210,7 +220,9 @@ def __getitem__(self, __key: str) -> GeneAnnotationModel: key_pop = self._cached_keys.pop() self._cache.pop(key_pop) pointer:GenePointer = self.get_pointer(__key) - return pointer.load() + val = pointer.load() + self._cache[__key] = val + return val class TranscriptPointerDict(GTFPointerDict): """ """ @@ -235,6 +247,8 @@ def __getitem__(self, __key: str key_pop = self._cached_keys.pop() self._cache.pop(key_pop) pointer:TranscriptPointer = self.get_pointer(__key) - tx_model = pointer.load() - tx_model.is_protein_coding = pointer.is_protein_coding is True - return tx_model + val = pointer.load() + val.is_protein_coding = pointer.is_protein_coding is True + val.transcript.source = pointer.source + self._cache[__key] = val + return val diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 13e78d5b..55d48cf7 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -5,12 +5,14 @@ import gzip import lzma from pathlib import Path +from moPepGen import err from moPepGen.version import MetaVersion from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation from moPepGen.gtf.GTFPointer import ( GenePointerDict, TranscriptPointerDict, iterate_pointer, GenePointer, TranscriptPointer ) +from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata class GenomicAnnotationOnDisk(GenomicAnnotation): @@ -77,6 +79,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: basename = str(file.name).split('.gtf')[0] gene_idx_file = file.parent/f"{basename}_gene.idx" tx_idx_file = file.parent/f"{basename}_tx.idx" + return gene_idx_file, tx_idx_file def generate_index(self, handle:Union[IO, str, Path], source:str=None): @@ -99,18 +102,25 @@ def load_index(self, file:Union[str,Path]): self.init_handle(file) gene_idx_file, tx_idx_file = self.get_index_files(file) - gene_source = None + if not gene_idx_file.exists(): + raise ValueError(f"Gene index file cannot be found in {file.parent}") + if not tx_idx_file.exists(): + raise ValueError(f"Transcript index file cannot be found in {file.parent}") + + version = MetaVersion() + with open(gene_idx_file, 'rt') as handle: + metadata = GTFIndexMetadata.parse(handle) + if not version.is_valid(metadata.version): + raise err.IndexVersionNotMatchError(version, metadata.version) + + gene_source = metadata.source + if not gene_source: + raise ValueError("Cannot find source from gene idx.") + for line in handle: if line.startswith('#'): - line = line.lstrip('#').lstrip() - if line.startswith('source='): - gene_source = line.split('=')[1] continue - - if not gene_source: - raise ValueError("Cannot find source from gene idx.") - fields = line.rstrip().split('\t') pointer = GenePointer( self.handle, key=fields[0], @@ -119,21 +129,20 @@ def load_index(self, file:Union[str,Path]): ) self.genes[pointer.key] = pointer - tx_source = None with open(tx_idx_file, 'rt') as handle: + metadata = GTFIndexMetadata.parse(handle) + if not version.is_valid(metadata.version): + raise err.IndexVersionNotMatchError(version, metadata.version) + + tx_source = metadata.source + if not tx_source: + raise ValueError("Cannot find source from gene idx.") + if tx_source != gene_source: + raise ValueError('tx_source does not match with gene_source') + for line in handle: if line.startswith('#'): - line = line.lstrip('#').lstrip() - if line.startswith('source='): - tx_source = line.split('=')[1] continue - - if not tx_source: - raise ValueError("Cannot find source from gene idx.") - - if tx_source != gene_source: - raise ValueError('tx_source does not match with gene_source') - fields = line.rstrip().split('\t') is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]] pointer = TranscriptPointer( diff --git a/moPepGen/version.py b/moPepGen/version.py index 01a5b4b9..8c4d450c 100644 --- a/moPepGen/version.py +++ b/moPepGen/version.py @@ -6,15 +6,15 @@ from moPepGen import __version__ -MINIMAL_VERSION = '0.12.0' +MINIMAL_VERSION = '1.0.1' class MetaVersion(): """ Versions """ - def __init__(self): + def __init__(self, python:str=None, biopython:str=None, mopepgen:str=None): """ constructor """ - self.python = sys.version_info[:3] - self.biopython = Bio.__version__ - self.mopepgen = __version__ + self.python = python or '.'.join([str(x) for x in sys.version_info[:3]]) + self.biopython = biopython or Bio.__version__ + self.mopepgen = mopepgen or __version__ def __eq__(self, other:MetaVersion): """ equal to """ @@ -45,5 +45,4 @@ def __ne__(self, other:MetaVersion): def __repr__(self) -> str: """ str representation """ - python = '.'.join([str(x) for x in self.python]) - return f"python={python}, biopython={self.biopython}, moPepGen={self.mopepgen}" + return f"python={self.python}, biopython={self.biopython}, moPepGen={self.mopepgen}" diff --git a/test/files/fusion/star_fusion.gvf b/test/files/fusion/star_fusion.gvf index b11972e9..4d14aa2d 100644 --- a/test/files/fusion/star_fusion.gvf +++ b/test/files/fusion/star_fusion.gvf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##mopepgen_version=0.12.0 +##mopepgen_version=1.0.1 ##parser=parseSTARFusion ##reference_index=/home/chenghaozhu/projects/private-moPepGen/test/files/index ##genome_fasta= @@ -13,7 +13,7 @@ ##INFO= ##INFO= #CHROM POS ID REF ALT QUAL FILTER INFO -ENSG00000128408.9 832 FUSION-ENST00000614167.2:831-ENST00000642151.1:29 A . . TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 -ENSG00000128408.9 832 FUSION-ENST00000614168.2:831-ENST00000642151.1:29 A . . TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 +ENSG00000128408.9 832 FUSION-ENST00000614167.2:831-ENST00000642151.1:29 T . . TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 +ENSG00000128408.9 832 FUSION-ENST00000614168.2:831-ENST00000642151.1:29 T . . TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 ENSG00000244486.9 1752 FUSION-ENST00000622235.5:1751-ENST00000642151.1:29 G . . TRANSCRIPT_ID=ENST00000622235.5;GENE_SYMBOL=SCARF2;GENOMIC_POSITION=chr22:3213:3213;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010 ENSG00000279973.2 93 FUSION-ENST00000624155.2:92-ENST00000622235.5:968 T . . TRANSCRIPT_ID=ENST00000624155.2;GENE_SYMBOL=CU104787.1;GENOMIC_POSITION=chr22:5311:5311;ACCEPTER_GENE_ID=ENSG00000244486.9;ACCEPTER_TRANSCRIPT_ID=ENST00000622235.5;ACCEPTER_SYMBOL=SCARF2;ACCEPTER_POSITION=969;ACCEPTER_GENOMIC_POSITION=chr22:3995:3995 diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta index 8aa7c5ea..2c8adba0 100644 --- a/test/files/peptides/variant.fasta +++ b/test/files/peptides/variant.fasta @@ -1,218 +1,170 @@ ->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 -CLTCEPGWNR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 -LPPCDEFVGARR ->CIRC-ENST00000614167.2-E2-E1|W2F-11|1 -AGASVPLRRHFEPDGK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|2 -HETLFLLTFPRER ->ENST00000614167.2|RES-101-A-G|W2F-9|3 ENST00000614167.2|RES-101-A-G|W2F-9|3 -IIGGDTEAFDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|22 -MAVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|20 -MAVALPRDLRQDANLAK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|3 -HETLFLLTFPR ->CIRC-ENST00000614167.2-E2-E1|18 -MFKFMTR ->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 -ARHETLCC ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 -NNLVVDISERAGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 -HETLWLLTFPRERER +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 +MFKFMAR +>CIRC-ENST00000614167.2-E2-E1|3 +RHWEPDGKLR +>CIRC-ENST00000614167.2-E2-E1|7 +AGASVPLR >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2 ERLYLCGVTGSPTENCAK ->ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3 -IIGGDTEAWDVQVHGQK +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2 +HTVVAYK +>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 +VFNARNRIIGGDTEAWDVQVHGQK +>CIRC-ENST00000614167.2-E2-E1|4 +RHWEPDGK +>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 +MEGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 +MNNLVVDISER +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 +FKFMARR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 +NNLVVDISERAGASVPLRR +>CIRC-ENST00000642151.1-E1-E2|1 +STSGPSK +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 +HETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|22 +MAVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|14 +MKPLVVDISER +>CIRC-ENST00000614167.2-E2-E1|11 +KPLVVDISERAGASVPLR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12 +LESTTRKVHR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1 +LYLCGVTGSPTENCAK +>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 +NTLSGMQKFMGEDUNFHERK +>CIRC-ENST00000614167.2-E2-E1|16 +FKFMTRR +>ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1 +GAVPARRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10 +RLPPCDEFVGAR +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 +LMKEEAAA +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 +LPPCDEFVGAR >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5 RSKHTVVAYK +>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 +TKCDQPCATGFYGEGCSHR +>CIRC-ENST00000614167.2-E2-E1|17 +MFKFMTRR +>ENST00000614167.2|SNV-1179-G-T|2 +RIQGARATLUF +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1 +HTVVAYKDAI +>CIRC-ENST00000614167.2-E2-E1|20 +MAVALPRDLRQDANLAK +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5 +HETLWLLTFPRER +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 +SKHTVVAYKDAI +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 +ARHETLWLLTFPR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 +SKHTVVAYK +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5 +NNLVVDISER +>CIRC-ENST00000614167.2-E2-E1|6 +AGASVPLRR >CIRC-ENST00000614167.2-E2-E1|1 HWEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|16 -FKFMTRR ->ENST00000622235.5|RES-202-G-A|W2F-8|2 ENST00000622235.5|RES-202-G-A|W2F-8|2 -CLTCEPGFNR +>CIRC-ENST00000614167.2-E2-E1|9 +KPLVVDISERAGASVPLRR +>CIRC-ENST00000642151.1-E1-E2|4 +MPFRRSTSGPSK +>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 +NTLSGMQKFMGEDUNFHER +>CIRC-ENST00000614167.2-E2-E1|5 +AGASVPLRRHWEPDGK >ENST00000614167.2|SNV-1179-G-T|1 IQGARATLUF ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 -HETLWLLTFPR ->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 -MEGAGPRGAVPAR ->CIRC-ENST00000614167.2-E2-E1|W2F-3|2 -RHFEPDGK +>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 +EGAGPRGAVPARR +>CIRC-ENST00000614167.2-E2-E1|23 +MAVALPR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 +MNNLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|13 +KPLVVDISER +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 +MNNLVVDISERAGASVPLRR +>CIRC-ENST00000614167.2-E2-E1|19 +AVALPRDLRQDANLAK +>CIRC-ENST00000642151.1-E1-E2|2 +RSTSGPSK +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 +HETLWLLTFPRERER >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9 EATEKARHETLWLLTFPR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9 -RLPPCDEFVGARR ->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 -CLTCEPGWNRTK ->CIRC-ENST00000614167.2-E2-E1|4 -RHWEPDGK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|1 -HETLFLLTFPRERER ->CIRC-ENST00000614167.2-E2-E1|2 -HWEPDGK ->ENST00000622235.5|RES-202-G-A|W2F-19|1 ENST00000622235.5|RES-202-G-A|W2F-19|1 -GQQPCTVAEGRCLTCEPGFNRTK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1 -HTVVAYKDAI ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-12|1 -EATEKARHETLFLLTFPR ->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 -TKCDQPCATGFYGEGCSHR ->ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6 -GQQPCTVAEGRCLTCEPGWNR ->CIRC-ENST00000614167.2-E2-E1|17 -MFKFMTRR ->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 -EGAGPRGAVPAR >CIRC-ENST00000614167.2-E2-E1|12 MKPLVVDISERAGASVPLR ->CIRC-ENST00000614167.2-E2-E1|23 -MAVALPR ->ENST00000614167.2|RES-101-A-G|W2F-11|1 ENST00000614167.2|RES-101-A-G|W2F-11|1 -NRIIGGDTEAFDVQVHGQKIK +>ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1 +TKCDQPCATGFYGEGCSHRCPPCR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 +LPPCDEFVGARR >CIRC-ENST00000614167.2-E2-E1|8 ARHETFGC ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 -NTLSGMQKFMGEDUNFHER ->CIRC-ENST00000614167.2-E2-E1|W2F-3|1 -RHFEPDGKLR +>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 +GQQPCTVAEGRCLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|15 +EATEKARHETFGC +>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 +CLTCEPGWNR +>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 +CLTCEPGWNRTK +>ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3 +IIGGDTEAWDVQVHGQK +>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 +ARHETLCC >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3 NNLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7 +ARHETLWLLTFPRER +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6 +LPPCDEFVGARRSK +>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 +EATEKARHE +>CIRC-ENST00000614167.2-E2-E1|21 +AVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 +ERERLYLCGVTGSPTENCAK >CIRC-ENST00000642151.1-E1-E2|3 PFRRSTSGPSK ->ENST00000614167.2|RES-101-A-G|W2F-16|1 ENST00000614167.2|RES-101-A-G|W2F-16|1 -VFNARNRIIGGDTEAFDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|3 -RHWEPDGKLR ->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 -GQQPCTVAEGRCLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|18 +MFKFMTR +>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 +MEGAGPRGAVPARR +>CIRC-ENST00000614167.2-E2-E1|10 +MKPLVVDISERAGASVPLRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 +WRRLPPCDEFVGAR +>CIRC-ENST00000614167.2-E2-E1|2 +HWEPDGK >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1 MKEEAAA ->CIRC-ENST00000614167.2-E2-E1|W2F-2|1 -HFEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 -MFKFMAR ->ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1 -GAVPARRR ->CIRC-ENST00000614167.2-E2-E1|5 -AGASVPLRRHWEPDGK >ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5 NRIIGGDTEAWDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1 -LYLCGVTGSPTENCAK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 -ERERLYLCGVTGSPTENCAK ->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 -VFNARNRIIGGDTEAWDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|19 -AVALPRDLRQDANLAK ->ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1 -TKCDQPCATGFYGEGCSHRCPPCR ->CIRC-ENST00000614167.2-E2-E1|7 -AGASVPLR >ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2 IIGGDTEAWDVQVHGQKIK ->ENST00000622235.5|RES-202-G-A|W2F-19|2 ENST00000622235.5|RES-202-G-A|W2F-19|2 -GQQPCTVAEGRCLTCEPGFNR ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 -MNNLVVDISERAGASVPLRR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 -LPPCDEFVGAR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5 -HETLWLLTFPRER ->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 -EGAGPRGAVPARR ->ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4 -NRIIGGDTEAWDVQVHGQKIK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 -ARHETLWLLTFPR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9 +RLPPCDEFVGARR +>ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6 +GQQPCTVAEGRCLTCEPGWNR +>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 +EATEKARHETLCC >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3 AAGAVFLALSAQLLQARLMKEEAAA ->CIRC-ENST00000614167.2-E2-E1|10 -MKPLVVDISERAGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 -FKFMARR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 -MFKFMARR ->CIRC-ENST00000642151.1-E1-E2|2 -RSTSGPSK ->CIRC-ENST00000614167.2-E2-E1|15 -EATEKARHETFGC ->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 -MEGAGPRGAVPARR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|1 -ARHETLFLLTFPRER ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 -SKHTVVAYK ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 -MNNLVVDISERAGASVPLR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|W2F-1|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|W2F-1|1 -FRRLPPCDEFVGAR ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|1 -NTLSGMQKFMGED ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 -WRRLPPCDEFVGAR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12 -LESTTRKVHR ->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 -IIGGDTEAWDVQVHGQKIKEATEK ->ENST00000614167.2|RES-101-A-G|W2F-9|1 ENST00000614167.2|RES-101-A-G|W2F-9|1 -IIGGDTEAFDVQVHGQKIKEATEK ->ENST00000614167.2|RES-101-A-G|W2F-9|2 ENST00000614167.2|RES-101-A-G|W2F-9|2 -IIGGDTEAFDVQVHGQKIK ->ENST00000614167.2|RES-101-A-G|W2F-11|2 ENST00000614167.2|RES-101-A-G|W2F-11|2 -NRIIGGDTEAFDVQVHGQK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 -SKHTVVAYKDAI ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5 -NNLVVDISER ->ENST00000622235.5|RES-202-G-A|W2F-8|1 ENST00000622235.5|RES-202-G-A|W2F-8|1 -CLTCEPGFNRTK ->CIRC-ENST00000614167.2-E2-E1|6 -AGASVPLRR >ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2 GAVPARR ->CIRC-ENST00000614167.2-E2-E1|14 -MKPLVVDISER ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7 -ARHETLWLLTFPRER ->CIRC-ENST00000614167.2-E2-E1|13 -KPLVVDISER ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10 -RLPPCDEFVGAR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|2 -ARHETLFLLTFPR ->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 -EATEKARHETLCC ->CIRC-ENST00000642151.1-E1-E2|4 -MPFRRSTSGPSK ->ENST00000614167.2|SNV-1179-G-T|2 -RIQGARATLUF ->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 -EATEKARHE ->CIRC-ENST00000642151.1-E1-E2|1 -STSGPSK ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 -LMKEEAAA ->CIRC-ENST00000614167.2-E2-E1|9 -KPLVVDISERAGASVPLRR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6 -LPPCDEFVGARRSK ->CIRC-ENST00000614167.2-E2-E1|W2F-2|2 -HFEPDGK ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2 -HTVVAYK ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 -MNNLVVDISER ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|2 -QSDNDVRNTLSGMQKFMGED ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 -NTLSGMQKFMGEDUNFHERK ->CIRC-ENST00000614167.2-E2-E1|21 -AVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|11 -KPLVVDISERAGASVPLR +>ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4 +NRIIGGDTEAWDVQVHGQKIK +>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 +IIGGDTEAWDVQVHGQKIKEATEK +>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 +EGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 +MFKFMARR diff --git a/test/files/vep/vep_gINDEL.gvf b/test/files/vep/vep_gINDEL.gvf index d1269c9f..0a64ea98 100644 --- a/test/files/vep/vep_gINDEL.gvf +++ b/test/files/vep/vep_gINDEL.gvf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.2 -##mopepgen_version=0.12.0 +##mopepgen_version=1.0.1 ##parser=parseVEP ##reference_index= ##genome_fasta=/home/chenghaozhu/projects/private-moPepGen/test/files/genome.fasta diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 5bdc075d..dc527eec 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -44,7 +44,8 @@ def test_generate_index_case1(self): args.output_dir.mkdir(parents=False, exist_ok=True) cli.generate_index(args) files = {str(file.name) for file in args.output_dir.glob('*')} - expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat', + expected = {'genome.pkl', 'proteome.pkl', + 'annotation.gtf.lz', 'annotation_gene.idx', 'annotation_tx.idx', 'canonical_peptides.pkl', 'coding_transcripts.pkl'} self.assertEqual(files, expected) From 9c75b60c57cea1730a7f67e64578ef887193ae11 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Tue, 27 Jun 2023 18:18:24 -0700 Subject: [PATCH 07/18] fix (moPepGen): docstrings added --- moPepGen/cli/generate_index.py | 6 +++--- moPepGen/gtf/GTFIndexMetadata.py | 10 +++++----- moPepGen/gtf/GenomicAnnotationOnDisk.py | 17 +++++++++-------- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index 392d0620..47d2190c 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -50,7 +50,7 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction): def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, invalid_protein_as_noncoding:bool=True): - """""" + """ Index a GTF file. """ anno = gtf.GenomicAnnotationOnDisk() if file.suffix.lower() == '.gtf': @@ -81,7 +81,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, handle.write(tx_pointer.to_line() + '\n') def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path: - """ """ + """ Create copy of GTF """ if symlink: if file.suffix.lower() == '.gz': output_file = output_dir/'annotation.gtf.gz' @@ -112,7 +112,7 @@ def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:s return output_file def generate_index(args:argparse.Namespace): - """ Generate """ + """ Generate index """ path_genome:Path = args.genome_fasta path_gtf:Path = args.annotation_gtf parth_proteome:Path = args.proteome_fasta diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py index f235a099..1f7b8c7d 100644 --- a/moPepGen/gtf/GTFIndexMetadata.py +++ b/moPepGen/gtf/GTFIndexMetadata.py @@ -1,18 +1,18 @@ -""" """ +""" GTFIndexMetadata """ from typing import IO from moPepGen.version import MetaVersion MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##' class GTFIndexMetadata(): - """ """ + """ Class for GTF index metadata. """ def __init__(self, source:str=None, version:MetaVersion=None): - """ """ + """ constructor """ self.source = source self.version = version or MetaVersion() def write(self, handle:IO): - """ """ + """ Write metadata to handle. """ prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX handle.write(f"{prefix}source={self.source}\n") handle.write(f"{prefix}python={self.version.python}\n") @@ -21,7 +21,7 @@ def write(self, handle:IO): @classmethod def parse(cls, handle:IO): - """ """ + """ Read metadata from handle """ pos = handle.tell() it = handle.readline() metadata = {} diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 55d48cf7..0ddfa834 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -16,11 +16,11 @@ class GenomicAnnotationOnDisk(GenomicAnnotation): - """ """ + """ An on disk version of GenomicAnnotation that uses minimal memory. """ def __init__(self, genes:GenePointerDict=None, transcripts:TranscriptPointerDict=None, source:str=None): - """ """ + """ Constructur """ self.genes = genes or GenePointerDict() self.transcripts = transcripts or TranscriptPointerDict() self.source = source @@ -30,15 +30,16 @@ def __init__(self, genes:GenePointerDict=None, self.handle = None def __del__(self): - """ """ + """ Hook for deconstruction """ if self.handle: self.handle.close() def dump_gtf(self, *args, **kwargs) -> None: + """ Forbidden """ raise NotImplementedError def infer_source(self): - """ """ + """ Infer GTF source (GENCODE vs ENSEMBL) from data. """ i = 0 inferred = {} for tx in self.transcripts.keys(): @@ -53,7 +54,7 @@ def infer_source(self): self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0] def init_handle(self, handle:Union[str, IO, Path]): - """ """ + """ Initiate file handle """ if isinstance(handle, str): handle = Path(handle) @@ -73,7 +74,7 @@ def init_handle(self, handle:Union[str, IO, Path]): @staticmethod def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: - """ """ + """ Get index file paths given the GTF file path. """ if isinstance(file, str): file = Path(file) basename = str(file.name).split('.gtf')[0] @@ -83,7 +84,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: return gene_idx_file, tx_idx_file def generate_index(self, handle:Union[IO, str, Path], source:str=None): - """ """ + """ Generate GTF index """ self.init_handle(handle) for pointer in iterate_pointer(self.handle, source): @@ -98,7 +99,7 @@ def generate_index(self, handle:Union[IO, str, Path], source:str=None): self.infer_source() def load_index(self, file:Union[str,Path]): - """ """ + """ Load index from idx files """ self.init_handle(file) gene_idx_file, tx_idx_file = self.get_index_files(file) From 79771ac44020ed9a9bda52081cae3d2332e323ff Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Tue, 27 Jun 2023 22:20:09 -0700 Subject: [PATCH 08/18] fix (moPepGen): update version to 1.1.0 --- moPepGen/__init__.py | 2 +- moPepGen/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py index 7952e5dd..50556536 100644 --- a/moPepGen/__init__.py +++ b/moPepGen/__init__.py @@ -5,7 +5,7 @@ from typing import Iterable, IO -__version__ = '1.0.1' +__version__ = '1.1.0' ## Error messages ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron' diff --git a/moPepGen/version.py b/moPepGen/version.py index 8c4d450c..71d16bdd 100644 --- a/moPepGen/version.py +++ b/moPepGen/version.py @@ -6,7 +6,7 @@ from moPepGen import __version__ -MINIMAL_VERSION = '1.0.1' +MINIMAL_VERSION = '1.1.0' class MetaVersion(): """ Versions """ From ff4ba9f7e4b2715e71e3372b35f6b38172bc93de Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Wed, 28 Jun 2023 09:57:33 -0700 Subject: [PATCH 09/18] fix (generateIndex): trying to set compression to none --- moPepGen/cli/generate_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index 47d2190c..3e4acd08 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -127,7 +127,7 @@ def generate_index(args:argparse.Namespace): quiet:bool = args.quiet symlink:bool = False - compression:bool = 'lzma' + compression:bool = None output_dir:Path = args.output_dir output_genome = output_dir/"genome.pkl" From b1ac30cd5af6f3a0ac3118813e9994a8d718a310 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Wed, 28 Jun 2023 18:14:03 -0700 Subject: [PATCH 10/18] fix (generateIndex): remove code to compress the copied gtf file --- moPepGen/cli/generate_index.py | 61 ++--- test/files/peptides/variant.fasta | 284 ++++++++++++------------ test/integration/test_generate_index.py | 3 +- 3 files changed, 167 insertions(+), 181 deletions(-) diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index 3e4acd08..77dc32d1 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -9,10 +9,10 @@ from typing import TYPE_CHECKING import argparse import os +import shutil +import gzip from pathlib import Path import pickle -import lzma -import gzip from moPepGen import dna, aa, gtf, logger from moPepGen.gtf import GtfIO from moPepGen.cli import common @@ -25,7 +25,7 @@ # pylint: disable=W0212 def add_subparser_generate_index(subparsers:argparse._SubParsersAction): """ CLI for moPepGen generateIndex """ - p = subparsers.add_parser( + p:argparse.ArgumentParser = subparsers.add_parser( name='generateIndex', help='Generate genome and proteome index files for moPepGen', description='Generate genome and proteome index files for moPepGen' @@ -40,6 +40,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction): dest='output_dir', required=True ) + p.add_argument( + '--gtf-symlink', + help='Create a symlink of the GTF file instead of copying it.', + default=False + ) common.add_args_reference(p, index=False) common.add_args_cleavage(p) common.add_args_quiet(p) @@ -53,13 +58,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, """ Index a GTF file. """ anno = gtf.GenomicAnnotationOnDisk() - if file.suffix.lower() == '.gtf': - opener = open - elif file.suffix.lower() == '.gz': - opener = gzip.open - elif file.suffix.lower() == '.lz': - opener = lzma.open - with opener(file, 'rb') as handle: + with open(file, 'rb') as handle: anno.generate_index(handle, source) if proteome: @@ -80,33 +79,22 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None, tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx) handle.write(tx_pointer.to_line() + '\n') -def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path: +def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path: """ Create copy of GTF """ + if file.suffix.lower() == '.gz': + if symlink: + symlink = False + elif file.suffix.lower() != '.gtf': + raise ValueError(f"Cannot handle gtf file {file}") + + output_file = output_dir/'annotation.gtf' + if symlink: - if file.suffix.lower() == '.gz': - output_file = output_dir/'annotation.gtf.gz' - elif file.suffix.lower() == '.gtf': - output_file = output_dir/'annotation.gtf' - else: - raise ValueError(f"File not supported {file}") os.symlink(file, output_file) + elif file.suffix.lower() == '.gtf': + shutil.copy2(file, output_file) else: - if file.suffix.lower() == '.gtf': - i_opener = open - elif file.suffix.lower() == '.gz': - i_opener = gzip.open - - if compression == None: - o_opener = open - output_file = output_dir/'annotation.gtf' - elif compression == 'lzma': - o_opener = lzma.open - output_file = output_dir/'annotation.gtf.lz' - elif compression == 'gzip': - o_opener = gzip.open - output_file = output_dir/'annotation.gtf.gz' - - with i_opener(file, 'rt') as ihandle, o_opener(output_file, 'wt') as ohandle: + with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle: for line in ihandle: ohandle.write(line) return output_file @@ -125,14 +113,11 @@ def generate_index(args:argparse.Namespace): exception = 'trypsin_exception' if rule == 'trypsin' else None invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding quiet:bool = args.quiet - - symlink:bool = False - compression:bool = None + symlink:bool = args.gtf_symlink output_dir:Path = args.output_dir output_genome = output_dir/"genome.pkl" output_proteome = output_dir/"proteome.pkl" - output_anno = output_dir/"annotation.dat" output_peptides = output_dir/"canonical_peptides.pkl" output_coding_tx = output_dir/"coding_transcripts.pkl" @@ -155,7 +140,7 @@ def generate_index(args:argparse.Namespace): if not quiet: logger('Proteome FASTA loaded.') - output_gtf = create_gtf_copy(path_gtf, output_dir, symlink, compression) + output_gtf = create_gtf_copy(path_gtf, output_dir, symlink) index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding) if not quiet: logger('Genome annotation GTF saved to disk.') diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta index 2c8adba0..bee351cd 100644 --- a/test/files/peptides/variant.fasta +++ b/test/files/peptides/variant.fasta @@ -1,170 +1,170 @@ ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 -MFKFMAR ->CIRC-ENST00000614167.2-E2-E1|3 -RHWEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|7 -AGASVPLR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2 -ERLYLCGVTGSPTENCAK >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2 HTVVAYK ->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 -VFNARNRIIGGDTEAWDVQVHGQK ->CIRC-ENST00000614167.2-E2-E1|4 -RHWEPDGK ->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 -MEGAGPRGAVPAR ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 -MNNLVVDISER ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 -FKFMARR ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 -NNLVVDISERAGASVPLRR ->CIRC-ENST00000642151.1-E1-E2|1 -STSGPSK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 -HETLWLLTFPR ->CIRC-ENST00000614167.2-E2-E1|22 -MAVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|14 -MKPLVVDISER ->CIRC-ENST00000614167.2-E2-E1|11 -KPLVVDISERAGASVPLR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12 -LESTTRKVHR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1 -LYLCGVTGSPTENCAK ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 -NTLSGMQKFMGEDUNFHERK ->CIRC-ENST00000614167.2-E2-E1|16 -FKFMTRR ->ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1 -GAVPARRR >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10 RLPPCDEFVGAR ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 -LMKEEAAA ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 -LPPCDEFVGAR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5 -RSKHTVVAYK ->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 -TKCDQPCATGFYGEGCSHR ->CIRC-ENST00000614167.2-E2-E1|17 -MFKFMTRR ->ENST00000614167.2|SNV-1179-G-T|2 -RIQGARATLUF +>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 +IIGGDTEAWDVQVHGQKIKEATEK >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1 HTVVAYKDAI ->CIRC-ENST00000614167.2-E2-E1|20 -MAVALPRDLRQDANLAK >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5 HETLWLLTFPRER ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 -SKHTVVAYKDAI ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 -ARHETLWLLTFPR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 -SKHTVVAYK ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5 -NNLVVDISER ->CIRC-ENST00000614167.2-E2-E1|6 -AGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|1 -HWEPDGKLR ->CIRC-ENST00000614167.2-E2-E1|9 -KPLVVDISERAGASVPLRR ->CIRC-ENST00000642151.1-E1-E2|4 -MPFRRSTSGPSK ->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 -NTLSGMQKFMGEDUNFHER +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1 +FKFMARR +>CIRC-ENST00000614167.2-E2-E1|11 +KPLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3 +NNLVVDISERAGASVPLR +>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 +ARHETLCC +>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 +CLTCEPGWNR +>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6 +MEGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2 +ERLYLCGVTGSPTENCAK +>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 +MEGAGPRGAVPARR +>CIRC-ENST00000614167.2-E2-E1|7 +AGASVPLR +>ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2 +IIGGDTEAWDVQVHGQKIK +>ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1 +GAVPARRR +>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 +GQQPCTVAEGRCLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|4 +RHWEPDGK >CIRC-ENST00000614167.2-E2-E1|5 AGASVPLRRHWEPDGK ->ENST00000614167.2|SNV-1179-G-T|1 -IQGARATLUF ->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 -EGAGPRGAVPARR ->CIRC-ENST00000614167.2-E2-E1|23 -MAVALPR ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 -MNNLVVDISERAGASVPLR ->CIRC-ENST00000614167.2-E2-E1|13 -KPLVVDISER ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 -MNNLVVDISERAGASVPLRR ->CIRC-ENST00000614167.2-E2-E1|19 -AVALPRDLRQDANLAK ->CIRC-ENST00000642151.1-E1-E2|2 -RSTSGPSK ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 -HETLWLLTFPRERER ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9 -EATEKARHETLWLLTFPR ->CIRC-ENST00000614167.2-E2-E1|12 -MKPLVVDISERAGASVPLR +>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 +NTLSGMQKFMGEDUNFHER +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1 +LYLCGVTGSPTENCAK +>ENST00000614167.2|SNV-1179-G-T|2 +RIQGARATLUF +>CIRC-ENST00000614167.2-E2-E1|14 +MKPLVVDISER +>ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2 +GAVPARR >ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1 TKCDQPCATGFYGEGCSHRCPPCR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 -LPPCDEFVGARR ->CIRC-ENST00000614167.2-E2-E1|8 -ARHETFGC ->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5 -GQQPCTVAEGRCLTCEPGWNRTK ->CIRC-ENST00000614167.2-E2-E1|15 -EATEKARHETFGC ->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4 -CLTCEPGWNR ->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 -CLTCEPGWNRTK +>ENST00000614167.2|SNV-1179-G-T|1 +IQGARATLUF +>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 +EATEKARHE +>CIRC-ENST00000614167.2-E2-E1|1 +HWEPDGKLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 +ERERLYLCGVTGSPTENCAK >ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3 IIGGDTEAWDVQVHGQK ->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1 -ARHETLCC ->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3 -NNLVVDISERAGASVPLR +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3 +AAGAVFLALSAQLLQARLMKEEAAA +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9 +EATEKARHETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6 +MNNLVVDISER +>CIRC-ENST00000614167.2-E2-E1|22 +MAVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2 +MNNLVVDISERAGASVPLRR >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7 ARHETLWLLTFPRER >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6 LPPCDEFVGARRSK ->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1 -EATEKARHE ->CIRC-ENST00000614167.2-E2-E1|21 -AVALPRDLR ->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3 -ERERLYLCGVTGSPTENCAK ->CIRC-ENST00000642151.1-E1-E2|3 -PFRRSTSGPSK ->CIRC-ENST00000614167.2-E2-E1|18 -MFKFMTR ->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4 -MEGAGPRGAVPARR +>CIRC-ENST00000642151.1-E1-E2|2 +RSTSGPSK >CIRC-ENST00000614167.2-E2-E1|10 MKPLVVDISERAGASVPLRR ->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 -WRRLPPCDEFVGAR >CIRC-ENST00000614167.2-E2-E1|2 HWEPDGK ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1 -MKEEAAA ->ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5 -NRIIGGDTEAWDVQVHGQK ->ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2 -IIGGDTEAWDVQVHGQKIK +>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6 +VFNARNRIIGGDTEAWDVQVHGQK +>CIRC-ENST00000614167.2-E2-E1|19 +AVALPRDLRQDANLAK +>CIRC-ENST00000614167.2-E2-E1|15 +EATEKARHETFGC >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9 RLPPCDEFVGARR +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 +MFKFMARR +>ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5 +NRIIGGDTEAWDVQVHGQK +>CIRC-ENST00000614167.2-E2-E1|9 +KPLVVDISERAGASVPLRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12 +LESTTRKVHR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4 +MNNLVVDISERAGASVPLR +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4 +HETLWLLTFPRERER +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5 +NNLVVDISER +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3 +SKHTVVAYKDAI +>CIRC-ENST00000614167.2-E2-E1|20 +MAVALPRDLRQDANLAK +>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3 +CLTCEPGWNRTK +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6 +HETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|6 +AGASVPLRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5 +RSKHTVVAYK +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2 +LMKEEAAA +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7 +LPPCDEFVGARR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8 +LPPCDEFVGAR +>CIRC-ENST00000614167.2-E2-E1|18 +MFKFMTR +>CIRC-ENST00000614167.2-E2-E1|21 +AVALPRDLR +>CIRC-ENST00000614167.2-E2-E1|16 +FKFMTRR +>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 +EGAGPRGAVPAR +>CIRC-ENST00000614167.2-E2-E1|12 +MKPLVVDISERAGASVPLR >ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6 GQQPCTVAEGRCLTCEPGWNR ->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 -EATEKARHETLCC ->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3 -AAGAVFLALSAQLLQARLMKEEAAA ->ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2 -GAVPARR +>CIRC-ENST00000642151.1-E1-E2|3 +PFRRSTSGPSK +>CIRC-ENST00000614167.2-E2-E1|13 +KPLVVDISER +>CIRC-ENST00000614167.2-E2-E1|23 +MAVALPR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4 +SKHTVVAYK +>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3 +MFKFMAR +>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1 +NNLVVDISERAGASVPLRR >ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4 NRIIGGDTEAWDVQVHGQKIK ->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1 -IIGGDTEAWDVQVHGQKIKEATEK ->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5 -EGAGPRGAVPAR ->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2 -MFKFMARR +>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3 +EGAGPRGAVPARR +>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1 +MKEEAAA +>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2 +TKCDQPCATGFYGEGCSHR +>CIRC-ENST00000614167.2-E2-E1|8 +ARHETFGC +>CIRC-ENST00000642151.1-E1-E2|4 +MPFRRSTSGPSK +>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2 +EATEKARHETLCC +>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8 +ARHETLWLLTFPR +>CIRC-ENST00000614167.2-E2-E1|3 +RHWEPDGKLR +>CIRC-ENST00000642151.1-E1-E2|1 +STSGPSK +>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 +NTLSGMQKFMGEDUNFHERK +>CIRC-ENST00000614167.2-E2-E1|17 +MFKFMTRR +>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11 +WRRLPPCDEFVGAR diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index dc527eec..3523e39f 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -32,6 +32,7 @@ def test_generate_index_case1(self): args.genome_fasta = self.data_dir / 'genome.fasta' args.annotation_gtf = self.data_dir / 'annotation.gtf' args.proteome_fasta = self.data_dir / 'translate.fasta' + args.gtf_symlink = False args.reference_source = None args.invalid_protein_as_noncoding = False args.cleavage_rule = 'trypsin' @@ -45,7 +46,7 @@ def test_generate_index_case1(self): cli.generate_index(args) files = {str(file.name) for file in args.output_dir.glob('*')} expected = {'genome.pkl', 'proteome.pkl', - 'annotation.gtf.lz', 'annotation_gene.idx', 'annotation_tx.idx', + 'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx', 'canonical_peptides.pkl', 'coding_transcripts.pkl'} self.assertEqual(files, expected) From 19c682e01d7235649e98d02506c581c72d28a7c9 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 28 Jun 2023 22:26:10 -0700 Subject: [PATCH 11/18] style (moPepGen): fix pylint, added docstrings --- moPepGen/cli/common.py | 1 - moPepGen/cli/generate_index.py | 1 - moPepGen/gtf/GTFPointer.py | 80 ++++++++++++++++--------- moPepGen/gtf/GenomicAnnotationOnDisk.py | 8 ++- moPepGen/gtf/GtfIO.py | 2 +- 5 files changed, 57 insertions(+), 35 deletions(-) diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py index 34bbf18b..afdbb51d 100644 --- a/moPepGen/cli/common.py +++ b/moPepGen/cli/common.py @@ -1,7 +1,6 @@ """ Common functions for cli """ from __future__ import annotations import argparse -import lzma import os import sys from typing import Tuple, Set, List diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index 77dc32d1..4d2f5026 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -14,7 +14,6 @@ from pathlib import Path import pickle from moPepGen import dna, aa, gtf, logger -from moPepGen.gtf import GtfIO from moPepGen.cli import common from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py index bcc7b04e..29ab895a 100644 --- a/moPepGen/gtf/GTFPointer.py +++ b/moPepGen/gtf/GTFPointer.py @@ -1,23 +1,21 @@ -""" """ +""" GTFPointer and GTFPointerDict """ from __future__ import annotations -from typing import Dict, Set, Iterable, Mapping, Union, IO, TYPE_CHECKING +from typing import Dict, Set, Iterable, Mapping, Union, IO from collections import deque from moPepGen.gtf import GtfIO from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel -from moPepGen.gtf.TranscriptAnnotationModel import TranscriptAnnotationModel from moPepGen.gtf.TranscriptAnnotationModel import ( TranscriptAnnotationModel, GTF_FEATURE_TYPES ) -if TYPE_CHECKING: - from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature GENE_DICT_CACHE_SIZE = 10 TX_DICT_CACHE_SIZE = 10 class GTFPointer(): - """ """ + """ GTFPointer. Represents the range of the GTF file of a particular + genome annotation entity (gene or transcript). """ def __init__(self, handle:IO, key:str, start:int, end:int, source:str): """ """ self.handle = handle @@ -31,14 +29,15 @@ def __len__(self) -> int: return self.end - self.start class GenePointer(GTFPointer): - """ """ + """ Pointer of a GTF file for a gene. """ def __init__(self, handle: IO, key: str, start: int, end: int, source: str, transcripts:Set[str]=None): + """ Constructor """ super().__init__(handle, key, start, end, source) self.transcripts = transcripts or set() def load(self) -> GeneAnnotationModel: - """ """ + """ Load gene annotation model from handle. """ cur = self.handle.tell() offset = self.start - cur self.handle.seek(offset, 1) @@ -53,7 +52,7 @@ def load(self) -> GeneAnnotationModel: return record def to_line(self) -> str: - """ """ + """ Convert to line """ fields = [ self.key, str(self.start), @@ -63,14 +62,15 @@ def to_line(self) -> str: return '\t'.join(fields) class TranscriptPointer(GTFPointer): - """ """ + """ Pointer of a GTF file for a trnascript. """ def __init__(self, handle: IO, key: str, start: int, end: int, source: str, is_protein_coding:bool=None): + """ TranscriptPointer """ super().__init__(handle, key, start, end, source) self.is_protein_coding = is_protein_coding def load(self) -> TranscriptAnnotationModel: - """ """ + """ Load pointer and return a transcript annotation model. """ cur = self.handle.tell() offset = self.start - cur self.handle.seek(offset, 1) @@ -92,7 +92,7 @@ def load(self) -> TranscriptAnnotationModel: return tx_model def to_line(self) -> str: - """ """ + """ Convert to string """ fields = [ self.key, str(self.start), @@ -102,7 +102,7 @@ def to_line(self) -> str: return "\t".join(fields) def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: - """ """ + """ Iterate over a GTF file and yield pointers. """ if not source: count = 0 inferred = {} @@ -145,7 +145,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T if cur_tx_pointer: yield cur_tx_pointer cur_gene_id = record.gene_id - cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source) + cur_gene_pointer = GenePointer( + handle, cur_gene_id, + line_start, line_end, record.source + ) cur_tx_id = None cur_tx_pointer = None else: @@ -153,7 +156,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T if cur_tx_pointer: yield cur_tx_pointer cur_tx_id = record.transcript_id - cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source) + cur_tx_pointer = TranscriptPointer( + handle, cur_tx_id, + line_start, line_end, record.source + ) else: cur_tx_pointer.end = line_end if cur_gene_pointer: @@ -164,9 +170,13 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T yield cur_tx_pointer class GTFPointerDict(dict, Mapping[str, GTFPointer]): - """ """ + """ GTFPointerDict. This defines a special dict class that keys are str + (gene or transcript ID) and values are corresponding GTFPointer. The getter + magic function is customized so it returns a loaded GeneAnnotationModel + or TranscriptAnnotationModel instead of the pointer. + """ def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): - """ """ + """ Constructor """ for val in kwargs.values(): self.validate(val, GTFPointer) if args: @@ -177,30 +187,36 @@ def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]): super().__init__(*args, **kwargs) def __iter__(self) -> Iterable[str]: - """""" + """ Iterator """ for k in self.keys(): yield k - def validate(self, val, type): - """ """ - if not type is GTFPointer or not issubclass(type, GTFPointer): - raise TypeError(f"type = {type} is not supported") - if not isinstance(val, type): + def validate(self, val, __type): + """ Ensures the values are GTFPointers """ + if not __type is GTFPointer or not issubclass(__type, GTFPointer): + raise TypeError(f"type = {__type} is not supported") + if not isinstance(val, __type): raise TypeError( - f"{self.__class__} only accecpts {type.__class__} objects" + f"{self.__class__} only accecpts {__type.__class__} objects" ) def __setitem__(self, __key: str, __value: GTFPointer) -> None: + """ Setter """ self.validate(__value, GTFPointer) return super().__setitem__(__key, __value) def get_pointer(self, __key: str) -> GTFPointer: - """ """ + """ Get pointer by key without loading the annotation entity. """ return super().__getitem__(__key) class GenePointerDict(GTFPointerDict): - """ """ + """ GenePointerDict. This defines a dict class that keys are gene IDs and + values are the corresponding gene models. The getter magic function is + customized so it returns a loaded GeneAnnotationModel object instead of the + pointer. + """ def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]): + """ Constructor """ for val in kwargs.values(): self.validate(val, GenePointer) if args: @@ -213,6 +229,7 @@ def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointe self._cached_keys = deque() def __getitem__(self, __key: str) -> GeneAnnotationModel: + """ Getter """ if __key in self._cache: return self._cache[__key] self._cached_keys.appendleft(__key) @@ -225,8 +242,13 @@ def __getitem__(self, __key: str) -> GeneAnnotationModel: return val class TranscriptPointerDict(GTFPointerDict): - """ """ + """ TranscriptPointerDict. This defines a dict class that keys are transcript + IDs and values are the corresponding transcript models. The getter magic + function is customized so it returns a loaded TranscriptAnnotationModel + object instead of the pointer. + """ def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]): + """ Constructor """ for val in kwargs.values(): self.validate(val, TranscriptPointer) if args: @@ -238,8 +260,8 @@ def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, Tran self._cache = {} self._cached_keys = deque() - def __getitem__(self, __key: str - ) -> TranscriptAnnotationModel: + def __getitem__(self, __key: str) -> TranscriptAnnotationModel: + """ getter """ if __key in self._cache: return self._cache[__key] self._cached_keys.appendleft(__key) diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 0ddfa834..d2d05c3b 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -20,7 +20,8 @@ class GenomicAnnotationOnDisk(GenomicAnnotation): def __init__(self, genes:GenePointerDict=None, transcripts:TranscriptPointerDict=None, source:str=None): - """ Constructur """ + """ Constructor """ + # pylint: disable=W0231 self.genes = genes or GenePointerDict() self.transcripts = transcripts or TranscriptPointerDict() self.source = source @@ -55,6 +56,7 @@ def infer_source(self): def init_handle(self, handle:Union[str, IO, Path]): """ Initiate file handle """ + # pylint: disable=R1732 if isinstance(handle, str): handle = Path(handle) @@ -77,7 +79,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]: """ Get index file paths given the GTF file path. """ if isinstance(file, str): file = Path(file) - basename = str(file.name).split('.gtf')[0] + basename = str(file.name).split('.gtf', maxsplit=1)[0] gene_idx_file = file.parent/f"{basename}_gene.idx" tx_idx_file = file.parent/f"{basename}_tx.idx" @@ -139,7 +141,7 @@ def load_index(self, file:Union[str,Path]): if not tx_source: raise ValueError("Cannot find source from gene idx.") if tx_source != gene_source: - raise ValueError('tx_source does not match with gene_source') + raise ValueError('tx_source does not match with gene_source') for line in handle: if line.startswith('#'): diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py index b59ea88c..41b4810f 100644 --- a/moPepGen/gtf/GtfIO.py +++ b/moPepGen/gtf/GtfIO.py @@ -30,7 +30,7 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]: yield record def line_to_seq_feature(line:str) -> GTFSeqFeature: - """ """ + """ Conver line to SeqFeature """ line = line.rstrip() fields = line.split('\t') From 80de41ce666f64fd25089691b24ecc44c61c75a2 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Wed, 28 Jun 2023 22:40:29 -0700 Subject: [PATCH 12/18] doc (CHANGELOG): changelog added --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6dc26db9..0c245140 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [1.1.0] - 2023-06-28 + +### Fixed + +- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371 + ## [1.0.0] - 2023-06-15 ### Added From f1554dd65976c74db704c1d0d3b16841514d8a39 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Thu, 29 Jun 2023 09:08:14 -0700 Subject: [PATCH 13/18] style (test)): docstrings added --- test/integration/test_generate_index.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 3523e39f..f2c5ecd3 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -51,15 +51,15 @@ def test_generate_index_case1(self): self.assertEqual(files, expected) class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration): - """ """ + """ Test case for GenomicAnnotationOnDisk """ def test_generate_index(self): - """ """ + """ Test generate index """ proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(self.data_dir/'translate.fasta') anno = GenomicAnnotationOnDisk() cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome) def test_load_index(self): - """ """ + """ test load index """ anno = GenomicAnnotationOnDisk() anno.load_index(self.data_dir/'annotation.gtf') From 8ea2e51c25e5de58f1064d62558683df0d1cc8e9 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Thu, 29 Jun 2023 09:20:42 -0700 Subject: [PATCH 14/18] fix (GenomicAnnotationOnDisk): gtf file handle should be uncompressed GTF file only --- moPepGen/gtf/GenomicAnnotationOnDisk.py | 9 +++------ test/integration/test_generate_index.py | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index d2d05c3b..5c32ed2a 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -61,12 +61,9 @@ def init_handle(self, handle:Union[str, IO, Path]): handle = Path(handle) if isinstance(handle, Path): - if handle.suffix.lower() == '.gtf': - ihandle = open(handle, 'rb') - elif handle.suffix.lower() == '.lz': - ihandle = lzma.open(handle, 'rb') - elif handle.suffix.lower() == '.gz': - ihandle = gzip.open(handle, 'rb') + if handle.suffix.lower() != '.gtf': + raise ValueError(f"Unsupported file formate of {handle}") + ihandle = open(handle, 'rb') elif isinstance(handle, io.IOBase): ihandle = handle else: diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index f2c5ecd3..2604d52a 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -56,9 +56,9 @@ def test_generate_index(self): """ Test generate index """ proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(self.data_dir/'translate.fasta') - anno = GenomicAnnotationOnDisk() cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome) + def test_load_index(self): """ test load index """ anno = GenomicAnnotationOnDisk() From 60822b4a7265210cc1366a0ff844f41cdf7bfa2c Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Thu, 29 Jun 2023 09:29:59 -0700 Subject: [PATCH 15/18] fix (test): asserts added to GenomicAnnotationOnDisk --- test/files/annotation_gene.idx | 8 ++++++++ test/files/annotation_tx.idx | 9 +++++++++ test/integration/test_generate_index.py | 23 +++++++++++++++++++---- 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 test/files/annotation_gene.idx create mode 100644 test/files/annotation_tx.idx diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx new file mode 100644 index 00000000..f57b6263 --- /dev/null +++ b/test/files/annotation_gene.idx @@ -0,0 +1,8 @@ +##source=None +##python=3.8.11 +##moPepGen=1.1.0 +##biopython=1.79 +ENSG00000128408.9 0 174 ENST00000614168.2,ENST00000614167.2 +ENSG00000244486.9 12491 12694 ENST00000622235.5 +ENSG00000099949.21 24586 24955 ENST00000642151.1 +ENSG00000279973.2 27280 27432 ENST00000624155.2 diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx new file mode 100644 index 00000000..30cfbdde --- /dev/null +++ b/test/files/annotation_tx.idx @@ -0,0 +1,9 @@ +##source=None +##python=3.8.11 +##moPepGen=1.1.0 +##biopython=1.79 +ENST00000614167.2 174 7260 True +ENST00000614168.2 7260 12491 True +ENST00000622235.5 12694 24586 True +ENST00000642151.1 24955 27280 True +ENST00000624155.2 27432 28522 False diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py index 2604d52a..845f035e 100644 --- a/test/integration/test_generate_index.py +++ b/test/integration/test_generate_index.py @@ -1,10 +1,11 @@ """ Test the command line interface """ import argparse import subprocess as sp +import os import sys from test.integration import TestCaseIntegration from moPepGen import cli, aa -from moPepGen.gtf import GenomicAnnotationOnDisk +from moPepGen.gtf import GenomicAnnotation, GenomicAnnotationOnDisk class TestGenerateIndex(TestCaseIntegration): @@ -56,10 +57,24 @@ def test_generate_index(self): """ Test generate index """ proteome = aa.AminoAcidSeqDict() proteome.dump_fasta(self.data_dir/'translate.fasta') - cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome) + gtf_file = self.data_dir/'annotation.gtf' + gtf_file2 = self.work_dir/'annotation.gtf' + os.symlink(gtf_file, gtf_file2) + cli.index_gtf(gtf_file2, proteome=proteome) + expect = { + 'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx' + } + received = {str(file.name) for file in self.work_dir.glob('*')} + self.assertEqual(received, expect) def test_load_index(self): """ test load index """ - anno = GenomicAnnotationOnDisk() - anno.load_index(self.data_dir/'annotation.gtf') + anno1 = GenomicAnnotation() + anno1.dump_gtf(self.data_dir/'annotation.gtf') + + anno2 = GenomicAnnotationOnDisk() + anno2.load_index(self.data_dir/'annotation.gtf') + + self.assertEqual(anno1.genes.keys(), anno2.genes.keys()) + self.assertEqual(anno1.transcripts.keys(), anno2.transcripts.keys()) From 9a92ecfd458b960ae6bda02d7d77fec8694c8881 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Thu, 29 Jun 2023 09:49:31 -0700 Subject: [PATCH 16/18] fix (test): versions of gtf idx files dont match with the github action --- test/files/annotation_gene.idx | 4 ++-- test/files/annotation_tx.idx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx index f57b6263..76011e1e 100644 --- a/test/files/annotation_gene.idx +++ b/test/files/annotation_gene.idx @@ -1,7 +1,7 @@ ##source=None -##python=3.8.11 +##python=3.8.17 ##moPepGen=1.1.0 -##biopython=1.79 +##biopython=1.81 ENSG00000128408.9 0 174 ENST00000614168.2,ENST00000614167.2 ENSG00000244486.9 12491 12694 ENST00000622235.5 ENSG00000099949.21 24586 24955 ENST00000642151.1 diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx index 30cfbdde..81ead4f8 100644 --- a/test/files/annotation_tx.idx +++ b/test/files/annotation_tx.idx @@ -1,7 +1,7 @@ ##source=None -##python=3.8.11 +##python=3.8.17 ##moPepGen=1.1.0 -##biopython=1.79 +##biopython=1.81 ENST00000614167.2 174 7260 True ENST00000614168.2 7260 12491 True ENST00000622235.5 12694 24586 True From 8685d086b764c32275487492bbfc3ddd3643a249 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Thu, 29 Jun 2023 09:51:34 -0700 Subject: [PATCH 17/18] fix (GenomicAnnotationOnDisk): remove unused import --- moPepGen/gtf/GenomicAnnotationOnDisk.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py index 5c32ed2a..f667891b 100644 --- a/moPepGen/gtf/GenomicAnnotationOnDisk.py +++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py @@ -2,8 +2,6 @@ from __future__ import annotations from typing import Dict, IO, Union, Tuple import io -import gzip -import lzma from pathlib import Path from moPepGen import err from moPepGen.version import MetaVersion From 2945a191c3ec7b3dd9ad5c963334974bec0a25e2 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Thu, 29 Jun 2023 13:05:36 -0700 Subject: [PATCH 18/18] fix (gtf): created GTFSourceInferrer class --- moPepGen/cli/generate_index.py | 3 +++ moPepGen/gtf/GTFPointer.py | 17 +++-------------- moPepGen/gtf/GTFSourceInferrer.py | 31 +++++++++++++++++++++++++++++++ moPepGen/gtf/GenomicAnnotation.py | 26 ++++++++------------------ 4 files changed, 45 insertions(+), 32 deletions(-) create mode 100644 moPepGen/gtf/GTFSourceInferrer.py diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py index 4d2f5026..6f8ee0a6 100644 --- a/moPepGen/cli/generate_index.py +++ b/moPepGen/cli/generate_index.py @@ -83,6 +83,9 @@ def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path: if file.suffix.lower() == '.gz': if symlink: symlink = False + logger( + "--gtf-symlink was suppressed because compressed GTF file was received. " + ) elif file.suffix.lower() != '.gtf': raise ValueError(f"Cannot handle gtf file {file}") diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py index 29ab895a..502a102a 100644 --- a/moPepGen/gtf/GTFPointer.py +++ b/moPepGen/gtf/GTFPointer.py @@ -8,6 +8,7 @@ TranscriptAnnotationModel, GTF_FEATURE_TYPES ) +from moPepGen.gtf.GTFSourceInferrer import GTFSourceInferrer GENE_DICT_CACHE_SIZE = 10 @@ -104,8 +105,7 @@ def to_line(self) -> str: def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]: """ Iterate over a GTF file and yield pointers. """ if not source: - count = 0 - inferred = {} + inferrer = GTFSourceInferrer() cur_gene_id:str = None cur_tx_id:str = None @@ -124,18 +124,7 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T record = GtfIO.line_to_seq_feature(line) if not source: - if count > 100: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - record.source = source - else: - count += 1 - record.infer_annotation_source() - inferred_source = record.source - if inferred_source not in inferred: - inferred[inferred_source] = 1 - else: - inferred[inferred_source] += 1 + record.source = inferrer.infer(record) else: record.source = source diff --git a/moPepGen/gtf/GTFSourceInferrer.py b/moPepGen/gtf/GTFSourceInferrer.py new file mode 100644 index 00000000..4c3c1aa3 --- /dev/null +++ b/moPepGen/gtf/GTFSourceInferrer.py @@ -0,0 +1,31 @@ +""" Infer GTF source (e.g. GENCODE/ENSEMBL) """ +from __future__ import annotations +from typing import Dict, TYPE_CHECKING + + +if TYPE_CHECKING: + from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature + +class GTFSourceInferrer(): + """ Infer GTF source (e.g. GENOCDE/ENSEMBL) """ + def __init__(self): + """ Constructor """ + self.max_iter = 100 + self.data:Dict[str,int] = {} + self.count = 0 + self.source:str = None + + def infer(self, record:GTFSeqFeature) -> str: + """ Infer the source of a GTF record """ + if self.count > self.max_iter: + if not self.source: + self.source = sorted(self.data.items(), key=lambda x:x[1])[-1][0] + return self.source + self.count += 1 + record.infer_annotation_source() + source = record.source + if source not in self.data: + self.data[source] = 1 + else: + self.data[source] += 1 + return source diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py index bf2142ae..adbde542 100644 --- a/moPepGen/gtf/GenomicAnnotation.py +++ b/moPepGen/gtf/GenomicAnnotation.py @@ -10,6 +10,7 @@ from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES from .GeneAnnotationModel import GeneAnnotationModel from .GTFSeqFeature import GTFSeqFeature +from .GTFSourceInferrer import GTFSourceInferrer if TYPE_CHECKING: @@ -113,25 +114,14 @@ def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=Non """ record:GTFSeqFeature if not source: - count = 0 - inferred = {} + inferrer = GTFSourceInferrer() + for record in GtfIO.parse(handle): if biotype is not None and record.biotype not in biotype: continue if not source: - if count > 100: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - record.source = source - else: - count += 1 - record.infer_annotation_source() - inferred_source = record.source - if inferred_source not in inferred: - inferred[inferred_source] = 1 - else: - inferred[inferred_source] += 1 + record.source = inferrer.infer(record) else: record.source = source @@ -142,11 +132,11 @@ def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=Non self.add_transcript_record(record) - if not source: - inferred = sorted(inferred.items(), key=lambda x: x[1]) - source = inferred[-1][0] - self.source = source + if not source: + source = inferrer.source + else: + self.source = source for transcript_model in self.transcripts.values(): transcript_model.sort_records()