From 8d24b628fc94e6a140ccf8df8b433e15c451f537 Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Fri, 23 Jun 2023 18:36:41 -0700
Subject: [PATCH 01/18] fix (GenomicAnnotationOnDisk): class created to keep
 genomic annotation on disk

---
 moPepGen/gtf/GenomicAnnotation.py       |  14 +-
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 270 ++++++++++++++++++++++++
 moPepGen/gtf/GtfIO.py                   |  86 ++++----
 3 files changed, 321 insertions(+), 49 deletions(-)
 create mode 100644 moPepGen/gtf/GenomicAnnotationOnDisk.py

diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py
index 948dfbf6..bf2142ae 100644
--- a/moPepGen/gtf/GenomicAnnotation.py
+++ b/moPepGen/gtf/GenomicAnnotation.py
@@ -2,7 +2,7 @@
 """
 from __future__ import annotations
 import copy
-from typing import List, Dict, Tuple, TYPE_CHECKING
+from typing import List, Dict, Tuple, TYPE_CHECKING, IO, Union
 from moPepGen.SeqFeature import FeatureLocation, SeqFeature
 from moPepGen import seqvar, err
 from moPepGen.version import MetaVersion
@@ -31,12 +31,8 @@ def __init__(self, genes:Dict[str,GeneAnnotationModel]=None,
             transcripts:Dict[str, TranscriptAnnotationModel]=None,
             source:str=None):
         """ Construct a GenomicAnnotation object """
-        if genes is None:
-            genes = {}
-        if transcripts is None:
-            transcripts = {}
-        self.genes = genes
-        self.transcripts = transcripts
+        self.genes = genes or {}
+        self.transcripts = transcripts or {}
         self.source = source
         self.gene_id_version_mapper:Dict[str, str] = None
         self.version = MetaVersion()
@@ -106,7 +102,7 @@ def remove_cached_tx_seq(self):
         for tx_id in self._cached_tx_seqs:
             self.transcripts[tx_id].remove_cached_seq()
 
-    def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None:
+    def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=None)->None:
         """ Dump a GTF file into a GenomicAnnotation
 
         Args:
@@ -119,7 +115,7 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None:
         if not source:
             count = 0
             inferred = {}
-        for record in GtfIO.parse(path):
+        for record in GtfIO.parse(handle):
             if biotype is not None and record.biotype not in biotype:
                 continue
 
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
new file mode 100644
index 00000000..06128e11
--- /dev/null
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -0,0 +1,270 @@
+""" Module for GenomicAnnotationOnDisk """
+from __future__ import annotations
+from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING
+import io
+from collections import deque
+from pathlib import Path
+from moPepGen.gtf import GtfIO
+from moPepGen.version import MetaVersion
+from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation
+from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel
+from moPepGen.gtf.TranscriptAnnotationModel import (
+    TranscriptAnnotationModel,
+    GTF_FEATURE_TYPES
+)
+
+
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
+
+GENE_DICT_CACHE_SIZE = 10
+TX_DICT_CACHE_SIZE = 10
+
+class GTFPointer():
+    """ """
+    def __init__(self, handle:IO, key:str, start:int, end:int, source:str):
+        """ """
+        self.handle = handle
+        self.key = key
+        self.start = start
+        self.end = end
+        self.source = source
+
+    def __len__(self) -> int:
+        """ length """
+        return self.end - self.start
+
+class GenePointer(GTFPointer):
+    """ """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            transcripts:List[str]=None):
+        super().__init__(handle, key, start, end, source)
+        self.transcripts = transcripts or []
+
+    def load(self) -> GeneAnnotationModel:
+        """ """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').split('\n')
+        if len(lines) > 0:
+            raise ValueError(f"Multiple lines found for gene {self.key}")
+        record = GtfIO.line_to_seq_feature(lines[0])
+        record.__class__ = GeneAnnotationModel
+        record.exons = []
+        record.transcripts = self.transcripts
+        return record
+
+class TranscriptPointer(GTFPointer):
+    """ """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            is_protein_coding:bool=None):
+        super().__init__(handle, key, start, end, source)
+        self.is_protein_coding = is_protein_coding
+
+    def load(self) -> TranscriptAnnotationModel:
+        """ """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').rstrip().split('\n')
+        tx_model = TranscriptAnnotationModel()
+
+        for line in lines:
+            record = GtfIO.line_to_seq_feature(line)
+            feature = record.type.lower()
+            if feature not in GTF_FEATURE_TYPES:
+                continue
+            tx_id = record.transcript_id
+            if tx_id != self.key:
+                raise ValueError("Loaded transcript do not match.")
+            record.id = tx_id
+            tx_model.add_record(feature, record)
+        return tx_model
+
+def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
+    """ """
+    if not source:
+        count = 0
+        inferred = {}
+
+    cur_gene_id:str = None
+    cur_tx_id:str = None
+    cur_gene_pointer = None
+    cur_tx_pointer = None
+    line_end = 0
+    for line in handle:
+        line:bytes
+        line_start = line_end
+        line_end += len(line)
+        line = line.decode('utf-8')
+
+        if line.startswith('#'):
+            continue
+
+        record = GtfIO.line_to_seq_feature(line)
+
+        if not source:
+            if count > 100:
+                inferred = sorted(inferred.items(), key=lambda x: x[1])
+                source = inferred[-1][0]
+                record.source = source
+            else:
+                count += 1
+                record.infer_annotation_source()
+                inferred_source = record.source
+                if inferred_source not in inferred:
+                    inferred[inferred_source] = 1
+                else:
+                    inferred[inferred_source] += 1
+        else:
+            record.source = source
+
+        if record.type.lower() == 'gene':
+            if cur_gene_pointer:
+                yield cur_gene_pointer
+            if cur_tx_pointer:
+                yield cur_tx_pointer
+            cur_gene_id = record.gene_id
+            cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source)
+            cur_tx_id = None
+            cur_tx_pointer = None
+        else:
+            if cur_tx_id != record.transcript_id:
+                if cur_tx_pointer:
+                    yield cur_tx_pointer
+                cur_tx_id = record.transcript_id
+                cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source)
+            else:
+                cur_tx_pointer.end = line_end
+            if cur_gene_pointer:
+                cur_gene_pointer.transcripts.append(cur_tx_id)
+
+class GTFPointerDict(dict, Mapping[str, GTFPointer]):
+    """ """
+    def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
+        """ """
+        for val in kwargs.values():
+            self.validate(val, GTFPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GTFPointer)
+        super().__init__(*args, **kwargs)
+
+    def validate(self, val, type):
+        """ """
+        if not type is GTFPointer or not issubclass(type, GTFPointer):
+            raise TypeError(f"type = {type} is not supported")
+        if not isinstance(val, type):
+            raise TypeError(
+                f"{self.__class__} only accecpts {type.__class__} objects"
+            )
+
+    def __setitem__(self, __key: str, __value: GTFPointer) -> None:
+        self.validate(__value, GTFPointer)
+        return super().__setitem__(__key, __value)
+
+    def get_pointer(self, __key: str) -> GTFPointer:
+        """ """
+        return super().__getitem__(__key)
+
+class GenePointerDict(GTFPointerDict):
+    """ """
+    def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]):
+        for val in kwargs.values():
+            self.validate(val, GenePointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GenePointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str) -> GeneAnnotationModel:
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > GENE_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:GenePointer = self.get_pointer(__key)
+        return pointer.load()
+
+class TranscriptPointerDict(GTFPointerDict):
+    """ """
+    def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]):
+        for val in kwargs.values():
+            self.validate(val, TranscriptPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, TranscriptPointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str
+            ) -> TranscriptAnnotationModel:
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > TX_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:TranscriptPointer = self.get_pointer(__key)
+        tx_model = pointer.load()
+        tx_model.is_protein_coding = pointer.is_protein_coding is True
+        return tx_model
+
+class GenomicAnnotationOnDisk(GenomicAnnotation):
+    """ """
+    def __init__(self, genes:GenePointerDict=None,
+            transcripts:TranscriptPointerDict=None,
+            source:str=None):
+        """ """
+        self.genes = genes or GenePointerDict()
+        self.transcripts = transcripts or TranscriptPointerDict()
+        self.source = source
+        self.gene_id_version_mapper:Dict[str,str] = None
+        self.version = MetaVersion()
+        self._cached_tx_seqs = []
+        self.handle = None
+
+    def __del__(self):
+        """ """
+        if self.handle:
+            self.handle.close()
+
+    def dump_gtf(self, *args, **kwargs) -> None:
+        raise NotImplementedError
+
+    def generate_index(self, handle:Union[IO, str], source:str=None):
+        """ """
+        if isinstance(handle, str):
+            ihandle = open(handle, 'rb')
+        elif isinstance(handle, io.IOBase):
+            ihandle = handle
+        else:
+            raise ValueError(f"handle {handle} of type {handle.__class__} is not supported.")
+
+        self.handle = ihandle
+
+        for pointer in iterate_pointer(self.handle):
+            if isinstance(pointer, GenePointer):
+                self.genes[pointer.key] = pointer
+            else:
+                self.transcripts[pointer.key] = pointer
+
+    def load_index(self, handle:IO):
+        """ """
+        pass
+
+def index_gtf(handle:Union[str, IO], biotype:List[str]=None, source:str=None):
+    """"""
diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py
index 45978103..b59ea88c 100644
--- a/moPepGen/gtf/GtfIO.py
+++ b/moPepGen/gtf/GtfIO.py
@@ -25,48 +25,54 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]:
             if line.startswith('#'):
                 continue
             line = line.rstrip()
-            fields = line.split('\t')
-
-            try:
-                strand={'+':1, '-':-1, '?':0}[fields[6]]
-            except KeyError:
-                strand=None
-
-            attributes = {}
-            attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id',
-                'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding']
-            attribute_list = [field.strip().split(' ', 1) for field in \
-                fields[8].rstrip(';').split(';')]
-
-            for key,val in attribute_list:
-                if key not in attributes_to_keep:
-                    continue
-                val = val.strip('"')
-                if key == 'tag':
-                    if key not in attributes:
-                        attributes[key] = []
-                    attributes[key].append(val)
-                else:
-                    attributes[key] = val
-
-            location = FeatureLocation(
-                seqname=fields[0],
-                start=int(fields[3])-1,
-                end=int(fields[4]),
-                strand=strand,
-            )
-
-            frame = None if fields[7] == '.' else int(fields[7])
-
-            record = GTFSeqFeature(
-                chrom=fields[0],
-                attributes=attributes,
-                location=location,
-                type=fields[2],
-                frame=frame
-            )
+            record = line_to_seq_feature(line)
+
             yield record
 
+def line_to_seq_feature(line:str) -> GTFSeqFeature:
+    """ """
+    line = line.rstrip()
+    fields = line.split('\t')
+
+    try:
+        strand={'+':1, '-':-1, '?':0}[fields[6]]
+    except KeyError:
+        strand=None
+
+    attributes = {}
+    attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id',
+        'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding']
+    attribute_list = [field.strip().split(' ', 1) for field in \
+        fields[8].rstrip(';').split(';')]
+
+    for key,val in attribute_list:
+        if key not in attributes_to_keep:
+            continue
+        val = val.strip('"')
+        if key == 'tag':
+            if key not in attributes:
+                attributes[key] = []
+            attributes[key].append(val)
+        else:
+            attributes[key] = val
+
+    location = FeatureLocation(
+        seqname=fields[0],
+        start=int(fields[3])-1,
+        end=int(fields[4]),
+        strand=strand,
+    )
+
+    frame = None if fields[7] == '.' else int(fields[7])
+
+    return GTFSeqFeature(
+        chrom=fields[0],
+        attributes=attributes,
+        location=location,
+        type=fields[2],
+        frame=frame
+    )
+
 def parse(handle:Union[IO[str], str]) -> GtfIterator:
     """ Parser for GTF files.
     """

From 53ea8ddb2c81b121b0387b062bea9d09055b9dec Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sat, 24 Jun 2023 00:20:31 -0700
Subject: [PATCH 02/18] fix (GenomicAnnotationOnDisk): added index_gtf function

---
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 63 +++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 3 deletions(-)

diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 06128e11..c6d28d09 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING
 import io
-from collections import deque
+from collections import deque, Counter
 from pathlib import Path
 from moPepGen.gtf import GtfIO
 from moPepGen.version import MetaVersion
@@ -56,6 +56,16 @@ def load(self) -> GeneAnnotationModel:
         record.transcripts = self.transcripts
         return record
 
+    def to_line(self) -> str:
+        """ """
+        fields = [
+            self.key,
+            self.start,
+            self.end,
+            ','.join(self.transcripts)
+        ]
+        return '\t'.join(fields)
+
 class TranscriptPointer(GTFPointer):
     """ """
     def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
@@ -84,6 +94,16 @@ def load(self) -> TranscriptAnnotationModel:
             tx_model.add_record(feature, record)
         return tx_model
 
+    def to_line(self) -> str:
+        """ """
+        fields = [
+            self.key,
+            self.start,
+            self.end,
+            self.is_protein_coding
+        ]
+        return "\t".join(fields)
+
 def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
     """ """
     if not source:
@@ -245,6 +265,21 @@ def __del__(self):
     def dump_gtf(self, *args, **kwargs) -> None:
         raise NotImplementedError
 
+    def infer_source(self):
+        """ """
+        i = 0
+        inferred = {}
+        for tx in self.transcripts.keys():
+            i += 1
+            if i > 100:
+                break
+            s = self.transcripts.get_pointer(tx).source
+            if s in inferred:
+                inferred[s] += 1
+            else:
+                inferred[s] = 1
+        self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0]
+
     def generate_index(self, handle:Union[IO, str], source:str=None):
         """ """
         if isinstance(handle, str):
@@ -256,15 +291,37 @@ def generate_index(self, handle:Union[IO, str], source:str=None):
 
         self.handle = ihandle
 
-        for pointer in iterate_pointer(self.handle):
+        for pointer in iterate_pointer(self.handle, source):
             if isinstance(pointer, GenePointer):
                 self.genes[pointer.key] = pointer
             else:
                 self.transcripts[pointer.key] = pointer
 
+        if source:
+            self.source = source
+        else:
+            self.infer_source()
+
     def load_index(self, handle:IO):
         """ """
         pass
 
-def index_gtf(handle:Union[str, IO], biotype:List[str]=None, source:str=None):
+def index_gtf(file:Path, biotype:List[str]=None, source:str=None):
     """"""
+    anno = GenomicAnnotationOnDisk()
+    with open(Path, 'rb') as handle:
+        anno.generate_index(handle, source)
+
+    gene_idx_file = file.parent/f"{file.stem}_gene.idx"
+    with open(gene_idx_file, 'wt') as handle:
+        handle.write(f"# source={anno.source}\n")
+        for gene in anno.genes.keys():
+            gene_pointer:GenePointer = anno.genes.get_pointer(gene)
+            handle.write(gene_pointer.to_line() + '\n')
+
+    tx_idx_file = file.parent/f"{file.stem}_tx.idx"
+    with open(tx_idx_file, 'wt') as handle:
+        handle.write(f"# source={anno.source}\n")
+        for tx in anno.transcripts.keys():
+            tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
+            handle.write(tx_pointer.to_line() + '\n')

From 39b9c6b8f3a026f83366f25417e8be3e2c6baad2 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sat, 24 Jun 2023 21:31:53 -0700
Subject: [PATCH 03/18] fix (GenomicAnnotationOnDisk): added loader function
 from gtf idx

---
 moPepGen/gtf/GTFPointer.py              | 240 +++++++++++++++++
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 343 ++++++------------------
 moPepGen/gtf/__init__.py                |   1 +
 test/integration/test_generate_index.py |  13 +
 4 files changed, 339 insertions(+), 258 deletions(-)
 create mode 100644 moPepGen/gtf/GTFPointer.py

diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py
new file mode 100644
index 00000000..3a7937c5
--- /dev/null
+++ b/moPepGen/gtf/GTFPointer.py
@@ -0,0 +1,240 @@
+""" """
+from __future__ import annotations
+from typing import Dict, Set, Iterable, Mapping, Union, IO, TYPE_CHECKING
+from collections import deque
+from moPepGen.gtf import GtfIO
+from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel
+from moPepGen.gtf.TranscriptAnnotationModel import TranscriptAnnotationModel
+from moPepGen.gtf.TranscriptAnnotationModel import (
+    TranscriptAnnotationModel,
+    GTF_FEATURE_TYPES
+)
+
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
+
+GENE_DICT_CACHE_SIZE = 10
+TX_DICT_CACHE_SIZE = 10
+
+class GTFPointer():
+    """ """
+    def __init__(self, handle:IO, key:str, start:int, end:int, source:str):
+        """ """
+        self.handle = handle
+        self.key = key
+        self.start = start
+        self.end = end
+        self.source = source
+
+    def __len__(self) -> int:
+        """ length """
+        return self.end - self.start
+
+class GenePointer(GTFPointer):
+    """ """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            transcripts:Set[str]=None):
+        super().__init__(handle, key, start, end, source)
+        self.transcripts = transcripts or set()
+
+    def load(self) -> GeneAnnotationModel:
+        """ """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').split('\n')
+        if len(lines) > 0:
+            raise ValueError(f"Multiple lines found for gene {self.key}")
+        record = GtfIO.line_to_seq_feature(lines[0])
+        record.__class__ = GeneAnnotationModel
+        record.exons = []
+        record.transcripts = list(self.transcripts)
+        return record
+
+    def to_line(self) -> str:
+        """ """
+        fields = [
+            self.key,
+            str(self.start),
+            str(self.end),
+            ','.join(self.transcripts)
+        ]
+        return '\t'.join(fields)
+
+class TranscriptPointer(GTFPointer):
+    """ """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            is_protein_coding:bool=None):
+        super().__init__(handle, key, start, end, source)
+        self.is_protein_coding = is_protein_coding
+
+    def load(self) -> TranscriptAnnotationModel:
+        """ """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').rstrip().split('\n')
+        tx_model = TranscriptAnnotationModel()
+
+        for line in lines:
+            record = GtfIO.line_to_seq_feature(line)
+            feature = record.type.lower()
+            if feature not in GTF_FEATURE_TYPES:
+                continue
+            tx_id = record.transcript_id
+            if tx_id != self.key:
+                raise ValueError("Loaded transcript do not match.")
+            record.id = tx_id
+            tx_model.add_record(feature, record)
+        return tx_model
+
+    def to_line(self) -> str:
+        """ """
+        fields = [
+            self.key,
+            str(self.start),
+            str(self.end),
+            str(self.is_protein_coding)
+        ]
+        return "\t".join(fields)
+
+def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
+    """ """
+    if not source:
+        count = 0
+        inferred = {}
+
+    cur_gene_id:str = None
+    cur_tx_id:str = None
+    cur_gene_pointer = None
+    cur_tx_pointer = None
+    line_end = 0
+    for line in handle:
+        line:bytes
+        line_start = line_end
+        line_end += len(line)
+        line = line.decode('utf-8')
+
+        if line.startswith('#'):
+            continue
+
+        record = GtfIO.line_to_seq_feature(line)
+
+        if not source:
+            if count > 100:
+                inferred = sorted(inferred.items(), key=lambda x: x[1])
+                source = inferred[-1][0]
+                record.source = source
+            else:
+                count += 1
+                record.infer_annotation_source()
+                inferred_source = record.source
+                if inferred_source not in inferred:
+                    inferred[inferred_source] = 1
+                else:
+                    inferred[inferred_source] += 1
+        else:
+            record.source = source
+
+        if record.type.lower() == 'gene':
+            if cur_gene_pointer:
+                yield cur_gene_pointer
+            if cur_tx_pointer:
+                yield cur_tx_pointer
+            cur_gene_id = record.gene_id
+            cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source)
+            cur_tx_id = None
+            cur_tx_pointer = None
+        else:
+            if cur_tx_id != record.transcript_id:
+                if cur_tx_pointer:
+                    yield cur_tx_pointer
+                cur_tx_id = record.transcript_id
+                cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source)
+            else:
+                cur_tx_pointer.end = line_end
+            if cur_gene_pointer:
+                cur_gene_pointer.transcripts.add(cur_tx_id)
+
+class GTFPointerDict(dict, Mapping[str, GTFPointer]):
+    """ """
+    def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
+        """ """
+        for val in kwargs.values():
+            self.validate(val, GTFPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GTFPointer)
+        super().__init__(*args, **kwargs)
+
+    def validate(self, val, type):
+        """ """
+        if not type is GTFPointer or not issubclass(type, GTFPointer):
+            raise TypeError(f"type = {type} is not supported")
+        if not isinstance(val, type):
+            raise TypeError(
+                f"{self.__class__} only accecpts {type.__class__} objects"
+            )
+
+    def __setitem__(self, __key: str, __value: GTFPointer) -> None:
+        self.validate(__value, GTFPointer)
+        return super().__setitem__(__key, __value)
+
+    def get_pointer(self, __key: str) -> GTFPointer:
+        """ """
+        return super().__getitem__(__key)
+
+class GenePointerDict(GTFPointerDict):
+    """ """
+    def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]):
+        for val in kwargs.values():
+            self.validate(val, GenePointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GenePointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str) -> GeneAnnotationModel:
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > GENE_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:GenePointer = self.get_pointer(__key)
+        return pointer.load()
+
+class TranscriptPointerDict(GTFPointerDict):
+    """ """
+    def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]):
+        for val in kwargs.values():
+            self.validate(val, TranscriptPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, TranscriptPointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str
+            ) -> TranscriptAnnotationModel:
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > TX_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:TranscriptPointer = self.get_pointer(__key)
+        tx_model = pointer.load()
+        tx_model.is_protein_coding = pointer.is_protein_coding is True
+        return tx_model
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index c6d28d09..4fbdf324 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -1,248 +1,16 @@
 """ Module for GenomicAnnotationOnDisk """
 from __future__ import annotations
-from typing import Dict, List, IO, Iterable, Mapping, Union, TYPE_CHECKING
+from typing import Dict, IO, Union, Tuple
 import io
-from collections import deque, Counter
 from pathlib import Path
-from moPepGen.gtf import GtfIO
 from moPepGen.version import MetaVersion
 from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation
-from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel
-from moPepGen.gtf.TranscriptAnnotationModel import (
-    TranscriptAnnotationModel,
-    GTF_FEATURE_TYPES
+from moPepGen.gtf.GTFPointer import (
+    GenePointerDict, TranscriptPointerDict, iterate_pointer,
+    GenePointer, TranscriptPointer
 )
 
 
-if TYPE_CHECKING:
-    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
-
-GENE_DICT_CACHE_SIZE = 10
-TX_DICT_CACHE_SIZE = 10
-
-class GTFPointer():
-    """ """
-    def __init__(self, handle:IO, key:str, start:int, end:int, source:str):
-        """ """
-        self.handle = handle
-        self.key = key
-        self.start = start
-        self.end = end
-        self.source = source
-
-    def __len__(self) -> int:
-        """ length """
-        return self.end - self.start
-
-class GenePointer(GTFPointer):
-    """ """
-    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
-            transcripts:List[str]=None):
-        super().__init__(handle, key, start, end, source)
-        self.transcripts = transcripts or []
-
-    def load(self) -> GeneAnnotationModel:
-        """ """
-        cur = self.handle.tell()
-        offset = self.start - cur
-        self.handle.seek(offset, 1)
-        buffer:bytes = self.handle.read(len(self))
-        lines = buffer.decode('utf-8').split('\n')
-        if len(lines) > 0:
-            raise ValueError(f"Multiple lines found for gene {self.key}")
-        record = GtfIO.line_to_seq_feature(lines[0])
-        record.__class__ = GeneAnnotationModel
-        record.exons = []
-        record.transcripts = self.transcripts
-        return record
-
-    def to_line(self) -> str:
-        """ """
-        fields = [
-            self.key,
-            self.start,
-            self.end,
-            ','.join(self.transcripts)
-        ]
-        return '\t'.join(fields)
-
-class TranscriptPointer(GTFPointer):
-    """ """
-    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
-            is_protein_coding:bool=None):
-        super().__init__(handle, key, start, end, source)
-        self.is_protein_coding = is_protein_coding
-
-    def load(self) -> TranscriptAnnotationModel:
-        """ """
-        cur = self.handle.tell()
-        offset = self.start - cur
-        self.handle.seek(offset, 1)
-        buffer:bytes = self.handle.read(len(self))
-        lines = buffer.decode('utf-8').rstrip().split('\n')
-        tx_model = TranscriptAnnotationModel()
-
-        for line in lines:
-            record = GtfIO.line_to_seq_feature(line)
-            feature = record.type.lower()
-            if feature not in GTF_FEATURE_TYPES:
-                continue
-            tx_id = record.transcript_id
-            if tx_id != self.key:
-                raise ValueError("Loaded transcript do not match.")
-            record.id = tx_id
-            tx_model.add_record(feature, record)
-        return tx_model
-
-    def to_line(self) -> str:
-        """ """
-        fields = [
-            self.key,
-            self.start,
-            self.end,
-            self.is_protein_coding
-        ]
-        return "\t".join(fields)
-
-def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
-    """ """
-    if not source:
-        count = 0
-        inferred = {}
-
-    cur_gene_id:str = None
-    cur_tx_id:str = None
-    cur_gene_pointer = None
-    cur_tx_pointer = None
-    line_end = 0
-    for line in handle:
-        line:bytes
-        line_start = line_end
-        line_end += len(line)
-        line = line.decode('utf-8')
-
-        if line.startswith('#'):
-            continue
-
-        record = GtfIO.line_to_seq_feature(line)
-
-        if not source:
-            if count > 100:
-                inferred = sorted(inferred.items(), key=lambda x: x[1])
-                source = inferred[-1][0]
-                record.source = source
-            else:
-                count += 1
-                record.infer_annotation_source()
-                inferred_source = record.source
-                if inferred_source not in inferred:
-                    inferred[inferred_source] = 1
-                else:
-                    inferred[inferred_source] += 1
-        else:
-            record.source = source
-
-        if record.type.lower() == 'gene':
-            if cur_gene_pointer:
-                yield cur_gene_pointer
-            if cur_tx_pointer:
-                yield cur_tx_pointer
-            cur_gene_id = record.gene_id
-            cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source)
-            cur_tx_id = None
-            cur_tx_pointer = None
-        else:
-            if cur_tx_id != record.transcript_id:
-                if cur_tx_pointer:
-                    yield cur_tx_pointer
-                cur_tx_id = record.transcript_id
-                cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source)
-            else:
-                cur_tx_pointer.end = line_end
-            if cur_gene_pointer:
-                cur_gene_pointer.transcripts.append(cur_tx_id)
-
-class GTFPointerDict(dict, Mapping[str, GTFPointer]):
-    """ """
-    def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
-        """ """
-        for val in kwargs.values():
-            self.validate(val, GTFPointer)
-        if args:
-            if not isinstance(args[0], dict):
-                raise TypeError('Data type invalid')
-            for val in args[0].values():
-                self.validate(val, GTFPointer)
-        super().__init__(*args, **kwargs)
-
-    def validate(self, val, type):
-        """ """
-        if not type is GTFPointer or not issubclass(type, GTFPointer):
-            raise TypeError(f"type = {type} is not supported")
-        if not isinstance(val, type):
-            raise TypeError(
-                f"{self.__class__} only accecpts {type.__class__} objects"
-            )
-
-    def __setitem__(self, __key: str, __value: GTFPointer) -> None:
-        self.validate(__value, GTFPointer)
-        return super().__setitem__(__key, __value)
-
-    def get_pointer(self, __key: str) -> GTFPointer:
-        """ """
-        return super().__getitem__(__key)
-
-class GenePointerDict(GTFPointerDict):
-    """ """
-    def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]):
-        for val in kwargs.values():
-            self.validate(val, GenePointer)
-        if args:
-            if not isinstance(args[0], dict):
-                raise TypeError('Data type invalid')
-            for val in args[0].values():
-                self.validate(val, GenePointer)
-        super().__init__(*args, **kwargs)
-        self._cache = {}
-        self._cached_keys = deque()
-
-    def __getitem__(self, __key: str) -> GeneAnnotationModel:
-        if __key in self._cache:
-            return self._cache[__key]
-        self._cached_keys.appendleft(__key)
-        if len(self._cached_keys) > GENE_DICT_CACHE_SIZE:
-            key_pop = self._cached_keys.pop()
-            self._cache.pop(key_pop)
-        pointer:GenePointer = self.get_pointer(__key)
-        return pointer.load()
-
-class TranscriptPointerDict(GTFPointerDict):
-    """ """
-    def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]):
-        for val in kwargs.values():
-            self.validate(val, TranscriptPointer)
-        if args:
-            if not isinstance(args[0], dict):
-                raise TypeError('Data type invalid')
-            for val in args[0].values():
-                self.validate(val, TranscriptPointer)
-        super().__init__(*args, **kwargs)
-        self._cache = {}
-        self._cached_keys = deque()
-
-    def __getitem__(self, __key: str
-            ) -> TranscriptAnnotationModel:
-        if __key in self._cache:
-            return self._cache[__key]
-        self._cached_keys.appendleft(__key)
-        if len(self._cached_keys) > TX_DICT_CACHE_SIZE:
-            key_pop = self._cached_keys.pop()
-            self._cache.pop(key_pop)
-        pointer:TranscriptPointer = self.get_pointer(__key)
-        tx_model = pointer.load()
-        tx_model.is_protein_coding = pointer.is_protein_coding is True
-        return tx_model
-
 class GenomicAnnotationOnDisk(GenomicAnnotation):
     """ """
     def __init__(self, genes:GenePointerDict=None,
@@ -280,9 +48,9 @@ def infer_source(self):
                 inferred[s] = 1
         self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0]
 
-    def generate_index(self, handle:Union[IO, str], source:str=None):
+    def init_handle(self, handle:Union[str, IO, Path]):
         """ """
-        if isinstance(handle, str):
+        if isinstance(handle, str) or isinstance(handle, Path):
             ihandle = open(handle, 'rb')
         elif isinstance(handle, io.IOBase):
             ihandle = handle
@@ -291,6 +59,19 @@ def generate_index(self, handle:Union[IO, str], source:str=None):
 
         self.handle = ihandle
 
+    @staticmethod
+    def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
+        """ """
+        if isinstance(file, str):
+            file = Path(file)
+        gene_idx_file = file.parent/f"{file.stem}_gene.idx"
+        tx_idx_file = file.parent/f"{file.stem}_tx.idx"
+        return gene_idx_file, tx_idx_file
+
+    def generate_index(self, handle:Union[IO, str, Path], source:str=None):
+        """ """
+        self.init_handle(handle)
+
         for pointer in iterate_pointer(self.handle, source):
             if isinstance(pointer, GenePointer):
                 self.genes[pointer.key] = pointer
@@ -302,26 +83,72 @@ def generate_index(self, handle:Union[IO, str], source:str=None):
         else:
             self.infer_source()
 
-    def load_index(self, handle:IO):
+    def load_index(self, file:Union[str,Path]):
         """ """
-        pass
-
-def index_gtf(file:Path, biotype:List[str]=None, source:str=None):
-    """"""
-    anno = GenomicAnnotationOnDisk()
-    with open(Path, 'rb') as handle:
-        anno.generate_index(handle, source)
+        self.init_handle(file)
+        gene_idx_file, tx_idx_file = self.get_index_files(file)
+
+        gene_source = None
+        with open(gene_idx_file, 'rt') as handle:
+            for line in handle:
+                if line.startswith('#'):
+                    line = line.lstrip('#').lstrip()
+                    if line.startswith('source='):
+                        gene_source = line.split('=')[1]
+                    continue
+
+                if not gene_source:
+                    raise ValueError("Cannot find source from gene idx.")
+
+                fields = line.rstrip().split('\t')
+                pointer = GenePointer(
+                    self.handle, key=fields[0],
+                    start=int(fields[1]), end=int(fields[2]),
+                    source=gene_source, transcripts=fields[3].split(',')
+                )
+                self.genes[pointer.key] = pointer
 
-    gene_idx_file = file.parent/f"{file.stem}_gene.idx"
-    with open(gene_idx_file, 'wt') as handle:
-        handle.write(f"# source={anno.source}\n")
-        for gene in anno.genes.keys():
-            gene_pointer:GenePointer = anno.genes.get_pointer(gene)
-            handle.write(gene_pointer.to_line() + '\n')
+        tx_source = None
+        with open(tx_idx_file, 'rt') as handle:
+            for line in handle:
+                if line.startswith('#'):
+                    line = line.lstrip('#').lstrip()
+                    if line.startswith('source='):
+                        tx_source = line.split('=')[1]
+                    continue
+
+                if not tx_source:
+                    raise ValueError("Cannot find source from gene idx.")
+
+                if tx_source != gene_source:
+                    raise ValueError('tx_source does not match with gene_source')
+
+                fields = line.rstrip().split('\t')
+                is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]]
+                pointer = TranscriptPointer(
+                    handle=self.handle, key=fields[0],
+                    start=int(fields[1]), end=int(fields[2]),
+                    source=tx_source, is_protein_coding=is_protein_coding
+                )
+                self.transcripts[pointer.key] = pointer
 
-    tx_idx_file = file.parent/f"{file.stem}_tx.idx"
-    with open(tx_idx_file, 'wt') as handle:
-        handle.write(f"# source={anno.source}\n")
-        for tx in anno.transcripts.keys():
-            tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
-            handle.write(tx_pointer.to_line() + '\n')
+    @classmethod
+    def index_gtf(cls, file:Path, source:str=None):
+        """"""
+        anno = GenomicAnnotationOnDisk()
+        with open(file, 'rb') as handle:
+            anno.generate_index(handle, source)
+
+        gene_idx_file, tx_idx_file = cls.get_index_files(file)
+
+        with open(gene_idx_file, 'wt') as handle:
+            handle.write(f"# source={anno.source}\n")
+            for gene in anno.genes.keys():
+                gene_pointer:GenePointer = anno.genes.get_pointer(gene)
+                handle.write(gene_pointer.to_line() + '\n')
+
+        with open(tx_idx_file, 'wt') as handle:
+            handle.write(f"# source={anno.source}\n")
+            for tx in anno.transcripts.keys():
+                tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
+                handle.write(tx_pointer.to_line() + '\n')
diff --git a/moPepGen/gtf/__init__.py b/moPepGen/gtf/__init__.py
index b8fd2342..1118af9a 100644
--- a/moPepGen/gtf/__init__.py
+++ b/moPepGen/gtf/__init__.py
@@ -3,3 +3,4 @@
 from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES
 from .GenomicAnnotation import GenomicAnnotation
 from .GeneAnnotationModel import GeneAnnotationModel
+from .GenomicAnnotationOnDisk import GenomicAnnotationOnDisk
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 90acf801..569f572c 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -4,6 +4,7 @@
 import sys
 from test.integration import TestCaseIntegration
 from moPepGen import cli
+from moPepGen.gtf import GenomicAnnotationOnDisk
 
 
 class TestGenerateIndex(TestCaseIntegration):
@@ -46,3 +47,15 @@ def test_generate_index_case1(self):
         expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat',
             'canonical_peptides.pkl', 'coding_transcripts.pkl'}
         self.assertEqual(files, expected)
+
+class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration):
+    """ """
+    def test_generate_index(self):
+        """ """
+        anno = GenomicAnnotationOnDisk()
+        anno.index_gtf(self.data_dir/'annotation.gtf')
+
+    def test_load_index(self):
+        """ """
+        anno = GenomicAnnotationOnDisk()
+        anno.load_index(self.data_dir/'annotation.gtf')

From dc149caac1562eb697ba9adbc7b0333b00efaf91 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Sat, 24 Jun 2023 23:36:50 -0700
Subject: [PATCH 04/18] fix (GenomicAnnotationOnDisk): check protein coding
 when generating gtf index

---
 moPepGen/cli/__init__.py                |  2 +-
 moPepGen/cli/generate_index.py          | 30 +++++++++++++++++++++++--
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 21 -----------------
 test/integration/test_generate_index.py |  6 +++--
 4 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py
index 90f9e0a1..3881c6ea 100644
--- a/moPepGen/cli/__init__.py
+++ b/moPepGen/cli/__init__.py
@@ -1,5 +1,5 @@
 """ Module for command line interface """
-from .generate_index import add_subparser_generate_index, generate_index
+from .generate_index import add_subparser_generate_index, generate_index, index_gtf
 from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide
 from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide
 from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation
diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index fa558e6e..cb19dc83 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -6,6 +6,7 @@
 moPepGen to avoid processing the reference files repeatedly and save massive
 time. """
 from __future__ import annotations
+from typing import TYPE_CHECKING
 import argparse
 from pathlib import Path
 import pickle
@@ -15,6 +16,9 @@
 from moPepGen.cli import common
 
 
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer
+
 # pylint: disable=W0212
 def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     """ CLI for moPepGen generateIndex """
@@ -41,6 +45,30 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     return p
 
 
+def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
+        invalid_protein_as_noncoding:bool=True):
+    """"""
+    anno = gtf.GenomicAnnotationOnDisk()
+    with open(file, 'rb') as handle:
+        anno.generate_index(handle, source)
+
+    if proteome:
+        anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
+
+    gene_idx_file, tx_idx_file = anno.get_index_files(file)
+
+    with open(gene_idx_file, 'wt') as handle:
+        handle.write(f"# source={anno.source}\n")
+        for gene in anno.genes.keys():
+            gene_pointer:GenePointer = anno.genes.get_pointer(gene)
+            handle.write(gene_pointer.to_line() + '\n')
+
+    with open(tx_idx_file, 'wt') as handle:
+        handle.write(f"# source={anno.source}\n")
+        for tx in anno.transcripts.keys():
+            tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
+            handle.write(tx_pointer.to_line() + '\n')
+
 def generate_index(args:argparse.Namespace):
     """ Generate  """
     path_genome:Path = args.genome_fasta
@@ -85,8 +113,6 @@ def generate_index(args:argparse.Namespace):
     proteome = aa.AminoAcidSeqDict()
     proteome.dump_fasta(parth_proteome, source=args.reference_source)
 
-    anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
-
     with lzma.open(output_anno, 'wt') as handle:
         GtfIO.write(handle, anno)
     if not quiet:
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 4fbdf324..ad6d23de 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -131,24 +131,3 @@ def load_index(self, file:Union[str,Path]):
                     source=tx_source, is_protein_coding=is_protein_coding
                 )
                 self.transcripts[pointer.key] = pointer
-
-    @classmethod
-    def index_gtf(cls, file:Path, source:str=None):
-        """"""
-        anno = GenomicAnnotationOnDisk()
-        with open(file, 'rb') as handle:
-            anno.generate_index(handle, source)
-
-        gene_idx_file, tx_idx_file = cls.get_index_files(file)
-
-        with open(gene_idx_file, 'wt') as handle:
-            handle.write(f"# source={anno.source}\n")
-            for gene in anno.genes.keys():
-                gene_pointer:GenePointer = anno.genes.get_pointer(gene)
-                handle.write(gene_pointer.to_line() + '\n')
-
-        with open(tx_idx_file, 'wt') as handle:
-            handle.write(f"# source={anno.source}\n")
-            for tx in anno.transcripts.keys():
-                tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
-                handle.write(tx_pointer.to_line() + '\n')
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 569f572c..5bdc075d 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -3,7 +3,7 @@
 import subprocess as sp
 import sys
 from test.integration import TestCaseIntegration
-from moPepGen import cli
+from moPepGen import cli, aa
 from moPepGen.gtf import GenomicAnnotationOnDisk
 
 
@@ -52,8 +52,10 @@ class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration):
     """ """
     def test_generate_index(self):
         """ """
+        proteome = aa.AminoAcidSeqDict()
+        proteome.dump_fasta(self.data_dir/'translate.fasta')
         anno = GenomicAnnotationOnDisk()
-        anno.index_gtf(self.data_dir/'annotation.gtf')
+        cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome)
 
     def test_load_index(self):
         """ """

From d435bca79f67900ce8919d6497b203a8cbc5e41d Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Mon, 26 Jun 2023 23:45:20 -0700
Subject: [PATCH 05/18] fix (GenerateIndex): moving to use gene and tx index

---
 moPepGen/cli/generate_index.py          | 61 +++++++++++++++++++++----
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 19 ++++++--
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index cb19dc83..d93a8700 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -8,9 +8,11 @@
 from __future__ import annotations
 from typing import TYPE_CHECKING
 import argparse
+import os
 from pathlib import Path
 import pickle
 import lzma
+import gzip
 from moPepGen import dna, aa, gtf, logger
 from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
@@ -49,7 +51,14 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
         invalid_protein_as_noncoding:bool=True):
     """"""
     anno = gtf.GenomicAnnotationOnDisk()
-    with open(file, 'rb') as handle:
+
+    if file.suffix.lower() == '.gtf':
+        opener = open
+    elif file.suffix.lower() == '.gz':
+        opener = gzip.open
+    elif file.suffix.lower() == '.lz':
+        opener = lzma.open
+    with opener(file, 'rb') as handle:
         anno.generate_index(handle, source)
 
     if proteome:
@@ -69,6 +78,37 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
             tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
             handle.write(tx_pointer.to_line() + '\n')
 
+def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path:
+    """ """
+    if symlink:
+        if file.suffix.lower() == '.gz':
+            output_file = output_dir/'annotation.gtf.gz'
+        elif file.suffix.lower() == '.gtf':
+            output_file = output_dir/'annotation.gtf'
+        else:
+            raise ValueError(f"File not supported {file}")
+        os.symlink(file, output_file)
+    else:
+        if file.suffix.lower() == '.gtf':
+            i_opener = open
+        elif file.suffix.lower() == '.gz':
+            i_opener = gzip.open
+
+        if compression == None:
+            o_opener = open
+            output_file = output_dir/'annotation.gtf'
+        elif compression == 'lzma':
+            o_opener = lzma.open
+            output_file = output_dir/'annotation.gtf.lz'
+        elif compression == 'gzip':
+            o_opener = gzip.open
+            output_file = output_dir/'annotation.gtf.gz'
+
+        with i_opener(file, 'rt') as ihandle, o_opener(output_file, 'wt') as ohandle:
+            for line in ihandle:
+                ohandle.write(line)
+    return output_file
+
 def generate_index(args:argparse.Namespace):
     """ Generate  """
     path_genome:Path = args.genome_fasta
@@ -84,6 +124,9 @@ def generate_index(args:argparse.Namespace):
     invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
     quiet:bool = args.quiet
 
+    symlink:bool = False
+    compression:bool = 'lzma'
+
     output_dir:Path = args.output_dir
     output_genome = output_dir/"genome.pkl"
     output_proteome = output_dir/"proteome.pkl"
@@ -105,26 +148,24 @@ def generate_index(args:argparse.Namespace):
         logger('Genome FASTA saved to disk.')
     del genome
 
-    anno = gtf.GenomicAnnotation()
-    anno.dump_gtf(path_gtf, source=args.reference_source)
-    if not quiet:
-        logger('Genome annotation GTF loaded.')
-
     proteome = aa.AminoAcidSeqDict()
     proteome.dump_fasta(parth_proteome, source=args.reference_source)
+    if not quiet:
+        logger('Proteome FASTA loaded.')
 
-    with lzma.open(output_anno, 'wt') as handle:
-        GtfIO.write(handle, anno)
+    output_gtf = create_gtf_copy(path_gtf, output_dir, symlink, compression)
+    index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
     if not quiet:
         logger('Genome annotation GTF saved to disk.')
 
-    if not quiet:
-        logger('Proteome FASTA loaded.')
     with open(output_proteome, 'wb') as handle:
         pickle.dump(proteome, handle)
     if not quiet:
         logger('Proteome FASTA saved to disk.')
 
+    anno = gtf.GenomicAnnotationOnDisk()
+    anno.load_index(output_gtf)
+
     # canoincal peptide pool
     canonical_peptides = proteome.create_unique_peptide_pool(
         anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index ad6d23de..13e78d5b 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -2,6 +2,8 @@
 from __future__ import annotations
 from typing import Dict, IO, Union, Tuple
 import io
+import gzip
+import lzma
 from pathlib import Path
 from moPepGen.version import MetaVersion
 from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation
@@ -50,8 +52,16 @@ def infer_source(self):
 
     def init_handle(self, handle:Union[str, IO, Path]):
         """ """
-        if isinstance(handle, str) or isinstance(handle, Path):
-            ihandle = open(handle, 'rb')
+        if isinstance(handle, str):
+            handle = Path(handle)
+
+        if isinstance(handle, Path):
+            if handle.suffix.lower() == '.gtf':
+                ihandle = open(handle, 'rb')
+            elif handle.suffix.lower() == '.lz':
+                ihandle = lzma.open(handle, 'rb')
+            elif handle.suffix.lower() == '.gz':
+                ihandle = gzip.open(handle, 'rb')
         elif isinstance(handle, io.IOBase):
             ihandle = handle
         else:
@@ -64,8 +74,9 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
         """ """
         if isinstance(file, str):
             file = Path(file)
-        gene_idx_file = file.parent/f"{file.stem}_gene.idx"
-        tx_idx_file = file.parent/f"{file.stem}_tx.idx"
+        basename = str(file.name).split('.gtf')[0]
+        gene_idx_file = file.parent/f"{basename}_gene.idx"
+        tx_idx_file = file.parent/f"{basename}_tx.idx"
         return gene_idx_file, tx_idx_file
 
     def generate_index(self, handle:Union[IO, str, Path], source:str=None):

From 81765b5b811248df96f8eb3e23149c2465fc2bcd Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Tue, 27 Jun 2023 18:13:43 -0700
Subject: [PATCH 06/18] fix (moPepGen): test cases passed

---
 moPepGen/__init__.py                    |   2 +-
 moPepGen/cli/call_alt_translation.py    |   3 +-
 moPepGen/cli/call_noncoding_peptide.py  |   3 +-
 moPepGen/cli/common.py                  |  16 +-
 moPepGen/cli/generate_index.py          |   8 +-
 moPepGen/gtf/GTFIndexMetadata.py        |  46 ++++
 moPepGen/gtf/GTFPointer.py              |  26 +-
 moPepGen/gtf/GenomicAnnotationOnDisk.py |  47 ++--
 moPepGen/version.py                     |  13 +-
 test/files/fusion/star_fusion.gvf       |   6 +-
 test/files/peptides/variant.fasta       | 332 ++++++++++--------------
 test/files/vep/vep_gINDEL.gvf           |   2 +-
 test/integration/test_generate_index.py |   3 +-
 13 files changed, 266 insertions(+), 241 deletions(-)
 create mode 100644 moPepGen/gtf/GTFIndexMetadata.py

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
index 87b412aa..7952e5dd 100644
--- a/moPepGen/__init__.py
+++ b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '1.0.0'
+__version__ = '1.0.1'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py
index 5b8ba4de..a75f3f19 100644
--- a/moPepGen/cli/call_alt_translation.py
+++ b/moPepGen/cli/call_alt_translation.py
@@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None:
 
     peptide_pool = aa.VariantPeptidePool()
 
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if not tx_model.is_protein_coding:
             continue
 
diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
index 324267ed..2cc51082 100644
--- a/moPepGen/cli/call_noncoding_peptide.py
+++ b/moPepGen/cli/call_noncoding_peptide.py
@@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
     orf_pool = []
 
     i = 0
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if inclusion_biotypes and \
                 tx_model.transcript.biotype not in inclusion_biotypes:
             continue
diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py
index 71b463c4..34bbf18b 100644
--- a/moPepGen/cli/common.py
+++ b/moPepGen/cli/common.py
@@ -204,7 +204,7 @@ def add_args_source(parser:argparse.ArgumentParser):
 def load_references(args:argparse.Namespace, load_genome:bool=True,
         load_canonical_peptides:bool=True, load_proteome:bool=False,
         invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False
-        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]:
+        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]:
     """ Load reference files. If index_dir is specified, data will be loaded
     from pickles, otherwise, will read from FASTA and GTF. """
     index_dir:Path = args.index_dir
@@ -225,11 +225,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
                 if not version.is_valid(genome.version):
                     raise err.IndexVersionNotMatchError(version, genome.version)
 
-        with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle:
-            annotation = gtf.GenomicAnnotation()
-            annotation.dump_gtf(handle)
-            if not version.is_valid(annotation.version):
-                raise err.IndexVersionNotMatchError(version, genome.version)
+        gtf_file = list(index_dir.glob('annotation.gtf*'))[0]
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.init_handle(gtf_file)
+        annotation.load_index(gtf_file)
 
         if load_proteome:
             with open(f'{index_dir}/proteome.pkl', 'rb') as handle:
@@ -249,8 +248,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
         if (check_protein_coding is True or load_canonical_peptides is True) and \
                 args.proteome_fasta is None:
             raise ValueError('--proteome-fasta was not specified.')
-        annotation = gtf.GenomicAnnotation()
-        annotation.dump_gtf(args.annotation_gtf, source=args.reference_source)
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.generate_index(args.annotation_gtf, source=args.reference_source)
+
         if not quiet:
             logger('Annotation GTF loaded.')
 
diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index d93a8700..392d0620 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -16,6 +16,7 @@
 from moPepGen import dna, aa, gtf, logger
 from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
+from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
 
 
 if TYPE_CHECKING:
@@ -65,15 +66,16 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
         anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
 
     gene_idx_file, tx_idx_file = anno.get_index_files(file)
+    metadata = GTFIndexMetadata()
 
     with open(gene_idx_file, 'wt') as handle:
-        handle.write(f"# source={anno.source}\n")
+        metadata.write(handle)
         for gene in anno.genes.keys():
             gene_pointer:GenePointer = anno.genes.get_pointer(gene)
             handle.write(gene_pointer.to_line() + '\n')
 
     with open(tx_idx_file, 'wt') as handle:
-        handle.write(f"# source={anno.source}\n")
+        metadata.write(handle)
         for tx in anno.transcripts.keys():
             tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
             handle.write(tx_pointer.to_line() + '\n')
@@ -164,7 +166,7 @@ def generate_index(args:argparse.Namespace):
         logger('Proteome FASTA saved to disk.')
 
     anno = gtf.GenomicAnnotationOnDisk()
-    anno.load_index(output_gtf)
+    anno.generate_index(output_gtf)
 
     # canoincal peptide pool
     canonical_peptides = proteome.create_unique_peptide_pool(
diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py
new file mode 100644
index 00000000..f235a099
--- /dev/null
+++ b/moPepGen/gtf/GTFIndexMetadata.py
@@ -0,0 +1,46 @@
+""" """
+from typing import IO
+from moPepGen.version import MetaVersion
+
+MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##'
+
+class GTFIndexMetadata():
+    """ """
+    def __init__(self, source:str=None, version:MetaVersion=None):
+        """ """
+        self.source = source
+        self.version = version or MetaVersion()
+
+    def write(self, handle:IO):
+        """ """
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        handle.write(f"{prefix}source={self.source}\n")
+        handle.write(f"{prefix}python={self.version.python}\n")
+        handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n")
+        handle.write(f"{prefix}biopython={self.version.biopython}\n")
+
+    @classmethod
+    def parse(cls, handle:IO):
+        """ """
+        pos = handle.tell()
+        it = handle.readline()
+        metadata = {}
+        version = {}
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        while it and it.startswith(prefix):
+            key, val = it.lstrip(prefix).rstrip().split('=', 1)
+            if val == '':
+                val = None
+            if key in ('python', 'moPepGen', 'biopython'):
+                version[key.lower()] = val
+            else:
+                metadata[key] = val
+
+            pos = handle.tell()
+            it = handle.readline()
+
+        handle.seek(pos)
+
+        version = MetaVersion(**version)
+
+        return cls(source=metadata['source'], version=version)
diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py
index 3a7937c5..bcc7b04e 100644
--- a/moPepGen/gtf/GTFPointer.py
+++ b/moPepGen/gtf/GTFPointer.py
@@ -43,8 +43,8 @@ def load(self) -> GeneAnnotationModel:
         offset = self.start - cur
         self.handle.seek(offset, 1)
         buffer:bytes = self.handle.read(len(self))
-        lines = buffer.decode('utf-8').split('\n')
-        if len(lines) > 0:
+        lines = buffer.decode('utf-8').rstrip().split('\n')
+        if len(lines) > 1:
             raise ValueError(f"Multiple lines found for gene {self.key}")
         record = GtfIO.line_to_seq_feature(lines[0])
         record.__class__ = GeneAnnotationModel
@@ -88,6 +88,7 @@ def load(self) -> TranscriptAnnotationModel:
                 raise ValueError("Loaded transcript do not match.")
             record.id = tx_id
             tx_model.add_record(feature, record)
+        tx_model.sort_records()
         return tx_model
 
     def to_line(self) -> str:
@@ -157,6 +158,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T
                 cur_tx_pointer.end = line_end
             if cur_gene_pointer:
                 cur_gene_pointer.transcripts.add(cur_tx_id)
+    if cur_gene_pointer:
+        yield cur_gene_pointer
+    if cur_tx_pointer:
+        yield cur_tx_pointer
 
 class GTFPointerDict(dict, Mapping[str, GTFPointer]):
     """ """
@@ -171,6 +176,11 @@ def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
                 self.validate(val, GTFPointer)
         super().__init__(*args, **kwargs)
 
+    def __iter__(self) -> Iterable[str]:
+        """"""
+        for k in self.keys():
+            yield k
+
     def validate(self, val, type):
         """ """
         if not type is GTFPointer or not issubclass(type, GTFPointer):
@@ -210,7 +220,9 @@ def __getitem__(self, __key: str) -> GeneAnnotationModel:
             key_pop = self._cached_keys.pop()
             self._cache.pop(key_pop)
         pointer:GenePointer = self.get_pointer(__key)
-        return pointer.load()
+        val = pointer.load()
+        self._cache[__key] = val
+        return val
 
 class TranscriptPointerDict(GTFPointerDict):
     """ """
@@ -235,6 +247,8 @@ def __getitem__(self, __key: str
             key_pop = self._cached_keys.pop()
             self._cache.pop(key_pop)
         pointer:TranscriptPointer = self.get_pointer(__key)
-        tx_model = pointer.load()
-        tx_model.is_protein_coding = pointer.is_protein_coding is True
-        return tx_model
+        val = pointer.load()
+        val.is_protein_coding = pointer.is_protein_coding is True
+        val.transcript.source = pointer.source
+        self._cache[__key] = val
+        return val
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 13e78d5b..55d48cf7 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -5,12 +5,14 @@
 import gzip
 import lzma
 from pathlib import Path
+from moPepGen import err
 from moPepGen.version import MetaVersion
 from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation
 from moPepGen.gtf.GTFPointer import (
     GenePointerDict, TranscriptPointerDict, iterate_pointer,
     GenePointer, TranscriptPointer
 )
+from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
 
 
 class GenomicAnnotationOnDisk(GenomicAnnotation):
@@ -77,6 +79,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
         basename = str(file.name).split('.gtf')[0]
         gene_idx_file = file.parent/f"{basename}_gene.idx"
         tx_idx_file = file.parent/f"{basename}_tx.idx"
+
         return gene_idx_file, tx_idx_file
 
     def generate_index(self, handle:Union[IO, str, Path], source:str=None):
@@ -99,18 +102,25 @@ def load_index(self, file:Union[str,Path]):
         self.init_handle(file)
         gene_idx_file, tx_idx_file = self.get_index_files(file)
 
-        gene_source = None
+        if not gene_idx_file.exists():
+            raise ValueError(f"Gene index file cannot be found in {file.parent}")
+        if not tx_idx_file.exists():
+            raise ValueError(f"Transcript index file cannot be found in {file.parent}")
+
+        version = MetaVersion()
+
         with open(gene_idx_file, 'rt') as handle:
+            metadata = GTFIndexMetadata.parse(handle)
+            if not version.is_valid(metadata.version):
+                raise err.IndexVersionNotMatchError(version, metadata.version)
+
+            gene_source = metadata.source
+            if not gene_source:
+                raise ValueError("Cannot find source from gene idx.")
+
             for line in handle:
                 if line.startswith('#'):
-                    line = line.lstrip('#').lstrip()
-                    if line.startswith('source='):
-                        gene_source = line.split('=')[1]
                     continue
-
-                if not gene_source:
-                    raise ValueError("Cannot find source from gene idx.")
-
                 fields = line.rstrip().split('\t')
                 pointer = GenePointer(
                     self.handle, key=fields[0],
@@ -119,21 +129,20 @@ def load_index(self, file:Union[str,Path]):
                 )
                 self.genes[pointer.key] = pointer
 
-        tx_source = None
         with open(tx_idx_file, 'rt') as handle:
+            metadata = GTFIndexMetadata.parse(handle)
+            if not version.is_valid(metadata.version):
+                raise err.IndexVersionNotMatchError(version, metadata.version)
+
+            tx_source = metadata.source
+            if not tx_source:
+                raise ValueError("Cannot find source from gene idx.")
+            if tx_source != gene_source:
+                    raise ValueError('tx_source does not match with gene_source')
+
             for line in handle:
                 if line.startswith('#'):
-                    line = line.lstrip('#').lstrip()
-                    if line.startswith('source='):
-                        tx_source = line.split('=')[1]
                     continue
-
-                if not tx_source:
-                    raise ValueError("Cannot find source from gene idx.")
-
-                if tx_source != gene_source:
-                    raise ValueError('tx_source does not match with gene_source')
-
                 fields = line.rstrip().split('\t')
                 is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]]
                 pointer = TranscriptPointer(
diff --git a/moPepGen/version.py b/moPepGen/version.py
index 01a5b4b9..8c4d450c 100644
--- a/moPepGen/version.py
+++ b/moPepGen/version.py
@@ -6,15 +6,15 @@
 from moPepGen import __version__
 
 
-MINIMAL_VERSION = '0.12.0'
+MINIMAL_VERSION = '1.0.1'
 
 class MetaVersion():
     """ Versions """
-    def __init__(self):
+    def __init__(self, python:str=None, biopython:str=None, mopepgen:str=None):
         """ constructor """
-        self.python = sys.version_info[:3]
-        self.biopython = Bio.__version__
-        self.mopepgen = __version__
+        self.python = python or '.'.join([str(x) for x in sys.version_info[:3]])
+        self.biopython = biopython or Bio.__version__
+        self.mopepgen = mopepgen or __version__
 
     def __eq__(self, other:MetaVersion):
         """ equal to """
@@ -45,5 +45,4 @@ def __ne__(self, other:MetaVersion):
 
     def __repr__(self) -> str:
         """ str representation """
-        python = '.'.join([str(x) for x in self.python])
-        return f"python={python}, biopython={self.biopython}, moPepGen={self.mopepgen}"
+        return f"python={self.python}, biopython={self.biopython}, moPepGen={self.mopepgen}"
diff --git a/test/files/fusion/star_fusion.gvf b/test/files/fusion/star_fusion.gvf
index b11972e9..4d14aa2d 100644
--- a/test/files/fusion/star_fusion.gvf
+++ b/test/files/fusion/star_fusion.gvf
@@ -1,5 +1,5 @@
 ##fileformat=VCFv4.2
-##mopepgen_version=0.12.0
+##mopepgen_version=1.0.1
 ##parser=parseSTARFusion
 ##reference_index=/home/chenghaozhu/projects/private-moPepGen/test/files/index
 ##genome_fasta=
@@ -13,7 +13,7 @@
 ##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
 ##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-ENSG00000128408.9	832	FUSION-ENST00000614167.2:831-ENST00000642151.1:29	A	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
-ENSG00000128408.9	832	FUSION-ENST00000614168.2:831-ENST00000642151.1:29	A	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
+ENSG00000128408.9	832	FUSION-ENST00000614167.2:831-ENST00000642151.1:29	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
+ENSG00000128408.9	832	FUSION-ENST00000614168.2:831-ENST00000642151.1:29	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
 ENSG00000244486.9	1752	FUSION-ENST00000622235.5:1751-ENST00000642151.1:29	G	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000622235.5;GENE_SYMBOL=SCARF2;GENOMIC_POSITION=chr22:3213:3213;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
 ENSG00000279973.2	93	FUSION-ENST00000624155.2:92-ENST00000622235.5:968	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000624155.2;GENE_SYMBOL=CU104787.1;GENOMIC_POSITION=chr22:5311:5311;ACCEPTER_GENE_ID=ENSG00000244486.9;ACCEPTER_TRANSCRIPT_ID=ENST00000622235.5;ACCEPTER_SYMBOL=SCARF2;ACCEPTER_POSITION=969;ACCEPTER_GENOMIC_POSITION=chr22:3995:3995
diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta
index 8aa7c5ea..2c8adba0 100644
--- a/test/files/peptides/variant.fasta
+++ b/test/files/peptides/variant.fasta
@@ -1,218 +1,170 @@
->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
-CLTCEPGWNR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
-LPPCDEFVGARR
->CIRC-ENST00000614167.2-E2-E1|W2F-11|1
-AGASVPLRRHFEPDGK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|2
-HETLFLLTFPRER
->ENST00000614167.2|RES-101-A-G|W2F-9|3 ENST00000614167.2|RES-101-A-G|W2F-9|3
-IIGGDTEAFDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|22
-MAVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|20
-MAVALPRDLRQDANLAK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|3
-HETLFLLTFPR
->CIRC-ENST00000614167.2-E2-E1|18
-MFKFMTR
->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
-ARHETLCC
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
-NNLVVDISERAGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
-HETLWLLTFPRERER
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
+MFKFMAR
+>CIRC-ENST00000614167.2-E2-E1|3
+RHWEPDGKLR
+>CIRC-ENST00000614167.2-E2-E1|7
+AGASVPLR
 >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2
 ERLYLCGVTGSPTENCAK
->ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3
-IIGGDTEAWDVQVHGQK
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2
+HTVVAYK
+>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
+VFNARNRIIGGDTEAWDVQVHGQK
+>CIRC-ENST00000614167.2-E2-E1|4
+RHWEPDGK
+>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
+MEGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
+MNNLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
+FKFMARR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
+NNLVVDISERAGASVPLRR
+>CIRC-ENST00000642151.1-E1-E2|1
+STSGPSK
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
+HETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|22
+MAVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|14
+MKPLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|11
+KPLVVDISERAGASVPLR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12
+LESTTRKVHR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1
+LYLCGVTGSPTENCAK
+>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1
+NTLSGMQKFMGEDUNFHERK
+>CIRC-ENST00000614167.2-E2-E1|16
+FKFMTRR
+>ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1
+GAVPARRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10
+RLPPCDEFVGAR
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
+LMKEEAAA
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
+LPPCDEFVGAR
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5
 RSKHTVVAYK
+>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
+TKCDQPCATGFYGEGCSHR
+>CIRC-ENST00000614167.2-E2-E1|17
+MFKFMTRR
+>ENST00000614167.2|SNV-1179-G-T|2
+RIQGARATLUF
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1
+HTVVAYKDAI
+>CIRC-ENST00000614167.2-E2-E1|20
+MAVALPRDLRQDANLAK
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5
+HETLWLLTFPRER
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
+SKHTVVAYKDAI
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
+ARHETLWLLTFPR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
+SKHTVVAYK
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5
+NNLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|6
+AGASVPLRR
 >CIRC-ENST00000614167.2-E2-E1|1
 HWEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|16
-FKFMTRR
->ENST00000622235.5|RES-202-G-A|W2F-8|2 ENST00000622235.5|RES-202-G-A|W2F-8|2
-CLTCEPGFNR
+>CIRC-ENST00000614167.2-E2-E1|9
+KPLVVDISERAGASVPLRR
+>CIRC-ENST00000642151.1-E1-E2|4
+MPFRRSTSGPSK
+>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
+NTLSGMQKFMGEDUNFHER
+>CIRC-ENST00000614167.2-E2-E1|5
+AGASVPLRRHWEPDGK
 >ENST00000614167.2|SNV-1179-G-T|1
 IQGARATLUF
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
-HETLWLLTFPR
->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
-MEGAGPRGAVPAR
->CIRC-ENST00000614167.2-E2-E1|W2F-3|2
-RHFEPDGK
+>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
+EGAGPRGAVPARR
+>CIRC-ENST00000614167.2-E2-E1|23
+MAVALPR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
+MNNLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|13
+KPLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
+MNNLVVDISERAGASVPLRR
+>CIRC-ENST00000614167.2-E2-E1|19
+AVALPRDLRQDANLAK
+>CIRC-ENST00000642151.1-E1-E2|2
+RSTSGPSK
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
+HETLWLLTFPRERER
 >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9
 EATEKARHETLWLLTFPR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9
-RLPPCDEFVGARR
->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
-CLTCEPGWNRTK
->CIRC-ENST00000614167.2-E2-E1|4
-RHWEPDGK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|1
-HETLFLLTFPRERER
->CIRC-ENST00000614167.2-E2-E1|2
-HWEPDGK
->ENST00000622235.5|RES-202-G-A|W2F-19|1 ENST00000622235.5|RES-202-G-A|W2F-19|1
-GQQPCTVAEGRCLTCEPGFNRTK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1
-HTVVAYKDAI
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-12|1
-EATEKARHETLFLLTFPR
->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
-TKCDQPCATGFYGEGCSHR
->ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6
-GQQPCTVAEGRCLTCEPGWNR
->CIRC-ENST00000614167.2-E2-E1|17
-MFKFMTRR
->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
-EGAGPRGAVPAR
 >CIRC-ENST00000614167.2-E2-E1|12
 MKPLVVDISERAGASVPLR
->CIRC-ENST00000614167.2-E2-E1|23
-MAVALPR
->ENST00000614167.2|RES-101-A-G|W2F-11|1 ENST00000614167.2|RES-101-A-G|W2F-11|1
-NRIIGGDTEAFDVQVHGQKIK
+>ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1
+TKCDQPCATGFYGEGCSHRCPPCR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
+LPPCDEFVGARR
 >CIRC-ENST00000614167.2-E2-E1|8
 ARHETFGC
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
-NTLSGMQKFMGEDUNFHER
->CIRC-ENST00000614167.2-E2-E1|W2F-3|1
-RHFEPDGKLR
+>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
+GQQPCTVAEGRCLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|15
+EATEKARHETFGC
+>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
+CLTCEPGWNR
+>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
+CLTCEPGWNRTK
+>ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3
+IIGGDTEAWDVQVHGQK
+>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
+ARHETLCC
 >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3
 NNLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7
+ARHETLWLLTFPRER
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6
+LPPCDEFVGARRSK
+>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
+EATEKARHE
+>CIRC-ENST00000614167.2-E2-E1|21
+AVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
+ERERLYLCGVTGSPTENCAK
 >CIRC-ENST00000642151.1-E1-E2|3
 PFRRSTSGPSK
->ENST00000614167.2|RES-101-A-G|W2F-16|1 ENST00000614167.2|RES-101-A-G|W2F-16|1
-VFNARNRIIGGDTEAFDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|3
-RHWEPDGKLR
->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
-GQQPCTVAEGRCLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|18
+MFKFMTR
+>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
+MEGAGPRGAVPARR
+>CIRC-ENST00000614167.2-E2-E1|10
+MKPLVVDISERAGASVPLRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
+WRRLPPCDEFVGAR
+>CIRC-ENST00000614167.2-E2-E1|2
+HWEPDGK
 >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1
 MKEEAAA
->CIRC-ENST00000614167.2-E2-E1|W2F-2|1
-HFEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
-MFKFMAR
->ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1
-GAVPARRR
->CIRC-ENST00000614167.2-E2-E1|5
-AGASVPLRRHWEPDGK
 >ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5
 NRIIGGDTEAWDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1
-LYLCGVTGSPTENCAK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
-ERERLYLCGVTGSPTENCAK
->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
-VFNARNRIIGGDTEAWDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|19
-AVALPRDLRQDANLAK
->ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1
-TKCDQPCATGFYGEGCSHRCPPCR
->CIRC-ENST00000614167.2-E2-E1|7
-AGASVPLR
 >ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2
 IIGGDTEAWDVQVHGQKIK
->ENST00000622235.5|RES-202-G-A|W2F-19|2 ENST00000622235.5|RES-202-G-A|W2F-19|2
-GQQPCTVAEGRCLTCEPGFNR
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
-MNNLVVDISERAGASVPLRR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
-LPPCDEFVGAR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5
-HETLWLLTFPRER
->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
-EGAGPRGAVPARR
->ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4
-NRIIGGDTEAWDVQVHGQKIK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
-ARHETLWLLTFPR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9
+RLPPCDEFVGARR
+>ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6
+GQQPCTVAEGRCLTCEPGWNR
+>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
+EATEKARHETLCC
 >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3
 AAGAVFLALSAQLLQARLMKEEAAA
->CIRC-ENST00000614167.2-E2-E1|10
-MKPLVVDISERAGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
-FKFMARR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
-MFKFMARR
->CIRC-ENST00000642151.1-E1-E2|2
-RSTSGPSK
->CIRC-ENST00000614167.2-E2-E1|15
-EATEKARHETFGC
->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
-MEGAGPRGAVPARR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|1
-ARHETLFLLTFPRER
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
-SKHTVVAYK
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
-MNNLVVDISERAGASVPLR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|W2F-1|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|W2F-1|1
-FRRLPPCDEFVGAR
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|1
-NTLSGMQKFMGED
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
-WRRLPPCDEFVGAR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12
-LESTTRKVHR
->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
-IIGGDTEAWDVQVHGQKIKEATEK
->ENST00000614167.2|RES-101-A-G|W2F-9|1 ENST00000614167.2|RES-101-A-G|W2F-9|1
-IIGGDTEAFDVQVHGQKIKEATEK
->ENST00000614167.2|RES-101-A-G|W2F-9|2 ENST00000614167.2|RES-101-A-G|W2F-9|2
-IIGGDTEAFDVQVHGQKIK
->ENST00000614167.2|RES-101-A-G|W2F-11|2 ENST00000614167.2|RES-101-A-G|W2F-11|2
-NRIIGGDTEAFDVQVHGQK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
-SKHTVVAYKDAI
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5
-NNLVVDISER
->ENST00000622235.5|RES-202-G-A|W2F-8|1 ENST00000622235.5|RES-202-G-A|W2F-8|1
-CLTCEPGFNRTK
->CIRC-ENST00000614167.2-E2-E1|6
-AGASVPLRR
 >ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2
 GAVPARR
->CIRC-ENST00000614167.2-E2-E1|14
-MKPLVVDISER
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7
-ARHETLWLLTFPRER
->CIRC-ENST00000614167.2-E2-E1|13
-KPLVVDISER
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10
-RLPPCDEFVGAR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|2
-ARHETLFLLTFPR
->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
-EATEKARHETLCC
->CIRC-ENST00000642151.1-E1-E2|4
-MPFRRSTSGPSK
->ENST00000614167.2|SNV-1179-G-T|2
-RIQGARATLUF
->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
-EATEKARHE
->CIRC-ENST00000642151.1-E1-E2|1
-STSGPSK
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
-LMKEEAAA
->CIRC-ENST00000614167.2-E2-E1|9
-KPLVVDISERAGASVPLRR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6
-LPPCDEFVGARRSK
->CIRC-ENST00000614167.2-E2-E1|W2F-2|2
-HFEPDGK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2
-HTVVAYK
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
-MNNLVVDISER
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|2
-QSDNDVRNTLSGMQKFMGED
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1
-NTLSGMQKFMGEDUNFHERK
->CIRC-ENST00000614167.2-E2-E1|21
-AVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|11
-KPLVVDISERAGASVPLR
+>ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4
+NRIIGGDTEAWDVQVHGQKIK
+>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
+IIGGDTEAWDVQVHGQKIKEATEK
+>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
+EGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
+MFKFMARR
diff --git a/test/files/vep/vep_gINDEL.gvf b/test/files/vep/vep_gINDEL.gvf
index d1269c9f..0a64ea98 100644
--- a/test/files/vep/vep_gINDEL.gvf
+++ b/test/files/vep/vep_gINDEL.gvf
@@ -1,5 +1,5 @@
 ##fileformat=VCFv4.2
-##mopepgen_version=0.12.0
+##mopepgen_version=1.0.1
 ##parser=parseVEP
 ##reference_index=
 ##genome_fasta=/home/chenghaozhu/projects/private-moPepGen/test/files/genome.fasta
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 5bdc075d..dc527eec 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -44,7 +44,8 @@ def test_generate_index_case1(self):
         args.output_dir.mkdir(parents=False, exist_ok=True)
         cli.generate_index(args)
         files = {str(file.name) for file in args.output_dir.glob('*')}
-        expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat',
+        expected = {'genome.pkl', 'proteome.pkl',
+            'annotation.gtf.lz', 'annotation_gene.idx', 'annotation_tx.idx',
             'canonical_peptides.pkl', 'coding_transcripts.pkl'}
         self.assertEqual(files, expected)
 

From 9c75b60c57cea1730a7f67e64578ef887193ae11 Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Tue, 27 Jun 2023 18:18:24 -0700
Subject: [PATCH 07/18] fix (moPepGen): docstrings added

---
 moPepGen/cli/generate_index.py          |  6 +++---
 moPepGen/gtf/GTFIndexMetadata.py        | 10 +++++-----
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 17 +++++++++--------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index 392d0620..47d2190c 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -50,7 +50,7 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
 
 def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
         invalid_protein_as_noncoding:bool=True):
-    """"""
+    """ Index a GTF file. """
     anno = gtf.GenomicAnnotationOnDisk()
 
     if file.suffix.lower() == '.gtf':
@@ -81,7 +81,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
             handle.write(tx_pointer.to_line() + '\n')
 
 def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path:
-    """ """
+    """ Create copy of GTF """
     if symlink:
         if file.suffix.lower() == '.gz':
             output_file = output_dir/'annotation.gtf.gz'
@@ -112,7 +112,7 @@ def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:s
     return output_file
 
 def generate_index(args:argparse.Namespace):
-    """ Generate  """
+    """ Generate index  """
     path_genome:Path = args.genome_fasta
     path_gtf:Path = args.annotation_gtf
     parth_proteome:Path = args.proteome_fasta
diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py
index f235a099..1f7b8c7d 100644
--- a/moPepGen/gtf/GTFIndexMetadata.py
+++ b/moPepGen/gtf/GTFIndexMetadata.py
@@ -1,18 +1,18 @@
-""" """
+""" GTFIndexMetadata """
 from typing import IO
 from moPepGen.version import MetaVersion
 
 MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##'
 
 class GTFIndexMetadata():
-    """ """
+    """ Class for GTF index metadata. """
     def __init__(self, source:str=None, version:MetaVersion=None):
-        """ """
+        """ constructor """
         self.source = source
         self.version = version or MetaVersion()
 
     def write(self, handle:IO):
-        """ """
+        """ Write metadata to handle. """
         prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
         handle.write(f"{prefix}source={self.source}\n")
         handle.write(f"{prefix}python={self.version.python}\n")
@@ -21,7 +21,7 @@ def write(self, handle:IO):
 
     @classmethod
     def parse(cls, handle:IO):
-        """ """
+        """ Read metadata from handle """
         pos = handle.tell()
         it = handle.readline()
         metadata = {}
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 55d48cf7..0ddfa834 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -16,11 +16,11 @@
 
 
 class GenomicAnnotationOnDisk(GenomicAnnotation):
-    """ """
+    """ An on disk version of GenomicAnnotation that uses minimal memory. """
     def __init__(self, genes:GenePointerDict=None,
             transcripts:TranscriptPointerDict=None,
             source:str=None):
-        """ """
+        """ Constructur """
         self.genes = genes or GenePointerDict()
         self.transcripts = transcripts or TranscriptPointerDict()
         self.source = source
@@ -30,15 +30,16 @@ def __init__(self, genes:GenePointerDict=None,
         self.handle = None
 
     def __del__(self):
-        """ """
+        """ Hook for deconstruction """
         if self.handle:
             self.handle.close()
 
     def dump_gtf(self, *args, **kwargs) -> None:
+        """ Forbidden """
         raise NotImplementedError
 
     def infer_source(self):
-        """ """
+        """ Infer GTF source (GENCODE vs ENSEMBL) from data. """
         i = 0
         inferred = {}
         for tx in self.transcripts.keys():
@@ -53,7 +54,7 @@ def infer_source(self):
         self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0]
 
     def init_handle(self, handle:Union[str, IO, Path]):
-        """ """
+        """ Initiate file handle """
         if isinstance(handle, str):
             handle = Path(handle)
 
@@ -73,7 +74,7 @@ def init_handle(self, handle:Union[str, IO, Path]):
 
     @staticmethod
     def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
-        """ """
+        """ Get index file paths given the GTF file path. """
         if isinstance(file, str):
             file = Path(file)
         basename = str(file.name).split('.gtf')[0]
@@ -83,7 +84,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
         return gene_idx_file, tx_idx_file
 
     def generate_index(self, handle:Union[IO, str, Path], source:str=None):
-        """ """
+        """ Generate GTF index """
         self.init_handle(handle)
 
         for pointer in iterate_pointer(self.handle, source):
@@ -98,7 +99,7 @@ def generate_index(self, handle:Union[IO, str, Path], source:str=None):
             self.infer_source()
 
     def load_index(self, file:Union[str,Path]):
-        """ """
+        """ Load index from idx files """
         self.init_handle(file)
         gene_idx_file, tx_idx_file = self.get_index_files(file)
 

From 79771ac44020ed9a9bda52081cae3d2332e323ff Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Tue, 27 Jun 2023 22:20:09 -0700
Subject: [PATCH 08/18] fix (moPepGen): update version to 1.1.0

---
 moPepGen/__init__.py | 2 +-
 moPepGen/version.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
index 7952e5dd..50556536 100644
--- a/moPepGen/__init__.py
+++ b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '1.0.1'
+__version__ = '1.1.0'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
diff --git a/moPepGen/version.py b/moPepGen/version.py
index 8c4d450c..71d16bdd 100644
--- a/moPepGen/version.py
+++ b/moPepGen/version.py
@@ -6,7 +6,7 @@
 from moPepGen import __version__
 
 
-MINIMAL_VERSION = '1.0.1'
+MINIMAL_VERSION = '1.1.0'
 
 class MetaVersion():
     """ Versions """

From ff4ba9f7e4b2715e71e3372b35f6b38172bc93de Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Wed, 28 Jun 2023 09:57:33 -0700
Subject: [PATCH 09/18] fix (generateIndex): trying to set compression to none

---
 moPepGen/cli/generate_index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index 47d2190c..3e4acd08 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -127,7 +127,7 @@ def generate_index(args:argparse.Namespace):
     quiet:bool = args.quiet
 
     symlink:bool = False
-    compression:bool = 'lzma'
+    compression:bool = None
 
     output_dir:Path = args.output_dir
     output_genome = output_dir/"genome.pkl"

From b1ac30cd5af6f3a0ac3118813e9994a8d718a310 Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Wed, 28 Jun 2023 18:14:03 -0700
Subject: [PATCH 10/18] fix (generateIndex): remove code to compress the copied
 gtf file

---
 moPepGen/cli/generate_index.py          |  61 ++---
 test/files/peptides/variant.fasta       | 284 ++++++++++++------------
 test/integration/test_generate_index.py |   3 +-
 3 files changed, 167 insertions(+), 181 deletions(-)

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index 3e4acd08..77dc32d1 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -9,10 +9,10 @@
 from typing import TYPE_CHECKING
 import argparse
 import os
+import shutil
+import gzip
 from pathlib import Path
 import pickle
-import lzma
-import gzip
 from moPepGen import dna, aa, gtf, logger
 from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
@@ -25,7 +25,7 @@
 # pylint: disable=W0212
 def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     """ CLI for moPepGen generateIndex """
-    p = subparsers.add_parser(
+    p:argparse.ArgumentParser = subparsers.add_parser(
         name='generateIndex',
         help='Generate genome and proteome index files for moPepGen',
         description='Generate genome and proteome index files for moPepGen'
@@ -40,6 +40,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
         dest='output_dir',
         required=True
     )
+    p.add_argument(
+        '--gtf-symlink',
+        help='Create a symlink of the GTF file instead of copying it.',
+        default=False
+    )
     common.add_args_reference(p, index=False)
     common.add_args_cleavage(p)
     common.add_args_quiet(p)
@@ -53,13 +58,7 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
     """ Index a GTF file. """
     anno = gtf.GenomicAnnotationOnDisk()
 
-    if file.suffix.lower() == '.gtf':
-        opener = open
-    elif file.suffix.lower() == '.gz':
-        opener = gzip.open
-    elif file.suffix.lower() == '.lz':
-        opener = lzma.open
-    with opener(file, 'rb') as handle:
+    with open(file, 'rb') as handle:
         anno.generate_index(handle, source)
 
     if proteome:
@@ -80,33 +79,22 @@ def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
             tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
             handle.write(tx_pointer.to_line() + '\n')
 
-def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True, compression:str=None) -> Path:
+def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
     """ Create copy of GTF """
+    if file.suffix.lower() == '.gz':
+        if symlink:
+            symlink = False
+    elif file.suffix.lower() != '.gtf':
+        raise ValueError(f"Cannot handle gtf file {file}")
+
+    output_file = output_dir/'annotation.gtf'
+
     if symlink:
-        if file.suffix.lower() == '.gz':
-            output_file = output_dir/'annotation.gtf.gz'
-        elif file.suffix.lower() == '.gtf':
-            output_file = output_dir/'annotation.gtf'
-        else:
-            raise ValueError(f"File not supported {file}")
         os.symlink(file, output_file)
+    elif file.suffix.lower() == '.gtf':
+        shutil.copy2(file, output_file)
     else:
-        if file.suffix.lower() == '.gtf':
-            i_opener = open
-        elif file.suffix.lower() == '.gz':
-            i_opener = gzip.open
-
-        if compression == None:
-            o_opener = open
-            output_file = output_dir/'annotation.gtf'
-        elif compression == 'lzma':
-            o_opener = lzma.open
-            output_file = output_dir/'annotation.gtf.lz'
-        elif compression == 'gzip':
-            o_opener = gzip.open
-            output_file = output_dir/'annotation.gtf.gz'
-
-        with i_opener(file, 'rt') as ihandle, o_opener(output_file, 'wt') as ohandle:
+        with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle:
             for line in ihandle:
                 ohandle.write(line)
     return output_file
@@ -125,14 +113,11 @@ def generate_index(args:argparse.Namespace):
     exception = 'trypsin_exception' if rule == 'trypsin' else None
     invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
     quiet:bool = args.quiet
-
-    symlink:bool = False
-    compression:bool = None
+    symlink:bool = args.gtf_symlink
 
     output_dir:Path = args.output_dir
     output_genome = output_dir/"genome.pkl"
     output_proteome = output_dir/"proteome.pkl"
-    output_anno = output_dir/"annotation.dat"
     output_peptides = output_dir/"canonical_peptides.pkl"
     output_coding_tx = output_dir/"coding_transcripts.pkl"
 
@@ -155,7 +140,7 @@ def generate_index(args:argparse.Namespace):
     if not quiet:
         logger('Proteome FASTA loaded.')
 
-    output_gtf = create_gtf_copy(path_gtf, output_dir, symlink, compression)
+    output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
     index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
     if not quiet:
         logger('Genome annotation GTF saved to disk.')
diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta
index 2c8adba0..bee351cd 100644
--- a/test/files/peptides/variant.fasta
+++ b/test/files/peptides/variant.fasta
@@ -1,170 +1,170 @@
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
-MFKFMAR
->CIRC-ENST00000614167.2-E2-E1|3
-RHWEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|7
-AGASVPLR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2
-ERLYLCGVTGSPTENCAK
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2
 HTVVAYK
->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
-VFNARNRIIGGDTEAWDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|4
-RHWEPDGK
->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
-MEGAGPRGAVPAR
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
-MNNLVVDISER
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
-FKFMARR
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
-NNLVVDISERAGASVPLRR
->CIRC-ENST00000642151.1-E1-E2|1
-STSGPSK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
-HETLWLLTFPR
->CIRC-ENST00000614167.2-E2-E1|22
-MAVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|14
-MKPLVVDISER
->CIRC-ENST00000614167.2-E2-E1|11
-KPLVVDISERAGASVPLR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12
-LESTTRKVHR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1
-LYLCGVTGSPTENCAK
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1
-NTLSGMQKFMGEDUNFHERK
->CIRC-ENST00000614167.2-E2-E1|16
-FKFMTRR
->ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1
-GAVPARRR
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10
 RLPPCDEFVGAR
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
-LMKEEAAA
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
-LPPCDEFVGAR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5
-RSKHTVVAYK
->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
-TKCDQPCATGFYGEGCSHR
->CIRC-ENST00000614167.2-E2-E1|17
-MFKFMTRR
->ENST00000614167.2|SNV-1179-G-T|2
-RIQGARATLUF
+>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
+IIGGDTEAWDVQVHGQKIKEATEK
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1
 HTVVAYKDAI
->CIRC-ENST00000614167.2-E2-E1|20
-MAVALPRDLRQDANLAK
 >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5
 HETLWLLTFPRER
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
-SKHTVVAYKDAI
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
-ARHETLWLLTFPR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
-SKHTVVAYK
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5
-NNLVVDISER
->CIRC-ENST00000614167.2-E2-E1|6
-AGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|1
-HWEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|9
-KPLVVDISERAGASVPLRR
->CIRC-ENST00000642151.1-E1-E2|4
-MPFRRSTSGPSK
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
-NTLSGMQKFMGEDUNFHER
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
+FKFMARR
+>CIRC-ENST00000614167.2-E2-E1|11
+KPLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3
+NNLVVDISERAGASVPLR
+>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
+ARHETLCC
+>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
+CLTCEPGWNR
+>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
+MEGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2
+ERLYLCGVTGSPTENCAK
+>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
+MEGAGPRGAVPARR
+>CIRC-ENST00000614167.2-E2-E1|7
+AGASVPLR
+>ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2
+IIGGDTEAWDVQVHGQKIK
+>ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1
+GAVPARRR
+>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
+GQQPCTVAEGRCLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|4
+RHWEPDGK
 >CIRC-ENST00000614167.2-E2-E1|5
 AGASVPLRRHWEPDGK
->ENST00000614167.2|SNV-1179-G-T|1
-IQGARATLUF
->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
-EGAGPRGAVPARR
->CIRC-ENST00000614167.2-E2-E1|23
-MAVALPR
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
-MNNLVVDISERAGASVPLR
->CIRC-ENST00000614167.2-E2-E1|13
-KPLVVDISER
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
-MNNLVVDISERAGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|19
-AVALPRDLRQDANLAK
->CIRC-ENST00000642151.1-E1-E2|2
-RSTSGPSK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
-HETLWLLTFPRERER
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9
-EATEKARHETLWLLTFPR
->CIRC-ENST00000614167.2-E2-E1|12
-MKPLVVDISERAGASVPLR
+>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
+NTLSGMQKFMGEDUNFHER
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1
+LYLCGVTGSPTENCAK
+>ENST00000614167.2|SNV-1179-G-T|2
+RIQGARATLUF
+>CIRC-ENST00000614167.2-E2-E1|14
+MKPLVVDISER
+>ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2
+GAVPARR
 >ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1
 TKCDQPCATGFYGEGCSHRCPPCR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
-LPPCDEFVGARR
->CIRC-ENST00000614167.2-E2-E1|8
-ARHETFGC
->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
-GQQPCTVAEGRCLTCEPGWNRTK
->CIRC-ENST00000614167.2-E2-E1|15
-EATEKARHETFGC
->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
-CLTCEPGWNR
->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
-CLTCEPGWNRTK
+>ENST00000614167.2|SNV-1179-G-T|1
+IQGARATLUF
+>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
+EATEKARHE
+>CIRC-ENST00000614167.2-E2-E1|1
+HWEPDGKLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
+ERERLYLCGVTGSPTENCAK
 >ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3
 IIGGDTEAWDVQVHGQK
->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
-ARHETLCC
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3
-NNLVVDISERAGASVPLR
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3
+AAGAVFLALSAQLLQARLMKEEAAA
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9
+EATEKARHETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
+MNNLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|22
+MAVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
+MNNLVVDISERAGASVPLRR
 >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7
 ARHETLWLLTFPRER
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6
 LPPCDEFVGARRSK
->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
-EATEKARHE
->CIRC-ENST00000614167.2-E2-E1|21
-AVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
-ERERLYLCGVTGSPTENCAK
->CIRC-ENST00000642151.1-E1-E2|3
-PFRRSTSGPSK
->CIRC-ENST00000614167.2-E2-E1|18
-MFKFMTR
->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
-MEGAGPRGAVPARR
+>CIRC-ENST00000642151.1-E1-E2|2
+RSTSGPSK
 >CIRC-ENST00000614167.2-E2-E1|10
 MKPLVVDISERAGASVPLRR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
-WRRLPPCDEFVGAR
 >CIRC-ENST00000614167.2-E2-E1|2
 HWEPDGK
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1
-MKEEAAA
->ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5
-NRIIGGDTEAWDVQVHGQK
->ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2
-IIGGDTEAWDVQVHGQKIK
+>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
+VFNARNRIIGGDTEAWDVQVHGQK
+>CIRC-ENST00000614167.2-E2-E1|19
+AVALPRDLRQDANLAK
+>CIRC-ENST00000614167.2-E2-E1|15
+EATEKARHETFGC
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9
 RLPPCDEFVGARR
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
+MFKFMARR
+>ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5
+NRIIGGDTEAWDVQVHGQK
+>CIRC-ENST00000614167.2-E2-E1|9
+KPLVVDISERAGASVPLRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12
+LESTTRKVHR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
+MNNLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
+HETLWLLTFPRERER
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5
+NNLVVDISER
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
+SKHTVVAYKDAI
+>CIRC-ENST00000614167.2-E2-E1|20
+MAVALPRDLRQDANLAK
+>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
+CLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
+HETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|6
+AGASVPLRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5
+RSKHTVVAYK
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
+LMKEEAAA
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
+LPPCDEFVGARR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
+LPPCDEFVGAR
+>CIRC-ENST00000614167.2-E2-E1|18
+MFKFMTR
+>CIRC-ENST00000614167.2-E2-E1|21
+AVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|16
+FKFMTRR
+>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
+EGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|12
+MKPLVVDISERAGASVPLR
 >ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6
 GQQPCTVAEGRCLTCEPGWNR
->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
-EATEKARHETLCC
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3
-AAGAVFLALSAQLLQARLMKEEAAA
->ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2
-GAVPARR
+>CIRC-ENST00000642151.1-E1-E2|3
+PFRRSTSGPSK
+>CIRC-ENST00000614167.2-E2-E1|13
+KPLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|23
+MAVALPR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
+SKHTVVAYK
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
+MFKFMAR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
+NNLVVDISERAGASVPLRR
 >ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4
 NRIIGGDTEAWDVQVHGQKIK
->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
-IIGGDTEAWDVQVHGQKIKEATEK
->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
-EGAGPRGAVPAR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
-MFKFMARR
+>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
+EGAGPRGAVPARR
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1
+MKEEAAA
+>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
+TKCDQPCATGFYGEGCSHR
+>CIRC-ENST00000614167.2-E2-E1|8
+ARHETFGC
+>CIRC-ENST00000642151.1-E1-E2|4
+MPFRRSTSGPSK
+>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
+EATEKARHETLCC
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
+ARHETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|3
+RHWEPDGKLR
+>CIRC-ENST00000642151.1-E1-E2|1
+STSGPSK
+>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1
+NTLSGMQKFMGEDUNFHERK
+>CIRC-ENST00000614167.2-E2-E1|17
+MFKFMTRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
+WRRLPPCDEFVGAR
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index dc527eec..3523e39f 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -32,6 +32,7 @@ def test_generate_index_case1(self):
         args.genome_fasta = self.data_dir / 'genome.fasta'
         args.annotation_gtf = self.data_dir / 'annotation.gtf'
         args.proteome_fasta = self.data_dir / 'translate.fasta'
+        args.gtf_symlink = False
         args.reference_source = None
         args.invalid_protein_as_noncoding = False
         args.cleavage_rule = 'trypsin'
@@ -45,7 +46,7 @@ def test_generate_index_case1(self):
         cli.generate_index(args)
         files = {str(file.name) for file in args.output_dir.glob('*')}
         expected = {'genome.pkl', 'proteome.pkl',
-            'annotation.gtf.lz', 'annotation_gene.idx', 'annotation_tx.idx',
+            'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx',
             'canonical_peptides.pkl', 'coding_transcripts.pkl'}
         self.assertEqual(files, expected)
 

From 19c682e01d7235649e98d02506c581c72d28a7c9 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 28 Jun 2023 22:26:10 -0700
Subject: [PATCH 11/18] style (moPepGen): fix pylint, added docstrings

---
 moPepGen/cli/common.py                  |  1 -
 moPepGen/cli/generate_index.py          |  1 -
 moPepGen/gtf/GTFPointer.py              | 80 ++++++++++++++++---------
 moPepGen/gtf/GenomicAnnotationOnDisk.py |  8 ++-
 moPepGen/gtf/GtfIO.py                   |  2 +-
 5 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py
index 34bbf18b..afdbb51d 100644
--- a/moPepGen/cli/common.py
+++ b/moPepGen/cli/common.py
@@ -1,7 +1,6 @@
 """ Common functions for cli """
 from __future__ import annotations
 import argparse
-import lzma
 import os
 import sys
 from typing import Tuple, Set, List
diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index 77dc32d1..4d2f5026 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -14,7 +14,6 @@
 from pathlib import Path
 import pickle
 from moPepGen import dna, aa, gtf, logger
-from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
 from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
 
diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py
index bcc7b04e..29ab895a 100644
--- a/moPepGen/gtf/GTFPointer.py
+++ b/moPepGen/gtf/GTFPointer.py
@@ -1,23 +1,21 @@
-""" """
+""" GTFPointer and GTFPointerDict """
 from __future__ import annotations
-from typing import Dict, Set, Iterable, Mapping, Union, IO, TYPE_CHECKING
+from typing import Dict, Set, Iterable, Mapping, Union, IO
 from collections import deque
 from moPepGen.gtf import GtfIO
 from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel
-from moPepGen.gtf.TranscriptAnnotationModel import TranscriptAnnotationModel
 from moPepGen.gtf.TranscriptAnnotationModel import (
     TranscriptAnnotationModel,
     GTF_FEATURE_TYPES
 )
 
-if TYPE_CHECKING:
-    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
 
 GENE_DICT_CACHE_SIZE = 10
 TX_DICT_CACHE_SIZE = 10
 
 class GTFPointer():
-    """ """
+    """ GTFPointer. Represents the range of the GTF file of a particular
+    genome annotation entity (gene or transcript). """
     def __init__(self, handle:IO, key:str, start:int, end:int, source:str):
         """ """
         self.handle = handle
@@ -31,14 +29,15 @@ def __len__(self) -> int:
         return self.end - self.start
 
 class GenePointer(GTFPointer):
-    """ """
+    """ Pointer of a GTF file for a gene. """
     def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
             transcripts:Set[str]=None):
+        """ Constructor """
         super().__init__(handle, key, start, end, source)
         self.transcripts = transcripts or set()
 
     def load(self) -> GeneAnnotationModel:
-        """ """
+        """ Load gene annotation model from handle. """
         cur = self.handle.tell()
         offset = self.start - cur
         self.handle.seek(offset, 1)
@@ -53,7 +52,7 @@ def load(self) -> GeneAnnotationModel:
         return record
 
     def to_line(self) -> str:
-        """ """
+        """ Convert to line """
         fields = [
             self.key,
             str(self.start),
@@ -63,14 +62,15 @@ def to_line(self) -> str:
         return '\t'.join(fields)
 
 class TranscriptPointer(GTFPointer):
-    """ """
+    """ Pointer of a GTF file for a trnascript. """
     def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
             is_protein_coding:bool=None):
+        """ TranscriptPointer """
         super().__init__(handle, key, start, end, source)
         self.is_protein_coding = is_protein_coding
 
     def load(self) -> TranscriptAnnotationModel:
-        """ """
+        """ Load pointer and return a transcript annotation model. """
         cur = self.handle.tell()
         offset = self.start - cur
         self.handle.seek(offset, 1)
@@ -92,7 +92,7 @@ def load(self) -> TranscriptAnnotationModel:
         return tx_model
 
     def to_line(self) -> str:
-        """ """
+        """ Convert to string """
         fields = [
             self.key,
             str(self.start),
@@ -102,7 +102,7 @@ def to_line(self) -> str:
         return "\t".join(fields)
 
 def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
-    """ """
+    """ Iterate over a GTF file and yield pointers. """
     if not source:
         count = 0
         inferred = {}
@@ -145,7 +145,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T
             if cur_tx_pointer:
                 yield cur_tx_pointer
             cur_gene_id = record.gene_id
-            cur_gene_pointer = GenePointer(handle, cur_gene_id, line_start, line_end, record.source)
+            cur_gene_pointer = GenePointer(
+                handle, cur_gene_id,
+                line_start, line_end, record.source
+            )
             cur_tx_id = None
             cur_tx_pointer = None
         else:
@@ -153,7 +156,10 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T
                 if cur_tx_pointer:
                     yield cur_tx_pointer
                 cur_tx_id = record.transcript_id
-                cur_tx_pointer = TranscriptPointer(handle, cur_tx_id, line_start, line_end, record.source)
+                cur_tx_pointer = TranscriptPointer(
+                    handle, cur_tx_id,
+                    line_start, line_end, record.source
+                )
             else:
                 cur_tx_pointer.end = line_end
             if cur_gene_pointer:
@@ -164,9 +170,13 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T
         yield cur_tx_pointer
 
 class GTFPointerDict(dict, Mapping[str, GTFPointer]):
-    """ """
+    """ GTFPointerDict. This defines a special dict class that keys are str
+    (gene or transcript ID) and values are corresponding GTFPointer. The getter
+    magic function is customized so it returns a loaded GeneAnnotationModel
+    or TranscriptAnnotationModel instead of the pointer.
+    """
     def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
-        """ """
+        """ Constructor """
         for val in kwargs.values():
             self.validate(val, GTFPointer)
         if args:
@@ -177,30 +187,36 @@ def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
         super().__init__(*args, **kwargs)
 
     def __iter__(self) -> Iterable[str]:
-        """"""
+        """ Iterator """
         for k in self.keys():
             yield k
 
-    def validate(self, val, type):
-        """ """
-        if not type is GTFPointer or not issubclass(type, GTFPointer):
-            raise TypeError(f"type = {type} is not supported")
-        if not isinstance(val, type):
+    def validate(self, val, __type):
+        """ Ensures the values are GTFPointers """
+        if not __type is GTFPointer or not issubclass(__type, GTFPointer):
+            raise TypeError(f"type = {__type} is not supported")
+        if not isinstance(val, __type):
             raise TypeError(
-                f"{self.__class__} only accecpts {type.__class__} objects"
+                f"{self.__class__} only accecpts {__type.__class__} objects"
             )
 
     def __setitem__(self, __key: str, __value: GTFPointer) -> None:
+        """ Setter """
         self.validate(__value, GTFPointer)
         return super().__setitem__(__key, __value)
 
     def get_pointer(self, __key: str) -> GTFPointer:
-        """ """
+        """ Get pointer by key without loading the annotation entity. """
         return super().__getitem__(__key)
 
 class GenePointerDict(GTFPointerDict):
-    """ """
+    """ GenePointerDict. This defines a dict class that keys are gene IDs and
+    values are the corresponding gene models. The getter magic function is
+    customized so it returns a loaded GeneAnnotationModel object instead of the
+    pointer.
+    """
     def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]):
+        """ Constructor """
         for val in kwargs.values():
             self.validate(val, GenePointer)
         if args:
@@ -213,6 +229,7 @@ def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointe
         self._cached_keys = deque()
 
     def __getitem__(self, __key: str) -> GeneAnnotationModel:
+        """ Getter """
         if __key in self._cache:
             return self._cache[__key]
         self._cached_keys.appendleft(__key)
@@ -225,8 +242,13 @@ def __getitem__(self, __key: str) -> GeneAnnotationModel:
         return val
 
 class TranscriptPointerDict(GTFPointerDict):
-    """ """
+    """ TranscriptPointerDict. This defines a dict class that keys are transcript
+    IDs and values are the corresponding transcript models. The getter magic
+    function is customized so it returns a loaded TranscriptAnnotationModel
+    object instead of the pointer.
+    """
     def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]):
+        """ Constructor """
         for val in kwargs.values():
             self.validate(val, TranscriptPointer)
         if args:
@@ -238,8 +260,8 @@ def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, Tran
         self._cache = {}
         self._cached_keys = deque()
 
-    def __getitem__(self, __key: str
-            ) -> TranscriptAnnotationModel:
+    def __getitem__(self, __key: str) -> TranscriptAnnotationModel:
+        """ getter """
         if __key in self._cache:
             return self._cache[__key]
         self._cached_keys.appendleft(__key)
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 0ddfa834..d2d05c3b 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -20,7 +20,8 @@ class GenomicAnnotationOnDisk(GenomicAnnotation):
     def __init__(self, genes:GenePointerDict=None,
             transcripts:TranscriptPointerDict=None,
             source:str=None):
-        """ Constructur """
+        """ Constructor """
+        # pylint: disable=W0231
         self.genes = genes or GenePointerDict()
         self.transcripts = transcripts or TranscriptPointerDict()
         self.source = source
@@ -55,6 +56,7 @@ def infer_source(self):
 
     def init_handle(self, handle:Union[str, IO, Path]):
         """ Initiate file handle """
+        # pylint: disable=R1732
         if isinstance(handle, str):
             handle = Path(handle)
 
@@ -77,7 +79,7 @@ def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
         """ Get index file paths given the GTF file path. """
         if isinstance(file, str):
             file = Path(file)
-        basename = str(file.name).split('.gtf')[0]
+        basename = str(file.name).split('.gtf', maxsplit=1)[0]
         gene_idx_file = file.parent/f"{basename}_gene.idx"
         tx_idx_file = file.parent/f"{basename}_tx.idx"
 
@@ -139,7 +141,7 @@ def load_index(self, file:Union[str,Path]):
             if not tx_source:
                 raise ValueError("Cannot find source from gene idx.")
             if tx_source != gene_source:
-                    raise ValueError('tx_source does not match with gene_source')
+                raise ValueError('tx_source does not match with gene_source')
 
             for line in handle:
                 if line.startswith('#'):
diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py
index b59ea88c..41b4810f 100644
--- a/moPepGen/gtf/GtfIO.py
+++ b/moPepGen/gtf/GtfIO.py
@@ -30,7 +30,7 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]:
             yield record
 
 def line_to_seq_feature(line:str) -> GTFSeqFeature:
-    """ """
+    """ Conver line to SeqFeature """
     line = line.rstrip()
     fields = line.split('\t')
 

From 80de41ce666f64fd25089691b24ecc44c61c75a2 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Wed, 28 Jun 2023 22:40:29 -0700
Subject: [PATCH 12/18] doc (CHANGELOG): changelog added

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dc26db9..0c245140 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.1.0] - 2023-06-28
+
+### Fixed
+
+- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371
+
 ## [1.0.0] - 2023-06-15
 
 ### Added

From f1554dd65976c74db704c1d0d3b16841514d8a39 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Thu, 29 Jun 2023 09:08:14 -0700
Subject: [PATCH 13/18] style (test)): docstrings added

---
 test/integration/test_generate_index.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 3523e39f..f2c5ecd3 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -51,15 +51,15 @@ def test_generate_index_case1(self):
         self.assertEqual(files, expected)
 
 class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration):
-    """ """
+    """ Test case for GenomicAnnotationOnDisk """
     def test_generate_index(self):
-        """ """
+        """ Test generate index """
         proteome = aa.AminoAcidSeqDict()
         proteome.dump_fasta(self.data_dir/'translate.fasta')
         anno = GenomicAnnotationOnDisk()
         cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome)
 
     def test_load_index(self):
-        """ """
+        """ test load index """
         anno = GenomicAnnotationOnDisk()
         anno.load_index(self.data_dir/'annotation.gtf')

From 8ea2e51c25e5de58f1064d62558683df0d1cc8e9 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Thu, 29 Jun 2023 09:20:42 -0700
Subject: [PATCH 14/18] fix (GenomicAnnotationOnDisk): gtf file handle should
 be uncompressed GTF file only

---
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 9 +++------
 test/integration/test_generate_index.py | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index d2d05c3b..5c32ed2a 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -61,12 +61,9 @@ def init_handle(self, handle:Union[str, IO, Path]):
             handle = Path(handle)
 
         if isinstance(handle, Path):
-            if handle.suffix.lower() == '.gtf':
-                ihandle = open(handle, 'rb')
-            elif handle.suffix.lower() == '.lz':
-                ihandle = lzma.open(handle, 'rb')
-            elif handle.suffix.lower() == '.gz':
-                ihandle = gzip.open(handle, 'rb')
+            if handle.suffix.lower() != '.gtf':
+                raise ValueError(f"Unsupported file formate of {handle}")
+            ihandle = open(handle, 'rb')
         elif isinstance(handle, io.IOBase):
             ihandle = handle
         else:
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index f2c5ecd3..2604d52a 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -56,9 +56,9 @@ def test_generate_index(self):
         """ Test generate index """
         proteome = aa.AminoAcidSeqDict()
         proteome.dump_fasta(self.data_dir/'translate.fasta')
-        anno = GenomicAnnotationOnDisk()
         cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome)
 
+
     def test_load_index(self):
         """ test load index """
         anno = GenomicAnnotationOnDisk()

From 60822b4a7265210cc1366a0ff844f41cdf7bfa2c Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Thu, 29 Jun 2023 09:29:59 -0700
Subject: [PATCH 15/18] fix (test): asserts added to GenomicAnnotationOnDisk

---
 test/files/annotation_gene.idx          |  8 ++++++++
 test/files/annotation_tx.idx            |  9 +++++++++
 test/integration/test_generate_index.py | 23 +++++++++++++++++++----
 3 files changed, 36 insertions(+), 4 deletions(-)
 create mode 100644 test/files/annotation_gene.idx
 create mode 100644 test/files/annotation_tx.idx

diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx
new file mode 100644
index 00000000..f57b6263
--- /dev/null
+++ b/test/files/annotation_gene.idx
@@ -0,0 +1,8 @@
+##source=None
+##python=3.8.11
+##moPepGen=1.1.0
+##biopython=1.79
+ENSG00000128408.9	0	174	ENST00000614168.2,ENST00000614167.2
+ENSG00000244486.9	12491	12694	ENST00000622235.5
+ENSG00000099949.21	24586	24955	ENST00000642151.1
+ENSG00000279973.2	27280	27432	ENST00000624155.2
diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx
new file mode 100644
index 00000000..30cfbdde
--- /dev/null
+++ b/test/files/annotation_tx.idx
@@ -0,0 +1,9 @@
+##source=None
+##python=3.8.11
+##moPepGen=1.1.0
+##biopython=1.79
+ENST00000614167.2	174	7260	True
+ENST00000614168.2	7260	12491	True
+ENST00000622235.5	12694	24586	True
+ENST00000642151.1	24955	27280	True
+ENST00000624155.2	27432	28522	False
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 2604d52a..845f035e 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -1,10 +1,11 @@
 """ Test the command line interface """
 import argparse
 import subprocess as sp
+import os
 import sys
 from test.integration import TestCaseIntegration
 from moPepGen import cli, aa
-from moPepGen.gtf import GenomicAnnotationOnDisk
+from moPepGen.gtf import GenomicAnnotation, GenomicAnnotationOnDisk
 
 
 class TestGenerateIndex(TestCaseIntegration):
@@ -56,10 +57,24 @@ def test_generate_index(self):
         """ Test generate index """
         proteome = aa.AminoAcidSeqDict()
         proteome.dump_fasta(self.data_dir/'translate.fasta')
-        cli.index_gtf(self.data_dir/'annotation.gtf', proteome=proteome)
+        gtf_file = self.data_dir/'annotation.gtf'
+        gtf_file2 = self.work_dir/'annotation.gtf'
+        os.symlink(gtf_file, gtf_file2)
+        cli.index_gtf(gtf_file2, proteome=proteome)
+        expect = {
+            'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx'
+        }
+        received = {str(file.name) for file in self.work_dir.glob('*')}
+        self.assertEqual(received, expect)
 
 
     def test_load_index(self):
         """ test load index """
-        anno = GenomicAnnotationOnDisk()
-        anno.load_index(self.data_dir/'annotation.gtf')
+        anno1 = GenomicAnnotation()
+        anno1.dump_gtf(self.data_dir/'annotation.gtf')
+
+        anno2 = GenomicAnnotationOnDisk()
+        anno2.load_index(self.data_dir/'annotation.gtf')
+
+        self.assertEqual(anno1.genes.keys(), anno2.genes.keys())
+        self.assertEqual(anno1.transcripts.keys(), anno2.transcripts.keys())

From 9a92ecfd458b960ae6bda02d7d77fec8694c8881 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Thu, 29 Jun 2023 09:49:31 -0700
Subject: [PATCH 16/18] fix (test): versions of gtf idx files dont match with
 the github action

---
 test/files/annotation_gene.idx | 4 ++--
 test/files/annotation_tx.idx   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx
index f57b6263..76011e1e 100644
--- a/test/files/annotation_gene.idx
+++ b/test/files/annotation_gene.idx
@@ -1,7 +1,7 @@
 ##source=None
-##python=3.8.11
+##python=3.8.17
 ##moPepGen=1.1.0
-##biopython=1.79
+##biopython=1.81
 ENSG00000128408.9	0	174	ENST00000614168.2,ENST00000614167.2
 ENSG00000244486.9	12491	12694	ENST00000622235.5
 ENSG00000099949.21	24586	24955	ENST00000642151.1
diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx
index 30cfbdde..81ead4f8 100644
--- a/test/files/annotation_tx.idx
+++ b/test/files/annotation_tx.idx
@@ -1,7 +1,7 @@
 ##source=None
-##python=3.8.11
+##python=3.8.17
 ##moPepGen=1.1.0
-##biopython=1.79
+##biopython=1.81
 ENST00000614167.2	174	7260	True
 ENST00000614168.2	7260	12491	True
 ENST00000622235.5	12694	24586	True

From 8685d086b764c32275487492bbfc3ddd3643a249 Mon Sep 17 00:00:00 2001
From: Chenghao Zhu <zhuchcn@gmail.com>
Date: Thu, 29 Jun 2023 09:51:34 -0700
Subject: [PATCH 17/18] fix (GenomicAnnotationOnDisk): remove unused import

---
 moPepGen/gtf/GenomicAnnotationOnDisk.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
index 5c32ed2a..f667891b 100644
--- a/moPepGen/gtf/GenomicAnnotationOnDisk.py
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -2,8 +2,6 @@
 from __future__ import annotations
 from typing import Dict, IO, Union, Tuple
 import io
-import gzip
-import lzma
 from pathlib import Path
 from moPepGen import err
 from moPepGen.version import MetaVersion

From 2945a191c3ec7b3dd9ad5c963334974bec0a25e2 Mon Sep 17 00:00:00 2001
From: zhuchcn <chenghaozhu@mednet.ucla.edu>
Date: Thu, 29 Jun 2023 13:05:36 -0700
Subject: [PATCH 18/18] fix (gtf): created GTFSourceInferrer class

---
 moPepGen/cli/generate_index.py    |  3 +++
 moPepGen/gtf/GTFPointer.py        | 17 +++--------------
 moPepGen/gtf/GTFSourceInferrer.py | 31 +++++++++++++++++++++++++++++++
 moPepGen/gtf/GenomicAnnotation.py | 26 ++++++++------------------
 4 files changed, 45 insertions(+), 32 deletions(-)
 create mode 100644 moPepGen/gtf/GTFSourceInferrer.py

diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index 4d2f5026..6f8ee0a6 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -83,6 +83,9 @@ def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
     if file.suffix.lower() == '.gz':
         if symlink:
             symlink = False
+            logger(
+                "--gtf-symlink was suppressed because compressed GTF file was received. "
+            )
     elif file.suffix.lower() != '.gtf':
         raise ValueError(f"Cannot handle gtf file {file}")
 
diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py
index 29ab895a..502a102a 100644
--- a/moPepGen/gtf/GTFPointer.py
+++ b/moPepGen/gtf/GTFPointer.py
@@ -8,6 +8,7 @@
     TranscriptAnnotationModel,
     GTF_FEATURE_TYPES
 )
+from moPepGen.gtf.GTFSourceInferrer import GTFSourceInferrer
 
 
 GENE_DICT_CACHE_SIZE = 10
@@ -104,8 +105,7 @@ def to_line(self) -> str:
 def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
     """ Iterate over a GTF file and yield pointers. """
     if not source:
-        count = 0
-        inferred = {}
+        inferrer = GTFSourceInferrer()
 
     cur_gene_id:str = None
     cur_tx_id:str = None
@@ -124,18 +124,7 @@ def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, T
         record = GtfIO.line_to_seq_feature(line)
 
         if not source:
-            if count > 100:
-                inferred = sorted(inferred.items(), key=lambda x: x[1])
-                source = inferred[-1][0]
-                record.source = source
-            else:
-                count += 1
-                record.infer_annotation_source()
-                inferred_source = record.source
-                if inferred_source not in inferred:
-                    inferred[inferred_source] = 1
-                else:
-                    inferred[inferred_source] += 1
+            record.source = inferrer.infer(record)
         else:
             record.source = source
 
diff --git a/moPepGen/gtf/GTFSourceInferrer.py b/moPepGen/gtf/GTFSourceInferrer.py
new file mode 100644
index 00000000..4c3c1aa3
--- /dev/null
+++ b/moPepGen/gtf/GTFSourceInferrer.py
@@ -0,0 +1,31 @@
+""" Infer GTF source (e.g. GENCODE/ENSEMBL) """
+from __future__ import annotations
+from typing import Dict, TYPE_CHECKING
+
+
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
+
+class GTFSourceInferrer():
+    """ Infer GTF source (e.g. GENOCDE/ENSEMBL) """
+    def __init__(self):
+        """ Constructor """
+        self.max_iter = 100
+        self.data:Dict[str,int] = {}
+        self.count = 0
+        self.source:str = None
+
+    def infer(self, record:GTFSeqFeature) -> str:
+        """ Infer the source of a GTF record """
+        if self.count > self.max_iter:
+            if not self.source:
+                self.source = sorted(self.data.items(), key=lambda x:x[1])[-1][0]
+            return self.source
+        self.count += 1
+        record.infer_annotation_source()
+        source = record.source
+        if source not in self.data:
+            self.data[source] = 1
+        else:
+            self.data[source] += 1
+        return source
diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py
index bf2142ae..adbde542 100644
--- a/moPepGen/gtf/GenomicAnnotation.py
+++ b/moPepGen/gtf/GenomicAnnotation.py
@@ -10,6 +10,7 @@
 from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES
 from .GeneAnnotationModel import GeneAnnotationModel
 from .GTFSeqFeature import GTFSeqFeature
+from .GTFSourceInferrer import GTFSourceInferrer
 
 
 if TYPE_CHECKING:
@@ -113,25 +114,14 @@ def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=Non
         """
         record:GTFSeqFeature
         if not source:
-            count = 0
-            inferred = {}
+            inferrer = GTFSourceInferrer()
+
         for record in GtfIO.parse(handle):
             if biotype is not None and record.biotype not in biotype:
                 continue
 
             if not source:
-                if count > 100:
-                    inferred = sorted(inferred.items(), key=lambda x: x[1])
-                    source = inferred[-1][0]
-                    record.source = source
-                else:
-                    count += 1
-                    record.infer_annotation_source()
-                    inferred_source = record.source
-                    if inferred_source not in inferred:
-                        inferred[inferred_source] = 1
-                    else:
-                        inferred[inferred_source] += 1
+                record.source = inferrer.infer(record)
             else:
                 record.source = source
 
@@ -142,11 +132,11 @@ def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=Non
 
             self.add_transcript_record(record)
 
-        if not source:
-            inferred = sorted(inferred.items(), key=lambda x: x[1])
-            source = inferred[-1][0]
 
-        self.source = source
+        if not source:
+            source = inferrer.source
+        else:
+            self.source = source
 
         for transcript_model in self.transcripts.values():
             transcript_model.sort_records()