diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6dc26db9..0c245140 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [1.1.0] - 2023-06-28
+
+### Fixed
+
+- Reduced memory usage by mapping the GTF files into memory instead of reading it all at once. #371
+
 ## [1.0.0] - 2023-06-15
 
 ### Added
diff --git a/moPepGen/__init__.py b/moPepGen/__init__.py
index 87b412aa..50556536 100644
--- a/moPepGen/__init__.py
+++ b/moPepGen/__init__.py
@@ -5,7 +5,7 @@
 from typing import Iterable, IO
 
 
-__version__ = '1.0.0'
+__version__ = '1.1.0'
 
 ## Error messages
 ERROR_INDEX_IN_INTRON = 'The genomic index seems to be in an intron'
diff --git a/moPepGen/cli/__init__.py b/moPepGen/cli/__init__.py
index 90f9e0a1..3881c6ea 100644
--- a/moPepGen/cli/__init__.py
+++ b/moPepGen/cli/__init__.py
@@ -1,5 +1,5 @@
 """ Module for command line interface """
-from .generate_index import add_subparser_generate_index, generate_index
+from .generate_index import add_subparser_generate_index, generate_index, index_gtf
 from .call_variant_peptide import add_subparser_call_variant, call_variant_peptide
 from .call_noncoding_peptide import add_subparser_call_noncoding, call_noncoding_peptide
 from .call_alt_translation import add_subparser_call_alt_translation, call_alt_translation
diff --git a/moPepGen/cli/call_alt_translation.py b/moPepGen/cli/call_alt_translation.py
index 5b8ba4de..a75f3f19 100644
--- a/moPepGen/cli/call_alt_translation.py
+++ b/moPepGen/cli/call_alt_translation.py
@@ -81,7 +81,8 @@ def call_alt_translation(args:argparse.Namespace) -> None:
 
     peptide_pool = aa.VariantPeptidePool()
 
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if not tx_model.is_protein_coding:
             continue
 
diff --git a/moPepGen/cli/call_noncoding_peptide.py b/moPepGen/cli/call_noncoding_peptide.py
index 324267ed..2cc51082 100644
--- a/moPepGen/cli/call_noncoding_peptide.py
+++ b/moPepGen/cli/call_noncoding_peptide.py
@@ -119,7 +119,8 @@ def call_noncoding_peptide(args:argparse.Namespace) -> None:
     orf_pool = []
 
     i = 0
-    for tx_id, tx_model in anno.transcripts.items():
+    for tx_id in anno.transcripts:
+        tx_model = anno.transcripts[tx_id]
         if inclusion_biotypes and \
                 tx_model.transcript.biotype not in inclusion_biotypes:
             continue
diff --git a/moPepGen/cli/common.py b/moPepGen/cli/common.py
index 71b463c4..afdbb51d 100644
--- a/moPepGen/cli/common.py
+++ b/moPepGen/cli/common.py
@@ -1,7 +1,6 @@
 """ Common functions for cli """
 from __future__ import annotations
 import argparse
-import lzma
 import os
 import sys
 from typing import Tuple, Set, List
@@ -204,7 +203,7 @@ def add_args_source(parser:argparse.ArgumentParser):
 def load_references(args:argparse.Namespace, load_genome:bool=True,
         load_canonical_peptides:bool=True, load_proteome:bool=False,
         invalid_protein_as_noncoding:bool=False, check_protein_coding:bool=False
-        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotation, Set[str]]:
+        ) -> Tuple[dna.DNASeqDict, gtf.GenomicAnnotationOnDisk, Set[str]]:
     """ Load reference files. If index_dir is specified, data will be loaded
     from pickles, otherwise, will read from FASTA and GTF. """
     index_dir:Path = args.index_dir
@@ -225,11 +224,10 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
                 if not version.is_valid(genome.version):
                     raise err.IndexVersionNotMatchError(version, genome.version)
 
-        with lzma.open(f'{index_dir}/annotation.dat', 'rt') as handle:
-            annotation = gtf.GenomicAnnotation()
-            annotation.dump_gtf(handle)
-            if not version.is_valid(annotation.version):
-                raise err.IndexVersionNotMatchError(version, genome.version)
+        gtf_file = list(index_dir.glob('annotation.gtf*'))[0]
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.init_handle(gtf_file)
+        annotation.load_index(gtf_file)
 
         if load_proteome:
             with open(f'{index_dir}/proteome.pkl', 'rb') as handle:
@@ -249,8 +247,9 @@ def load_references(args:argparse.Namespace, load_genome:bool=True,
         if (check_protein_coding is True or load_canonical_peptides is True) and \
                 args.proteome_fasta is None:
             raise ValueError('--proteome-fasta was not specified.')
-        annotation = gtf.GenomicAnnotation()
-        annotation.dump_gtf(args.annotation_gtf, source=args.reference_source)
+        annotation = gtf.GenomicAnnotationOnDisk()
+        annotation.generate_index(args.annotation_gtf, source=args.reference_source)
+
         if not quiet:
             logger('Annotation GTF loaded.')
 
diff --git a/moPepGen/cli/generate_index.py b/moPepGen/cli/generate_index.py
index fa558e6e..6f8ee0a6 100644
--- a/moPepGen/cli/generate_index.py
+++ b/moPepGen/cli/generate_index.py
@@ -6,19 +6,25 @@
 moPepGen to avoid processing the reference files repeatedly and save massive
 time. """
 from __future__ import annotations
+from typing import TYPE_CHECKING
 import argparse
+import os
+import shutil
+import gzip
 from pathlib import Path
 import pickle
-import lzma
 from moPepGen import dna, aa, gtf, logger
-from moPepGen.gtf import GtfIO
 from moPepGen.cli import common
+from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
 
 
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFPointer import GenePointer, TranscriptPointer
+
 # pylint: disable=W0212
 def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     """ CLI for moPepGen generateIndex """
-    p = subparsers.add_parser(
+    p:argparse.ArgumentParser = subparsers.add_parser(
         name='generateIndex',
         help='Generate genome and proteome index files for moPepGen',
         description='Generate genome and proteome index files for moPepGen'
@@ -33,6 +39,11 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
         dest='output_dir',
         required=True
     )
+    p.add_argument(
+        '--gtf-symlink',
+        help='Create a symlink of the GTF file instead of copying it.',
+        default=False
+    )
     common.add_args_reference(p, index=False)
     common.add_args_cleavage(p)
     common.add_args_quiet(p)
@@ -41,8 +52,57 @@ def add_subparser_generate_index(subparsers:argparse._SubParsersAction):
     return p
 
 
+def index_gtf(file:Path, source:str=None, proteome:aa.AminoAcidSeqDict=None,
+        invalid_protein_as_noncoding:bool=True):
+    """ Index a GTF file. """
+    anno = gtf.GenomicAnnotationOnDisk()
+
+    with open(file, 'rb') as handle:
+        anno.generate_index(handle, source)
+
+    if proteome:
+        anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
+
+    gene_idx_file, tx_idx_file = anno.get_index_files(file)
+    metadata = GTFIndexMetadata()
+
+    with open(gene_idx_file, 'wt') as handle:
+        metadata.write(handle)
+        for gene in anno.genes.keys():
+            gene_pointer:GenePointer = anno.genes.get_pointer(gene)
+            handle.write(gene_pointer.to_line() + '\n')
+
+    with open(tx_idx_file, 'wt') as handle:
+        metadata.write(handle)
+        for tx in anno.transcripts.keys():
+            tx_pointer:TranscriptPointer = anno.transcripts.get_pointer(tx)
+            handle.write(tx_pointer.to_line() + '\n')
+
+def create_gtf_copy(file:Path, output_dir:Path, symlink:bool=True) -> Path:
+    """ Create copy of GTF """
+    if file.suffix.lower() == '.gz':
+        if symlink:
+            symlink = False
+            logger(
+                "--gtf-symlink was suppressed because compressed GTF file was received. "
+            )
+    elif file.suffix.lower() != '.gtf':
+        raise ValueError(f"Cannot handle gtf file {file}")
+
+    output_file = output_dir/'annotation.gtf'
+
+    if symlink:
+        os.symlink(file, output_file)
+    elif file.suffix.lower() == '.gtf':
+        shutil.copy2(file, output_file)
+    else:
+        with gzip.open(file, 'rt') as ihandle, open(output_file, 'wt') as ohandle:
+            for line in ihandle:
+                ohandle.write(line)
+    return output_file
+
 def generate_index(args:argparse.Namespace):
-    """ Generate  """
+    """ Generate index  """
     path_genome:Path = args.genome_fasta
     path_gtf:Path = args.annotation_gtf
     parth_proteome:Path = args.proteome_fasta
@@ -55,11 +115,11 @@ def generate_index(args:argparse.Namespace):
     exception = 'trypsin_exception' if rule == 'trypsin' else None
     invalid_protein_as_noncoding:bool = args.invalid_protein_as_noncoding
     quiet:bool = args.quiet
+    symlink:bool = args.gtf_symlink
 
     output_dir:Path = args.output_dir
     output_genome = output_dir/"genome.pkl"
     output_proteome = output_dir/"proteome.pkl"
-    output_anno = output_dir/"annotation.dat"
     output_peptides = output_dir/"canonical_peptides.pkl"
     output_coding_tx = output_dir/"coding_transcripts.pkl"
 
@@ -77,28 +137,24 @@ def generate_index(args:argparse.Namespace):
         logger('Genome FASTA saved to disk.')
     del genome
 
-    anno = gtf.GenomicAnnotation()
-    anno.dump_gtf(path_gtf, source=args.reference_source)
-    if not quiet:
-        logger('Genome annotation GTF loaded.')
-
     proteome = aa.AminoAcidSeqDict()
     proteome.dump_fasta(parth_proteome, source=args.reference_source)
+    if not quiet:
+        logger('Proteome FASTA loaded.')
 
-    anno.check_protein_coding(proteome, invalid_protein_as_noncoding)
-
-    with lzma.open(output_anno, 'wt') as handle:
-        GtfIO.write(handle, anno)
+    output_gtf = create_gtf_copy(path_gtf, output_dir, symlink)
+    index_gtf(output_gtf, args.reference_source, proteome, invalid_protein_as_noncoding)
     if not quiet:
         logger('Genome annotation GTF saved to disk.')
 
-    if not quiet:
-        logger('Proteome FASTA loaded.')
     with open(output_proteome, 'wb') as handle:
         pickle.dump(proteome, handle)
     if not quiet:
         logger('Proteome FASTA saved to disk.')
 
+    anno = gtf.GenomicAnnotationOnDisk()
+    anno.generate_index(output_gtf)
+
     # canoincal peptide pool
     canonical_peptides = proteome.create_unique_peptide_pool(
         anno=anno, rule=rule, exception=exception, miscleavage=miscleavage,
diff --git a/moPepGen/gtf/GTFIndexMetadata.py b/moPepGen/gtf/GTFIndexMetadata.py
new file mode 100644
index 00000000..1f7b8c7d
--- /dev/null
+++ b/moPepGen/gtf/GTFIndexMetadata.py
@@ -0,0 +1,46 @@
+""" GTFIndexMetadata """
+from typing import IO
+from moPepGen.version import MetaVersion
+
+MOPEPGEN_GTF_INDEX_METADATA_PREFIX = '##'
+
+class GTFIndexMetadata():
+    """ Class for GTF index metadata. """
+    def __init__(self, source:str=None, version:MetaVersion=None):
+        """ constructor """
+        self.source = source
+        self.version = version or MetaVersion()
+
+    def write(self, handle:IO):
+        """ Write metadata to handle. """
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        handle.write(f"{prefix}source={self.source}\n")
+        handle.write(f"{prefix}python={self.version.python}\n")
+        handle.write(f"{prefix}moPepGen={self.version.mopepgen}\n")
+        handle.write(f"{prefix}biopython={self.version.biopython}\n")
+
+    @classmethod
+    def parse(cls, handle:IO):
+        """ Read metadata from handle """
+        pos = handle.tell()
+        it = handle.readline()
+        metadata = {}
+        version = {}
+        prefix = MOPEPGEN_GTF_INDEX_METADATA_PREFIX
+        while it and it.startswith(prefix):
+            key, val = it.lstrip(prefix).rstrip().split('=', 1)
+            if val == '':
+                val = None
+            if key in ('python', 'moPepGen', 'biopython'):
+                version[key.lower()] = val
+            else:
+                metadata[key] = val
+
+            pos = handle.tell()
+            it = handle.readline()
+
+        handle.seek(pos)
+
+        version = MetaVersion(**version)
+
+        return cls(source=metadata['source'], version=version)
diff --git a/moPepGen/gtf/GTFPointer.py b/moPepGen/gtf/GTFPointer.py
new file mode 100644
index 00000000..502a102a
--- /dev/null
+++ b/moPepGen/gtf/GTFPointer.py
@@ -0,0 +1,265 @@
+""" GTFPointer and GTFPointerDict """
+from __future__ import annotations
+from typing import Dict, Set, Iterable, Mapping, Union, IO
+from collections import deque
+from moPepGen.gtf import GtfIO
+from moPepGen.gtf.GeneAnnotationModel import GeneAnnotationModel
+from moPepGen.gtf.TranscriptAnnotationModel import (
+    TranscriptAnnotationModel,
+    GTF_FEATURE_TYPES
+)
+from moPepGen.gtf.GTFSourceInferrer import GTFSourceInferrer
+
+
+GENE_DICT_CACHE_SIZE = 10
+TX_DICT_CACHE_SIZE = 10
+
+class GTFPointer():
+    """ GTFPointer. Represents the range of the GTF file of a particular
+    genome annotation entity (gene or transcript). """
+    def __init__(self, handle:IO, key:str, start:int, end:int, source:str):
+        """ """
+        self.handle = handle
+        self.key = key
+        self.start = start
+        self.end = end
+        self.source = source
+
+    def __len__(self) -> int:
+        """ length """
+        return self.end - self.start
+
+class GenePointer(GTFPointer):
+    """ Pointer of a GTF file for a gene. """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            transcripts:Set[str]=None):
+        """ Constructor """
+        super().__init__(handle, key, start, end, source)
+        self.transcripts = transcripts or set()
+
+    def load(self) -> GeneAnnotationModel:
+        """ Load gene annotation model from handle. """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').rstrip().split('\n')
+        if len(lines) > 1:
+            raise ValueError(f"Multiple lines found for gene {self.key}")
+        record = GtfIO.line_to_seq_feature(lines[0])
+        record.__class__ = GeneAnnotationModel
+        record.exons = []
+        record.transcripts = list(self.transcripts)
+        return record
+
+    def to_line(self) -> str:
+        """ Convert to line """
+        fields = [
+            self.key,
+            str(self.start),
+            str(self.end),
+            ','.join(self.transcripts)
+        ]
+        return '\t'.join(fields)
+
+class TranscriptPointer(GTFPointer):
+    """ Pointer of a GTF file for a trnascript. """
+    def __init__(self, handle: IO, key: str, start: int, end: int, source: str,
+            is_protein_coding:bool=None):
+        """ TranscriptPointer """
+        super().__init__(handle, key, start, end, source)
+        self.is_protein_coding = is_protein_coding
+
+    def load(self) -> TranscriptAnnotationModel:
+        """ Load pointer and return a transcript annotation model. """
+        cur = self.handle.tell()
+        offset = self.start - cur
+        self.handle.seek(offset, 1)
+        buffer:bytes = self.handle.read(len(self))
+        lines = buffer.decode('utf-8').rstrip().split('\n')
+        tx_model = TranscriptAnnotationModel()
+
+        for line in lines:
+            record = GtfIO.line_to_seq_feature(line)
+            feature = record.type.lower()
+            if feature not in GTF_FEATURE_TYPES:
+                continue
+            tx_id = record.transcript_id
+            if tx_id != self.key:
+                raise ValueError("Loaded transcript do not match.")
+            record.id = tx_id
+            tx_model.add_record(feature, record)
+        tx_model.sort_records()
+        return tx_model
+
+    def to_line(self) -> str:
+        """ Convert to string """
+        fields = [
+            self.key,
+            str(self.start),
+            str(self.end),
+            str(self.is_protein_coding)
+        ]
+        return "\t".join(fields)
+
+def iterate_pointer(handle:IO, source:str=None) -> Iterable[Union[GenePointer, TranscriptPointer]]:
+    """ Iterate over a GTF file and yield pointers. """
+    if not source:
+        inferrer = GTFSourceInferrer()
+
+    cur_gene_id:str = None
+    cur_tx_id:str = None
+    cur_gene_pointer = None
+    cur_tx_pointer = None
+    line_end = 0
+    for line in handle:
+        line:bytes
+        line_start = line_end
+        line_end += len(line)
+        line = line.decode('utf-8')
+
+        if line.startswith('#'):
+            continue
+
+        record = GtfIO.line_to_seq_feature(line)
+
+        if not source:
+            record.source = inferrer.infer(record)
+        else:
+            record.source = source
+
+        if record.type.lower() == 'gene':
+            if cur_gene_pointer:
+                yield cur_gene_pointer
+            if cur_tx_pointer:
+                yield cur_tx_pointer
+            cur_gene_id = record.gene_id
+            cur_gene_pointer = GenePointer(
+                handle, cur_gene_id,
+                line_start, line_end, record.source
+            )
+            cur_tx_id = None
+            cur_tx_pointer = None
+        else:
+            if cur_tx_id != record.transcript_id:
+                if cur_tx_pointer:
+                    yield cur_tx_pointer
+                cur_tx_id = record.transcript_id
+                cur_tx_pointer = TranscriptPointer(
+                    handle, cur_tx_id,
+                    line_start, line_end, record.source
+                )
+            else:
+                cur_tx_pointer.end = line_end
+            if cur_gene_pointer:
+                cur_gene_pointer.transcripts.add(cur_tx_id)
+    if cur_gene_pointer:
+        yield cur_gene_pointer
+    if cur_tx_pointer:
+        yield cur_tx_pointer
+
+class GTFPointerDict(dict, Mapping[str, GTFPointer]):
+    """ GTFPointerDict. This defines a special dict class that keys are str
+    (gene or transcript ID) and values are corresponding GTFPointer. The getter
+    magic function is customized so it returns a loaded GeneAnnotationModel
+    or TranscriptAnnotationModel instead of the pointer.
+    """
+    def __init__(self, *args:Dict[str,GTFPointer], **kwargs:Dict[str,GTFPointer]):
+        """ Constructor """
+        for val in kwargs.values():
+            self.validate(val, GTFPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GTFPointer)
+        super().__init__(*args, **kwargs)
+
+    def __iter__(self) -> Iterable[str]:
+        """ Iterator """
+        for k in self.keys():
+            yield k
+
+    def validate(self, val, __type):
+        """ Ensures the values are GTFPointers """
+        if not __type is GTFPointer or not issubclass(__type, GTFPointer):
+            raise TypeError(f"type = {__type} is not supported")
+        if not isinstance(val, __type):
+            raise TypeError(
+                f"{self.__class__} only accecpts {__type.__class__} objects"
+            )
+
+    def __setitem__(self, __key: str, __value: GTFPointer) -> None:
+        """ Setter """
+        self.validate(__value, GTFPointer)
+        return super().__setitem__(__key, __value)
+
+    def get_pointer(self, __key: str) -> GTFPointer:
+        """ Get pointer by key without loading the annotation entity. """
+        return super().__getitem__(__key)
+
+class GenePointerDict(GTFPointerDict):
+    """ GenePointerDict. This defines a dict class that keys are gene IDs and
+    values are the corresponding gene models. The getter magic function is
+    customized so it returns a loaded GeneAnnotationModel object instead of the
+    pointer.
+    """
+    def __init__(self, *args: Dict[str, GenePointer], **kwargs: Dict[str, GenePointer]):
+        """ Constructor """
+        for val in kwargs.values():
+            self.validate(val, GenePointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, GenePointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str) -> GeneAnnotationModel:
+        """ Getter """
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > GENE_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:GenePointer = self.get_pointer(__key)
+        val = pointer.load()
+        self._cache[__key] = val
+        return val
+
+class TranscriptPointerDict(GTFPointerDict):
+    """ TranscriptPointerDict. This defines a dict class that keys are transcript
+    IDs and values are the corresponding transcript models. The getter magic
+    function is customized so it returns a loaded TranscriptAnnotationModel
+    object instead of the pointer.
+    """
+    def __init__(self, *args: Dict[str, TranscriptPointer], **kwargs: Dict[str, TranscriptPointer]):
+        """ Constructor """
+        for val in kwargs.values():
+            self.validate(val, TranscriptPointer)
+        if args:
+            if not isinstance(args[0], dict):
+                raise TypeError('Data type invalid')
+            for val in args[0].values():
+                self.validate(val, TranscriptPointer)
+        super().__init__(*args, **kwargs)
+        self._cache = {}
+        self._cached_keys = deque()
+
+    def __getitem__(self, __key: str) -> TranscriptAnnotationModel:
+        """ getter """
+        if __key in self._cache:
+            return self._cache[__key]
+        self._cached_keys.appendleft(__key)
+        if len(self._cached_keys) > TX_DICT_CACHE_SIZE:
+            key_pop = self._cached_keys.pop()
+            self._cache.pop(key_pop)
+        pointer:TranscriptPointer = self.get_pointer(__key)
+        val = pointer.load()
+        val.is_protein_coding = pointer.is_protein_coding is True
+        val.transcript.source = pointer.source
+        self._cache[__key] = val
+        return val
diff --git a/moPepGen/gtf/GTFSourceInferrer.py b/moPepGen/gtf/GTFSourceInferrer.py
new file mode 100644
index 00000000..4c3c1aa3
--- /dev/null
+++ b/moPepGen/gtf/GTFSourceInferrer.py
@@ -0,0 +1,31 @@
+""" Infer GTF source (e.g. GENCODE/ENSEMBL) """
+from __future__ import annotations
+from typing import Dict, TYPE_CHECKING
+
+
+if TYPE_CHECKING:
+    from moPepGen.gtf.GTFSeqFeature import GTFSeqFeature
+
+class GTFSourceInferrer():
+    """ Infer GTF source (e.g. GENOCDE/ENSEMBL) """
+    def __init__(self):
+        """ Constructor """
+        self.max_iter = 100
+        self.data:Dict[str,int] = {}
+        self.count = 0
+        self.source:str = None
+
+    def infer(self, record:GTFSeqFeature) -> str:
+        """ Infer the source of a GTF record """
+        if self.count > self.max_iter:
+            if not self.source:
+                self.source = sorted(self.data.items(), key=lambda x:x[1])[-1][0]
+            return self.source
+        self.count += 1
+        record.infer_annotation_source()
+        source = record.source
+        if source not in self.data:
+            self.data[source] = 1
+        else:
+            self.data[source] += 1
+        return source
diff --git a/moPepGen/gtf/GenomicAnnotation.py b/moPepGen/gtf/GenomicAnnotation.py
index 948dfbf6..adbde542 100644
--- a/moPepGen/gtf/GenomicAnnotation.py
+++ b/moPepGen/gtf/GenomicAnnotation.py
@@ -2,7 +2,7 @@
 """
 from __future__ import annotations
 import copy
-from typing import List, Dict, Tuple, TYPE_CHECKING
+from typing import List, Dict, Tuple, TYPE_CHECKING, IO, Union
 from moPepGen.SeqFeature import FeatureLocation, SeqFeature
 from moPepGen import seqvar, err
 from moPepGen.version import MetaVersion
@@ -10,6 +10,7 @@
 from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES
 from .GeneAnnotationModel import GeneAnnotationModel
 from .GTFSeqFeature import GTFSeqFeature
+from .GTFSourceInferrer import GTFSourceInferrer
 
 
 if TYPE_CHECKING:
@@ -31,12 +32,8 @@ def __init__(self, genes:Dict[str,GeneAnnotationModel]=None,
             transcripts:Dict[str, TranscriptAnnotationModel]=None,
             source:str=None):
         """ Construct a GenomicAnnotation object """
-        if genes is None:
-            genes = {}
-        if transcripts is None:
-            transcripts = {}
-        self.genes = genes
-        self.transcripts = transcripts
+        self.genes = genes or {}
+        self.transcripts = transcripts or {}
         self.source = source
         self.gene_id_version_mapper:Dict[str, str] = None
         self.version = MetaVersion()
@@ -106,7 +103,7 @@ def remove_cached_tx_seq(self):
         for tx_id in self._cached_tx_seqs:
             self.transcripts[tx_id].remove_cached_seq()
 
-    def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None:
+    def dump_gtf(self, handle:Union[str, IO], biotype:List[str]=None, source:str=None)->None:
         """ Dump a GTF file into a GenomicAnnotation
 
         Args:
@@ -117,25 +114,14 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None:
         """
         record:GTFSeqFeature
         if not source:
-            count = 0
-            inferred = {}
-        for record in GtfIO.parse(path):
+            inferrer = GTFSourceInferrer()
+
+        for record in GtfIO.parse(handle):
             if biotype is not None and record.biotype not in biotype:
                 continue
 
             if not source:
-                if count > 100:
-                    inferred = sorted(inferred.items(), key=lambda x: x[1])
-                    source = inferred[-1][0]
-                    record.source = source
-                else:
-                    count += 1
-                    record.infer_annotation_source()
-                    inferred_source = record.source
-                    if inferred_source not in inferred:
-                        inferred[inferred_source] = 1
-                    else:
-                        inferred[inferred_source] += 1
+                record.source = inferrer.infer(record)
             else:
                 record.source = source
 
@@ -146,11 +132,11 @@ def dump_gtf(self, path:str, biotype:List[str]=None, source:str=None)->None:
 
             self.add_transcript_record(record)
 
-        if not source:
-            inferred = sorted(inferred.items(), key=lambda x: x[1])
-            source = inferred[-1][0]
 
-        self.source = source
+        if not source:
+            source = inferrer.source
+        else:
+            self.source = source
 
         for transcript_model in self.transcripts.values():
             transcript_model.sort_records()
diff --git a/moPepGen/gtf/GenomicAnnotationOnDisk.py b/moPepGen/gtf/GenomicAnnotationOnDisk.py
new file mode 100644
index 00000000..f667891b
--- /dev/null
+++ b/moPepGen/gtf/GenomicAnnotationOnDisk.py
@@ -0,0 +1,151 @@
+""" Module for GenomicAnnotationOnDisk """
+from __future__ import annotations
+from typing import Dict, IO, Union, Tuple
+import io
+from pathlib import Path
+from moPepGen import err
+from moPepGen.version import MetaVersion
+from moPepGen.gtf.GenomicAnnotation import GenomicAnnotation
+from moPepGen.gtf.GTFPointer import (
+    GenePointerDict, TranscriptPointerDict, iterate_pointer,
+    GenePointer, TranscriptPointer
+)
+from moPepGen.gtf.GTFIndexMetadata import GTFIndexMetadata
+
+
+class GenomicAnnotationOnDisk(GenomicAnnotation):
+    """ An on disk version of GenomicAnnotation that uses minimal memory. """
+    def __init__(self, genes:GenePointerDict=None,
+            transcripts:TranscriptPointerDict=None,
+            source:str=None):
+        """ Constructor """
+        # pylint: disable=W0231
+        self.genes = genes or GenePointerDict()
+        self.transcripts = transcripts or TranscriptPointerDict()
+        self.source = source
+        self.gene_id_version_mapper:Dict[str,str] = None
+        self.version = MetaVersion()
+        self._cached_tx_seqs = []
+        self.handle = None
+
+    def __del__(self):
+        """ Hook for deconstruction """
+        if self.handle:
+            self.handle.close()
+
+    def dump_gtf(self, *args, **kwargs) -> None:
+        """ Forbidden """
+        raise NotImplementedError
+
+    def infer_source(self):
+        """ Infer GTF source (GENCODE vs ENSEMBL) from data. """
+        i = 0
+        inferred = {}
+        for tx in self.transcripts.keys():
+            i += 1
+            if i > 100:
+                break
+            s = self.transcripts.get_pointer(tx).source
+            if s in inferred:
+                inferred[s] += 1
+            else:
+                inferred[s] = 1
+        self.source = sorted(inferred.items(), key=lambda it: it[1])[-1][0]
+
+    def init_handle(self, handle:Union[str, IO, Path]):
+        """ Initiate file handle """
+        # pylint: disable=R1732
+        if isinstance(handle, str):
+            handle = Path(handle)
+
+        if isinstance(handle, Path):
+            if handle.suffix.lower() != '.gtf':
+                raise ValueError(f"Unsupported file formate of {handle}")
+            ihandle = open(handle, 'rb')
+        elif isinstance(handle, io.IOBase):
+            ihandle = handle
+        else:
+            raise ValueError(f"handle {handle} of type {handle.__class__} is not supported.")
+
+        self.handle = ihandle
+
+    @staticmethod
+    def get_index_files(file:Union[str,Path]) -> Tuple[Path,Path]:
+        """ Get index file paths given the GTF file path. """
+        if isinstance(file, str):
+            file = Path(file)
+        basename = str(file.name).split('.gtf', maxsplit=1)[0]
+        gene_idx_file = file.parent/f"{basename}_gene.idx"
+        tx_idx_file = file.parent/f"{basename}_tx.idx"
+
+        return gene_idx_file, tx_idx_file
+
+    def generate_index(self, handle:Union[IO, str, Path], source:str=None):
+        """ Generate GTF index """
+        self.init_handle(handle)
+
+        for pointer in iterate_pointer(self.handle, source):
+            if isinstance(pointer, GenePointer):
+                self.genes[pointer.key] = pointer
+            else:
+                self.transcripts[pointer.key] = pointer
+
+        if source:
+            self.source = source
+        else:
+            self.infer_source()
+
+    def load_index(self, file:Union[str,Path]):
+        """ Load index from idx files """
+        self.init_handle(file)
+        gene_idx_file, tx_idx_file = self.get_index_files(file)
+
+        if not gene_idx_file.exists():
+            raise ValueError(f"Gene index file cannot be found in {file.parent}")
+        if not tx_idx_file.exists():
+            raise ValueError(f"Transcript index file cannot be found in {file.parent}")
+
+        version = MetaVersion()
+
+        with open(gene_idx_file, 'rt') as handle:
+            metadata = GTFIndexMetadata.parse(handle)
+            if not version.is_valid(metadata.version):
+                raise err.IndexVersionNotMatchError(version, metadata.version)
+
+            gene_source = metadata.source
+            if not gene_source:
+                raise ValueError("Cannot find source from gene idx.")
+
+            for line in handle:
+                if line.startswith('#'):
+                    continue
+                fields = line.rstrip().split('\t')
+                pointer = GenePointer(
+                    self.handle, key=fields[0],
+                    start=int(fields[1]), end=int(fields[2]),
+                    source=gene_source, transcripts=fields[3].split(',')
+                )
+                self.genes[pointer.key] = pointer
+
+        with open(tx_idx_file, 'rt') as handle:
+            metadata = GTFIndexMetadata.parse(handle)
+            if not version.is_valid(metadata.version):
+                raise err.IndexVersionNotMatchError(version, metadata.version)
+
+            tx_source = metadata.source
+            if not tx_source:
+                raise ValueError("Cannot find source from gene idx.")
+            if tx_source != gene_source:
+                raise ValueError('tx_source does not match with gene_source')
+
+            for line in handle:
+                if line.startswith('#'):
+                    continue
+                fields = line.rstrip().split('\t')
+                is_protein_coding = {'None': None, 'True': True, 'False': False}[fields[3]]
+                pointer = TranscriptPointer(
+                    handle=self.handle, key=fields[0],
+                    start=int(fields[1]), end=int(fields[2]),
+                    source=tx_source, is_protein_coding=is_protein_coding
+                )
+                self.transcripts[pointer.key] = pointer
diff --git a/moPepGen/gtf/GtfIO.py b/moPepGen/gtf/GtfIO.py
index 45978103..41b4810f 100644
--- a/moPepGen/gtf/GtfIO.py
+++ b/moPepGen/gtf/GtfIO.py
@@ -25,48 +25,54 @@ def iterate(handle:IO[str]) -> Iterable[GTFSeqFeature]:
             if line.startswith('#'):
                 continue
             line = line.rstrip()
-            fields = line.split('\t')
-
-            try:
-                strand={'+':1, '-':-1, '?':0}[fields[6]]
-            except KeyError:
-                strand=None
-
-            attributes = {}
-            attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id',
-                'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding']
-            attribute_list = [field.strip().split(' ', 1) for field in \
-                fields[8].rstrip(';').split(';')]
-
-            for key,val in attribute_list:
-                if key not in attributes_to_keep:
-                    continue
-                val = val.strip('"')
-                if key == 'tag':
-                    if key not in attributes:
-                        attributes[key] = []
-                    attributes[key].append(val)
-                else:
-                    attributes[key] = val
-
-            location = FeatureLocation(
-                seqname=fields[0],
-                start=int(fields[3])-1,
-                end=int(fields[4]),
-                strand=strand,
-            )
-
-            frame = None if fields[7] == '.' else int(fields[7])
-
-            record = GTFSeqFeature(
-                chrom=fields[0],
-                attributes=attributes,
-                location=location,
-                type=fields[2],
-                frame=frame
-            )
+            record = line_to_seq_feature(line)
+
             yield record
 
+def line_to_seq_feature(line:str) -> GTFSeqFeature:
+    """ Conver line to SeqFeature """
+    line = line.rstrip()
+    fields = line.split('\t')
+
+    try:
+        strand={'+':1, '-':-1, '?':0}[fields[6]]
+    except KeyError:
+        strand=None
+
+    attributes = {}
+    attributes_to_keep = ['gene_id', 'transcript_id', 'protein_id',
+        'gene_name', 'gene_type', 'gene_biotype', 'tag', 'is_protein_coding']
+    attribute_list = [field.strip().split(' ', 1) for field in \
+        fields[8].rstrip(';').split(';')]
+
+    for key,val in attribute_list:
+        if key not in attributes_to_keep:
+            continue
+        val = val.strip('"')
+        if key == 'tag':
+            if key not in attributes:
+                attributes[key] = []
+            attributes[key].append(val)
+        else:
+            attributes[key] = val
+
+    location = FeatureLocation(
+        seqname=fields[0],
+        start=int(fields[3])-1,
+        end=int(fields[4]),
+        strand=strand,
+    )
+
+    frame = None if fields[7] == '.' else int(fields[7])
+
+    return GTFSeqFeature(
+        chrom=fields[0],
+        attributes=attributes,
+        location=location,
+        type=fields[2],
+        frame=frame
+    )
+
 def parse(handle:Union[IO[str], str]) -> GtfIterator:
     """ Parser for GTF files.
     """
diff --git a/moPepGen/gtf/__init__.py b/moPepGen/gtf/__init__.py
index b8fd2342..1118af9a 100644
--- a/moPepGen/gtf/__init__.py
+++ b/moPepGen/gtf/__init__.py
@@ -3,3 +3,4 @@
 from .TranscriptAnnotationModel import TranscriptAnnotationModel, GTF_FEATURE_TYPES
 from .GenomicAnnotation import GenomicAnnotation
 from .GeneAnnotationModel import GeneAnnotationModel
+from .GenomicAnnotationOnDisk import GenomicAnnotationOnDisk
diff --git a/moPepGen/version.py b/moPepGen/version.py
index 01a5b4b9..71d16bdd 100644
--- a/moPepGen/version.py
+++ b/moPepGen/version.py
@@ -6,15 +6,15 @@
 from moPepGen import __version__
 
 
-MINIMAL_VERSION = '0.12.0'
+MINIMAL_VERSION = '1.1.0'
 
 class MetaVersion():
     """ Versions """
-    def __init__(self):
+    def __init__(self, python:str=None, biopython:str=None, mopepgen:str=None):
         """ constructor """
-        self.python = sys.version_info[:3]
-        self.biopython = Bio.__version__
-        self.mopepgen = __version__
+        self.python = python or '.'.join([str(x) for x in sys.version_info[:3]])
+        self.biopython = biopython or Bio.__version__
+        self.mopepgen = mopepgen or __version__
 
     def __eq__(self, other:MetaVersion):
         """ equal to """
@@ -45,5 +45,4 @@ def __ne__(self, other:MetaVersion):
 
     def __repr__(self) -> str:
         """ str representation """
-        python = '.'.join([str(x) for x in self.python])
-        return f"python={python}, biopython={self.biopython}, moPepGen={self.mopepgen}"
+        return f"python={self.python}, biopython={self.biopython}, moPepGen={self.mopepgen}"
diff --git a/test/files/annotation_gene.idx b/test/files/annotation_gene.idx
new file mode 100644
index 00000000..76011e1e
--- /dev/null
+++ b/test/files/annotation_gene.idx
@@ -0,0 +1,8 @@
+##source=None
+##python=3.8.17
+##moPepGen=1.1.0
+##biopython=1.81
+ENSG00000128408.9	0	174	ENST00000614168.2,ENST00000614167.2
+ENSG00000244486.9	12491	12694	ENST00000622235.5
+ENSG00000099949.21	24586	24955	ENST00000642151.1
+ENSG00000279973.2	27280	27432	ENST00000624155.2
diff --git a/test/files/annotation_tx.idx b/test/files/annotation_tx.idx
new file mode 100644
index 00000000..81ead4f8
--- /dev/null
+++ b/test/files/annotation_tx.idx
@@ -0,0 +1,9 @@
+##source=None
+##python=3.8.17
+##moPepGen=1.1.0
+##biopython=1.81
+ENST00000614167.2	174	7260	True
+ENST00000614168.2	7260	12491	True
+ENST00000622235.5	12694	24586	True
+ENST00000642151.1	24955	27280	True
+ENST00000624155.2	27432	28522	False
diff --git a/test/files/fusion/star_fusion.gvf b/test/files/fusion/star_fusion.gvf
index b11972e9..4d14aa2d 100644
--- a/test/files/fusion/star_fusion.gvf
+++ b/test/files/fusion/star_fusion.gvf
@@ -1,5 +1,5 @@
 ##fileformat=VCFv4.2
-##mopepgen_version=0.12.0
+##mopepgen_version=1.0.1
 ##parser=parseSTARFusion
 ##reference_index=/home/chenghaozhu/projects/private-moPepGen/test/files/index
 ##genome_fasta=
@@ -13,7 +13,7 @@
 ##INFO=<ID=ACCEPTER_TRANSCRIPT_ID,Number=1,Type=String,Description="3' Accepter Transcript's Transcript ID">
 ##INFO=<ID=ACCEPTER_POSITION,Number=1,Type=Integer,Description="Position of the break point of the 3' accepter transcript">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
-ENSG00000128408.9	832	FUSION-ENST00000614167.2:831-ENST00000642151.1:29	A	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
-ENSG00000128408.9	832	FUSION-ENST00000614168.2:831-ENST00000642151.1:29	A	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
+ENSG00000128408.9	832	FUSION-ENST00000614167.2:831-ENST00000642151.1:29	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614167.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
+ENSG00000128408.9	832	FUSION-ENST00000614168.2:831-ENST00000642151.1:29	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000614168.2;GENE_SYMBOL=RIBC2;GENOMIC_POSITION=chr22:831:831;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
 ENSG00000244486.9	1752	FUSION-ENST00000622235.5:1751-ENST00000642151.1:29	G	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000622235.5;GENE_SYMBOL=SCARF2;GENOMIC_POSITION=chr22:3213:3213;ACCEPTER_GENE_ID=ENSG00000099949.21;ACCEPTER_TRANSCRIPT_ID=ENST00000642151.1;ACCEPTER_SYMBOL=LZTR1;ACCEPTER_POSITION=30;ACCEPTER_GENOMIC_POSITION=chr22:5010:5010
 ENSG00000279973.2	93	FUSION-ENST00000624155.2:92-ENST00000622235.5:968	T	<FUSION>	.	.	TRANSCRIPT_ID=ENST00000624155.2;GENE_SYMBOL=CU104787.1;GENOMIC_POSITION=chr22:5311:5311;ACCEPTER_GENE_ID=ENSG00000244486.9;ACCEPTER_TRANSCRIPT_ID=ENST00000622235.5;ACCEPTER_SYMBOL=SCARF2;ACCEPTER_POSITION=969;ACCEPTER_GENOMIC_POSITION=chr22:3995:3995
diff --git a/test/files/peptides/variant.fasta b/test/files/peptides/variant.fasta
index 8aa7c5ea..bee351cd 100644
--- a/test/files/peptides/variant.fasta
+++ b/test/files/peptides/variant.fasta
@@ -1,218 +1,170 @@
->ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
-CLTCEPGWNR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
-LPPCDEFVGARR
->CIRC-ENST00000614167.2-E2-E1|W2F-11|1
-AGASVPLRRHFEPDGK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|2
-HETLFLLTFPRER
->ENST00000614167.2|RES-101-A-G|W2F-9|3 ENST00000614167.2|RES-101-A-G|W2F-9|3
-IIGGDTEAFDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|22
-MAVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|20
-MAVALPRDLRQDANLAK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|3
-HETLFLLTFPR
->CIRC-ENST00000614167.2-E2-E1|18
-MFKFMTR
->ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
-ARHETLCC
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
-NNLVVDISERAGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
-HETLWLLTFPRERER
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2
-ERLYLCGVTGSPTENCAK
->ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3
-IIGGDTEAWDVQVHGQK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5
-RSKHTVVAYK
->CIRC-ENST00000614167.2-E2-E1|1
-HWEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|16
-FKFMTRR
->ENST00000622235.5|RES-202-G-A|W2F-8|2 ENST00000622235.5|RES-202-G-A|W2F-8|2
-CLTCEPGFNR
->ENST00000614167.2|SNV-1179-G-T|1
-IQGARATLUF
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
-HETLWLLTFPR
->ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
-MEGAGPRGAVPAR
->CIRC-ENST00000614167.2-E2-E1|W2F-3|2
-RHFEPDGK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9
-EATEKARHETLWLLTFPR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9
-RLPPCDEFVGARR
->ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
-CLTCEPGWNRTK
->CIRC-ENST00000614167.2-E2-E1|4
-RHWEPDGK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-5|1
-HETLFLLTFPRERER
->CIRC-ENST00000614167.2-E2-E1|2
-HWEPDGK
->ENST00000622235.5|RES-202-G-A|W2F-19|1 ENST00000622235.5|RES-202-G-A|W2F-19|1
-GQQPCTVAEGRCLTCEPGFNRTK
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2
+HTVVAYK
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10
+RLPPCDEFVGAR
+>ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
+IIGGDTEAWDVQVHGQKIKEATEK
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|1
 HTVVAYKDAI
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-12|1
-EATEKARHETLFLLTFPR
->ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
-TKCDQPCATGFYGEGCSHR
->ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6
-GQQPCTVAEGRCLTCEPGWNR
->CIRC-ENST00000614167.2-E2-E1|17
-MFKFMTRR
->ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
-EGAGPRGAVPAR
->CIRC-ENST00000614167.2-E2-E1|12
-MKPLVVDISERAGASVPLR
->CIRC-ENST00000614167.2-E2-E1|23
-MAVALPR
->ENST00000614167.2|RES-101-A-G|W2F-11|1 ENST00000614167.2|RES-101-A-G|W2F-11|1
-NRIIGGDTEAFDVQVHGQKIK
->CIRC-ENST00000614167.2-E2-E1|8
-ARHETFGC
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
-NTLSGMQKFMGEDUNFHER
->CIRC-ENST00000614167.2-E2-E1|W2F-3|1
-RHFEPDGKLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5
+HETLWLLTFPRER
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
+FKFMARR
+>CIRC-ENST00000614167.2-E2-E1|11
+KPLVVDISERAGASVPLR
 >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|3
 NNLVVDISERAGASVPLR
->CIRC-ENST00000642151.1-E1-E2|3
-PFRRSTSGPSK
->ENST00000614167.2|RES-101-A-G|W2F-16|1 ENST00000614167.2|RES-101-A-G|W2F-16|1
-VFNARNRIIGGDTEAFDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|3
-RHWEPDGKLR
->ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
-GQQPCTVAEGRCLTCEPGWNRTK
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1
-MKEEAAA
->CIRC-ENST00000614167.2-E2-E1|W2F-2|1
-HFEPDGKLR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
-MFKFMAR
+>ENST00000614167.2|INDEL-399-A-AC|1 ENST00000614167.2|INDEL-399-A-AC|1
+ARHETLCC
+>ENST00000622235.5|RES-202-G-A|4 ENST00000622235.5|RES-202-G-A|4
+CLTCEPGWNR
+>ENST00000622235.5|SNV-100-G-T|6 ENST00000622235.5|SNV-100-G-T|6
+MEGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|2
+ERLYLCGVTGSPTENCAK
+>ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
+MEGAGPRGAVPARR
+>CIRC-ENST00000614167.2-E2-E1|7
+AGASVPLR
+>ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2
+IIGGDTEAWDVQVHGQKIK
 >ENST00000622235.5|SNV-100-G-T|1 ENST00000622235.5|SNV-100-G-T|1
 GAVPARRR
+>ENST00000622235.5|RES-202-G-A|5 ENST00000622235.5|RES-202-G-A|5
+GQQPCTVAEGRCLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|4
+RHWEPDGK
 >CIRC-ENST00000614167.2-E2-E1|5
 AGASVPLRRHWEPDGK
->ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5
-NRIIGGDTEAWDVQVHGQK
+>ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|2
+NTLSGMQKFMGEDUNFHER
 >CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|1
 LYLCGVTGSPTENCAK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
-ERERLYLCGVTGSPTENCAK
->ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
-VFNARNRIIGGDTEAWDVQVHGQK
->CIRC-ENST00000614167.2-E2-E1|19
-AVALPRDLRQDANLAK
+>ENST00000614167.2|SNV-1179-G-T|2
+RIQGARATLUF
+>CIRC-ENST00000614167.2-E2-E1|14
+MKPLVVDISER
+>ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2
+GAVPARR
 >ENST00000622235.5|RES-202-G-A|1 ENST00000622235.5|RES-202-G-A|1
 TKCDQPCATGFYGEGCSHRCPPCR
->CIRC-ENST00000614167.2-E2-E1|7
-AGASVPLR
->ENST00000614167.2|RES-101-A-G|2 ENST00000614167.2|RES-101-A-G|2
-IIGGDTEAWDVQVHGQKIK
->ENST00000622235.5|RES-202-G-A|W2F-19|2 ENST00000622235.5|RES-202-G-A|W2F-19|2
-GQQPCTVAEGRCLTCEPGFNR
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
-MNNLVVDISERAGASVPLRR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
-LPPCDEFVGAR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|5
-HETLWLLTFPRER
->ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
-EGAGPRGAVPARR
->ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4
-NRIIGGDTEAWDVQVHGQKIK
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
-ARHETLWLLTFPR
+>ENST00000614167.2|SNV-1179-G-T|1
+IQGARATLUF
+>ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
+EATEKARHE
+>CIRC-ENST00000614167.2-E2-E1|1
+HWEPDGKLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|3
+ERERLYLCGVTGSPTENCAK
+>ENST00000614167.2|RES-101-A-G|3 ENST00000614167.2|RES-101-A-G|3
+IIGGDTEAWDVQVHGQK
 >FUSION-ENST00000624155.2:92-ENST00000622235.5:968|3
 AAGAVFLALSAQLLQARLMKEEAAA
->CIRC-ENST00000614167.2-E2-E1|10
-MKPLVVDISERAGASVPLRR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|1
-FKFMARR
->CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
-MFKFMARR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|9
+EATEKARHETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
+MNNLVVDISER
+>CIRC-ENST00000614167.2-E2-E1|22
+MAVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|2
+MNNLVVDISERAGASVPLRR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7
+ARHETLWLLTFPRER
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6
+LPPCDEFVGARRSK
 >CIRC-ENST00000642151.1-E1-E2|2
 RSTSGPSK
+>CIRC-ENST00000614167.2-E2-E1|10
+MKPLVVDISERAGASVPLRR
+>CIRC-ENST00000614167.2-E2-E1|2
+HWEPDGK
+>ENST00000614167.2|RES-101-A-G|6 ENST00000614167.2|RES-101-A-G|6
+VFNARNRIIGGDTEAWDVQVHGQK
+>CIRC-ENST00000614167.2-E2-E1|19
+AVALPRDLRQDANLAK
 >CIRC-ENST00000614167.2-E2-E1|15
 EATEKARHETFGC
->ENST00000622235.5|SNV-100-G-T|4 ENST00000622235.5|SNV-100-G-T|4
-MEGAGPRGAVPARR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|1
-ARHETLFLLTFPRER
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
-SKHTVVAYK
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
-MNNLVVDISERAGASVPLR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|W2F-1|1 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|W2F-1|1
-FRRLPPCDEFVGAR
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|1
-NTLSGMQKFMGED
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
-WRRLPPCDEFVGAR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|9 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|9
+RLPPCDEFVGARR
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|2
+MFKFMARR
+>ENST00000614167.2|RES-101-A-G|5 ENST00000614167.2|RES-101-A-G|5
+NRIIGGDTEAWDVQVHGQK
+>CIRC-ENST00000614167.2-E2-E1|9
+KPLVVDISERAGASVPLRR
 >FUSION-ENST00000614167.2:831-ENST00000642151.1:29|12 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|12
 LESTTRKVHR
->ENST00000614167.2|RES-101-A-G|1 ENST00000614167.2|RES-101-A-G|1
-IIGGDTEAWDVQVHGQKIKEATEK
->ENST00000614167.2|RES-101-A-G|W2F-9|1 ENST00000614167.2|RES-101-A-G|W2F-9|1
-IIGGDTEAFDVQVHGQKIKEATEK
->ENST00000614167.2|RES-101-A-G|W2F-9|2 ENST00000614167.2|RES-101-A-G|W2F-9|2
-IIGGDTEAFDVQVHGQKIK
->ENST00000614167.2|RES-101-A-G|W2F-11|2 ENST00000614167.2|RES-101-A-G|W2F-11|2
-NRIIGGDTEAFDVQVHGQK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
-SKHTVVAYKDAI
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|4
+MNNLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|4
+HETLWLLTFPRERER
 >CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|5
 NNLVVDISER
->ENST00000622235.5|RES-202-G-A|W2F-8|1 ENST00000622235.5|RES-202-G-A|W2F-8|1
-CLTCEPGFNRTK
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|3 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|3
+SKHTVVAYKDAI
+>CIRC-ENST00000614167.2-E2-E1|20
+MAVALPRDLRQDANLAK
+>ENST00000622235.5|RES-202-G-A|3 ENST00000622235.5|RES-202-G-A|3
+CLTCEPGWNRTK
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|6
+HETLWLLTFPR
 >CIRC-ENST00000614167.2-E2-E1|6
 AGASVPLRR
->ENST00000622235.5|SNV-100-G-T|2 ENST00000622235.5|SNV-100-G-T|2
-GAVPARR
->CIRC-ENST00000614167.2-E2-E1|14
-MKPLVVDISER
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|7
-ARHETLWLLTFPRER
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|5 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|5
+RSKHTVVAYK
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
+LMKEEAAA
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|7 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|7
+LPPCDEFVGARR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|8 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|8
+LPPCDEFVGAR
+>CIRC-ENST00000614167.2-E2-E1|18
+MFKFMTR
+>CIRC-ENST00000614167.2-E2-E1|21
+AVALPRDLR
+>CIRC-ENST00000614167.2-E2-E1|16
+FKFMTRR
+>ENST00000622235.5|SNV-100-G-T|5 ENST00000622235.5|SNV-100-G-T|5
+EGAGPRGAVPAR
+>CIRC-ENST00000614167.2-E2-E1|12
+MKPLVVDISERAGASVPLR
+>ENST00000622235.5|RES-202-G-A|6 ENST00000622235.5|RES-202-G-A|6
+GQQPCTVAEGRCLTCEPGWNR
+>CIRC-ENST00000642151.1-E1-E2|3
+PFRRSTSGPSK
 >CIRC-ENST00000614167.2-E2-E1|13
 KPLVVDISER
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|10 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|10
-RLPPCDEFVGAR
->CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|W2F-7|2
-ARHETLFLLTFPR
->ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
-EATEKARHETLCC
+>CIRC-ENST00000614167.2-E2-E1|23
+MAVALPR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|4 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|4
+SKHTVVAYK
+>CIRC-ENST00000614167.2-E2-E1|RES-101-A-G|3
+MFKFMAR
+>CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|1
+NNLVVDISERAGASVPLRR
+>ENST00000614167.2|RES-101-A-G|4 ENST00000614167.2|RES-101-A-G|4
+NRIIGGDTEAWDVQVHGQKIK
+>ENST00000622235.5|SNV-100-G-T|3 ENST00000622235.5|SNV-100-G-T|3
+EGAGPRGAVPARR
+>FUSION-ENST00000624155.2:92-ENST00000622235.5:968|1
+MKEEAAA
+>ENST00000622235.5|RES-202-G-A|2 ENST00000622235.5|RES-202-G-A|2
+TKCDQPCATGFYGEGCSHR
+>CIRC-ENST00000614167.2-E2-E1|8
+ARHETFGC
 >CIRC-ENST00000642151.1-E1-E2|4
 MPFRRSTSGPSK
->ENST00000614167.2|SNV-1179-G-T|2
-RIQGARATLUF
->ENST00000614167.2|MNV-399-ACC-TAA|1 ENST00000614167.2|MNV-399-ACC-TAA|1
-EATEKARHE
+>ENST00000614167.2|INDEL-399-A-AC|2 ENST00000614167.2|INDEL-399-A-AC|2
+EATEKARHETLCC
+>CIRC-ENST00000614167.2-E2-E1|INDEL-399-A-AC|8
+ARHETLWLLTFPR
+>CIRC-ENST00000614167.2-E2-E1|3
+RHWEPDGKLR
 >CIRC-ENST00000642151.1-E1-E2|1
 STSGPSK
->FUSION-ENST00000624155.2:92-ENST00000622235.5:968|2
-LMKEEAAA
->CIRC-ENST00000614167.2-E2-E1|9
-KPLVVDISERAGASVPLRR
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|6 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|6
-LPPCDEFVGARRSK
->CIRC-ENST00000614167.2-E2-E1|W2F-2|2
-HFEPDGK
->FUSION-ENST00000614167.2:831-ENST00000642151.1:29|2 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|2
-HTVVAYK
->CIRC-ENST00000614167.2-E2-E1|MNV-399-ACC-TAA|6
-MNNLVVDISER
->ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|SECT-645|2
-QSDNDVRNTLSGMQKFMGED
 >ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1 ENST00000614167.2|SNV-611-G-A|SNV-612-A-T|1
 NTLSGMQKFMGEDUNFHERK
->CIRC-ENST00000614167.2-E2-E1|21
-AVALPRDLR
->CIRC-ENST00000614167.2-E2-E1|11
-KPLVVDISERAGASVPLR
+>CIRC-ENST00000614167.2-E2-E1|17
+MFKFMTRR
+>FUSION-ENST00000614167.2:831-ENST00000642151.1:29|11 FUSION-ENST00000614168.2:831-ENST00000642151.1:29|11
+WRRLPPCDEFVGAR
diff --git a/test/files/vep/vep_gINDEL.gvf b/test/files/vep/vep_gINDEL.gvf
index d1269c9f..0a64ea98 100644
--- a/test/files/vep/vep_gINDEL.gvf
+++ b/test/files/vep/vep_gINDEL.gvf
@@ -1,5 +1,5 @@
 ##fileformat=VCFv4.2
-##mopepgen_version=0.12.0
+##mopepgen_version=1.0.1
 ##parser=parseVEP
 ##reference_index=
 ##genome_fasta=/home/chenghaozhu/projects/private-moPepGen/test/files/genome.fasta
diff --git a/test/integration/test_generate_index.py b/test/integration/test_generate_index.py
index 90acf801..845f035e 100644
--- a/test/integration/test_generate_index.py
+++ b/test/integration/test_generate_index.py
@@ -1,9 +1,11 @@
 """ Test the command line interface """
 import argparse
 import subprocess as sp
+import os
 import sys
 from test.integration import TestCaseIntegration
-from moPepGen import cli
+from moPepGen import cli, aa
+from moPepGen.gtf import GenomicAnnotation, GenomicAnnotationOnDisk
 
 
 class TestGenerateIndex(TestCaseIntegration):
@@ -31,6 +33,7 @@ def test_generate_index_case1(self):
         args.genome_fasta = self.data_dir / 'genome.fasta'
         args.annotation_gtf = self.data_dir / 'annotation.gtf'
         args.proteome_fasta = self.data_dir / 'translate.fasta'
+        args.gtf_symlink = False
         args.reference_source = None
         args.invalid_protein_as_noncoding = False
         args.cleavage_rule = 'trypsin'
@@ -43,6 +46,35 @@ def test_generate_index_case1(self):
         args.output_dir.mkdir(parents=False, exist_ok=True)
         cli.generate_index(args)
         files = {str(file.name) for file in args.output_dir.glob('*')}
-        expected = {'genome.pkl', 'proteome.pkl', 'annotation.dat',
+        expected = {'genome.pkl', 'proteome.pkl',
+            'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx',
             'canonical_peptides.pkl', 'coding_transcripts.pkl'}
         self.assertEqual(files, expected)
+
+class TestCaseGenomicAnnotationOnDisk(TestCaseIntegration):
+    """ Test case for GenomicAnnotationOnDisk """
+    def test_generate_index(self):
+        """ Test generate index """
+        proteome = aa.AminoAcidSeqDict()
+        proteome.dump_fasta(self.data_dir/'translate.fasta')
+        gtf_file = self.data_dir/'annotation.gtf'
+        gtf_file2 = self.work_dir/'annotation.gtf'
+        os.symlink(gtf_file, gtf_file2)
+        cli.index_gtf(gtf_file2, proteome=proteome)
+        expect = {
+            'annotation.gtf', 'annotation_gene.idx', 'annotation_tx.idx'
+        }
+        received = {str(file.name) for file in self.work_dir.glob('*')}
+        self.assertEqual(received, expect)
+
+
+    def test_load_index(self):
+        """ test load index """
+        anno1 = GenomicAnnotation()
+        anno1.dump_gtf(self.data_dir/'annotation.gtf')
+
+        anno2 = GenomicAnnotationOnDisk()
+        anno2.load_index(self.data_dir/'annotation.gtf')
+
+        self.assertEqual(anno1.genes.keys(), anno2.genes.keys())
+        self.assertEqual(anno1.transcripts.keys(), anno2.transcripts.keys())