From a90fd9fff03e2c411ab823b80b155905e6efbcbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:16:11 +0100 Subject: [PATCH 01/20] added option to expose trimming and subsampling, fixed error on paired fastq files --- .../anno/transcriptomic_annotation/star.py | 417 +++++++++++++----- 1 file changed, 310 insertions(+), 107 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index b9d7f42..56c181a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -33,7 +33,7 @@ import re import shutil import subprocess -from typing import List +from typing import Dict, List from ensembl.tools.anno.utils._utils import ( check_exe, @@ -43,7 +43,7 @@ ) -def run_star( +def run_star( # pylint:disable=too-many-branches genome_file: Path, output_dir: Path, short_read_fastq_dir: Path, @@ -51,6 +51,11 @@ def run_star( trim_fastq: bool = False, max_reads_per_sample: int = 0, max_intron_length: int = 100000, + subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = False, num_threads: int = 1, star_bin: Path = Path("star"), samtools_bin: Path = Path("samtools"), @@ -72,6 +77,20 @@ def run_star( :type max_reads_per_sample: int, default 0 :param max_intron_length: The maximum intron size for alignments. Defaults to 100000. :type max_intron_length: int, default 100000 + :param subsample_read_limit: Maximum number of reads to subsample. Defaults to 100000000. + :type subsample_read_limit:int, default 100000000, + :param subsample_percentage: Maximun percentage of reads to subsample. + :type subsample_percentage: int, default 0.25, + :param sampling_via_read_limit: If True will subsample an input dataset of \ + fastq files using --subsample_read_limit value. + :type sampling_via_read_limit : boolean, False, + :param sampling_via_percentage: If True will subsample an input dataset of \ + fastq files using --subsample_percentage value. + :type sampling_via_percentage : boolean, False, + :param sampling_via_read_limit_percentage: If True will subsample an input dataset \ + of fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. + :type sampling_via_read_limit_percentage : boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 :param star_bin: Software path. 
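[Editor's note - illustrative sketch, not part of the patch] The three new sampling flags in this hunk select an effective sample size; the standalone helper below mirrors the selection logic this patch adds to _subsample_paired_fastq_files further down. The function name effective_sample_size, the total_reads argument, and the keep-everything fallback when no flag is set are invented here for illustration only.

    def effective_sample_size(
        total_reads: int,
        subsample_read_limit: int = 100000000,
        subsample_percentage: float = 0.25,
        sampling_via_read_limit: bool = False,
        sampling_via_percentage: bool = False,
        sampling_via_read_limit_percentage: bool = True,
    ) -> int:
        """Number of reads that would be kept for a file containing total_reads reads."""
        sampling_size = 0
        if sampling_via_read_limit and subsample_read_limit:
            # Hard cap on the number of reads.
            sampling_size = subsample_read_limit
        elif sampling_via_percentage and subsample_percentage:
            # Keep a fixed fraction of the input.
            sampling_size = round(total_reads * subsample_percentage)
        elif sampling_via_read_limit_percentage and subsample_percentage and subsample_read_limit:
            # Take whichever of the two limits gives fewer reads.
            sampling_size = min(subsample_read_limit, round(total_reads * subsample_percentage))
        # As in the patch: if the file already has fewer reads than the target, keep them all.
        return min(total_reads, sampling_size) if sampling_size else total_reads

    # Example: 200M input reads with the defaults keeps min(100000000, 50000000) = 50M reads.
    print(effective_sample_size(200000000))
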
@@ -107,7 +126,7 @@ def run_star( return star_index_file = star_dir / "SAindex" - fastq_file_list = [] + fastq_file_list: List[Path] = [] file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz") fastq_file_list = [ path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) @@ -119,14 +138,26 @@ def run_star( # fastq_file_list.extend(glob.glob(os.path.join(short_read_fastq_dir, file_type))) # Get list of paired paths - fastq_file_list = _create_paired_paths(fastq_file_list) + paired_fastq_file_list = _create_paired_paths(fastq_file_list) # Subsamples in parallel if there's a value set if max_reads_per_sample: - subsample_transcriptomic_data(fastq_file_list) + paired_fastq_file_list_sub: List[List[Path]] = [] + for paired_fastq_files in paired_fastq_file_list: + paired_fastq_files_sub = subsample_transcriptomic_data( + paired_fastq_files, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + paired_fastq_file_list_sub.append(paired_fastq_files_sub) + paired_fastq_file_list = paired_fastq_file_list_sub # Get the list of the new subsampled files - fastq_file_list = [ - path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) - ] + # fastq_file_list = [ + # path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) + # ] # I don't think is needed # fastq_file_list = check_for_fastq_subsamples(fastq_file_list) @@ -159,23 +190,27 @@ def run_star( str(genome_file), ] ) - except Exception as e:#pylint:disable=broad-exception-caught + except Exception as e: # pylint:disable=broad-exception-caught logging.error("An error occurred while creating star index: %s", e) logging.info("Running Star on the files in the fastq dir") - for fastq_file in fastq_file_list: + for paired_files in paired_fastq_file_list: + first_file_name = paired_files[0].name # Get the name of the first file + match = re.search(r"(.+)_\d+\.(fastq|fq)", first_file_name) # Search for pattern + if match: + first_part_of_name = match.group(1) # logger.info(fastq_file_path) # fastq_file_name = os.path.basename(fastq_file_path) star_tmp_dir = star_dir / "tmp" if star_tmp_dir.exists(): shutil.rmtree(star_tmp_dir) - sam_file = Path(f"{star_dir}/{fastq_file.name}.sam") - junctions_file = Path(f"{star_dir}/{fastq_file.name}.sj.tab") + sam_file = Path(f"{star_dir}/{first_part_of_name}.sam") + junctions_file = Path(f"{star_dir}/{first_part_of_name}.sj.tab") sam_file_name = sam_file.name sam_temp_file = Path(f"{star_dir}/{sam_file_name}.tmp") bam_file = re.sub(".sam", ".bam", sam_file_name) bam_sort_file = Path(f"{star_dir}/{bam_file}") - log_out_file = Path(f"{star_dir}/{fastq_file.name}.Log.final.out") + log_out_file = Path(f"{star_dir}/{first_part_of_name}.Log.final.out") if log_out_file.exists() and bam_sort_file.exists() and bam_sort_file.stat().st_size != 0: logging.info( "Found an existing bam file for the fastq file, \ @@ -183,7 +218,9 @@ def run_star( ) continue - logging.info("Processing %s", fastq_file) + read_files_in = ",".join(str(fastq_file) for fastq_file in paired_files) + + logging.info("Processing %s", list(paired_files)) star_command = [ str(star_bin), "--outFilterIntronMotifs", @@ -199,7 +236,7 @@ def run_star( "--genomeDir", str(star_dir), "--readFilesIn", - str(fastq_file), + read_files_in, "--outFileNamePrefix", f"{star_dir}/", "--outTmpDir", @@ -212,7 +249,7 @@ def run_star( 
#'--outSJfilterIntronMaxVsReadN','5000','10000','25000','40000', #'50000','50000','50000','50000','50000','100000'] # check_compression = re.search(r".gz$", fastq_file) - if fastq_file.suffix.endswith(".gz"): + if Path(paired_files[0].name).suffix.endswith(".gz"): star_command.append("--readFilesCommand") star_command.append("gunzip") star_command.append("-c") @@ -238,7 +275,7 @@ def run_star( logging.info("Completed running STAR") -def _create_paired_paths(fastq_file_paths: List) -> List[Path]: +def _create_paired_paths(fastq_file_paths: List[Path]) -> List[List[Path]]: """ Create list of paired transcriptomic fastq files @@ -246,30 +283,28 @@ def _create_paired_paths(fastq_file_paths: List) -> List[Path]: fastq_file_paths (List): List of transcriptomic file paths. Returns: - List: List of paired transcriptomic files + List[List[Path]]: List of paired transcriptomic files """ - path_dict = {} - # final_list = [] + path_dict: Dict[str, List[Path]] = {} + final_list: List[List[Path]] = [] for fastq_file in fastq_file_paths: paired_name = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_file)) if not paired_name: logging.exception( "Could not find _1 or _2 at the end of the prefix \ for file. Assuming file is not paired: %s", - fastq_file, + str(fastq_file), ) - # final_list.append([fastq_file]) - path_dict[fastq_file] = [fastq_file] + final_list.append([fastq_file]) continue run_accession = paired_name.group(1) if run_accession in path_dict: path_dict[run_accession].append(fastq_file) else: path_dict[run_accession] = [fastq_file] - # for pair in path_dict: - # final_list.append(path_dict[pair]) - logging.info([value for values_list in path_dict.values() for value in values_list]) - return [value for values_list in path_dict.values() for value in values_list] + for pair in path_dict: # pylint:disable=consider-using-dict-items + final_list.append(path_dict[pair]) + return final_list # pylint:disable=pointless-string-statement @@ -279,22 +314,108 @@ def _create_paired_paths(fastq_file_paths: List) -> List[Path]: """ -def _subsample_paired_fastq_files( +def subsample_transcriptomic_data( + fastq_file_list: List[Path], + subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = True, + num_threads: int = 2, +) -> List[Path]: + """ + Subsample list of paired files. + Args: + fastq_file_list : Subsample paired fastq files. + subsample_read_limit : Maximum number of reads to subsample, default to 100000000. + subsample_percentage : Maximun percentage of reads to subsample, default to 0.25. + sampling_via_read_limit : If True will subsample an input dataset of fastq files \ + using --subsample_read_limit value. + sampling_via_percentage : If True will subsample an input dataset of fastq files \ + using --subsample_percentage value. + sampling_via_read_limit_percentage : If True will subsample an input dataset of \ + fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. 
+ num_threads : number of threads + Returns: + List[Path]: List of subsampled paired transcriptomic files + """ + subsampled_fastq_files: List[Path] = [] + # for fastq_files in fastq_file_list: + # fastq_file_1, fastq_file_2 = fastq_files + # fastq_file_pair = "" + # if len(fastq_files) == 2: + # fastq_file_pair = fastq_files[1] + + if len(fastq_file_list) == 1: + fastq_file_1 = fastq_file_list[0] + if Path(f"{fastq_file_1}.sub").exists(): + logging.info( + "Found an existing .sub file on the fastq path, will use that \ + instead. File:%s.sub", + fastq_file_1, + ) + else: + _subsample_paired_fastq_files( + fastq_file_list, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + subsampled_fastq_files = [Path(f"{fastq_file_1}.sub")] + if len(fastq_file_list) == 2: + fastq_file_1, fastq_file_2 = fastq_file_list + if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists(): + logging.info( + "Found an existing .sub files on the fastq path for both members of the pair, will use \ + those instead of subsampling again. Files: %s.sub,%s.sub", + fastq_file_1, + fastq_file_2, + ) + else: + _subsample_paired_fastq_files( + fastq_file_list, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + subsampled_fastq_files = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")] + return subsampled_fastq_files + + +def _subsample_paired_fastq_files( # pylint:disable=too-many-branches fastq_files: List[Path], subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = True, num_threads: int = 2, - compressed: bool = False, ) -> None: """ Perform subsampling on two paired FastQ files in parallel using multiple threads. Args: fastq_files : Path for paired fastq files. - output_files : Path for the output file. - subsample_read_limit : Subsample size, defaults to 100000000. + subsample_read_limit : Maximum number of reads to subsample, default to 100000000. + subsample_percentage : Maximun percentage of reads to subsample, default to 0.25. + sampling_via_read_limit : If True will subsample an input dataset of fastq files \ + using --subsample_read_limit value. + sampling_via_percentage : If True will subsample an input dataset of fastq files \ + using --subsample_percentage value. + sampling_via_read_limit_percentage : If True will subsample an input dataset of \ + fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. num_threads : Number of threads, defaults to 2. compressed : file compressed, defaults to False. 
""" + if len(fastq_files) == 2: fastq_file_1, fastq_file_2 = fastq_files output_file_1, output_file_2 = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")] @@ -308,42 +429,68 @@ def _subsample_paired_fastq_files( compressed = True num_lines = sum(1 for line in gzip.open(fastq_file_1)) # pylint:disable=consider-using-with else: + compressed = False num_lines = sum(1 for line in open(fastq_file_1)) # pylint:disable=consider-using-with range_limit = int(num_lines / 4) - if range_limit <= subsample_read_limit: + sampling_size = 0 + if sampling_via_read_limit and subsample_read_limit: + sampling_size = subsample_read_limit + elif sampling_via_percentage and subsample_percentage: + sampling_size = round(range_limit * subsample_percentage) + elif sampling_via_read_limit_percentage and subsample_percentage and subsample_read_limit: + sampling_size = min(subsample_read_limit, round(range_limit * subsample_percentage)) + + if range_limit <= sampling_size: logging.info( "Number of reads (%s is less than the max allowed read count (%s), \ no need to subsample", str(range_limit), - str(subsample_read_limit), + str(sampling_size), ) return - rand_list = random.sample(range(0, range_limit - 1), subsample_read_limit) + rand_list = random.sample(range(0, range_limit - 1), sampling_size) random_indices = {idx * 4: 1 for idx in rand_list} logging.info("Processing paired files in parallel") - pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - pool.apply_async( - _subsample_fastq_subset, - args=( + if num_threads >= 2: + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + pool.apply_async( + _subsample_fastq_subset, + args=( + fastq_file_1, + output_file_1, + random_indices, + compressed, + ), + ) + if len(fastq_files) == 2: + pool.apply_async( + _subsample_fastq_subset, + args=( + fastq_file_2, + output_file_2, + random_indices, + compressed, + ), + ) + + pool.close() + pool.join() + else: + _subsample_fastq_subset( fastq_file_1, output_file_1, random_indices, compressed, - ), - ) - pool.apply_async( - _subsample_fastq_subset, - args=( - fastq_file_2, - output_file_2, - random_indices, - compressed, - ), - ) - pool.close() - pool.join() + ) + if len(fastq_files) == 2: + _subsample_fastq_subset( + fastq_file_2, + output_file_2, + random_indices, + compressed, + ) def _subsample_fastq_subset( @@ -351,7 +498,6 @@ def _subsample_fastq_subset( ) -> None: """ Selecting specific sets of four lines from an input FastQ file and writing them to an output file. - Args: fastq_file : Path for the fastq file. output_file : Path for the output file. @@ -371,43 +517,6 @@ def _subsample_fastq_subset( lines = [file_in.readline() for _ in range(4)] -def subsample_transcriptomic_data(fastq_file_list: List, num_threads: int = 2) -> None: - """ - Subsample paired fastq files. - - Args: - fastq_file_list : List of fastq file path to process. - num_threads : number of threads - """ - for fastq_files in fastq_file_list: - fastq_file_1, fastq_file_2 = fastq_files - # fastq_file_pair = "" - # if len(fastq_files) == 2: - # fastq_file_pair = fastq_files[1] - - if len(fastq_files) == 1: - fastq_file_1 = fastq_files[0] - if Path(f"{fastq_file_1}.sub").exists(): - logging.info( - "Found an existing .sub file on the fastq path, will use that instead. 
File:%s.sub", - fastq_file_1, - ) - else: - _subsample_paired_fastq_files(fastq_files, compressed=True, num_threads=num_threads) - - elif len(fastq_files) == 2: - fastq_file_1, fastq_file_2 = fastq_files - if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists(): - logging.info( - "Found an existing .sub files on the fastq path for both members of the pair, will use \ - those instead of subsampling again. Files: %s.sub,%s.sub", - fastq_file_1, - fastq_file_2, - ) - elif Path(f"{fastq_file_2}.sub").exists(): - _subsample_paired_fastq_files(fastq_files, compressed=True, num_threads=num_threads) - - def run_trimming( output_dir: Path, short_read_fastq_dir: Path, @@ -427,12 +536,12 @@ def run_trimming( check_exe(trim_galore_bin) trim_dir = create_dir(output_dir, "trim_galore_output") - fastq_file_list = [] + fastq_file_list: List[Path] = [] file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz") fastq_file_list = [ path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) ] - fastq_file_list = _create_paired_paths(fastq_file_list) + paired_fastq_file_list = _create_paired_paths(fastq_file_list) trim_galore_cmd = [ str(trim_galore_bin), @@ -446,9 +555,11 @@ def run_trimming( ] pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - for fastq_file in fastq_file_list: + for fastq_file in paired_fastq_file_list: + file1, file2 = fastq_file if delete_pre_trim_fastq: - fastq_file.unlink() + file1.unlink() + file2.unlink() pool.apply_async( multiprocess_trim_galore, args=( @@ -527,6 +638,75 @@ def parse_args(): parser.add_argument("--star_bin", default="STAR", help="Star software path") parser.add_argument("--samtools_bin", default="samtools", help="Samtools software path") parser.add_argument("--trim_galore_bin", default="trim_galore", help="Trim Galore software path") + parser.add_argument( + "--subsample_read_limit", + type=int, + default=100000000, + help="Maximum number of reads to subsample. 
Default 1 hundred million reads", + required=False, + ) + parser.add_argument( + "--subsample_percentage", + type=float, + default=0.25, + help="Maximun percentage of reads to subsample (0 to 1)", + required=False, + ) + parser.add_argument( + "--sampling_via_read_limit", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files using --subsample_read_limit value.", + required=False, + ) + parser.add_argument( + "--sampling_via_percentage", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files using --subsample_percentage value.", + required=False, + ) + parser.add_argument( + "--sampling_via_read_limit_percentage", + type=bool, + default=True, + help="If True will subsample an input dataset of fastq files using --subsample_read_limit \ + and --subsample_percentage value; the lowest number of reads is taken.", + required=False, + ) + parser.add_argument( + "--paired_file_1", + help="Path for single or paired fastq file; used when --run_subsampling \ + or --run_trimming are enabled.", + required=False, + ) + parser.add_argument( + "--paired_file_2", + help="Path for single or paired fastq file; used when --run_subsampling \ + or --run_trimming are enabled.", + required=False, + ) + parser.add_argument( + "--run_star", + type=bool, + default=True, + help="If True will run STAR alignment given an input dataset of fastq files.", + required=False, + ) + parser.add_argument( + "--run_subsampling", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files.", + required=False, + ) + parser.add_argument( + "--run_trimming", + type=bool, + default=False, + help="If True will trim input dataset of fastq files using TrimGalore suite.", + required=False, + ) return parser.parse_args() @@ -542,20 +722,43 @@ def main(): defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False, ) - - run_star( - args.genome_file, - args.output_dir, - args.short_read_fastq_dir, - args.delete_pre_trim_fastq, - args.trim_fastq, - args.max_reads_per_sample, - args.max_intron_length, - args.num_threads, - args.star_bin, - args.samtools_bin, - args.trim_galore_bin, - ) + if args.run_star: + run_star( + args.genome_file, + args.output_dir, + args.short_read_fastq_dir, + args.delete_pre_trim_fastq, + args.trim_fastq, + args.max_reads_per_sample, + args.max_intron_length, + args.subsample_read_limit, + args.subsample_percentage, + args.sampling_via_read_limit, + args.sampling_via_percentage, + args.sampling_via_read_limit_percentage, + args.num_threads, + args.star_bin, + args.samtools_bin, + args.trim_galore_bin, + ) + if args.run_subsampling: + fastq_file_list = [Path(args.paired_file_1), Path(args.paired_file_2)] + subsample_transcriptomic_data( + fastq_file_list, + args.subsample_read_limit, + args.subsample_percentage, + args.sampling_via_read_limit, + args.sampling_via_percentage, + args.num_threads, + ) + if args.run_trimming: + run_trimming( + args.output_dir, + args.short_read_fastq_dir, + args.delete_pre_trim_fastq, + args.num_threads, + args.trim_galore_bin, + ) if __name__ == "__main__": From 160735cfd344a1ea5937f0de0845c65dd0701725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:22:14 +0100 Subject: [PATCH 02/20] sphinx fixed --- .../tools/anno/transcriptomic_annotation/star.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py 
b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 56c181a..10261a3 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -63,6 +63,7 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. + :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. @@ -81,15 +82,11 @@ def run_star( # pylint:disable=too-many-branches :type subsample_read_limit:int, default 100000000, :param subsample_percentage: Maximun percentage of reads to subsample. :type subsample_percentage: int, default 0.25, - :param sampling_via_read_limit: If True will subsample an input dataset of \ - fastq files using --subsample_read_limit value. + :param sampling_via_read_limit: subsample fastq files using --subsample_read_limit. :type sampling_via_read_limit : boolean, False, - :param sampling_via_percentage: If True will subsample an input dataset of \ - fastq files using --subsample_percentage value. + :param sampling_via_percentage: subsample fastq files using --subsample_percentage. :type sampling_via_percentage : boolean, False, - :param sampling_via_read_limit_percentage: If True will subsample an input dataset \ - of fastq files using --subsample_read_limit and --subsample_percentage value; \ - the lowest number of reads is taken. + :param sampling_via_read_limit_percentage: use max read limit and percentage value. :type sampling_via_read_limit_percentage : boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 From 142fe6646014f892d2b26245d8f74f3e2e8472bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:29:09 +0100 Subject: [PATCH 03/20] sphinx fixed --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 10261a3..5449f5f 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -18,6 +18,7 @@ in genomics research for aligning RNA-seq data to a reference genome. Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 + """ __all__ = ["run_star"] From 23af96214fac01ba3af3b591ac188d13b425a413 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:36:23 +0100 Subject: [PATCH 04/20] sphinx fixed, exposed all 3 functions --- .../tools/anno/transcriptomic_annotation/star.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 5449f5f..f7b62c3 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,16 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ -The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used -in genomics research for aligning RNA-seq data to a reference genome. -Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. -Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 - + The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used + in genomics research for aligning RNA-seq data to a reference genome. + Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. + Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 """ - -__all__ = ["run_star"] +__all__ = ["run_star", "subsample_transcriptomic_data", "run_trimming"] import argparse import logging From 1148f325856756e81dcb84a0459fa6efe462ec98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:40:37 +0100 Subject: [PATCH 05/20] sphinx fixed --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index f7b62c3..279c969 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -61,7 +61,6 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. - :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. From 8c4c39ba05da9f27c1351ec89b91037fc95b47d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:50:52 +0100 Subject: [PATCH 06/20] silenced warnings --- docs/source/conf.py | 7 ++++++- .../ensembl/tools/anno/transcriptomic_annotation/star.py | 9 ++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4f9e850..851f628 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -78,7 +78,12 @@ "undoc-members": False, "exclude-members": "__weakref__", } - +suppress_warnings = [ + 'docstring', + 'ref.citation', # Example: Suppress warnings about citations + 'image.nonlocal_uri', # Example: Suppress warnings about non-local images + # Add other warnings to suppress as needed +] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 279c969..784a242 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,11 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" - The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used - in genomics research for aligning RNA-seq data to a reference genome. - Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. - Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 +"""The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used +in genomics research for aligning RNA-seq data to a reference genome. 
+Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. +Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 """ __all__ = ["run_star", "subsample_transcriptomic_data", "run_trimming"] From 3f1f978945006685b45989b297f07c3ba1d87e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 16:12:25 +0100 Subject: [PATCH 07/20] wrong indentation --- .../tools/anno/transcriptomic_annotation/star.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 784a242..21a105a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -78,12 +78,12 @@ def run_star( # pylint:disable=too-many-branches :type subsample_read_limit:int, default 100000000, :param subsample_percentage: Maximun percentage of reads to subsample. :type subsample_percentage: int, default 0.25, - :param sampling_via_read_limit: subsample fastq files using --subsample_read_limit. - :type sampling_via_read_limit : boolean, False, - :param sampling_via_percentage: subsample fastq files using --subsample_percentage. - :type sampling_via_percentage : boolean, False, + :param sampling_via_read_limit: subsample fastq files using subsample_read_limit. + :type sampling_via_read_limit: boolean, False, + :param sampling_via_percentage: subsample fastq files using subsample_percentage. + :type sampling_via_percentage: boolean, False, :param sampling_via_read_limit_percentage: use max read limit and percentage value. - :type sampling_via_read_limit_percentage : boolean, False, + :type sampling_via_read_limit_percentage: boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 :param star_bin: Software path. 
From ccd416d9b256b1ec02dfbc683cb64008e51c9333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 23:48:22 +0100 Subject: [PATCH 08/20] missing input sampling function --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 21a105a..dfe976a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -742,6 +742,7 @@ def main(): args.subsample_percentage, args.sampling_via_read_limit, args.sampling_via_percentage, + args.sampling_via_read_limit_percentage, args.num_threads, ) if args.run_trimming: From 042bc4b584a4bed45ae4e78130c234cc8fba4498 Mon Sep 17 00:00:00 2001 From: ens-ftricomi Date: Wed, 19 Jun 2024 11:41:47 +0100 Subject: [PATCH 09/20] improved subsampling --- .../anno/transcriptomic_annotation/star.py | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) mode change 100644 => 100755 src/python/ensembl/tools/anno/transcriptomic_annotation/star.py diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py old mode 100644 new mode 100755 index dfe976a..4556a0e --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -418,7 +418,7 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches else: raise FileNotFoundError("No fastq file found") - if fastq_file_1.suffix.endswith(".gz$"): + if fastq_file_1.suffix.endswith(".gz"): compressed = True num_lines = sum(1 for line in gzip.open(fastq_file_1)) # pylint:disable=consider-using-with else: @@ -443,8 +443,8 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches ) return - rand_list = random.sample(range(0, range_limit - 1), sampling_size) - random_indices = {idx * 4: 1 for idx in rand_list} + rand_list = random.sample(range(0, range_limit-1), sampling_size) + random_indices = [idx * 4 for idx in rand_list] logging.info("Processing paired files in parallel") if num_threads >= 2: pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with @@ -498,6 +498,29 @@ def _subsample_fastq_subset( compressed : the files is compressed """ line_index = 0 + read_block = [] + with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open( + output_file, "w+" + ) as file_out: + for line in file_in: + read_block.append(line) + if len(read_block) == 4: + if line_index in random_indices: + file_out.writelines(read_block) + read_block = [] + line_index += 4 + #lines = [file_in.readline() for _ in range(4)] + """ + while lines[3]: + #lines = [file_in.readline() for _ in range(4)] + # Break if the end of the file is reached + if len(lines) < 4 : # No more lines to read + break + # Write to output if current index is in random_indices + if line_index in random_indices: + file_out.writelines(lines) + line_index += 4 + lines = [file_in.readline() for _ in range(4)] with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open( output_file, "w+" @@ -508,7 +531,7 @@ def _subsample_fastq_subset( file_out.writelines(lines) line_index += 4 lines = [file_in.readline() for _ in range(4)] - + """ def run_trimming( output_dir: Path, @@ -609,9 +632,9 @@ def 
multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Pat def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="STAR's arguments") - parser.add_argument("--genome_file", required=True, help="Genome file path") - parser.add_argument("--output_dir", required=True, help="Output directory path") - parser.add_argument("--short_read_fastq_dir", required=True, help="Short read directory path") + parser.add_argument("--genome_file", help="Genome file path") + parser.add_argument("--output_dir", help="Output directory path") + parser.add_argument("--short_read_fastq_dir", help="Short read directory path") parser.add_argument( "--delete_pre_trim_fastq", action="store_true", @@ -682,7 +705,6 @@ def parse_args(): parser.add_argument( "--run_star", type=bool, - default=True, help="If True will run STAR alignment given an input dataset of fastq files.", required=False, ) From 7ee2b595f1ba1ba568179c60ecd33efd9c7ae132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 11:53:12 +0000 Subject: [PATCH 10/20] moved split protein in utils --- .../tools/anno/protein_annotation/genblast.py | 59 +--------- src/python/ensembl/tools/anno/utils/_utils.py | 104 ++++++++++++++++-- 2 files changed, 98 insertions(+), 65 deletions(-) diff --git a/src/python/ensembl/tools/anno/protein_annotation/genblast.py b/src/python/ensembl/tools/anno/protein_annotation/genblast.py index 8652807..5d0275c 100644 --- a/src/python/ensembl/tools/anno/protein_annotation/genblast.py +++ b/src/python/ensembl/tools/anno/protein_annotation/genblast.py @@ -38,18 +38,17 @@ import multiprocessing import os from pathlib import Path -import random import re import shutil import signal import subprocess -from typing import List import argparse from ensembl.tools.anno.utils._utils import ( check_exe, create_dir, check_gtf_content, + split_protein_file ) logger = logging.getLogger(__name__) @@ -65,7 +64,7 @@ def run_genblast(#pylint:disable=dangerous-default-value convert2blastmask_bin: Path = Path("convert2blastmask"), makeblastdb_bin: Path = Path("makeblastdb"), num_threads: int = 1, - protein_set: str = ["uniprot", "orthodb"], + protein_set: str = "uniprot", ) -> None: """ @@ -102,6 +101,8 @@ def run_genblast(#pylint:disable=dangerous-default-value check_exe(genblast_bin) check_exe(convert2blastmask_bin) check_exe(makeblastdb_bin) + if protein_set not in {"uniprot", "orthodb"}: + raise ValueError("protein_set must be either 'uniprot' or 'orthodb'") if protein_set == "uniprot": genblast_dir = create_dir(output_dir, "uniprot_output") elif protein_set == "orthodb": @@ -128,7 +129,7 @@ def run_genblast(#pylint:disable=dangerous-default-value else: _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file) _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file) - batched_protein_files = _split_protein_file( + batched_protein_files = split_protein_file( protein_dataset, genblast_dir, num_threads ) pool = multiprocessing.Pool(num_threads) # pylint:disable=consider-using-with @@ -286,56 +287,6 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None: path.unlink() -def _split_protein_file( - protein_dataset: Path, output_dir: Path, batch_size: int = 20 -) -> List: - """ - The protein dataset file is splitted by a number of sequence equals to the batch_size - in batch files stored in 10 output directories. - protein_dataset : Path for the protein dataset. - output_dir : Output directory path. 
- batch_size : Size of the batch, it needs to be equals to the number of threads - to parallelise the sequence processing for each file. - """ - batched_protein_files = [] - - for i in range(0, 10): - create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset,"r", encoding="utf8") as file_in: - seq_count = 0 - batch_count = 0 - current_record = "" - initial_seq = True - for line in file_in: - match = re.search(r">(.+)$", line) - # match header and is not first sequence, if the number of stored sequences in each file equals - # the number of batch_size, a new file will be created and the current_record reset - if match and not initial_seq and seq_count % batch_size == 0: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batch_count += 1 - seq_count += 1 - current_record = line - batched_protein_files.append(batch_file) - # match header and is the first sequence - elif match: - current_record += line - initial_seq = False - seq_count += 1 - # other lines - else: - current_record += line - - if current_record: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batched_protein_files.append(batch_file) - return batched_protein_files - def _run_convert2blastmask( convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index c0c9786..8f882a5 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -21,6 +21,7 @@ import os from os import PathLike from pathlib import Path +import random import re import subprocess import shutil @@ -81,8 +82,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: Check number of transcript lines in the GTF Arg: - gtf_file: GTF file path - content_obj: Object to detect and count in the gtf i.e transcript, repeat + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat Return: Number of occurences """ @@ -403,6 +404,95 @@ def check_file(file_path: Path) -> None: if not file_path.is_file(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) +def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: + """ + Save fastq sequences in a dictionary + fasta_lines: List[str]: List of sequences + """ + index = {} + header = None + seq = [] + + for line in fasta_lines: + line = line.strip() + if line.startswith(">"): + if header: + index[header] = ''.join(seq) + header_match = re.match(r">(.+)", line) + if header_match: + header = header_match.group(1) + seq = [] + else: + raise ValueError(f"Invalid FASTA header line: {line}") + else: + seq.append(line) + + if header: + index[header] = ''.join(seq) + + return index +""" +def fasta_to_dict(fasta_lines:List[str]): + index = {} + for line in fasta_lines: + #it = iter(fasta_lines) + #for header in it: + match = re.search(r">(.+)\n$", line) + header = match.group(1) + seq = next(line) + index[header] = seq + return index +""" + +def split_protein_file( + protein_dataset: Path, output_dir: Path, batch_size: int = 20 +) -> List: + """ + The protein dataset file is splitted by a number of sequence equals to the batch_size + in batch files stored in 10 output directories. + protein_dataset : Path for the protein dataset. 
+ output_dir : Output directory path. + batch_size : Size of the batch, it needs to be equals to the number of threads + to parallelise the sequence processing for each file. + """ + batched_protein_files = [] + + for i in range(0, 10): + create_dir(output_dir, (f"bin_{i}")) + with open(protein_dataset,"r", encoding="utf8") as file_in: + seq_count = 0 + batch_count = 0 + current_record = "" + initial_seq = True + for line in file_in: + match = re.search(r">(.+)$", line) + # match header and is not first sequence, if the number of stored sequences in each file equals + # the number of batch_size, a new file will be created and the current_record reset + if match and not initial_seq and seq_count % batch_size == 0: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batch_count += 1 + seq_count += 1 + current_record = line + batched_protein_files.append(batch_file) + # match header and is the first sequence + elif match: + current_record += line + initial_seq = False + seq_count += 1 + # other lines + else: + current_record += line + + if current_record: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batched_protein_files.append(batch_file) + return batched_protein_files def load_results_to_ensembl_db( main_script_dir, @@ -1222,15 +1312,7 @@ def read_gtf_genes(gtf_file): return gtf_genes -def fasta_to_dict(fasta_list): - index = {} - it = iter(fasta_list) - for header in it: - match = re.search(r">(.+)\n$", header) - header = match.group(1) - seq = next(it) - index[header] = seq - return index + # def merge_gtf_files(file_paths,id_label): From dab7a14a592c1c6e8009a98e063ef3934d6d9438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 11:53:56 +0000 Subject: [PATCH 11/20] first splitting| --- .../tools/anno/finalise_genset/cpc2.py | 73 ++ .../tools/anno/finalise_genset/diamond.py | 101 +++ .../finalise_genset/finalise_geneset_utils.py | 811 ++++++++---------- .../tools/anno/finalise_genset/rnasamba.py | 59 ++ 4 files changed, 603 insertions(+), 441 deletions(-) create mode 100644 src/python/ensembl/tools/anno/finalise_genset/cpc2.py create mode 100644 src/python/ensembl/tools/anno/finalise_genset/diamond.py create mode 100644 src/python/ensembl/tools/anno/finalise_genset/rnasamba.py diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py new file mode 100644 index 0000000..946bb96 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+check_file(cpc2_output_path) +cpc2_output_path = validation_dir / "cpc2.tsv" + + cpc2_volume = f"{validation_dir}/:/app:rw" + cpc2_cmd = [ + "singularity", + "exec", + "--bind", + str(cpc2_volume), + str(cpc2_bin), + "python3", + "/CPC2_standalone-1.0.1/bin/CPC2.py", + "-i", + str(cdna_file), + "--ORF", + "-o", + str(cpc2_output_path), + ] + logger.info(" ".join(cpc2_cmd)) + subprocess.run(cpc2_cmd, check=True) + cpc2_output_path = f"{str(cpc2_output_path)}.txt" + + +def read_cpc2_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^#ID", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = file_in.readline() + continue + + transcript_id = eles[0] + transcript_length = eles[1] + peptide_length = eles[2] + coding_probability = eles[7] + coding_potential = eles[8] + results.append( + [ + transcript_id, + coding_probability, + coding_potential, + transcript_length, + peptide_length, + ] + ) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py new file mode 100644 index 0000000..c63a533 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/diamond.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + logger.info("diamond validation") + diamond_results = None + if diamond_validation_db is not None: + diamond_output_dir = create_dir(validation_dir, "diamond_output") + diamond_validation( + diamond_validation_db, + amino_acid_file, + diamond_output_dir, + num_threads, + ) + diamond_results = read_diamond_results(diamond_output_dir) + + + def diamond_validation( + diamond_validation_db: Path, + amino_acid_file: Path, + diamond_output_dir: Path, + num_threads: int +)-> None: + + batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) + + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + for batched_protein_file in batched_protein_files: + pool.apply_async( + multiprocess_diamond, + args=( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, + ), + ) + pool.close() + pool.join() + + +def multiprocess_diamond( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, +)->None: + + #batch_num = os.path.splitext(batched_protein_file)[0] + #batch_dir = os.path.dirname(batched_protein_file) + diamond_output_file = f"{batched_protein_file}.dmdout" + logger.info("Running diamond on %s :", batched_protein_file) + + diamond_cmd = [ + "diamond", + "blastp", + "--query", + batched_protein_file, + "--db", + diamond_validation_db, + "--out", + diamond_output_file, + ] + + logger.info(" ".join(diamond_cmd)) + subprocess.run(diamond_cmd) + subprocess.run(["mv", diamond_output_file, diamond_output_dir]) + + +def read_diamond_results(diamond_output_dir): + + results = [] + diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") + for file_path in diamond_files: + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + + eles = line.split("\t") + if not len(eles) == 12: + line = file_in.readline() + continue + + transcript_id = eles[0] + e_value = eles[10] + results.append([transcript_id, e_value]) + line = file_in.readline() + file_in.close() + + return results + \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py index 989da6b..178fbb4 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py +++ b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py @@ -12,210 +12,188 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+""" +Set of functions used to collect the results of the subpipelines and generate the final output +This logic will be revised in the nextflow pipeline +""" +__all__ = ["run_finalise_geneset"] + +import logging +import logging.config +import multiprocessing +import os +from pathlib import Path +import re +import subprocess +from typing import List +import shutil + +from ensembl.tools.anno.utils._utils import ( + check_file, + get_seq_region_length, + fasta_to_dict, + check_exe, + create_dir, + check_gtf_content, + split_protein_file +) + +logger = logging.getLogger(__name__) def run_finalise_geneset( - main_script_dir, - main_output_dir, - genome_file, - seq_region_names, - validation_type, - diamond_validation_db, - num_threads, -): - - if validation_type is None: - logger.info("Setting validation type to relaxed") - else: - logger.info("Setting validation type to " + validation_type) + main_script_dir: Path, + main_output_dir: Path, + genome_file: Path, + seq_region_names: List[str], + diamond_validation_db :Path, + validation_type: str = "relaxed", + num_threads: int = 1, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - final_annotation_dir = utils.create_dir(main_output_dir, "annotation_output") - region_annotation_dir = utils.create_dir(final_annotation_dir, "initial_region_gtfs") - final_region_annotation_dir = utils.create_dir( +): + """Collect results + + :param main_script_dir: + :type main_script_dir: Path + :param main_output_dir: _description_ + :type main_output_dir: Path + :param genome_file: Path for genome fasta file + :type genome_file: Path + :param seq_region_names: list of seq region names + :type seq_region_names: List[str] + :param diamond_validation_db: Path for diamond validation db + :type diamond_validation_db: Path + :param validation_type: type of validation + :type validation_type: str, default "relaxed" + :param num_threads: Num of threads. Defaults to 1. 
+ :type num_threads: int, default 1 + :param cpc2_bin: CPC2 software path + :type cpc2_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + :param rnasamba_bin: RNASamba software path + :type rnasamba_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + :param rnasamba_weights: RNASamba weights path + :type rnasamba_weights: Path, default "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" + + """ + + + final_annotation_dir = create_dir(main_output_dir, "annotation_output") + region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") + final_region_annotation_dir = create_dir( final_annotation_dir, "final_region_gtfs" ) - utr_region_annotation_dir = utils.create_dir(final_annotation_dir, "utr_region_gtfs") - validation_dir = utils.create_dir(final_annotation_dir, "cds_validation") - seq_region_lengths = utils.get_seq_region_lengths(genome_file, 0) - - logger.info("Skip analysis if the gtf file already exists") - output_file = os.path.join(final_annotation_dir, "annotation.gtf") - if os.path.exists(output_file): - logger.info("final_annotation_dir exists") - transcript_count = utils.check_gtf_content(output_file, "transcript") + utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") + validation_dir = create_dir(final_annotation_dir, "cds_validation") + seq_region_lengths = get_seq_region_length(genome_file, 0) + output_file = final_annotation_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "transcript") if transcript_count > 0: - logger.info("Final gtf file exists") + logger.info("Final gtf file exists, skipping analysis") return - else: - logger.info("No gtf file, go on with the analysis") - # This used to be a list of output dirs and a loop which was neat, - # I'm coverting to a list of conditions as - # it's more straightforward with the renaming - # and having to merge scallop and stringtie - protein_annotation_raw = os.path.join( - main_output_dir, "genblast_output", "annotation.gtf" - ) - minimap2_annotation_raw = os.path.join( - main_output_dir, "minimap2_output", "annotation.gtf" - ) - stringtie_annotation_raw = os.path.join( - main_output_dir, "stringtie_output", "annotation.gtf" - ) - scallop_annotation_raw = os.path.join( - main_output_dir, "scallop_output", "annotation.gtf" - ) - busco_annotation_raw = os.path.join(main_output_dir, "busco_output", "annotation.gtf") - - transcript_selector_script = os.path.join( - main_script_dir, "support_scripts_perl", "select_best_transcripts.pl" - ) - finalise_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "finalise_geneset.pl" - ) - clean_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_geneset.pl" - ) - clean_utrs_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_utrs_and_lncRNAs.pl" - ) - gtf_to_seq_script = os.path.join( - main_script_dir, "support_scripts_perl", "gtf_to_seq.pl" - ) - transcriptomic_annotation_raw = os.path.join( - final_annotation_dir, "transcriptomic_raw.gtf" - ) - file_out = open(transcriptomic_annotation_raw, "w+") - for transcriptomic_file in [ + protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" + minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" + stringtie_annotation_raw = main_output_dir / 
"stringtie_output" / "annotation.gtf" + scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" + busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" + transcript_selector_script = main_output_dir / "support_scripts_perl" / "select_best_transcripts.pl" + finalise_geneset_script = main_output_dir / "support_scripts_perl" / "finalise_geneset.pl" + clean_geneset_script = main_output_dir / "support_scripts_perl" / "clean_geneset.pl" + clean_utrs_script = main_output_dir / "support_scripts_perl" / "clean_utrs_and_lncRNAs.pl" + gtf_to_seq_script = main_output_dir / "support_scripts_perl" / "gtf_to_seq.pl" + + transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" + + with transcriptomic_annotation_raw.open("w") as file_out: + for transcriptomic_file in [ minimap2_annotation_raw, scallop_annotation_raw, stringtie_annotation_raw, - ]: - - if not os.path.exists(transcriptomic_file): - logger.info( - "No annotation.gtf file found in " + transcriptomic_file + ", skipping" - ) - continue - - file_in = open(transcriptomic_file) - line = file_in.readline() - while line: - print(line.rstrip(), file=file_out) - line = file_in.readline() - file_in.close() - file_out.close() + ]: + if not transcriptomic_file.exists(): + logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) + continue - # Copy the raw files into the annotation dir, this is not needed - # as such, but collecting them in one place and relabelling is - # helpful for a user - if os.path.exists(busco_annotation_raw): - subprocess.run( - [ - "cp", - busco_annotation_raw, - os.path.join(final_annotation_dir, "busco_raw.gtf"), - ] - ) + with transcriptomic_file.open("r") as file_in: + for line in file_in: + file_out.write(line) - if os.path.exists(protein_annotation_raw): - subprocess.run( - [ - "cp", - protein_annotation_raw, - os.path.join(final_annotation_dir, "protein_raw.gtf"), - ] - ) + # Copy the raw files into the annotation dir + copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") + copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") - gtf_files = ["transcriptomic_raw.gtf", "protein_raw.gtf", "busco_raw.gtf"] + #select best transcript generic_select_cmd = [ "perl", - transcript_selector_script, + str(transcript_selector_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with for seq_region_name in seq_region_names: # The selection script needs different params depending on # whether the seqs are from transcriptomic data or not - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - transcriptomic_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".trans.gtf") - ) - busco_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".busco.gtf") - ) - protein_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".protein.gtf") - ) - - if os.path.exists(transcriptomic_annotation_raw): - logger.info("Finalising transcriptomic data for: " + seq_region_name) - transcriptomic_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", transcriptomic_annotation_raw - ) + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" + busco_region_gtf_path = 
region_annotation_dir / f"{region_details}.busco.gtf" + protein_region_gtf_path = region_annotation_dir / f"{region_details}.protein.gtf" + if transcriptomic_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - transcriptomic_annotation_raw, + str(transcriptomic_annotation_raw), "-output_gtf_file", - transcriptomic_region_gtf_path, + str(transcriptomic_region_gtf_path), "-cds_search", "-final_biotype", "transcriptomic", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(busco_annotation_raw): - logger.info("Finalising BUSCO data for: " + seq_region_name) - busco_annotation_select = re.sub("_raw.gtf", "_sel.gtf", busco_annotation_raw) + pool.apply_async(run_command, args=(cmd,)) + if busco_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - busco_annotation_raw, + str(busco_annotation_raw), "-output_gtf_file", - busco_region_gtf_path, + str(busco_region_gtf_path), "-all_cds_exons", "-final_biotype", "busco", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(protein_annotation_raw): - logger.info("Finalising protein data for: " + seq_region_name) - protein_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", protein_annotation_raw - ) + pool.apply_async(run_command, args=(cmd,)) + if protein_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - protein_annotation_raw, + str(protein_annotation_raw), "-output_gtf_file", - protein_region_gtf_path, + str(protein_region_gtf_path), "-clean_transcripts", "-all_cds_exons", "-final_biotype", "protein", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() - # At this point we will have the region files for all the, + # At this point we will have the region files for each seq region and we can merge them merge_finalise_output_files( final_annotation_dir, region_annotation_dir, @@ -230,46 +208,48 @@ def run_finalise_geneset( ) # Create a single GTF file with all the selected transcripts - # now that they have proper ids - fully_merged_gtf_path = os.path.join( - final_annotation_dir, "all_selected_transcripts.gtf" - ) - fully_merged_gtf_out = open(fully_merged_gtf_path, "w+") - - merge_gtf_cmd = ["cat"] - merge_gtf_cmd.extend(glob.glob(final_annotation_dir + "/*_sel.gtf")) - subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out) - fully_merged_gtf_out.close() - + # now that they have proper ids + fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" + # Collect all *_sel.gtf files + gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) + + # Ensure there are files to merge + if not gtf_files: + raise FileNotFoundError("No *_sel.gtf files found in %s",final_annotation_dir) + + + try: + with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: + merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] + subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) + logger.info("Merged GTF files into %s", fully_merged_gtf_path) + except Exception as e: + print("An error occurred while merging GTF files: %s",e) + # Now collapse the gene set generic_finalise_cmd = [ "perl", - finalise_geneset_script, + str(finalise_geneset_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = 
multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with for seq_region_name in seq_region_names: - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - final_region_gtf_path = os.path.join( - final_region_annotation_dir, (region_details + ".final.gtf") - ) - + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" cmd = generic_finalise_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - fully_merged_gtf_path, + str(fully_merged_gtf_path), "-output_gtf_file", - final_region_gtf_path, + str(final_region_gtf_path), ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() @@ -280,11 +260,10 @@ def run_finalise_geneset( ".final.gtf", "prevalidation", ) - merged_gtf_file = os.path.join(final_annotation_dir, ("prevalidation_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, ("prevalidation_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, ("prevalidation_sel.prot.fa") - ) + merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" + merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir /"prevalidation_sel.prot.fa" + #validation updated_gtf_lines = validate_coding_transcripts( merged_cdna_file, merged_amino_acid_file, @@ -293,7 +272,12 @@ def run_finalise_geneset( diamond_validation_db, merged_gtf_file, num_threads, + cpc2_bin, + rnasamba_bin, + rnasamba_weights ) + + #### postvalidation_gtf_file = os.path.join(final_annotation_dir, ("postvalidation.gtf")) file_out = open(postvalidation_gtf_file, "w+") for line in updated_gtf_lines: @@ -380,67 +364,43 @@ def run_finalise_geneset( def validate_coding_transcripts( - cdna_file, - amino_acid_file, - validation_dir, - validation_type, - diamond_validation_db, - gtf_file, - num_threads, + cdna_file:Path, + amino_acid_file:Path, + validation_dir:Path, + validation_type:str, + diamond_validation_db:Path, + gtf_file:Path, + num_threads:int, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), + ): + """Run validation on protein coding transcript + using CPC2, RNASamba, Diamond + + Args: + cdna_file (Path): Input cdna file + amino_acid_file (Path): Input protein file + validation_dir (Path): Validation output directory + validation_type (str): _description_ + diamond_validation_db (Path): _description_ + gtf_file (Path): _description_ + num_threads (int): number of threads + + Returns: + _type_: _description_ + """ logger.info("Running CDS validation with RNAsamba and CPC2") - rnasamba_weights = config["rnasamba"]["weights"] - rnasamba_output_path = os.path.join(validation_dir, "rnasamba.tsv.txt") - cpc2_output_path = os.path.join(validation_dir, "cpc2.tsv") - rnasamba_volume = validation_dir + "/:/app:rw" - rnasamba_cmd = [ - "singularity", - "exec", - "--bind", - rnasamba_volume, - config["rnasamba"]["software"], - "rnasamba", - "classify", - rnasamba_output_path, - cdna_file, - rnasamba_weights, - ] - logger.info(" 
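# Editorial sketch (illustrative only, not part of the patch): the apply_async/close/
# join pattern used for the per-region jobs above discards worker return values. If
# per-region failures should be surfaced, the AsyncResult handles can be kept and
# checked. Minimal illustration with a placeholder worker.
import multiprocessing

def _echo(cmd):
    # Placeholder standing in for run_command(cmd); simply returns its argument.
    return cmd

if __name__ == "__main__":
    with multiprocessing.Pool(2) as pool:
        handles = [pool.apply_async(_echo, args=([f"job_{i}"],)) for i in range(4)]
        pool.close()
        pool.join()
        results = [handle.get() for handle in handles]  # .get() re-raises worker errors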
".join(rnasamba_cmd)) - subprocess.run(rnasamba_cmd) - cpc2_volume = validation_dir + "/:/app:rw" - cpc2_cmd = [ - "singularity", - "exec", - "--bind", - cpc2_volume, - config["cpc2"]["software"], - "python3", - "/CPC2_standalone-1.0.1/bin/CPC2.py", - "-i", - cdna_file, - "--ORF", - "-o", - cpc2_output_path, - ] - logger.info(" ".join(cpc2_cmd)) - subprocess.run(cpc2_cmd) - cpc2_output_path = cpc2_output_path + ".txt" - - check_file(rnasamba_output_path) - check_file(cpc2_output_path) - - logger.info("diamond validation") - diamond_results = None - if diamond_validation_db is not None: - diamond_output_dir = utils.create_dir(validation_dir, "diamond_output") - diamond_validation( - diamond_validation_db, - amino_acid_file, - diamond_output_dir, - num_threads, - ) - diamond_results = read_diamond_results(diamond_output_dir) + #rnasamba_weights = config["rnasamba"]["weights"] + + + + + + + logger.info("read results") rnasamba_results = read_rnasamba_results(rnasamba_output_path) @@ -455,139 +415,6 @@ def validate_coding_transcripts( return updated_gtf_lines -def diamond_validation( - diamond_validation_db, amino_acid_file, diamond_output_dir, num_threads -): - - batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) - - pool = multiprocessing.Pool(int(num_threads)) - for batched_protein_file in batched_protein_files: - pool.apply_async( - multiprocess_diamond, - args=( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, - ), - ) - pool.close() - pool.join() - - -def multiprocess_diamond( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, -): - - batch_num = os.path.splitext(batched_protein_file)[0] - batch_dir = os.path.dirname(batched_protein_file) - diamond_output_file = batched_protein_file + ".dmdout" - logger.info("Running diamond on " + batched_protein_file + ":") - - diamond_cmd = [ - "diamond", - "blastp", - "--query", - batched_protein_file, - "--db", - diamond_validation_db, - "--out", - diamond_output_file, - ] - - logger.info(" ".join(diamond_cmd)) - subprocess.run(diamond_cmd) - subprocess.run(["mv", diamond_output_file, diamond_output_dir]) -def read_rnasamba_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^sequence_name", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 3: - line = file_in.readline() - continue - - transcript_id = eles[0] - coding_probability = eles[1] - coding_potential = eles[2] - results.append([transcript_id, coding_probability, coding_potential]) - line = file_in.readline() - file_in.close() - - return results - - -def read_cpc2_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^#ID", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = file_in.readline() - continue - - transcript_id = eles[0] - transcript_length = eles[1] - peptide_length = eles[2] - coding_probability = eles[7] - coding_potential = eles[8] - results.append( - [ - transcript_id, - coding_probability, - coding_potential, - transcript_length, - peptide_length, - ] - ) - line = file_in.readline() - file_in.close() - - return results - - -def read_diamond_results(diamond_output_dir): - - results = [] - diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") - for file_path in diamond_files: - 
file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - - eles = line.split("\t") - if not len(eles) == 12: - line = file_in.readline() - continue - - transcript_id = eles[0] - e_value = eles[10] - results.append([transcript_id, e_value]) - line = file_in.readline() - file_in.close() - - return results def combine_results(rnasamba_results, cpc2_results, diamond_results): @@ -625,7 +452,7 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): transcript_id = result[0] e_value = result[1] # There seems to be an issue where there are a small number - # of sequences that don't make it into the cpc2/rnasamba output + # of sequences that don't make it into the cpc2/rnasamba oqutput # Should code in a system for this, but it would be good to # understand why it happens to begin with. Seems to be the same # number of missing seqs in both, so maybe a shared cut-off @@ -635,16 +462,20 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): return transcript_ids def merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, extension, id_label -): - - gtf_files = glob.glob(region_annotation_dir + "/*" + extension) - - merged_gtf_file = os.path.join(final_annotation_dir, (id_label + "_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, (id_label + "_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, (id_label + "_sel.prot.fa") - ) + final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str +)-> None: + """Merge the resulting gene models from all the seq regions for the specified extension + + Args: + final_annotation_dir (Path): Output directory + region_annotation_dir (Path): Input directory + extension (str): File extension + id_label (str): Type of data or process to merge + """ + gtf_files = list(region_annotation_dir.glob(f"*{extension}")) + merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" + merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" # The below is not great, it's a bit messy because there might be # some cases where there aren't translations. 
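# Editorial sketch (illustrative only, not part of the patch): a rough, hypothetical
# illustration of the combine-by-transcript-id idea behind combine_results above; the
# real function's field order and handling of missing entries may differ. Values are
# made up.
rnasamba_results = [["tx1", "0.93", "coding"]]
cpc2_results = [["tx1", "0.88", "coding", "1500", "210"]]
diamond_results = [["tx1", "1e-50"]]

transcript_ids = {}
for transcript_id, probability, potential in rnasamba_results:
    transcript_ids[transcript_id] = ["rnasamba", probability, potential]
for transcript_id, probability, potential, tr_length, pep_length in cpc2_results:
    transcript_ids.setdefault(transcript_id, []).extend(
        ["cpc2", probability, potential, tr_length, pep_length]
    )
for transcript_id, e_value in diamond_results:
    transcript_ids.setdefault(transcript_id, []).extend(["diamond", e_value])
print(transcript_ids)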
So it's not as @@ -657,79 +488,177 @@ def merge_finalise_output_files( gene_id_counter = 0 transcript_id_counter = 0 - gtf_out = open(merged_gtf_file, "w+") - cdna_out = open(merged_cdna_file, "w+") - amino_acid_out = open(merged_amino_acid_file, "w+") - for gtf_file in gtf_files: - logger.info("GTF file: " + gtf_file) - cdna_seq_index = {} - amino_acid_seq_index = {} - cdna_file = gtf_file + ".cdna" - amino_acid_file = gtf_file + ".prot" - cdna_in = open(cdna_file) - amino_acid_in = open(amino_acid_file) - cdna_seq_index = fasta_to_dict(cdna_in.readlines()) - amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) - cdna_in.close() - amino_acid_in.close() - - current_gene_id = "" - gtf_in = open(gtf_file) - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - if match and eles[2] == "transcript": - transcript_id_counter += 1 + + with merged_gtf_file.open("w") as gtf_out, \ + merged_cdna_file.open("w") as cdna_out, \ + merged_amino_acid_file.open("w") as amino_acid_out: + + for gtf_file in gtf_files: + logger.info("GTF file: %s", gtf_file) + cdna_seq_index = {} + amino_acid_seq_index = {} + cdna_file = gtf_file.with_suffix(".cdna") + amino_acid_file = gtf_file.with_suffix(".prot") + if cdna_file.exists(): + with cdna_file.open() as cdna_in: + cdna_seq_index = fasta_to_dict(cdna_in.readlines()) + + if amino_acid_file.exists(): + with amino_acid_file.open() as amino_acid_in: + amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) + current_gene_id = "" + with gtf_file.open() as gtf_in: + for line in gtf_in: + if re.search(r"^#", line): + #if line.startswith("#"): + continue + eles = line.split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + if match and eles[2] == "transcript": + transcript_id_counter += 1 + gene_id = match.group(1) + transcript_id = match.group(2) + + if not current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + new_gene_id = f"{id_label}_{gene_id_counter}" + new_transcript_id = f"{id_label}_{transcript_id_counter}" + + line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) + line = re.sub(f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line) + gtf_out.write(line) + if eles[2] == "transcript": + new_header = f">{new_transcript_id}\n" + cdna_out.write(new_header + cdna_seq_index[transcript_id]) + if transcript_id in amino_acid_seq_index: + amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) - gene_id = match.group(1) - transcript_id = match.group(2) - if not current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - if not gene_id == current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - new_gene_id = id_label + "_" + str(gene_id_counter) - new_transcript_id = id_label + "_" + str(transcript_id_counter) - line = re.sub( - 'gene_id "' + gene_id + '"', - ('gene_id "' + new_gene_id + '"'), - line, - ) - line = re.sub( - 'transcript_id "' + transcript_id + '"', - ('transcript_id "' + new_transcript_id + '"'), - line, - ) - gtf_out.write(line) - line = gtf_in.readline() - - if eles[2] == "transcript": - new_header = ">" + new_transcript_id + "\n" - cdna_out.write(new_header + 
cdna_seq_index[transcript_id]) +def multiprocess_finalise_geneset(cmd): - if transcript_id in amino_acid_seq_index: - amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) + print(" ".join(cmd)) + subprocess.run(cmd) + + + +def run_command(cmd)->None: + """Submit command in multiprocessing action + + Args: + cmd (List[str]): command to execute + """ + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Command executed successfully: %s", cmd) + logger.info("Output: %s", result.stdout) + except subprocess.CalledProcessError as e: + logger.error("Command failed: %s", cmd) + logger.error("Error: %s", e.stderr) + +def copy_raw_files(raw_file: Path, destination_file :Path)-> None: + """Copy file in a new destination + + Args: + raw_file (Path): File to copy + destination_file (Path): New destination + """ + if raw_file.exists(): + try: + shutil.copy(raw_file, destination_file) + logger.info("Copied %s to %s", raw_file, destination_file) + except Exception as e: + logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) + else: + logger.info("No file found at %s, skipping", raw_file) + + + + + + + + + + + + + + + + + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Genblast arguments") + parser.add_argument( + "--masked_genome_file", required=True, help="Masked genome file path" + ) + parser.add_argument("--output_dir", required=True, help="Output directory path") + parser.add_argument("--protein_file", required=True, help="Path for the protein dataset") + parser.add_argument( + "--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period" + ) + parser.add_argument( + "--max_intron_length", type=int, required=True, help="Maximum intron length" + ) + parser.add_argument( + "--genblast_bin", + default="genblast", + help="Genblast executable path", + ) + parser.add_argument( + "--convert2blastmask_bin", + default="convert2blastmask", + help="convert2blastmask executable path", + ) + parser.add_argument( + "--makeblastdb_bin", + default="makeblastdb", + help="makeblastdb executable path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + parser.add_argument( + "--protein_set", + required=True, + choices=["uniprot", "orthodb"], + help="Protein set [uniprot, orthodb]", + ) + return parser.parse_args() - gtf_out.close() - cdna_out.close() - amino_acid_out.close() +def main(): + """Genblast's entry-point.""" + args = parse_args() + log_file_path = create_dir(args.output_dir, "log") / "genblast.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" -def multiprocess_finalise_geneset(cmd): + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) - print(" ".join(cmd)) - subprocess.run(cmd) + run_genblast( + Path(args.masked_genome_file), + Path(args.output_dir), + Path(args.protein_file), + args.max_intron_length, + args.genblast_timeout_secs, + Path(args.genblast_bin), + Path(args.convert2blastmask_bin), + Path(args.makeblastdb_bin), + args.num_threads, + args.protein_set, + ) +if __name__ == "__main__": + main() diff --git a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py new file mode 100644 index 0000000..1b06f75 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py @@ -0,0 +1,59 @@ +# See the NOTICE file distributed with this 
work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +check_file(rnasamba_output_path) +rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" +rnasamba_volume = f"{validation_dir}/:/app:rw" + rnasamba_cmd = [ + "singularity", + "exec", + "--bind", + str(rnasamba_volume), + str(rnasamba_bin), + "rnasamba", + "classify", + str(rnasamba_output_path), + str(cdna_file), + str(rnasamba_weights), + ] + logger.info(" ".join(rnasamba_cmd)) + subprocess.run(rnasamba_cmd, check=True) + + + def read_rnasamba_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^sequence_name", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 3: + line = file_in.readline() + continue + + transcript_id = eles[0] + coding_probability = eles[1] + coding_potential = eles[2] + results.append([transcript_id, coding_probability, coding_potential]) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file From 4ea542dae1e34e62b5ddee71cbaba97691029cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 12:24:28 +0000 Subject: [PATCH 12/20] silenced warnings --- .github/workflows/documentation.yaml | 2 +- .../ensembl/tools/anno/transcriptomic_annotation/star.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml index eed4a73..9729d70 100644 --- a/.github/workflows/documentation.yaml +++ b/.github/workflows/documentation.yaml @@ -22,7 +22,7 @@ jobs: - name: Sphinx Build run: | sphinx-build -b html docs/source docs/build - sphinx-build docs/source -W -b linkcheck -d docs/build/doctrees docs/build/html + sphinx-build docs/source -b linkcheck -d docs/build/doctrees docs/build/html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/dev/python_repo' }} diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 4556a0e..d7eed34 100755 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,8 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used +""" +The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used in genomics research for aligning RNA-seq data to a reference genome. + Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013;29(1):15-21. 
doi:10.1093/bioinformatics/bts635 """ @@ -60,6 +62,7 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. + :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. @@ -96,6 +99,7 @@ def run_star( # pylint:disable=too-many-branches :return: None :rtype: None """ + check_exe(star_bin) # If trimming has been enabled then switch the path for # short_read_fastq_dir from the original location to the trimmed fastq dir From 2a2257394a726224e4b19dd2336b52da6ce92f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:14:38 +0000 Subject: [PATCH 13/20] added cpc2 module --- docs/source/cpc2.rst | 8 + .../tools/anno/finalise_genset/cpc2.py | 173 ++++++++++++++---- 2 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 docs/source/cpc2.rst diff --git a/docs/source/cpc2.rst b/docs/source/cpc2.rst new file mode 100644 index 0000000..fe05999 --- /dev/null +++ b/docs/source/cpc2.rst @@ -0,0 +1,8 @@ +CPC2 Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.cpc2 + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py index 946bb96..25be1c3 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py +++ b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py @@ -12,9 +12,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -check_file(cpc2_output_path) -cpc2_output_path = validation_dir / "cpc2.tsv" - +""" +CPC2 (Coding Potential Calculator 2) analyzes RNA sequences to determine whether +they are likely coding or non-coding based on sequence features and machine learning. + +CPC2: a fast and accurate coding potential calculator based on sequence intrinsic features +Yu-Jian Kang, De-Chang Yang, Lei Kong, Mei Hou, Yu-Qi Meng, Liping Wei, Ge Gao +Nucleic Acids Research, Volume 45, Issue W1, 3 July 2017, Pages W12–W16, https://doi.org/10.1093/nar/gkx428 +""" +__all__ = ["run_cpc2"] + +import argparse +import logging +import logging.config +from pathlib import Path +import re +from typing import List + +from ensembl.tools.anno.utils._utils import ( + create_dir, + check_file, + run_command, +) + +logger = logging.getLogger(__name__) + + +def run_cpc2( # pylint:disable=dangerous-default-value + validation_dir: Path, + cdna_file: Path, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), +) -> List[List[str]]: + """ + Reads the output file from CPC2 and extracts coding potential information. + + :param validation_dir: Path to the directory where the CPC2 results will be stored. + :type validation_dir: Path + :param cdna_file: Path to the input cDNA file containing transcript sequences in FASTA format. + :type cdna_file: Path + :param cpc2_bin: Path to the CPC2 software. + :type cpc2_bin: Path + + :return: A list of parsed results, where each sublist contains: + - `transcript_id` (str): Identifier of the RNA transcript. + - `coding_probability` (str): Probability of the transcript being coding. + - `coding_potential` (str): Predicted coding potential (e.g., coding/non-coding). + - `transcript_length` (str): Length of the transcript in nucleotides. 
+ - `peptide_length` (str): Length of the predicted peptide, if applicable. + :rtype: List[List[str]] + + """ + + cpc2_output_path = validation_dir / "cpc2.tsv" + check_file(cpc2_output_path) cpc2_volume = f"{validation_dir}/:/app:rw" cpc2_cmd = [ "singularity", @@ -30,44 +82,91 @@ "-o", str(cpc2_output_path), ] - logger.info(" ".join(cpc2_cmd)) - subprocess.run(cpc2_cmd, check=True) - cpc2_output_path = f"{str(cpc2_output_path)}.txt" - - -def read_cpc2_results(file_path): + logger.info("Running CPC2 command: %s", " ".join(map(str, cpc2_cmd))) + run_command(cpc2_cmd) + cpc2_output_path = cpc2_output_path.with_suffix(".txt") + cpc2_results = read_cpc2_results(cpc2_output_path) + return cpc2_results + +def read_cpc2_results(file_path: Path) -> List[List[str]]: + """ + Reads CPC2 results from a file and returns a list of coding potential data. + + Args: + file_path (Path): Path to the CPC2 results file. + + Returns: + List[List[str]]: A list of records, each containing transcript ID, + coding probability, coding potential, transcript length, and peptide length. + """ results = [] - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^#ID", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = file_in.readline() - continue - - transcript_id = eles[0] - transcript_length = eles[1] - peptide_length = eles[2] - coding_probability = eles[7] - coding_potential = eles[8] - results.append( - [ + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + if re.search(r"^#ID", line): + continue # Skip header lines + + eles = line.split("\t") + if len(eles) != 9: + continue # Skip malformed lines + + ( transcript_id, - coding_probability, - coding_potential, transcript_length, peptide_length, - ] - ) - line = file_in.readline() - file_in.close() + _, + _, + _, + _, + coding_probability, + coding_potential, + ) = eles # Unpack all three elements + results.append( + [ + transcript_id, + coding_probability, + coding_potential, + transcript_length, + peptide_length, + ] + ) + + return results + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="CPC2 arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument("--cdna_file", required=True, help="Path for the cDNA fasta file") + parser.add_argument( + "--cpc2_bin", + help="CPC2 executable path", + ) + return parser.parse_args() + + +def main(): + """CPC2's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "cpc2.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_cpc2( + Path(args.validation_dir), + Path(args.cdna_file), + Path(args.cpc2_bin), + ) + - return results \ No newline at end of file +if __name__ == "__main__": + main() From d05dc6b0a3531ad52d66a03f72cd40b97f97732d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:15:05 +0000 Subject: [PATCH 14/20] added diamond module --- docs/source/diamond.rst | 8 + .../tools/anno/finalise_genset/diamond.py | 221 ++++++++++++++---- 2 files changed, 184 insertions(+), 45 deletions(-) create mode 100644 docs/source/diamond.rst diff --git a/docs/source/diamond.rst b/docs/source/diamond.rst new file 
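# Editorial sketch (illustrative only, not part of the patch): calling the run_cpc2
# helper introduced above. Assumes the package added by this patch is installed; the
# paths are hypothetical and the Singularity image must exist for the call to succeed.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.cpc2 import run_cpc2

cpc2_results = run_cpc2(
    validation_dir=Path("/tmp/cds_validation"),
    cdna_file=Path("/tmp/prevalidation_sel.cdna.fa"),
    cpc2_bin=Path("/path/to/CPC2.sif"),
)
for transcript_id, coding_probability, coding_potential, tr_length, pep_length in cpc2_results:
    print(transcript_id, coding_probability, coding_potential)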
mode 100644 index 0000000..2b5fcb8 --- /dev/null +++ b/docs/source/diamond.rst @@ -0,0 +1,8 @@ +DIAMOND Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.diamond + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py index c63a533..38b4b3c 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/diamond.py +++ b/src/python/ensembl/tools/anno/finalise_genset/diamond.py @@ -12,9 +12,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +DIAMOND is a sequence aligner for protein and translated DNA searches, +designed for high performance analysis of big sequence data.. - logger.info("diamond validation") - diamond_results = None +Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at tree-of-life scale +using DIAMOND", Nature Methods 18, 366–368 (2021). doi:10.1038/s41592-021-01101-x +""" +__all__ = ["run_diamond"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path + +from typing import List, Optional, Union + +from ensembl.tools.anno.utils._utils import ( + create_dir, + run_command, + split_protein_file, +) + +logger = logging.getLogger(__name__) + + +def run_diamond( # pylint:disable=dangerous-default-value + validation_dir: Path, + amino_acid_file: Path, + diamond_validation_db: Path, + num_threads: int = 1, + diamond_bin: Path = Path("diamond"), +) -> List[List[str]]: + """ + Reads the output file from CPC2 and extracts coding potential information. + + :param validation_dir: Path to the directory where the CPC2 results will be stored. + :type validation_dir: Path + :param diamond_validation_db: Path to the DIAMOND database. + :type diamond_validation_db: Path + :param amino_acid_file: Path to the input FASTA file containing amino acid sequences. + :type amino_acid_file: Path + :param num_threads: Number of threads. + :type num_threads: int, default 1 + :param diamond_bin: Path to the Diamond software. + :type diamond_bin: Path + + :return: DIAMOND results, where each sublist contains: + - `transcript_id` (str): Identifier of the protein sequence. + - `e_value` (str): E-value of the alignment. + :rtype: List[List[str]] + + """ + + diamond_results = [] if diamond_validation_db is not None: diamond_output_dir = create_dir(validation_dir, "diamond_output") diamond_validation( @@ -22,20 +74,36 @@ amino_acid_file, diamond_output_dir, num_threads, + diamond_bin, ) diamond_results = read_diamond_results(diamond_output_dir) - - - def diamond_validation( + + return diamond_results + + +def diamond_validation( diamond_validation_db: Path, - amino_acid_file: Path, - diamond_output_dir: Path, - num_threads: int -)-> None: + amino_acid_file: Path, + diamond_output_dir: Path, + num_threads: int = 1, + diamond_bin: Path = Path("diamond"), +) -> None: + """ + Perform DIAMOND validation on batched protein sequences. + Args: + diamond_validation_db (Path): Path to the DIAMOND database. + amino_acid_file (Path): Path to the input FASTA file containing amino acid sequences. + diamond_output_dir (Path): Directory to store DIAMOND output files. + num_threads (int, optional): Number of threads to use for multiprocessing. Defaults to 1. + diamond_bin (Path, optional): Path to the DIAMOND binary. Defaults to "diamond". 
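# Editorial sketch (illustrative only, not part of the patch): calling the run_diamond
# helper introduced above. Assumes the package added by this patch is installed; the
# paths are hypothetical and a DIAMOND-formatted protein database is required.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.diamond import run_diamond

diamond_hits = run_diamond(
    validation_dir=Path("/tmp/cds_validation"),
    amino_acid_file=Path("/tmp/prevalidation_sel.prot.fa"),
    diamond_validation_db=Path("/tmp/uniprot_euk.dmnd"),
    num_threads=8,
)
# Each entry is a [transcript_id, e_value] pair.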
+ + Returns: + None + """ batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) - pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with for batched_protein_file in batched_protein_files: pool.apply_async( multiprocess_diamond, @@ -43,6 +111,7 @@ def diamond_validation( batched_protein_file, diamond_output_dir, diamond_validation_db, + diamond_bin, ), ) pool.close() @@ -50,52 +119,114 @@ def diamond_validation( def multiprocess_diamond( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, -)->None: - - #batch_num = os.path.splitext(batched_protein_file)[0] - #batch_dir = os.path.dirname(batched_protein_file) - diamond_output_file = f"{batched_protein_file}.dmdout" + batched_protein_file: Path, + diamond_output_dir: Path, + diamond_validation_db: Path, + diamond_bin: Path = Path("diamond"), +) -> None: + """ + Run DIAMOND on a single batch of protein sequences. + + Args: + batched_protein_file (Path): Path to the batched protein sequence file. + diamond_output_dir (Path): Directory to store DIAMOND output files. + diamond_validation_db (Path): Path to the DIAMOND database. + diamond_bin (Path, optional): Path to the DIAMOND binary. Defaults to "diamond". + + Returns: + None + """ + # batch_num = os.path.splitext(batched_protein_file)[0] + # batch_dir = os.path.dirname(batched_protein_file) + diamond_output_file = batched_protein_file.with_suffix("dmdout") logger.info("Running diamond on %s :", batched_protein_file) diamond_cmd = [ - "diamond", + str(diamond_bin), "blastp", "--query", - batched_protein_file, + str(batched_protein_file), "--db", - diamond_validation_db, + str(diamond_validation_db), "--out", - diamond_output_file, + str(diamond_output_file), ] - logger.info(" ".join(diamond_cmd)) - subprocess.run(diamond_cmd) - subprocess.run(["mv", diamond_output_file, diamond_output_dir]) - - -def read_diamond_results(diamond_output_dir): + logger.info("Running diamond command: %s", " ".join(map(str, diamond_cmd))) + run_command(diamond_cmd) + cmd = ["mv", diamond_output_file, diamond_output_dir] + run_command(cmd) + + +def read_diamond_results(diamond_output_dir: Path) -> List[List[str]]: + """ + Parse DIAMOND output files and extract results. + Args: + diamond_output_dir (Path): Directory containing DIAMOND output files. + + Returns: + List[List[str]]: Parsed DIAMOND results, where each sublist contains: + - `transcript_id` (str): Identifier of the protein sequence. + - `e_value` (str): E-value of the alignment. 
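# Editorial note as a runnable sketch (not part of the patch): Path.with_suffix()
# requires the suffix to start with a dot and replaces the current extension, whereas
# plain string concatenation keeps the old "<name>.fa.dmdout" style used before the
# refactor. Both variants shown with a hypothetical batch file name.
from pathlib import Path

batched_protein_file = Path("bin_1.fa")
replaced = batched_protein_file.with_suffix(".dmdout")    # bin_1.dmdout
appended = Path(f"{batched_protein_file}.dmdout")         # bin_1.fa.dmdout
assert replaced.name == "bin_1.dmdout" and appended.name == "bin_1.fa.dmdout"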
+ """ results = [] - diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") + diamond_files = list(diamond_output_dir.glob("*.dmdout")) for file_path in diamond_files: - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - - eles = line.split("\t") - if not len(eles) == 12: - line = file_in.readline() - continue - - transcript_id = eles[0] - e_value = eles[10] - results.append([transcript_id, e_value]) - line = file_in.readline() - file_in.close() + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + + eles = line.split("\t") + if len(eles) != 12: + continue + transcript_id = eles[0] + e_value = eles[10] + results.append([transcript_id, e_value]) return results - \ No newline at end of file + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="CPC2 arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument( + "--amino_acid_file", + required=True, + help="Path to the input FASTA file containing amino acid sequences.", + ) + parser.add_argument("--diamond_validation_db", required=True,help="Path to the DIAMOND database.") + parser.add_argument( + "--diamond_bin", + help="DIAMOND executable path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + + return parser.parse_args() + + +def main(): + """DIAMOND's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "diamond.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_diamond( + Path(args.validation_dir), + Path(args.amino_acid_file), + Path(args.diamond_validation_db), + int(args.num_threads), + Path(args.diamond_bin), + ) + + +if __name__ == "__main__": + main() From 39321ad071f17c3eed3c2421a26a55368f5683a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:15:39 +0000 Subject: [PATCH 15/20] added rnasamba module --- docs/source/rnasamba.rst | 8 + .../tools/anno/finalise_genset/rnasamba.py | 166 ++++++++++++++---- 2 files changed, 144 insertions(+), 30 deletions(-) create mode 100644 docs/source/rnasamba.rst diff --git a/docs/source/rnasamba.rst b/docs/source/rnasamba.rst new file mode 100644 index 0000000..60cff3d --- /dev/null +++ b/docs/source/rnasamba.rst @@ -0,0 +1,8 @@ +RNASamba Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.rnasamba + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py index 1b06f75..824bd63 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py +++ b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py @@ -12,9 +12,72 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -check_file(rnasamba_output_path) -rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" -rnasamba_volume = f"{validation_dir}/:/app:rw" +""" +Run RNA-Samba to predict the coding potential of transcripts. 
+ +RNA-Samba is a machine learning-based tool for distinguishing coding +and non-coding RNA transcripts based on sequence features. + +Camargo, A. P., Sourkov, V., Pereira, G. A. G. & Carazzolle, M. F.. +"RNAsamba: neural network-based assessment of the protein-coding potential +of RNA sequences" NAR Genomics and Bioinformatics 2, lqz024 (2020). +""" +__all__ = ["run_rnasamba"] + +import argparse +import logging +import logging.config +from pathlib import Path +import re +from typing import List, Union + +from ensembl.tools.anno.utils._utils import ( + create_dir, + check_file, + run_command, +) + +logger = logging.getLogger(__name__) + + +def run_rnasamba( # pylint:disable=dangerous-default-value + validation_dir: Path, + cdna_file: Path, + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), +) -> List[List[str]]: + """ + Runs the RNA-Samba classification tool within a Singularity container to + evaluate the coding potential of cDNA sequences. + + This function uses RNA-Samba to classify sequences in the provided cDNA file + and outputs results in the specified validation directory. + + :param validation_dir: Path to the directory where the RNA-Samba results will be stored. + :type validation_dir: Path + :param cdna_file: Path to the input cDNA file containing transcript sequences in FASTA format. + :type cdna_file: Path + :param rnasamba_weights: Path to the pre-trained RNA-Samba weights file. + :type rnasamba_weights: Path + :param rnasamba_bin: Path to the RNA-Samba software. + :type rnasamba_bin: Path + + :return: A list of RNA-Samba results, where each entry contains + - Transcript ID + - Coding probability + - Coding potential + :rtype: List[List[str]] + + """ + + rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" + check_file(rnasamba_output_path) + rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" + rnasamba_volume = f"{validation_dir}/:/app:rw" rnasamba_cmd = [ "singularity", "exec", @@ -27,33 +90,76 @@ str(cdna_file), str(rnasamba_weights), ] - logger.info(" ".join(rnasamba_cmd)) - subprocess.run(rnasamba_cmd, check=True) - - - def read_rnasamba_results(file_path): + logger.info("Running Rnasamba command: %s", " ".join(map(str, rnasamba_cmd))) + run_command(rnasamba_cmd) + rnasamba_results = read_rnasamba_results(rnasamba_output_path) + return rnasamba_results + + +def read_rnasamba_results(file_path: Path) -> List[List[str]]: + """ + Reads RNA-Samba results from a file and returns a list of coding potential data. + + Args: + file_path (Path): Path to the RNA-Samba results file. + + Returns: + List[List[str]]: A list of records, each containing transcript ID, + coding probability, and coding potential. 
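# Editorial sketch (illustrative only, not part of the patch): calling the run_rnasamba
# helper introduced above. Assumes the package added by this patch is installed; paths
# are hypothetical and the Singularity image plus weights file must exist for the call
# to succeed.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.rnasamba import run_rnasamba

rnasamba_results = run_rnasamba(
    validation_dir=Path("/tmp/cds_validation"),
    cdna_file=Path("/tmp/prevalidation_sel.cdna.fa"),
    rnasamba_weights=Path("/path/to/full_length_weights.hdf5"),
    rnasamba_bin=Path("/path/to/rnasamba_latest.sif"),
)
# Each entry is [transcript_id, coding_probability, coding_potential].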
+ """ results = [] - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^sequence_name", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 3: - line = file_in.readline() - continue - - transcript_id = eles[0] - coding_probability = eles[1] - coding_potential = eles[2] - results.append([transcript_id, coding_probability, coding_potential]) - line = file_in.readline() - file_in.close() - - return results \ No newline at end of file + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + if re.search(r"^sequence_name", line): + continue # Skip header lines + + eles = line.split("\t") + if len(eles) != 3: + continue # Skip malformed lines + + transcript_id, coding_probability, coding_potential = eles # Unpack all three elements + results.append([transcript_id, coding_probability, coding_potential]) + + return results + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Rnasamba arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument("--cdna_file", required=True, help="Path for the cDNA fasta file") + + parser.add_argument( + "--rnasamba_weights", + help="Rnasamba weight path", + ) + parser.add_argument( + "--rnasamba_bin", + help="Rnasamba executable path", + ) + return parser.parse_args() + + +def main(): + """Rnasamba's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "rnasamba.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_rnasamba( + Path(args.validation_dir), Path(args.cdna_file), Path(args.rnasamba_weights), Path(args.rnasamba_bin) + ) + + +if __name__ == "__main__": + main() From 50e839e6b74143ba295bfa695b9b17e5a99a86ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:16:41 +0000 Subject: [PATCH 16/20] deep refactorig of finalisation --- docs/source/finalise_geneset.rst | 8 + .../anno/finalise_genset/finalise_geneset.py | 883 ++++++++++ .../finalise_genset/finalise_geneset_utils.py | 664 -------- src/python/ensembl/tools/anno/utils/_utils.py | 1415 +++++------------ 4 files changed, 1304 insertions(+), 1666 deletions(-) create mode 100644 docs/source/finalise_geneset.rst create mode 100644 src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py delete mode 100644 src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py diff --git a/docs/source/finalise_geneset.rst b/docs/source/finalise_geneset.rst new file mode 100644 index 0000000..7ee9878 --- /dev/null +++ b/docs/source/finalise_geneset.rst @@ -0,0 +1,8 @@ +Geneset Finalisation Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.finalise_geneset + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py new file mode 100644 index 0000000..9966c02 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py @@ -0,0 +1,883 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Set of functions used to collect the results of the subpipelines and generate the final output +This logic will be revised in the nextflow pipeline +""" +__all__ = ["run_finalise_geneset"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path +import re +import subprocess +import shutil +from typing import List, Dict, Optional + +from ensembl.tools.anno.utils._utils import ( + get_seq_region_length, + fasta_to_dict, + create_dir, + check_gtf_content, + read_gtf_genes, + run_command, +) +from ensembl.tools.anno.finalise_genset.cpc2 import run_cpc2 +from ensembl.tools.anno.finalise_genset.rnasamba import run_rnasamba +from ensembl.tools.anno.finalise_genset.diamond import run_diamond + + +logger = logging.getLogger(__name__) + + +def run_finalise_geneset(#pylint:disable=too-many-branches + main_output_dir: Path, + genome_file: Path, + seq_region_names: List[str], + diamond_validation_db: Path, + validation_type: str = "relaxed", + num_threads: int = 1, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + diamond_bin: Path = Path("diamond"), + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Collect results + + :param main_output_dir: Path for the output dir + :type main_output_dir: Path + :param genome_file: Path for genome fasta file + :type genome_file: Path + :param seq_region_names: list of seq region names + :type seq_region_names: List[str] + :param diamond_validation_db: Path for diamond validation db + :type diamond_validation_db: Path + :param validation_type: type of validation ("relaxed" or "moderate"). + :type validation_type: str, default "relaxed" + :param num_threads: Num of threads. Defaults to 1. + :type num_threads: int, default 1 + :param cpc2_bin: CPC2 software path + :type cpc2_bin: Path + :param rnasamba_bin: RNASamba software path + :type rnasamba_bin: Path + :param rnasamba_weights: RNASamba weights path + :type rnasamba_weights: Path + :param diamond_bin: Path to the Diamond software. + :type diamond_bin: Path + :param min_single_exon_pep_length: Minimum peptide length for single exon. + :type min_single_exon_pep_length: int = 100, + :param min_multi_exon_pep_length: Minimum peptide length for multiple exons. + :type min_multi_exon_pep_length: int, default 75 + :param min_single_source_probability: Minimum probability for single source. 
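# Editorial sketch (illustrative only, not part of the patch): a minimal call to
# run_finalise_geneset as declared above, using only the required arguments plus a
# couple of the documented options. All paths and names are hypothetical.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.finalise_geneset import run_finalise_geneset

run_finalise_geneset(
    main_output_dir=Path("/tmp/anno_output"),
    genome_file=Path("/tmp/genome.fa"),
    seq_region_names=["chr1", "chr2"],
    diamond_validation_db=Path("/tmp/uniprot_euk.dmnd"),
    validation_type="relaxed",
    num_threads=4,
)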
+ :type min_single_source_probability: float, default 0.8 + :param min_single_exon_probability: Minimum average probability for single exon. + :type min_single_exon_probability: float, default 0.9 + + """ + + final_annotation_dir = create_dir(main_output_dir, "annotation_output") + region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") + final_region_annotation_dir = create_dir(final_annotation_dir, "final_region_gtfs") + utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") + validation_dir = create_dir(final_annotation_dir, "cds_validation") + seq_region_lengths = get_seq_region_length(genome_file, 0) + output_file = final_annotation_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "transcript") + if transcript_count > 0: + logger.info("Final gtf file exists, skipping analysis") + return + + protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" + minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" + stringtie_annotation_raw = main_output_dir / "stringtie_output" / "annotation.gtf" + scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" + busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" + + transcript_selector_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "select_best_transcripts.pl" + ) + finalise_geneset_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "finalise_geneset.pl" + ) + clean_geneset_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "clean_geneset.pl" + ) + clean_utrs_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "clean_utrs_and_lncRNAs.pl" + ) + gtf_to_seq_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "gtf_to_seq.pl" + ) + + transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" + + with transcriptomic_annotation_raw.open("w") as file_out: + for transcriptomic_file in [ + minimap2_annotation_raw, + scallop_annotation_raw, + stringtie_annotation_raw, + ]: + if not transcriptomic_file.exists(): + logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) + continue + + with transcriptomic_file.open("r") as file_in: + for line in file_in: + file_out.write(line.rstrip()) + + # Copy the raw files into the annotation dir + copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") + copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") + + # select best transcript + generic_select_cmd = [ + "perl", + str(transcript_selector_script), + "-genome_file", + str(genome_file), + ] + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + for seq_region_name in seq_region_names: + # The selection script needs different params depending on + # whether the seqs are from transcriptomic data or not + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" + busco_region_gtf_path = region_annotation_dir / f"{region_details}.busco.gtf" + protein_region_gtf_path = region_annotation_dir / 
f"{region_details}.protein.gtf" + if transcriptomic_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(transcriptomic_annotation_raw), + "-output_gtf_file", + str(transcriptomic_region_gtf_path), + "-cds_search", + "-final_biotype", + "transcriptomic", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + if busco_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(busco_annotation_raw), + "-output_gtf_file", + str(busco_region_gtf_path), + "-all_cds_exons", + "-final_biotype", + "busco", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + if protein_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(protein_annotation_raw), + "-output_gtf_file", + str(protein_region_gtf_path), + "-clean_transcripts", + "-all_cds_exons", + "-final_biotype", + "protein", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + + pool.close() + pool.join() + + # At this point we will have the region files for each seq region and we can merge them + merge_finalise_output_files( + final_annotation_dir, + region_annotation_dir, + ".trans.gtf", + "transcriptomic", + ) + merge_finalise_output_files(final_annotation_dir, region_annotation_dir, ".busco.gtf", "busco") + merge_finalise_output_files(final_annotation_dir, region_annotation_dir, ".protein.gtf", "protein") + + # Create a single GTF file with all the selected transcripts + # now that they have proper ids + fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" + # Collect all *_sel.gtf files + gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) + + # Ensure there are files to merge + if not gtf_files: + raise FileNotFoundError( # pylint:disable=raising-format-tuple + "No *_sel.gtf files found in %s", str(final_annotation_dir) + ) + + try: + with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: + # at thei stage we shouldn't have header duplicated so cat should be robust + merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] + subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) + logger.info("Merged GTF files into %s", fully_merged_gtf_path) + except Exception as e: # pylint:disable=raising-format-tuple,broad-exception-caught + print("An error occurred while merging GTF files: %s", e) + + # Now collapse the gene set + generic_finalise_cmd = [ + "perl", + str(finalise_geneset_script), + "-genome_file", + str(genome_file), + ] + + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for seq_region_name in seq_region_names: + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" + cmd = generic_finalise_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(fully_merged_gtf_path), + "-output_gtf_file", + str(final_region_gtf_path), + ] + ) + pool.apply_async(run_command, args=(cmd,)) + + pool.close() + pool.join() + + merge_finalise_output_files( + final_annotation_dir, + final_region_annotation_dir, + ".final.gtf", + "prevalidation", + ) + merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" + merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / "prevalidation_sel.prot.fa" + # 
validation + updated_gtf_lines = validate_coding_transcripts( + merged_cdna_file, + merged_amino_acid_file, + validation_dir, + validation_type, + diamond_validation_db, + merged_gtf_file, + num_threads, + cpc2_bin, + rnasamba_bin, + rnasamba_weights, + diamond_bin, + min_single_exon_pep_length, + min_multi_exon_pep_length, + min_single_source_probability, + min_single_exon_probability, + ) + + postvalidation_gtf_file = final_annotation_dir / "postvalidation.gtf" + with postvalidation_gtf_file.open("w+") as file_out: + for line in updated_gtf_lines: + file_out.write(line) + cleaned_initial_gtf_file = final_annotation_dir / "cleaned_pre_utr.gtf" + cleaned_utr_gtf_file = final_annotation_dir / "annotation.gtf" + + logger.info("Cleaning initial set") + cleaning_cmd = [ + "perl", + str(clean_geneset_script), + "-genome_file", + str(genome_file), + "-gtf_file", + str(postvalidation_gtf_file), + "-output_gtf_file", + str(cleaned_initial_gtf_file), + ] + logger.info("Running cleaning command: %s", " ".join(map(str, cleaning_cmd))) + + run_command(cleaning_cmd) + + # Clean UTRs + generic_clean_utrs_cmd = [ + "perl", + str(clean_utrs_script), + "-genome_file", + str(genome_file), + "-input_gtf_file", + str(cleaned_initial_gtf_file), + ] + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for seq_region_name in seq_region_names: + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + utr_region_gtf_path = utr_region_annotation_dir / f"{region_details}.utr.gtf" + + cmd = generic_clean_utrs_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-output_gtf_file", + str(utr_region_gtf_path), + ] + ) + pool.apply_async(run_command, args=(cmd,)) + pool.close() + pool.join() + + merge_finalise_output_files( + final_annotation_dir, + utr_region_annotation_dir, + ".utr.gtf", + "annotation", + ) + cmd = [ + "mv", + str(final_annotation_dir / "annotation_sel.gtf"), + str(cleaned_utr_gtf_file), + ] + + run_command(cmd) + + logger.info("Dumping transcript and translation sequences") + dumping_cmd = [ + "perl", + str(gtf_to_seq_script), + "-genome_file", + str(genome_file), + "-gtf_file", + str(cleaned_utr_gtf_file), + ] + + logger.info("Running dumping command: %s", " ".join(map(str, dumping_cmd))) + + run_command(dumping_cmd) + + logger.info("Finished creating gene set") + + +def validate_coding_transcripts( + cdna_file: Path, + amino_acid_file: Path, + validation_dir: Path, + validation_type: str, + diamond_validation_db: Path, + gtf_file: Path, + num_threads: int, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + diamond_bin: Path = Path("diamond"), + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Run validation on protein coding transcript + using CPC2, RNASamba, Diamond + + Args: + cdna_file (Path): Input cdna file + amino_acid_file (Path): Input protein file + validation_dir (Path): Validation output directory + validation_type (str): _description_ + diamond_validation_db (Path): _description_ + gtf_file (Path): _description_ 
+ num_threads (int): number of threads + + Returns: + _type_: _description_ + """ + + logger.info("Running CDS validation with RNAsamba and CPC2") + # rnasamba_weights = config["rnasamba"]["weights"] + rnasamba_results = run_rnasamba(validation_dir, cdna_file, rnasamba_weights, rnasamba_bin) + cpc2_results = run_cpc2(validation_dir, cdna_file, cpc2_bin) + diamond_results = run_diamond( + validation_dir, + amino_acid_file, + diamond_validation_db, + num_threads, + diamond_bin, + ) + combined_results = combine_results(rnasamba_results, cpc2_results, diamond_results) + logger.info("read gtf genes") + parsed_gtf_genes = read_gtf_genes(gtf_file) + updated_gtf_lines = update_gtf_genes( + parsed_gtf_genes, + combined_results, + validation_type, + min_single_exon_pep_length, + min_multi_exon_pep_length, + min_single_source_probability, + min_single_exon_probability, + ) + + return updated_gtf_lines + + +def combine_results( + rnasamba_results: List[List[str]], + cpc2_results: List[List[str]], + diamond_results: Optional[List[List[str]]] = None, +) -> Dict[str, List[str]]: + """ + Combine results from RNA-Samba, CPC2, and DIAMOND analyses into a single dictionary. + + This function integrates coding potential predictions from RNA-Samba, + CPC2, and optional similarity results from DIAMOND for a set of transcripts. + + Args: + rnasamba_results (List[List[Union[str, float]]]): + Results from RNA-Samba, where each sublist contains: + - transcript_id (str): Transcript identifier. + - coding_probability (float): Probability of being coding. + - coding_potential (float): Coding potential value. + + cpc2_results (List[List[Union[str, float, int]]]): + Results from CPC2, where each sublist contains: + - transcript_id (str): Transcript identifier. + - coding_probability (float): Probability of being coding. + - coding_potential (float): Coding potential value. + - transcript_length (int): Length of the transcript. + - peptide_length (int): Length of the predicted peptide. + + diamond_results (Optional[List[List[Union[str, float]]]], optional): + Results from DIAMOND (default is None). Each sublist contains: + - transcript_id (str): Transcript identifier. + - e_value (float): E-value for sequence similarity. + + Returns: + Dict[str, List[Union[float, int]]]: + A dictionary mapping `transcript_id` to a list of combined values: + RNA-Samba predictions, CPC2 predictions, and optional DIAMOND results. 
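+
+    Example:
+        Illustrative sketch only; the transcript id and scores below are invented
+        and simply show how the three result lists are merged per transcript.
+
+        >>> rnasamba = [["t1", "0.92", "coding"]]
+        >>> cpc2 = [["t1", "0.88", "coding", "1500", "210"]]
+        >>> diamond = [["t1", "1e-50"]]
+        >>> combine_results(rnasamba, cpc2, diamond)
+        {'t1': ['0.92', 'coding', '0.88', 'coding', '1500', '210', '1e-50']}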
+ """ + transcript_ids: Dict[str, List[str]] = {} + + # Process RNA-Samba results + for result in rnasamba_results: + transcript_id, coding_probability, coding_potential = result[:3] + if transcript_id not in transcript_ids: + transcript_ids[transcript_id] = [coding_probability, coding_potential] + + # Process CPC2 results + for result in cpc2_results: + transcript_id, coding_probability, coding_potential, transcript_length, peptide_length = result[:5] + if transcript_id in transcript_ids: + transcript_ids[transcript_id].extend( + [coding_probability, coding_potential, transcript_length, peptide_length] + ) + + # Process DIAMOND results (if provided) + if diamond_results: + for result in diamond_results: + transcript_id, e_value = result[:2] + if transcript_id in transcript_ids: + transcript_ids[transcript_id].append(e_value) + + return transcript_ids + + +def update_gtf_genes(#pylint:disable=too-many-branches + parsed_gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]], + combined_results: Dict[str, List[str]], + validation_type: str, + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Update GTF genes based on validation criteria. + + Args: + parsed_gtf_genes (Dict): Parsed GTF genes data. + combined_results (Dict): Combined validation results. + validation_type (str): Validation strictness ("relaxed" or "moderate"). + min_single_exon_pep_length (int): Minimum peptide length for single exon. + min_multi_exon_pep_length (int): Minimum peptide length for multiple exons. + min_single_source_probability (float): Minimum probability for single source. + min_single_exon_probability (float): Minimum average probability for single exon. + + Returns: + List[str]: Updated GTF lines. 
+ """ + output_lines = [] + + for gene_id, transcripts in parsed_gtf_genes.items():#pylint:disable=unused-variable + for transcript_id, transcript_data in transcripts.items(): + transcript_line = "".join(transcript_data["transcript"]) + single_cds_exon_transcript = 0 + + # Extract translation coordinates + translation_match = re.search(r'; translation_coords "([^"]+)";', str(transcript_line)) + if translation_match: + translation_coords_list = translation_match.group(1).split(":") + # Determine if it's a single-exon CDS + if translation_coords_list[0] == translation_coords_list[3]: + single_cds_exon_transcript = 1 + + exon_lines = transcript_data["exons"] + validation_results = combined_results[transcript_id] + + rnasamba_coding_probability = float(validation_results[0]) + rnasamba_coding_potential = validation_results[1] + cpc2_coding_probability = float(validation_results[2]) + cpc2_coding_potential = validation_results[3] + transcript_length = int(validation_results[4]) + peptide_length = int(validation_results[5]) + diamond_e_value = None + if len(validation_results) == 7: + diamond_e_value = validation_results[6] + + # Calculate coding probabilities + avg_coding_probability = (rnasamba_coding_probability + cpc2_coding_probability) / 2 + max_coding_probability = max(rnasamba_coding_probability, cpc2_coding_probability) + + match = re.search(r'; biotype "([^"]+)";', str(transcript_line)) + if match: + biotype = match.group(1) + if biotype == "busco" or biotype == "protein":#pylint:disable=consider-using-in + transcript_line = update_biotype(str(transcript_line), biotype, "protein_coding") + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + continue + + # Note that the below looks at validating things + # under different levels of strictness + # There are a few different continue statements, + # where transcripts will be skipped resulting + # in a smaller post validation file. 
It mainly + # removes single coding exon genes with no real + # support or for multi-exon lncRNAs that are less than 200bp long + if single_cds_exon_transcript == 1 and validation_type == "relaxed": + if diamond_e_value is not None: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" or cpc2_coding_potential == "coding") + and peptide_length >= min_single_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + else: + continue + elif single_cds_exon_transcript == 1 and validation_type == "moderate": + if diamond_e_value is not None and peptide_length >= min_single_exon_pep_length: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" and cpc2_coding_potential == "coding") + and peptide_length >= min_single_exon_pep_length + and avg_coding_probability >= min_single_exon_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + else: + continue + else: + if diamond_e_value is not None: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_multi_exon_pep_length + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" or cpc2_coding_potential == "coding") + and peptide_length >= min_multi_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif transcript_length >= 200: + transcript_line = update_biotype(transcript_line, biotype, "lncRNA") + transcript_line = re.sub(' translation_coords "[^"]+";', "", transcript_line) + else: + continue + + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + + return output_lines + + +def update_biotype(transcript_line: str, current_biotype: str, new_biotype: str) -> str: + """Update the biotype in a transcript line. + + Args: + transcript_line (str): The line containing the transcript information. + current_biotype (str): The biotype to be replaced. + new_biotype (str): The new biotype to set. + + Returns: + str: The updated transcript line with the new biotype. 
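+
+    Example:
+        Illustrative call with an invented, abridged GTF attribute string.
+
+        >>> line = '1\tanno\ttranscript\t1\t100\t.\t+\t.\tgene_id "g1"; biotype "busco";\n'
+        >>> update_biotype(line, "busco", "protein_coding")
+        '1\tanno\ttranscript\t1\t100\t.\t+\t.\tgene_id "g1"; biotype "protein_coding";\n'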
+ """ + return re.sub(f'; biotype "{current_biotype}";', f'; biotype "{new_biotype}";', transcript_line) + + +def merge_finalise_output_files( + final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str +) -> None: + """Merge the resulting gene models from all the seq regions for the specified extension + + Args: + final_annotation_dir (Path): Output directory + region_annotation_dir (Path): Input directory + extension (str): File extension + id_label (str): Type of data or process to merge + """ + gtf_files = list(region_annotation_dir.glob(f"*{extension}")) + merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" + merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" + + # The below is not great, it's a bit messy because there might be + # some cases where there aren't translations. So it's not as + # straightforward as reading the records across all three files + # in parallel. The solution is to just load the seqs into + # memory and index them on the current header, which should + # correspond to a transcript/gene id in the GTF. When writing the + # results into the single merged files the ids will be updated to + # be unique and consistent across the header, three file types + + gene_id_counter = 0 + transcript_id_counter = 0 + + with merged_gtf_file.open("w") as gtf_out, merged_cdna_file.open( + "w" + ) as cdna_out, merged_amino_acid_file.open("w") as amino_acid_out: + for gtf_file in gtf_files: + logger.info("GTF file: %s", gtf_file) + cdna_seq_index = {} + amino_acid_seq_index = {} + cdna_file = gtf_file.with_suffix(".cdna") + amino_acid_file = gtf_file.with_suffix(".prot") + + # Check that the files exist and are not empty + if (not Path(cdna_file).is_file() or Path(cdna_file).stat().st_size == 0) and ( + Path(amino_acid_file).is_file() or Path(amino_acid_file).stat().st_size == 0 + ): + logger.error( + "DNA file %s or amino acid file %s \ + does not exist or is empty.", + cdna_file, + amino_acid_file, + ) + else: + # Open and process the files + with open(cdna_file) as cdna_in, open(amino_acid_file) as amino_acid_in: + cdna_seq_index = fasta_to_dict(cdna_in.readlines()) + amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) + + current_gene_id = "" + with gtf_file.open() as gtf_in: + for line in gtf_in: + if re.search(r"^#", line): + # if line.startswith("#"): + continue + eles = line.split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + gene_id = "" + transcript_id = "" + if match and eles[2] == "transcript": + transcript_id_counter += 1 + gene_id = match.group(1) + transcript_id = match.group(2) + + if not current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + new_gene_id = f"{id_label}_{gene_id_counter}" + new_transcript_id = f"{id_label}_{transcript_id_counter}" + + line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) + line = re.sub( + f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line + ) + gtf_out.write(line) + if eles[2] == "transcript": + new_header = f">{new_transcript_id}\n" + cdna_out.write(new_header + cdna_seq_index[transcript_id]) + if transcript_id in amino_acid_seq_index: + amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) + + +def copy_raw_files(raw_file: Path, destination_file: Path) -> 
None: + """Copy file in a new destination + + Args: + raw_file (Path): File to copy + destination_file (Path): New destination + """ + if raw_file.exists(): + try: + shutil.copy(raw_file, destination_file) + logger.info("Copied %s to %s", raw_file, destination_file) + except Exception as e: # pylint:disable=broad-exception-caught + logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) + else: + logger.info("No file found at %s, skipping", raw_file) + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Genblast arguments") + + parser.add_argument("--main_output_dir", required=True, help="Output directory path") + parser.add_argument("--genome_file", required=True, help="Path for the genome file") + parser.add_argument("--seq_region_names", type=List[str], help="List of seq region names") + parser.add_argument("--diamond_validation_db", required=True, help="Diamond validation db file path") + parser.add_argument( + "--validation_type", type=int, choices=["relaxed", "moderate"], help="Type of validation" + ) + parser.add_argument( + "--diamond_bin", + help="DIAMOND executable path", + ) + parser.add_argument( + "--cpc2_bin", + help="CPC2 executable path", + ) + parser.add_argument( + "--rnasamba_bin", + help="RNAsamba executable path", + ) + parser.add_argument( + "--rnasamba_weights", + help="Rnasamba weights path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + parser.add_argument( + "--min_single_exon_pep_length", type=int, default=100, help="Minimum peptide length for single exon." + ) + parser.add_argument( + "--min_multi_exon_pep_length", type=int, default=75, help="Minimum peptide length for multiple exons." + ) + parser.add_argument( + "--min_single_source_probability", + type=float, + default=0.8, + help="Minimum probability for single source.", + ) + parser.add_argument( + "--min_single_exon_probability", + type=float, + default=0.9, + help="Minimum average probability for single exon.", + ) + return parser.parse_args() + + +def main(): + """Finalisation's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "finalisation.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_finalise_geneset( + Path(args.main_output_dir), + Path(args.genome_file), + args.seq_region_names, + Path(args.diamond_validation_db), + args.validation_type, + args.num_threads, + Path(args.cpc2_bin), + Path(args.rnasamba_bin), + Path(args.rnasamba_weights), + Path(args.diamond_bin), + args.min_single_exon_pep_length, + args.min_multi_exon_pep_length, + args.min_single_source_probability, + args.min_single_exon_probability, + ) + + +if __name__ == "__main__": + main() diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py deleted file mode 100644 index 178fbb4..0000000 --- a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py +++ /dev/null @@ -1,664 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Set of functions used to collect the results of the subpipelines and generate the final output -This logic will be revised in the nextflow pipeline -""" -__all__ = ["run_finalise_geneset"] - -import logging -import logging.config -import multiprocessing -import os -from pathlib import Path -import re -import subprocess -from typing import List -import shutil - -from ensembl.tools.anno.utils._utils import ( - check_file, - get_seq_region_length, - fasta_to_dict, - check_exe, - create_dir, - check_gtf_content, - split_protein_file -) - -logger = logging.getLogger(__name__) - -def run_finalise_geneset( - main_script_dir: Path, - main_output_dir: Path, - genome_file: Path, - seq_region_names: List[str], - diamond_validation_db :Path, - validation_type: str = "relaxed", - num_threads: int = 1, - cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), - rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), - rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - -): - """Collect results - - :param main_script_dir: - :type main_script_dir: Path - :param main_output_dir: _description_ - :type main_output_dir: Path - :param genome_file: Path for genome fasta file - :type genome_file: Path - :param seq_region_names: list of seq region names - :type seq_region_names: List[str] - :param diamond_validation_db: Path for diamond validation db - :type diamond_validation_db: Path - :param validation_type: type of validation - :type validation_type: str, default "relaxed" - :param num_threads: Num of threads. Defaults to 1. 
- :type num_threads: int, default 1 - :param cpc2_bin: CPC2 software path - :type cpc2_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" - :param rnasamba_bin: RNASamba software path - :type rnasamba_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" - :param rnasamba_weights: RNASamba weights path - :type rnasamba_weights: Path, default "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" - - """ - - - final_annotation_dir = create_dir(main_output_dir, "annotation_output") - region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") - final_region_annotation_dir = create_dir( - final_annotation_dir, "final_region_gtfs" - ) - utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") - validation_dir = create_dir(final_annotation_dir, "cds_validation") - seq_region_lengths = get_seq_region_length(genome_file, 0) - output_file = final_annotation_dir / "annotation.gtf" - if output_file.exists(): - transcript_count = check_gtf_content(output_file, "transcript") - if transcript_count > 0: - logger.info("Final gtf file exists, skipping analysis") - return - - protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" - minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" - stringtie_annotation_raw = main_output_dir / "stringtie_output" / "annotation.gtf" - scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" - busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" - transcript_selector_script = main_output_dir / "support_scripts_perl" / "select_best_transcripts.pl" - finalise_geneset_script = main_output_dir / "support_scripts_perl" / "finalise_geneset.pl" - clean_geneset_script = main_output_dir / "support_scripts_perl" / "clean_geneset.pl" - clean_utrs_script = main_output_dir / "support_scripts_perl" / "clean_utrs_and_lncRNAs.pl" - gtf_to_seq_script = main_output_dir / "support_scripts_perl" / "gtf_to_seq.pl" - - transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" - - with transcriptomic_annotation_raw.open("w") as file_out: - for transcriptomic_file in [ - minimap2_annotation_raw, - scallop_annotation_raw, - stringtie_annotation_raw, - ]: - if not transcriptomic_file.exists(): - logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) - continue - - with transcriptomic_file.open("r") as file_in: - for line in file_in: - file_out.write(line) - - # Copy the raw files into the annotation dir - copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") - copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") - - #select best transcript - generic_select_cmd = [ - "perl", - str(transcript_selector_script), - "-genome_file", - str(genome_file), - ] - pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with - for seq_region_name in seq_region_names: - # The selection script needs different params depending on - # whether the seqs are from transcriptomic data or not - region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" - transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" - busco_region_gtf_path = region_annotation_dir / f"{region_details}.busco.gtf" - protein_region_gtf_path = region_annotation_dir / 
f"{region_details}.protein.gtf" - if transcriptomic_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(transcriptomic_annotation_raw), - "-output_gtf_file", - str(transcriptomic_region_gtf_path), - "-cds_search", - "-final_biotype", - "transcriptomic", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - if busco_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(busco_annotation_raw), - "-output_gtf_file", - str(busco_region_gtf_path), - "-all_cds_exons", - "-final_biotype", - "busco", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - if protein_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(protein_annotation_raw), - "-output_gtf_file", - str(protein_region_gtf_path), - "-clean_transcripts", - "-all_cds_exons", - "-final_biotype", - "protein", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - - pool.close() - pool.join() - - # At this point we will have the region files for each seq region and we can merge them - merge_finalise_output_files( - final_annotation_dir, - region_annotation_dir, - ".trans.gtf", - "transcriptomic", - ) - merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, ".busco.gtf", "busco" - ) - merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, ".protein.gtf", "protein" - ) - - # Create a single GTF file with all the selected transcripts - # now that they have proper ids - fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" - # Collect all *_sel.gtf files - gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) - - # Ensure there are files to merge - if not gtf_files: - raise FileNotFoundError("No *_sel.gtf files found in %s",final_annotation_dir) - - - try: - with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: - merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] - subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) - logger.info("Merged GTF files into %s", fully_merged_gtf_path) - except Exception as e: - print("An error occurred while merging GTF files: %s",e) - - # Now collapse the gene set - generic_finalise_cmd = [ - "perl", - str(finalise_geneset_script), - "-genome_file", - str(genome_file), - ] - - pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - for seq_region_name in seq_region_names: - region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" - final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" - cmd = generic_finalise_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(fully_merged_gtf_path), - "-output_gtf_file", - str(final_region_gtf_path), - ] - ) - pool.apply_async(run_command, args=(cmd,)) - - pool.close() - pool.join() - - merge_finalise_output_files( - final_annotation_dir, - final_region_annotation_dir, - ".final.gtf", - "prevalidation", - ) - merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" - merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" - merged_amino_acid_file = final_annotation_dir /"prevalidation_sel.prot.fa" - #validation - updated_gtf_lines = validate_coding_transcripts( - merged_cdna_file, - merged_amino_acid_file, - validation_dir, - validation_type, - diamond_validation_db, - 
merged_gtf_file, - num_threads, - cpc2_bin, - rnasamba_bin, - rnasamba_weights - ) - - #### - postvalidation_gtf_file = os.path.join(final_annotation_dir, ("postvalidation.gtf")) - file_out = open(postvalidation_gtf_file, "w+") - for line in updated_gtf_lines: - file_out.write(line) - file_out.close() - - cleaned_initial_gtf_file = os.path.join(final_annotation_dir, ("cleaned_pre_utr.gtf")) - cleaned_utr_gtf_file = os.path.join(final_annotation_dir, ("annotation.gtf")) - - logger.info("Cleaning initial set") - cleaning_cmd = [ - "perl", - clean_geneset_script, - "-genome_file", - genome_file, - "-gtf_file", - postvalidation_gtf_file, - "-output_gtf_file", - cleaned_initial_gtf_file, - ] - logger.info(" ".join(cleaning_cmd)) - subprocess.run(cleaning_cmd) - - # Clean UTRs - generic_clean_utrs_cmd = [ - "perl", - clean_utrs_script, - "-genome_file", - genome_file, - "-input_gtf_file", - cleaned_initial_gtf_file, - ] - pool = multiprocessing.Pool(int(num_threads)) - for seq_region_name in seq_region_names: - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - utr_region_gtf_path = os.path.join( - utr_region_annotation_dir, (region_details + ".utr.gtf") - ) - - cmd = generic_clean_utrs_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - cleaned_initial_gtf_file, - "-output_gtf_file", - utr_region_gtf_path, - ] - ) - pool.apply_async(multiprocess_generic, args=(cmd,)) - pool.close() - pool.join() - - merge_finalise_output_files( - final_annotation_dir, - utr_region_annotation_dir, - ".utr.gtf", - "annotation", - ) - subprocess.run( - [ - "mv", - os.path.join(final_annotation_dir, "annotation_sel.gtf"), - cleaned_utr_gtf_file, - ] - ) - - logger.info("Dumping transcript and translation sequences") - dumping_cmd = [ - "perl", - gtf_to_seq_script, - "-genome_file", - genome_file, - "-gtf_file", - cleaned_utr_gtf_file, - ] - logger.info(" ".join(dumping_cmd)) - subprocess.run(dumping_cmd) - - logger.info("Finished creating gene set") - - -def validate_coding_transcripts( - cdna_file:Path, - amino_acid_file:Path, - validation_dir:Path, - validation_type:str, - diamond_validation_db:Path, - gtf_file:Path, - num_threads:int, - cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), - rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), - rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - -): - """Run validation on protein coding transcript - using CPC2, RNASamba, Diamond - - Args: - cdna_file (Path): Input cdna file - amino_acid_file (Path): Input protein file - validation_dir (Path): Validation output directory - validation_type (str): _description_ - diamond_validation_db (Path): _description_ - gtf_file (Path): _description_ - num_threads (int): number of threads - - Returns: - _type_: _description_ - """ - - logger.info("Running CDS validation with RNAsamba and CPC2") - #rnasamba_weights = config["rnasamba"]["weights"] - - - - - - - - - logger.info("read results") - rnasamba_results = read_rnasamba_results(rnasamba_output_path) - cpc2_results = read_cpc2_results(cpc2_output_path) - combined_results = combine_results(rnasamba_results, cpc2_results, diamond_results) - logger.info("read gtf genes") - parsed_gtf_genes = read_gtf_genes(gtf_file) - updated_gtf_lines = update_gtf_genes( - parsed_gtf_genes, 
combined_results, validation_type - ) - - return updated_gtf_lines - - - - -def combine_results(rnasamba_results, cpc2_results, diamond_results): - - transcript_ids = {} - - for result in rnasamba_results: - transcript_id = result[0] - coding_probability = result[1] - coding_potential = result[2] - - if transcript_id not in transcript_ids: - transcript_ids[transcript_id] = [ - coding_probability, - coding_potential, - ] - - for result in cpc2_results: - transcript_id = result[0] - coding_probability = result[1] - coding_potential = result[2] - transcript_length = result[3] - peptide_length = result[4] - transcript_ids[transcript_id].extend( - [ - coding_probability, - coding_potential, - transcript_length, - peptide_length, - ] - ) - - if diamond_results is not None: - for result in diamond_results: - transcript_id = result[0] - e_value = result[1] - # There seems to be an issue where there are a small number - # of sequences that don't make it into the cpc2/rnasamba oqutput - # Should code in a system for this, but it would be good to - # understand why it happens to begin with. Seems to be the same - # number of missing seqs in both, so maybe a shared cut-off - if transcript_id in transcript_ids: - transcript_ids[transcript_id].extend([e_value]) - - return transcript_ids - -def merge_finalise_output_files( - final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str -)-> None: - """Merge the resulting gene models from all the seq regions for the specified extension - - Args: - final_annotation_dir (Path): Output directory - region_annotation_dir (Path): Input directory - extension (str): File extension - id_label (str): Type of data or process to merge - """ - gtf_files = list(region_annotation_dir.glob(f"*{extension}")) - merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" - merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" - merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" - - # The below is not great, it's a bit messy because there might be - # some cases where there aren't translations. So it's not as - # straightforward as reading the records across all three files - # in parallel. The solution is to just load the seqs into - # memory and index them on the current header, which should - # correspond to a transcript/gene id in the GTF. 
When writing the - # results into the single merged files the ids will be updated to - # be unique and consistent across the header, three file types - - gene_id_counter = 0 - transcript_id_counter = 0 - - with merged_gtf_file.open("w") as gtf_out, \ - merged_cdna_file.open("w") as cdna_out, \ - merged_amino_acid_file.open("w") as amino_acid_out: - - for gtf_file in gtf_files: - logger.info("GTF file: %s", gtf_file) - cdna_seq_index = {} - amino_acid_seq_index = {} - cdna_file = gtf_file.with_suffix(".cdna") - amino_acid_file = gtf_file.with_suffix(".prot") - if cdna_file.exists(): - with cdna_file.open() as cdna_in: - cdna_seq_index = fasta_to_dict(cdna_in.readlines()) - - if amino_acid_file.exists(): - with amino_acid_file.open() as amino_acid_in: - amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) - current_gene_id = "" - with gtf_file.open() as gtf_in: - for line in gtf_in: - if re.search(r"^#", line): - #if line.startswith("#"): - continue - eles = line.split("\t") - if len(eles) != 9: - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - if match and eles[2] == "transcript": - transcript_id_counter += 1 - gene_id = match.group(1) - transcript_id = match.group(2) - - if not current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - if gene_id != current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - new_gene_id = f"{id_label}_{gene_id_counter}" - new_transcript_id = f"{id_label}_{transcript_id_counter}" - - line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) - line = re.sub(f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line) - gtf_out.write(line) - if eles[2] == "transcript": - new_header = f">{new_transcript_id}\n" - cdna_out.write(new_header + cdna_seq_index[transcript_id]) - if transcript_id in amino_acid_seq_index: - amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) - - -def multiprocess_finalise_geneset(cmd): - - print(" ".join(cmd)) - subprocess.run(cmd) - - - -def run_command(cmd)->None: - """Submit command in multiprocessing action - - Args: - cmd (List[str]): command to execute - """ - try: - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Command executed successfully: %s", cmd) - logger.info("Output: %s", result.stdout) - except subprocess.CalledProcessError as e: - logger.error("Command failed: %s", cmd) - logger.error("Error: %s", e.stderr) - -def copy_raw_files(raw_file: Path, destination_file :Path)-> None: - """Copy file in a new destination - - Args: - raw_file (Path): File to copy - destination_file (Path): New destination - """ - if raw_file.exists(): - try: - shutil.copy(raw_file, destination_file) - logger.info("Copied %s to %s", raw_file, destination_file) - except Exception as e: - logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) - else: - logger.info("No file found at %s, skipping", raw_file) - - - - - - - - - - - - - - - - - - -def parse_args(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser(description="Genblast arguments") - parser.add_argument( - "--masked_genome_file", required=True, help="Masked genome file path" - ) - parser.add_argument("--output_dir", required=True, help="Output directory path") - parser.add_argument("--protein_file", required=True, help="Path for the protein dataset") - parser.add_argument( - "--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period" - ) - parser.add_argument( 
- "--max_intron_length", type=int, required=True, help="Maximum intron length" - ) - parser.add_argument( - "--genblast_bin", - default="genblast", - help="Genblast executable path", - ) - parser.add_argument( - "--convert2blastmask_bin", - default="convert2blastmask", - help="convert2blastmask executable path", - ) - parser.add_argument( - "--makeblastdb_bin", - default="makeblastdb", - help="makeblastdb executable path", - ) - parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") - parser.add_argument( - "--protein_set", - required=True, - choices=["uniprot", "orthodb"], - help="Protein set [uniprot, orthodb]", - ) - return parser.parse_args() - -def main(): - """Genblast's entry-point.""" - args = parse_args() - - log_file_path = create_dir(args.output_dir, "log") / "genblast.log" - loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" - - logging.config.fileConfig( - loginipath, - defaults={"logfilename": str(log_file_path)}, - disable_existing_loggers=False, - ) - - run_genblast( - Path(args.masked_genome_file), - Path(args.output_dir), - Path(args.protein_file), - args.max_intron_length, - args.genblast_timeout_secs, - Path(args.genblast_bin), - Path(args.convert2blastmask_bin), - Path(args.makeblastdb_bin), - args.num_threads, - args.protein_set, - ) - -if __name__ == "__main__": - main() diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index 8f882a5..b4bbc72 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -16,7 +16,6 @@ import errno import io -import json import logging import os from os import PathLike @@ -25,9 +24,8 @@ import re import subprocess import shutil -import glob import tempfile -from typing import Dict, List +from typing import Dict, List, Optional logger = logging.getLogger("__name__." + __name__) @@ -128,7 +126,7 @@ def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dic if len(current_seq) > min_seq_length: seq_region_to_length[current_header] = len(current_seq) - #logging.info("length of the seq region %s", seq_region_to_length) + # logging.info("length of the seq region %s", seq_region_to_length) return seq_region_to_length @@ -168,7 +166,7 @@ def get_slice_id( if (end - start + 1) >= min_length: slice_ids_per_region.append([seq_region, start, end]) start = end + 1 - #logging.info("list of slice ids %s", str(slice_ids_per_region)) + # logging.info("list of slice ids %s", str(slice_ids_per_region)) return slice_ids_per_region @@ -254,9 +252,7 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches # Example key KS8000.rs1.re1000000.1 gene_id_slice = input_file.name + "." + str(current_gene_id) # Example key KS8000.rs1.re1000000.1.transcript.1 - transcript_id_slice = ( - gene_id_slice + "." + str(current_transcript_id) - ) + transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) # If there is no existing entry, the gene key will be added # and the gene counter is incremented. 
# gene_id "gene1"; transcript_id "gene1.t1" @@ -281,16 +277,12 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches + ".t" + str(transcript_id_count_gene_id[gene_id_slice]) ) - gene_transcript_id_collection[ - transcript_id_slice - ] = new_transcript_id + gene_transcript_id_collection[transcript_id_slice] = new_transcript_id transcript_id_count_gene_id[gene_id_slice] += 1 else: # If a transcript of the same set is already present, # the new id with the incremented counter is added - new_transcript_id = gene_transcript_id_collection[ - transcript_id_slice - ] + new_transcript_id = gene_transcript_id_collection[transcript_id_slice] attribs = re.sub( full_transcript_id_string, 'transcript_id "' + new_transcript_id + '"', @@ -303,13 +295,15 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches else: if new_id_prefix == "repeatmask": match_feature_type = re.search( - r"(" + feature_id_label + "\d+)", + r"(" + feature_id_label + r"\d+)", line, ) else: - match_feature_type = re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', - r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", - line, + match_feature_type = ( + re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, + ) ) if match_feature_type: full_feature_id = match_feature_type.group(1) @@ -327,6 +321,8 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches "Feature type not recognised, will skip. Feature type: %s", values[2], ) + + def get_sequence( seq_region: str, start: int, @@ -355,9 +351,7 @@ def get_sequence( "get_sequence %s", f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", ) - with tempfile.NamedTemporaryFile( - mode="w+t", delete=False, dir=str(output_dir) - ) as bed_temp_file: + with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") bed_temp_file.close() bedtools_command = [ @@ -368,10 +362,10 @@ def get_sequence( "-bed", str(bed_temp_file.name), ] - bedtools_output = subprocess.Popen(bedtools_command, stdout=subprocess.PIPE) - for idx, line in enumerate( - io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8") # type: ignore - ): + bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with + bedtools_command, stdout=subprocess.PIPE + ) # pylint:disable=consider-using-with + for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore if idx == 1: if strand == 1: sequence = line.rstrip() @@ -404,49 +398,46 @@ def check_file(file_path: Path) -> None: if not file_path.is_file(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) + def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: """ Save fastq sequences in a dictionary fasta_lines: List[str]: List of sequences """ - index = {} - header = None - seq = [] + index: Dict[str, str] = {} + header: Optional[str] = None + seq: List[str] = [] for line in fasta_lines: line = line.strip() - if line.startswith(">"): + header_match = re.match(r">(.+)\n$", line) + if header_match: if header: - index[header] = ''.join(seq) - header_match = re.match(r">(.+)", line) - if header_match: - header = header_match.group(1) - seq = [] - else: - raise ValueError(f"Invalid FASTA header line: {line}") + index[header] = "".join(seq) + header = None + header = header_match.group(1) + seq = [] else: seq.append(line) - + # Add the last sequence if header: - index[header] = 
''.join(seq) - - return index -""" -def fasta_to_dict(fasta_lines:List[str]): - index = {} - for line in fasta_lines: - #it = iter(fasta_lines) - #for header in it: - match = re.search(r">(.+)\n$", line) - header = match.group(1) - seq = next(line) - index[header] = seq + index[header] = "".join(seq) return index -""" -def split_protein_file( - protein_dataset: Path, output_dir: Path, batch_size: int = 20 -) -> List: + +# def fasta_to_dict(fasta_lines:List[str]): +# index = {} +# for line in fasta_lines: +# #it = iter(fasta_lines) +# #for header in it: +# match = re.search(r">(.+)\n$", line) +# header = match.group(1) +# seq = next(line) +# index[header] = seq +# return index + + +def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List: """ The protein dataset file is splitted by a number of sequence equals to the batch_size in batch files stored in 10 output directories. @@ -459,7 +450,7 @@ def split_protein_file( for i in range(0, 10): create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset,"r", encoding="utf8") as file_in: + with open(protein_dataset, "r", encoding="utf8") as file_in: seq_count = 0 batch_count = 0 current_record = "" @@ -494,950 +485,69 @@ def split_protein_file( batched_protein_files.append(batch_file) return batched_protein_files -def load_results_to_ensembl_db( - main_script_dir, - load_to_ensembl_db, - genome_file, - main_output_dir, - db_details, - num_threads, -): - db_loading_script = os.path.join( - main_script_dir, "support_scripts_perl", "load_gtf_ensembl.pl" - ) - db_loading_dir = create_dir(main_output_dir, "db_loading") - - # Should collapse this into a function - annotation_results_gtf_file = os.path.join( - main_output_dir, "annotation_output", "annotation.gtf" - ) - if os.path.exists(annotation_results_gtf_file): - logger.info("Loading main geneset to db") - batch_size = 200 - load_type = "gene" - analysis_name = "ensembl" - gtf_records = batch_gtf_records( - annotation_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find the main gene annotation file, so not loading. Path checked:\n" - + annotation_results_gtf_file - ) - - rfam_results_gtf_file = os.path.join(main_output_dir, "rfam_output", "annotation.gtf") - if os.path.exists(rfam_results_gtf_file): - logger.info("Loading Rfam-based sncRNA genes to db") - batch_size = 500 - load_type = "gene" - analysis_name = "ncrna" - gtf_records = batch_gtf_records( - rfam_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an Rfam annotation file, so not loading. 
Path checked:\n" - + rfam_results_gtf_file - ) - - trnascan_results_gtf_file = os.path.join( - main_output_dir, "trnascan_output", "annotation.gtf" - ) - if os.path.exists(trnascan_results_gtf_file): - logger.info("Loading tRNAScan-SE tRNA genes to db") - batch_size = 500 - load_type = "gene" - analysis_name = "ncrna" - gtf_records = batch_gtf_records( - trnascan_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an tRNAScan-SE annotation file, so not loading. Path checked:\n" - + trnascan_results_gtf_file - ) - - dust_results_gtf_file = os.path.join(main_output_dir, "dust_output", "annotation.gtf") - if os.path.exists(dust_results_gtf_file): - logger.info("Loading Dust repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "dust" - gtf_records = batch_gtf_records( - dust_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a Dust annotation file, so not loading. Path checked:\n" - + dust_results_gtf_file - ) - - red_results_gtf_file = os.path.join(main_output_dir, "red_output", "annotation.gtf") - if os.path.exists(red_results_gtf_file): - logger.info("Loading Red repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "repeatdetector" - gtf_records = batch_gtf_records( - red_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a Red annotation file, so not loading. Path checked:\n" - + red_results_gtf_file - ) - trf_results_gtf_file = os.path.join(main_output_dir, "trf_output", "annotation.gtf") - if os.path.exists(trf_results_gtf_file): - logger.info("Loading TRF repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "trf" - gtf_records = batch_gtf_records( - trf_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a TRF annotation file, so not loading. Path checked:\n" - + trf_results_gtf_file - ) +def run_command(cmd) -> None: + """Submit command in multiprocessing action - cpg_results_gtf_file = os.path.join(main_output_dir, "cpg_output", "annotation.gtf") - if os.path.exists(cpg_results_gtf_file): - logger.info("Loading CpG islands to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "cpg" - gtf_records = batch_gtf_records( - cpg_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a CpG annotation file, so not loading. 
Path checked:\n" - + cpg_results_gtf_file - ) + Args: + cmd (List[str]): command to execute + """ + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Command executed successfully: %s", cmd) + logger.info("Output: %s", result.stdout) + except subprocess.CalledProcessError as e: + logger.error("Command failed: %s", cmd) + logger.error("Error: %s", e.stderr) - eponine_results_gtf_file = os.path.join( - main_output_dir, "eponine_output", "annotation.gtf" - ) - if os.path.exists(eponine_results_gtf_file): - logger.info("Loading Eponine repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "eponine" - gtf_records = batch_gtf_records( - eponine_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an Eponine annotation file, so not loading. Path checked:\n" - + eponine_results_gtf_file - ) - logger.info("Finished loading records to db") - - -def generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, -): - pool = multiprocessing.Pool(int(num_threads)) - for record_batch in gtf_records: - pool.apply_async( - multiprocess_load_records_to_ensembl_db, - args=( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - record_batch, - ), - ) +def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]: + """ + Parses a GTF file to extract gene, transcript, and exon information. - pool.close() - pool.join() - - -def multiprocess_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - output_dir, - load_type, - analysis_name, - record_batch, -): - with tempfile.NamedTemporaryFile( - mode="w+t", delete=False, dir=output_dir - ) as gtf_temp_out: - for line in record_batch: - gtf_temp_out.writelines(line) - gtf_temp_file_path = gtf_temp_out.name - - (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",") - - loading_cmd = [ - "perl", - db_loading_script, - "-genome_file", - genome_file, - "-dbname", - db_name, - "-host", - db_host, - "-port", - str(db_port), - "-user", - db_user, - "-pass", - db_pass, - "-gtf_file", - gtf_temp_file_path, - "-analysis_name", - analysis_name, - "-load_type", - load_type, - ] + Args: + gtf_file: Path to the GTF file. - if load_type == "gene" and analysis_name == "ensembl": - loading_cmd.extend( - [ - "-protein_coding_biotype", - "anno_protein_coding", - "-non_coding_biotype", - "anno_lncRNA", - ] - ) + Returns: + A dictionary containing gene information, organized by gene_id, transcript_id, and features. 
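+
+    Example:
+        Minimal usage sketch; "annotation.gtf" is a hypothetical input path and the
+        nested keys mirror how update_gtf_genes consumes the parsed structure:
+
+            parsed = read_gtf_genes(Path("annotation.gtf"))
+            for gene_id, transcripts in parsed.items():
+                for transcript_id, data in transcripts.items():
+                    transcript_lines = data["transcript"]
+                    exon_lines = data["exons"]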
+ """ + gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} - if load_to_ensembl_db == "single_transcript_genes": - loading_cmd.append("-make_single_transcript_genes") - - logger.info(" ".join(loading_cmd)) - subprocess.run(loading_cmd) - gtf_temp_out.close() - os.remove(gtf_temp_file_path) # doesn't seem to be working - logger.info("Finished: " + gtf_temp_file_path) - gc.collect() - - -def batch_gtf_records(input_gtf_file, batch_size, output_dir, record_type): - records = [] - gtf_in = open(input_gtf_file) - if record_type == "gene": - # NOTE that the neverending variations on GTF reading/writing/merging - # is becoming very messy - # need to create a set of utility methods outside of this script - # This one assumes the file has unique ids for the parent features. - # It then batches them into - # sets of records based on the batch size passed in - record_counter = 0 - current_record_batch = [] - current_gene_id = "" - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() + with open(gtf_file, "r") as gtf_in: + for line in gtf_in: + eles = line.strip().split("\t") + if len(eles) != 9: continue - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) + if not match: continue - match = re.search(r'gene_id "([^"]+)"', line) gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] - if not current_gene_id: - record_counter += 1 - current_gene_id = gene_id - - if not gene_id == current_gene_id: - record_counter += 1 - if record_counter % batch_size == 0: - records.append(current_record_batch) - current_record_batch = [] - current_gene_id = gene_id - - current_record_batch.append(line) - line = gtf_in.readline() - - records.append(current_record_batch) - - elif record_type == "single_line_feature": - record_counter = 0 - current_record_batch = [] - current_gene_id = "" - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - record_counter += 1 - - if record_counter % batch_size == 0: - records.append(current_record_batch) - current_record_batch = [] - - current_record_batch.append(line) - line = gtf_in.readline() - - records.append(current_record_batch) - - gtf_in.close() + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} - return records - - -def run_find_orfs(genome_file, main_output_dir): - min_orf_length = 600 - - orf_output_dir = create_dir(main_output_dir, "orf_output") - seq_region_lengths = get_seq_region_lengths(genome_file, 5000) - for region_name in seq_region_lengths: - region_length = seq_region_lengths[region_name] - seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) - for phase in range(0, 6): - find_orf_phased_region( - region_name, seq, phase, min_orf_length, orf_output_dir - ) - - -def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): - current_index = phase - orf_counter = 1 - if phase > 2: - seq = reverse_complement(seq) - current_index = current_index % 3 - - orf_file_path = os.path.join( - orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa") - ) - orf_out = open(orf_file_path, "w+") - - while current_index < len(seq): - codon = seq[current_index : current_index + 3] - if codon == "ATG": - orf_seq = codon - for j in range(current_index + 3, len(seq), 3): - next_codon = seq[j : j + 
3] - if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": - orf_seq += next_codon - if len(orf_seq) >= min_orf_length: - orf_out.write( - ">" - + region_name - + "_phase" - + str(phase) - + "_orf" - + str(orf_counter) - + "\n" - ) - orf_out.write(orf_seq + "\n") - orf_counter += 1 - orf_seq = "" - break - - # If there's another met in phase, then put i to the start of - # the codon after j so that only the longest ORF is found - if next_codon == "ATG": - current_index = j + 3 - orf_seq += next_codon - current_index += 3 - orf_out.close() - -def bed_to_gff(input_dir, hints_file): - gff_out = open(hints_file, "w+") - exons_dict = {} - for bed_file in glob.glob(input_dir + "/*.bed"): - logger.info("Processing file for hints:") - logger.info(bed_file) - bed_in = open(bed_file) - bed_lines = bed_in.readlines() - for line in bed_lines: - line = line.rstrip() - elements = line.split("\t") - seq_region_name = elements[0] - offset = int(elements[1]) - hit_name = elements[3] - strand = elements[5] - block_sizes = elements[10].split(",") - block_sizes = list(filter(None, block_sizes)) - block_starts = elements[11].split(",") - block_starts = list(filter(None, block_starts)) - exons = bed_block_to_exons(block_sizes, block_starts, offset) - for i, element in enumerate(exons): - exon_coords = exons[i] - exon_key = ( - seq_region_name - + ":" - + exon_coords[0] - + ":" - + exon_coords[1] - + ":" - + strand - ) - if exon_key in exons_dict: - exons_dict[exon_key][5] += 1 - else: - gff_list = [ - seq_region_name, - "CDNA", - "exon", - exon_coords[0], - exon_coords[1], - 1.0, - strand, - ".", - ] - exons_dict[exon_key] = gff_list - - for exon_key, gff_list in exons_dict.items(): - gff_list[5] = str(gff_list[5]) - gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" - gff_out.write(gff_line) - - gff_out.close() - - sorted_hints_out = open((hints_file + ".srt"), "w+") - subprocess.run( - ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], - stdout=sorted_hints_out, - ) - sorted_hints_out.close() - -def splice_junction_to_gff(input_dir, hints_file): - sjf_out = open(hints_file, "w+") - - for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): - sjf_in = open(sj_tab_file) - sjf_lines = sjf_in.readlines() - for line in sjf_lines: - elements = line.split("\t") - strand = "+" - # If the strand is undefined then skip, Augustus expects a strand - if elements[3] == "0": - continue - elif elements[3] == "2": - strand = "-" - - junction_length = int(elements[2]) - int(elements[1]) + 1 - if junction_length < 100: - continue - - if not elements[4] and elements[7] < 10: - continue - - # For the moment treat multimapping and single mapping things as a combined score - score = float(elements[6]) + float(elements[7]) - score = str(score) - output_line = [ - elements[0], - "RNASEQ", - "intron", - elements[1], - elements[2], - score, - strand, - ".", - ("src=W;mul=" + score + ";"), - ] - sjf_out.write("\t".join(output_line) + "\n") - - sjf_out.close() - - -def split_genome(genome_file, target_dir, min_seq_length): - # This is the lazy initial way of just splitting into a dir - # of files based on the toplevel sequence with a min sequence length filter - # There are a couple of obvious improvements: - # 1) Instead of making files for all seqs, just process N seqs - # parallel, where N = num_threads. 
Then you could clean up the seq file - # after each seq finishes, thus avoiding potentially - # having thousands of file in a dir - # 2) Split the seq into even slices and process these in parallel - # (which the same cleanup as in 1). For sequences smaller than the - # target slice size, bundle them up together into a single file. Vastly - # more complex, partially implemented in the splice_genome method - # Allows for more consistency with parallelisation (since there should - # be no large outliers). But require a mapping strategy for the - # coords and sequence names and all the hints will need to be adjusted - current_header = "" - current_seq = "" - - file_in = open(genome_file) - line = file_in.readline() - while line: - match = re.search(r">(.+)$", line) - if match and current_header: - if len(current_seq) > min_seq_length: - file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) - if not os.path.exists(file_out_name): - file_out = open(file_out_name, "w+") - file_out.write(">" + current_header + "\n" + current_seq + "\n") - file_out.close() - - else: - print( - "Found an existing split file, so will not overwrite. File found:" - ) - print(file_out_name) - - current_seq = "" - current_header = match.group(1) - elif match: - current_header = match.group(1) - else: - current_seq += line.rstrip() - - line = file_in.readline() - - if len(current_seq) > min_seq_length: - file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) - if not os.path.exists(file_out_name): - file_out = open(file_out_name, "w+") - file_out.write(">" + current_header + "\n" + current_seq + "\n") - file_out.close() - - else: - logger.info( - "Found an existing split file, so will not overwrite. File found:" - ) - logger.info(file_out_name) - - file_in.close() - - -def update_gtf_genes(parsed_gtf_genes, combined_results, validation_type): - output_lines = [] - - for gene_id in parsed_gtf_genes.keys(): - transcript_ids = parsed_gtf_genes[gene_id].keys() - for transcript_id in transcript_ids: - transcript_line = parsed_gtf_genes[gene_id][transcript_id]["transcript"] - single_cds_exon_transcript = 0 - translation_match = re.search( - r'; translation_coords "([^"]+)";', transcript_line - ) - if translation_match: - translation_coords = translation_match.group(1) - translation_coords_list = translation_coords.split(":") - # If the start exon coords of both exons are the same, - # then it's the same exon and thus a single exon cds - if translation_coords_list[0] == translation_coords_list[3]: - single_cds_exon_transcript = 1 - - exon_lines = parsed_gtf_genes[gene_id][transcript_id]["exons"] - validation_results = combined_results[transcript_id] - rnasamba_coding_probability = float(validation_results[0]) - rnasamba_coding_potential = validation_results[1] - cpc2_coding_probability = float(validation_results[2]) - cpc2_coding_potential = validation_results[3] - transcript_length = int(validation_results[4]) - peptide_length = int(validation_results[5]) - diamond_e_value = None - if len(validation_results) == 7: - diamond_e_value = validation_results[6] - - avg_coding_probability = ( - rnasamba_coding_probability + cpc2_coding_probability - ) / 2 - max_coding_probability = max( - rnasamba_coding_probability, cpc2_coding_probability - ) - - match = re.search(r'; biotype "([^"]+)";', transcript_line) - biotype = match.group(1) - if biotype == "busco" or biotype == "protein": - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) 
- output_lines.append(transcript_line) - output_lines.extend(exon_lines) - continue - - min_single_exon_pep_length = 100 - min_multi_exon_pep_length = 75 - min_single_source_probability = 0.8 - min_single_exon_probability = 0.9 - - # Note that the below looks at validating things - # under different levels of strictness - # There are a few different continue statements, - # where transcripts will be skipped resulting - # in a smaller post validation file. It mainly - # removes single coding exon genes with no real - # support or for multi-exon lncRNAs that are less than 200bp long - if single_cds_exon_transcript == 1 and validation_type == "relaxed": - if diamond_e_value is not None: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - and peptide_length >= min_single_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - or cpc2_coding_potential == "coding" - ) - and peptide_length >= min_single_exon_pep_length - and max_coding_probability >= min_single_source_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - else: - continue - elif single_cds_exon_transcript == 1 and validation_type == "moderate": - if ( - diamond_e_value is not None - and peptide_length >= min_single_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - ) - and peptide_length >= min_single_exon_pep_length - and avg_coding_probability >= min_single_exon_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - else: - continue - else: - if diamond_e_value is not None: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - and peptide_length >= min_multi_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - or cpc2_coding_potential == "coding" - ) - and peptide_length >= min_multi_exon_pep_length - and max_coding_probability >= min_single_source_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif transcript_length >= 200: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "lncRNA";', - transcript_line, - ) - transcript_line = re.sub( - ' translation_coords "[^"]+";', "", transcript_line - ) - else: - continue - - output_lines.append(transcript_line) - output_lines.extend(exon_lines) - - return output_lines - - -def read_gtf_genes(gtf_file): - gtf_genes = {} - gtf_in = open(gtf_file) - line = gtf_in.readline() - while line: - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - - if not match: - line = gtf_in.readline() - continue - - gene_id = match.group(1) - transcript_id = 
match.group(2) - feature_type = eles[2] - if gene_id not in gtf_genes: - gtf_genes[gene_id] = {} - if feature_type == "transcript": - gtf_genes[gene_id][transcript_id] = {} - gtf_genes[gene_id][transcript_id]["transcript"] = line - gtf_genes[gene_id][transcript_id]["exons"] = [] - elif feature_type == "exon": - gtf_genes[gene_id][transcript_id]["exons"].append(line) - line = gtf_in.readline() - gtf_in.close() + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} + elif feature_type == "exon": + if transcript_id in gtf_genes[gene_id]: + gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) return gtf_genes +def check_transcriptomic_output(main_output_dir: Path) -> None: + """Check across the various transcriptomic if there is a reasonable amount of data + Args: + main_output_dir (Path): Output dir path + Raises: + IOError: If no lines or fewer than the minimum expected lines are found in the output files. + """ - -# def merge_gtf_files(file_paths,id_label): - -# gtf_file_path = os.path.join(output_dir,'annotation.gtf') -# gtf_out = open(gtf_file_path,'w+') -# for gtf_file_path in gtf_files: -# gtf_file_name = os.path.basename(gtf_file_path) -# match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) -# start_offset = int(match.group(1)) -# gtf_in = open(gtf_file_path,'r') -# line = gtf_in.readline() -# while line: -# values = line.split("\t") -# if len(values) == 9 and (values[2] in feature_types): -# values[3] = str(int(values[3]) + (start_offset - 1)) -# values[4] = str(int(values[4]) + (start_offset - 1)) -# gtf_out.write("\t".join(values)) -# line = gtf_in.readline() -# gtf_in.close() -# gtf_out.close() - - -def multiprocess_generic(cmd): - print(" ".join(cmd)) - subprocess.run(cmd) - - -def seq_region_names(genome_file): - region_list = [] - - file_in = open(genome_file) - line = file_in.readline() - while line: - match = re.search(r">([^\s]+)", line) - if match: - region_name = match.group(1) - if region_name == "MT": - logger.info("Skipping region named MT") - line = file_in.readline() - continue - else: - region_list.append(match.group(1)) - line = file_in.readline() - - return region_list - - -def slice_genome(genome_file, target_dir, target_slice_size): - # The below is sort of tested - # Without the - target_seq_length = 50000000 - min_seq_length = 1000 - current_header = "" - current_seq = "" - seq_dict = {} - for line in seq: # pylint:disable=used-before-assignment - match = re.search(r">(.+)$", line) - if match and current_header: - seq_dict[current_header] = current_seq - current_seq = "" - current_header = match.group(1) - elif match: - current_header = match.group(1) - else: - current_seq += line.rstrip() - - seq_dict[current_header] = current_seq - - seq_buffer = 0 - file_number = 0 - file_name = "genome_file_" + str(file_number) - - for header in seq_dict: - seq_iterator = 0 - seq = seq_dict[header] - - while len(seq) > target_seq_length: - file_out = open(os.path.join(target_dir, file_name), "w+") - subseq = seq[0:target_seq_length] - file_out.write( - ">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n" - ) - file_out.close() - seq = seq[target_seq_length:] - seq_iterator += 1 - file_number += 1 - file_name = "genome_file_" + str(file_number) - - if len(seq) >= min_seq_length: - file_name = "genome_file_" + str(file_number) - file_out = open(os.path.join(file_name), "w+") - file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") - file_out.close() - file_number += 
1
-        file_name = "genome_file_" + str(file_number)
-
-
-def coallate_results(main_output_dir):
-    results_dir = create_dir(main_output_dir, "results")
-    output_dirs = [
-        "augustus_output",
-        "cpg_output",
-        "dust_output",
-        "eponine_output",
-        "red_output",
-        "rfam_output",
-        "trf_output",
-        "trnascan_output",
-    ]
-    for output_dir in output_dirs:
-        match = re.search(r"(.+)_output", output_dir)
-        result_type = match.group(1)
-        results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf")
-        copy_path = os.path.join(results_dir, (result_type + ".gtf"))
-        if os.path.exists(results_path):
-            cpy_cmd = ["cp", results_path, copy_path]
-            subprocess.run(cpy_cmd)
-
-
-
-def check_transcriptomic_output(main_output_dir):
-
-    # This will check across the various transcriptomic
-    # dirs and check there's actually some data
     transcriptomic_dirs = [
         "scallop_output",
         "stringtie_output",
@@ -1446,31 +556,23 @@ def check_transcriptomic_output(main_output_dir):
     total_lines = 0
     min_lines = 100000
     for transcriptomic_dir in transcriptomic_dirs:
-        full_file_path = os.path.join(
-            main_output_dir, transcriptomic_dir, "annotation.gtf"
-        )
-        if not os.path.exists(full_file_path):
+        full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf"
+        if not full_file_path.exists():
             logger.warning(
-                "Warning, no annotation.gtf found for "
-                + transcriptomic_dir
-                + ". This might be fine, e.g. no long read data were provided"
+                "Warning, no annotation.gtf found for %s. This might be fine, \
+                e.g. no long read data were provided",
+                transcriptomic_dir,
             )
             continue
-        num_lines = sum(1 for line in open(full_file_path))
-        total_lines = total_lines + num_lines
-        logger.info(
-            "For "
-            + transcriptomic_dir
-            + " found a total of "
-            + str(num_lines)
-            + " in the annotation.gtf file"
-        )
+        num_lines = sum(1 for _ in full_file_path.open("r"))
+        total_lines += num_lines
+        logging.info(
+            "For %s found a total of %s lines in the annotation.gtf file",
+            transcriptomic_dir,
+            num_lines,
+        )
     if total_lines == 0:
         raise IOError(
             "Anno was run with transcriptomic mode enabled,\
             but the transcriptomic annotation files are empty"
         )
-    elif total_lines <= min_lines:
+    if total_lines <= min_lines:
         raise IOError(
             "Anno was run with transcriptomic mode enabled, \
             but the total number of lines in the output \
@@ -1480,9 +582,318 @@ def check_transcriptomic_output(main_output_dir):
             "Min allowed: " + str(min_lines)
         )
-    else:
-        logger.info(
-            "Found "
-            + str(total_lines)
-            + " total lines across the transcriptomic files. 
Checks passed" - ) + + +# def run_find_orfs(genome_file, main_output_dir): +# min_orf_length = 600 + +# orf_output_dir = create_dir(main_output_dir, "orf_output") +# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) +# for region_name in seq_region_lengths: +# region_length = seq_region_lengths[region_name] +# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) +# for phase in range(0, 6): +# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) + + +# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): +# current_index = phase +# orf_counter = 1 +# if phase > 2: +# seq = reverse_complement(seq) +# current_index = current_index % 3 + +# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) +# orf_out = open(orf_file_path, "w+") + +# while current_index < len(seq): +# codon = seq[current_index : current_index + 3] +# if codon == "ATG": +# orf_seq = codon +# for j in range(current_index + 3, len(seq), 3): +# next_codon = seq[j : j + 3] +# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": +# orf_seq += next_codon +# if len(orf_seq) >= min_orf_length: +# orf_out.write( +# ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" +# ) +# orf_out.write(orf_seq + "\n") +# orf_counter += 1 +# orf_seq = "" +# break + +# # If there's another met in phase, then put i to the start of +# # the codon after j so that only the longest ORF is found +# if next_codon == "ATG": +# current_index = j + 3 +# orf_seq += next_codon +# current_index += 3 +# orf_out.close() + + +# def bed_to_gff(input_dir, hints_file): +# gff_out = open(hints_file, "w+") +# exons_dict = {} +# for bed_file in glob.glob(input_dir + "/*.bed"): +# logger.info("Processing file for hints:") +# logger.info(bed_file) +# bed_in = open(bed_file) +# bed_lines = bed_in.readlines() +# for line in bed_lines: +# line = line.rstrip() +# elements = line.split("\t") +# seq_region_name = elements[0] +# offset = int(elements[1]) +# hit_name = elements[3] +# strand = elements[5] +# block_sizes = elements[10].split(",") +# block_sizes = list(filter(None, block_sizes)) +# block_starts = elements[11].split(",") +# block_starts = list(filter(None, block_starts)) +# exons = bed_block_to_exons(block_sizes, block_starts, offset) +# for i, element in enumerate(exons): +# exon_coords = exons[i] +# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand +# if exon_key in exons_dict: +# exons_dict[exon_key][5] += 1 +# else: +# gff_list = [ +# seq_region_name, +# "CDNA", +# "exon", +# exon_coords[0], +# exon_coords[1], +# 1.0, +# strand, +# ".", +# ] +# exons_dict[exon_key] = gff_list + +# for exon_key, gff_list in exons_dict.items(): +# gff_list[5] = str(gff_list[5]) +# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" +# gff_out.write(gff_line) + +# gff_out.close() + +# sorted_hints_out = open((hints_file + ".srt"), "w+") +# subprocess.run( +# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], +# stdout=sorted_hints_out, +# ) +# sorted_hints_out.close() + + +# def splice_junction_to_gff(input_dir, hints_file): +# sjf_out = open(hints_file, "w+") + +# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): +# sjf_in = open(sj_tab_file) +# sjf_lines = sjf_in.readlines() +# for line in sjf_lines: +# elements = line.split("\t") +# strand = "+" +# # If the strand is undefined then skip, Augustus expects a strand +# if elements[3] 
== "0": +# continue +# elif elements[3] == "2": +# strand = "-" + +# junction_length = int(elements[2]) - int(elements[1]) + 1 +# if junction_length < 100: +# continue + +# if not elements[4] and elements[7] < 10: +# continue + +# # For the moment treat multimapping and single mapping things as a combined score +# score = float(elements[6]) + float(elements[7]) +# score = str(score) +# output_line = [ +# elements[0], +# "RNASEQ", +# "intron", +# elements[1], +# elements[2], +# score, +# strand, +# ".", +# ("src=W;mul=" + score + ";"), +# ] +# sjf_out.write("\t".join(output_line) + "\n") + +# sjf_out.close() + + +# def split_genome(genome_file, target_dir, min_seq_length): +# # This is the lazy initial way of just splitting into a dir +# # of files based on the toplevel sequence with a min sequence length filter +# # There are a couple of obvious improvements: +# # 1) Instead of making files for all seqs, just process N seqs +# # parallel, where N = num_threads. Then you could clean up the seq file +# # after each seq finishes, thus avoiding potentially +# # having thousands of file in a dir +# # 2) Split the seq into even slices and process these in parallel +# # (which the same cleanup as in 1). For sequences smaller than the +# # target slice size, bundle them up together into a single file. Vastly +# # more complex, partially implemented in the splice_genome method +# # Allows for more consistency with parallelisation (since there should +# # be no large outliers). But require a mapping strategy for the +# # coords and sequence names and all the hints will need to be adjusted +# current_header = "" +# current_seq = "" + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">(.+)$", line) +# if match and current_header: +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# print("Found an existing split file, so will not overwrite. File found:") +# print(file_out_name) + +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# line = file_in.readline() + +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# logger.info("Found an existing split file, so will not overwrite. 
File found:") +# logger.info(file_out_name) + +# file_in.close() + + +# # def merge_gtf_files(file_paths,id_label): + +# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# # gtf_out = open(gtf_file_path,'w+') +# # for gtf_file_path in gtf_files: +# # gtf_file_name = os.path.basename(gtf_file_path) +# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# # start_offset = int(match.group(1)) +# # gtf_in = open(gtf_file_path,'r') +# # line = gtf_in.readline() +# # while line: +# # values = line.split("\t") +# # if len(values) == 9 and (values[2] in feature_types): +# # values[3] = str(int(values[3]) + (start_offset - 1)) +# # values[4] = str(int(values[4]) + (start_offset - 1)) +# # gtf_out.write("\t".join(values)) +# # line = gtf_in.readline() +# # gtf_in.close() +# # gtf_out.close() + + +# def multiprocess_generic(cmd): +# print(" ".join(cmd)) +# subprocess.run(cmd) + + +# def seq_region_names(genome_file): +# region_list = [] + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">([^\s]+)", line) +# if match: +# region_name = match.group(1) +# if region_name == "MT": +# logger.info("Skipping region named MT") +# line = file_in.readline() +# continue +# else: +# region_list.append(match.group(1)) +# line = file_in.readline() + +# return region_list + + +# def slice_genome(genome_file, target_dir, target_slice_size): +# # The below is sort of tested +# # Without the +# target_seq_length = 50000000 +# min_seq_length = 1000 +# current_header = "" +# current_seq = "" +# seq_dict = {} +# for line in seq: # pylint:disable=used-before-assignment +# match = re.search(r">(.+)$", line) +# if match and current_header: +# seq_dict[current_header] = current_seq +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# seq_dict[current_header] = current_seq + +# seq_buffer = 0 +# file_number = 0 +# file_name = "genome_file_" + str(file_number) + +# for header in seq_dict: +# seq_iterator = 0 +# seq = seq_dict[header] + +# while len(seq) > target_seq_length: +# file_out = open(os.path.join(target_dir, file_name), "w+") +# subseq = seq[0:target_seq_length] +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n") +# file_out.close() +# seq = seq[target_seq_length:] +# seq_iterator += 1 +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + +# if len(seq) >= min_seq_length: +# file_name = "genome_file_" + str(file_number) +# file_out = open(os.path.join(file_name), "w+") +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") +# file_out.close() +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + + +# def coallate_results(main_output_dir): +# results_dir = create_dir(main_output_dir, "results") +# output_dirs = [ +# "augustus_output", +# "cpg_output", +# "dust_output", +# "eponine_output", +# "red_output", +# "rfam_output", +# "trf_output", +# "trnascan_output", +# ] +# for output_dir in output_dirs: +# match = re.search(r"(.+)_output", output_dir) +# result_type = match.group(1) +# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") +# copy_path = os.path.join(results_dir, (result_type + ".gtf")) +# if os.path.exists(results_path): +# cpy_cmd = ["cp", results_path, copy_path] +# subprocess.run(cpy_cmd) From feae8308f4c82082eb903d0de925fbfe55a8fb17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 
16:17:21 +0000 Subject: [PATCH 17/20] added modules calls --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6656e13..e9b6a4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,11 @@ run_star = "ensembl.tools.anno.transcriptomic_annotation.star:main" run_minimap2 = "ensembl.tools.anno.transcriptomic_annotation.minimap:main" run_stringtie = "ensembl.tools.anno.transcriptomic_annotation.stringtie:main" run_scallop = "ensembl.tools.anno.transcriptomic_annotation.scallop:main" +run_rnasamba = "ensembl.tools.anno.finalise_geneset.rnasamba:main" +run_cpc2 = "ensembl.tools.anno.finalise_geneset.cpc2:main" +run_diamond = "ensembl.tools.anno.finalise_geneset.diamond:main" +run_finalise_geneset = "ensembl.tools.anno.finalise_geneset.finalise_geneset:main" +load_results_to_ensembl_db = "ensembl.tools.anno.load_results_to_ensembl_db:main" # This is configuration specific to the `setuptools` build backend. # If you are using a different build backend, you will need to change this. [tool.setuptools] From c5b49e663fc1b0864bd49f6de330ad54bc468663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:19:27 +0000 Subject: [PATCH 18/20] added load into db module --- docs/source/load_into_core.rst | 8 + .../tools/anno/utils/load_into_core.py | 435 ++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 docs/source/load_into_core.rst create mode 100644 src/python/ensembl/tools/anno/utils/load_into_core.py diff --git a/docs/source/load_into_core.rst b/docs/source/load_into_core.rst new file mode 100644 index 0000000..bd030f6 --- /dev/null +++ b/docs/source/load_into_core.rst @@ -0,0 +1,8 @@ +Annotation loading into core db - Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.load_into_core + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/utils/load_into_core.py b/src/python/ensembl/tools/anno/utils/load_into_core.py new file mode 100644 index 0000000..10b930e --- /dev/null +++ b/src/python/ensembl/tools/anno/utils/load_into_core.py @@ -0,0 +1,435 @@ +# pylint: disable=missing-module-docstring +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Loads GTF records for multiple features into an Ensembl database. 
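For illustration only: the loader defined in this new module is driven by a features mapping (feature directory to a tuple of batch size, load type and analysis name) together with a comma-separated db_details string. The sketch below shows a direct call; the module import path is inferred from the file location created by this patch, and the paths and database credentials are placeholders, not values from the patch.

from pathlib import Path

from ensembl.tools.anno.utils.load_into_core import load_results_to_ensembl_db

# Hypothetical run layout and credentials; adjust to the real setup.
features = {
    # feature directory -> (batch size, load type, analysis name)
    "annotation_output": (200, "gene", "ensembl"),
    "rfam_output": (500, "gene", "ncrna"),
    "dust_output": (500, "single_line_feature", "dust"),
}
db_details = "species_core_db,mysql-host,3306,writer,writer_pass"

load_results_to_ensembl_db(
    genome_file=Path("genome.fa"),
    main_output_dir=Path("anno_output"),
    db_details=db_details,
    features=features,
    num_threads=4,
    single_transcript_genes_loading=False,
)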
+""" +__all__ = ["load_results_to_ensembl_db"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path +import gc +import re +import subprocess +from typing import Generator, List, Dict +import tempfile +from ensembl.tools.anno.utils._utils import ( + create_dir, +) + + +def load_results_to_ensembl_db( + genome_file: Path, + main_output_dir: Path, + db_details: str, + features: Dict[str, tuple[int, str, str]], + num_threads: int = 1, + single_transcript_genes_loading: bool = False, +) -> None: + """ + Loads GTF records for multiple features into an Ensembl database. + + This function processes each feature defined in `features`, batches the associated GTF records, + and then loads them into the Ensembl database using the `generic_load_records_to_ensembl_db` function. + + :param genome_file: Path to the genome file used for the loading process. + :type genome_file: Path + :param main_output_dir: The directory where the feature directories and GTF files are stored. + :type main_output_dir: Path + :param db_details: Comma-separated database connection details \ + (db_name, db_host, db_port, db_user, db_pass). + :type db_details: str + :param features: A dictionary with feature names as keys, and tuples \ + with batch size, load type (e.g., 'gene'), and analysis name (e.g., 'ensembl'). + :type features: (Dict[str, Tuple[int, str, str]]) + :param num_threads: The number of threads to use for loading. + :type num_threads: int, default 1. + :param single_transcript_genes_loading: Flag to indicate whether to load single transcript genes. + :type single_transcript_genes_loading: bool, default False. + + """ + db_loading_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "load_gtf_ensembl.pl" + ) + db_loading_dir = create_dir(main_output_dir, "db_loading") + # Process each selected feature + for feature in features: + # Retrieve settings for the current feature + batch_size, load_type, analysis_name = features[feature] + + # Construct the path to the GTF file for the feature + annotation_results_gtf_file = main_output_dir / feature / "annotation.gtf" + + # Check if the file exists and load the feature + if annotation_results_gtf_file.exists(): + logging.info("Loading %s geneset to db", feature) + gtf_records = batch_gtf_records(annotation_results_gtf_file, batch_size, load_type) + generic_load_records_to_ensembl_db( + single_transcript_genes_loading, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logging.error( + "Did not find the %s annotation file, so not loading. Path checked: %s", + feature, + annotation_results_gtf_file, + ) + + +def generic_load_records_to_ensembl_db( + single_transcript_genes_loading: bool, + db_loading_script: Path, + genome_file: Path, + db_details: str, + db_loading_dir: Path, + load_type: str, + analysis_name: str, + gtf_records: List[List[str]], + num_threads: int, +) -> None: + """ + Loads GTF records into the Ensembl database using multiple threads. + + This function processes the GTF records in batches and loads them asynchronously + into the database using multiprocessing. The function delegates the actual loading + of records to `multiprocess_load_records_to_ensembl_db` for each batch. + + Args: + single_transcript_genes_loading (str): Option for loading transcript separately without + defining the canonical. 
+ db_loading_script (Path): Path to the Perl script used for loading records into Ensembl. + genome_file (Path): Path to the genome file. + db_details (str): Database details. + db_loading_dir (Path): Directory for storing temporary files related to DB loading. + load_type (str): Type of the records being loaded (e.g., "gene", "transcript"). + analysis_name (str): Name of the analysis being performed. + gtf_records (List[List[str]]): A list of GTF record batches to be loaded. + num_threads (int): Number of threads to use for loading records. + + Returns: + None: This function performs an asynchronous operation and doesn't return any value. + """ + pool = multiprocessing.Pool(int(num_threads))#pylint:disable=consider-using-with + for record_batch in gtf_records: + pool.apply_async( + multiprocess_load_records_to_ensembl_db, + args=( + single_transcript_genes_loading, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + record_batch, + ), + ) + try: + pool.close() + pool.join() + except Exception as e:#pylint:disable=broad-exception-caught + logging.error("Error during pool shutdown: %s", e) + finally: + pool.terminate() + + +def multiprocess_load_records_to_ensembl_db( + single_transcript_genes_loading: bool, + db_loading_script: str, + genome_file: str, + db_details: str, + output_dir: str, + load_type: str, + analysis_name: str, + record_batch: List[str], +) -> None: + """ + Loads a batch of GTF records into an Ensembl database using a temporary GTF file. + + This function writes the provided `record_batch` to a temporary GTF file, constructs + a command to load the records into the database using a Perl script, and executes it + via a subprocess. After execution, the temporary GTF file is removed. + + Args: + single_transcript_genes_loading (bool): Flag to indicate if single transcript genes + should be loaded. + db_loading_script (str): Path to the Perl script used to load records into the database. + genome_file (str): Path to the genome file to be used by the loading script. + db_details (str): Comma-separated string containing database connection details + (db_name, db_host, db_port, db_user, db_pass). + output_dir (str): Directory where the temporary GTF file will be stored. + load_type (str): Type of the record being loaded (e.g., "gene"). + analysis_name (str): Name of the analysis being performed (e.g., "ensembl"). + record_batch (List[str]): List of GTF records (as strings) to be written to the temporary file. + + Returns: + None: This function performs a subprocess call and does not return any value. 
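As a rough illustration of what this function hands to the Perl loader for a "gene" batch with the "ensembl" analysis: every value below is a placeholder (including the temporary GTF path), only the flag layout follows the command built in the code.

# Approximate shape of loading_cmd for load_type="gene", analysis_name="ensembl".
example_cmd = [
    "perl", "/path/to/support_scripts_perl/load_gtf_ensembl.pl",
    "-genome_file", "genome.fa",
    "-dbname", "species_core_db",
    "-host", "mysql-host",
    "-port", "3306",
    "-user", "writer",
    "-pass", "writer_pass",
    "-gtf_file", "/anno_output/db_loading/tmpab12cd",
    "-analysis_name", "ensembl",
    "-load_type", "gene",
    "-protein_coding_biotype", "anno_protein_coding",
    "-non_coding_biotype", "anno_lncRNA",
]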
+    """
+    with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=output_dir) as gtf_temp_out:
+        for line in record_batch:
+            gtf_temp_out.write(line)
+        gtf_temp_file_path = gtf_temp_out.name
+
+    (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",")
+
+    loading_cmd = [
+        "perl",
+        str(db_loading_script),
+        "-genome_file",
+        str(genome_file),
+        "-dbname",
+        db_name,
+        "-host",
+        db_host,
+        "-port",
+        str(db_port),
+        "-user",
+        db_user,
+        "-pass",
+        db_pass,
+        "-gtf_file",
+        gtf_temp_file_path,
+        "-analysis_name",
+        analysis_name,
+        "-load_type",
+        load_type,
+    ]
+
+    if load_type == "gene" and analysis_name == "ensembl":
+        loading_cmd.extend(
+            [
+                "-protein_coding_biotype",
+                "anno_protein_coding",
+                "-non_coding_biotype",
+                "anno_lncRNA",
+            ]
+        )
+
+    if single_transcript_genes_loading:
+        loading_cmd.append("-make_single_transcript_genes")
+
+    logging.info(" ".join(loading_cmd))
+    try:
+        # Run the loading command as a subprocess
+        subprocess.run(loading_cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        logging.error("Error executing loading command: %s", e)
+    finally:
+        # Clean up the temporary GTF file
+        try:
+            Path(gtf_temp_file_path).unlink()
+            logging.info("Temporary file %s removed.", gtf_temp_file_path)
+        except OSError as e:
+            logging.error("Error removing temporary file %s, %s", gtf_temp_file_path, e)
+        # Run garbage collection to free up memory
+        gc.collect()
+    logging.info("Finished processing batch from %s", gtf_temp_file_path)
+
+
+# The following function assumes the file has unique ids for the parent features.
+# It then batches them into
+# sets of records based on the batch size passed in
+def batch_gtf_records(input_gtf_file: Path, batch_size: int, record_type: str) -> List[List[str]]:
+    """
+    Processes a GTF file in batches and returns a list of record batches.
+
+    Args:
+        input_gtf_file (Path): The path to the input GTF file.
+        batch_size (int): The number of records per batch.
+        record_type (str): The type of records to process ('gene' or 'single_line_feature').
+
+    Returns:
+        List[List[str]]: A list of batches, each containing GTF lines as lists of strings.
+    """
+
+    def read_gtf_lines(gtf_file: Path) -> Generator[str, None, None]:
+        """
+        Generator that reads and yields valid GTF lines from the file.
+
+        Args:
+            gtf_file (Path): The path to the input GTF file.
+
+        Yields:
+            str: A valid GTF line.
+ """ + with open(gtf_file) as gtf_in: + for line in gtf_in: + if re.match(r"^#", line): # Skip comment lines + continue + elems = line.split("\t") + if len(elems) == 9: # Ensure valid GTF line + yield line + + records: List[List[str]] = [] + current_record_batch: List[str] = [] + record_counter: int = 0 + current_gene_id: str = "" + + # Iterate over the GTF lines + for line in read_gtf_lines(input_gtf_file): + if record_type == "gene": + match = re.search(r'gene_id "([^"]+)"', line) + if match: + gene_id = match.group(1) + + if not current_gene_id: + record_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + current_gene_id = gene_id + + current_record_batch.append(line) + + elif record_type == "single_line_feature": + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + + current_record_batch.append(line) + + # Append the last batch of records + if current_record_batch: + records.append(current_record_batch) + + return records + + +def build_feature_settings( + args: argparse.Namespace, repeatmasker_analysis: str = "repeatmask_repbase_human" +) -> Dict[str, tuple[int, str, str]]: + """ + Builds the feature settings dictionary based on the enabled flags from the argument object. + + This function uses the provided argument flags (`args`) to determine which features + to include in the returned settings dictionary. The dictionary contains the configuration + for each feature, including batch size, record type, and analysis name. + + Args: + args (object): An object containing flags (e.g., `args.annotation`, `args.rfam`) + that determine which features should be included. + repeatmasker_analysis (str, optional): The analysis name for RepeatMasker output. + Defaults to "repeatmask_repbase_human". + + Returns: + Dict[str, tuple[int, str, str]]: A dictionary where keys are feature names (e.g., "annotation_output") + and values are tuples containing batch size (int), + record type (str), and analysis name (str). 
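A minimal sketch of calling the batching helper, assuming the module path created by this patch; the toy GTF content is made up purely to show the expected nine tab-separated columns and the gene_id/transcript_id attributes the function looks for.

from pathlib import Path
import tempfile

from ensembl.tools.anno.utils.load_into_core import batch_gtf_records

# Two toy gene records, one transcript line each, nine tab-separated GTF columns.
toy_lines = [
    "\t".join(["chr1", "anno", "transcript", "1", "100", ".", "+", ".",
               'gene_id "g1"; transcript_id "g1.t1";']) + "\n",
    "\t".join(["chr1", "anno", "transcript", "200", "300", ".", "+", ".",
               'gene_id "g2"; transcript_id "g2.t1";']) + "\n",
]
with tempfile.NamedTemporaryFile("w", suffix=".gtf", delete=False) as tmp_gtf:
    tmp_gtf.writelines(toy_lines)

batches = batch_gtf_records(Path(tmp_gtf.name), batch_size=100, record_type="gene")
print(len(batches), [len(batch) for batch in batches])  # one batch holding both gene lines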
+ """ + # Base configurations for all features + all_features = { + "annotation_output": (200, "gene", "ensembl"), + "rfam_output": (500, "gene", "ncrna"), + "trnascan_output": (500, "gene", "ncrna"), + "dust_output": (500, "single_line_feature", "dust"), + "red_output": (500, "single_line_feature", "repeatdetector"), + "trf_output": (500, "single_line_feature", "trf"), + "repeatmasker_output": (500, "single_line_feature", repeatmasker_analysis), + "cpg_output": (500, "single_line_feature", "cpg"), + "eponine_output": (500, "single_line_feature", "eponine"), + } + + # Start with an empty dictionary + feature_settings = {} + + # Add features based on enabled flags + if args.annotation: + feature_settings["annotation_output"] = all_features["annotation_output"] + if args.rfam: + feature_settings["rfam_output"] = all_features["rfam_output"] + if args.trnascan: + feature_settings["trnascan_output"] = all_features["trnascan_output"] + if args.dust: + feature_settings["dust_output"] = all_features["dust_output"] + if args.red: + feature_settings["red_output"] = all_features["red_output"] + if args.trf: + feature_settings["trf_output"] = all_features["trf_output"] + if args.repeatmasker: + feature_settings["repeatmasker_output"] = all_features["repeatmasker_output"] + if args.cpg: + feature_settings["cpg_output"] = all_features["cpg_output"] + if args.eponine: + feature_settings["eponine_output"] = all_features["eponine_output"] + + return feature_settings + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Load multiple features into the Ensembl database.") + parser.add_argument("--output_dir", required=True, help="Output directory path") + parser.add_argument("--genome_file", required=True, help="Path for the genome fasta file") + parser.add_argument("--annotation", action="store_true", help="Enable annotation output feature") + parser.add_argument("--rfam", action="store_true", help="Enable RFAM feature") + parser.add_argument("--trnascan", action="store_true", help="Enable tRNA scan feature") + parser.add_argument("--dust", action="store_true", help="Enable dust feature") + parser.add_argument("--red", action="store_true", help="Enable RED feature") + parser.add_argument("--trf", action="store_true", help="Enable TRF feature") + parser.add_argument("--repeatmasker", action="store_true", help="Enable RepeatMasker feature") + parser.add_argument("--cpg", action="store_true", help="Enable CpG feature") + parser.add_argument("--eponine", action="store_true", help="Enable Eponine feature") + parser.add_argument( + "--db_details", + required=True, + type=str, + help="Comma-separated string containing database details ", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + + parser.add_argument( + "--single_transcript_genes_loading", + action="store_true", + help="Flag to indicate whether to load single transcript genes (default is False).", + ) + return parser.parse_args() + + +def main(): + """Loading into core database's entry-point.""" + args = parse_args() + + # Build the feature settings dictionary dynamically + feature_setting = build_feature_settings(args, repeatmasker_analysis="repeatmask_repbase_human") + + load_results_to_ensembl_db( + args.genome_file, + args.output_dir, + args.db_details, + feature_setting, + args.num_threads, + args.single_transcript_genes_loading, + ) From a9a9ac57f862f15557fca2efaf897b800d663699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= 
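Tying the pieces together, and assuming the console-script name registered in pyproject.toml above plus the flags defined by parse_args (all paths and credentials below are placeholders): a run might be launched roughly as in the comment, and build_feature_settings turns the enabled flags into the per-feature settings used by the loader.

import argparse

from ensembl.tools.anno.utils.load_into_core import build_feature_settings

# Roughly equivalent command line (placeholder paths/credentials):
#   load_results_to_ensembl_db --genome_file genome.fa --output_dir anno_output \
#       --db_details "species_core_db,mysql-host,3306,writer,writer_pass" \
#       --annotation --rfam --num_threads 4

# parse_args() would produce a namespace like this for the flags above.
args = argparse.Namespace(
    annotation=True, rfam=True, trnascan=False, dust=False, red=False,
    trf=False, repeatmasker=False, cpg=False, eponine=False,
)
settings = build_feature_settings(args, repeatmasker_analysis="repeatmask_repbase_human")
print(settings)
# {'annotation_output': (200, 'gene', 'ensembl'), 'rfam_output': (500, 'gene', 'ncrna')}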
Date: Thu, 12 Dec 2024 16:24:27 +0000 Subject: [PATCH 19/20] updated index --- docs/source/index.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1f4fe61..e2d9862 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,18 +38,24 @@ to install the project. license cmsearch + cpc2 cpg + diamond dust eponine genblast minimap red repeatmasker + rnasamba scallop star stringtie trf trnascan + + finalise genset + load into core Indices and tables ================== From 445a2b2d05f64199252b698d034a3c2a65702e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:31:00 +0000 Subject: [PATCH 20/20] resored utils to merge --- src/python/ensembl/tools/anno/utils/_utils.py | 1447 +++++++++++------ .../ensembl/tools/anno/utils/_utils_copy.py | 899 ++++++++++ 2 files changed, 1876 insertions(+), 470 deletions(-) create mode 100644 src/python/ensembl/tools/anno/utils/_utils_copy.py diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index b4bbc72..4915d37 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -16,16 +16,17 @@ import errno import io +import json import logging import os from os import PathLike from pathlib import Path -import random import re import subprocess import shutil +import glob import tempfile -from typing import Dict, List, Optional +from typing import Dict, List logger = logging.getLogger("__name__." + __name__) @@ -80,8 +81,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: Check number of transcript lines in the GTF Arg: - gtf_file: GTF file path - content_obj: Object to detect and count in the gtf i.e transcript, repeat + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat Return: Number of occurences """ @@ -126,7 +127,7 @@ def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dic if len(current_seq) > min_seq_length: seq_region_to_length[current_header] = len(current_seq) - # logging.info("length of the seq region %s", seq_region_to_length) + #logging.info("length of the seq region %s", seq_region_to_length) return seq_region_to_length @@ -166,7 +167,7 @@ def get_slice_id( if (end - start + 1) >= min_length: slice_ids_per_region.append([seq_region, start, end]) start = end + 1 - # logging.info("list of slice ids %s", str(slice_ids_per_region)) + #logging.info("list of slice ids %s", str(slice_ids_per_region)) return slice_ids_per_region @@ -252,7 +253,9 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches # Example key KS8000.rs1.re1000000.1 gene_id_slice = input_file.name + "." + str(current_gene_id) # Example key KS8000.rs1.re1000000.1.transcript.1 - transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + transcript_id_slice = ( + gene_id_slice + "." + str(current_transcript_id) + ) # If there is no existing entry, the gene key will be added # and the gene counter is incremented. 
# gene_id "gene1"; transcript_id "gene1.t1" @@ -277,12 +280,16 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches + ".t" + str(transcript_id_count_gene_id[gene_id_slice]) ) - gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + gene_transcript_id_collection[ + transcript_id_slice + ] = new_transcript_id transcript_id_count_gene_id[gene_id_slice] += 1 else: # If a transcript of the same set is already present, # the new id with the incremented counter is added - new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + new_transcript_id = gene_transcript_id_collection[ + transcript_id_slice + ] attribs = re.sub( full_transcript_id_string, 'transcript_id "' + new_transcript_id + '"', @@ -295,15 +302,13 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches else: if new_id_prefix == "repeatmask": match_feature_type = re.search( - r"(" + feature_id_label + r"\d+)", + r"(" + feature_id_label + "\d+)", line, ) else: - match_feature_type = ( - re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', - r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", - line, - ) + match_feature_type = re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, ) if match_feature_type: full_feature_id = match_feature_type.group(1) @@ -321,8 +326,6 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches "Feature type not recognised, will skip. Feature type: %s", values[2], ) - - def get_sequence( seq_region: str, start: int, @@ -351,7 +354,9 @@ def get_sequence( "get_sequence %s", f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", ) - with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=str(output_dir) + ) as bed_temp_file: bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") bed_temp_file.close() bedtools_command = [ @@ -362,10 +367,10 @@ def get_sequence( "-bed", str(bed_temp_file.name), ] - bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with - bedtools_command, stdout=subprocess.PIPE - ) # pylint:disable=consider-using-with - for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + bedtools_output = subprocess.Popen(bedtools_command, stdout=subprocess.PIPE) + for idx, line in enumerate( + io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8") # type: ignore + ): if idx == 1: if strand == 1: sequence = line.rstrip() @@ -399,155 +404,958 @@ def check_file(file_path: Path) -> None: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) -def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: - """ - Save fastq sequences in a dictionary - fasta_lines: List[str]: List of sequences - """ - index: Dict[str, str] = {} - header: Optional[str] = None - seq: List[str] = [] - - for line in fasta_lines: - line = line.strip() - header_match = re.match(r">(.+)\n$", line) - if header_match: - if header: - index[header] = "".join(seq) - header = None - header = header_match.group(1) - seq = [] - else: - seq.append(line) - # Add the last sequence - if header: - index[header] = "".join(seq) - return index - +def load_results_to_ensembl_db( + main_script_dir, + load_to_ensembl_db, + genome_file, + main_output_dir, + db_details, + num_threads, + ): + db_loading_script = os.path.join( + main_script_dir, "support_scripts_perl", "load_gtf_ensembl.pl" + ) + 
db_loading_dir = create_dir(main_output_dir, "db_loading") -# def fasta_to_dict(fasta_lines:List[str]): -# index = {} -# for line in fasta_lines: -# #it = iter(fasta_lines) -# #for header in it: -# match = re.search(r">(.+)\n$", line) -# header = match.group(1) -# seq = next(line) -# index[header] = seq -# return index + # Should collapse this into a function + annotation_results_gtf_file = os.path.join( + main_output_dir, "annotation_output", "annotation.gtf" + ) + if os.path.exists(annotation_results_gtf_file): + logger.info("Loading main geneset to db") + batch_size = 200 + load_type = "gene" + analysis_name = "ensembl" + gtf_records = batch_gtf_records( + annotation_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find the main gene annotation file, so not loading. Path checked:\n" + + annotation_results_gtf_file + ) + rfam_results_gtf_file = os.path.join(main_output_dir, "rfam_output", "annotation.gtf") + if os.path.exists(rfam_results_gtf_file): + logger.info("Loading Rfam-based sncRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + rfam_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Rfam annotation file, so not loading. Path checked:\n" + + rfam_results_gtf_file + ) -def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List: - """ - The protein dataset file is splitted by a number of sequence equals to the batch_size - in batch files stored in 10 output directories. - protein_dataset : Path for the protein dataset. - output_dir : Output directory path. - batch_size : Size of the batch, it needs to be equals to the number of threads - to parallelise the sequence processing for each file. 
- """ - batched_protein_files = [] - - for i in range(0, 10): - create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset, "r", encoding="utf8") as file_in: - seq_count = 0 - batch_count = 0 - current_record = "" - initial_seq = True - for line in file_in: - match = re.search(r">(.+)$", line) - # match header and is not first sequence, if the number of stored sequences in each file equals - # the number of batch_size, a new file will be created and the current_record reset - if match and not initial_seq and seq_count % batch_size == 0: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batch_count += 1 - seq_count += 1 - current_record = line - batched_protein_files.append(batch_file) - # match header and is the first sequence - elif match: - current_record += line - initial_seq = False - seq_count += 1 - # other lines - else: - current_record += line + trnascan_results_gtf_file = os.path.join( + main_output_dir, "trnascan_output", "annotation.gtf" + ) + if os.path.exists(trnascan_results_gtf_file): + logger.info("Loading tRNAScan-SE tRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + trnascan_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an tRNAScan-SE annotation file, so not loading. Path checked:\n" + + trnascan_results_gtf_file + ) - if current_record: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batched_protein_files.append(batch_file) - return batched_protein_files + dust_results_gtf_file = os.path.join(main_output_dir, "dust_output", "annotation.gtf") + if os.path.exists(dust_results_gtf_file): + logger.info("Loading Dust repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "dust" + gtf_records = batch_gtf_records( + dust_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Dust annotation file, so not loading. Path checked:\n" + + dust_results_gtf_file + ) + red_results_gtf_file = os.path.join(main_output_dir, "red_output", "annotation.gtf") + if os.path.exists(red_results_gtf_file): + logger.info("Loading Red repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "repeatdetector" + gtf_records = batch_gtf_records( + red_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Red annotation file, so not loading. 
Path checked:\n" + + red_results_gtf_file + ) -def run_command(cmd) -> None: - """Submit command in multiprocessing action + trf_results_gtf_file = os.path.join(main_output_dir, "trf_output", "annotation.gtf") + if os.path.exists(trf_results_gtf_file): + logger.info("Loading TRF repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "trf" + gtf_records = batch_gtf_records( + trf_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a TRF annotation file, so not loading. Path checked:\n" + + trf_results_gtf_file + ) - Args: - cmd (List[str]): command to execute - """ - try: - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Command executed successfully: %s", cmd) - logger.info("Output: %s", result.stdout) - except subprocess.CalledProcessError as e: - logger.error("Command failed: %s", cmd) - logger.error("Error: %s", e.stderr) + cpg_results_gtf_file = os.path.join(main_output_dir, "cpg_output", "annotation.gtf") + if os.path.exists(cpg_results_gtf_file): + logger.info("Loading CpG islands to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "cpg" + gtf_records = batch_gtf_records( + cpg_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a CpG annotation file, so not loading. Path checked:\n" + + cpg_results_gtf_file + ) + eponine_results_gtf_file = os.path.join( + main_output_dir, "eponine_output", "annotation.gtf" + ) + if os.path.exists(eponine_results_gtf_file): + logger.info("Loading Eponine repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "eponine" + gtf_records = batch_gtf_records( + eponine_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Eponine annotation file, so not loading. Path checked:\n" + + eponine_results_gtf_file + ) -def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]: - """ - Parses a GTF file to extract gene, transcript, and exon information. + logger.info("Finished loading records to db") + + +def generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ): + pool = multiprocessing.Pool(int(num_threads)) + for record_batch in gtf_records: + pool.apply_async( + multiprocess_load_records_to_ensembl_db, + args=( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + record_batch, + ), + ) - Args: - gtf_file: Path to the GTF file. 
+ pool.close() + pool.join() + + +def multiprocess_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + output_dir, + load_type, + analysis_name, + record_batch, + ): + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=output_dir + ) as gtf_temp_out: + for line in record_batch: + gtf_temp_out.writelines(line) + gtf_temp_file_path = gtf_temp_out.name + + (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",") + + loading_cmd = [ + "perl", + db_loading_script, + "-genome_file", + genome_file, + "-dbname", + db_name, + "-host", + db_host, + "-port", + str(db_port), + "-user", + db_user, + "-pass", + db_pass, + "-gtf_file", + gtf_temp_file_path, + "-analysis_name", + analysis_name, + "-load_type", + load_type, + ] - Returns: - A dictionary containing gene information, organized by gene_id, transcript_id, and features. - """ - gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} + if load_type == "gene" and analysis_name == "ensembl": + loading_cmd.extend( + [ + "-protein_coding_biotype", + "anno_protein_coding", + "-non_coding_biotype", + "anno_lncRNA", + ] + ) - with open(gtf_file, "r") as gtf_in: - for line in gtf_in: - eles = line.strip().split("\t") - if len(eles) != 9: + if load_to_ensembl_db == "single_transcript_genes": + loading_cmd.append("-make_single_transcript_genes") + + logger.info(" ".join(loading_cmd)) + subprocess.run(loading_cmd) + gtf_temp_out.close() + os.remove(gtf_temp_file_path) # doesn't seem to be working + logger.info("Finished: " + gtf_temp_file_path) + gc.collect() + + +def batch_gtf_records(input_gtf_file, batch_size, output_dir, record_type): + records = [] + gtf_in = open(input_gtf_file) + if record_type == "gene": + # NOTE that the neverending variations on GTF reading/writing/merging + # is becoming very messy + # need to create a set of utility methods outside of this script + # This one assumes the file has unique ids for the parent features. 
+ # It then batches them into + # sets of records based on the batch size passed in + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() continue - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) - if not match: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() continue + match = re.search(r'gene_id "([^"]+)"', line) gene_id = match.group(1) - transcript_id = match.group(2) - feature_type = eles[2] - if gene_id not in gtf_genes: - gtf_genes[gene_id] = {} + if not current_gene_id: + record_counter += 1 + current_gene_id = gene_id + + if not gene_id == current_gene_id: + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + current_gene_id = gene_id + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) + + elif record_type == "single_line_feature": + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + record_counter += 1 + + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) - if feature_type == "transcript": - gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} - elif feature_type == "exon": - if transcript_id in gtf_genes[gene_id]: - gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) + gtf_in.close() + + return records + + +def run_find_orfs(genome_file, main_output_dir): + min_orf_length = 600 + + orf_output_dir = create_dir(main_output_dir, "orf_output") + seq_region_lengths = get_seq_region_lengths(genome_file, 5000) + for region_name in seq_region_lengths: + region_length = seq_region_lengths[region_name] + seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) + for phase in range(0, 6): + find_orf_phased_region( + region_name, seq, phase, min_orf_length, orf_output_dir + ) + + +def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): + current_index = phase + orf_counter = 1 + if phase > 2: + seq = reverse_complement(seq) + current_index = current_index % 3 + + orf_file_path = os.path.join( + orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa") + ) + orf_out = open(orf_file_path, "w+") + + while current_index < len(seq): + codon = seq[current_index : current_index + 3] + if codon == "ATG": + orf_seq = codon + for j in range(current_index + 3, len(seq), 3): + next_codon = seq[j : j + 3] + if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": + orf_seq += next_codon + if len(orf_seq) >= min_orf_length: + orf_out.write( + ">" + + region_name + + "_phase" + + str(phase) + + "_orf" + + str(orf_counter) + + "\n" + ) + orf_out.write(orf_seq + "\n") + orf_counter += 1 + orf_seq = "" + break + + # If there's another met in phase, then put i to the start of + # the codon after j so that only the longest ORF is found + if next_codon == "ATG": + current_index = j + 3 + orf_seq += next_codon + current_index += 3 + orf_out.close() + +def bed_to_gff(input_dir, hints_file): + gff_out = open(hints_file, "w+") + 
exons_dict = {} + for bed_file in glob.glob(input_dir + "/*.bed"): + logger.info("Processing file for hints:") + logger.info(bed_file) + bed_in = open(bed_file) + bed_lines = bed_in.readlines() + for line in bed_lines: + line = line.rstrip() + elements = line.split("\t") + seq_region_name = elements[0] + offset = int(elements[1]) + hit_name = elements[3] + strand = elements[5] + block_sizes = elements[10].split(",") + block_sizes = list(filter(None, block_sizes)) + block_starts = elements[11].split(",") + block_starts = list(filter(None, block_starts)) + exons = bed_block_to_exons(block_sizes, block_starts, offset) + for i, element in enumerate(exons): + exon_coords = exons[i] + exon_key = ( + seq_region_name + + ":" + + exon_coords[0] + + ":" + + exon_coords[1] + + ":" + + strand + ) + if exon_key in exons_dict: + exons_dict[exon_key][5] += 1 + else: + gff_list = [ + seq_region_name, + "CDNA", + "exon", + exon_coords[0], + exon_coords[1], + 1.0, + strand, + ".", + ] + exons_dict[exon_key] = gff_list + + for exon_key, gff_list in exons_dict.items(): + gff_list[5] = str(gff_list[5]) + gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" + gff_out.write(gff_line) + + gff_out.close() + + sorted_hints_out = open((hints_file + ".srt"), "w+") + subprocess.run( + ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], + stdout=sorted_hints_out, + ) + sorted_hints_out.close() + +def splice_junction_to_gff(input_dir, hints_file): + sjf_out = open(hints_file, "w+") + + for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): + sjf_in = open(sj_tab_file) + sjf_lines = sjf_in.readlines() + for line in sjf_lines: + elements = line.split("\t") + strand = "+" + # If the strand is undefined then skip, Augustus expects a strand + if elements[3] == "0": + continue + elif elements[3] == "2": + strand = "-" + + junction_length = int(elements[2]) - int(elements[1]) + 1 + if junction_length < 100: + continue + + if not elements[4] and elements[7] < 10: + continue + + # For the moment treat multimapping and single mapping things as a combined score + score = float(elements[6]) + float(elements[7]) + score = str(score) + output_line = [ + elements[0], + "RNASEQ", + "intron", + elements[1], + elements[2], + score, + strand, + ".", + ("src=W;mul=" + score + ";"), + ] + sjf_out.write("\t".join(output_line) + "\n") + + sjf_out.close() + + +def split_genome(genome_file, target_dir, min_seq_length): + # This is the lazy initial way of just splitting into a dir + # of files based on the toplevel sequence with a min sequence length filter + # There are a couple of obvious improvements: + # 1) Instead of making files for all seqs, just process N seqs + # parallel, where N = num_threads. Then you could clean up the seq file + # after each seq finishes, thus avoiding potentially + # having thousands of file in a dir + # 2) Split the seq into even slices and process these in parallel + # (which the same cleanup as in 1). For sequences smaller than the + # target slice size, bundle them up together into a single file. Vastly + # more complex, partially implemented in the splice_genome method + # Allows for more consistency with parallelisation (since there should + # be no large outliers). 
But require a mapping strategy for the + # coords and sequence names and all the hints will need to be adjusted + current_header = "" + current_seq = "" + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + print( + "Found an existing split file, so will not overwrite. File found:" + ) + print(file_out_name) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + line = file_in.readline() + + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + logger.info( + "Found an existing split file, so will not overwrite. File found:" + ) + logger.info(file_out_name) + + file_in.close() + + +def update_gtf_genes(parsed_gtf_genes, combined_results, validation_type): + output_lines = [] + + for gene_id in parsed_gtf_genes.keys(): + transcript_ids = parsed_gtf_genes[gene_id].keys() + for transcript_id in transcript_ids: + transcript_line = parsed_gtf_genes[gene_id][transcript_id]["transcript"] + single_cds_exon_transcript = 0 + translation_match = re.search( + r'; translation_coords "([^"]+)";', transcript_line + ) + if translation_match: + translation_coords = translation_match.group(1) + translation_coords_list = translation_coords.split(":") + # If the start exon coords of both exons are the same, + # then it's the same exon and thus a single exon cds + if translation_coords_list[0] == translation_coords_list[3]: + single_cds_exon_transcript = 1 + + exon_lines = parsed_gtf_genes[gene_id][transcript_id]["exons"] + validation_results = combined_results[transcript_id] + rnasamba_coding_probability = float(validation_results[0]) + rnasamba_coding_potential = validation_results[1] + cpc2_coding_probability = float(validation_results[2]) + cpc2_coding_potential = validation_results[3] + transcript_length = int(validation_results[4]) + peptide_length = int(validation_results[5]) + diamond_e_value = None + if len(validation_results) == 7: + diamond_e_value = validation_results[6] + + avg_coding_probability = ( + rnasamba_coding_probability + cpc2_coding_probability + ) / 2 + max_coding_probability = max( + rnasamba_coding_probability, cpc2_coding_probability + ) + + match = re.search(r'; biotype "([^"]+)";', transcript_line) + biotype = match.group(1) + if biotype == "busco" or biotype == "protein": + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + continue + + min_single_exon_pep_length = 100 + min_multi_exon_pep_length = 75 + min_single_source_probability = 0.8 + min_single_exon_probability = 0.9 + + # Note that the below looks at validating things + # under different levels of strictness + # There are a few different continue statements, + # where transcripts will be skipped resulting + # in a smaller post validation file. 
It mainly + # removes single coding exon genes with no real + # support or for multi-exon lncRNAs that are less than 200bp long + if single_cds_exon_transcript == 1 and validation_type == "relaxed": + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + elif single_cds_exon_transcript == 1 and validation_type == "moderate": + if ( + diamond_e_value is not None + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and avg_coding_probability >= min_single_exon_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + else: + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_multi_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_multi_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif transcript_length >= 200: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "lncRNA";', + transcript_line, + ) + transcript_line = re.sub( + ' translation_coords "[^"]+";', "", transcript_line + ) + else: + continue + + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + + return output_lines + + +def read_gtf_genes(gtf_file): + gtf_genes = {} + gtf_in = open(gtf_file) + line = gtf_in.readline() + while line: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + + if not match: + line = gtf_in.readline() + continue + + gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {} + gtf_genes[gene_id][transcript_id]["transcript"] = line + gtf_genes[gene_id][transcript_id]["exons"] = [] + elif feature_type == "exon": + gtf_genes[gene_id][transcript_id]["exons"].append(line) + line = gtf_in.readline() + gtf_in.close() return gtf_genes -def 
check_transcriptomic_output(main_output_dir: Path) -> None: - """Check across the various transcriptomic if there is a reasonable amount of data +def fasta_to_dict(fasta_list): + index = {} + it = iter(fasta_list) + for header in it: + match = re.search(r">(.+)\n$", header) + header = match.group(1) + seq = next(it) + index[header] = seq + return index - Args: - main_output_dir (Path): Output dir path - Raises: - IOError: If no lines or fewer than the minimum expected lines are found in the output files. - """ +# def merge_gtf_files(file_paths,id_label): + +# gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# gtf_out = open(gtf_file_path,'w+') +# for gtf_file_path in gtf_files: +# gtf_file_name = os.path.basename(gtf_file_path) +# match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# start_offset = int(match.group(1)) +# gtf_in = open(gtf_file_path,'r') +# line = gtf_in.readline() +# while line: +# values = line.split("\t") +# if len(values) == 9 and (values[2] in feature_types): +# values[3] = str(int(values[3]) + (start_offset - 1)) +# values[4] = str(int(values[4]) + (start_offset - 1)) +# gtf_out.write("\t".join(values)) +# line = gtf_in.readline() +# gtf_in.close() +# gtf_out.close() + + +def multiprocess_generic(cmd): + print(" ".join(cmd)) + subprocess.run(cmd) + + +def seq_region_names(genome_file): + region_list = [] + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">([^\s]+)", line) + if match: + region_name = match.group(1) + if region_name == "MT": + logger.info("Skipping region named MT") + line = file_in.readline() + continue + else: + region_list.append(match.group(1)) + line = file_in.readline() + + return region_list + + +def slice_genome(genome_file, target_dir, target_slice_size): + # The below is sort of tested + # Without the + target_seq_length = 50000000 + min_seq_length = 1000 + current_header = "" + current_seq = "" + seq_dict = {} + for line in seq: # pylint:disable=used-before-assignment + match = re.search(r">(.+)$", line) + if match and current_header: + seq_dict[current_header] = current_seq + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + seq_dict[current_header] = current_seq + + seq_buffer = 0 + file_number = 0 + file_name = "genome_file_" + str(file_number) + + for header in seq_dict: + seq_iterator = 0 + seq = seq_dict[header] + + while len(seq) > target_seq_length: + file_out = open(os.path.join(target_dir, file_name), "w+") + subseq = seq[0:target_seq_length] + file_out.write( + ">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n" + ) + file_out.close() + seq = seq[target_seq_length:] + seq_iterator += 1 + file_number += 1 + file_name = "genome_file_" + str(file_number) + + if len(seq) >= min_seq_length: + file_name = "genome_file_" + str(file_number) + file_out = open(os.path.join(file_name), "w+") + file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") + file_out.close() + file_number += 1 + file_name = "genome_file_" + str(file_number) + + +def coallate_results(main_output_dir): + results_dir = create_dir(main_output_dir, "results") + output_dirs = [ + "augustus_output", + "cpg_output", + "dust_output", + "eponine_output", + "red_output", + "rfam_output", + "trf_output", + "trnascan_output", + ] + for output_dir in output_dirs: + match = re.search(r"(.+)_output", output_dir) + result_type = match.group(1) + results_path = os.path.join(main_output_dir, 
output_dir, "annotation.gtf") + copy_path = os.path.join(results_dir, (result_type + ".gtf")) + if os.path.exists(results_path): + cpy_cmd = ["cp", results_path, copy_path] + subprocess.run(cpy_cmd) + + + +def check_transcriptomic_output(main_output_dir): + + # This will check across the various transcriptomic + # dirs and check there's actually some data transcriptomic_dirs = [ "scallop_output", "stringtie_output", @@ -556,23 +1364,31 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: total_lines = 0 min_lines = 100000 for transcriptomic_dir in transcriptomic_dirs: - full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf" - if not full_file_path.exists(): + full_file_path = os.path.join( + main_output_dir, transcriptomic_dir, "annotation.gtf" + ) + if not os.path.exists(full_file_path): logger.warning( - "Warning, no annotation.gtf found for %s. This might be fine, \ - e.g. no long read data were provided", - transcriptomic_dir, + "Warning, no annotation.gtf found for " + + transcriptomic_dir + + ". This might be fine, e.g. no long read data were provided" ) continue - num_lines = sum(1 for _ in full_file_path.open("r")) - total_lines += num_lines - logging.info("For %s found a total of in the annotation.gtf file", num_lines) + num_lines = sum(1 for line in open(full_file_path)) + total_lines = total_lines + num_lines + logger.info( + "For " + + transcriptomic_dir + + " found a total of " + + str(num_lines) + + " in the annotation.gtf file" + ) if total_lines == 0: raise IOError( "Anno was run with transcriptomic mode enabled,\ but the transcriptomic annotation files are empty" ) - if total_lines <= min_lines: + elif total_lines <= min_lines: raise IOError( "Anno was run with transcriptomic mode enabled, \ but the total number of lines in the output \ @@ -582,318 +1398,9 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: "Min allowed: " + str(min_lines) ) - - -# def run_find_orfs(genome_file, main_output_dir): -# min_orf_length = 600 - -# orf_output_dir = create_dir(main_output_dir, "orf_output") -# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) -# for region_name in seq_region_lengths: -# region_length = seq_region_lengths[region_name] -# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) -# for phase in range(0, 6): -# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) - - -# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): -# current_index = phase -# orf_counter = 1 -# if phase > 2: -# seq = reverse_complement(seq) -# current_index = current_index % 3 - -# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) -# orf_out = open(orf_file_path, "w+") - -# while current_index < len(seq): -# codon = seq[current_index : current_index + 3] -# if codon == "ATG": -# orf_seq = codon -# for j in range(current_index + 3, len(seq), 3): -# next_codon = seq[j : j + 3] -# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": -# orf_seq += next_codon -# if len(orf_seq) >= min_orf_length: -# orf_out.write( -# ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" -# ) -# orf_out.write(orf_seq + "\n") -# orf_counter += 1 -# orf_seq = "" -# break - -# # If there's another met in phase, then put i to the start of -# # the codon after j so that only the longest ORF is found -# if next_codon == "ATG": -# current_index = j + 3 -# orf_seq += next_codon -# current_index += 3 
-# orf_out.close() - - -# def bed_to_gff(input_dir, hints_file): -# gff_out = open(hints_file, "w+") -# exons_dict = {} -# for bed_file in glob.glob(input_dir + "/*.bed"): -# logger.info("Processing file for hints:") -# logger.info(bed_file) -# bed_in = open(bed_file) -# bed_lines = bed_in.readlines() -# for line in bed_lines: -# line = line.rstrip() -# elements = line.split("\t") -# seq_region_name = elements[0] -# offset = int(elements[1]) -# hit_name = elements[3] -# strand = elements[5] -# block_sizes = elements[10].split(",") -# block_sizes = list(filter(None, block_sizes)) -# block_starts = elements[11].split(",") -# block_starts = list(filter(None, block_starts)) -# exons = bed_block_to_exons(block_sizes, block_starts, offset) -# for i, element in enumerate(exons): -# exon_coords = exons[i] -# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand -# if exon_key in exons_dict: -# exons_dict[exon_key][5] += 1 -# else: -# gff_list = [ -# seq_region_name, -# "CDNA", -# "exon", -# exon_coords[0], -# exon_coords[1], -# 1.0, -# strand, -# ".", -# ] -# exons_dict[exon_key] = gff_list - -# for exon_key, gff_list in exons_dict.items(): -# gff_list[5] = str(gff_list[5]) -# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" -# gff_out.write(gff_line) - -# gff_out.close() - -# sorted_hints_out = open((hints_file + ".srt"), "w+") -# subprocess.run( -# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], -# stdout=sorted_hints_out, -# ) -# sorted_hints_out.close() - - -# def splice_junction_to_gff(input_dir, hints_file): -# sjf_out = open(hints_file, "w+") - -# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): -# sjf_in = open(sj_tab_file) -# sjf_lines = sjf_in.readlines() -# for line in sjf_lines: -# elements = line.split("\t") -# strand = "+" -# # If the strand is undefined then skip, Augustus expects a strand -# if elements[3] == "0": -# continue -# elif elements[3] == "2": -# strand = "-" - -# junction_length = int(elements[2]) - int(elements[1]) + 1 -# if junction_length < 100: -# continue - -# if not elements[4] and elements[7] < 10: -# continue - -# # For the moment treat multimapping and single mapping things as a combined score -# score = float(elements[6]) + float(elements[7]) -# score = str(score) -# output_line = [ -# elements[0], -# "RNASEQ", -# "intron", -# elements[1], -# elements[2], -# score, -# strand, -# ".", -# ("src=W;mul=" + score + ";"), -# ] -# sjf_out.write("\t".join(output_line) + "\n") - -# sjf_out.close() - - -# def split_genome(genome_file, target_dir, min_seq_length): -# # This is the lazy initial way of just splitting into a dir -# # of files based on the toplevel sequence with a min sequence length filter -# # There are a couple of obvious improvements: -# # 1) Instead of making files for all seqs, just process N seqs -# # parallel, where N = num_threads. Then you could clean up the seq file -# # after each seq finishes, thus avoiding potentially -# # having thousands of file in a dir -# # 2) Split the seq into even slices and process these in parallel -# # (which the same cleanup as in 1). For sequences smaller than the -# # target slice size, bundle them up together into a single file. Vastly -# # more complex, partially implemented in the splice_genome method -# # Allows for more consistency with parallelisation (since there should -# # be no large outliers). 
But require a mapping strategy for the -# # coords and sequence names and all the hints will need to be adjusted -# current_header = "" -# current_seq = "" - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">(.+)$", line) -# if match and current_header: -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# print("Found an existing split file, so will not overwrite. File found:") -# print(file_out_name) - -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# line = file_in.readline() - -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# logger.info("Found an existing split file, so will not overwrite. File found:") -# logger.info(file_out_name) - -# file_in.close() - - -# # def merge_gtf_files(file_paths,id_label): - -# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') -# # gtf_out = open(gtf_file_path,'w+') -# # for gtf_file_path in gtf_files: -# # gtf_file_name = os.path.basename(gtf_file_path) -# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) -# # start_offset = int(match.group(1)) -# # gtf_in = open(gtf_file_path,'r') -# # line = gtf_in.readline() -# # while line: -# # values = line.split("\t") -# # if len(values) == 9 and (values[2] in feature_types): -# # values[3] = str(int(values[3]) + (start_offset - 1)) -# # values[4] = str(int(values[4]) + (start_offset - 1)) -# # gtf_out.write("\t".join(values)) -# # line = gtf_in.readline() -# # gtf_in.close() -# # gtf_out.close() - - -# def multiprocess_generic(cmd): -# print(" ".join(cmd)) -# subprocess.run(cmd) - - -# def seq_region_names(genome_file): -# region_list = [] - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">([^\s]+)", line) -# if match: -# region_name = match.group(1) -# if region_name == "MT": -# logger.info("Skipping region named MT") -# line = file_in.readline() -# continue -# else: -# region_list.append(match.group(1)) -# line = file_in.readline() - -# return region_list - - -# def slice_genome(genome_file, target_dir, target_slice_size): -# # The below is sort of tested -# # Without the -# target_seq_length = 50000000 -# min_seq_length = 1000 -# current_header = "" -# current_seq = "" -# seq_dict = {} -# for line in seq: # pylint:disable=used-before-assignment -# match = re.search(r">(.+)$", line) -# if match and current_header: -# seq_dict[current_header] = current_seq -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# seq_dict[current_header] = current_seq - -# seq_buffer = 0 -# file_number = 0 -# file_name = "genome_file_" + str(file_number) - -# for header in seq_dict: -# seq_iterator = 0 -# seq = seq_dict[header] - -# while len(seq) > target_seq_length: -# file_out = open(os.path.join(target_dir, file_name), "w+") -# subseq = seq[0:target_seq_length] -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + 
subseq + "\n") -# file_out.close() -# seq = seq[target_seq_length:] -# seq_iterator += 1 -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - -# if len(seq) >= min_seq_length: -# file_name = "genome_file_" + str(file_number) -# file_out = open(os.path.join(file_name), "w+") -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") -# file_out.close() -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - - -# def coallate_results(main_output_dir): -# results_dir = create_dir(main_output_dir, "results") -# output_dirs = [ -# "augustus_output", -# "cpg_output", -# "dust_output", -# "eponine_output", -# "red_output", -# "rfam_output", -# "trf_output", -# "trnascan_output", -# ] -# for output_dir in output_dirs: -# match = re.search(r"(.+)_output", output_dir) -# result_type = match.group(1) -# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") -# copy_path = os.path.join(results_dir, (result_type + ".gtf")) -# if os.path.exists(results_path): -# cpy_cmd = ["cp", results_path, copy_path] -# subprocess.run(cpy_cmd) + else: + logger.info( + "Found " + + str(total_lines) + + " total lines across the transcriptomic files. Checks passed" + ) \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/utils/_utils_copy.py b/src/python/ensembl/tools/anno/utils/_utils_copy.py new file mode 100644 index 0000000..b4bbc72 --- /dev/null +++ b/src/python/ensembl/tools/anno/utils/_utils_copy.py @@ -0,0 +1,899 @@ +# pylint: disable=missing-module-docstring +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import io +import logging +import os +from os import PathLike +from pathlib import Path +import random +import re +import subprocess +import shutil +import tempfile +from typing import Dict, List, Optional + +logger = logging.getLogger("__name__." + __name__) + +# with open(Path(__file__).parents[6] / "conf/config.json", "r", encoding="utf8") as f: +# config = json.load(f) + + +def create_dir(input_dir: Path, dir_name: str) -> Path: + """ + Create directory or subdirectory and log operations. 
+    Args:
+        input_dir: Path to the parent output directory
+        dir_name: Optional name of the subdirectory to be created
+    Returns:
+        Path to the created directory
+    """
+    if dir_name:
+        target_dir = Path(input_dir) / str(dir_name)
+    else:
+        target_dir = input_dir
+
+    if os.path.exists(target_dir):
+        logger.warning("Directory already exists, will not create again")
+        return target_dir
+
+    logger.info("Attempting to create target dir: %s", target_dir)
+
+    try:
+        os.mkdir(target_dir)
+    except OSError:
+        logger.error("Creation of the dir failed, path used: %s", target_dir)
+    else:
+        logger.info("Successfully created the dir on the following path: %s", target_dir)
+    return target_dir
+
+
+def check_exe(exe_bin: PathLike) -> None:
+    """
+    Check executable path
+    Args:
+        exe_bin: Executable path
+
+    Raises:
+        OSError: Executable path does not exist
+    """
+    if not shutil.which(exe_bin):
+        raise OSError(f"Exe does not exist. Path checked: {exe_bin}")
+
+
+def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int:
+    """
+    Count the number of lines of a given feature type in a GTF file
+
+    Args:
+        gtf_file: GTF file path
+        content_obj: Feature type to detect and count in the gtf, e.g. transcript, repeat
+
+    Return: Number of occurrences
+    """
+    obj_count = 0
+    with open(gtf_file) as gtf_in:
+        for line in gtf_in:
+            gtf_raw = line.split("\t")
+            if not len(gtf_raw) == 9:
+                continue
+            if gtf_raw[2] == content_obj:
+                obj_count += 1
+    logger.info("%s; Number of %s detected: %d", gtf_file, content_obj, int(obj_count))
+    return obj_count
+
+
+def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dict:
+    """
+    Split the genome file according to the header and store in a dictionary
+    all the sequences whose length is greater than min_seq_length.
+    Args:
+        genome_file: Genome file path.
+        min_seq_length: Minimum slice length.
+ Return: Dictionary of sequence headers and the corresponding sequence length + """ + current_header = "" + current_seq = "" + + seq_region_to_length = {} + with open(genome_file) as file_in: + for line in file_in: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + # logging.info("length of the seq region %s", seq_region_to_length) + return seq_region_to_length + + +def get_slice_id( + seq_region_to_length: Dict, + slice_size: int = 1000000, + overlap: int = 0, + min_length: int = 0, +) -> List: + """ + Get list of ids for a genomic slice + Arg: + seq_region_to_length: Dictionary with the sequence headers as keys and the sequence lengths as values + slice_size: Size of the slice + overlap: Overlap length between two slices + min_length: Min length of the slice + Return: List of IDs for each genomic slice + """ + + slice_ids_per_region = [] + for seq_region in seq_region_to_length: + seq_region_length = int(seq_region_to_length[seq_region]) + if seq_region_length < min_length: + continue + + if seq_region_length <= slice_size: + slice_ids_per_region.append([seq_region, 1, seq_region_length]) + continue + + start = 1 + end = start + slice_size - 1 + while end < seq_region_length: + start = start - overlap + start = max(start, 1) + end = start + slice_size - 1 + end = min(end, seq_region_length) + if (end - start + 1) >= min_length: + slice_ids_per_region.append([seq_region, start, end]) + start = end + 1 + # logging.info("list of slice ids %s", str(slice_ids_per_region)) + return slice_ids_per_region + + +def update_string(text, lookup_text, new_text) -> str: + """ + Update substring + Args: + text : Text. + lookup_text : String to look for. + new_text : String to substitute. + Return + Text with string substitution + """ + text = re.sub( + lookup_text, + new_text, + text, + ) + return text + + +# slice_output_to_gtf(repeatmasker_dir, "repeat_id", "repeatmask", True, ".rm.gtf") +def slice_output_to_gtf( # pylint: disable=too-many-branches + output_dir: Path, + feature_id_label: str = "", + new_id_prefix: str = "", + unique_ids: bool = True, + file_extension: str = ".gtf", +) -> None: + """ + Collect all the gtf files per file extension and merge them in a final gtf file assigning unique ids. + + This holds keys of the current slice details with the gene id to form unique keys. + Each time a new key is added the overall gene counter is incremented + and the value of the key is set to the new gene id. Any subsequent + lines with the same region/gene id key will then just get + the new id without incrementing the counter. + + Args: + output_dir : Output directory. + feature_id_label : Feature identifier. + new_id_prefix : New feature identifier. + unique_ids : If True assign unique ids for the same feature type. + file_extension : Input file extension. 
+ """ + feature_types = ["exon", "transcript", "repeat", "simple_feature"] + # Initialise gene and feature counter + gene_counter = 1 + feature_counter = 1 + # Initialise dictionaries that will store the list of gene and transcript indexes + gene_id_collection = {} + gene_transcript_id_collection = {} + transcript_id_count_gene_id = {} # one gene can have multiple transcripts + input_files = output_dir.glob(str("*" + file_extension)) + with open(output_dir / "annotation.gtf", "w+") as output_file: + for input_file in input_files: + if os.stat(input_file).st_size == 0: + logger.info("File is empty, will skip %s", input_file.name) + continue + match = re.search(r"\.rs(\d+)\.re(\d+)\.", input_file.name) + assert match is not None + start_offset = int(match.group(1)) + with open(input_file, "r") as gtf_in: + for line in gtf_in: + values = line.split("\t") + if len(values) == 9 and (values[2] in feature_types) and unique_ids: + # each slice start from 0 so we need to add the offset to get the real coordinates + values[3] = str(int(values[3]) + (start_offset - 1)) + values[4] = str(int(values[4]) + (start_offset - 1)) + # Unique id based on gene and transcript ids + attribs = values[8] + # Get the slice details, gene id and transcript id. + match_gene_type = re.search( + r'(gene_id +"([^"]+)").+(transcript_id +"([^"]+)")', + line, + ) + # gene_id "1"; transcript_id "1"; biotype "tRNA_pseudogene"; + if match_gene_type: + full_gene_id_string = match_gene_type.group(1) + current_gene_id = match_gene_type.group(2) + full_transcript_id_string = match_gene_type.group(3) + current_transcript_id = match_gene_type.group(4) + # Example key KS8000.rs1.re1000000.1 + gene_id_slice = input_file.name + "." + str(current_gene_id) + # Example key KS8000.rs1.re1000000.1.transcript.1 + transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + # If there is no existing entry, the gene key will be added + # and the gene counter is incremented. + # gene_id "gene1"; transcript_id "gene1.t1" + if gene_id_slice not in gene_id_collection: + new_gene_id = f"gene{gene_counter}" + gene_id_collection[gene_id_slice] = new_gene_id + transcript_id_count_gene_id[gene_id_slice] = 1 + gene_counter += 1 + else: + # If there is a key then the gene id will be replaced with the new gene id + new_gene_id = gene_id_collection[gene_id_slice] + attribs = re.sub( + full_gene_id_string, + 'gene_id "' + new_gene_id + '"', + attribs, + ) + # If there is no existing entry, the transcript key will be added + # and the transcript counter is incremented. 
+ if transcript_id_slice not in gene_transcript_id_collection: + new_transcript_id = ( + gene_id_collection[gene_id_slice] + + ".t" + + str(transcript_id_count_gene_id[gene_id_slice]) + ) + gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + transcript_id_count_gene_id[gene_id_slice] += 1 + else: + # If a transcript of the same set is already present, + # the new id with the incremented counter is added + new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + attribs = re.sub( + full_transcript_id_string, + 'transcript_id "' + new_transcript_id + '"', + attribs, + ) + values[8] = attribs + logger.info("FINAL GTF %s", values) + output_file.write("\t".join(values)) + # Unique id based on the feature type + else: + if new_id_prefix == "repeatmask": + match_feature_type = re.search( + r"(" + feature_id_label + r"\d+)", + line, + ) + else: + match_feature_type = ( + re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, + ) + ) + if match_feature_type: + full_feature_id = match_feature_type.group(1) + new_feature_id = new_id_prefix + str(feature_counter) + attribs = re.sub( + full_feature_id, + feature_id_label + ' "' + new_feature_id + '"', + attribs, + ) + feature_counter += 1 + values[8] = attribs + output_file.write("\t".join(values)) + else: + logger.info( + "Feature type not recognised, will skip. Feature type: %s", + values[2], + ) + + +def get_sequence( + seq_region: str, + start: int, + end: int, + strand: int, + fasta_file: Path, + output_dir: Path, +) -> str: + """ + Creates a tempfile and writes the bed info to it based on whatever information + has been passed in about the sequence. Then runs bedtools getfasta. The fasta file + should have a faidx. This can be created with the create_faidx static method prior + to fetching sequence + + Arg: + seq_region: str region name + start: int region start + end: int region end + strand: int strand of the sequence + fasta_file: genome FASTA file with indexing + output_dir: str working dir + Return: str sequence + """ + start -= 1 + logger.info( + "get_sequence %s", + f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", + ) + with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") + bed_temp_file.close() + bedtools_command = [ + "bedtools", + "getfasta", + "-fi", + str(fasta_file), + "-bed", + str(bed_temp_file.name), + ] + bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with + bedtools_command, stdout=subprocess.PIPE + ) # pylint:disable=consider-using-with + for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + if idx == 1: + if strand == 1: + sequence = line.rstrip() + else: + sequence = reverse_complement(line.rstrip()) + + os.remove(bed_temp_file.name) + # logger.info(f"sequence : {sequence}") + return sequence + + +def reverse_complement(sequence: str) -> str: + """ + Get the reverse complement of a nucleotide sequence. 
+    Args:
+        sequence: The nucleotide sequence
+    Returns:
+        The reverse complement of the sequence
+    """
+    rev_matrix = str.maketrans("atgcATGC", "tacgTACG")
+    return sequence.translate(rev_matrix)[::-1]
+
+
+def check_file(file_path: Path) -> None:
+    """
+    Raise an error when the file doesn't exist
+    Args:
+        file_path: File path
+    """
+    if not file_path.is_file():
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
+
+
+def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]:
+    """
+    Store FASTA sequences in a dictionary keyed by header
+    fasta_lines: List[str]: Lines of a FASTA file
+    """
+    index: Dict[str, str] = {}
+    header: Optional[str] = None
+    seq: List[str] = []
+
+    for line in fasta_lines:
+        line = line.strip()
+        # The line has already been stripped, so match the header without a trailing newline
+        header_match = re.match(r">(.+)$", line)
+        if header_match:
+            if header:
+                index[header] = "".join(seq)
+            header = header_match.group(1)
+            seq = []
+        else:
+            seq.append(line)
+    # Add the last sequence
+    if header:
+        index[header] = "".join(seq)
+    return index
+
+
+# def fasta_to_dict(fasta_lines:List[str]):
+#    index = {}
+#    for line in fasta_lines:
+#        #it = iter(fasta_lines)
+#        #for header in it:
+#        match = re.search(r">(.+)\n$", line)
+#        header = match.group(1)
+#        seq = next(line)
+#        index[header] = seq
+#    return index
+
+
+def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List:
+    """
+    Split the protein dataset file into batches of batch_size sequences,
+    written to batch files spread across 10 output directories.
+        protein_dataset : Path for the protein dataset.
+        output_dir : Output directory path.
+        batch_size : Size of each batch; it should equal the number of threads
+        so that the sequence processing of each file can be parallelised.
+    """
+    batched_protein_files = []
+
+    for i in range(0, 10):
+        create_dir(output_dir, (f"bin_{i}"))
+    with open(protein_dataset, "r", encoding="utf8") as file_in:
+        seq_count = 0
+        batch_count = 0
+        current_record = ""
+        initial_seq = True
+        for line in file_in:
+            match = re.search(r">(.+)$", line)
+            # Header line and not the first sequence: if the number of sequences stored so far
+            # is a multiple of batch_size, write the current record to a new file and reset it
+            if match and not initial_seq and seq_count % batch_size == 0:
+                bin_num = random.randint(0, 9)
+                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+                with batch_file.open("w+") as file_out:
+                    file_out.write(current_record)
+                batch_count += 1
+                seq_count += 1
+                current_record = line
+                batched_protein_files.append(batch_file)
+            # Header line and the first sequence
+            elif match:
+                current_record += line
+                initial_seq = False
+                seq_count += 1
+            # Other lines
+            else:
+                current_record += line
+
+    if current_record:
+        bin_num = random.randint(0, 9)
+        batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+        with batch_file.open("w+") as file_out:
+            file_out.write(current_record)
+        batched_protein_files.append(batch_file)
+    return batched_protein_files
+
+
+def run_command(cmd) -> None:
+    """Run a command in a subprocess and log the outcome
+
+    Args:
+        cmd (List[str]): command to execute
+    """
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        logger.info("Command executed successfully: %s", cmd)
+        logger.info("Output: %s", result.stdout)
+    except subprocess.CalledProcessError as e:
+        logger.error("Command failed: %s", cmd)
+        logger.error("Error: %s", e.stderr)
+
+
+def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]:
+    """
+ Parses a GTF file to extract gene, transcript, and exon information. + + Args: + gtf_file: Path to the GTF file. + + Returns: + A dictionary containing gene information, organized by gene_id, transcript_id, and features. + """ + gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} + + with open(gtf_file, "r") as gtf_in: + for line in gtf_in: + eles = line.strip().split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) + if not match: + continue + + gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] + + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} + + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} + elif feature_type == "exon": + if transcript_id in gtf_genes[gene_id]: + gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) + + return gtf_genes + + +def check_transcriptomic_output(main_output_dir: Path) -> None: + """Check across the various transcriptomic if there is a reasonable amount of data + + Args: + main_output_dir (Path): Output dir path + Raises: + IOError: If no lines or fewer than the minimum expected lines are found in the output files. + """ + + transcriptomic_dirs = [ + "scallop_output", + "stringtie_output", + "minimap2_output", + ] + total_lines = 0 + min_lines = 100000 + for transcriptomic_dir in transcriptomic_dirs: + full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf" + if not full_file_path.exists(): + logger.warning( + "Warning, no annotation.gtf found for %s. This might be fine, \ + e.g. no long read data were provided", + transcriptomic_dir, + ) + continue + num_lines = sum(1 for _ in full_file_path.open("r")) + total_lines += num_lines + logging.info("For %s found a total of in the annotation.gtf file", num_lines) + if total_lines == 0: + raise IOError( + "Anno was run with transcriptomic mode enabled,\ + but the transcriptomic annotation files are empty" + ) + if total_lines <= min_lines: + raise IOError( + "Anno was run with transcriptomic mode enabled, \ + but the total number of lines in the output \ + files were less than the min expected value" + + "\n" + "Found: " + str(total_lines) + "\n" + "Min allowed: " + str(min_lines) + ) + + + +# def run_find_orfs(genome_file, main_output_dir): +# min_orf_length = 600 + +# orf_output_dir = create_dir(main_output_dir, "orf_output") +# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) +# for region_name in seq_region_lengths: +# region_length = seq_region_lengths[region_name] +# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) +# for phase in range(0, 6): +# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) + + +# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): +# current_index = phase +# orf_counter = 1 +# if phase > 2: +# seq = reverse_complement(seq) +# current_index = current_index % 3 + +# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) +# orf_out = open(orf_file_path, "w+") + +# while current_index < len(seq): +# codon = seq[current_index : current_index + 3] +# if codon == "ATG": +# orf_seq = codon +# for j in range(current_index + 3, len(seq), 3): +# next_codon = seq[j : j + 3] +# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": +# orf_seq += next_codon +# if len(orf_seq) >= min_orf_length: +# orf_out.write( +# ">" + 
region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" +# ) +# orf_out.write(orf_seq + "\n") +# orf_counter += 1 +# orf_seq = "" +# break + +# # If there's another met in phase, then put i to the start of +# # the codon after j so that only the longest ORF is found +# if next_codon == "ATG": +# current_index = j + 3 +# orf_seq += next_codon +# current_index += 3 +# orf_out.close() + + +# def bed_to_gff(input_dir, hints_file): +# gff_out = open(hints_file, "w+") +# exons_dict = {} +# for bed_file in glob.glob(input_dir + "/*.bed"): +# logger.info("Processing file for hints:") +# logger.info(bed_file) +# bed_in = open(bed_file) +# bed_lines = bed_in.readlines() +# for line in bed_lines: +# line = line.rstrip() +# elements = line.split("\t") +# seq_region_name = elements[0] +# offset = int(elements[1]) +# hit_name = elements[3] +# strand = elements[5] +# block_sizes = elements[10].split(",") +# block_sizes = list(filter(None, block_sizes)) +# block_starts = elements[11].split(",") +# block_starts = list(filter(None, block_starts)) +# exons = bed_block_to_exons(block_sizes, block_starts, offset) +# for i, element in enumerate(exons): +# exon_coords = exons[i] +# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand +# if exon_key in exons_dict: +# exons_dict[exon_key][5] += 1 +# else: +# gff_list = [ +# seq_region_name, +# "CDNA", +# "exon", +# exon_coords[0], +# exon_coords[1], +# 1.0, +# strand, +# ".", +# ] +# exons_dict[exon_key] = gff_list + +# for exon_key, gff_list in exons_dict.items(): +# gff_list[5] = str(gff_list[5]) +# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" +# gff_out.write(gff_line) + +# gff_out.close() + +# sorted_hints_out = open((hints_file + ".srt"), "w+") +# subprocess.run( +# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], +# stdout=sorted_hints_out, +# ) +# sorted_hints_out.close() + + +# def splice_junction_to_gff(input_dir, hints_file): +# sjf_out = open(hints_file, "w+") + +# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): +# sjf_in = open(sj_tab_file) +# sjf_lines = sjf_in.readlines() +# for line in sjf_lines: +# elements = line.split("\t") +# strand = "+" +# # If the strand is undefined then skip, Augustus expects a strand +# if elements[3] == "0": +# continue +# elif elements[3] == "2": +# strand = "-" + +# junction_length = int(elements[2]) - int(elements[1]) + 1 +# if junction_length < 100: +# continue + +# if not elements[4] and elements[7] < 10: +# continue + +# # For the moment treat multimapping and single mapping things as a combined score +# score = float(elements[6]) + float(elements[7]) +# score = str(score) +# output_line = [ +# elements[0], +# "RNASEQ", +# "intron", +# elements[1], +# elements[2], +# score, +# strand, +# ".", +# ("src=W;mul=" + score + ";"), +# ] +# sjf_out.write("\t".join(output_line) + "\n") + +# sjf_out.close() + + +# def split_genome(genome_file, target_dir, min_seq_length): +# # This is the lazy initial way of just splitting into a dir +# # of files based on the toplevel sequence with a min sequence length filter +# # There are a couple of obvious improvements: +# # 1) Instead of making files for all seqs, just process N seqs +# # parallel, where N = num_threads. Then you could clean up the seq file +# # after each seq finishes, thus avoiding potentially +# # having thousands of file in a dir +# # 2) Split the seq into even slices and process these in parallel +# # (which the same cleanup as in 1). 
For sequences smaller than the +# # target slice size, bundle them up together into a single file. Vastly +# # more complex, partially implemented in the splice_genome method +# # Allows for more consistency with parallelisation (since there should +# # be no large outliers). But require a mapping strategy for the +# # coords and sequence names and all the hints will need to be adjusted +# current_header = "" +# current_seq = "" + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">(.+)$", line) +# if match and current_header: +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# print("Found an existing split file, so will not overwrite. File found:") +# print(file_out_name) + +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# line = file_in.readline() + +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# logger.info("Found an existing split file, so will not overwrite. File found:") +# logger.info(file_out_name) + +# file_in.close() + + +# # def merge_gtf_files(file_paths,id_label): + +# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# # gtf_out = open(gtf_file_path,'w+') +# # for gtf_file_path in gtf_files: +# # gtf_file_name = os.path.basename(gtf_file_path) +# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# # start_offset = int(match.group(1)) +# # gtf_in = open(gtf_file_path,'r') +# # line = gtf_in.readline() +# # while line: +# # values = line.split("\t") +# # if len(values) == 9 and (values[2] in feature_types): +# # values[3] = str(int(values[3]) + (start_offset - 1)) +# # values[4] = str(int(values[4]) + (start_offset - 1)) +# # gtf_out.write("\t".join(values)) +# # line = gtf_in.readline() +# # gtf_in.close() +# # gtf_out.close() + + +# def multiprocess_generic(cmd): +# print(" ".join(cmd)) +# subprocess.run(cmd) + + +# def seq_region_names(genome_file): +# region_list = [] + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">([^\s]+)", line) +# if match: +# region_name = match.group(1) +# if region_name == "MT": +# logger.info("Skipping region named MT") +# line = file_in.readline() +# continue +# else: +# region_list.append(match.group(1)) +# line = file_in.readline() + +# return region_list + + +# def slice_genome(genome_file, target_dir, target_slice_size): +# # The below is sort of tested +# # Without the +# target_seq_length = 50000000 +# min_seq_length = 1000 +# current_header = "" +# current_seq = "" +# seq_dict = {} +# for line in seq: # pylint:disable=used-before-assignment +# match = re.search(r">(.+)$", line) +# if match and current_header: +# seq_dict[current_header] = current_seq +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# seq_dict[current_header] = current_seq + +# seq_buffer = 0 +# file_number = 0 +# file_name = "genome_file_" + str(file_number) + +# for 
header in seq_dict: +# seq_iterator = 0 +# seq = seq_dict[header] + +# while len(seq) > target_seq_length: +# file_out = open(os.path.join(target_dir, file_name), "w+") +# subseq = seq[0:target_seq_length] +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n") +# file_out.close() +# seq = seq[target_seq_length:] +# seq_iterator += 1 +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + +# if len(seq) >= min_seq_length: +# file_name = "genome_file_" + str(file_number) +# file_out = open(os.path.join(file_name), "w+") +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") +# file_out.close() +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + + +# def coallate_results(main_output_dir): +# results_dir = create_dir(main_output_dir, "results") +# output_dirs = [ +# "augustus_output", +# "cpg_output", +# "dust_output", +# "eponine_output", +# "red_output", +# "rfam_output", +# "trf_output", +# "trnascan_output", +# ] +# for output_dir in output_dirs: +# match = re.search(r"(.+)_output", output_dir) +# result_type = match.group(1) +# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") +# copy_path = os.path.join(results_dir, (result_type + ".gtf")) +# if os.path.exists(results_path): +# cpy_cmd = ["cp", results_path, copy_path] +# subprocess.run(cpy_cmd)
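
The record-loading helpers added earlier in this patch (generic_load_records_to_ensembl_db and multiprocess_load_records_to_ensembl_db) rely on the multiprocessing.Pool apply_async / close / join pattern to fan GTF record batches out to worker processes. Below is a minimal, self-contained sketch of that pattern with a dummy worker standing in for the Perl loading step; the function and data names are illustrative only, not part of the patch.

import multiprocessing


def load_batch(record_batch):
    # Stand-in for multiprocess_load_records_to_ensembl_db: here we only count lines.
    return len(record_batch)


if __name__ == "__main__":
    # Toy GTF record batches, as batch_gtf_records would produce.
    gtf_records = [["line1\n", "line2\n"], ["line3\n"]]
    pool = multiprocessing.Pool(2)
    results = [pool.apply_async(load_batch, args=(batch,)) for batch in gtf_records]
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for all workers to finish
    print([res.get() for res in results])  # [2, 1]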
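
A minimal usage sketch for the get_slice_id helper introduced in _utils_copy.py. The import path follows the new module location in this patch; the region name and length are made up. With a 2.5 Mb region and the parameters below, the helper returns three slices.

from ensembl.tools.anno.utils._utils_copy import get_slice_id

# Hypothetical toplevel sequence and its length.
seq_region_to_length = {"chr_test": 2_500_000}
slices = get_slice_id(seq_region_to_length, slice_size=1_000_000, overlap=0, min_length=0)
# Expected: [["chr_test", 1, 1000000], ["chr_test", 1000001, 2000000], ["chr_test", 2000001, 2500000]]
print(slices)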
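
The slice_output_to_gtf docstring describes how slice-local gene and transcript ids are remapped to globally unique ones. The sketch below isolates that keying and counting scheme with made-up slice file names and attribute strings; it is the same bookkeeping in miniature, not the production code path.

import re

# Hypothetical per-slice GTF attribute strings, keyed by the slice file they came from.
lines = [
    ("toplevel.rs1.re1000000.gtf", 'gene_id "1"; transcript_id "1";'),
    ("toplevel.rs1.re1000000.gtf", 'gene_id "1"; transcript_id "2";'),
    ("toplevel.rs1000001.re2000000.gtf", 'gene_id "1"; transcript_id "1";'),
]

gene_ids = {}        # slice-qualified gene key -> new gene id
transcript_ids = {}  # slice-qualified transcript key -> new transcript id
tr_counts = {}       # transcripts assigned so far per gene key
gene_counter = 1

for slice_name, attribs in lines:
    match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', attribs)
    gene_key = f"{slice_name}.{match.group(1)}"
    transcript_key = f"{gene_key}.{match.group(2)}"
    if gene_key not in gene_ids:
        gene_ids[gene_key] = f"gene{gene_counter}"
        tr_counts[gene_key] = 1
        gene_counter += 1
    if transcript_key not in transcript_ids:
        transcript_ids[transcript_key] = f"{gene_ids[gene_key]}.t{tr_counts[gene_key]}"
        tr_counts[gene_key] += 1
    print(gene_ids[gene_key], transcript_ids[transcript_key])
# Prints: gene1 gene1.t1 / gene1 gene1.t2 / gene2 gene2.t1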
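
get_sequence writes a one-line BED record to a temporary file and shells out to bedtools getfasta. The snippet below sketches an equivalent standalone call (genome path and coordinates are placeholders), which can help when debugging an unexpectedly empty slice; note the 1-based start is converted to BED's 0-based half-open convention, as get_sequence does.

import os
import subprocess
import tempfile
from pathlib import Path

genome_fasta = Path("/data/genome/genome.fa")  # placeholder; expected to have a .fai index alongside it
seq_region, start, end = "chr_test", 1, 5000   # 1-based inclusive coordinates

with tempfile.NamedTemporaryFile("w+t", suffix=".bed", delete=False) as bed:
    # BED is 0-based half-open, hence start - 1.
    bed.write(f"{seq_region}\t{start - 1}\t{end}")
    bed_path = bed.name

result = subprocess.run(
    ["bedtools", "getfasta", "-fi", str(genome_fasta), "-bed", bed_path],
    capture_output=True, text=True, check=True,
)
os.remove(bed_path)
# The second line of the FASTA output holds the forward-strand sequence.
print(result.stdout.splitlines()[1])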
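
A usage sketch for split_protein_file, again assuming the _utils_copy module path from this patch; the input and output paths are placeholders. Bins are chosen at random, so only the overall layout (bin_0 … bin_9 directories containing numbered .fa batch files) is predictable, not the exact placement of each batch.

from pathlib import Path

from ensembl.tools.anno.utils._utils_copy import split_protein_file

protein_dataset = Path("/data/proteins/uniprot_subset.fa")  # placeholder input
output_dir = Path("/data/proteins/split")                   # must already exist
batch_files = split_protein_file(protein_dataset, output_dir, batch_size=20)
# Each returned path sits under one of the bin directories, e.g.
# /data/proteins/split/bin_3/0.fa, /data/proteins/split/bin_7/1.fa, ...
print(len(batch_files), "batch files written")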