From a90fd9fff03e2c411ab823b80b155905e6efbcbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:16:11 +0100 Subject: [PATCH 01/20] added option to expose trimming and subsampling, fixed error on paired fastq files --- .../anno/transcriptomic_annotation/star.py | 417 +++++++++++++----- 1 file changed, 310 insertions(+), 107 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index b9d7f42..56c181a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -33,7 +33,7 @@ import re import shutil import subprocess -from typing import List +from typing import Dict, List from ensembl.tools.anno.utils._utils import ( check_exe, @@ -43,7 +43,7 @@ ) -def run_star( +def run_star( # pylint:disable=too-many-branches genome_file: Path, output_dir: Path, short_read_fastq_dir: Path, @@ -51,6 +51,11 @@ def run_star( trim_fastq: bool = False, max_reads_per_sample: int = 0, max_intron_length: int = 100000, + subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = False, num_threads: int = 1, star_bin: Path = Path("star"), samtools_bin: Path = Path("samtools"), @@ -72,6 +77,20 @@ def run_star( :type max_reads_per_sample: int, default 0 :param max_intron_length: The maximum intron size for alignments. Defaults to 100000. :type max_intron_length: int, default 100000 + :param subsample_read_limit: Maximum number of reads to subsample. Defaults to 100000000. + :type subsample_read_limit:int, default 100000000, + :param subsample_percentage: Maximun percentage of reads to subsample. + :type subsample_percentage: int, default 0.25, + :param sampling_via_read_limit: If True will subsample an input dataset of \ + fastq files using --subsample_read_limit value. + :type sampling_via_read_limit : boolean, False, + :param sampling_via_percentage: If True will subsample an input dataset of \ + fastq files using --subsample_percentage value. + :type sampling_via_percentage : boolean, False, + :param sampling_via_read_limit_percentage: If True will subsample an input dataset \ + of fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. + :type sampling_via_read_limit_percentage : boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 :param star_bin: Software path. 
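[Editor's note - illustrative sketch, not part of the patch] The three new sampling flags in this hunk select an effective sample size; the standalone helper below mirrors the selection logic this patch adds to _subsample_paired_fastq_files further down. The function name effective_sample_size, the total_reads argument, and the keep-everything fallback when no flag is set are invented here for illustration only.

    def effective_sample_size(
        total_reads: int,
        subsample_read_limit: int = 100000000,
        subsample_percentage: float = 0.25,
        sampling_via_read_limit: bool = False,
        sampling_via_percentage: bool = False,
        sampling_via_read_limit_percentage: bool = True,
    ) -> int:
        """Number of reads that would be kept for a file containing total_reads reads."""
        sampling_size = 0
        if sampling_via_read_limit and subsample_read_limit:
            # Hard cap on the number of reads.
            sampling_size = subsample_read_limit
        elif sampling_via_percentage and subsample_percentage:
            # Keep a fixed fraction of the input.
            sampling_size = round(total_reads * subsample_percentage)
        elif sampling_via_read_limit_percentage and subsample_percentage and subsample_read_limit:
            # Take whichever of the two limits gives fewer reads.
            sampling_size = min(subsample_read_limit, round(total_reads * subsample_percentage))
        # As in the patch: if the file already has fewer reads than the target, keep them all.
        return min(total_reads, sampling_size) if sampling_size else total_reads

    # Example: 200M input reads with the defaults keeps min(100000000, 50000000) = 50M reads.
    print(effective_sample_size(200000000))
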
@@ -107,7 +126,7 @@ def run_star( return star_index_file = star_dir / "SAindex" - fastq_file_list = [] + fastq_file_list: List[Path] = [] file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz") fastq_file_list = [ path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) @@ -119,14 +138,26 @@ def run_star( # fastq_file_list.extend(glob.glob(os.path.join(short_read_fastq_dir, file_type))) # Get list of paired paths - fastq_file_list = _create_paired_paths(fastq_file_list) + paired_fastq_file_list = _create_paired_paths(fastq_file_list) # Subsamples in parallel if there's a value set if max_reads_per_sample: - subsample_transcriptomic_data(fastq_file_list) + paired_fastq_file_list_sub: List[List[Path]] = [] + for paired_fastq_files in paired_fastq_file_list: + paired_fastq_files_sub = subsample_transcriptomic_data( + paired_fastq_files, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + paired_fastq_file_list_sub.append(paired_fastq_files_sub) + paired_fastq_file_list = paired_fastq_file_list_sub # Get the list of the new subsampled files - fastq_file_list = [ - path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) - ] + # fastq_file_list = [ + # path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) + # ] # I don't think is needed # fastq_file_list = check_for_fastq_subsamples(fastq_file_list) @@ -159,23 +190,27 @@ def run_star( str(genome_file), ] ) - except Exception as e:#pylint:disable=broad-exception-caught + except Exception as e: # pylint:disable=broad-exception-caught logging.error("An error occurred while creating star index: %s", e) logging.info("Running Star on the files in the fastq dir") - for fastq_file in fastq_file_list: + for paired_files in paired_fastq_file_list: + first_file_name = paired_files[0].name # Get the name of the first file + match = re.search(r"(.+)_\d+\.(fastq|fq)", first_file_name) # Search for pattern + if match: + first_part_of_name = match.group(1) # logger.info(fastq_file_path) # fastq_file_name = os.path.basename(fastq_file_path) star_tmp_dir = star_dir / "tmp" if star_tmp_dir.exists(): shutil.rmtree(star_tmp_dir) - sam_file = Path(f"{star_dir}/{fastq_file.name}.sam") - junctions_file = Path(f"{star_dir}/{fastq_file.name}.sj.tab") + sam_file = Path(f"{star_dir}/{first_part_of_name}.sam") + junctions_file = Path(f"{star_dir}/{first_part_of_name}.sj.tab") sam_file_name = sam_file.name sam_temp_file = Path(f"{star_dir}/{sam_file_name}.tmp") bam_file = re.sub(".sam", ".bam", sam_file_name) bam_sort_file = Path(f"{star_dir}/{bam_file}") - log_out_file = Path(f"{star_dir}/{fastq_file.name}.Log.final.out") + log_out_file = Path(f"{star_dir}/{first_part_of_name}.Log.final.out") if log_out_file.exists() and bam_sort_file.exists() and bam_sort_file.stat().st_size != 0: logging.info( "Found an existing bam file for the fastq file, \ @@ -183,7 +218,9 @@ def run_star( ) continue - logging.info("Processing %s", fastq_file) + read_files_in = ",".join(str(fastq_file) for fastq_file in paired_files) + + logging.info("Processing %s", list(paired_files)) star_command = [ str(star_bin), "--outFilterIntronMotifs", @@ -199,7 +236,7 @@ def run_star( "--genomeDir", str(star_dir), "--readFilesIn", - str(fastq_file), + read_files_in, "--outFileNamePrefix", f"{star_dir}/", "--outTmpDir", @@ -212,7 +249,7 @@ def run_star( 
#'--outSJfilterIntronMaxVsReadN','5000','10000','25000','40000', #'50000','50000','50000','50000','50000','100000'] # check_compression = re.search(r".gz$", fastq_file) - if fastq_file.suffix.endswith(".gz"): + if Path(paired_files[0].name).suffix.endswith(".gz"): star_command.append("--readFilesCommand") star_command.append("gunzip") star_command.append("-c") @@ -238,7 +275,7 @@ def run_star( logging.info("Completed running STAR") -def _create_paired_paths(fastq_file_paths: List) -> List[Path]: +def _create_paired_paths(fastq_file_paths: List[Path]) -> List[List[Path]]: """ Create list of paired transcriptomic fastq files @@ -246,30 +283,28 @@ def _create_paired_paths(fastq_file_paths: List) -> List[Path]: fastq_file_paths (List): List of transcriptomic file paths. Returns: - List: List of paired transcriptomic files + List[List[Path]]: List of paired transcriptomic files """ - path_dict = {} - # final_list = [] + path_dict: Dict[str, List[Path]] = {} + final_list: List[List[Path]] = [] for fastq_file in fastq_file_paths: paired_name = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_file)) if not paired_name: logging.exception( "Could not find _1 or _2 at the end of the prefix \ for file. Assuming file is not paired: %s", - fastq_file, + str(fastq_file), ) - # final_list.append([fastq_file]) - path_dict[fastq_file] = [fastq_file] + final_list.append([fastq_file]) continue run_accession = paired_name.group(1) if run_accession in path_dict: path_dict[run_accession].append(fastq_file) else: path_dict[run_accession] = [fastq_file] - # for pair in path_dict: - # final_list.append(path_dict[pair]) - logging.info([value for values_list in path_dict.values() for value in values_list]) - return [value for values_list in path_dict.values() for value in values_list] + for pair in path_dict: # pylint:disable=consider-using-dict-items + final_list.append(path_dict[pair]) + return final_list # pylint:disable=pointless-string-statement @@ -279,22 +314,108 @@ def _create_paired_paths(fastq_file_paths: List) -> List[Path]: """ -def _subsample_paired_fastq_files( +def subsample_transcriptomic_data( + fastq_file_list: List[Path], + subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = True, + num_threads: int = 2, +) -> List[Path]: + """ + Subsample list of paired files. + Args: + fastq_file_list : Subsample paired fastq files. + subsample_read_limit : Maximum number of reads to subsample, default to 100000000. + subsample_percentage : Maximun percentage of reads to subsample, default to 0.25. + sampling_via_read_limit : If True will subsample an input dataset of fastq files \ + using --subsample_read_limit value. + sampling_via_percentage : If True will subsample an input dataset of fastq files \ + using --subsample_percentage value. + sampling_via_read_limit_percentage : If True will subsample an input dataset of \ + fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. 
+ num_threads : number of threads + Returns: + List[Path]: List of subsampled paired transcriptomic files + """ + subsampled_fastq_files: List[Path] = [] + # for fastq_files in fastq_file_list: + # fastq_file_1, fastq_file_2 = fastq_files + # fastq_file_pair = "" + # if len(fastq_files) == 2: + # fastq_file_pair = fastq_files[1] + + if len(fastq_file_list) == 1: + fastq_file_1 = fastq_file_list[0] + if Path(f"{fastq_file_1}.sub").exists(): + logging.info( + "Found an existing .sub file on the fastq path, will use that \ + instead. File:%s.sub", + fastq_file_1, + ) + else: + _subsample_paired_fastq_files( + fastq_file_list, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + subsampled_fastq_files = [Path(f"{fastq_file_1}.sub")] + if len(fastq_file_list) == 2: + fastq_file_1, fastq_file_2 = fastq_file_list + if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists(): + logging.info( + "Found an existing .sub files on the fastq path for both members of the pair, will use \ + those instead of subsampling again. Files: %s.sub,%s.sub", + fastq_file_1, + fastq_file_2, + ) + else: + _subsample_paired_fastq_files( + fastq_file_list, + subsample_read_limit, + subsample_percentage, + sampling_via_read_limit, + sampling_via_percentage, + sampling_via_read_limit_percentage, + num_threads, + ) + subsampled_fastq_files = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")] + return subsampled_fastq_files + + +def _subsample_paired_fastq_files( # pylint:disable=too-many-branches fastq_files: List[Path], subsample_read_limit: int = 100000000, + subsample_percentage: float = 0.25, + sampling_via_read_limit: bool = False, + sampling_via_percentage: bool = False, + sampling_via_read_limit_percentage: bool = True, num_threads: int = 2, - compressed: bool = False, ) -> None: """ Perform subsampling on two paired FastQ files in parallel using multiple threads. Args: fastq_files : Path for paired fastq files. - output_files : Path for the output file. - subsample_read_limit : Subsample size, defaults to 100000000. + subsample_read_limit : Maximum number of reads to subsample, default to 100000000. + subsample_percentage : Maximun percentage of reads to subsample, default to 0.25. + sampling_via_read_limit : If True will subsample an input dataset of fastq files \ + using --subsample_read_limit value. + sampling_via_percentage : If True will subsample an input dataset of fastq files \ + using --subsample_percentage value. + sampling_via_read_limit_percentage : If True will subsample an input dataset of \ + fastq files using --subsample_read_limit and --subsample_percentage value; \ + the lowest number of reads is taken. num_threads : Number of threads, defaults to 2. compressed : file compressed, defaults to False. 
""" + if len(fastq_files) == 2: fastq_file_1, fastq_file_2 = fastq_files output_file_1, output_file_2 = [Path(f"{fastq_file_1}.sub"), Path(f"{fastq_file_2}.sub")] @@ -308,42 +429,68 @@ def _subsample_paired_fastq_files( compressed = True num_lines = sum(1 for line in gzip.open(fastq_file_1)) # pylint:disable=consider-using-with else: + compressed = False num_lines = sum(1 for line in open(fastq_file_1)) # pylint:disable=consider-using-with range_limit = int(num_lines / 4) - if range_limit <= subsample_read_limit: + sampling_size = 0 + if sampling_via_read_limit and subsample_read_limit: + sampling_size = subsample_read_limit + elif sampling_via_percentage and subsample_percentage: + sampling_size = round(range_limit * subsample_percentage) + elif sampling_via_read_limit_percentage and subsample_percentage and subsample_read_limit: + sampling_size = min(subsample_read_limit, round(range_limit * subsample_percentage)) + + if range_limit <= sampling_size: logging.info( "Number of reads (%s is less than the max allowed read count (%s), \ no need to subsample", str(range_limit), - str(subsample_read_limit), + str(sampling_size), ) return - rand_list = random.sample(range(0, range_limit - 1), subsample_read_limit) + rand_list = random.sample(range(0, range_limit - 1), sampling_size) random_indices = {idx * 4: 1 for idx in rand_list} logging.info("Processing paired files in parallel") - pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - pool.apply_async( - _subsample_fastq_subset, - args=( + if num_threads >= 2: + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + pool.apply_async( + _subsample_fastq_subset, + args=( + fastq_file_1, + output_file_1, + random_indices, + compressed, + ), + ) + if len(fastq_files) == 2: + pool.apply_async( + _subsample_fastq_subset, + args=( + fastq_file_2, + output_file_2, + random_indices, + compressed, + ), + ) + + pool.close() + pool.join() + else: + _subsample_fastq_subset( fastq_file_1, output_file_1, random_indices, compressed, - ), - ) - pool.apply_async( - _subsample_fastq_subset, - args=( - fastq_file_2, - output_file_2, - random_indices, - compressed, - ), - ) - pool.close() - pool.join() + ) + if len(fastq_files) == 2: + _subsample_fastq_subset( + fastq_file_2, + output_file_2, + random_indices, + compressed, + ) def _subsample_fastq_subset( @@ -351,7 +498,6 @@ def _subsample_fastq_subset( ) -> None: """ Selecting specific sets of four lines from an input FastQ file and writing them to an output file. - Args: fastq_file : Path for the fastq file. output_file : Path for the output file. @@ -371,43 +517,6 @@ def _subsample_fastq_subset( lines = [file_in.readline() for _ in range(4)] -def subsample_transcriptomic_data(fastq_file_list: List, num_threads: int = 2) -> None: - """ - Subsample paired fastq files. - - Args: - fastq_file_list : List of fastq file path to process. - num_threads : number of threads - """ - for fastq_files in fastq_file_list: - fastq_file_1, fastq_file_2 = fastq_files - # fastq_file_pair = "" - # if len(fastq_files) == 2: - # fastq_file_pair = fastq_files[1] - - if len(fastq_files) == 1: - fastq_file_1 = fastq_files[0] - if Path(f"{fastq_file_1}.sub").exists(): - logging.info( - "Found an existing .sub file on the fastq path, will use that instead. 
File:%s.sub", - fastq_file_1, - ) - else: - _subsample_paired_fastq_files(fastq_files, compressed=True, num_threads=num_threads) - - elif len(fastq_files) == 2: - fastq_file_1, fastq_file_2 = fastq_files - if Path(f"{fastq_file_1}.sub").exists() and Path(f"{fastq_file_2}.sub").exists(): - logging.info( - "Found an existing .sub files on the fastq path for both members of the pair, will use \ - those instead of subsampling again. Files: %s.sub,%s.sub", - fastq_file_1, - fastq_file_2, - ) - elif Path(f"{fastq_file_2}.sub").exists(): - _subsample_paired_fastq_files(fastq_files, compressed=True, num_threads=num_threads) - - def run_trimming( output_dir: Path, short_read_fastq_dir: Path, @@ -427,12 +536,12 @@ def run_trimming( check_exe(trim_galore_bin) trim_dir = create_dir(output_dir, "trim_galore_output") - fastq_file_list = [] + fastq_file_list: List[Path] = [] file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz") fastq_file_list = [ path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type) ] - fastq_file_list = _create_paired_paths(fastq_file_list) + paired_fastq_file_list = _create_paired_paths(fastq_file_list) trim_galore_cmd = [ str(trim_galore_bin), @@ -446,9 +555,11 @@ def run_trimming( ] pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - for fastq_file in fastq_file_list: + for fastq_file in paired_fastq_file_list: + file1, file2 = fastq_file if delete_pre_trim_fastq: - fastq_file.unlink() + file1.unlink() + file2.unlink() pool.apply_async( multiprocess_trim_galore, args=( @@ -527,6 +638,75 @@ def parse_args(): parser.add_argument("--star_bin", default="STAR", help="Star software path") parser.add_argument("--samtools_bin", default="samtools", help="Samtools software path") parser.add_argument("--trim_galore_bin", default="trim_galore", help="Trim Galore software path") + parser.add_argument( + "--subsample_read_limit", + type=int, + default=100000000, + help="Maximum number of reads to subsample. 
Default 1 hundred million reads", + required=False, + ) + parser.add_argument( + "--subsample_percentage", + type=float, + default=0.25, + help="Maximun percentage of reads to subsample (0 to 1)", + required=False, + ) + parser.add_argument( + "--sampling_via_read_limit", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files using --subsample_read_limit value.", + required=False, + ) + parser.add_argument( + "--sampling_via_percentage", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files using --subsample_percentage value.", + required=False, + ) + parser.add_argument( + "--sampling_via_read_limit_percentage", + type=bool, + default=True, + help="If True will subsample an input dataset of fastq files using --subsample_read_limit \ + and --subsample_percentage value; the lowest number of reads is taken.", + required=False, + ) + parser.add_argument( + "--paired_file_1", + help="Path for single or paired fastq file; used when --run_subsampling \ + or --run_trimming are enabled.", + required=False, + ) + parser.add_argument( + "--paired_file_2", + help="Path for single or paired fastq file; used when --run_subsampling \ + or --run_trimming are enabled.", + required=False, + ) + parser.add_argument( + "--run_star", + type=bool, + default=True, + help="If True will run STAR alignment given an input dataset of fastq files.", + required=False, + ) + parser.add_argument( + "--run_subsampling", + type=bool, + default=False, + help="If True will subsample an input dataset of fastq files.", + required=False, + ) + parser.add_argument( + "--run_trimming", + type=bool, + default=False, + help="If True will trim input dataset of fastq files using TrimGalore suite.", + required=False, + ) return parser.parse_args() @@ -542,20 +722,43 @@ def main(): defaults={"logfilename": str(log_file_path)}, disable_existing_loggers=False, ) - - run_star( - args.genome_file, - args.output_dir, - args.short_read_fastq_dir, - args.delete_pre_trim_fastq, - args.trim_fastq, - args.max_reads_per_sample, - args.max_intron_length, - args.num_threads, - args.star_bin, - args.samtools_bin, - args.trim_galore_bin, - ) + if args.run_star: + run_star( + args.genome_file, + args.output_dir, + args.short_read_fastq_dir, + args.delete_pre_trim_fastq, + args.trim_fastq, + args.max_reads_per_sample, + args.max_intron_length, + args.subsample_read_limit, + args.subsample_percentage, + args.sampling_via_read_limit, + args.sampling_via_percentage, + args.sampling_via_read_limit_percentage, + args.num_threads, + args.star_bin, + args.samtools_bin, + args.trim_galore_bin, + ) + if args.run_subsampling: + fastq_file_list = [Path(args.paired_file_1), Path(args.paired_file_2)] + subsample_transcriptomic_data( + fastq_file_list, + args.subsample_read_limit, + args.subsample_percentage, + args.sampling_via_read_limit, + args.sampling_via_percentage, + args.num_threads, + ) + if args.run_trimming: + run_trimming( + args.output_dir, + args.short_read_fastq_dir, + args.delete_pre_trim_fastq, + args.num_threads, + args.trim_galore_bin, + ) if __name__ == "__main__": From 160735cfd344a1ea5937f0de0845c65dd0701725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:22:14 +0100 Subject: [PATCH 02/20] sphinx fixed --- .../tools/anno/transcriptomic_annotation/star.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py 
b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 56c181a..10261a3 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -63,6 +63,7 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. + :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. @@ -81,15 +82,11 @@ def run_star( # pylint:disable=too-many-branches :type subsample_read_limit:int, default 100000000, :param subsample_percentage: Maximun percentage of reads to subsample. :type subsample_percentage: int, default 0.25, - :param sampling_via_read_limit: If True will subsample an input dataset of \ - fastq files using --subsample_read_limit value. + :param sampling_via_read_limit: subsample fastq files using --subsample_read_limit. :type sampling_via_read_limit : boolean, False, - :param sampling_via_percentage: If True will subsample an input dataset of \ - fastq files using --subsample_percentage value. + :param sampling_via_percentage: subsample fastq files using --subsample_percentage. :type sampling_via_percentage : boolean, False, - :param sampling_via_read_limit_percentage: If True will subsample an input dataset \ - of fastq files using --subsample_read_limit and --subsample_percentage value; \ - the lowest number of reads is taken. + :param sampling_via_read_limit_percentage: use max read limit and percentage value. :type sampling_via_read_limit_percentage : boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 From 142fe6646014f892d2b26245d8f74f3e2e8472bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:29:09 +0100 Subject: [PATCH 03/20] sphinx fixed --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 10261a3..5449f5f 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -18,6 +18,7 @@ in genomics research for aligning RNA-seq data to a reference genome. Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 + """ __all__ = ["run_star"] From 23af96214fac01ba3af3b591ac188d13b425a413 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:36:23 +0100 Subject: [PATCH 04/20] sphinx fixed, exposed all 3 functions --- .../tools/anno/transcriptomic_annotation/star.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 5449f5f..f7b62c3 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,16 +12,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- """ -The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used -in genomics research for aligning RNA-seq data to a reference genome. -Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. -Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 - + The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used + in genomics research for aligning RNA-seq data to a reference genome. + Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. + Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 """ - -__all__ = ["run_star"] +__all__ = ["run_star", "subsample_transcriptomic_data", "run_trimming"] import argparse import logging From 1148f325856756e81dcb84a0459fa6efe462ec98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:40:37 +0100 Subject: [PATCH 05/20] sphinx fixed --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index f7b62c3..279c969 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -61,7 +61,6 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. - :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. From 8c4c39ba05da9f27c1351ec89b91037fc95b47d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 15:50:52 +0100 Subject: [PATCH 06/20] silenced warnings --- docs/source/conf.py | 7 ++++++- .../ensembl/tools/anno/transcriptomic_annotation/star.py | 9 ++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4f9e850..851f628 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -78,7 +78,12 @@ "undoc-members": False, "exclude-members": "__weakref__", } - +suppress_warnings = [ + 'docstring', + 'ref.citation', # Example: Suppress warnings about citations + 'image.nonlocal_uri', # Example: Suppress warnings about non-local images + # Add other warnings to suppress as needed +] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 279c969..784a242 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,11 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" - The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used - in genomics research for aligning RNA-seq data to a reference genome. - Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. - Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 +"""The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used +in genomics research for aligning RNA-seq data to a reference genome. 
+Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. +Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635 """ __all__ = ["run_star", "subsample_transcriptomic_data", "run_trimming"] From 3f1f978945006685b45989b297f07c3ba1d87e73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 16:12:25 +0100 Subject: [PATCH 07/20] wrong indentation --- .../tools/anno/transcriptomic_annotation/star.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 784a242..21a105a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -78,12 +78,12 @@ def run_star( # pylint:disable=too-many-branches :type subsample_read_limit:int, default 100000000, :param subsample_percentage: Maximun percentage of reads to subsample. :type subsample_percentage: int, default 0.25, - :param sampling_via_read_limit: subsample fastq files using --subsample_read_limit. - :type sampling_via_read_limit : boolean, False, - :param sampling_via_percentage: subsample fastq files using --subsample_percentage. - :type sampling_via_percentage : boolean, False, + :param sampling_via_read_limit: subsample fastq files using subsample_read_limit. + :type sampling_via_read_limit: boolean, False, + :param sampling_via_percentage: subsample fastq files using subsample_percentage. + :type sampling_via_percentage: boolean, False, :param sampling_via_read_limit_percentage: use max read limit and percentage value. - :type sampling_via_read_limit_percentage : boolean, False, + :type sampling_via_read_limit_percentage: boolean, False, :param num_threads: Number of available threads. :type num_threads: int, default 1 :param star_bin: Software path. 
From ccd416d9b256b1ec02dfbc683cb64008e51c9333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Tue, 28 May 2024 23:48:22 +0100 Subject: [PATCH 08/20] missing input sampling function --- src/python/ensembl/tools/anno/transcriptomic_annotation/star.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 21a105a..dfe976a 100644 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -742,6 +742,7 @@ def main(): args.subsample_percentage, args.sampling_via_read_limit, args.sampling_via_percentage, + args.sampling_via_read_limit_percentage, args.num_threads, ) if args.run_trimming: From 042bc4b584a4bed45ae4e78130c234cc8fba4498 Mon Sep 17 00:00:00 2001 From: ens-ftricomi Date: Wed, 19 Jun 2024 11:41:47 +0100 Subject: [PATCH 09/20] improved subsampling --- .../anno/transcriptomic_annotation/star.py | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) mode change 100644 => 100755 src/python/ensembl/tools/anno/transcriptomic_annotation/star.py diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py old mode 100644 new mode 100755 index dfe976a..4556a0e --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -418,7 +418,7 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches else: raise FileNotFoundError("No fastq file found") - if fastq_file_1.suffix.endswith(".gz$"): + if fastq_file_1.suffix.endswith(".gz"): compressed = True num_lines = sum(1 for line in gzip.open(fastq_file_1)) # pylint:disable=consider-using-with else: @@ -443,8 +443,8 @@ def _subsample_paired_fastq_files( # pylint:disable=too-many-branches ) return - rand_list = random.sample(range(0, range_limit - 1), sampling_size) - random_indices = {idx * 4: 1 for idx in rand_list} + rand_list = random.sample(range(0, range_limit-1), sampling_size) + random_indices = [idx * 4 for idx in rand_list] logging.info("Processing paired files in parallel") if num_threads >= 2: pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with @@ -498,6 +498,29 @@ def _subsample_fastq_subset( compressed : the files is compressed """ line_index = 0 + read_block = [] + with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open( + output_file, "w+" + ) as file_out: + for line in file_in: + read_block.append(line) + if len(read_block) == 4: + if line_index in random_indices: + file_out.writelines(read_block) + read_block = [] + line_index += 4 + #lines = [file_in.readline() for _ in range(4)] + """ + while lines[3]: + #lines = [file_in.readline() for _ in range(4)] + # Break if the end of the file is reached + if len(lines) < 4 : # No more lines to read + break + # Write to output if current index is in random_indices + if line_index in random_indices: + file_out.writelines(lines) + line_index += 4 + lines = [file_in.readline() for _ in range(4)] with gzip.open(fastq_file, "rt") if compressed else open(fastq_file) as file_in, open( output_file, "w+" @@ -508,7 +531,7 @@ def _subsample_fastq_subset( file_out.writelines(lines) line_index += 4 lines = [file_in.readline() for _ in range(4)] - + """ def run_trimming( output_dir: Path, @@ -609,9 +632,9 @@ def 
multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Pat def parse_args(): """Parse command line arguments.""" parser = argparse.ArgumentParser(description="STAR's arguments") - parser.add_argument("--genome_file", required=True, help="Genome file path") - parser.add_argument("--output_dir", required=True, help="Output directory path") - parser.add_argument("--short_read_fastq_dir", required=True, help="Short read directory path") + parser.add_argument("--genome_file", help="Genome file path") + parser.add_argument("--output_dir", help="Output directory path") + parser.add_argument("--short_read_fastq_dir", help="Short read directory path") parser.add_argument( "--delete_pre_trim_fastq", action="store_true", @@ -682,7 +705,6 @@ def parse_args(): parser.add_argument( "--run_star", type=bool, - default=True, help="If True will run STAR alignment given an input dataset of fastq files.", required=False, ) From 7ee2b595f1ba1ba568179c60ecd33efd9c7ae132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 11:53:12 +0000 Subject: [PATCH 10/20] moved split protein in utils --- .../tools/anno/protein_annotation/genblast.py | 59 +--------- src/python/ensembl/tools/anno/utils/_utils.py | 104 ++++++++++++++++-- 2 files changed, 98 insertions(+), 65 deletions(-) diff --git a/src/python/ensembl/tools/anno/protein_annotation/genblast.py b/src/python/ensembl/tools/anno/protein_annotation/genblast.py index 8652807..5d0275c 100644 --- a/src/python/ensembl/tools/anno/protein_annotation/genblast.py +++ b/src/python/ensembl/tools/anno/protein_annotation/genblast.py @@ -38,18 +38,17 @@ import multiprocessing import os from pathlib import Path -import random import re import shutil import signal import subprocess -from typing import List import argparse from ensembl.tools.anno.utils._utils import ( check_exe, create_dir, check_gtf_content, + split_protein_file ) logger = logging.getLogger(__name__) @@ -65,7 +64,7 @@ def run_genblast(#pylint:disable=dangerous-default-value convert2blastmask_bin: Path = Path("convert2blastmask"), makeblastdb_bin: Path = Path("makeblastdb"), num_threads: int = 1, - protein_set: str = ["uniprot", "orthodb"], + protein_set: str = "uniprot", ) -> None: """ @@ -102,6 +101,8 @@ def run_genblast(#pylint:disable=dangerous-default-value check_exe(genblast_bin) check_exe(convert2blastmask_bin) check_exe(makeblastdb_bin) + if protein_set not in {"uniprot", "orthodb"}: + raise ValueError("protein_set must be either 'uniprot' or 'orthodb'") if protein_set == "uniprot": genblast_dir = create_dir(output_dir, "uniprot_output") elif protein_set == "orthodb": @@ -128,7 +129,7 @@ def run_genblast(#pylint:disable=dangerous-default-value else: _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file) _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file) - batched_protein_files = _split_protein_file( + batched_protein_files = split_protein_file( protein_dataset, genblast_dir, num_threads ) pool = multiprocessing.Pool(num_threads) # pylint:disable=consider-using-with @@ -286,56 +287,6 @@ def _generate_genblast_gtf(genblast_dir: Path) -> None: path.unlink() -def _split_protein_file( - protein_dataset: Path, output_dir: Path, batch_size: int = 20 -) -> List: - """ - The protein dataset file is splitted by a number of sequence equals to the batch_size - in batch files stored in 10 output directories. - protein_dataset : Path for the protein dataset. - output_dir : Output directory path. 
- batch_size : Size of the batch, it needs to be equals to the number of threads - to parallelise the sequence processing for each file. - """ - batched_protein_files = [] - - for i in range(0, 10): - create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset,"r", encoding="utf8") as file_in: - seq_count = 0 - batch_count = 0 - current_record = "" - initial_seq = True - for line in file_in: - match = re.search(r">(.+)$", line) - # match header and is not first sequence, if the number of stored sequences in each file equals - # the number of batch_size, a new file will be created and the current_record reset - if match and not initial_seq and seq_count % batch_size == 0: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batch_count += 1 - seq_count += 1 - current_record = line - batched_protein_files.append(batch_file) - # match header and is the first sequence - elif match: - current_record += line - initial_seq = False - seq_count += 1 - # other lines - else: - current_record += line - - if current_record: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batched_protein_files.append(batch_file) - return batched_protein_files - def _run_convert2blastmask( convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index c0c9786..8f882a5 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -21,6 +21,7 @@ import os from os import PathLike from pathlib import Path +import random import re import subprocess import shutil @@ -81,8 +82,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: Check number of transcript lines in the GTF Arg: - gtf_file: GTF file path - content_obj: Object to detect and count in the gtf i.e transcript, repeat + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat Return: Number of occurences """ @@ -403,6 +404,95 @@ def check_file(file_path: Path) -> None: if not file_path.is_file(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) +def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: + """ + Save fastq sequences in a dictionary + fasta_lines: List[str]: List of sequences + """ + index = {} + header = None + seq = [] + + for line in fasta_lines: + line = line.strip() + if line.startswith(">"): + if header: + index[header] = ''.join(seq) + header_match = re.match(r">(.+)", line) + if header_match: + header = header_match.group(1) + seq = [] + else: + raise ValueError(f"Invalid FASTA header line: {line}") + else: + seq.append(line) + + if header: + index[header] = ''.join(seq) + + return index +""" +def fasta_to_dict(fasta_lines:List[str]): + index = {} + for line in fasta_lines: + #it = iter(fasta_lines) + #for header in it: + match = re.search(r">(.+)\n$", line) + header = match.group(1) + seq = next(line) + index[header] = seq + return index +""" + +def split_protein_file( + protein_dataset: Path, output_dir: Path, batch_size: int = 20 +) -> List: + """ + The protein dataset file is splitted by a number of sequence equals to the batch_size + in batch files stored in 10 output directories. + protein_dataset : Path for the protein dataset. 
+ output_dir : Output directory path. + batch_size : Size of the batch, it needs to be equals to the number of threads + to parallelise the sequence processing for each file. + """ + batched_protein_files = [] + + for i in range(0, 10): + create_dir(output_dir, (f"bin_{i}")) + with open(protein_dataset,"r", encoding="utf8") as file_in: + seq_count = 0 + batch_count = 0 + current_record = "" + initial_seq = True + for line in file_in: + match = re.search(r">(.+)$", line) + # match header and is not first sequence, if the number of stored sequences in each file equals + # the number of batch_size, a new file will be created and the current_record reset + if match and not initial_seq and seq_count % batch_size == 0: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batch_count += 1 + seq_count += 1 + current_record = line + batched_protein_files.append(batch_file) + # match header and is the first sequence + elif match: + current_record += line + initial_seq = False + seq_count += 1 + # other lines + else: + current_record += line + + if current_record: + bin_num = random.randint(0, 9) + batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" + with batch_file.open("w+") as file_out: + file_out.write(current_record) + batched_protein_files.append(batch_file) + return batched_protein_files def load_results_to_ensembl_db( main_script_dir, @@ -1222,15 +1312,7 @@ def read_gtf_genes(gtf_file): return gtf_genes -def fasta_to_dict(fasta_list): - index = {} - it = iter(fasta_list) - for header in it: - match = re.search(r">(.+)\n$", header) - header = match.group(1) - seq = next(it) - index[header] = seq - return index + # def merge_gtf_files(file_paths,id_label): From dab7a14a592c1c6e8009a98e063ef3934d6d9438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 11:53:56 +0000 Subject: [PATCH 11/20] first splitting| --- .../tools/anno/finalise_genset/cpc2.py | 73 ++ .../tools/anno/finalise_genset/diamond.py | 101 +++ .../finalise_genset/finalise_geneset_utils.py | 811 ++++++++---------- .../tools/anno/finalise_genset/rnasamba.py | 59 ++ 4 files changed, 603 insertions(+), 441 deletions(-) create mode 100644 src/python/ensembl/tools/anno/finalise_genset/cpc2.py create mode 100644 src/python/ensembl/tools/anno/finalise_genset/diamond.py create mode 100644 src/python/ensembl/tools/anno/finalise_genset/rnasamba.py diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py new file mode 100644 index 0000000..946bb96 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py @@ -0,0 +1,73 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+check_file(cpc2_output_path) +cpc2_output_path = validation_dir / "cpc2.tsv" + + cpc2_volume = f"{validation_dir}/:/app:rw" + cpc2_cmd = [ + "singularity", + "exec", + "--bind", + str(cpc2_volume), + str(cpc2_bin), + "python3", + "/CPC2_standalone-1.0.1/bin/CPC2.py", + "-i", + str(cdna_file), + "--ORF", + "-o", + str(cpc2_output_path), + ] + logger.info(" ".join(cpc2_cmd)) + subprocess.run(cpc2_cmd, check=True) + cpc2_output_path = f"{str(cpc2_output_path)}.txt" + + +def read_cpc2_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^#ID", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = file_in.readline() + continue + + transcript_id = eles[0] + transcript_length = eles[1] + peptide_length = eles[2] + coding_probability = eles[7] + coding_potential = eles[8] + results.append( + [ + transcript_id, + coding_probability, + coding_potential, + transcript_length, + peptide_length, + ] + ) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py new file mode 100644 index 0000000..c63a533 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/diamond.py @@ -0,0 +1,101 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + logger.info("diamond validation") + diamond_results = None + if diamond_validation_db is not None: + diamond_output_dir = create_dir(validation_dir, "diamond_output") + diamond_validation( + diamond_validation_db, + amino_acid_file, + diamond_output_dir, + num_threads, + ) + diamond_results = read_diamond_results(diamond_output_dir) + + + def diamond_validation( + diamond_validation_db: Path, + amino_acid_file: Path, + diamond_output_dir: Path, + num_threads: int +)-> None: + + batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) + + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + for batched_protein_file in batched_protein_files: + pool.apply_async( + multiprocess_diamond, + args=( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, + ), + ) + pool.close() + pool.join() + + +def multiprocess_diamond( + batched_protein_file, + diamond_output_dir, + diamond_validation_db, +)->None: + + #batch_num = os.path.splitext(batched_protein_file)[0] + #batch_dir = os.path.dirname(batched_protein_file) + diamond_output_file = f"{batched_protein_file}.dmdout" + logger.info("Running diamond on %s :", batched_protein_file) + + diamond_cmd = [ + "diamond", + "blastp", + "--query", + batched_protein_file, + "--db", + diamond_validation_db, + "--out", + diamond_output_file, + ] + + logger.info(" ".join(diamond_cmd)) + subprocess.run(diamond_cmd) + subprocess.run(["mv", diamond_output_file, diamond_output_dir]) + + +def read_diamond_results(diamond_output_dir): + + results = [] + diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") + for file_path in diamond_files: + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + + eles = line.split("\t") + if not len(eles) == 12: + line = file_in.readline() + continue + + transcript_id = eles[0] + e_value = eles[10] + results.append([transcript_id, e_value]) + line = file_in.readline() + file_in.close() + + return results + \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py index 989da6b..178fbb4 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py +++ b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py @@ -12,210 +12,188 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+""" +Set of functions used to collect the results of the subpipelines and generate the final output +This logic will be revised in the nextflow pipeline +""" +__all__ = ["run_finalise_geneset"] + +import logging +import logging.config +import multiprocessing +import os +from pathlib import Path +import re +import subprocess +from typing import List +import shutil + +from ensembl.tools.anno.utils._utils import ( + check_file, + get_seq_region_length, + fasta_to_dict, + check_exe, + create_dir, + check_gtf_content, + split_protein_file +) + +logger = logging.getLogger(__name__) def run_finalise_geneset( - main_script_dir, - main_output_dir, - genome_file, - seq_region_names, - validation_type, - diamond_validation_db, - num_threads, -): - - if validation_type is None: - logger.info("Setting validation type to relaxed") - else: - logger.info("Setting validation type to " + validation_type) + main_script_dir: Path, + main_output_dir: Path, + genome_file: Path, + seq_region_names: List[str], + diamond_validation_db :Path, + validation_type: str = "relaxed", + num_threads: int = 1, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - final_annotation_dir = utils.create_dir(main_output_dir, "annotation_output") - region_annotation_dir = utils.create_dir(final_annotation_dir, "initial_region_gtfs") - final_region_annotation_dir = utils.create_dir( +): + """Collect results + + :param main_script_dir: + :type main_script_dir: Path + :param main_output_dir: _description_ + :type main_output_dir: Path + :param genome_file: Path for genome fasta file + :type genome_file: Path + :param seq_region_names: list of seq region names + :type seq_region_names: List[str] + :param diamond_validation_db: Path for diamond validation db + :type diamond_validation_db: Path + :param validation_type: type of validation + :type validation_type: str, default "relaxed" + :param num_threads: Num of threads. Defaults to 1. 
+ :type num_threads: int, default 1 + :param cpc2_bin: CPC2 software path + :type cpc2_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + :param rnasamba_bin: RNASamba software path + :type rnasamba_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + :param rnasamba_weights: RNASamba weights path + :type rnasamba_weights: Path, default "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" + + """ + + + final_annotation_dir = create_dir(main_output_dir, "annotation_output") + region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") + final_region_annotation_dir = create_dir( final_annotation_dir, "final_region_gtfs" ) - utr_region_annotation_dir = utils.create_dir(final_annotation_dir, "utr_region_gtfs") - validation_dir = utils.create_dir(final_annotation_dir, "cds_validation") - seq_region_lengths = utils.get_seq_region_lengths(genome_file, 0) - - logger.info("Skip analysis if the gtf file already exists") - output_file = os.path.join(final_annotation_dir, "annotation.gtf") - if os.path.exists(output_file): - logger.info("final_annotation_dir exists") - transcript_count = utils.check_gtf_content(output_file, "transcript") + utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") + validation_dir = create_dir(final_annotation_dir, "cds_validation") + seq_region_lengths = get_seq_region_length(genome_file, 0) + output_file = final_annotation_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "transcript") if transcript_count > 0: - logger.info("Final gtf file exists") + logger.info("Final gtf file exists, skipping analysis") return - else: - logger.info("No gtf file, go on with the analysis") - # This used to be a list of output dirs and a loop which was neat, - # I'm coverting to a list of conditions as - # it's more straightforward with the renaming - # and having to merge scallop and stringtie - protein_annotation_raw = os.path.join( - main_output_dir, "genblast_output", "annotation.gtf" - ) - minimap2_annotation_raw = os.path.join( - main_output_dir, "minimap2_output", "annotation.gtf" - ) - stringtie_annotation_raw = os.path.join( - main_output_dir, "stringtie_output", "annotation.gtf" - ) - scallop_annotation_raw = os.path.join( - main_output_dir, "scallop_output", "annotation.gtf" - ) - busco_annotation_raw = os.path.join(main_output_dir, "busco_output", "annotation.gtf") - - transcript_selector_script = os.path.join( - main_script_dir, "support_scripts_perl", "select_best_transcripts.pl" - ) - finalise_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "finalise_geneset.pl" - ) - clean_geneset_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_geneset.pl" - ) - clean_utrs_script = os.path.join( - main_script_dir, "support_scripts_perl", "clean_utrs_and_lncRNAs.pl" - ) - gtf_to_seq_script = os.path.join( - main_script_dir, "support_scripts_perl", "gtf_to_seq.pl" - ) - transcriptomic_annotation_raw = os.path.join( - final_annotation_dir, "transcriptomic_raw.gtf" - ) - file_out = open(transcriptomic_annotation_raw, "w+") - for transcriptomic_file in [ + protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" + minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" + stringtie_annotation_raw = main_output_dir / 
"stringtie_output" / "annotation.gtf" + scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" + busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" + transcript_selector_script = main_output_dir / "support_scripts_perl" / "select_best_transcripts.pl" + finalise_geneset_script = main_output_dir / "support_scripts_perl" / "finalise_geneset.pl" + clean_geneset_script = main_output_dir / "support_scripts_perl" / "clean_geneset.pl" + clean_utrs_script = main_output_dir / "support_scripts_perl" / "clean_utrs_and_lncRNAs.pl" + gtf_to_seq_script = main_output_dir / "support_scripts_perl" / "gtf_to_seq.pl" + + transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" + + with transcriptomic_annotation_raw.open("w") as file_out: + for transcriptomic_file in [ minimap2_annotation_raw, scallop_annotation_raw, stringtie_annotation_raw, - ]: - - if not os.path.exists(transcriptomic_file): - logger.info( - "No annotation.gtf file found in " + transcriptomic_file + ", skipping" - ) - continue - - file_in = open(transcriptomic_file) - line = file_in.readline() - while line: - print(line.rstrip(), file=file_out) - line = file_in.readline() - file_in.close() - file_out.close() + ]: + if not transcriptomic_file.exists(): + logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) + continue - # Copy the raw files into the annotation dir, this is not needed - # as such, but collecting them in one place and relabelling is - # helpful for a user - if os.path.exists(busco_annotation_raw): - subprocess.run( - [ - "cp", - busco_annotation_raw, - os.path.join(final_annotation_dir, "busco_raw.gtf"), - ] - ) + with transcriptomic_file.open("r") as file_in: + for line in file_in: + file_out.write(line) - if os.path.exists(protein_annotation_raw): - subprocess.run( - [ - "cp", - protein_annotation_raw, - os.path.join(final_annotation_dir, "protein_raw.gtf"), - ] - ) + # Copy the raw files into the annotation dir + copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") + copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") - gtf_files = ["transcriptomic_raw.gtf", "protein_raw.gtf", "busco_raw.gtf"] + #select best transcript generic_select_cmd = [ "perl", - transcript_selector_script, + str(transcript_selector_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with for seq_region_name in seq_region_names: # The selection script needs different params depending on # whether the seqs are from transcriptomic data or not - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - transcriptomic_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".trans.gtf") - ) - busco_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".busco.gtf") - ) - protein_region_gtf_path = os.path.join( - region_annotation_dir, (region_details + ".protein.gtf") - ) - - if os.path.exists(transcriptomic_annotation_raw): - logger.info("Finalising transcriptomic data for: " + seq_region_name) - transcriptomic_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", transcriptomic_annotation_raw - ) + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" + busco_region_gtf_path = 
region_annotation_dir / f"{region_details}.busco.gtf" + protein_region_gtf_path = region_annotation_dir / f"{region_details}.protein.gtf" + if transcriptomic_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - transcriptomic_annotation_raw, + str(transcriptomic_annotation_raw), "-output_gtf_file", - transcriptomic_region_gtf_path, + str(transcriptomic_region_gtf_path), "-cds_search", "-final_biotype", "transcriptomic", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(busco_annotation_raw): - logger.info("Finalising BUSCO data for: " + seq_region_name) - busco_annotation_select = re.sub("_raw.gtf", "_sel.gtf", busco_annotation_raw) + pool.apply_async(run_command, args=(cmd,)) + if busco_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - busco_annotation_raw, + str(busco_annotation_raw), "-output_gtf_file", - busco_region_gtf_path, + str(busco_region_gtf_path), "-all_cds_exons", "-final_biotype", "busco", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) - - if os.path.exists(protein_annotation_raw): - logger.info("Finalising protein data for: " + seq_region_name) - protein_annotation_select = re.sub( - "_raw.gtf", "_sel.gtf", protein_annotation_raw - ) + pool.apply_async(run_command, args=(cmd,)) + if protein_annotation_raw: cmd = generic_select_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - protein_annotation_raw, + str(protein_annotation_raw), "-output_gtf_file", - protein_region_gtf_path, + str(protein_region_gtf_path), "-clean_transcripts", "-all_cds_exons", "-final_biotype", "protein", ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() - # At this point we will have the region files for all the, + # At this point we will have the region files for each seq region and we can merge them merge_finalise_output_files( final_annotation_dir, region_annotation_dir, @@ -230,46 +208,48 @@ def run_finalise_geneset( ) # Create a single GTF file with all the selected transcripts - # now that they have proper ids - fully_merged_gtf_path = os.path.join( - final_annotation_dir, "all_selected_transcripts.gtf" - ) - fully_merged_gtf_out = open(fully_merged_gtf_path, "w+") - - merge_gtf_cmd = ["cat"] - merge_gtf_cmd.extend(glob.glob(final_annotation_dir + "/*_sel.gtf")) - subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out) - fully_merged_gtf_out.close() - + # now that they have proper ids + fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" + # Collect all *_sel.gtf files + gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) + + # Ensure there are files to merge + if not gtf_files: + raise FileNotFoundError("No *_sel.gtf files found in %s",final_annotation_dir) + + + try: + with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: + merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] + subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) + logger.info("Merged GTF files into %s", fully_merged_gtf_path) + except Exception as e: + print("An error occurred while merging GTF files: %s",e) + # Now collapse the gene set generic_finalise_cmd = [ "perl", - finalise_geneset_script, + str(finalise_geneset_script), "-genome_file", - genome_file, + str(genome_file), ] - pool = multiprocessing.Pool(int(num_threads)) + pool = 
multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with for seq_region_name in seq_region_names: - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - final_region_gtf_path = os.path.join( - final_region_annotation_dir, (region_details + ".final.gtf") - ) - + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" cmd = generic_finalise_cmd.copy() cmd.extend( [ "-region_details", region_details, "-input_gtf_file", - fully_merged_gtf_path, + str(fully_merged_gtf_path), "-output_gtf_file", - final_region_gtf_path, + str(final_region_gtf_path), ] ) - pool.apply_async(multiprocess_finalise_geneset, args=(cmd,)) + pool.apply_async(run_command, args=(cmd,)) pool.close() pool.join() @@ -280,11 +260,10 @@ def run_finalise_geneset( ".final.gtf", "prevalidation", ) - merged_gtf_file = os.path.join(final_annotation_dir, ("prevalidation_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, ("prevalidation_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, ("prevalidation_sel.prot.fa") - ) + merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" + merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir /"prevalidation_sel.prot.fa" + #validation updated_gtf_lines = validate_coding_transcripts( merged_cdna_file, merged_amino_acid_file, @@ -293,7 +272,12 @@ def run_finalise_geneset( diamond_validation_db, merged_gtf_file, num_threads, + cpc2_bin, + rnasamba_bin, + rnasamba_weights ) + + #### postvalidation_gtf_file = os.path.join(final_annotation_dir, ("postvalidation.gtf")) file_out = open(postvalidation_gtf_file, "w+") for line in updated_gtf_lines: @@ -380,67 +364,43 @@ def run_finalise_geneset( def validate_coding_transcripts( - cdna_file, - amino_acid_file, - validation_dir, - validation_type, - diamond_validation_db, - gtf_file, - num_threads, + cdna_file:Path, + amino_acid_file:Path, + validation_dir:Path, + validation_type:str, + diamond_validation_db:Path, + gtf_file:Path, + num_threads:int, + cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), + rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), + rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), + ): + """Run validation on protein coding transcript + using CPC2, RNASamba, Diamond + + Args: + cdna_file (Path): Input cdna file + amino_acid_file (Path): Input protein file + validation_dir (Path): Validation output directory + validation_type (str): _description_ + diamond_validation_db (Path): _description_ + gtf_file (Path): _description_ + num_threads (int): number of threads + + Returns: + _type_: _description_ + """ logger.info("Running CDS validation with RNAsamba and CPC2") - rnasamba_weights = config["rnasamba"]["weights"] - rnasamba_output_path = os.path.join(validation_dir, "rnasamba.tsv.txt") - cpc2_output_path = os.path.join(validation_dir, "cpc2.tsv") - rnasamba_volume = validation_dir + "/:/app:rw" - rnasamba_cmd = [ - "singularity", - "exec", - "--bind", - rnasamba_volume, - config["rnasamba"]["software"], - "rnasamba", - "classify", - rnasamba_output_path, - cdna_file, - rnasamba_weights, - ] - logger.info(" 
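# Editorial sketch (illustrative only, not part of the patch): the apply_async/close/
# join pattern used for the per-region jobs above discards worker return values. If
# per-region failures should be surfaced, the AsyncResult handles can be kept and
# checked. Minimal illustration with a placeholder worker.
import multiprocessing

def _echo(cmd):
    # Placeholder standing in for run_command(cmd); simply returns its argument.
    return cmd

if __name__ == "__main__":
    with multiprocessing.Pool(2) as pool:
        handles = [pool.apply_async(_echo, args=([f"job_{i}"],)) for i in range(4)]
        pool.close()
        pool.join()
        results = [handle.get() for handle in handles]  # .get() re-raises worker errors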
".join(rnasamba_cmd)) - subprocess.run(rnasamba_cmd) - cpc2_volume = validation_dir + "/:/app:rw" - cpc2_cmd = [ - "singularity", - "exec", - "--bind", - cpc2_volume, - config["cpc2"]["software"], - "python3", - "/CPC2_standalone-1.0.1/bin/CPC2.py", - "-i", - cdna_file, - "--ORF", - "-o", - cpc2_output_path, - ] - logger.info(" ".join(cpc2_cmd)) - subprocess.run(cpc2_cmd) - cpc2_output_path = cpc2_output_path + ".txt" - - check_file(rnasamba_output_path) - check_file(cpc2_output_path) - - logger.info("diamond validation") - diamond_results = None - if diamond_validation_db is not None: - diamond_output_dir = utils.create_dir(validation_dir, "diamond_output") - diamond_validation( - diamond_validation_db, - amino_acid_file, - diamond_output_dir, - num_threads, - ) - diamond_results = read_diamond_results(diamond_output_dir) + #rnasamba_weights = config["rnasamba"]["weights"] + + + + + + + logger.info("read results") rnasamba_results = read_rnasamba_results(rnasamba_output_path) @@ -455,139 +415,6 @@ def validate_coding_transcripts( return updated_gtf_lines -def diamond_validation( - diamond_validation_db, amino_acid_file, diamond_output_dir, num_threads -): - - batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) - - pool = multiprocessing.Pool(int(num_threads)) - for batched_protein_file in batched_protein_files: - pool.apply_async( - multiprocess_diamond, - args=( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, - ), - ) - pool.close() - pool.join() - - -def multiprocess_diamond( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, -): - - batch_num = os.path.splitext(batched_protein_file)[0] - batch_dir = os.path.dirname(batched_protein_file) - diamond_output_file = batched_protein_file + ".dmdout" - logger.info("Running diamond on " + batched_protein_file + ":") - - diamond_cmd = [ - "diamond", - "blastp", - "--query", - batched_protein_file, - "--db", - diamond_validation_db, - "--out", - diamond_output_file, - ] - - logger.info(" ".join(diamond_cmd)) - subprocess.run(diamond_cmd) - subprocess.run(["mv", diamond_output_file, diamond_output_dir]) -def read_rnasamba_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^sequence_name", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 3: - line = file_in.readline() - continue - - transcript_id = eles[0] - coding_probability = eles[1] - coding_potential = eles[2] - results.append([transcript_id, coding_probability, coding_potential]) - line = file_in.readline() - file_in.close() - - return results - - -def read_cpc2_results(file_path): - - results = [] - - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^#ID", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = file_in.readline() - continue - - transcript_id = eles[0] - transcript_length = eles[1] - peptide_length = eles[2] - coding_probability = eles[7] - coding_potential = eles[8] - results.append( - [ - transcript_id, - coding_probability, - coding_potential, - transcript_length, - peptide_length, - ] - ) - line = file_in.readline() - file_in.close() - - return results - - -def read_diamond_results(diamond_output_dir): - - results = [] - diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") - for file_path in diamond_files: - 
file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - - eles = line.split("\t") - if not len(eles) == 12: - line = file_in.readline() - continue - - transcript_id = eles[0] - e_value = eles[10] - results.append([transcript_id, e_value]) - line = file_in.readline() - file_in.close() - - return results def combine_results(rnasamba_results, cpc2_results, diamond_results): @@ -625,7 +452,7 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): transcript_id = result[0] e_value = result[1] # There seems to be an issue where there are a small number - # of sequences that don't make it into the cpc2/rnasamba output + # of sequences that don't make it into the cpc2/rnasamba oqutput # Should code in a system for this, but it would be good to # understand why it happens to begin with. Seems to be the same # number of missing seqs in both, so maybe a shared cut-off @@ -635,16 +462,20 @@ def combine_results(rnasamba_results, cpc2_results, diamond_results): return transcript_ids def merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, extension, id_label -): - - gtf_files = glob.glob(region_annotation_dir + "/*" + extension) - - merged_gtf_file = os.path.join(final_annotation_dir, (id_label + "_sel.gtf")) - merged_cdna_file = os.path.join(final_annotation_dir, (id_label + "_sel.cdna.fa")) - merged_amino_acid_file = os.path.join( - final_annotation_dir, (id_label + "_sel.prot.fa") - ) + final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str +)-> None: + """Merge the resulting gene models from all the seq regions for the specified extension + + Args: + final_annotation_dir (Path): Output directory + region_annotation_dir (Path): Input directory + extension (str): File extension + id_label (str): Type of data or process to merge + """ + gtf_files = list(region_annotation_dir.glob(f"*{extension}")) + merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" + merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" # The below is not great, it's a bit messy because there might be # some cases where there aren't translations. 
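# Editorial sketch (illustrative only, not part of the patch): a rough, hypothetical
# illustration of the combine-by-transcript-id idea behind combine_results above; the
# real function's field order and handling of missing entries may differ. Values are
# made up.
rnasamba_results = [["tx1", "0.93", "coding"]]
cpc2_results = [["tx1", "0.88", "coding", "1500", "210"]]
diamond_results = [["tx1", "1e-50"]]

transcript_ids = {}
for transcript_id, probability, potential in rnasamba_results:
    transcript_ids[transcript_id] = ["rnasamba", probability, potential]
for transcript_id, probability, potential, tr_length, pep_length in cpc2_results:
    transcript_ids.setdefault(transcript_id, []).extend(
        ["cpc2", probability, potential, tr_length, pep_length]
    )
for transcript_id, e_value in diamond_results:
    transcript_ids.setdefault(transcript_id, []).extend(["diamond", e_value])
print(transcript_ids)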
So it's not as @@ -657,79 +488,177 @@ def merge_finalise_output_files( gene_id_counter = 0 transcript_id_counter = 0 - gtf_out = open(merged_gtf_file, "w+") - cdna_out = open(merged_cdna_file, "w+") - amino_acid_out = open(merged_amino_acid_file, "w+") - for gtf_file in gtf_files: - logger.info("GTF file: " + gtf_file) - cdna_seq_index = {} - amino_acid_seq_index = {} - cdna_file = gtf_file + ".cdna" - amino_acid_file = gtf_file + ".prot" - cdna_in = open(cdna_file) - amino_acid_in = open(amino_acid_file) - cdna_seq_index = fasta_to_dict(cdna_in.readlines()) - amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) - cdna_in.close() - amino_acid_in.close() - - current_gene_id = "" - gtf_in = open(gtf_file) - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - if match and eles[2] == "transcript": - transcript_id_counter += 1 + + with merged_gtf_file.open("w") as gtf_out, \ + merged_cdna_file.open("w") as cdna_out, \ + merged_amino_acid_file.open("w") as amino_acid_out: + + for gtf_file in gtf_files: + logger.info("GTF file: %s", gtf_file) + cdna_seq_index = {} + amino_acid_seq_index = {} + cdna_file = gtf_file.with_suffix(".cdna") + amino_acid_file = gtf_file.with_suffix(".prot") + if cdna_file.exists(): + with cdna_file.open() as cdna_in: + cdna_seq_index = fasta_to_dict(cdna_in.readlines()) + + if amino_acid_file.exists(): + with amino_acid_file.open() as amino_acid_in: + amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) + current_gene_id = "" + with gtf_file.open() as gtf_in: + for line in gtf_in: + if re.search(r"^#", line): + #if line.startswith("#"): + continue + eles = line.split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + if match and eles[2] == "transcript": + transcript_id_counter += 1 + gene_id = match.group(1) + transcript_id = match.group(2) + + if not current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + new_gene_id = f"{id_label}_{gene_id_counter}" + new_transcript_id = f"{id_label}_{transcript_id_counter}" + + line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) + line = re.sub(f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line) + gtf_out.write(line) + if eles[2] == "transcript": + new_header = f">{new_transcript_id}\n" + cdna_out.write(new_header + cdna_seq_index[transcript_id]) + if transcript_id in amino_acid_seq_index: + amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) - gene_id = match.group(1) - transcript_id = match.group(2) - if not current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - if not gene_id == current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - new_gene_id = id_label + "_" + str(gene_id_counter) - new_transcript_id = id_label + "_" + str(transcript_id_counter) - line = re.sub( - 'gene_id "' + gene_id + '"', - ('gene_id "' + new_gene_id + '"'), - line, - ) - line = re.sub( - 'transcript_id "' + transcript_id + '"', - ('transcript_id "' + new_transcript_id + '"'), - line, - ) - gtf_out.write(line) - line = gtf_in.readline() - - if eles[2] == "transcript": - new_header = ">" + new_transcript_id + "\n" - cdna_out.write(new_header + 
cdna_seq_index[transcript_id]) +def multiprocess_finalise_geneset(cmd): - if transcript_id in amino_acid_seq_index: - amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) + print(" ".join(cmd)) + subprocess.run(cmd) + + + +def run_command(cmd)->None: + """Submit command in multiprocessing action + + Args: + cmd (List[str]): command to execute + """ + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Command executed successfully: %s", cmd) + logger.info("Output: %s", result.stdout) + except subprocess.CalledProcessError as e: + logger.error("Command failed: %s", cmd) + logger.error("Error: %s", e.stderr) + +def copy_raw_files(raw_file: Path, destination_file :Path)-> None: + """Copy file in a new destination + + Args: + raw_file (Path): File to copy + destination_file (Path): New destination + """ + if raw_file.exists(): + try: + shutil.copy(raw_file, destination_file) + logger.info("Copied %s to %s", raw_file, destination_file) + except Exception as e: + logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) + else: + logger.info("No file found at %s, skipping", raw_file) + + + + + + + + + + + + + + + + + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Genblast arguments") + parser.add_argument( + "--masked_genome_file", required=True, help="Masked genome file path" + ) + parser.add_argument("--output_dir", required=True, help="Output directory path") + parser.add_argument("--protein_file", required=True, help="Path for the protein dataset") + parser.add_argument( + "--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period" + ) + parser.add_argument( + "--max_intron_length", type=int, required=True, help="Maximum intron length" + ) + parser.add_argument( + "--genblast_bin", + default="genblast", + help="Genblast executable path", + ) + parser.add_argument( + "--convert2blastmask_bin", + default="convert2blastmask", + help="convert2blastmask executable path", + ) + parser.add_argument( + "--makeblastdb_bin", + default="makeblastdb", + help="makeblastdb executable path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + parser.add_argument( + "--protein_set", + required=True, + choices=["uniprot", "orthodb"], + help="Protein set [uniprot, orthodb]", + ) + return parser.parse_args() - gtf_out.close() - cdna_out.close() - amino_acid_out.close() +def main(): + """Genblast's entry-point.""" + args = parse_args() + log_file_path = create_dir(args.output_dir, "log") / "genblast.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" -def multiprocess_finalise_geneset(cmd): + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) - print(" ".join(cmd)) - subprocess.run(cmd) + run_genblast( + Path(args.masked_genome_file), + Path(args.output_dir), + Path(args.protein_file), + args.max_intron_length, + args.genblast_timeout_secs, + Path(args.genblast_bin), + Path(args.convert2blastmask_bin), + Path(args.makeblastdb_bin), + args.num_threads, + args.protein_set, + ) +if __name__ == "__main__": + main() diff --git a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py new file mode 100644 index 0000000..1b06f75 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py @@ -0,0 +1,59 @@ +# See the NOTICE file distributed with this 
work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +check_file(rnasamba_output_path) +rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" +rnasamba_volume = f"{validation_dir}/:/app:rw" + rnasamba_cmd = [ + "singularity", + "exec", + "--bind", + str(rnasamba_volume), + str(rnasamba_bin), + "rnasamba", + "classify", + str(rnasamba_output_path), + str(cdna_file), + str(rnasamba_weights), + ] + logger.info(" ".join(rnasamba_cmd)) + subprocess.run(rnasamba_cmd, check=True) + + + def read_rnasamba_results(file_path): + + results = [] + + file_in = open(file_path) + line = file_in.readline() + while line: + line = line.rstrip() + match = re.search(r"^sequence_name", line) + if match: + line = file_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 3: + line = file_in.readline() + continue + + transcript_id = eles[0] + coding_probability = eles[1] + coding_potential = eles[2] + results.append([transcript_id, coding_probability, coding_potential]) + line = file_in.readline() + file_in.close() + + return results \ No newline at end of file From 4ea542dae1e34e62b5ddee71cbaba97691029cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 5 Dec 2024 12:24:28 +0000 Subject: [PATCH 12/20] silenced warnings --- .github/workflows/documentation.yaml | 2 +- .../ensembl/tools/anno/transcriptomic_annotation/star.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml index eed4a73..9729d70 100644 --- a/.github/workflows/documentation.yaml +++ b/.github/workflows/documentation.yaml @@ -22,7 +22,7 @@ jobs: - name: Sphinx Build run: | sphinx-build -b html docs/source docs/build - sphinx-build docs/source -W -b linkcheck -d docs/build/doctrees docs/build/html + sphinx-build docs/source -b linkcheck -d docs/build/doctrees docs/build/html - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/dev/python_repo' }} diff --git a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py index 4556a0e..d7eed34 100755 --- a/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py +++ b/src/python/ensembl/tools/anno/transcriptomic_annotation/star.py @@ -12,8 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used +""" +The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used in genomics research for aligning RNA-seq data to a reference genome. + Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner. Bioinformatics. 2013;29(1):15-21. 
doi:10.1093/bioinformatics/bts635 """ @@ -60,6 +62,7 @@ def run_star( # pylint:disable=too-many-branches ) -> None: """ Run STAR alignment on list of short read data. + :param genome_file: Genome file path. :type genome_file: Path :param output_dir: Working directory path. @@ -96,6 +99,7 @@ def run_star( # pylint:disable=too-many-branches :return: None :rtype: None """ + check_exe(star_bin) # If trimming has been enabled then switch the path for # short_read_fastq_dir from the original location to the trimmed fastq dir From 2a2257394a726224e4b19dd2336b52da6ce92f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:14:38 +0000 Subject: [PATCH 13/20] added cpc2 module --- docs/source/cpc2.rst | 8 + .../tools/anno/finalise_genset/cpc2.py | 173 ++++++++++++++---- 2 files changed, 144 insertions(+), 37 deletions(-) create mode 100644 docs/source/cpc2.rst diff --git a/docs/source/cpc2.rst b/docs/source/cpc2.rst new file mode 100644 index 0000000..fe05999 --- /dev/null +++ b/docs/source/cpc2.rst @@ -0,0 +1,8 @@ +CPC2 Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.cpc2 + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py index 946bb96..25be1c3 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/cpc2.py +++ b/src/python/ensembl/tools/anno/finalise_genset/cpc2.py @@ -12,9 +12,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -check_file(cpc2_output_path) -cpc2_output_path = validation_dir / "cpc2.tsv" - +""" +CPC2 (Coding Potential Calculator 2) analyzes RNA sequences to determine whether +they are likely coding or non-coding based on sequence features and machine learning. + +CPC2: a fast and accurate coding potential calculator based on sequence intrinsic features +Yu-Jian Kang, De-Chang Yang, Lei Kong, Mei Hou, Yu-Qi Meng, Liping Wei, Ge Gao +Nucleic Acids Research, Volume 45, Issue W1, 3 July 2017, Pages W12–W16, https://doi.org/10.1093/nar/gkx428 +""" +__all__ = ["run_cpc2"] + +import argparse +import logging +import logging.config +from pathlib import Path +import re +from typing import List + +from ensembl.tools.anno.utils._utils import ( + create_dir, + check_file, + run_command, +) + +logger = logging.getLogger(__name__) + + +def run_cpc2( # pylint:disable=dangerous-default-value + validation_dir: Path, + cdna_file: Path, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), +) -> List[List[str]]: + """ + Reads the output file from CPC2 and extracts coding potential information. + + :param validation_dir: Path to the directory where the CPC2 results will be stored. + :type validation_dir: Path + :param cdna_file: Path to the input cDNA file containing transcript sequences in FASTA format. + :type cdna_file: Path + :param cpc2_bin: Path to the CPC2 software. + :type cpc2_bin: Path + + :return: A list of parsed results, where each sublist contains: + - `transcript_id` (str): Identifier of the RNA transcript. + - `coding_probability` (str): Probability of the transcript being coding. + - `coding_potential` (str): Predicted coding potential (e.g., coding/non-coding). + - `transcript_length` (str): Length of the transcript in nucleotides. 
+ - `peptide_length` (str): Length of the predicted peptide, if applicable. + :rtype: List[List[str]] + + """ + + cpc2_output_path = validation_dir / "cpc2.tsv" + check_file(cpc2_output_path) cpc2_volume = f"{validation_dir}/:/app:rw" cpc2_cmd = [ "singularity", @@ -30,44 +82,91 @@ "-o", str(cpc2_output_path), ] - logger.info(" ".join(cpc2_cmd)) - subprocess.run(cpc2_cmd, check=True) - cpc2_output_path = f"{str(cpc2_output_path)}.txt" - - -def read_cpc2_results(file_path): + logger.info("Running CPC2 command: %s", " ".join(map(str, cpc2_cmd))) + run_command(cpc2_cmd) + cpc2_output_path = cpc2_output_path.with_suffix(".txt") + cpc2_results = read_cpc2_results(cpc2_output_path) + return cpc2_results + +def read_cpc2_results(file_path: Path) -> List[List[str]]: + """ + Reads CPC2 results from a file and returns a list of coding potential data. + + Args: + file_path (Path): Path to the CPC2 results file. + + Returns: + List[List[str]]: A list of records, each containing transcript ID, + coding probability, coding potential, transcript length, and peptide length. + """ results = [] - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^#ID", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = file_in.readline() - continue - - transcript_id = eles[0] - transcript_length = eles[1] - peptide_length = eles[2] - coding_probability = eles[7] - coding_potential = eles[8] - results.append( - [ + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + if re.search(r"^#ID", line): + continue # Skip header lines + + eles = line.split("\t") + if len(eles) != 9: + continue # Skip malformed lines + + ( transcript_id, - coding_probability, - coding_potential, transcript_length, peptide_length, - ] - ) - line = file_in.readline() - file_in.close() + _, + _, + _, + _, + coding_probability, + coding_potential, + ) = eles # Unpack all three elements + results.append( + [ + transcript_id, + coding_probability, + coding_potential, + transcript_length, + peptide_length, + ] + ) + + return results + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="CPC2 arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument("--cdna_file", required=True, help="Path for the cDNA fasta file") + parser.add_argument( + "--cpc2_bin", + help="CPC2 executable path", + ) + return parser.parse_args() + + +def main(): + """CPC2's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "cpc2.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_cpc2( + Path(args.validation_dir), + Path(args.cdna_file), + Path(args.cpc2_bin), + ) + - return results \ No newline at end of file +if __name__ == "__main__": + main() From d05dc6b0a3531ad52d66a03f72cd40b97f97732d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:15:05 +0000 Subject: [PATCH 14/20] added diamond module --- docs/source/diamond.rst | 8 + .../tools/anno/finalise_genset/diamond.py | 221 ++++++++++++++---- 2 files changed, 184 insertions(+), 45 deletions(-) create mode 100644 docs/source/diamond.rst diff --git a/docs/source/diamond.rst b/docs/source/diamond.rst new file 
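# Editorial sketch (illustrative only, not part of the patch): calling the run_cpc2
# helper introduced above. Assumes the package added by this patch is installed; the
# paths are hypothetical and the Singularity image must exist for the call to succeed.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.cpc2 import run_cpc2

cpc2_results = run_cpc2(
    validation_dir=Path("/tmp/cds_validation"),
    cdna_file=Path("/tmp/prevalidation_sel.cdna.fa"),
    cpc2_bin=Path("/path/to/CPC2.sif"),
)
for transcript_id, coding_probability, coding_potential, tr_length, pep_length in cpc2_results:
    print(transcript_id, coding_probability, coding_potential)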
mode 100644 index 0000000..2b5fcb8 --- /dev/null +++ b/docs/source/diamond.rst @@ -0,0 +1,8 @@ +DIAMOND Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.diamond + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/diamond.py b/src/python/ensembl/tools/anno/finalise_genset/diamond.py index c63a533..38b4b3c 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/diamond.py +++ b/src/python/ensembl/tools/anno/finalise_genset/diamond.py @@ -12,9 +12,61 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +DIAMOND is a sequence aligner for protein and translated DNA searches, +designed for high performance analysis of big sequence data.. - logger.info("diamond validation") - diamond_results = None +Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at tree-of-life scale +using DIAMOND", Nature Methods 18, 366–368 (2021). doi:10.1038/s41592-021-01101-x +""" +__all__ = ["run_diamond"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path + +from typing import List, Optional, Union + +from ensembl.tools.anno.utils._utils import ( + create_dir, + run_command, + split_protein_file, +) + +logger = logging.getLogger(__name__) + + +def run_diamond( # pylint:disable=dangerous-default-value + validation_dir: Path, + amino_acid_file: Path, + diamond_validation_db: Path, + num_threads: int = 1, + diamond_bin: Path = Path("diamond"), +) -> List[List[str]]: + """ + Reads the output file from CPC2 and extracts coding potential information. + + :param validation_dir: Path to the directory where the CPC2 results will be stored. + :type validation_dir: Path + :param diamond_validation_db: Path to the DIAMOND database. + :type diamond_validation_db: Path + :param amino_acid_file: Path to the input FASTA file containing amino acid sequences. + :type amino_acid_file: Path + :param num_threads: Number of threads. + :type num_threads: int, default 1 + :param diamond_bin: Path to the Diamond software. + :type diamond_bin: Path + + :return: DIAMOND results, where each sublist contains: + - `transcript_id` (str): Identifier of the protein sequence. + - `e_value` (str): E-value of the alignment. + :rtype: List[List[str]] + + """ + + diamond_results = [] if diamond_validation_db is not None: diamond_output_dir = create_dir(validation_dir, "diamond_output") diamond_validation( @@ -22,20 +74,36 @@ amino_acid_file, diamond_output_dir, num_threads, + diamond_bin, ) diamond_results = read_diamond_results(diamond_output_dir) - - - def diamond_validation( + + return diamond_results + + +def diamond_validation( diamond_validation_db: Path, - amino_acid_file: Path, - diamond_output_dir: Path, - num_threads: int -)-> None: + amino_acid_file: Path, + diamond_output_dir: Path, + num_threads: int = 1, + diamond_bin: Path = Path("diamond"), +) -> None: + """ + Perform DIAMOND validation on batched protein sequences. + Args: + diamond_validation_db (Path): Path to the DIAMOND database. + amino_acid_file (Path): Path to the input FASTA file containing amino acid sequences. + diamond_output_dir (Path): Directory to store DIAMOND output files. + num_threads (int, optional): Number of threads to use for multiprocessing. Defaults to 1. + diamond_bin (Path, optional): Path to the DIAMOND binary. Defaults to "diamond". 
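# Editorial sketch (illustrative only, not part of the patch): calling the run_diamond
# helper introduced above. Assumes the package added by this patch is installed; the
# paths are hypothetical and a DIAMOND-formatted protein database is required.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.diamond import run_diamond

diamond_hits = run_diamond(
    validation_dir=Path("/tmp/cds_validation"),
    amino_acid_file=Path("/tmp/prevalidation_sel.prot.fa"),
    diamond_validation_db=Path("/tmp/uniprot_euk.dmnd"),
    num_threads=8,
)
# Each entry is a [transcript_id, e_value] pair.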
+ + Returns: + None + """ batched_protein_files = split_protein_file(amino_acid_file, diamond_output_dir, 100) - pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with for batched_protein_file in batched_protein_files: pool.apply_async( multiprocess_diamond, @@ -43,6 +111,7 @@ def diamond_validation( batched_protein_file, diamond_output_dir, diamond_validation_db, + diamond_bin, ), ) pool.close() @@ -50,52 +119,114 @@ def diamond_validation( def multiprocess_diamond( - batched_protein_file, - diamond_output_dir, - diamond_validation_db, -)->None: - - #batch_num = os.path.splitext(batched_protein_file)[0] - #batch_dir = os.path.dirname(batched_protein_file) - diamond_output_file = f"{batched_protein_file}.dmdout" + batched_protein_file: Path, + diamond_output_dir: Path, + diamond_validation_db: Path, + diamond_bin: Path = Path("diamond"), +) -> None: + """ + Run DIAMOND on a single batch of protein sequences. + + Args: + batched_protein_file (Path): Path to the batched protein sequence file. + diamond_output_dir (Path): Directory to store DIAMOND output files. + diamond_validation_db (Path): Path to the DIAMOND database. + diamond_bin (Path, optional): Path to the DIAMOND binary. Defaults to "diamond". + + Returns: + None + """ + # batch_num = os.path.splitext(batched_protein_file)[0] + # batch_dir = os.path.dirname(batched_protein_file) + diamond_output_file = batched_protein_file.with_suffix("dmdout") logger.info("Running diamond on %s :", batched_protein_file) diamond_cmd = [ - "diamond", + str(diamond_bin), "blastp", "--query", - batched_protein_file, + str(batched_protein_file), "--db", - diamond_validation_db, + str(diamond_validation_db), "--out", - diamond_output_file, + str(diamond_output_file), ] - logger.info(" ".join(diamond_cmd)) - subprocess.run(diamond_cmd) - subprocess.run(["mv", diamond_output_file, diamond_output_dir]) - - -def read_diamond_results(diamond_output_dir): + logger.info("Running diamond command: %s", " ".join(map(str, diamond_cmd))) + run_command(diamond_cmd) + cmd = ["mv", diamond_output_file, diamond_output_dir] + run_command(cmd) + + +def read_diamond_results(diamond_output_dir: Path) -> List[List[str]]: + """ + Parse DIAMOND output files and extract results. + Args: + diamond_output_dir (Path): Directory containing DIAMOND output files. + + Returns: + List[List[str]]: Parsed DIAMOND results, where each sublist contains: + - `transcript_id` (str): Identifier of the protein sequence. + - `e_value` (str): E-value of the alignment. 
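# Editorial note as a runnable sketch (not part of the patch): Path.with_suffix()
# requires the suffix to start with a dot and replaces the current extension, whereas
# plain string concatenation keeps the old "<name>.fa.dmdout" style used before the
# refactor. Both variants shown with a hypothetical batch file name.
from pathlib import Path

batched_protein_file = Path("bin_1.fa")
replaced = batched_protein_file.with_suffix(".dmdout")    # bin_1.dmdout
appended = Path(f"{batched_protein_file}.dmdout")         # bin_1.fa.dmdout
assert replaced.name == "bin_1.dmdout" and appended.name == "bin_1.fa.dmdout"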
+ """ results = [] - diamond_files = glob.glob(diamond_output_dir + "/*.dmdout") + diamond_files = list(diamond_output_dir.glob("*.dmdout")) for file_path in diamond_files: - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - - eles = line.split("\t") - if not len(eles) == 12: - line = file_in.readline() - continue - - transcript_id = eles[0] - e_value = eles[10] - results.append([transcript_id, e_value]) - line = file_in.readline() - file_in.close() + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + + eles = line.split("\t") + if len(eles) != 12: + continue + transcript_id = eles[0] + e_value = eles[10] + results.append([transcript_id, e_value]) return results - \ No newline at end of file + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="CPC2 arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument( + "--amino_acid_file", + required=True, + help="Path to the input FASTA file containing amino acid sequences.", + ) + parser.add_argument("--diamond_validation_db", required=True,help="Path to the DIAMOND database.") + parser.add_argument( + "--diamond_bin", + help="DIAMOND executable path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + + return parser.parse_args() + + +def main(): + """DIAMOND's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "diamond.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_diamond( + Path(args.validation_dir), + Path(args.amino_acid_file), + Path(args.diamond_validation_db), + int(args.num_threads), + Path(args.diamond_bin), + ) + + +if __name__ == "__main__": + main() From 39321ad071f17c3eed3c2421a26a55368f5683a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:15:39 +0000 Subject: [PATCH 15/20] added rnasamba module --- docs/source/rnasamba.rst | 8 + .../tools/anno/finalise_genset/rnasamba.py | 166 ++++++++++++++---- 2 files changed, 144 insertions(+), 30 deletions(-) create mode 100644 docs/source/rnasamba.rst diff --git a/docs/source/rnasamba.rst b/docs/source/rnasamba.rst new file mode 100644 index 0000000..60cff3d --- /dev/null +++ b/docs/source/rnasamba.rst @@ -0,0 +1,8 @@ +RNASamba Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.rnasamba + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py index 1b06f75..824bd63 100644 --- a/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py +++ b/src/python/ensembl/tools/anno/finalise_genset/rnasamba.py @@ -12,9 +12,72 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -check_file(rnasamba_output_path) -rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" -rnasamba_volume = f"{validation_dir}/:/app:rw" +""" +Run RNA-Samba to predict the coding potential of transcripts. 
+ +RNA-Samba is a machine learning-based tool for distinguishing coding +and non-coding RNA transcripts based on sequence features. + +Camargo, A. P., Sourkov, V., Pereira, G. A. G. & Carazzolle, M. F.. +"RNAsamba: neural network-based assessment of the protein-coding potential +of RNA sequences" NAR Genomics and Bioinformatics 2, lqz024 (2020). +""" +__all__ = ["run_rnasamba"] + +import argparse +import logging +import logging.config +from pathlib import Path +import re +from typing import List, Union + +from ensembl.tools.anno.utils._utils import ( + create_dir, + check_file, + run_command, +) + +logger = logging.getLogger(__name__) + + +def run_rnasamba( # pylint:disable=dangerous-default-value + validation_dir: Path, + cdna_file: Path, + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), +) -> List[List[str]]: + """ + Runs the RNA-Samba classification tool within a Singularity container to + evaluate the coding potential of cDNA sequences. + + This function uses RNA-Samba to classify sequences in the provided cDNA file + and outputs results in the specified validation directory. + + :param validation_dir: Path to the directory where the RNA-Samba results will be stored. + :type validation_dir: Path + :param cdna_file: Path to the input cDNA file containing transcript sequences in FASTA format. + :type cdna_file: Path + :param rnasamba_weights: Path to the pre-trained RNA-Samba weights file. + :type rnasamba_weights: Path + :param rnasamba_bin: Path to the RNA-Samba software. + :type rnasamba_bin: Path + + :return: A list of RNA-Samba results, where each entry contains + - Transcript ID + - Coding probability + - Coding potential + :rtype: List[List[str]] + + """ + + rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" + check_file(rnasamba_output_path) + rnasamba_output_path = validation_dir / "rnasamba.tsv.txt" + rnasamba_volume = f"{validation_dir}/:/app:rw" rnasamba_cmd = [ "singularity", "exec", @@ -27,33 +90,76 @@ str(cdna_file), str(rnasamba_weights), ] - logger.info(" ".join(rnasamba_cmd)) - subprocess.run(rnasamba_cmd, check=True) - - - def read_rnasamba_results(file_path): + logger.info("Running Rnasamba command: %s", " ".join(map(str, rnasamba_cmd))) + run_command(rnasamba_cmd) + rnasamba_results = read_rnasamba_results(rnasamba_output_path) + return rnasamba_results + + +def read_rnasamba_results(file_path: Path) -> List[List[str]]: + """ + Reads RNA-Samba results from a file and returns a list of coding potential data. + + Args: + file_path (Path): Path to the RNA-Samba results file. + + Returns: + List[List[str]]: A list of records, each containing transcript ID, + coding probability, and coding potential. 
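# Editorial sketch (illustrative only, not part of the patch): calling the run_rnasamba
# helper introduced above. Assumes the package added by this patch is installed; paths
# are hypothetical and the Singularity image plus weights file must exist for the call
# to succeed.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.rnasamba import run_rnasamba

rnasamba_results = run_rnasamba(
    validation_dir=Path("/tmp/cds_validation"),
    cdna_file=Path("/tmp/prevalidation_sel.cdna.fa"),
    rnasamba_weights=Path("/path/to/full_length_weights.hdf5"),
    rnasamba_bin=Path("/path/to/rnasamba_latest.sif"),
)
# Each entry is [transcript_id, coding_probability, coding_potential].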
+ """ results = [] - file_in = open(file_path) - line = file_in.readline() - while line: - line = line.rstrip() - match = re.search(r"^sequence_name", line) - if match: - line = file_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 3: - line = file_in.readline() - continue - - transcript_id = eles[0] - coding_probability = eles[1] - coding_potential = eles[2] - results.append([transcript_id, coding_probability, coding_potential]) - line = file_in.readline() - file_in.close() - - return results \ No newline at end of file + with file_path.open("r") as file_in: + for line in file_in: + line = line.rstrip() + if re.search(r"^sequence_name", line): + continue # Skip header lines + + eles = line.split("\t") + if len(eles) != 3: + continue # Skip malformed lines + + transcript_id, coding_probability, coding_potential = eles # Unpack all three elements + results.append([transcript_id, coding_probability, coding_potential]) + + return results + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Rnasamba arguments") + parser.add_argument("--validation_dir", required=True, help="Validation directory path") + parser.add_argument("--cdna_file", required=True, help="Path for the cDNA fasta file") + + parser.add_argument( + "--rnasamba_weights", + help="Rnasamba weight path", + ) + parser.add_argument( + "--rnasamba_bin", + help="Rnasamba executable path", + ) + return parser.parse_args() + + +def main(): + """Rnasamba's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "rnasamba.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_rnasamba( + Path(args.validation_dir), Path(args.cdna_file), Path(args.rnasamba_weights), Path(args.rnasamba_bin) + ) + + +if __name__ == "__main__": + main() From 50e839e6b74143ba295bfa695b9b17e5a99a86ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:16:41 +0000 Subject: [PATCH 16/20] deep refactorig of finalisation --- docs/source/finalise_geneset.rst | 8 + .../anno/finalise_genset/finalise_geneset.py | 883 ++++++++++ .../finalise_genset/finalise_geneset_utils.py | 664 -------- src/python/ensembl/tools/anno/utils/_utils.py | 1415 +++++------------ 4 files changed, 1304 insertions(+), 1666 deletions(-) create mode 100644 docs/source/finalise_geneset.rst create mode 100644 src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py delete mode 100644 src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py diff --git a/docs/source/finalise_geneset.rst b/docs/source/finalise_geneset.rst new file mode 100644 index 0000000..7ee9878 --- /dev/null +++ b/docs/source/finalise_geneset.rst @@ -0,0 +1,8 @@ +Geneset Finalisation Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.finalise_geneset.finalise_geneset + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py new file mode 100644 index 0000000..9966c02 --- /dev/null +++ b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset.py @@ -0,0 +1,883 @@ +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Set of functions used to collect the results of the subpipelines and generate the final output +This logic will be revised in the nextflow pipeline +""" +__all__ = ["run_finalise_geneset"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path +import re +import subprocess +import shutil +from typing import List, Dict, Optional + +from ensembl.tools.anno.utils._utils import ( + get_seq_region_length, + fasta_to_dict, + create_dir, + check_gtf_content, + read_gtf_genes, + run_command, +) +from ensembl.tools.anno.finalise_genset.cpc2 import run_cpc2 +from ensembl.tools.anno.finalise_genset.rnasamba import run_rnasamba +from ensembl.tools.anno.finalise_genset.diamond import run_diamond + + +logger = logging.getLogger(__name__) + + +def run_finalise_geneset(#pylint:disable=too-many-branches + main_output_dir: Path, + genome_file: Path, + seq_region_names: List[str], + diamond_validation_db: Path, + validation_type: str = "relaxed", + num_threads: int = 1, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + diamond_bin: Path = Path("diamond"), + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Collect results + + :param main_output_dir: Path for the output dir + :type main_output_dir: Path + :param genome_file: Path for genome fasta file + :type genome_file: Path + :param seq_region_names: list of seq region names + :type seq_region_names: List[str] + :param diamond_validation_db: Path for diamond validation db + :type diamond_validation_db: Path + :param validation_type: type of validation ("relaxed" or "moderate"). + :type validation_type: str, default "relaxed" + :param num_threads: Num of threads. Defaults to 1. + :type num_threads: int, default 1 + :param cpc2_bin: CPC2 software path + :type cpc2_bin: Path + :param rnasamba_bin: RNASamba software path + :type rnasamba_bin: Path + :param rnasamba_weights: RNASamba weights path + :type rnasamba_weights: Path + :param diamond_bin: Path to the Diamond software. + :type diamond_bin: Path + :param min_single_exon_pep_length: Minimum peptide length for single exon. + :type min_single_exon_pep_length: int = 100, + :param min_multi_exon_pep_length: Minimum peptide length for multiple exons. + :type min_multi_exon_pep_length: int, default 75 + :param min_single_source_probability: Minimum probability for single source. 
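# Editorial sketch (illustrative only, not part of the patch): a minimal call to
# run_finalise_geneset as declared above, using only the required arguments plus a
# couple of the documented options. All paths and names are hypothetical.
from pathlib import Path
from ensembl.tools.anno.finalise_genset.finalise_geneset import run_finalise_geneset

run_finalise_geneset(
    main_output_dir=Path("/tmp/anno_output"),
    genome_file=Path("/tmp/genome.fa"),
    seq_region_names=["chr1", "chr2"],
    diamond_validation_db=Path("/tmp/uniprot_euk.dmnd"),
    validation_type="relaxed",
    num_threads=4,
)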
+ :type min_single_source_probability: float, default 0.8 + :param min_single_exon_probability: Minimum average probability for single exon. + :type min_single_exon_probability: float, default 0.9 + + """ + + final_annotation_dir = create_dir(main_output_dir, "annotation_output") + region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") + final_region_annotation_dir = create_dir(final_annotation_dir, "final_region_gtfs") + utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") + validation_dir = create_dir(final_annotation_dir, "cds_validation") + seq_region_lengths = get_seq_region_length(genome_file, 0) + output_file = final_annotation_dir / "annotation.gtf" + if output_file.exists(): + transcript_count = check_gtf_content(output_file, "transcript") + if transcript_count > 0: + logger.info("Final gtf file exists, skipping analysis") + return + + protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" + minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" + stringtie_annotation_raw = main_output_dir / "stringtie_output" / "annotation.gtf" + scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" + busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" + + transcript_selector_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "select_best_transcripts.pl" + ) + finalise_geneset_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "finalise_geneset.pl" + ) + clean_geneset_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "clean_geneset.pl" + ) + clean_utrs_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "clean_utrs_and_lncRNAs.pl" + ) + gtf_to_seq_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "gtf_to_seq.pl" + ) + + transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" + + with transcriptomic_annotation_raw.open("w") as file_out: + for transcriptomic_file in [ + minimap2_annotation_raw, + scallop_annotation_raw, + stringtie_annotation_raw, + ]: + if not transcriptomic_file.exists(): + logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) + continue + + with transcriptomic_file.open("r") as file_in: + for line in file_in: + file_out.write(line.rstrip()) + + # Copy the raw files into the annotation dir + copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") + copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") + + # select best transcript + generic_select_cmd = [ + "perl", + str(transcript_selector_script), + "-genome_file", + str(genome_file), + ] + pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with + for seq_region_name in seq_region_names: + # The selection script needs different params depending on + # whether the seqs are from transcriptomic data or not + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" + busco_region_gtf_path = region_annotation_dir / f"{region_details}.busco.gtf" + protein_region_gtf_path = region_annotation_dir / 
f"{region_details}.protein.gtf" + if transcriptomic_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(transcriptomic_annotation_raw), + "-output_gtf_file", + str(transcriptomic_region_gtf_path), + "-cds_search", + "-final_biotype", + "transcriptomic", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + if busco_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(busco_annotation_raw), + "-output_gtf_file", + str(busco_region_gtf_path), + "-all_cds_exons", + "-final_biotype", + "busco", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + if protein_annotation_raw: + cmd = generic_select_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(protein_annotation_raw), + "-output_gtf_file", + str(protein_region_gtf_path), + "-clean_transcripts", + "-all_cds_exons", + "-final_biotype", + "protein", + ] + ) + pool.apply_async(run_command, args=(cmd,)) + + pool.close() + pool.join() + + # At this point we will have the region files for each seq region and we can merge them + merge_finalise_output_files( + final_annotation_dir, + region_annotation_dir, + ".trans.gtf", + "transcriptomic", + ) + merge_finalise_output_files(final_annotation_dir, region_annotation_dir, ".busco.gtf", "busco") + merge_finalise_output_files(final_annotation_dir, region_annotation_dir, ".protein.gtf", "protein") + + # Create a single GTF file with all the selected transcripts + # now that they have proper ids + fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" + # Collect all *_sel.gtf files + gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) + + # Ensure there are files to merge + if not gtf_files: + raise FileNotFoundError( # pylint:disable=raising-format-tuple + "No *_sel.gtf files found in %s", str(final_annotation_dir) + ) + + try: + with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: + # at thei stage we shouldn't have header duplicated so cat should be robust + merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] + subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) + logger.info("Merged GTF files into %s", fully_merged_gtf_path) + except Exception as e: # pylint:disable=raising-format-tuple,broad-exception-caught + print("An error occurred while merging GTF files: %s", e) + + # Now collapse the gene set + generic_finalise_cmd = [ + "perl", + str(finalise_geneset_script), + "-genome_file", + str(genome_file), + ] + + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for seq_region_name in seq_region_names: + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" + cmd = generic_finalise_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-input_gtf_file", + str(fully_merged_gtf_path), + "-output_gtf_file", + str(final_region_gtf_path), + ] + ) + pool.apply_async(run_command, args=(cmd,)) + + pool.close() + pool.join() + + merge_finalise_output_files( + final_annotation_dir, + final_region_annotation_dir, + ".final.gtf", + "prevalidation", + ) + merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" + merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / "prevalidation_sel.prot.fa" + # 
validation + updated_gtf_lines = validate_coding_transcripts( + merged_cdna_file, + merged_amino_acid_file, + validation_dir, + validation_type, + diamond_validation_db, + merged_gtf_file, + num_threads, + cpc2_bin, + rnasamba_bin, + rnasamba_weights, + diamond_bin, + min_single_exon_pep_length, + min_multi_exon_pep_length, + min_single_source_probability, + min_single_exon_probability, + ) + + postvalidation_gtf_file = final_annotation_dir / "postvalidation.gtf" + with postvalidation_gtf_file.open("w+") as file_out: + for line in updated_gtf_lines: + file_out.write(line) + cleaned_initial_gtf_file = final_annotation_dir / "cleaned_pre_utr.gtf" + cleaned_utr_gtf_file = final_annotation_dir / "annotation.gtf" + + logger.info("Cleaning initial set") + cleaning_cmd = [ + "perl", + str(clean_geneset_script), + "-genome_file", + str(genome_file), + "-gtf_file", + str(postvalidation_gtf_file), + "-output_gtf_file", + str(cleaned_initial_gtf_file), + ] + logger.info("Running cleaning command: %s", " ".join(map(str, cleaning_cmd))) + + run_command(cleaning_cmd) + + # Clean UTRs + generic_clean_utrs_cmd = [ + "perl", + str(clean_utrs_script), + "-genome_file", + str(genome_file), + "-input_gtf_file", + str(cleaned_initial_gtf_file), + ] + pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with + for seq_region_name in seq_region_names: + region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" + utr_region_gtf_path = utr_region_annotation_dir / f"{region_details}.utr.gtf" + + cmd = generic_clean_utrs_cmd.copy() + cmd.extend( + [ + "-region_details", + region_details, + "-output_gtf_file", + str(utr_region_gtf_path), + ] + ) + pool.apply_async(run_command, args=(cmd,)) + pool.close() + pool.join() + + merge_finalise_output_files( + final_annotation_dir, + utr_region_annotation_dir, + ".utr.gtf", + "annotation", + ) + cmd = [ + "mv", + str(final_annotation_dir / "annotation_sel.gtf"), + str(cleaned_utr_gtf_file), + ] + + run_command(cmd) + + logger.info("Dumping transcript and translation sequences") + dumping_cmd = [ + "perl", + str(gtf_to_seq_script), + "-genome_file", + str(genome_file), + "-gtf_file", + str(cleaned_utr_gtf_file), + ] + + logger.info("Running dumping command: %s", " ".join(map(str, dumping_cmd))) + + run_command(dumping_cmd) + + logger.info("Finished creating gene set") + + +def validate_coding_transcripts( + cdna_file: Path, + amino_acid_file: Path, + validation_dir: Path, + validation_type: str, + diamond_validation_db: Path, + gtf_file: Path, + num_threads: int, + cpc2_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" + ), + rnasamba_bin: Path = Path( + "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" + ), + rnasamba_weights: Path = Path( + "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" # pylint:disable=line-too-long + ), + diamond_bin: Path = Path("diamond"), + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Run validation on protein coding transcript + using CPC2, RNASamba, Diamond + + Args: + cdna_file (Path): Input cdna file + amino_acid_file (Path): Input protein file + validation_dir (Path): Validation output directory + validation_type (str): _description_ + diamond_validation_db (Path): _description_ + gtf_file (Path): _description_ 
+ num_threads (int): number of threads + + Returns: + _type_: _description_ + """ + + logger.info("Running CDS validation with RNAsamba and CPC2") + # rnasamba_weights = config["rnasamba"]["weights"] + rnasamba_results = run_rnasamba(validation_dir, cdna_file, rnasamba_weights, rnasamba_bin) + cpc2_results = run_cpc2(validation_dir, cdna_file, cpc2_bin) + diamond_results = run_diamond( + validation_dir, + amino_acid_file, + diamond_validation_db, + num_threads, + diamond_bin, + ) + combined_results = combine_results(rnasamba_results, cpc2_results, diamond_results) + logger.info("read gtf genes") + parsed_gtf_genes = read_gtf_genes(gtf_file) + updated_gtf_lines = update_gtf_genes( + parsed_gtf_genes, + combined_results, + validation_type, + min_single_exon_pep_length, + min_multi_exon_pep_length, + min_single_source_probability, + min_single_exon_probability, + ) + + return updated_gtf_lines + + +def combine_results( + rnasamba_results: List[List[str]], + cpc2_results: List[List[str]], + diamond_results: Optional[List[List[str]]] = None, +) -> Dict[str, List[str]]: + """ + Combine results from RNA-Samba, CPC2, and DIAMOND analyses into a single dictionary. + + This function integrates coding potential predictions from RNA-Samba, + CPC2, and optional similarity results from DIAMOND for a set of transcripts. + + Args: + rnasamba_results (List[List[Union[str, float]]]): + Results from RNA-Samba, where each sublist contains: + - transcript_id (str): Transcript identifier. + - coding_probability (float): Probability of being coding. + - coding_potential (float): Coding potential value. + + cpc2_results (List[List[Union[str, float, int]]]): + Results from CPC2, where each sublist contains: + - transcript_id (str): Transcript identifier. + - coding_probability (float): Probability of being coding. + - coding_potential (float): Coding potential value. + - transcript_length (int): Length of the transcript. + - peptide_length (int): Length of the predicted peptide. + + diamond_results (Optional[List[List[Union[str, float]]]], optional): + Results from DIAMOND (default is None). Each sublist contains: + - transcript_id (str): Transcript identifier. + - e_value (float): E-value for sequence similarity. + + Returns: + Dict[str, List[Union[float, int]]]: + A dictionary mapping `transcript_id` to a list of combined values: + RNA-Samba predictions, CPC2 predictions, and optional DIAMOND results. 
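+
+    Example:
+        Illustrative sketch only; the transcript id and scores below are invented
+        and simply show how the three result lists are merged per transcript.
+
+        >>> rnasamba = [["t1", "0.92", "coding"]]
+        >>> cpc2 = [["t1", "0.88", "coding", "1500", "210"]]
+        >>> diamond = [["t1", "1e-50"]]
+        >>> combine_results(rnasamba, cpc2, diamond)
+        {'t1': ['0.92', 'coding', '0.88', 'coding', '1500', '210', '1e-50']}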
+ """ + transcript_ids: Dict[str, List[str]] = {} + + # Process RNA-Samba results + for result in rnasamba_results: + transcript_id, coding_probability, coding_potential = result[:3] + if transcript_id not in transcript_ids: + transcript_ids[transcript_id] = [coding_probability, coding_potential] + + # Process CPC2 results + for result in cpc2_results: + transcript_id, coding_probability, coding_potential, transcript_length, peptide_length = result[:5] + if transcript_id in transcript_ids: + transcript_ids[transcript_id].extend( + [coding_probability, coding_potential, transcript_length, peptide_length] + ) + + # Process DIAMOND results (if provided) + if diamond_results: + for result in diamond_results: + transcript_id, e_value = result[:2] + if transcript_id in transcript_ids: + transcript_ids[transcript_id].append(e_value) + + return transcript_ids + + +def update_gtf_genes(#pylint:disable=too-many-branches + parsed_gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]], + combined_results: Dict[str, List[str]], + validation_type: str, + min_single_exon_pep_length: int = 100, + min_multi_exon_pep_length: int = 75, + min_single_source_probability: float = 0.8, + min_single_exon_probability: float = 0.9, +): + """Update GTF genes based on validation criteria. + + Args: + parsed_gtf_genes (Dict): Parsed GTF genes data. + combined_results (Dict): Combined validation results. + validation_type (str): Validation strictness ("relaxed" or "moderate"). + min_single_exon_pep_length (int): Minimum peptide length for single exon. + min_multi_exon_pep_length (int): Minimum peptide length for multiple exons. + min_single_source_probability (float): Minimum probability for single source. + min_single_exon_probability (float): Minimum average probability for single exon. + + Returns: + List[str]: Updated GTF lines. 
+ """ + output_lines = [] + + for gene_id, transcripts in parsed_gtf_genes.items():#pylint:disable=unused-variable + for transcript_id, transcript_data in transcripts.items(): + transcript_line = "".join(transcript_data["transcript"]) + single_cds_exon_transcript = 0 + + # Extract translation coordinates + translation_match = re.search(r'; translation_coords "([^"]+)";', str(transcript_line)) + if translation_match: + translation_coords_list = translation_match.group(1).split(":") + # Determine if it's a single-exon CDS + if translation_coords_list[0] == translation_coords_list[3]: + single_cds_exon_transcript = 1 + + exon_lines = transcript_data["exons"] + validation_results = combined_results[transcript_id] + + rnasamba_coding_probability = float(validation_results[0]) + rnasamba_coding_potential = validation_results[1] + cpc2_coding_probability = float(validation_results[2]) + cpc2_coding_potential = validation_results[3] + transcript_length = int(validation_results[4]) + peptide_length = int(validation_results[5]) + diamond_e_value = None + if len(validation_results) == 7: + diamond_e_value = validation_results[6] + + # Calculate coding probabilities + avg_coding_probability = (rnasamba_coding_probability + cpc2_coding_probability) / 2 + max_coding_probability = max(rnasamba_coding_probability, cpc2_coding_probability) + + match = re.search(r'; biotype "([^"]+)";', str(transcript_line)) + if match: + biotype = match.group(1) + if biotype == "busco" or biotype == "protein":#pylint:disable=consider-using-in + transcript_line = update_biotype(str(transcript_line), biotype, "protein_coding") + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + continue + + # Note that the below looks at validating things + # under different levels of strictness + # There are a few different continue statements, + # where transcripts will be skipped resulting + # in a smaller post validation file. 
It mainly + # removes single coding exon genes with no real + # support or for multi-exon lncRNAs that are less than 200bp long + if single_cds_exon_transcript == 1 and validation_type == "relaxed": + if diamond_e_value is not None: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" or cpc2_coding_potential == "coding") + and peptide_length >= min_single_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + else: + continue + elif single_cds_exon_transcript == 1 and validation_type == "moderate": + if diamond_e_value is not None and peptide_length >= min_single_exon_pep_length: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" and cpc2_coding_potential == "coding") + and peptide_length >= min_single_exon_pep_length + and avg_coding_probability >= min_single_exon_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + else: + continue + else: + if diamond_e_value is not None: + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_multi_exon_pep_length + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif ( + (rnasamba_coding_potential == "coding" or cpc2_coding_potential == "coding") + and peptide_length >= min_multi_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = update_biotype(transcript_line, biotype, "protein_coding") + elif transcript_length >= 200: + transcript_line = update_biotype(transcript_line, biotype, "lncRNA") + transcript_line = re.sub(' translation_coords "[^"]+";', "", transcript_line) + else: + continue + + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + + return output_lines + + +def update_biotype(transcript_line: str, current_biotype: str, new_biotype: str) -> str: + """Update the biotype in a transcript line. + + Args: + transcript_line (str): The line containing the transcript information. + current_biotype (str): The biotype to be replaced. + new_biotype (str): The new biotype to set. + + Returns: + str: The updated transcript line with the new biotype. 
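+
+    Example:
+        Illustrative call with an invented, abridged GTF attribute string.
+
+        >>> line = '1\tanno\ttranscript\t1\t100\t.\t+\t.\tgene_id "g1"; biotype "busco";\n'
+        >>> update_biotype(line, "busco", "protein_coding")
+        '1\tanno\ttranscript\t1\t100\t.\t+\t.\tgene_id "g1"; biotype "protein_coding";\n'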
+ """ + return re.sub(f'; biotype "{current_biotype}";', f'; biotype "{new_biotype}";', transcript_line) + + +def merge_finalise_output_files( + final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str +) -> None: + """Merge the resulting gene models from all the seq regions for the specified extension + + Args: + final_annotation_dir (Path): Output directory + region_annotation_dir (Path): Input directory + extension (str): File extension + id_label (str): Type of data or process to merge + """ + gtf_files = list(region_annotation_dir.glob(f"*{extension}")) + merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" + merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" + merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" + + # The below is not great, it's a bit messy because there might be + # some cases where there aren't translations. So it's not as + # straightforward as reading the records across all three files + # in parallel. The solution is to just load the seqs into + # memory and index them on the current header, which should + # correspond to a transcript/gene id in the GTF. When writing the + # results into the single merged files the ids will be updated to + # be unique and consistent across the header, three file types + + gene_id_counter = 0 + transcript_id_counter = 0 + + with merged_gtf_file.open("w") as gtf_out, merged_cdna_file.open( + "w" + ) as cdna_out, merged_amino_acid_file.open("w") as amino_acid_out: + for gtf_file in gtf_files: + logger.info("GTF file: %s", gtf_file) + cdna_seq_index = {} + amino_acid_seq_index = {} + cdna_file = gtf_file.with_suffix(".cdna") + amino_acid_file = gtf_file.with_suffix(".prot") + + # Check that the files exist and are not empty + if (not Path(cdna_file).is_file() or Path(cdna_file).stat().st_size == 0) and ( + Path(amino_acid_file).is_file() or Path(amino_acid_file).stat().st_size == 0 + ): + logger.error( + "DNA file %s or amino acid file %s \ + does not exist or is empty.", + cdna_file, + amino_acid_file, + ) + else: + # Open and process the files + with open(cdna_file) as cdna_in, open(amino_acid_file) as amino_acid_in: + cdna_seq_index = fasta_to_dict(cdna_in.readlines()) + amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) + + current_gene_id = "" + with gtf_file.open() as gtf_in: + for line in gtf_in: + if re.search(r"^#", line): + # if line.startswith("#"): + continue + eles = line.split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + gene_id = "" + transcript_id = "" + if match and eles[2] == "transcript": + transcript_id_counter += 1 + gene_id = match.group(1) + transcript_id = match.group(2) + + if not current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + gene_id_counter += 1 + current_gene_id = gene_id + new_gene_id = f"{id_label}_{gene_id_counter}" + new_transcript_id = f"{id_label}_{transcript_id_counter}" + + line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) + line = re.sub( + f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line + ) + gtf_out.write(line) + if eles[2] == "transcript": + new_header = f">{new_transcript_id}\n" + cdna_out.write(new_header + cdna_seq_index[transcript_id]) + if transcript_id in amino_acid_seq_index: + amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) + + +def copy_raw_files(raw_file: Path, destination_file: Path) -> 
None: + """Copy file in a new destination + + Args: + raw_file (Path): File to copy + destination_file (Path): New destination + """ + if raw_file.exists(): + try: + shutil.copy(raw_file, destination_file) + logger.info("Copied %s to %s", raw_file, destination_file) + except Exception as e: # pylint:disable=broad-exception-caught + logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) + else: + logger.info("No file found at %s, skipping", raw_file) + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Genblast arguments") + + parser.add_argument("--main_output_dir", required=True, help="Output directory path") + parser.add_argument("--genome_file", required=True, help="Path for the genome file") + parser.add_argument("--seq_region_names", type=List[str], help="List of seq region names") + parser.add_argument("--diamond_validation_db", required=True, help="Diamond validation db file path") + parser.add_argument( + "--validation_type", type=int, choices=["relaxed", "moderate"], help="Type of validation" + ) + parser.add_argument( + "--diamond_bin", + help="DIAMOND executable path", + ) + parser.add_argument( + "--cpc2_bin", + help="CPC2 executable path", + ) + parser.add_argument( + "--rnasamba_bin", + help="RNAsamba executable path", + ) + parser.add_argument( + "--rnasamba_weights", + help="Rnasamba weights path", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + parser.add_argument( + "--min_single_exon_pep_length", type=int, default=100, help="Minimum peptide length for single exon." + ) + parser.add_argument( + "--min_multi_exon_pep_length", type=int, default=75, help="Minimum peptide length for multiple exons." + ) + parser.add_argument( + "--min_single_source_probability", + type=float, + default=0.8, + help="Minimum probability for single source.", + ) + parser.add_argument( + "--min_single_exon_probability", + type=float, + default=0.9, + help="Minimum average probability for single exon.", + ) + return parser.parse_args() + + +def main(): + """Finalisation's entry-point.""" + args = parse_args() + + log_file_path = create_dir(args.output_dir, "log") / "finalisation.log" + loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" + + logging.config.fileConfig( + loginipath, + defaults={"logfilename": str(log_file_path)}, + disable_existing_loggers=False, + ) + + run_finalise_geneset( + Path(args.main_output_dir), + Path(args.genome_file), + args.seq_region_names, + Path(args.diamond_validation_db), + args.validation_type, + args.num_threads, + Path(args.cpc2_bin), + Path(args.rnasamba_bin), + Path(args.rnasamba_weights), + Path(args.diamond_bin), + args.min_single_exon_pep_length, + args.min_multi_exon_pep_length, + args.min_single_source_probability, + args.min_single_exon_probability, + ) + + +if __name__ == "__main__": + main() diff --git a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py b/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py deleted file mode 100644 index 178fbb4..0000000 --- a/src/python/ensembl/tools/anno/finalise_genset/finalise_geneset_utils.py +++ /dev/null @@ -1,664 +0,0 @@ -# See the NOTICE file distributed with this work for additional information -# regarding copyright ownership. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Set of functions used to collect the results of the subpipelines and generate the final output -This logic will be revised in the nextflow pipeline -""" -__all__ = ["run_finalise_geneset"] - -import logging -import logging.config -import multiprocessing -import os -from pathlib import Path -import re -import subprocess -from typing import List -import shutil - -from ensembl.tools.anno.utils._utils import ( - check_file, - get_seq_region_length, - fasta_to_dict, - check_exe, - create_dir, - check_gtf_content, - split_protein_file -) - -logger = logging.getLogger(__name__) - -def run_finalise_geneset( - main_script_dir: Path, - main_output_dir: Path, - genome_file: Path, - seq_region_names: List[str], - diamond_validation_db :Path, - validation_type: str = "relaxed", - num_threads: int = 1, - cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), - rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), - rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - -): - """Collect results - - :param main_script_dir: - :type main_script_dir: Path - :param main_output_dir: _description_ - :type main_output_dir: Path - :param genome_file: Path for genome fasta file - :type genome_file: Path - :param seq_region_names: list of seq region names - :type seq_region_names: List[str] - :param diamond_validation_db: Path for diamond validation db - :type diamond_validation_db: Path - :param validation_type: type of validation - :type validation_type: str, default "relaxed" - :param num_threads: Num of threads. Defaults to 1. 
- :type num_threads: int, default 1 - :param cpc2_bin: CPC2 software path - :type cpc2_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif" - :param rnasamba_bin: RNASamba software path - :type rnasamba_bin: Path, default "/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif" - :param rnasamba_weights: RNASamba weights path - :type rnasamba_weights: Path, default "/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5" - - """ - - - final_annotation_dir = create_dir(main_output_dir, "annotation_output") - region_annotation_dir = create_dir(final_annotation_dir, "initial_region_gtfs") - final_region_annotation_dir = create_dir( - final_annotation_dir, "final_region_gtfs" - ) - utr_region_annotation_dir = create_dir(final_annotation_dir, "utr_region_gtfs") - validation_dir = create_dir(final_annotation_dir, "cds_validation") - seq_region_lengths = get_seq_region_length(genome_file, 0) - output_file = final_annotation_dir / "annotation.gtf" - if output_file.exists(): - transcript_count = check_gtf_content(output_file, "transcript") - if transcript_count > 0: - logger.info("Final gtf file exists, skipping analysis") - return - - protein_annotation_raw = main_output_dir / "genblast_output" / "annotation.gtf" - minimap2_annotation_raw = main_output_dir / "minimap2_output" / "annotation.gtf" - stringtie_annotation_raw = main_output_dir / "stringtie_output" / "annotation.gtf" - scallop_annotation_raw = main_output_dir / "scallop_output" / "annotation.gtf" - busco_annotation_raw = main_output_dir / "busco_output" / "annotation.gtf" - transcript_selector_script = main_output_dir / "support_scripts_perl" / "select_best_transcripts.pl" - finalise_geneset_script = main_output_dir / "support_scripts_perl" / "finalise_geneset.pl" - clean_geneset_script = main_output_dir / "support_scripts_perl" / "clean_geneset.pl" - clean_utrs_script = main_output_dir / "support_scripts_perl" / "clean_utrs_and_lncRNAs.pl" - gtf_to_seq_script = main_output_dir / "support_scripts_perl" / "gtf_to_seq.pl" - - transcriptomic_annotation_raw = main_output_dir / "transcriptomic_raw.gtf" - - with transcriptomic_annotation_raw.open("w") as file_out: - for transcriptomic_file in [ - minimap2_annotation_raw, - scallop_annotation_raw, - stringtie_annotation_raw, - ]: - if not transcriptomic_file.exists(): - logger.info("No annotation.gtf file found in %s, skipping", transcriptomic_file) - continue - - with transcriptomic_file.open("r") as file_in: - for line in file_in: - file_out.write(line) - - # Copy the raw files into the annotation dir - copy_raw_files(busco_annotation_raw, final_annotation_dir / "busco_raw.gtf") - copy_raw_files(protein_annotation_raw, final_annotation_dir / "protein_raw.gtf") - - #select best transcript - generic_select_cmd = [ - "perl", - str(transcript_selector_script), - "-genome_file", - str(genome_file), - ] - pool = multiprocessing.Pool(int(num_threads)) # pylint: disable=consider-using-with - for seq_region_name in seq_region_names: - # The selection script needs different params depending on - # whether the seqs are from transcriptomic data or not - region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" - transcriptomic_region_gtf_path = region_annotation_dir / f"{region_details}.trans.gtf" - busco_region_gtf_path = region_annotation_dir / f"{region_details}.busco.gtf" - protein_region_gtf_path = region_annotation_dir / 
f"{region_details}.protein.gtf" - if transcriptomic_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(transcriptomic_annotation_raw), - "-output_gtf_file", - str(transcriptomic_region_gtf_path), - "-cds_search", - "-final_biotype", - "transcriptomic", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - if busco_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(busco_annotation_raw), - "-output_gtf_file", - str(busco_region_gtf_path), - "-all_cds_exons", - "-final_biotype", - "busco", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - if protein_annotation_raw: - cmd = generic_select_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(protein_annotation_raw), - "-output_gtf_file", - str(protein_region_gtf_path), - "-clean_transcripts", - "-all_cds_exons", - "-final_biotype", - "protein", - ] - ) - pool.apply_async(run_command, args=(cmd,)) - - pool.close() - pool.join() - - # At this point we will have the region files for each seq region and we can merge them - merge_finalise_output_files( - final_annotation_dir, - region_annotation_dir, - ".trans.gtf", - "transcriptomic", - ) - merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, ".busco.gtf", "busco" - ) - merge_finalise_output_files( - final_annotation_dir, region_annotation_dir, ".protein.gtf", "protein" - ) - - # Create a single GTF file with all the selected transcripts - # now that they have proper ids - fully_merged_gtf_path = final_annotation_dir / "all_selected_transcripts.gtf" - # Collect all *_sel.gtf files - gtf_files = list(final_annotation_dir.glob("*_sel.gtf")) - - # Ensure there are files to merge - if not gtf_files: - raise FileNotFoundError("No *_sel.gtf files found in %s",final_annotation_dir) - - - try: - with fully_merged_gtf_path.open("w+") as fully_merged_gtf_out: - merge_gtf_cmd = ["cat"] + [str(gtf_file) for gtf_file in gtf_files] - subprocess.run(merge_gtf_cmd, stdout=fully_merged_gtf_out, check=True) - logger.info("Merged GTF files into %s", fully_merged_gtf_path) - except Exception as e: - print("An error occurred while merging GTF files: %s",e) - - # Now collapse the gene set - generic_finalise_cmd = [ - "perl", - str(finalise_geneset_script), - "-genome_file", - str(genome_file), - ] - - pool = multiprocessing.Pool(int(num_threads)) # pylint:disable=consider-using-with - for seq_region_name in seq_region_names: - region_details = f"{seq_region_name}.rs1.re{seq_region_lengths[seq_region_name]}" - final_region_gtf_path = final_region_annotation_dir / f"{region_details}.final.gtf" - cmd = generic_finalise_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - str(fully_merged_gtf_path), - "-output_gtf_file", - str(final_region_gtf_path), - ] - ) - pool.apply_async(run_command, args=(cmd,)) - - pool.close() - pool.join() - - merge_finalise_output_files( - final_annotation_dir, - final_region_annotation_dir, - ".final.gtf", - "prevalidation", - ) - merged_gtf_file = final_annotation_dir / "prevalidation_sel.gtf" - merged_cdna_file = final_annotation_dir / "prevalidation_sel.cdna.fa" - merged_amino_acid_file = final_annotation_dir /"prevalidation_sel.prot.fa" - #validation - updated_gtf_lines = validate_coding_transcripts( - merged_cdna_file, - merged_amino_acid_file, - validation_dir, - validation_type, - diamond_validation_db, - 
merged_gtf_file, - num_threads, - cpc2_bin, - rnasamba_bin, - rnasamba_weights - ) - - #### - postvalidation_gtf_file = os.path.join(final_annotation_dir, ("postvalidation.gtf")) - file_out = open(postvalidation_gtf_file, "w+") - for line in updated_gtf_lines: - file_out.write(line) - file_out.close() - - cleaned_initial_gtf_file = os.path.join(final_annotation_dir, ("cleaned_pre_utr.gtf")) - cleaned_utr_gtf_file = os.path.join(final_annotation_dir, ("annotation.gtf")) - - logger.info("Cleaning initial set") - cleaning_cmd = [ - "perl", - clean_geneset_script, - "-genome_file", - genome_file, - "-gtf_file", - postvalidation_gtf_file, - "-output_gtf_file", - cleaned_initial_gtf_file, - ] - logger.info(" ".join(cleaning_cmd)) - subprocess.run(cleaning_cmd) - - # Clean UTRs - generic_clean_utrs_cmd = [ - "perl", - clean_utrs_script, - "-genome_file", - genome_file, - "-input_gtf_file", - cleaned_initial_gtf_file, - ] - pool = multiprocessing.Pool(int(num_threads)) - for seq_region_name in seq_region_names: - region_details = ( - seq_region_name + ".rs1" + ".re" + str(seq_region_lengths[seq_region_name]) - ) - utr_region_gtf_path = os.path.join( - utr_region_annotation_dir, (region_details + ".utr.gtf") - ) - - cmd = generic_clean_utrs_cmd.copy() - cmd.extend( - [ - "-region_details", - region_details, - "-input_gtf_file", - cleaned_initial_gtf_file, - "-output_gtf_file", - utr_region_gtf_path, - ] - ) - pool.apply_async(multiprocess_generic, args=(cmd,)) - pool.close() - pool.join() - - merge_finalise_output_files( - final_annotation_dir, - utr_region_annotation_dir, - ".utr.gtf", - "annotation", - ) - subprocess.run( - [ - "mv", - os.path.join(final_annotation_dir, "annotation_sel.gtf"), - cleaned_utr_gtf_file, - ] - ) - - logger.info("Dumping transcript and translation sequences") - dumping_cmd = [ - "perl", - gtf_to_seq_script, - "-genome_file", - genome_file, - "-gtf_file", - cleaned_utr_gtf_file, - ] - logger.info(" ".join(dumping_cmd)) - subprocess.run(dumping_cmd) - - logger.info("Finished creating gene set") - - -def validate_coding_transcripts( - cdna_file:Path, - amino_acid_file:Path, - validation_dir:Path, - validation_type:str, - diamond_validation_db:Path, - gtf_file:Path, - num_threads:int, - cpc2_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/test_cpc2.sif"), - rnasamba_bin: Path = Path("/hps/software/users/ensembl/genebuild/genebuild_virtual_user/singularity/rnasamba_latest.sif"), - rnasamba_weights: Path= Path("/nfs/production/flicek/ensembl/genebuild/genebuild_virtual_user/rnasamba_data/full_length_weights.hdf5"), - -): - """Run validation on protein coding transcript - using CPC2, RNASamba, Diamond - - Args: - cdna_file (Path): Input cdna file - amino_acid_file (Path): Input protein file - validation_dir (Path): Validation output directory - validation_type (str): _description_ - diamond_validation_db (Path): _description_ - gtf_file (Path): _description_ - num_threads (int): number of threads - - Returns: - _type_: _description_ - """ - - logger.info("Running CDS validation with RNAsamba and CPC2") - #rnasamba_weights = config["rnasamba"]["weights"] - - - - - - - - - logger.info("read results") - rnasamba_results = read_rnasamba_results(rnasamba_output_path) - cpc2_results = read_cpc2_results(cpc2_output_path) - combined_results = combine_results(rnasamba_results, cpc2_results, diamond_results) - logger.info("read gtf genes") - parsed_gtf_genes = read_gtf_genes(gtf_file) - updated_gtf_lines = update_gtf_genes( - parsed_gtf_genes, 
combined_results, validation_type - ) - - return updated_gtf_lines - - - - -def combine_results(rnasamba_results, cpc2_results, diamond_results): - - transcript_ids = {} - - for result in rnasamba_results: - transcript_id = result[0] - coding_probability = result[1] - coding_potential = result[2] - - if transcript_id not in transcript_ids: - transcript_ids[transcript_id] = [ - coding_probability, - coding_potential, - ] - - for result in cpc2_results: - transcript_id = result[0] - coding_probability = result[1] - coding_potential = result[2] - transcript_length = result[3] - peptide_length = result[4] - transcript_ids[transcript_id].extend( - [ - coding_probability, - coding_potential, - transcript_length, - peptide_length, - ] - ) - - if diamond_results is not None: - for result in diamond_results: - transcript_id = result[0] - e_value = result[1] - # There seems to be an issue where there are a small number - # of sequences that don't make it into the cpc2/rnasamba oqutput - # Should code in a system for this, but it would be good to - # understand why it happens to begin with. Seems to be the same - # number of missing seqs in both, so maybe a shared cut-off - if transcript_id in transcript_ids: - transcript_ids[transcript_id].extend([e_value]) - - return transcript_ids - -def merge_finalise_output_files( - final_annotation_dir: Path, region_annotation_dir: Path, extension: str, id_label: str -)-> None: - """Merge the resulting gene models from all the seq regions for the specified extension - - Args: - final_annotation_dir (Path): Output directory - region_annotation_dir (Path): Input directory - extension (str): File extension - id_label (str): Type of data or process to merge - """ - gtf_files = list(region_annotation_dir.glob(f"*{extension}")) - merged_gtf_file = final_annotation_dir / f"{id_label}_sel.gtf" - merged_cdna_file = final_annotation_dir / f"{id_label}_sel.cdna.fa" - merged_amino_acid_file = final_annotation_dir / f"{id_label}_sel.prot.fa" - - # The below is not great, it's a bit messy because there might be - # some cases where there aren't translations. So it's not as - # straightforward as reading the records across all three files - # in parallel. The solution is to just load the seqs into - # memory and index them on the current header, which should - # correspond to a transcript/gene id in the GTF. 
When writing the - # results into the single merged files the ids will be updated to - # be unique and consistent across the header, three file types - - gene_id_counter = 0 - transcript_id_counter = 0 - - with merged_gtf_file.open("w") as gtf_out, \ - merged_cdna_file.open("w") as cdna_out, \ - merged_amino_acid_file.open("w") as amino_acid_out: - - for gtf_file in gtf_files: - logger.info("GTF file: %s", gtf_file) - cdna_seq_index = {} - amino_acid_seq_index = {} - cdna_file = gtf_file.with_suffix(".cdna") - amino_acid_file = gtf_file.with_suffix(".prot") - if cdna_file.exists(): - with cdna_file.open() as cdna_in: - cdna_seq_index = fasta_to_dict(cdna_in.readlines()) - - if amino_acid_file.exists(): - with amino_acid_file.open() as amino_acid_in: - amino_acid_seq_index = fasta_to_dict(amino_acid_in.readlines()) - current_gene_id = "" - with gtf_file.open() as gtf_in: - for line in gtf_in: - if re.search(r"^#", line): - #if line.startswith("#"): - continue - eles = line.split("\t") - if len(eles) != 9: - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - if match and eles[2] == "transcript": - transcript_id_counter += 1 - gene_id = match.group(1) - transcript_id = match.group(2) - - if not current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - - if gene_id != current_gene_id: - gene_id_counter += 1 - current_gene_id = gene_id - new_gene_id = f"{id_label}_{gene_id_counter}" - new_transcript_id = f"{id_label}_{transcript_id_counter}" - - line = re.sub(f'gene_id "{gene_id}"', f'gene_id "{new_gene_id}"', line) - line = re.sub(f'transcript_id "{transcript_id}"', f'transcript_id "{new_transcript_id}"', line) - gtf_out.write(line) - if eles[2] == "transcript": - new_header = f">{new_transcript_id}\n" - cdna_out.write(new_header + cdna_seq_index[transcript_id]) - if transcript_id in amino_acid_seq_index: - amino_acid_out.write(new_header + amino_acid_seq_index[transcript_id]) - - -def multiprocess_finalise_geneset(cmd): - - print(" ".join(cmd)) - subprocess.run(cmd) - - - -def run_command(cmd)->None: - """Submit command in multiprocessing action - - Args: - cmd (List[str]): command to execute - """ - try: - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Command executed successfully: %s", cmd) - logger.info("Output: %s", result.stdout) - except subprocess.CalledProcessError as e: - logger.error("Command failed: %s", cmd) - logger.error("Error: %s", e.stderr) - -def copy_raw_files(raw_file: Path, destination_file :Path)-> None: - """Copy file in a new destination - - Args: - raw_file (Path): File to copy - destination_file (Path): New destination - """ - if raw_file.exists(): - try: - shutil.copy(raw_file, destination_file) - logger.info("Copied %s to %s", raw_file, destination_file) - except Exception as e: - logger.error("Failed to copy %s to %s: %s", raw_file, destination_file, e) - else: - logger.info("No file found at %s, skipping", raw_file) - - - - - - - - - - - - - - - - - - -def parse_args(): - """Parse command line arguments.""" - parser = argparse.ArgumentParser(description="Genblast arguments") - parser.add_argument( - "--masked_genome_file", required=True, help="Masked genome file path" - ) - parser.add_argument("--output_dir", required=True, help="Output directory path") - parser.add_argument("--protein_file", required=True, help="Path for the protein dataset") - parser.add_argument( - "--genblast_timeout_secs", type=int, default=10800, help="Genblast timeout period" - ) - parser.add_argument( 
- "--max_intron_length", type=int, required=True, help="Maximum intron length" - ) - parser.add_argument( - "--genblast_bin", - default="genblast", - help="Genblast executable path", - ) - parser.add_argument( - "--convert2blastmask_bin", - default="convert2blastmask", - help="convert2blastmask executable path", - ) - parser.add_argument( - "--makeblastdb_bin", - default="makeblastdb", - help="makeblastdb executable path", - ) - parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") - parser.add_argument( - "--protein_set", - required=True, - choices=["uniprot", "orthodb"], - help="Protein set [uniprot, orthodb]", - ) - return parser.parse_args() - -def main(): - """Genblast's entry-point.""" - args = parse_args() - - log_file_path = create_dir(args.output_dir, "log") / "genblast.log" - loginipath = Path(__file__).parents[6] / "conf" / "logging.conf" - - logging.config.fileConfig( - loginipath, - defaults={"logfilename": str(log_file_path)}, - disable_existing_loggers=False, - ) - - run_genblast( - Path(args.masked_genome_file), - Path(args.output_dir), - Path(args.protein_file), - args.max_intron_length, - args.genblast_timeout_secs, - Path(args.genblast_bin), - Path(args.convert2blastmask_bin), - Path(args.makeblastdb_bin), - args.num_threads, - args.protein_set, - ) - -if __name__ == "__main__": - main() diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index 8f882a5..b4bbc72 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -16,7 +16,6 @@ import errno import io -import json import logging import os from os import PathLike @@ -25,9 +24,8 @@ import re import subprocess import shutil -import glob import tempfile -from typing import Dict, List +from typing import Dict, List, Optional logger = logging.getLogger("__name__." + __name__) @@ -128,7 +126,7 @@ def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dic if len(current_seq) > min_seq_length: seq_region_to_length[current_header] = len(current_seq) - #logging.info("length of the seq region %s", seq_region_to_length) + # logging.info("length of the seq region %s", seq_region_to_length) return seq_region_to_length @@ -168,7 +166,7 @@ def get_slice_id( if (end - start + 1) >= min_length: slice_ids_per_region.append([seq_region, start, end]) start = end + 1 - #logging.info("list of slice ids %s", str(slice_ids_per_region)) + # logging.info("list of slice ids %s", str(slice_ids_per_region)) return slice_ids_per_region @@ -254,9 +252,7 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches # Example key KS8000.rs1.re1000000.1 gene_id_slice = input_file.name + "." + str(current_gene_id) # Example key KS8000.rs1.re1000000.1.transcript.1 - transcript_id_slice = ( - gene_id_slice + "." + str(current_transcript_id) - ) + transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) # If there is no existing entry, the gene key will be added # and the gene counter is incremented. 
# gene_id "gene1"; transcript_id "gene1.t1" @@ -281,16 +277,12 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches + ".t" + str(transcript_id_count_gene_id[gene_id_slice]) ) - gene_transcript_id_collection[ - transcript_id_slice - ] = new_transcript_id + gene_transcript_id_collection[transcript_id_slice] = new_transcript_id transcript_id_count_gene_id[gene_id_slice] += 1 else: # If a transcript of the same set is already present, # the new id with the incremented counter is added - new_transcript_id = gene_transcript_id_collection[ - transcript_id_slice - ] + new_transcript_id = gene_transcript_id_collection[transcript_id_slice] attribs = re.sub( full_transcript_id_string, 'transcript_id "' + new_transcript_id + '"', @@ -303,13 +295,15 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches else: if new_id_prefix == "repeatmask": match_feature_type = re.search( - r"(" + feature_id_label + "\d+)", + r"(" + feature_id_label + r"\d+)", line, ) else: - match_feature_type = re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', - r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", - line, + match_feature_type = ( + re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, + ) ) if match_feature_type: full_feature_id = match_feature_type.group(1) @@ -327,6 +321,8 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches "Feature type not recognised, will skip. Feature type: %s", values[2], ) + + def get_sequence( seq_region: str, start: int, @@ -355,9 +351,7 @@ def get_sequence( "get_sequence %s", f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", ) - with tempfile.NamedTemporaryFile( - mode="w+t", delete=False, dir=str(output_dir) - ) as bed_temp_file: + with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") bed_temp_file.close() bedtools_command = [ @@ -368,10 +362,10 @@ def get_sequence( "-bed", str(bed_temp_file.name), ] - bedtools_output = subprocess.Popen(bedtools_command, stdout=subprocess.PIPE) - for idx, line in enumerate( - io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8") # type: ignore - ): + bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with + bedtools_command, stdout=subprocess.PIPE + ) # pylint:disable=consider-using-with + for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore if idx == 1: if strand == 1: sequence = line.rstrip() @@ -404,49 +398,46 @@ def check_file(file_path: Path) -> None: if not file_path.is_file(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) + def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: """ Save fastq sequences in a dictionary fasta_lines: List[str]: List of sequences """ - index = {} - header = None - seq = [] + index: Dict[str, str] = {} + header: Optional[str] = None + seq: List[str] = [] for line in fasta_lines: line = line.strip() - if line.startswith(">"): + header_match = re.match(r">(.+)\n$", line) + if header_match: if header: - index[header] = ''.join(seq) - header_match = re.match(r">(.+)", line) - if header_match: - header = header_match.group(1) - seq = [] - else: - raise ValueError(f"Invalid FASTA header line: {line}") + index[header] = "".join(seq) + header = None + header = header_match.group(1) + seq = [] else: seq.append(line) - + # Add the last sequence if header: - index[header] = 
''.join(seq) - - return index -""" -def fasta_to_dict(fasta_lines:List[str]): - index = {} - for line in fasta_lines: - #it = iter(fasta_lines) - #for header in it: - match = re.search(r">(.+)\n$", line) - header = match.group(1) - seq = next(line) - index[header] = seq + index[header] = "".join(seq) return index -""" -def split_protein_file( - protein_dataset: Path, output_dir: Path, batch_size: int = 20 -) -> List: + +# def fasta_to_dict(fasta_lines:List[str]): +# index = {} +# for line in fasta_lines: +# #it = iter(fasta_lines) +# #for header in it: +# match = re.search(r">(.+)\n$", line) +# header = match.group(1) +# seq = next(line) +# index[header] = seq +# return index + + +def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List: """ The protein dataset file is splitted by a number of sequence equals to the batch_size in batch files stored in 10 output directories. @@ -459,7 +450,7 @@ def split_protein_file( for i in range(0, 10): create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset,"r", encoding="utf8") as file_in: + with open(protein_dataset, "r", encoding="utf8") as file_in: seq_count = 0 batch_count = 0 current_record = "" @@ -494,950 +485,69 @@ def split_protein_file( batched_protein_files.append(batch_file) return batched_protein_files -def load_results_to_ensembl_db( - main_script_dir, - load_to_ensembl_db, - genome_file, - main_output_dir, - db_details, - num_threads, -): - db_loading_script = os.path.join( - main_script_dir, "support_scripts_perl", "load_gtf_ensembl.pl" - ) - db_loading_dir = create_dir(main_output_dir, "db_loading") - - # Should collapse this into a function - annotation_results_gtf_file = os.path.join( - main_output_dir, "annotation_output", "annotation.gtf" - ) - if os.path.exists(annotation_results_gtf_file): - logger.info("Loading main geneset to db") - batch_size = 200 - load_type = "gene" - analysis_name = "ensembl" - gtf_records = batch_gtf_records( - annotation_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find the main gene annotation file, so not loading. Path checked:\n" - + annotation_results_gtf_file - ) - - rfam_results_gtf_file = os.path.join(main_output_dir, "rfam_output", "annotation.gtf") - if os.path.exists(rfam_results_gtf_file): - logger.info("Loading Rfam-based sncRNA genes to db") - batch_size = 500 - load_type = "gene" - analysis_name = "ncrna" - gtf_records = batch_gtf_records( - rfam_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an Rfam annotation file, so not loading. 
Path checked:\n" - + rfam_results_gtf_file - ) - - trnascan_results_gtf_file = os.path.join( - main_output_dir, "trnascan_output", "annotation.gtf" - ) - if os.path.exists(trnascan_results_gtf_file): - logger.info("Loading tRNAScan-SE tRNA genes to db") - batch_size = 500 - load_type = "gene" - analysis_name = "ncrna" - gtf_records = batch_gtf_records( - trnascan_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an tRNAScan-SE annotation file, so not loading. Path checked:\n" - + trnascan_results_gtf_file - ) - - dust_results_gtf_file = os.path.join(main_output_dir, "dust_output", "annotation.gtf") - if os.path.exists(dust_results_gtf_file): - logger.info("Loading Dust repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "dust" - gtf_records = batch_gtf_records( - dust_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a Dust annotation file, so not loading. Path checked:\n" - + dust_results_gtf_file - ) - - red_results_gtf_file = os.path.join(main_output_dir, "red_output", "annotation.gtf") - if os.path.exists(red_results_gtf_file): - logger.info("Loading Red repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "repeatdetector" - gtf_records = batch_gtf_records( - red_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a Red annotation file, so not loading. Path checked:\n" - + red_results_gtf_file - ) - trf_results_gtf_file = os.path.join(main_output_dir, "trf_output", "annotation.gtf") - if os.path.exists(trf_results_gtf_file): - logger.info("Loading TRF repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "trf" - gtf_records = batch_gtf_records( - trf_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a TRF annotation file, so not loading. Path checked:\n" - + trf_results_gtf_file - ) +def run_command(cmd) -> None: + """Submit command in multiprocessing action - cpg_results_gtf_file = os.path.join(main_output_dir, "cpg_output", "annotation.gtf") - if os.path.exists(cpg_results_gtf_file): - logger.info("Loading CpG islands to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "cpg" - gtf_records = batch_gtf_records( - cpg_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find a CpG annotation file, so not loading. 
Path checked:\n" - + cpg_results_gtf_file - ) + Args: + cmd (List[str]): command to execute + """ + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + logger.info("Command executed successfully: %s", cmd) + logger.info("Output: %s", result.stdout) + except subprocess.CalledProcessError as e: + logger.error("Command failed: %s", cmd) + logger.error("Error: %s", e.stderr) - eponine_results_gtf_file = os.path.join( - main_output_dir, "eponine_output", "annotation.gtf" - ) - if os.path.exists(eponine_results_gtf_file): - logger.info("Loading Eponine repeats to db") - batch_size = 500 - load_type = "single_line_feature" - analysis_name = "eponine" - gtf_records = batch_gtf_records( - eponine_results_gtf_file, batch_size, db_loading_dir, load_type - ) - generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, - ) - else: - logger.error( - "Did not find an Eponine annotation file, so not loading. Path checked:\n" - + eponine_results_gtf_file - ) - logger.info("Finished loading records to db") - - -def generic_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - gtf_records, - num_threads, -): - pool = multiprocessing.Pool(int(num_threads)) - for record_batch in gtf_records: - pool.apply_async( - multiprocess_load_records_to_ensembl_db, - args=( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - db_loading_dir, - load_type, - analysis_name, - record_batch, - ), - ) +def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]: + """ + Parses a GTF file to extract gene, transcript, and exon information. - pool.close() - pool.join() - - -def multiprocess_load_records_to_ensembl_db( - load_to_ensembl_db, - db_loading_script, - genome_file, - db_details, - output_dir, - load_type, - analysis_name, - record_batch, -): - with tempfile.NamedTemporaryFile( - mode="w+t", delete=False, dir=output_dir - ) as gtf_temp_out: - for line in record_batch: - gtf_temp_out.writelines(line) - gtf_temp_file_path = gtf_temp_out.name - - (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",") - - loading_cmd = [ - "perl", - db_loading_script, - "-genome_file", - genome_file, - "-dbname", - db_name, - "-host", - db_host, - "-port", - str(db_port), - "-user", - db_user, - "-pass", - db_pass, - "-gtf_file", - gtf_temp_file_path, - "-analysis_name", - analysis_name, - "-load_type", - load_type, - ] + Args: + gtf_file: Path to the GTF file. - if load_type == "gene" and analysis_name == "ensembl": - loading_cmd.extend( - [ - "-protein_coding_biotype", - "anno_protein_coding", - "-non_coding_biotype", - "anno_lncRNA", - ] - ) + Returns: + A dictionary containing gene information, organized by gene_id, transcript_id, and features. 
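+
+    Example:
+        Minimal usage sketch; "annotation.gtf" is a hypothetical input path and the
+        nested keys mirror how update_gtf_genes consumes the parsed structure:
+
+            parsed = read_gtf_genes(Path("annotation.gtf"))
+            for gene_id, transcripts in parsed.items():
+                for transcript_id, data in transcripts.items():
+                    transcript_lines = data["transcript"]
+                    exon_lines = data["exons"]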
+ """ + gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} - if load_to_ensembl_db == "single_transcript_genes": - loading_cmd.append("-make_single_transcript_genes") - - logger.info(" ".join(loading_cmd)) - subprocess.run(loading_cmd) - gtf_temp_out.close() - os.remove(gtf_temp_file_path) # doesn't seem to be working - logger.info("Finished: " + gtf_temp_file_path) - gc.collect() - - -def batch_gtf_records(input_gtf_file, batch_size, output_dir, record_type): - records = [] - gtf_in = open(input_gtf_file) - if record_type == "gene": - # NOTE that the neverending variations on GTF reading/writing/merging - # is becoming very messy - # need to create a set of utility methods outside of this script - # This one assumes the file has unique ids for the parent features. - # It then batches them into - # sets of records based on the batch size passed in - record_counter = 0 - current_record_batch = [] - current_gene_id = "" - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() + with open(gtf_file, "r") as gtf_in: + for line in gtf_in: + eles = line.strip().split("\t") + if len(eles) != 9: continue - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) + if not match: continue - match = re.search(r'gene_id "([^"]+)"', line) gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] - if not current_gene_id: - record_counter += 1 - current_gene_id = gene_id - - if not gene_id == current_gene_id: - record_counter += 1 - if record_counter % batch_size == 0: - records.append(current_record_batch) - current_record_batch = [] - current_gene_id = gene_id - - current_record_batch.append(line) - line = gtf_in.readline() - - records.append(current_record_batch) - - elif record_type == "single_line_feature": - record_counter = 0 - current_record_batch = [] - current_gene_id = "" - line = gtf_in.readline() - while line: - if re.search(r"^#", line): - line = gtf_in.readline() - continue - - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - record_counter += 1 - - if record_counter % batch_size == 0: - records.append(current_record_batch) - current_record_batch = [] - - current_record_batch.append(line) - line = gtf_in.readline() - - records.append(current_record_batch) - - gtf_in.close() + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} - return records - - -def run_find_orfs(genome_file, main_output_dir): - min_orf_length = 600 - - orf_output_dir = create_dir(main_output_dir, "orf_output") - seq_region_lengths = get_seq_region_lengths(genome_file, 5000) - for region_name in seq_region_lengths: - region_length = seq_region_lengths[region_name] - seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) - for phase in range(0, 6): - find_orf_phased_region( - region_name, seq, phase, min_orf_length, orf_output_dir - ) - - -def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): - current_index = phase - orf_counter = 1 - if phase > 2: - seq = reverse_complement(seq) - current_index = current_index % 3 - - orf_file_path = os.path.join( - orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa") - ) - orf_out = open(orf_file_path, "w+") - - while current_index < len(seq): - codon = seq[current_index : current_index + 3] - if codon == "ATG": - orf_seq = codon - for j in range(current_index + 3, len(seq), 3): - next_codon = seq[j : j + 
3] - if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": - orf_seq += next_codon - if len(orf_seq) >= min_orf_length: - orf_out.write( - ">" - + region_name - + "_phase" - + str(phase) - + "_orf" - + str(orf_counter) - + "\n" - ) - orf_out.write(orf_seq + "\n") - orf_counter += 1 - orf_seq = "" - break - - # If there's another met in phase, then put i to the start of - # the codon after j so that only the longest ORF is found - if next_codon == "ATG": - current_index = j + 3 - orf_seq += next_codon - current_index += 3 - orf_out.close() - -def bed_to_gff(input_dir, hints_file): - gff_out = open(hints_file, "w+") - exons_dict = {} - for bed_file in glob.glob(input_dir + "/*.bed"): - logger.info("Processing file for hints:") - logger.info(bed_file) - bed_in = open(bed_file) - bed_lines = bed_in.readlines() - for line in bed_lines: - line = line.rstrip() - elements = line.split("\t") - seq_region_name = elements[0] - offset = int(elements[1]) - hit_name = elements[3] - strand = elements[5] - block_sizes = elements[10].split(",") - block_sizes = list(filter(None, block_sizes)) - block_starts = elements[11].split(",") - block_starts = list(filter(None, block_starts)) - exons = bed_block_to_exons(block_sizes, block_starts, offset) - for i, element in enumerate(exons): - exon_coords = exons[i] - exon_key = ( - seq_region_name - + ":" - + exon_coords[0] - + ":" - + exon_coords[1] - + ":" - + strand - ) - if exon_key in exons_dict: - exons_dict[exon_key][5] += 1 - else: - gff_list = [ - seq_region_name, - "CDNA", - "exon", - exon_coords[0], - exon_coords[1], - 1.0, - strand, - ".", - ] - exons_dict[exon_key] = gff_list - - for exon_key, gff_list in exons_dict.items(): - gff_list[5] = str(gff_list[5]) - gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" - gff_out.write(gff_line) - - gff_out.close() - - sorted_hints_out = open((hints_file + ".srt"), "w+") - subprocess.run( - ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], - stdout=sorted_hints_out, - ) - sorted_hints_out.close() - -def splice_junction_to_gff(input_dir, hints_file): - sjf_out = open(hints_file, "w+") - - for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): - sjf_in = open(sj_tab_file) - sjf_lines = sjf_in.readlines() - for line in sjf_lines: - elements = line.split("\t") - strand = "+" - # If the strand is undefined then skip, Augustus expects a strand - if elements[3] == "0": - continue - elif elements[3] == "2": - strand = "-" - - junction_length = int(elements[2]) - int(elements[1]) + 1 - if junction_length < 100: - continue - - if not elements[4] and elements[7] < 10: - continue - - # For the moment treat multimapping and single mapping things as a combined score - score = float(elements[6]) + float(elements[7]) - score = str(score) - output_line = [ - elements[0], - "RNASEQ", - "intron", - elements[1], - elements[2], - score, - strand, - ".", - ("src=W;mul=" + score + ";"), - ] - sjf_out.write("\t".join(output_line) + "\n") - - sjf_out.close() - - -def split_genome(genome_file, target_dir, min_seq_length): - # This is the lazy initial way of just splitting into a dir - # of files based on the toplevel sequence with a min sequence length filter - # There are a couple of obvious improvements: - # 1) Instead of making files for all seqs, just process N seqs - # parallel, where N = num_threads. 
Then you could clean up the seq file - # after each seq finishes, thus avoiding potentially - # having thousands of file in a dir - # 2) Split the seq into even slices and process these in parallel - # (which the same cleanup as in 1). For sequences smaller than the - # target slice size, bundle them up together into a single file. Vastly - # more complex, partially implemented in the splice_genome method - # Allows for more consistency with parallelisation (since there should - # be no large outliers). But require a mapping strategy for the - # coords and sequence names and all the hints will need to be adjusted - current_header = "" - current_seq = "" - - file_in = open(genome_file) - line = file_in.readline() - while line: - match = re.search(r">(.+)$", line) - if match and current_header: - if len(current_seq) > min_seq_length: - file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) - if not os.path.exists(file_out_name): - file_out = open(file_out_name, "w+") - file_out.write(">" + current_header + "\n" + current_seq + "\n") - file_out.close() - - else: - print( - "Found an existing split file, so will not overwrite. File found:" - ) - print(file_out_name) - - current_seq = "" - current_header = match.group(1) - elif match: - current_header = match.group(1) - else: - current_seq += line.rstrip() - - line = file_in.readline() - - if len(current_seq) > min_seq_length: - file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) - if not os.path.exists(file_out_name): - file_out = open(file_out_name, "w+") - file_out.write(">" + current_header + "\n" + current_seq + "\n") - file_out.close() - - else: - logger.info( - "Found an existing split file, so will not overwrite. File found:" - ) - logger.info(file_out_name) - - file_in.close() - - -def update_gtf_genes(parsed_gtf_genes, combined_results, validation_type): - output_lines = [] - - for gene_id in parsed_gtf_genes.keys(): - transcript_ids = parsed_gtf_genes[gene_id].keys() - for transcript_id in transcript_ids: - transcript_line = parsed_gtf_genes[gene_id][transcript_id]["transcript"] - single_cds_exon_transcript = 0 - translation_match = re.search( - r'; translation_coords "([^"]+)";', transcript_line - ) - if translation_match: - translation_coords = translation_match.group(1) - translation_coords_list = translation_coords.split(":") - # If the start exon coords of both exons are the same, - # then it's the same exon and thus a single exon cds - if translation_coords_list[0] == translation_coords_list[3]: - single_cds_exon_transcript = 1 - - exon_lines = parsed_gtf_genes[gene_id][transcript_id]["exons"] - validation_results = combined_results[transcript_id] - rnasamba_coding_probability = float(validation_results[0]) - rnasamba_coding_potential = validation_results[1] - cpc2_coding_probability = float(validation_results[2]) - cpc2_coding_potential = validation_results[3] - transcript_length = int(validation_results[4]) - peptide_length = int(validation_results[5]) - diamond_e_value = None - if len(validation_results) == 7: - diamond_e_value = validation_results[6] - - avg_coding_probability = ( - rnasamba_coding_probability + cpc2_coding_probability - ) / 2 - max_coding_probability = max( - rnasamba_coding_probability, cpc2_coding_probability - ) - - match = re.search(r'; biotype "([^"]+)";', transcript_line) - biotype = match.group(1) - if biotype == "busco" or biotype == "protein": - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) 
- output_lines.append(transcript_line) - output_lines.extend(exon_lines) - continue - - min_single_exon_pep_length = 100 - min_multi_exon_pep_length = 75 - min_single_source_probability = 0.8 - min_single_exon_probability = 0.9 - - # Note that the below looks at validating things - # under different levels of strictness - # There are a few different continue statements, - # where transcripts will be skipped resulting - # in a smaller post validation file. It mainly - # removes single coding exon genes with no real - # support or for multi-exon lncRNAs that are less than 200bp long - if single_cds_exon_transcript == 1 and validation_type == "relaxed": - if diamond_e_value is not None: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - and peptide_length >= min_single_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - or cpc2_coding_potential == "coding" - ) - and peptide_length >= min_single_exon_pep_length - and max_coding_probability >= min_single_source_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - else: - continue - elif single_cds_exon_transcript == 1 and validation_type == "moderate": - if ( - diamond_e_value is not None - and peptide_length >= min_single_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - ) - and peptide_length >= min_single_exon_pep_length - and avg_coding_probability >= min_single_exon_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - else: - continue - else: - if diamond_e_value is not None: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - rnasamba_coding_potential == "coding" - and cpc2_coding_potential == "coding" - and peptide_length >= min_multi_exon_pep_length - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif ( - ( - rnasamba_coding_potential == "coding" - or cpc2_coding_potential == "coding" - ) - and peptide_length >= min_multi_exon_pep_length - and max_coding_probability >= min_single_source_probability - ): - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "protein_coding";', - transcript_line, - ) - elif transcript_length >= 200: - transcript_line = re.sub( - '; biotype "' + biotype + '";', - '; biotype "lncRNA";', - transcript_line, - ) - transcript_line = re.sub( - ' translation_coords "[^"]+";', "", transcript_line - ) - else: - continue - - output_lines.append(transcript_line) - output_lines.extend(exon_lines) - - return output_lines - - -def read_gtf_genes(gtf_file): - gtf_genes = {} - gtf_in = open(gtf_file) - line = gtf_in.readline() - while line: - eles = line.split("\t") - if not len(eles) == 9: - line = gtf_in.readline() - continue - - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) - - if not match: - line = gtf_in.readline() - continue - - gene_id = match.group(1) - transcript_id = 
match.group(2) - feature_type = eles[2] - if gene_id not in gtf_genes: - gtf_genes[gene_id] = {} - if feature_type == "transcript": - gtf_genes[gene_id][transcript_id] = {} - gtf_genes[gene_id][transcript_id]["transcript"] = line - gtf_genes[gene_id][transcript_id]["exons"] = [] - elif feature_type == "exon": - gtf_genes[gene_id][transcript_id]["exons"].append(line) - line = gtf_in.readline() - gtf_in.close() + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} + elif feature_type == "exon": + if transcript_id in gtf_genes[gene_id]: + gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) return gtf_genes +def check_transcriptomic_output(main_output_dir: Path) -> None: + """Check across the various transcriptomic if there is a reasonable amount of data + Args: + main_output_dir (Path): Output dir path + Raises: + IOError: If no lines or fewer than the minimum expected lines are found in the output files. + """ - -# def merge_gtf_files(file_paths,id_label): - -# gtf_file_path = os.path.join(output_dir,'annotation.gtf') -# gtf_out = open(gtf_file_path,'w+') -# for gtf_file_path in gtf_files: -# gtf_file_name = os.path.basename(gtf_file_path) -# match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) -# start_offset = int(match.group(1)) -# gtf_in = open(gtf_file_path,'r') -# line = gtf_in.readline() -# while line: -# values = line.split("\t") -# if len(values) == 9 and (values[2] in feature_types): -# values[3] = str(int(values[3]) + (start_offset - 1)) -# values[4] = str(int(values[4]) + (start_offset - 1)) -# gtf_out.write("\t".join(values)) -# line = gtf_in.readline() -# gtf_in.close() -# gtf_out.close() - - -def multiprocess_generic(cmd): - print(" ".join(cmd)) - subprocess.run(cmd) - - -def seq_region_names(genome_file): - region_list = [] - - file_in = open(genome_file) - line = file_in.readline() - while line: - match = re.search(r">([^\s]+)", line) - if match: - region_name = match.group(1) - if region_name == "MT": - logger.info("Skipping region named MT") - line = file_in.readline() - continue - else: - region_list.append(match.group(1)) - line = file_in.readline() - - return region_list - - -def slice_genome(genome_file, target_dir, target_slice_size): - # The below is sort of tested - # Without the - target_seq_length = 50000000 - min_seq_length = 1000 - current_header = "" - current_seq = "" - seq_dict = {} - for line in seq: # pylint:disable=used-before-assignment - match = re.search(r">(.+)$", line) - if match and current_header: - seq_dict[current_header] = current_seq - current_seq = "" - current_header = match.group(1) - elif match: - current_header = match.group(1) - else: - current_seq += line.rstrip() - - seq_dict[current_header] = current_seq - - seq_buffer = 0 - file_number = 0 - file_name = "genome_file_" + str(file_number) - - for header in seq_dict: - seq_iterator = 0 - seq = seq_dict[header] - - while len(seq) > target_seq_length: - file_out = open(os.path.join(target_dir, file_name), "w+") - subseq = seq[0:target_seq_length] - file_out.write( - ">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n" - ) - file_out.close() - seq = seq[target_seq_length:] - seq_iterator += 1 - file_number += 1 - file_name = "genome_file_" + str(file_number) - - if len(seq) >= min_seq_length: - file_name = "genome_file_" + str(file_number) - file_out = open(os.path.join(file_name), "w+") - file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") - file_out.close() - file_number += 
1
-        file_name = "genome_file_" + str(file_number)
-
-
-def coallate_results(main_output_dir):
-    results_dir = create_dir(main_output_dir, "results")
-    output_dirs = [
-        "augustus_output",
-        "cpg_output",
-        "dust_output",
-        "eponine_output",
-        "red_output",
-        "rfam_output",
-        "trf_output",
-        "trnascan_output",
-    ]
-    for output_dir in output_dirs:
-        match = re.search(r"(.+)_output", output_dir)
-        result_type = match.group(1)
-        results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf")
-        copy_path = os.path.join(results_dir, (result_type + ".gtf"))
-        if os.path.exists(results_path):
-            cpy_cmd = ["cp", results_path, copy_path]
-            subprocess.run(cpy_cmd)
-
-
-
-def check_transcriptomic_output(main_output_dir):
-
-    # This will check across the various transcriptomic
-    # dirs and check there's actually some data
     transcriptomic_dirs = [
         "scallop_output",
         "stringtie_output",
@@ -1446,31 +556,23 @@ def check_transcriptomic_output(main_output_dir):
     total_lines = 0
     min_lines = 100000
     for transcriptomic_dir in transcriptomic_dirs:
-        full_file_path = os.path.join(
-            main_output_dir, transcriptomic_dir, "annotation.gtf"
-        )
-        if not os.path.exists(full_file_path):
+        full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf"
+        if not full_file_path.exists():
             logger.warning(
-                "Warning, no annotation.gtf found for "
-                + transcriptomic_dir
-                + ". This might be fine, e.g. no long read data were provided"
+                "Warning, no annotation.gtf found for %s. This might be fine, \
+                e.g. no long read data were provided",
+                transcriptomic_dir,
             )
             continue
-        num_lines = sum(1 for line in open(full_file_path))
-        total_lines = total_lines + num_lines
-        logger.info(
-            "For "
-            + transcriptomic_dir
-            + " found a total of "
-            + str(num_lines)
-            + " in the annotation.gtf file"
-        )
+        num_lines = sum(1 for _ in full_file_path.open("r"))
+        total_lines += num_lines
+        logging.info(
+            "For %s found a total of %s lines in the annotation.gtf file",
+            transcriptomic_dir,
+            num_lines,
+        )
     if total_lines == 0:
         raise IOError(
             "Anno was run with transcriptomic mode enabled,\
             but the transcriptomic annotation files are empty"
         )
-    elif total_lines <= min_lines:
+    if total_lines <= min_lines:
         raise IOError(
             "Anno was run with transcriptomic mode enabled, \
             but the total number of lines in the output \
@@ -1480,9 +582,318 @@ def check_transcriptomic_output(main_output_dir):
             "Min allowed: " + str(min_lines)
         )
-    else:
-        logger.info(
-            "Found "
-            + str(total_lines)
-            + " total lines across the transcriptomic files. 
Checks passed" - ) + + +# def run_find_orfs(genome_file, main_output_dir): +# min_orf_length = 600 + +# orf_output_dir = create_dir(main_output_dir, "orf_output") +# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) +# for region_name in seq_region_lengths: +# region_length = seq_region_lengths[region_name] +# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) +# for phase in range(0, 6): +# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) + + +# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): +# current_index = phase +# orf_counter = 1 +# if phase > 2: +# seq = reverse_complement(seq) +# current_index = current_index % 3 + +# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) +# orf_out = open(orf_file_path, "w+") + +# while current_index < len(seq): +# codon = seq[current_index : current_index + 3] +# if codon == "ATG": +# orf_seq = codon +# for j in range(current_index + 3, len(seq), 3): +# next_codon = seq[j : j + 3] +# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": +# orf_seq += next_codon +# if len(orf_seq) >= min_orf_length: +# orf_out.write( +# ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" +# ) +# orf_out.write(orf_seq + "\n") +# orf_counter += 1 +# orf_seq = "" +# break + +# # If there's another met in phase, then put i to the start of +# # the codon after j so that only the longest ORF is found +# if next_codon == "ATG": +# current_index = j + 3 +# orf_seq += next_codon +# current_index += 3 +# orf_out.close() + + +# def bed_to_gff(input_dir, hints_file): +# gff_out = open(hints_file, "w+") +# exons_dict = {} +# for bed_file in glob.glob(input_dir + "/*.bed"): +# logger.info("Processing file for hints:") +# logger.info(bed_file) +# bed_in = open(bed_file) +# bed_lines = bed_in.readlines() +# for line in bed_lines: +# line = line.rstrip() +# elements = line.split("\t") +# seq_region_name = elements[0] +# offset = int(elements[1]) +# hit_name = elements[3] +# strand = elements[5] +# block_sizes = elements[10].split(",") +# block_sizes = list(filter(None, block_sizes)) +# block_starts = elements[11].split(",") +# block_starts = list(filter(None, block_starts)) +# exons = bed_block_to_exons(block_sizes, block_starts, offset) +# for i, element in enumerate(exons): +# exon_coords = exons[i] +# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand +# if exon_key in exons_dict: +# exons_dict[exon_key][5] += 1 +# else: +# gff_list = [ +# seq_region_name, +# "CDNA", +# "exon", +# exon_coords[0], +# exon_coords[1], +# 1.0, +# strand, +# ".", +# ] +# exons_dict[exon_key] = gff_list + +# for exon_key, gff_list in exons_dict.items(): +# gff_list[5] = str(gff_list[5]) +# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" +# gff_out.write(gff_line) + +# gff_out.close() + +# sorted_hints_out = open((hints_file + ".srt"), "w+") +# subprocess.run( +# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], +# stdout=sorted_hints_out, +# ) +# sorted_hints_out.close() + + +# def splice_junction_to_gff(input_dir, hints_file): +# sjf_out = open(hints_file, "w+") + +# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): +# sjf_in = open(sj_tab_file) +# sjf_lines = sjf_in.readlines() +# for line in sjf_lines: +# elements = line.split("\t") +# strand = "+" +# # If the strand is undefined then skip, Augustus expects a strand +# if elements[3] 
== "0": +# continue +# elif elements[3] == "2": +# strand = "-" + +# junction_length = int(elements[2]) - int(elements[1]) + 1 +# if junction_length < 100: +# continue + +# if not elements[4] and elements[7] < 10: +# continue + +# # For the moment treat multimapping and single mapping things as a combined score +# score = float(elements[6]) + float(elements[7]) +# score = str(score) +# output_line = [ +# elements[0], +# "RNASEQ", +# "intron", +# elements[1], +# elements[2], +# score, +# strand, +# ".", +# ("src=W;mul=" + score + ";"), +# ] +# sjf_out.write("\t".join(output_line) + "\n") + +# sjf_out.close() + + +# def split_genome(genome_file, target_dir, min_seq_length): +# # This is the lazy initial way of just splitting into a dir +# # of files based on the toplevel sequence with a min sequence length filter +# # There are a couple of obvious improvements: +# # 1) Instead of making files for all seqs, just process N seqs +# # parallel, where N = num_threads. Then you could clean up the seq file +# # after each seq finishes, thus avoiding potentially +# # having thousands of file in a dir +# # 2) Split the seq into even slices and process these in parallel +# # (which the same cleanup as in 1). For sequences smaller than the +# # target slice size, bundle them up together into a single file. Vastly +# # more complex, partially implemented in the splice_genome method +# # Allows for more consistency with parallelisation (since there should +# # be no large outliers). But require a mapping strategy for the +# # coords and sequence names and all the hints will need to be adjusted +# current_header = "" +# current_seq = "" + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">(.+)$", line) +# if match and current_header: +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# print("Found an existing split file, so will not overwrite. File found:") +# print(file_out_name) + +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# line = file_in.readline() + +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# logger.info("Found an existing split file, so will not overwrite. 
File found:") +# logger.info(file_out_name) + +# file_in.close() + + +# # def merge_gtf_files(file_paths,id_label): + +# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# # gtf_out = open(gtf_file_path,'w+') +# # for gtf_file_path in gtf_files: +# # gtf_file_name = os.path.basename(gtf_file_path) +# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# # start_offset = int(match.group(1)) +# # gtf_in = open(gtf_file_path,'r') +# # line = gtf_in.readline() +# # while line: +# # values = line.split("\t") +# # if len(values) == 9 and (values[2] in feature_types): +# # values[3] = str(int(values[3]) + (start_offset - 1)) +# # values[4] = str(int(values[4]) + (start_offset - 1)) +# # gtf_out.write("\t".join(values)) +# # line = gtf_in.readline() +# # gtf_in.close() +# # gtf_out.close() + + +# def multiprocess_generic(cmd): +# print(" ".join(cmd)) +# subprocess.run(cmd) + + +# def seq_region_names(genome_file): +# region_list = [] + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">([^\s]+)", line) +# if match: +# region_name = match.group(1) +# if region_name == "MT": +# logger.info("Skipping region named MT") +# line = file_in.readline() +# continue +# else: +# region_list.append(match.group(1)) +# line = file_in.readline() + +# return region_list + + +# def slice_genome(genome_file, target_dir, target_slice_size): +# # The below is sort of tested +# # Without the +# target_seq_length = 50000000 +# min_seq_length = 1000 +# current_header = "" +# current_seq = "" +# seq_dict = {} +# for line in seq: # pylint:disable=used-before-assignment +# match = re.search(r">(.+)$", line) +# if match and current_header: +# seq_dict[current_header] = current_seq +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# seq_dict[current_header] = current_seq + +# seq_buffer = 0 +# file_number = 0 +# file_name = "genome_file_" + str(file_number) + +# for header in seq_dict: +# seq_iterator = 0 +# seq = seq_dict[header] + +# while len(seq) > target_seq_length: +# file_out = open(os.path.join(target_dir, file_name), "w+") +# subseq = seq[0:target_seq_length] +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n") +# file_out.close() +# seq = seq[target_seq_length:] +# seq_iterator += 1 +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + +# if len(seq) >= min_seq_length: +# file_name = "genome_file_" + str(file_number) +# file_out = open(os.path.join(file_name), "w+") +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") +# file_out.close() +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + + +# def coallate_results(main_output_dir): +# results_dir = create_dir(main_output_dir, "results") +# output_dirs = [ +# "augustus_output", +# "cpg_output", +# "dust_output", +# "eponine_output", +# "red_output", +# "rfam_output", +# "trf_output", +# "trnascan_output", +# ] +# for output_dir in output_dirs: +# match = re.search(r"(.+)_output", output_dir) +# result_type = match.group(1) +# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") +# copy_path = os.path.join(results_dir, (result_type + ".gtf")) +# if os.path.exists(results_path): +# cpy_cmd = ["cp", results_path, copy_path] +# subprocess.run(cpy_cmd) From feae8308f4c82082eb903d0de925fbfe55a8fb17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 
16:17:21 +0000 Subject: [PATCH 17/20] added modules calls --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6656e13..e9b6a4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,11 @@ run_star = "ensembl.tools.anno.transcriptomic_annotation.star:main" run_minimap2 = "ensembl.tools.anno.transcriptomic_annotation.minimap:main" run_stringtie = "ensembl.tools.anno.transcriptomic_annotation.stringtie:main" run_scallop = "ensembl.tools.anno.transcriptomic_annotation.scallop:main" +run_rnasamba = "ensembl.tools.anno.finalise_geneset.rnasamba:main" +run_cpc2 = "ensembl.tools.anno.finalise_geneset.cpc2:main" +run_diamond = "ensembl.tools.anno.finalise_geneset.diamond:main" +run_finalise_geneset = "ensembl.tools.anno.finalise_geneset.finalise_geneset:main" +load_results_to_ensembl_db = "ensembl.tools.anno.load_results_to_ensembl_db:main" # This is configuration specific to the `setuptools` build backend. # If you are using a different build backend, you will need to change this. [tool.setuptools] From c5b49e663fc1b0864bd49f6de330ad54bc468663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:19:27 +0000 Subject: [PATCH 18/20] added load into db module --- docs/source/load_into_core.rst | 8 + .../tools/anno/utils/load_into_core.py | 435 ++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 docs/source/load_into_core.rst create mode 100644 src/python/ensembl/tools/anno/utils/load_into_core.py diff --git a/docs/source/load_into_core.rst b/docs/source/load_into_core.rst new file mode 100644 index 0000000..bd030f6 --- /dev/null +++ b/docs/source/load_into_core.rst @@ -0,0 +1,8 @@ +Annotation loading into core db - Module Documentation +=================================== + +.. automodule:: ensembl.tools.anno.load_into_core + :members: + :undoc-members: + :show-inheritance: + diff --git a/src/python/ensembl/tools/anno/utils/load_into_core.py b/src/python/ensembl/tools/anno/utils/load_into_core.py new file mode 100644 index 0000000..10b930e --- /dev/null +++ b/src/python/ensembl/tools/anno/utils/load_into_core.py @@ -0,0 +1,435 @@ +# pylint: disable=missing-module-docstring +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Loads GTF records for multiple features into an Ensembl database. 
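For illustration only: the loader defined in this new module is driven by a features mapping (feature directory to a tuple of batch size, load type and analysis name) together with a comma-separated db_details string. The sketch below shows a direct call; the module import path is inferred from the file location created by this patch, and the paths and database credentials are placeholders, not values from the patch.

from pathlib import Path

from ensembl.tools.anno.utils.load_into_core import load_results_to_ensembl_db

# Hypothetical run layout and credentials; adjust to the real setup.
features = {
    # feature directory -> (batch size, load type, analysis name)
    "annotation_output": (200, "gene", "ensembl"),
    "rfam_output": (500, "gene", "ncrna"),
    "dust_output": (500, "single_line_feature", "dust"),
}
db_details = "species_core_db,mysql-host,3306,writer,writer_pass"

load_results_to_ensembl_db(
    genome_file=Path("genome.fa"),
    main_output_dir=Path("anno_output"),
    db_details=db_details,
    features=features,
    num_threads=4,
    single_transcript_genes_loading=False,
)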
+""" +__all__ = ["load_results_to_ensembl_db"] + +import argparse +import logging +import logging.config +import multiprocessing +from pathlib import Path +import gc +import re +import subprocess +from typing import Generator, List, Dict +import tempfile +from ensembl.tools.anno.utils._utils import ( + create_dir, +) + + +def load_results_to_ensembl_db( + genome_file: Path, + main_output_dir: Path, + db_details: str, + features: Dict[str, tuple[int, str, str]], + num_threads: int = 1, + single_transcript_genes_loading: bool = False, +) -> None: + """ + Loads GTF records for multiple features into an Ensembl database. + + This function processes each feature defined in `features`, batches the associated GTF records, + and then loads them into the Ensembl database using the `generic_load_records_to_ensembl_db` function. + + :param genome_file: Path to the genome file used for the loading process. + :type genome_file: Path + :param main_output_dir: The directory where the feature directories and GTF files are stored. + :type main_output_dir: Path + :param db_details: Comma-separated database connection details \ + (db_name, db_host, db_port, db_user, db_pass). + :type db_details: str + :param features: A dictionary with feature names as keys, and tuples \ + with batch size, load type (e.g., 'gene'), and analysis name (e.g., 'ensembl'). + :type features: (Dict[str, Tuple[int, str, str]]) + :param num_threads: The number of threads to use for loading. + :type num_threads: int, default 1. + :param single_transcript_genes_loading: Flag to indicate whether to load single transcript genes. + :type single_transcript_genes_loading: bool, default False. + + """ + db_loading_script = ( + Path(__file__).parents[5] + / "perl" + / "ensembl" + / "tools" + / "anno" + / "support_scripts_perl" + / "load_gtf_ensembl.pl" + ) + db_loading_dir = create_dir(main_output_dir, "db_loading") + # Process each selected feature + for feature in features: + # Retrieve settings for the current feature + batch_size, load_type, analysis_name = features[feature] + + # Construct the path to the GTF file for the feature + annotation_results_gtf_file = main_output_dir / feature / "annotation.gtf" + + # Check if the file exists and load the feature + if annotation_results_gtf_file.exists(): + logging.info("Loading %s geneset to db", feature) + gtf_records = batch_gtf_records(annotation_results_gtf_file, batch_size, load_type) + generic_load_records_to_ensembl_db( + single_transcript_genes_loading, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logging.error( + "Did not find the %s annotation file, so not loading. Path checked: %s", + feature, + annotation_results_gtf_file, + ) + + +def generic_load_records_to_ensembl_db( + single_transcript_genes_loading: bool, + db_loading_script: Path, + genome_file: Path, + db_details: str, + db_loading_dir: Path, + load_type: str, + analysis_name: str, + gtf_records: List[List[str]], + num_threads: int, +) -> None: + """ + Loads GTF records into the Ensembl database using multiple threads. + + This function processes the GTF records in batches and loads them asynchronously + into the database using multiprocessing. The function delegates the actual loading + of records to `multiprocess_load_records_to_ensembl_db` for each batch. + + Args: + single_transcript_genes_loading (str): Option for loading transcript separately without + defining the canonical. 
+ db_loading_script (Path): Path to the Perl script used for loading records into Ensembl. + genome_file (Path): Path to the genome file. + db_details (str): Database details. + db_loading_dir (Path): Directory for storing temporary files related to DB loading. + load_type (str): Type of the records being loaded (e.g., "gene", "transcript"). + analysis_name (str): Name of the analysis being performed. + gtf_records (List[List[str]]): A list of GTF record batches to be loaded. + num_threads (int): Number of threads to use for loading records. + + Returns: + None: This function performs an asynchronous operation and doesn't return any value. + """ + pool = multiprocessing.Pool(int(num_threads))#pylint:disable=consider-using-with + for record_batch in gtf_records: + pool.apply_async( + multiprocess_load_records_to_ensembl_db, + args=( + single_transcript_genes_loading, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + record_batch, + ), + ) + try: + pool.close() + pool.join() + except Exception as e:#pylint:disable=broad-exception-caught + logging.error("Error during pool shutdown: %s", e) + finally: + pool.terminate() + + +def multiprocess_load_records_to_ensembl_db( + single_transcript_genes_loading: bool, + db_loading_script: str, + genome_file: str, + db_details: str, + output_dir: str, + load_type: str, + analysis_name: str, + record_batch: List[str], +) -> None: + """ + Loads a batch of GTF records into an Ensembl database using a temporary GTF file. + + This function writes the provided `record_batch` to a temporary GTF file, constructs + a command to load the records into the database using a Perl script, and executes it + via a subprocess. After execution, the temporary GTF file is removed. + + Args: + single_transcript_genes_loading (bool): Flag to indicate if single transcript genes + should be loaded. + db_loading_script (str): Path to the Perl script used to load records into the database. + genome_file (str): Path to the genome file to be used by the loading script. + db_details (str): Comma-separated string containing database connection details + (db_name, db_host, db_port, db_user, db_pass). + output_dir (str): Directory where the temporary GTF file will be stored. + load_type (str): Type of the record being loaded (e.g., "gene"). + analysis_name (str): Name of the analysis being performed (e.g., "ensembl"). + record_batch (List[str]): List of GTF records (as strings) to be written to the temporary file. + + Returns: + None: This function performs a subprocess call and does not return any value. 
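As a rough illustration of what this function hands to the Perl loader for a "gene" batch with the "ensembl" analysis: every value below is a placeholder (including the temporary GTF path), only the flag layout follows the command built in the code.

# Approximate shape of loading_cmd for load_type="gene", analysis_name="ensembl".
example_cmd = [
    "perl", "/path/to/support_scripts_perl/load_gtf_ensembl.pl",
    "-genome_file", "genome.fa",
    "-dbname", "species_core_db",
    "-host", "mysql-host",
    "-port", "3306",
    "-user", "writer",
    "-pass", "writer_pass",
    "-gtf_file", "/anno_output/db_loading/tmpab12cd",
    "-analysis_name", "ensembl",
    "-load_type", "gene",
    "-protein_coding_biotype", "anno_protein_coding",
    "-non_coding_biotype", "anno_lncRNA",
]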
+    """
+    with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=output_dir) as gtf_temp_out:
+        for line in record_batch:
+            gtf_temp_out.write(line)
+        gtf_temp_file_path = gtf_temp_out.name
+
+    (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",")
+
+    loading_cmd = [
+        "perl",
+        str(db_loading_script),
+        "-genome_file",
+        str(genome_file),
+        "-dbname",
+        db_name,
+        "-host",
+        db_host,
+        "-port",
+        str(db_port),
+        "-user",
+        db_user,
+        "-pass",
+        db_pass,
+        "-gtf_file",
+        gtf_temp_file_path,
+        "-analysis_name",
+        analysis_name,
+        "-load_type",
+        load_type,
+    ]
+
+    if load_type == "gene" and analysis_name == "ensembl":
+        loading_cmd.extend(
+            [
+                "-protein_coding_biotype",
+                "anno_protein_coding",
+                "-non_coding_biotype",
+                "anno_lncRNA",
+            ]
+        )
+
+    if single_transcript_genes_loading:
+        loading_cmd.append("-make_single_transcript_genes")
+
+    logging.info(" ".join(loading_cmd))
+    try:
+        # Run the loading command as a subprocess
+        subprocess.run(loading_cmd, check=True)
+    except subprocess.CalledProcessError as e:
+        logging.error("Error executing loading command: %s", e)
+    finally:
+        # Clean up the temporary GTF file
+        try:
+            Path(gtf_temp_file_path).unlink()
+            logging.info("Temporary file %s removed.", gtf_temp_file_path)
+        except OSError as e:
+            logging.error("Error removing temporary file %s, %s", gtf_temp_file_path, e)
+        # Run garbage collection to free up memory
+        gc.collect()
+    logging.info("Finished processing batch from %s", gtf_temp_file_path)
+
+
+# The following function assumes the file has unique ids for the parent features.
+# It then batches them into
+# sets of records based on the batch size passed in
+def batch_gtf_records(input_gtf_file: Path, batch_size: int, record_type: str) -> List[List[str]]:
+    """
+    Processes a GTF file in batches and returns a list of record batches.
+
+    Args:
+        input_gtf_file (Path): The path to the input GTF file.
+        batch_size (int): The number of records per batch.
+        record_type (str): The type of records to process ('gene' or 'single_line_feature').
+
+    Returns:
+        List[List[str]]: A list of batches, each containing GTF lines as lists of strings.
+    """
+
+    def read_gtf_lines(gtf_file: Path) -> Generator[str, None, None]:
+        """
+        Generator that reads and yields valid GTF lines from the file.
+
+        Args:
+            gtf_file (Path): The path to the input GTF file.
+
+        Yields:
+            str: A valid GTF line.
+ """ + with open(gtf_file) as gtf_in: + for line in gtf_in: + if re.match(r"^#", line): # Skip comment lines + continue + elems = line.split("\t") + if len(elems) == 9: # Ensure valid GTF line + yield line + + records: List[List[str]] = [] + current_record_batch: List[str] = [] + record_counter: int = 0 + current_gene_id: str = "" + + # Iterate over the GTF lines + for line in read_gtf_lines(input_gtf_file): + if record_type == "gene": + match = re.search(r'gene_id "([^"]+)"', line) + if match: + gene_id = match.group(1) + + if not current_gene_id: + record_counter += 1 + current_gene_id = gene_id + + if gene_id != current_gene_id: + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + current_gene_id = gene_id + + current_record_batch.append(line) + + elif record_type == "single_line_feature": + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + + current_record_batch.append(line) + + # Append the last batch of records + if current_record_batch: + records.append(current_record_batch) + + return records + + +def build_feature_settings( + args: argparse.Namespace, repeatmasker_analysis: str = "repeatmask_repbase_human" +) -> Dict[str, tuple[int, str, str]]: + """ + Builds the feature settings dictionary based on the enabled flags from the argument object. + + This function uses the provided argument flags (`args`) to determine which features + to include in the returned settings dictionary. The dictionary contains the configuration + for each feature, including batch size, record type, and analysis name. + + Args: + args (object): An object containing flags (e.g., `args.annotation`, `args.rfam`) + that determine which features should be included. + repeatmasker_analysis (str, optional): The analysis name for RepeatMasker output. + Defaults to "repeatmask_repbase_human". + + Returns: + Dict[str, tuple[int, str, str]]: A dictionary where keys are feature names (e.g., "annotation_output") + and values are tuples containing batch size (int), + record type (str), and analysis name (str). 
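A minimal sketch of calling the batching helper, assuming the module path created by this patch; the toy GTF content is made up purely to show the expected nine tab-separated columns and the gene_id/transcript_id attributes the function looks for.

from pathlib import Path
import tempfile

from ensembl.tools.anno.utils.load_into_core import batch_gtf_records

# Two toy gene records, one transcript line each, nine tab-separated GTF columns.
toy_lines = [
    "\t".join(["chr1", "anno", "transcript", "1", "100", ".", "+", ".",
               'gene_id "g1"; transcript_id "g1.t1";']) + "\n",
    "\t".join(["chr1", "anno", "transcript", "200", "300", ".", "+", ".",
               'gene_id "g2"; transcript_id "g2.t1";']) + "\n",
]
with tempfile.NamedTemporaryFile("w", suffix=".gtf", delete=False) as tmp_gtf:
    tmp_gtf.writelines(toy_lines)

batches = batch_gtf_records(Path(tmp_gtf.name), batch_size=100, record_type="gene")
print(len(batches), [len(batch) for batch in batches])  # one batch holding both gene lines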
+ """ + # Base configurations for all features + all_features = { + "annotation_output": (200, "gene", "ensembl"), + "rfam_output": (500, "gene", "ncrna"), + "trnascan_output": (500, "gene", "ncrna"), + "dust_output": (500, "single_line_feature", "dust"), + "red_output": (500, "single_line_feature", "repeatdetector"), + "trf_output": (500, "single_line_feature", "trf"), + "repeatmasker_output": (500, "single_line_feature", repeatmasker_analysis), + "cpg_output": (500, "single_line_feature", "cpg"), + "eponine_output": (500, "single_line_feature", "eponine"), + } + + # Start with an empty dictionary + feature_settings = {} + + # Add features based on enabled flags + if args.annotation: + feature_settings["annotation_output"] = all_features["annotation_output"] + if args.rfam: + feature_settings["rfam_output"] = all_features["rfam_output"] + if args.trnascan: + feature_settings["trnascan_output"] = all_features["trnascan_output"] + if args.dust: + feature_settings["dust_output"] = all_features["dust_output"] + if args.red: + feature_settings["red_output"] = all_features["red_output"] + if args.trf: + feature_settings["trf_output"] = all_features["trf_output"] + if args.repeatmasker: + feature_settings["repeatmasker_output"] = all_features["repeatmasker_output"] + if args.cpg: + feature_settings["cpg_output"] = all_features["cpg_output"] + if args.eponine: + feature_settings["eponine_output"] = all_features["eponine_output"] + + return feature_settings + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Load multiple features into the Ensembl database.") + parser.add_argument("--output_dir", required=True, help="Output directory path") + parser.add_argument("--genome_file", required=True, help="Path for the genome fasta file") + parser.add_argument("--annotation", action="store_true", help="Enable annotation output feature") + parser.add_argument("--rfam", action="store_true", help="Enable RFAM feature") + parser.add_argument("--trnascan", action="store_true", help="Enable tRNA scan feature") + parser.add_argument("--dust", action="store_true", help="Enable dust feature") + parser.add_argument("--red", action="store_true", help="Enable RED feature") + parser.add_argument("--trf", action="store_true", help="Enable TRF feature") + parser.add_argument("--repeatmasker", action="store_true", help="Enable RepeatMasker feature") + parser.add_argument("--cpg", action="store_true", help="Enable CpG feature") + parser.add_argument("--eponine", action="store_true", help="Enable Eponine feature") + parser.add_argument( + "--db_details", + required=True, + type=str, + help="Comma-separated string containing database details ", + ) + parser.add_argument("--num_threads", type=int, default=1, help="Number of threads") + + parser.add_argument( + "--single_transcript_genes_loading", + action="store_true", + help="Flag to indicate whether to load single transcript genes (default is False).", + ) + return parser.parse_args() + + +def main(): + """Loading into core database's entry-point.""" + args = parse_args() + + # Build the feature settings dictionary dynamically + feature_setting = build_feature_settings(args, repeatmasker_analysis="repeatmask_repbase_human") + + load_results_to_ensembl_db( + args.genome_file, + args.output_dir, + args.db_details, + feature_setting, + args.num_threads, + args.single_transcript_genes_loading, + ) From a9a9ac57f862f15557fca2efaf897b800d663699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= 
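Tying the pieces together, and assuming the console-script name registered in pyproject.toml above plus the flags defined by parse_args (all paths and credentials below are placeholders): a run might be launched roughly as in the comment, and build_feature_settings turns the enabled flags into the per-feature settings used by the loader.

import argparse

from ensembl.tools.anno.utils.load_into_core import build_feature_settings

# Roughly equivalent command line (placeholder paths/credentials):
#   load_results_to_ensembl_db --genome_file genome.fa --output_dir anno_output \
#       --db_details "species_core_db,mysql-host,3306,writer,writer_pass" \
#       --annotation --rfam --num_threads 4

# parse_args() would produce a namespace like this for the flags above.
args = argparse.Namespace(
    annotation=True, rfam=True, trnascan=False, dust=False, red=False,
    trf=False, repeatmasker=False, cpg=False, eponine=False,
)
settings = build_feature_settings(args, repeatmasker_analysis="repeatmask_repbase_human")
print(settings)
# {'annotation_output': (200, 'gene', 'ensembl'), 'rfam_output': (500, 'gene', 'ncrna')}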
Date: Thu, 12 Dec 2024 16:24:27 +0000 Subject: [PATCH 19/20] updated index --- docs/source/index.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 1f4fe61..e2d9862 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -38,18 +38,24 @@ to install the project. license cmsearch + cpc2 cpg + diamond dust eponine genblast minimap red repeatmasker + rnasamba scallop star stringtie trf trnascan + + finalise genset + load into core Indices and tables ================== From 445a2b2d05f64199252b698d034a3c2a65702e12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98ens-ftricomi=E2=80=99?= Date: Thu, 12 Dec 2024 16:31:00 +0000 Subject: [PATCH 20/20] resored utils to merge --- src/python/ensembl/tools/anno/utils/_utils.py | 1447 +++++++++++------ .../ensembl/tools/anno/utils/_utils_copy.py | 899 ++++++++++ 2 files changed, 1876 insertions(+), 470 deletions(-) create mode 100644 src/python/ensembl/tools/anno/utils/_utils_copy.py diff --git a/src/python/ensembl/tools/anno/utils/_utils.py b/src/python/ensembl/tools/anno/utils/_utils.py index b4bbc72..4915d37 100644 --- a/src/python/ensembl/tools/anno/utils/_utils.py +++ b/src/python/ensembl/tools/anno/utils/_utils.py @@ -16,16 +16,17 @@ import errno import io +import json import logging import os from os import PathLike from pathlib import Path -import random import re import subprocess import shutil +import glob import tempfile -from typing import Dict, List, Optional +from typing import Dict, List logger = logging.getLogger("__name__." + __name__) @@ -80,8 +81,8 @@ def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int: Check number of transcript lines in the GTF Arg: - gtf_file: GTF file path - content_obj: Object to detect and count in the gtf i.e transcript, repeat + gtf_file: GTF file path + content_obj: Object to detect and count in the gtf i.e transcript, repeat Return: Number of occurences """ @@ -126,7 +127,7 @@ def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dic if len(current_seq) > min_seq_length: seq_region_to_length[current_header] = len(current_seq) - # logging.info("length of the seq region %s", seq_region_to_length) + #logging.info("length of the seq region %s", seq_region_to_length) return seq_region_to_length @@ -166,7 +167,7 @@ def get_slice_id( if (end - start + 1) >= min_length: slice_ids_per_region.append([seq_region, start, end]) start = end + 1 - # logging.info("list of slice ids %s", str(slice_ids_per_region)) + #logging.info("list of slice ids %s", str(slice_ids_per_region)) return slice_ids_per_region @@ -252,7 +253,9 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches # Example key KS8000.rs1.re1000000.1 gene_id_slice = input_file.name + "." + str(current_gene_id) # Example key KS8000.rs1.re1000000.1.transcript.1 - transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + transcript_id_slice = ( + gene_id_slice + "." + str(current_transcript_id) + ) # If there is no existing entry, the gene key will be added # and the gene counter is incremented. 
# gene_id "gene1"; transcript_id "gene1.t1" @@ -277,12 +280,16 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches + ".t" + str(transcript_id_count_gene_id[gene_id_slice]) ) - gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + gene_transcript_id_collection[ + transcript_id_slice + ] = new_transcript_id transcript_id_count_gene_id[gene_id_slice] += 1 else: # If a transcript of the same set is already present, # the new id with the incremented counter is added - new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + new_transcript_id = gene_transcript_id_collection[ + transcript_id_slice + ] attribs = re.sub( full_transcript_id_string, 'transcript_id "' + new_transcript_id + '"', @@ -295,15 +302,13 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches else: if new_id_prefix == "repeatmask": match_feature_type = re.search( - r"(" + feature_id_label + r"\d+)", + r"(" + feature_id_label + "\d+)", line, ) else: - match_feature_type = ( - re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', - r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", - line, - ) + match_feature_type = re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, ) if match_feature_type: full_feature_id = match_feature_type.group(1) @@ -321,8 +326,6 @@ def slice_output_to_gtf( # pylint: disable=too-many-branches "Feature type not recognised, will skip. Feature type: %s", values[2], ) - - def get_sequence( seq_region: str, start: int, @@ -351,7 +354,9 @@ def get_sequence( "get_sequence %s", f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", ) - with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=str(output_dir) + ) as bed_temp_file: bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") bed_temp_file.close() bedtools_command = [ @@ -362,10 +367,10 @@ def get_sequence( "-bed", str(bed_temp_file.name), ] - bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with - bedtools_command, stdout=subprocess.PIPE - ) # pylint:disable=consider-using-with - for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + bedtools_output = subprocess.Popen(bedtools_command, stdout=subprocess.PIPE) + for idx, line in enumerate( + io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8") # type: ignore + ): if idx == 1: if strand == 1: sequence = line.rstrip() @@ -399,155 +404,958 @@ def check_file(file_path: Path) -> None: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) -def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]: - """ - Save fastq sequences in a dictionary - fasta_lines: List[str]: List of sequences - """ - index: Dict[str, str] = {} - header: Optional[str] = None - seq: List[str] = [] - - for line in fasta_lines: - line = line.strip() - header_match = re.match(r">(.+)\n$", line) - if header_match: - if header: - index[header] = "".join(seq) - header = None - header = header_match.group(1) - seq = [] - else: - seq.append(line) - # Add the last sequence - if header: - index[header] = "".join(seq) - return index - +def load_results_to_ensembl_db( + main_script_dir, + load_to_ensembl_db, + genome_file, + main_output_dir, + db_details, + num_threads, + ): + db_loading_script = os.path.join( + main_script_dir, "support_scripts_perl", "load_gtf_ensembl.pl" + ) + 
db_loading_dir = create_dir(main_output_dir, "db_loading") -# def fasta_to_dict(fasta_lines:List[str]): -# index = {} -# for line in fasta_lines: -# #it = iter(fasta_lines) -# #for header in it: -# match = re.search(r">(.+)\n$", line) -# header = match.group(1) -# seq = next(line) -# index[header] = seq -# return index + # Should collapse this into a function + annotation_results_gtf_file = os.path.join( + main_output_dir, "annotation_output", "annotation.gtf" + ) + if os.path.exists(annotation_results_gtf_file): + logger.info("Loading main geneset to db") + batch_size = 200 + load_type = "gene" + analysis_name = "ensembl" + gtf_records = batch_gtf_records( + annotation_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find the main gene annotation file, so not loading. Path checked:\n" + + annotation_results_gtf_file + ) + rfam_results_gtf_file = os.path.join(main_output_dir, "rfam_output", "annotation.gtf") + if os.path.exists(rfam_results_gtf_file): + logger.info("Loading Rfam-based sncRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + rfam_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Rfam annotation file, so not loading. Path checked:\n" + + rfam_results_gtf_file + ) -def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List: - """ - The protein dataset file is splitted by a number of sequence equals to the batch_size - in batch files stored in 10 output directories. - protein_dataset : Path for the protein dataset. - output_dir : Output directory path. - batch_size : Size of the batch, it needs to be equals to the number of threads - to parallelise the sequence processing for each file. 
- """ - batched_protein_files = [] - - for i in range(0, 10): - create_dir(output_dir, (f"bin_{i}")) - with open(protein_dataset, "r", encoding="utf8") as file_in: - seq_count = 0 - batch_count = 0 - current_record = "" - initial_seq = True - for line in file_in: - match = re.search(r">(.+)$", line) - # match header and is not first sequence, if the number of stored sequences in each file equals - # the number of batch_size, a new file will be created and the current_record reset - if match and not initial_seq and seq_count % batch_size == 0: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batch_count += 1 - seq_count += 1 - current_record = line - batched_protein_files.append(batch_file) - # match header and is the first sequence - elif match: - current_record += line - initial_seq = False - seq_count += 1 - # other lines - else: - current_record += line + trnascan_results_gtf_file = os.path.join( + main_output_dir, "trnascan_output", "annotation.gtf" + ) + if os.path.exists(trnascan_results_gtf_file): + logger.info("Loading tRNAScan-SE tRNA genes to db") + batch_size = 500 + load_type = "gene" + analysis_name = "ncrna" + gtf_records = batch_gtf_records( + trnascan_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an tRNAScan-SE annotation file, so not loading. Path checked:\n" + + trnascan_results_gtf_file + ) - if current_record: - bin_num = random.randint(0, 9) - batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa" - with batch_file.open("w+") as file_out: - file_out.write(current_record) - batched_protein_files.append(batch_file) - return batched_protein_files + dust_results_gtf_file = os.path.join(main_output_dir, "dust_output", "annotation.gtf") + if os.path.exists(dust_results_gtf_file): + logger.info("Loading Dust repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "dust" + gtf_records = batch_gtf_records( + dust_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Dust annotation file, so not loading. Path checked:\n" + + dust_results_gtf_file + ) + red_results_gtf_file = os.path.join(main_output_dir, "red_output", "annotation.gtf") + if os.path.exists(red_results_gtf_file): + logger.info("Loading Red repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "repeatdetector" + gtf_records = batch_gtf_records( + red_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a Red annotation file, so not loading. 
Path checked:\n" + + red_results_gtf_file + ) -def run_command(cmd) -> None: - """Submit command in multiprocessing action + trf_results_gtf_file = os.path.join(main_output_dir, "trf_output", "annotation.gtf") + if os.path.exists(trf_results_gtf_file): + logger.info("Loading TRF repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "trf" + gtf_records = batch_gtf_records( + trf_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a TRF annotation file, so not loading. Path checked:\n" + + trf_results_gtf_file + ) - Args: - cmd (List[str]): command to execute - """ - try: - result = subprocess.run(cmd, check=True, capture_output=True, text=True) - logger.info("Command executed successfully: %s", cmd) - logger.info("Output: %s", result.stdout) - except subprocess.CalledProcessError as e: - logger.error("Command failed: %s", cmd) - logger.error("Error: %s", e.stderr) + cpg_results_gtf_file = os.path.join(main_output_dir, "cpg_output", "annotation.gtf") + if os.path.exists(cpg_results_gtf_file): + logger.info("Loading CpG islands to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "cpg" + gtf_records = batch_gtf_records( + cpg_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find a CpG annotation file, so not loading. Path checked:\n" + + cpg_results_gtf_file + ) + eponine_results_gtf_file = os.path.join( + main_output_dir, "eponine_output", "annotation.gtf" + ) + if os.path.exists(eponine_results_gtf_file): + logger.info("Loading Eponine repeats to db") + batch_size = 500 + load_type = "single_line_feature" + analysis_name = "eponine" + gtf_records = batch_gtf_records( + eponine_results_gtf_file, batch_size, db_loading_dir, load_type + ) + generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ) + else: + logger.error( + "Did not find an Eponine annotation file, so not loading. Path checked:\n" + + eponine_results_gtf_file + ) -def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]: - """ - Parses a GTF file to extract gene, transcript, and exon information. + logger.info("Finished loading records to db") + + +def generic_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + gtf_records, + num_threads, + ): + pool = multiprocessing.Pool(int(num_threads)) + for record_batch in gtf_records: + pool.apply_async( + multiprocess_load_records_to_ensembl_db, + args=( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + db_loading_dir, + load_type, + analysis_name, + record_batch, + ), + ) - Args: - gtf_file: Path to the GTF file. 
+ pool.close() + pool.join() + + +def multiprocess_load_records_to_ensembl_db( + load_to_ensembl_db, + db_loading_script, + genome_file, + db_details, + output_dir, + load_type, + analysis_name, + record_batch, + ): + with tempfile.NamedTemporaryFile( + mode="w+t", delete=False, dir=output_dir + ) as gtf_temp_out: + for line in record_batch: + gtf_temp_out.writelines(line) + gtf_temp_file_path = gtf_temp_out.name + + (db_name, db_host, db_port, db_user, db_pass) = db_details.split(",") + + loading_cmd = [ + "perl", + db_loading_script, + "-genome_file", + genome_file, + "-dbname", + db_name, + "-host", + db_host, + "-port", + str(db_port), + "-user", + db_user, + "-pass", + db_pass, + "-gtf_file", + gtf_temp_file_path, + "-analysis_name", + analysis_name, + "-load_type", + load_type, + ] - Returns: - A dictionary containing gene information, organized by gene_id, transcript_id, and features. - """ - gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} + if load_type == "gene" and analysis_name == "ensembl": + loading_cmd.extend( + [ + "-protein_coding_biotype", + "anno_protein_coding", + "-non_coding_biotype", + "anno_lncRNA", + ] + ) - with open(gtf_file, "r") as gtf_in: - for line in gtf_in: - eles = line.strip().split("\t") - if len(eles) != 9: + if load_to_ensembl_db == "single_transcript_genes": + loading_cmd.append("-make_single_transcript_genes") + + logger.info(" ".join(loading_cmd)) + subprocess.run(loading_cmd) + gtf_temp_out.close() + os.remove(gtf_temp_file_path) # doesn't seem to be working + logger.info("Finished: " + gtf_temp_file_path) + gc.collect() + + +def batch_gtf_records(input_gtf_file, batch_size, output_dir, record_type): + records = [] + gtf_in = open(input_gtf_file) + if record_type == "gene": + # NOTE that the neverending variations on GTF reading/writing/merging + # is becoming very messy + # need to create a set of utility methods outside of this script + # This one assumes the file has unique ids for the parent features. 
+ # It then batches them into + # sets of records based on the batch size passed in + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() continue - match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) - if not match: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() continue + match = re.search(r'gene_id "([^"]+)"', line) gene_id = match.group(1) - transcript_id = match.group(2) - feature_type = eles[2] - if gene_id not in gtf_genes: - gtf_genes[gene_id] = {} + if not current_gene_id: + record_counter += 1 + current_gene_id = gene_id + + if not gene_id == current_gene_id: + record_counter += 1 + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + current_gene_id = gene_id + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) + + elif record_type == "single_line_feature": + record_counter = 0 + current_record_batch = [] + current_gene_id = "" + line = gtf_in.readline() + while line: + if re.search(r"^#", line): + line = gtf_in.readline() + continue + + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + record_counter += 1 + + if record_counter % batch_size == 0: + records.append(current_record_batch) + current_record_batch = [] + + current_record_batch.append(line) + line = gtf_in.readline() + + records.append(current_record_batch) - if feature_type == "transcript": - gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} - elif feature_type == "exon": - if transcript_id in gtf_genes[gene_id]: - gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) + gtf_in.close() + + return records + + +def run_find_orfs(genome_file, main_output_dir): + min_orf_length = 600 + + orf_output_dir = create_dir(main_output_dir, "orf_output") + seq_region_lengths = get_seq_region_lengths(genome_file, 5000) + for region_name in seq_region_lengths: + region_length = seq_region_lengths[region_name] + seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) + for phase in range(0, 6): + find_orf_phased_region( + region_name, seq, phase, min_orf_length, orf_output_dir + ) + + +def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): + current_index = phase + orf_counter = 1 + if phase > 2: + seq = reverse_complement(seq) + current_index = current_index % 3 + + orf_file_path = os.path.join( + orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa") + ) + orf_out = open(orf_file_path, "w+") + + while current_index < len(seq): + codon = seq[current_index : current_index + 3] + if codon == "ATG": + orf_seq = codon + for j in range(current_index + 3, len(seq), 3): + next_codon = seq[j : j + 3] + if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": + orf_seq += next_codon + if len(orf_seq) >= min_orf_length: + orf_out.write( + ">" + + region_name + + "_phase" + + str(phase) + + "_orf" + + str(orf_counter) + + "\n" + ) + orf_out.write(orf_seq + "\n") + orf_counter += 1 + orf_seq = "" + break + + # If there's another met in phase, then put i to the start of + # the codon after j so that only the longest ORF is found + if next_codon == "ATG": + current_index = j + 3 + orf_seq += next_codon + current_index += 3 + orf_out.close() + +def bed_to_gff(input_dir, hints_file): + gff_out = open(hints_file, "w+") + 
exons_dict = {} + for bed_file in glob.glob(input_dir + "/*.bed"): + logger.info("Processing file for hints:") + logger.info(bed_file) + bed_in = open(bed_file) + bed_lines = bed_in.readlines() + for line in bed_lines: + line = line.rstrip() + elements = line.split("\t") + seq_region_name = elements[0] + offset = int(elements[1]) + hit_name = elements[3] + strand = elements[5] + block_sizes = elements[10].split(",") + block_sizes = list(filter(None, block_sizes)) + block_starts = elements[11].split(",") + block_starts = list(filter(None, block_starts)) + exons = bed_block_to_exons(block_sizes, block_starts, offset) + for i, element in enumerate(exons): + exon_coords = exons[i] + exon_key = ( + seq_region_name + + ":" + + exon_coords[0] + + ":" + + exon_coords[1] + + ":" + + strand + ) + if exon_key in exons_dict: + exons_dict[exon_key][5] += 1 + else: + gff_list = [ + seq_region_name, + "CDNA", + "exon", + exon_coords[0], + exon_coords[1], + 1.0, + strand, + ".", + ] + exons_dict[exon_key] = gff_list + + for exon_key, gff_list in exons_dict.items(): + gff_list[5] = str(gff_list[5]) + gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" + gff_out.write(gff_line) + + gff_out.close() + + sorted_hints_out = open((hints_file + ".srt"), "w+") + subprocess.run( + ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], + stdout=sorted_hints_out, + ) + sorted_hints_out.close() + +def splice_junction_to_gff(input_dir, hints_file): + sjf_out = open(hints_file, "w+") + + for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): + sjf_in = open(sj_tab_file) + sjf_lines = sjf_in.readlines() + for line in sjf_lines: + elements = line.split("\t") + strand = "+" + # If the strand is undefined then skip, Augustus expects a strand + if elements[3] == "0": + continue + elif elements[3] == "2": + strand = "-" + + junction_length = int(elements[2]) - int(elements[1]) + 1 + if junction_length < 100: + continue + + if not elements[4] and elements[7] < 10: + continue + + # For the moment treat multimapping and single mapping things as a combined score + score = float(elements[6]) + float(elements[7]) + score = str(score) + output_line = [ + elements[0], + "RNASEQ", + "intron", + elements[1], + elements[2], + score, + strand, + ".", + ("src=W;mul=" + score + ";"), + ] + sjf_out.write("\t".join(output_line) + "\n") + + sjf_out.close() + + +def split_genome(genome_file, target_dir, min_seq_length): + # This is the lazy initial way of just splitting into a dir + # of files based on the toplevel sequence with a min sequence length filter + # There are a couple of obvious improvements: + # 1) Instead of making files for all seqs, just process N seqs + # parallel, where N = num_threads. Then you could clean up the seq file + # after each seq finishes, thus avoiding potentially + # having thousands of file in a dir + # 2) Split the seq into even slices and process these in parallel + # (which the same cleanup as in 1). For sequences smaller than the + # target slice size, bundle them up together into a single file. Vastly + # more complex, partially implemented in the splice_genome method + # Allows for more consistency with parallelisation (since there should + # be no large outliers). 
But require a mapping strategy for the + # coords and sequence names and all the hints will need to be adjusted + current_header = "" + current_seq = "" + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + print( + "Found an existing split file, so will not overwrite. File found:" + ) + print(file_out_name) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + line = file_in.readline() + + if len(current_seq) > min_seq_length: + file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) + if not os.path.exists(file_out_name): + file_out = open(file_out_name, "w+") + file_out.write(">" + current_header + "\n" + current_seq + "\n") + file_out.close() + + else: + logger.info( + "Found an existing split file, so will not overwrite. File found:" + ) + logger.info(file_out_name) + + file_in.close() + + +def update_gtf_genes(parsed_gtf_genes, combined_results, validation_type): + output_lines = [] + + for gene_id in parsed_gtf_genes.keys(): + transcript_ids = parsed_gtf_genes[gene_id].keys() + for transcript_id in transcript_ids: + transcript_line = parsed_gtf_genes[gene_id][transcript_id]["transcript"] + single_cds_exon_transcript = 0 + translation_match = re.search( + r'; translation_coords "([^"]+)";', transcript_line + ) + if translation_match: + translation_coords = translation_match.group(1) + translation_coords_list = translation_coords.split(":") + # If the start exon coords of both exons are the same, + # then it's the same exon and thus a single exon cds + if translation_coords_list[0] == translation_coords_list[3]: + single_cds_exon_transcript = 1 + + exon_lines = parsed_gtf_genes[gene_id][transcript_id]["exons"] + validation_results = combined_results[transcript_id] + rnasamba_coding_probability = float(validation_results[0]) + rnasamba_coding_potential = validation_results[1] + cpc2_coding_probability = float(validation_results[2]) + cpc2_coding_potential = validation_results[3] + transcript_length = int(validation_results[4]) + peptide_length = int(validation_results[5]) + diamond_e_value = None + if len(validation_results) == 7: + diamond_e_value = validation_results[6] + + avg_coding_probability = ( + rnasamba_coding_probability + cpc2_coding_probability + ) / 2 + max_coding_probability = max( + rnasamba_coding_probability, cpc2_coding_probability + ) + + match = re.search(r'; biotype "([^"]+)";', transcript_line) + biotype = match.group(1) + if biotype == "busco" or biotype == "protein": + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + continue + + min_single_exon_pep_length = 100 + min_multi_exon_pep_length = 75 + min_single_source_probability = 0.8 + min_single_exon_probability = 0.9 + + # Note that the below looks at validating things + # under different levels of strictness + # There are a few different continue statements, + # where transcripts will be skipped resulting + # in a smaller post validation file. 
It mainly + # removes single coding exon genes with no real + # support or for multi-exon lncRNAs that are less than 200bp long + if single_cds_exon_transcript == 1 and validation_type == "relaxed": + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + elif single_cds_exon_transcript == 1 and validation_type == "moderate": + if ( + diamond_e_value is not None + and peptide_length >= min_single_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + ) + and peptide_length >= min_single_exon_pep_length + and avg_coding_probability >= min_single_exon_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + else: + continue + else: + if diamond_e_value is not None: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + rnasamba_coding_potential == "coding" + and cpc2_coding_potential == "coding" + and peptide_length >= min_multi_exon_pep_length + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif ( + ( + rnasamba_coding_potential == "coding" + or cpc2_coding_potential == "coding" + ) + and peptide_length >= min_multi_exon_pep_length + and max_coding_probability >= min_single_source_probability + ): + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "protein_coding";', + transcript_line, + ) + elif transcript_length >= 200: + transcript_line = re.sub( + '; biotype "' + biotype + '";', + '; biotype "lncRNA";', + transcript_line, + ) + transcript_line = re.sub( + ' translation_coords "[^"]+";', "", transcript_line + ) + else: + continue + + output_lines.append(transcript_line) + output_lines.extend(exon_lines) + + return output_lines + + +def read_gtf_genes(gtf_file): + gtf_genes = {} + gtf_in = open(gtf_file) + line = gtf_in.readline() + while line: + eles = line.split("\t") + if not len(eles) == 9: + line = gtf_in.readline() + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', line) + + if not match: + line = gtf_in.readline() + continue + + gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {} + gtf_genes[gene_id][transcript_id]["transcript"] = line + gtf_genes[gene_id][transcript_id]["exons"] = [] + elif feature_type == "exon": + gtf_genes[gene_id][transcript_id]["exons"].append(line) + line = gtf_in.readline() + gtf_in.close() return gtf_genes -def 
check_transcriptomic_output(main_output_dir: Path) -> None: - """Check across the various transcriptomic if there is a reasonable amount of data +def fasta_to_dict(fasta_list): + index = {} + it = iter(fasta_list) + for header in it: + match = re.search(r">(.+)\n$", header) + header = match.group(1) + seq = next(it) + index[header] = seq + return index - Args: - main_output_dir (Path): Output dir path - Raises: - IOError: If no lines or fewer than the minimum expected lines are found in the output files. - """ +# def merge_gtf_files(file_paths,id_label): + +# gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# gtf_out = open(gtf_file_path,'w+') +# for gtf_file_path in gtf_files: +# gtf_file_name = os.path.basename(gtf_file_path) +# match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# start_offset = int(match.group(1)) +# gtf_in = open(gtf_file_path,'r') +# line = gtf_in.readline() +# while line: +# values = line.split("\t") +# if len(values) == 9 and (values[2] in feature_types): +# values[3] = str(int(values[3]) + (start_offset - 1)) +# values[4] = str(int(values[4]) + (start_offset - 1)) +# gtf_out.write("\t".join(values)) +# line = gtf_in.readline() +# gtf_in.close() +# gtf_out.close() + + +def multiprocess_generic(cmd): + print(" ".join(cmd)) + subprocess.run(cmd) + + +def seq_region_names(genome_file): + region_list = [] + + file_in = open(genome_file) + line = file_in.readline() + while line: + match = re.search(r">([^\s]+)", line) + if match: + region_name = match.group(1) + if region_name == "MT": + logger.info("Skipping region named MT") + line = file_in.readline() + continue + else: + region_list.append(match.group(1)) + line = file_in.readline() + + return region_list + + +def slice_genome(genome_file, target_dir, target_slice_size): + # The below is sort of tested + # Without the + target_seq_length = 50000000 + min_seq_length = 1000 + current_header = "" + current_seq = "" + seq_dict = {} + for line in seq: # pylint:disable=used-before-assignment + match = re.search(r">(.+)$", line) + if match and current_header: + seq_dict[current_header] = current_seq + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + seq_dict[current_header] = current_seq + + seq_buffer = 0 + file_number = 0 + file_name = "genome_file_" + str(file_number) + + for header in seq_dict: + seq_iterator = 0 + seq = seq_dict[header] + + while len(seq) > target_seq_length: + file_out = open(os.path.join(target_dir, file_name), "w+") + subseq = seq[0:target_seq_length] + file_out.write( + ">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n" + ) + file_out.close() + seq = seq[target_seq_length:] + seq_iterator += 1 + file_number += 1 + file_name = "genome_file_" + str(file_number) + + if len(seq) >= min_seq_length: + file_name = "genome_file_" + str(file_number) + file_out = open(os.path.join(file_name), "w+") + file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") + file_out.close() + file_number += 1 + file_name = "genome_file_" + str(file_number) + + +def coallate_results(main_output_dir): + results_dir = create_dir(main_output_dir, "results") + output_dirs = [ + "augustus_output", + "cpg_output", + "dust_output", + "eponine_output", + "red_output", + "rfam_output", + "trf_output", + "trnascan_output", + ] + for output_dir in output_dirs: + match = re.search(r"(.+)_output", output_dir) + result_type = match.group(1) + results_path = os.path.join(main_output_dir, 
output_dir, "annotation.gtf") + copy_path = os.path.join(results_dir, (result_type + ".gtf")) + if os.path.exists(results_path): + cpy_cmd = ["cp", results_path, copy_path] + subprocess.run(cpy_cmd) + + + +def check_transcriptomic_output(main_output_dir): + + # This will check across the various transcriptomic + # dirs and check there's actually some data transcriptomic_dirs = [ "scallop_output", "stringtie_output", @@ -556,23 +1364,31 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: total_lines = 0 min_lines = 100000 for transcriptomic_dir in transcriptomic_dirs: - full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf" - if not full_file_path.exists(): + full_file_path = os.path.join( + main_output_dir, transcriptomic_dir, "annotation.gtf" + ) + if not os.path.exists(full_file_path): logger.warning( - "Warning, no annotation.gtf found for %s. This might be fine, \ - e.g. no long read data were provided", - transcriptomic_dir, + "Warning, no annotation.gtf found for " + + transcriptomic_dir + + ". This might be fine, e.g. no long read data were provided" ) continue - num_lines = sum(1 for _ in full_file_path.open("r")) - total_lines += num_lines - logging.info("For %s found a total of in the annotation.gtf file", num_lines) + num_lines = sum(1 for line in open(full_file_path)) + total_lines = total_lines + num_lines + logger.info( + "For " + + transcriptomic_dir + + " found a total of " + + str(num_lines) + + " in the annotation.gtf file" + ) if total_lines == 0: raise IOError( "Anno was run with transcriptomic mode enabled,\ but the transcriptomic annotation files are empty" ) - if total_lines <= min_lines: + elif total_lines <= min_lines: raise IOError( "Anno was run with transcriptomic mode enabled, \ but the total number of lines in the output \ @@ -582,318 +1398,9 @@ def check_transcriptomic_output(main_output_dir: Path) -> None: "Min allowed: " + str(min_lines) ) - - -# def run_find_orfs(genome_file, main_output_dir): -# min_orf_length = 600 - -# orf_output_dir = create_dir(main_output_dir, "orf_output") -# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) -# for region_name in seq_region_lengths: -# region_length = seq_region_lengths[region_name] -# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) -# for phase in range(0, 6): -# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) - - -# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): -# current_index = phase -# orf_counter = 1 -# if phase > 2: -# seq = reverse_complement(seq) -# current_index = current_index % 3 - -# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) -# orf_out = open(orf_file_path, "w+") - -# while current_index < len(seq): -# codon = seq[current_index : current_index + 3] -# if codon == "ATG": -# orf_seq = codon -# for j in range(current_index + 3, len(seq), 3): -# next_codon = seq[j : j + 3] -# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": -# orf_seq += next_codon -# if len(orf_seq) >= min_orf_length: -# orf_out.write( -# ">" + region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" -# ) -# orf_out.write(orf_seq + "\n") -# orf_counter += 1 -# orf_seq = "" -# break - -# # If there's another met in phase, then put i to the start of -# # the codon after j so that only the longest ORF is found -# if next_codon == "ATG": -# current_index = j + 3 -# orf_seq += next_codon -# current_index += 3 
-# orf_out.close() - - -# def bed_to_gff(input_dir, hints_file): -# gff_out = open(hints_file, "w+") -# exons_dict = {} -# for bed_file in glob.glob(input_dir + "/*.bed"): -# logger.info("Processing file for hints:") -# logger.info(bed_file) -# bed_in = open(bed_file) -# bed_lines = bed_in.readlines() -# for line in bed_lines: -# line = line.rstrip() -# elements = line.split("\t") -# seq_region_name = elements[0] -# offset = int(elements[1]) -# hit_name = elements[3] -# strand = elements[5] -# block_sizes = elements[10].split(",") -# block_sizes = list(filter(None, block_sizes)) -# block_starts = elements[11].split(",") -# block_starts = list(filter(None, block_starts)) -# exons = bed_block_to_exons(block_sizes, block_starts, offset) -# for i, element in enumerate(exons): -# exon_coords = exons[i] -# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand -# if exon_key in exons_dict: -# exons_dict[exon_key][5] += 1 -# else: -# gff_list = [ -# seq_region_name, -# "CDNA", -# "exon", -# exon_coords[0], -# exon_coords[1], -# 1.0, -# strand, -# ".", -# ] -# exons_dict[exon_key] = gff_list - -# for exon_key, gff_list in exons_dict.items(): -# gff_list[5] = str(gff_list[5]) -# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" -# gff_out.write(gff_line) - -# gff_out.close() - -# sorted_hints_out = open((hints_file + ".srt"), "w+") -# subprocess.run( -# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], -# stdout=sorted_hints_out, -# ) -# sorted_hints_out.close() - - -# def splice_junction_to_gff(input_dir, hints_file): -# sjf_out = open(hints_file, "w+") - -# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): -# sjf_in = open(sj_tab_file) -# sjf_lines = sjf_in.readlines() -# for line in sjf_lines: -# elements = line.split("\t") -# strand = "+" -# # If the strand is undefined then skip, Augustus expects a strand -# if elements[3] == "0": -# continue -# elif elements[3] == "2": -# strand = "-" - -# junction_length = int(elements[2]) - int(elements[1]) + 1 -# if junction_length < 100: -# continue - -# if not elements[4] and elements[7] < 10: -# continue - -# # For the moment treat multimapping and single mapping things as a combined score -# score = float(elements[6]) + float(elements[7]) -# score = str(score) -# output_line = [ -# elements[0], -# "RNASEQ", -# "intron", -# elements[1], -# elements[2], -# score, -# strand, -# ".", -# ("src=W;mul=" + score + ";"), -# ] -# sjf_out.write("\t".join(output_line) + "\n") - -# sjf_out.close() - - -# def split_genome(genome_file, target_dir, min_seq_length): -# # This is the lazy initial way of just splitting into a dir -# # of files based on the toplevel sequence with a min sequence length filter -# # There are a couple of obvious improvements: -# # 1) Instead of making files for all seqs, just process N seqs -# # parallel, where N = num_threads. Then you could clean up the seq file -# # after each seq finishes, thus avoiding potentially -# # having thousands of file in a dir -# # 2) Split the seq into even slices and process these in parallel -# # (which the same cleanup as in 1). For sequences smaller than the -# # target slice size, bundle them up together into a single file. Vastly -# # more complex, partially implemented in the splice_genome method -# # Allows for more consistency with parallelisation (since there should -# # be no large outliers). 
But require a mapping strategy for the -# # coords and sequence names and all the hints will need to be adjusted -# current_header = "" -# current_seq = "" - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">(.+)$", line) -# if match and current_header: -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# print("Found an existing split file, so will not overwrite. File found:") -# print(file_out_name) - -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# line = file_in.readline() - -# if len(current_seq) > min_seq_length: -# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) -# if not os.path.exists(file_out_name): -# file_out = open(file_out_name, "w+") -# file_out.write(">" + current_header + "\n" + current_seq + "\n") -# file_out.close() - -# else: -# logger.info("Found an existing split file, so will not overwrite. File found:") -# logger.info(file_out_name) - -# file_in.close() - - -# # def merge_gtf_files(file_paths,id_label): - -# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') -# # gtf_out = open(gtf_file_path,'w+') -# # for gtf_file_path in gtf_files: -# # gtf_file_name = os.path.basename(gtf_file_path) -# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) -# # start_offset = int(match.group(1)) -# # gtf_in = open(gtf_file_path,'r') -# # line = gtf_in.readline() -# # while line: -# # values = line.split("\t") -# # if len(values) == 9 and (values[2] in feature_types): -# # values[3] = str(int(values[3]) + (start_offset - 1)) -# # values[4] = str(int(values[4]) + (start_offset - 1)) -# # gtf_out.write("\t".join(values)) -# # line = gtf_in.readline() -# # gtf_in.close() -# # gtf_out.close() - - -# def multiprocess_generic(cmd): -# print(" ".join(cmd)) -# subprocess.run(cmd) - - -# def seq_region_names(genome_file): -# region_list = [] - -# file_in = open(genome_file) -# line = file_in.readline() -# while line: -# match = re.search(r">([^\s]+)", line) -# if match: -# region_name = match.group(1) -# if region_name == "MT": -# logger.info("Skipping region named MT") -# line = file_in.readline() -# continue -# else: -# region_list.append(match.group(1)) -# line = file_in.readline() - -# return region_list - - -# def slice_genome(genome_file, target_dir, target_slice_size): -# # The below is sort of tested -# # Without the -# target_seq_length = 50000000 -# min_seq_length = 1000 -# current_header = "" -# current_seq = "" -# seq_dict = {} -# for line in seq: # pylint:disable=used-before-assignment -# match = re.search(r">(.+)$", line) -# if match and current_header: -# seq_dict[current_header] = current_seq -# current_seq = "" -# current_header = match.group(1) -# elif match: -# current_header = match.group(1) -# else: -# current_seq += line.rstrip() - -# seq_dict[current_header] = current_seq - -# seq_buffer = 0 -# file_number = 0 -# file_name = "genome_file_" + str(file_number) - -# for header in seq_dict: -# seq_iterator = 0 -# seq = seq_dict[header] - -# while len(seq) > target_seq_length: -# file_out = open(os.path.join(target_dir, file_name), "w+") -# subseq = seq[0:target_seq_length] -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + 
subseq + "\n") -# file_out.close() -# seq = seq[target_seq_length:] -# seq_iterator += 1 -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - -# if len(seq) >= min_seq_length: -# file_name = "genome_file_" + str(file_number) -# file_out = open(os.path.join(file_name), "w+") -# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") -# file_out.close() -# file_number += 1 -# file_name = "genome_file_" + str(file_number) - - -# def coallate_results(main_output_dir): -# results_dir = create_dir(main_output_dir, "results") -# output_dirs = [ -# "augustus_output", -# "cpg_output", -# "dust_output", -# "eponine_output", -# "red_output", -# "rfam_output", -# "trf_output", -# "trnascan_output", -# ] -# for output_dir in output_dirs: -# match = re.search(r"(.+)_output", output_dir) -# result_type = match.group(1) -# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") -# copy_path = os.path.join(results_dir, (result_type + ".gtf")) -# if os.path.exists(results_path): -# cpy_cmd = ["cp", results_path, copy_path] -# subprocess.run(cpy_cmd) + else: + logger.info( + "Found " + + str(total_lines) + + " total lines across the transcriptomic files. Checks passed" + ) \ No newline at end of file diff --git a/src/python/ensembl/tools/anno/utils/_utils_copy.py b/src/python/ensembl/tools/anno/utils/_utils_copy.py new file mode 100644 index 0000000..b4bbc72 --- /dev/null +++ b/src/python/ensembl/tools/anno/utils/_utils_copy.py @@ -0,0 +1,899 @@ +# pylint: disable=missing-module-docstring +# See the NOTICE file distributed with this work for additional information +# regarding copyright ownership. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import errno +import io +import logging +import os +from os import PathLike +from pathlib import Path +import random +import re +import subprocess +import shutil +import tempfile +from typing import Dict, List, Optional + +logger = logging.getLogger("__name__." + __name__) + +# with open(Path(__file__).parents[6] / "conf/config.json", "r", encoding="utf8") as f: +# config = json.load(f) + + +def create_dir(input_dir: Path, dir_name: str) -> Path: + """ + Create directory or subdirectory and log operations. 
+    Args:
+        input_dir: Path to the parent output directory
+        dir_name: Optional name of the subdirectory to be created
+    Returns:
+        Path to the created directory
+    """
+    if dir_name:
+        target_dir = Path(input_dir) / str(dir_name)
+    else:
+        target_dir = input_dir
+
+    if os.path.exists(target_dir):
+        logger.warning("Directory already exists, will not create again")
+        return target_dir
+
+    logger.info("Attempting to create target dir: %s", target_dir)
+
+    try:
+        os.mkdir(target_dir)
+    except OSError:
+        logger.error("Creation of the dir failed, path used: %s", target_dir)
+    else:
+        logger.info("Successfully created the dir on the following path: %s", target_dir)
+    return target_dir
+
+
+def check_exe(exe_bin: PathLike) -> None:
+    """
+    Check executable path
+    Args:
+        exe_bin: Executable path
+
+    Raises:
+        OSError: Executable path does not exist
+    """
+    if not shutil.which(exe_bin):
+        raise OSError(f"Exe does not exist. Path checked: {exe_bin}")
+
+
+def check_gtf_content(gtf_file: PathLike, content_obj: str) -> int:
+    """
+    Count the number of lines of a given feature type in a GTF file
+
+    Args:
+        gtf_file: GTF file path
+        content_obj: Feature type to detect and count in the gtf, e.g. transcript, repeat
+
+    Return: Number of occurrences
+    """
+    obj_count = 0
+    with open(gtf_file) as gtf_in:
+        for line in gtf_in:
+            gtf_raw = line.split("\t")
+            if not len(gtf_raw) == 9:
+                continue
+            if gtf_raw[2] == content_obj:
+                obj_count += 1
+    logger.info("%s; Number of %s detected: %d", gtf_file, content_obj, int(obj_count))
+    return obj_count
+
+
+def get_seq_region_length(genome_file: PathLike, min_seq_length: int = 0) -> Dict:
+    """
+    Split the genome file according to the header and store in a dictionary
+    all the sequences whose length is greater than min_seq_length.
+    Args:
+        genome_file: Genome file path.
+        min_seq_length: Minimum slice length.
+ Return: Dictionary of sequence headers and the corresponding sequence length + """ + current_header = "" + current_seq = "" + + seq_region_to_length = {} + with open(genome_file) as file_in: + for line in file_in: + match = re.search(r">(.+)$", line) + if match and current_header: + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + + current_seq = "" + current_header = match.group(1) + elif match: + current_header = match.group(1) + else: + current_seq += line.rstrip() + + if len(current_seq) > min_seq_length: + seq_region_to_length[current_header] = len(current_seq) + # logging.info("length of the seq region %s", seq_region_to_length) + return seq_region_to_length + + +def get_slice_id( + seq_region_to_length: Dict, + slice_size: int = 1000000, + overlap: int = 0, + min_length: int = 0, +) -> List: + """ + Get list of ids for a genomic slice + Arg: + seq_region_to_length: Dictionary with the sequence headers as keys and the sequence lengths as values + slice_size: Size of the slice + overlap: Overlap length between two slices + min_length: Min length of the slice + Return: List of IDs for each genomic slice + """ + + slice_ids_per_region = [] + for seq_region in seq_region_to_length: + seq_region_length = int(seq_region_to_length[seq_region]) + if seq_region_length < min_length: + continue + + if seq_region_length <= slice_size: + slice_ids_per_region.append([seq_region, 1, seq_region_length]) + continue + + start = 1 + end = start + slice_size - 1 + while end < seq_region_length: + start = start - overlap + start = max(start, 1) + end = start + slice_size - 1 + end = min(end, seq_region_length) + if (end - start + 1) >= min_length: + slice_ids_per_region.append([seq_region, start, end]) + start = end + 1 + # logging.info("list of slice ids %s", str(slice_ids_per_region)) + return slice_ids_per_region + + +def update_string(text, lookup_text, new_text) -> str: + """ + Update substring + Args: + text : Text. + lookup_text : String to look for. + new_text : String to substitute. + Return + Text with string substitution + """ + text = re.sub( + lookup_text, + new_text, + text, + ) + return text + + +# slice_output_to_gtf(repeatmasker_dir, "repeat_id", "repeatmask", True, ".rm.gtf") +def slice_output_to_gtf( # pylint: disable=too-many-branches + output_dir: Path, + feature_id_label: str = "", + new_id_prefix: str = "", + unique_ids: bool = True, + file_extension: str = ".gtf", +) -> None: + """ + Collect all the gtf files per file extension and merge them in a final gtf file assigning unique ids. + + This holds keys of the current slice details with the gene id to form unique keys. + Each time a new key is added the overall gene counter is incremented + and the value of the key is set to the new gene id. Any subsequent + lines with the same region/gene id key will then just get + the new id without incrementing the counter. + + Args: + output_dir : Output directory. + feature_id_label : Feature identifier. + new_id_prefix : New feature identifier. + unique_ids : If True assign unique ids for the same feature type. + file_extension : Input file extension. 
+ """ + feature_types = ["exon", "transcript", "repeat", "simple_feature"] + # Initialise gene and feature counter + gene_counter = 1 + feature_counter = 1 + # Initialise dictionaries that will store the list of gene and transcript indexes + gene_id_collection = {} + gene_transcript_id_collection = {} + transcript_id_count_gene_id = {} # one gene can have multiple transcripts + input_files = output_dir.glob(str("*" + file_extension)) + with open(output_dir / "annotation.gtf", "w+") as output_file: + for input_file in input_files: + if os.stat(input_file).st_size == 0: + logger.info("File is empty, will skip %s", input_file.name) + continue + match = re.search(r"\.rs(\d+)\.re(\d+)\.", input_file.name) + assert match is not None + start_offset = int(match.group(1)) + with open(input_file, "r") as gtf_in: + for line in gtf_in: + values = line.split("\t") + if len(values) == 9 and (values[2] in feature_types) and unique_ids: + # each slice start from 0 so we need to add the offset to get the real coordinates + values[3] = str(int(values[3]) + (start_offset - 1)) + values[4] = str(int(values[4]) + (start_offset - 1)) + # Unique id based on gene and transcript ids + attribs = values[8] + # Get the slice details, gene id and transcript id. + match_gene_type = re.search( + r'(gene_id +"([^"]+)").+(transcript_id +"([^"]+)")', + line, + ) + # gene_id "1"; transcript_id "1"; biotype "tRNA_pseudogene"; + if match_gene_type: + full_gene_id_string = match_gene_type.group(1) + current_gene_id = match_gene_type.group(2) + full_transcript_id_string = match_gene_type.group(3) + current_transcript_id = match_gene_type.group(4) + # Example key KS8000.rs1.re1000000.1 + gene_id_slice = input_file.name + "." + str(current_gene_id) + # Example key KS8000.rs1.re1000000.1.transcript.1 + transcript_id_slice = gene_id_slice + "." + str(current_transcript_id) + # If there is no existing entry, the gene key will be added + # and the gene counter is incremented. + # gene_id "gene1"; transcript_id "gene1.t1" + if gene_id_slice not in gene_id_collection: + new_gene_id = f"gene{gene_counter}" + gene_id_collection[gene_id_slice] = new_gene_id + transcript_id_count_gene_id[gene_id_slice] = 1 + gene_counter += 1 + else: + # If there is a key then the gene id will be replaced with the new gene id + new_gene_id = gene_id_collection[gene_id_slice] + attribs = re.sub( + full_gene_id_string, + 'gene_id "' + new_gene_id + '"', + attribs, + ) + # If there is no existing entry, the transcript key will be added + # and the transcript counter is incremented. 
+ if transcript_id_slice not in gene_transcript_id_collection: + new_transcript_id = ( + gene_id_collection[gene_id_slice] + + ".t" + + str(transcript_id_count_gene_id[gene_id_slice]) + ) + gene_transcript_id_collection[transcript_id_slice] = new_transcript_id + transcript_id_count_gene_id[gene_id_slice] += 1 + else: + # If a transcript of the same set is already present, + # the new id with the incremented counter is added + new_transcript_id = gene_transcript_id_collection[transcript_id_slice] + attribs = re.sub( + full_transcript_id_string, + 'transcript_id "' + new_transcript_id + '"', + attribs, + ) + values[8] = attribs + logger.info("FINAL GTF %s", values) + output_file.write("\t".join(values)) + # Unique id based on the feature type + else: + if new_id_prefix == "repeatmask": + match_feature_type = re.search( + r"(" + feature_id_label + r"\d+)", + line, + ) + else: + match_feature_type = ( + re.search( # repeat_id\d+ r"(" + feature_id_label + ' +"([^"]+)")', + r"(" + feature_id_label + ' +"([^"]+)")', # "\d+)", + line, + ) + ) + if match_feature_type: + full_feature_id = match_feature_type.group(1) + new_feature_id = new_id_prefix + str(feature_counter) + attribs = re.sub( + full_feature_id, + feature_id_label + ' "' + new_feature_id + '"', + attribs, + ) + feature_counter += 1 + values[8] = attribs + output_file.write("\t".join(values)) + else: + logger.info( + "Feature type not recognised, will skip. Feature type: %s", + values[2], + ) + + +def get_sequence( + seq_region: str, + start: int, + end: int, + strand: int, + fasta_file: Path, + output_dir: Path, +) -> str: + """ + Creates a tempfile and writes the bed info to it based on whatever information + has been passed in about the sequence. Then runs bedtools getfasta. The fasta file + should have a faidx. This can be created with the create_faidx static method prior + to fetching sequence + + Arg: + seq_region: str region name + start: int region start + end: int region end + strand: int strand of the sequence + fasta_file: genome FASTA file with indexing + output_dir: str working dir + Return: str sequence + """ + start -= 1 + logger.info( + "get_sequence %s", + f"{seq_region}\t{start}\t{end}\t{strand}\t{fasta_file}\t{output_dir}", + ) + with tempfile.NamedTemporaryFile(mode="w+t", delete=False, dir=str(output_dir)) as bed_temp_file: + bed_temp_file.writelines(f"{seq_region}\t{start}\t{end}") + bed_temp_file.close() + bedtools_command = [ + "bedtools", + "getfasta", + "-fi", + str(fasta_file), + "-bed", + str(bed_temp_file.name), + ] + bedtools_output = subprocess.Popen(#pylint:disable=consider-using-with + bedtools_command, stdout=subprocess.PIPE + ) # pylint:disable=consider-using-with + for idx, line in enumerate(io.TextIOWrapper(bedtools_output.stdout, encoding="utf-8")): # type: ignore + if idx == 1: + if strand == 1: + sequence = line.rstrip() + else: + sequence = reverse_complement(line.rstrip()) + + os.remove(bed_temp_file.name) + # logger.info(f"sequence : {sequence}") + return sequence + + +def reverse_complement(sequence: str) -> str: + """ + Get the reverse complement of a nucleotide sequence. 
+    Args:
+        sequence: The nucleotide sequence
+    Returns:
+        The reverse complement of the sequence
+    """
+    rev_matrix = str.maketrans("atgcATGC", "tacgTACG")
+    return sequence.translate(rev_matrix)[::-1]
+
+
+def check_file(file_path: Path) -> None:
+    """
+    Raise an error when the file doesn't exist
+    Args:
+        file_path: File path
+    """
+    if not file_path.is_file():
+        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
+
+
+def fasta_to_dict(fasta_lines: List[str]) -> Dict[str, str]:
+    """
+    Store FASTA sequences in a dictionary keyed by header
+    fasta_lines: List[str]: Lines of a FASTA file
+    """
+    index: Dict[str, str] = {}
+    header: Optional[str] = None
+    seq: List[str] = []
+
+    for line in fasta_lines:
+        line = line.strip()
+        # The line has already been stripped, so match the header without a trailing newline
+        header_match = re.match(r">(.+)$", line)
+        if header_match:
+            if header:
+                index[header] = "".join(seq)
+            header = header_match.group(1)
+            seq = []
+        else:
+            seq.append(line)
+    # Add the last sequence
+    if header:
+        index[header] = "".join(seq)
+    return index
+
+
+# def fasta_to_dict(fasta_lines:List[str]):
+#    index = {}
+#    for line in fasta_lines:
+#        #it = iter(fasta_lines)
+#        #for header in it:
+#        match = re.search(r">(.+)\n$", line)
+#        header = match.group(1)
+#        seq = next(line)
+#        index[header] = seq
+#    return index
+
+
+def split_protein_file(protein_dataset: Path, output_dir: Path, batch_size: int = 20) -> List:
+    """
+    Split the protein dataset file into batches of batch_size sequences,
+    written to batch files spread across 10 output directories.
+        protein_dataset : Path for the protein dataset.
+        output_dir : Output directory path.
+        batch_size : Size of each batch; it should equal the number of threads
+        so that the sequence processing of each file can be parallelised.
+    """
+    batched_protein_files = []
+
+    for i in range(0, 10):
+        create_dir(output_dir, (f"bin_{i}"))
+    with open(protein_dataset, "r", encoding="utf8") as file_in:
+        seq_count = 0
+        batch_count = 0
+        current_record = ""
+        initial_seq = True
+        for line in file_in:
+            match = re.search(r">(.+)$", line)
+            # Header line and not the first sequence: if the number of sequences stored so far
+            # is a multiple of batch_size, write the current record to a new file and reset it
+            if match and not initial_seq and seq_count % batch_size == 0:
+                bin_num = random.randint(0, 9)
+                batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+                with batch_file.open("w+") as file_out:
+                    file_out.write(current_record)
+                batch_count += 1
+                seq_count += 1
+                current_record = line
+                batched_protein_files.append(batch_file)
+            # Header line and the first sequence
+            elif match:
+                current_record += line
+                initial_seq = False
+                seq_count += 1
+            # Other lines
+            else:
+                current_record += line
+
+    if current_record:
+        bin_num = random.randint(0, 9)
+        batch_file = output_dir / f"bin_{bin_num}" / f"{batch_count}.fa"
+        with batch_file.open("w+") as file_out:
+            file_out.write(current_record)
+        batched_protein_files.append(batch_file)
+    return batched_protein_files
+
+
+def run_command(cmd) -> None:
+    """Run a command in a subprocess and log the outcome
+
+    Args:
+        cmd (List[str]): command to execute
+    """
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        logger.info("Command executed successfully: %s", cmd)
+        logger.info("Output: %s", result.stdout)
+    except subprocess.CalledProcessError as e:
+        logger.error("Command failed: %s", cmd)
+        logger.error("Error: %s", e.stderr)
+
+
+def read_gtf_genes(gtf_file: Path) -> Dict[str, Dict[str, Dict[str, List[str]]]]:
+    """
+ Parses a GTF file to extract gene, transcript, and exon information. + + Args: + gtf_file: Path to the GTF file. + + Returns: + A dictionary containing gene information, organized by gene_id, transcript_id, and features. + """ + gtf_genes: Dict[str, Dict[str, Dict[str, List[str]]]] = {} + + with open(gtf_file, "r") as gtf_in: + for line in gtf_in: + eles = line.strip().split("\t") + if len(eles) != 9: + continue + + match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', eles[8]) + if not match: + continue + + gene_id = match.group(1) + transcript_id = match.group(2) + feature_type = eles[2] + + if gene_id not in gtf_genes: + gtf_genes[gene_id] = {} + + if feature_type == "transcript": + gtf_genes[gene_id][transcript_id] = {"transcript": [line.strip()], "exons": []} + elif feature_type == "exon": + if transcript_id in gtf_genes[gene_id]: + gtf_genes[gene_id][transcript_id]["exons"].append(line.strip()) + + return gtf_genes + + +def check_transcriptomic_output(main_output_dir: Path) -> None: + """Check across the various transcriptomic if there is a reasonable amount of data + + Args: + main_output_dir (Path): Output dir path + Raises: + IOError: If no lines or fewer than the minimum expected lines are found in the output files. + """ + + transcriptomic_dirs = [ + "scallop_output", + "stringtie_output", + "minimap2_output", + ] + total_lines = 0 + min_lines = 100000 + for transcriptomic_dir in transcriptomic_dirs: + full_file_path = main_output_dir / transcriptomic_dir / "annotation.gtf" + if not full_file_path.exists(): + logger.warning( + "Warning, no annotation.gtf found for %s. This might be fine, \ + e.g. no long read data were provided", + transcriptomic_dir, + ) + continue + num_lines = sum(1 for _ in full_file_path.open("r")) + total_lines += num_lines + logging.info("For %s found a total of in the annotation.gtf file", num_lines) + if total_lines == 0: + raise IOError( + "Anno was run with transcriptomic mode enabled,\ + but the transcriptomic annotation files are empty" + ) + if total_lines <= min_lines: + raise IOError( + "Anno was run with transcriptomic mode enabled, \ + but the total number of lines in the output \ + files were less than the min expected value" + + "\n" + "Found: " + str(total_lines) + "\n" + "Min allowed: " + str(min_lines) + ) + + + +# def run_find_orfs(genome_file, main_output_dir): +# min_orf_length = 600 + +# orf_output_dir = create_dir(main_output_dir, "orf_output") +# seq_region_lengths = get_seq_region_lengths(genome_file, 5000) +# for region_name in seq_region_lengths: +# region_length = seq_region_lengths[region_name] +# seq = get_sequence(region_name, 1, region_length, 1, genome_file, orf_output_dir) +# for phase in range(0, 6): +# find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir) + + +# def find_orf_phased_region(region_name, seq, phase, min_orf_length, orf_output_dir): +# current_index = phase +# orf_counter = 1 +# if phase > 2: +# seq = reverse_complement(seq) +# current_index = current_index % 3 + +# orf_file_path = os.path.join(orf_output_dir, (region_name + ".phase" + str(phase) + ".orf.fa")) +# orf_out = open(orf_file_path, "w+") + +# while current_index < len(seq): +# codon = seq[current_index : current_index + 3] +# if codon == "ATG": +# orf_seq = codon +# for j in range(current_index + 3, len(seq), 3): +# next_codon = seq[j : j + 3] +# if next_codon == "TAA" or next_codon == "TAG" or next_codon == "TGA": +# orf_seq += next_codon +# if len(orf_seq) >= min_orf_length: +# orf_out.write( +# ">" + 
region_name + "_phase" + str(phase) + "_orf" + str(orf_counter) + "\n" +# ) +# orf_out.write(orf_seq + "\n") +# orf_counter += 1 +# orf_seq = "" +# break + +# # If there's another met in phase, then put i to the start of +# # the codon after j so that only the longest ORF is found +# if next_codon == "ATG": +# current_index = j + 3 +# orf_seq += next_codon +# current_index += 3 +# orf_out.close() + + +# def bed_to_gff(input_dir, hints_file): +# gff_out = open(hints_file, "w+") +# exons_dict = {} +# for bed_file in glob.glob(input_dir + "/*.bed"): +# logger.info("Processing file for hints:") +# logger.info(bed_file) +# bed_in = open(bed_file) +# bed_lines = bed_in.readlines() +# for line in bed_lines: +# line = line.rstrip() +# elements = line.split("\t") +# seq_region_name = elements[0] +# offset = int(elements[1]) +# hit_name = elements[3] +# strand = elements[5] +# block_sizes = elements[10].split(",") +# block_sizes = list(filter(None, block_sizes)) +# block_starts = elements[11].split(",") +# block_starts = list(filter(None, block_starts)) +# exons = bed_block_to_exons(block_sizes, block_starts, offset) +# for i, element in enumerate(exons): +# exon_coords = exons[i] +# exon_key = seq_region_name + ":" + exon_coords[0] + ":" + exon_coords[1] + ":" + strand +# if exon_key in exons_dict: +# exons_dict[exon_key][5] += 1 +# else: +# gff_list = [ +# seq_region_name, +# "CDNA", +# "exon", +# exon_coords[0], +# exon_coords[1], +# 1.0, +# strand, +# ".", +# ] +# exons_dict[exon_key] = gff_list + +# for exon_key, gff_list in exons_dict.items(): +# gff_list[5] = str(gff_list[5]) +# gff_line = "\t".join(gff_list) + "\tsrc=W;mul=" + gff_list[5] + ";\n" +# gff_out.write(gff_line) + +# gff_out.close() + +# sorted_hints_out = open((hints_file + ".srt"), "w+") +# subprocess.run( +# ["sort", "-k1,1", "-k7,7", "-k4,4", "-k5,5", hints_file], +# stdout=sorted_hints_out, +# ) +# sorted_hints_out.close() + + +# def splice_junction_to_gff(input_dir, hints_file): +# sjf_out = open(hints_file, "w+") + +# for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"): +# sjf_in = open(sj_tab_file) +# sjf_lines = sjf_in.readlines() +# for line in sjf_lines: +# elements = line.split("\t") +# strand = "+" +# # If the strand is undefined then skip, Augustus expects a strand +# if elements[3] == "0": +# continue +# elif elements[3] == "2": +# strand = "-" + +# junction_length = int(elements[2]) - int(elements[1]) + 1 +# if junction_length < 100: +# continue + +# if not elements[4] and elements[7] < 10: +# continue + +# # For the moment treat multimapping and single mapping things as a combined score +# score = float(elements[6]) + float(elements[7]) +# score = str(score) +# output_line = [ +# elements[0], +# "RNASEQ", +# "intron", +# elements[1], +# elements[2], +# score, +# strand, +# ".", +# ("src=W;mul=" + score + ";"), +# ] +# sjf_out.write("\t".join(output_line) + "\n") + +# sjf_out.close() + + +# def split_genome(genome_file, target_dir, min_seq_length): +# # This is the lazy initial way of just splitting into a dir +# # of files based on the toplevel sequence with a min sequence length filter +# # There are a couple of obvious improvements: +# # 1) Instead of making files for all seqs, just process N seqs +# # parallel, where N = num_threads. Then you could clean up the seq file +# # after each seq finishes, thus avoiding potentially +# # having thousands of file in a dir +# # 2) Split the seq into even slices and process these in parallel +# # (which the same cleanup as in 1). 
For sequences smaller than the +# # target slice size, bundle them up together into a single file. Vastly +# # more complex, partially implemented in the splice_genome method +# # Allows for more consistency with parallelisation (since there should +# # be no large outliers). But require a mapping strategy for the +# # coords and sequence names and all the hints will need to be adjusted +# current_header = "" +# current_seq = "" + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">(.+)$", line) +# if match and current_header: +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# print("Found an existing split file, so will not overwrite. File found:") +# print(file_out_name) + +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# line = file_in.readline() + +# if len(current_seq) > min_seq_length: +# file_out_name = os.path.join(target_dir, (current_header + ".split.fa")) +# if not os.path.exists(file_out_name): +# file_out = open(file_out_name, "w+") +# file_out.write(">" + current_header + "\n" + current_seq + "\n") +# file_out.close() + +# else: +# logger.info("Found an existing split file, so will not overwrite. File found:") +# logger.info(file_out_name) + +# file_in.close() + + +# # def merge_gtf_files(file_paths,id_label): + +# # gtf_file_path = os.path.join(output_dir,'annotation.gtf') +# # gtf_out = open(gtf_file_path,'w+') +# # for gtf_file_path in gtf_files: +# # gtf_file_name = os.path.basename(gtf_file_path) +# # match = re.search(r'\.rs(\d+)\.re(\d+)\.',gtf_file_name) +# # start_offset = int(match.group(1)) +# # gtf_in = open(gtf_file_path,'r') +# # line = gtf_in.readline() +# # while line: +# # values = line.split("\t") +# # if len(values) == 9 and (values[2] in feature_types): +# # values[3] = str(int(values[3]) + (start_offset - 1)) +# # values[4] = str(int(values[4]) + (start_offset - 1)) +# # gtf_out.write("\t".join(values)) +# # line = gtf_in.readline() +# # gtf_in.close() +# # gtf_out.close() + + +# def multiprocess_generic(cmd): +# print(" ".join(cmd)) +# subprocess.run(cmd) + + +# def seq_region_names(genome_file): +# region_list = [] + +# file_in = open(genome_file) +# line = file_in.readline() +# while line: +# match = re.search(r">([^\s]+)", line) +# if match: +# region_name = match.group(1) +# if region_name == "MT": +# logger.info("Skipping region named MT") +# line = file_in.readline() +# continue +# else: +# region_list.append(match.group(1)) +# line = file_in.readline() + +# return region_list + + +# def slice_genome(genome_file, target_dir, target_slice_size): +# # The below is sort of tested +# # Without the +# target_seq_length = 50000000 +# min_seq_length = 1000 +# current_header = "" +# current_seq = "" +# seq_dict = {} +# for line in seq: # pylint:disable=used-before-assignment +# match = re.search(r">(.+)$", line) +# if match and current_header: +# seq_dict[current_header] = current_seq +# current_seq = "" +# current_header = match.group(1) +# elif match: +# current_header = match.group(1) +# else: +# current_seq += line.rstrip() + +# seq_dict[current_header] = current_seq + +# seq_buffer = 0 +# file_number = 0 +# file_name = "genome_file_" + str(file_number) + +# for 
header in seq_dict: +# seq_iterator = 0 +# seq = seq_dict[header] + +# while len(seq) > target_seq_length: +# file_out = open(os.path.join(target_dir, file_name), "w+") +# subseq = seq[0:target_seq_length] +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + subseq + "\n") +# file_out.close() +# seq = seq[target_seq_length:] +# seq_iterator += 1 +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + +# if len(seq) >= min_seq_length: +# file_name = "genome_file_" + str(file_number) +# file_out = open(os.path.join(file_name), "w+") +# file_out.write(">" + header + "_sli" + str(seq_iterator) + "\n" + seq + "\n") +# file_out.close() +# file_number += 1 +# file_name = "genome_file_" + str(file_number) + + +# def coallate_results(main_output_dir): +# results_dir = create_dir(main_output_dir, "results") +# output_dirs = [ +# "augustus_output", +# "cpg_output", +# "dust_output", +# "eponine_output", +# "red_output", +# "rfam_output", +# "trf_output", +# "trnascan_output", +# ] +# for output_dir in output_dirs: +# match = re.search(r"(.+)_output", output_dir) +# result_type = match.group(1) +# results_path = os.path.join(main_output_dir, output_dir, "annotation.gtf") +# copy_path = os.path.join(results_dir, (result_type + ".gtf")) +# if os.path.exists(results_path): +# cpy_cmd = ["cp", results_path, copy_path] +# subprocess.run(cpy_cmd)
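
The record-loading helpers added earlier in this patch (generic_load_records_to_ensembl_db and multiprocess_load_records_to_ensembl_db) rely on the multiprocessing.Pool apply_async / close / join pattern to fan GTF record batches out to worker processes. Below is a minimal, self-contained sketch of that pattern with a dummy worker standing in for the Perl loading step; the function and data names are illustrative only, not part of the patch.

import multiprocessing


def load_batch(record_batch):
    # Stand-in for multiprocess_load_records_to_ensembl_db: here we only count lines.
    return len(record_batch)


if __name__ == "__main__":
    # Toy GTF record batches, as batch_gtf_records would produce.
    gtf_records = [["line1\n", "line2\n"], ["line3\n"]]
    pool = multiprocessing.Pool(2)
    results = [pool.apply_async(load_batch, args=(batch,)) for batch in gtf_records]
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for all workers to finish
    print([res.get() for res in results])  # [2, 1]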
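
A minimal usage sketch for the get_slice_id helper introduced in _utils_copy.py. The import path follows the new module location in this patch; the region name and length are made up. With a 2.5 Mb region and the parameters below, the helper returns three slices.

from ensembl.tools.anno.utils._utils_copy import get_slice_id

# Hypothetical toplevel sequence and its length.
seq_region_to_length = {"chr_test": 2_500_000}
slices = get_slice_id(seq_region_to_length, slice_size=1_000_000, overlap=0, min_length=0)
# Expected: [["chr_test", 1, 1000000], ["chr_test", 1000001, 2000000], ["chr_test", 2000001, 2500000]]
print(slices)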
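
The slice_output_to_gtf docstring describes how slice-local gene and transcript ids are remapped to globally unique ones. The sketch below isolates that keying and counting scheme with made-up slice file names and attribute strings; it is the same bookkeeping in miniature, not the production code path.

import re

# Hypothetical per-slice GTF attribute strings, keyed by the slice file they came from.
lines = [
    ("toplevel.rs1.re1000000.gtf", 'gene_id "1"; transcript_id "1";'),
    ("toplevel.rs1.re1000000.gtf", 'gene_id "1"; transcript_id "2";'),
    ("toplevel.rs1000001.re2000000.gtf", 'gene_id "1"; transcript_id "1";'),
]

gene_ids = {}        # slice-qualified gene key -> new gene id
transcript_ids = {}  # slice-qualified transcript key -> new transcript id
tr_counts = {}       # transcripts assigned so far per gene key
gene_counter = 1

for slice_name, attribs in lines:
    match = re.search(r'gene_id "([^"]+)".+transcript_id "([^"]+)"', attribs)
    gene_key = f"{slice_name}.{match.group(1)}"
    transcript_key = f"{gene_key}.{match.group(2)}"
    if gene_key not in gene_ids:
        gene_ids[gene_key] = f"gene{gene_counter}"
        tr_counts[gene_key] = 1
        gene_counter += 1
    if transcript_key not in transcript_ids:
        transcript_ids[transcript_key] = f"{gene_ids[gene_key]}.t{tr_counts[gene_key]}"
        tr_counts[gene_key] += 1
    print(gene_ids[gene_key], transcript_ids[transcript_key])
# Prints: gene1 gene1.t1 / gene1 gene1.t2 / gene2 gene2.t1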
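
get_sequence writes a one-line BED record to a temporary file and shells out to bedtools getfasta. The snippet below sketches an equivalent standalone call (genome path and coordinates are placeholders), which can help when debugging an unexpectedly empty slice; note the 1-based start is converted to BED's 0-based half-open convention, as get_sequence does.

import os
import subprocess
import tempfile
from pathlib import Path

genome_fasta = Path("/data/genome/genome.fa")  # placeholder; expected to have a .fai index alongside it
seq_region, start, end = "chr_test", 1, 5000   # 1-based inclusive coordinates

with tempfile.NamedTemporaryFile("w+t", suffix=".bed", delete=False) as bed:
    # BED is 0-based half-open, hence start - 1.
    bed.write(f"{seq_region}\t{start - 1}\t{end}")
    bed_path = bed.name

result = subprocess.run(
    ["bedtools", "getfasta", "-fi", str(genome_fasta), "-bed", bed_path],
    capture_output=True, text=True, check=True,
)
os.remove(bed_path)
# The second line of the FASTA output holds the forward-strand sequence.
print(result.stdout.splitlines()[1])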
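
A usage sketch for split_protein_file, again assuming the _utils_copy module path from this patch; the input and output paths are placeholders. Bins are chosen at random, so only the overall layout (bin_0 … bin_9 directories containing numbered .fa batch files) is predictable, not the exact placement of each batch.

from pathlib import Path

from ensembl.tools.anno.utils._utils_copy import split_protein_file

protein_dataset = Path("/data/proteins/uniprot_subset.fa")  # placeholder input
output_dir = Path("/data/proteins/split")                   # must already exist
batch_files = split_protein_file(protein_dataset, output_dir, batch_size=20)
# Each returned path sits under one of the bin directories, e.g.
# /data/proteins/split/bin_3/0.fa, /data/proteins/split/bin_7/1.fa, ...
print(len(batch_files), "batch files written")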