diff --git a/eva_submission/eload_utils.py b/eva_submission/eload_utils.py index 7933bbd9..e8b2a0f3 100644 --- a/eva_submission/eload_utils.py +++ b/eva_submission/eload_utils.py @@ -1,4 +1,5 @@ import glob +import gzip import os import re import urllib @@ -211,7 +212,53 @@ def detect_vcf_aggregation(vcf_file): in every line checked. Otherwise it returns None meaning that the aggregation type could not be determined. """ + + try: + samples, af_in_info, gt_in_format = _assess_vcf_aggregation_with_pysam(vcf_file) + except Exception: + logger.error(f"Pysam Failed to open and read {vcf_file}") + try: + samples, af_in_info, gt_in_format = _assess_vcf_aggregation_manual(vcf_file) + except Exception: + logger.error(f"Manual parsing Failed to open or read {vcf_file}") + return None + if len(samples) > 0 and gt_in_format: + return 'none' + elif len(samples) == 0 and af_in_info: + return 'basic' + else: + logger.error(f'Aggregation type could not be detected for {vcf_file}') + return None + + +def _assess_vcf_aggregation_manual(vcf_file): try: + if vcf_file.endswith('.gz'): + open_file = gzip.open(vcf_file, 'rt') + else: + open_file = open(vcf_file, 'r') + + nb_line_checked = 0 + max_line_check = 10 + gt_in_format = True + af_in_info = True + samples = [] + for line in open_file: + sp_line = line.strip().split('\t') + if line.startswith('#CHROM'): + if len(sp_line) > 9: + samples = sp_line[9:] + if not line.startswith('#'): + gt_in_format = gt_in_format and len(sp_line) > 8 and 'GT' in sp_line[8] + af_in_info = af_in_info and (sp_line[7].find('AF=') or (sp_line[7].find('AC=') and sp_line[7].find('AN='))) + if nb_line_checked >= max_line_check: + break + return samples, af_in_info, gt_in_format + finally: + open_file.close() + + +def _assess_vcf_aggregation_with_pysam(vcf_file): with pysam.VariantFile(vcf_file, 'r') as vcf_in: samples = list(vcf_in.header.samples) # check that the first 10 lines have genotypes for all the samples present and if they have allele frequency @@ -225,16 +272,7 @@ def detect_vcf_aggregation(vcf_file): nb_line_checked += 1 if nb_line_checked >= max_line_check: break - except Exception: - logger.error(f"Pysam Failed to open and read {vcf_file}") - return None - if len(samples) > 0 and gt_in_format: - return 'none' - elif len(samples) == 0 and af_in_info: - return 'basic' - else: - logger.error(f'Aggregation type could not be detected for {vcf_file}') - return None + return samples, af_in_info, gt_in_format def create_assembly_report_from_fasta(assembly_fasta_path): diff --git a/eva_submission/samples_checker.py b/eva_submission/samples_checker.py index d42affd6..10b270c3 100644 --- a/eva_submission/samples_checker.py +++ b/eva_submission/samples_checker.py @@ -21,7 +21,7 @@ def get_samples_from_vcf_manual(vcf_file): try: for line in open_file: if line.startswith('#CHROM'): - sp_line = line.split('\t') + sp_line = line.strip().split('\t') if len(sp_line) > 9: return sp_line[9:] finally: diff --git a/tests/resources/vcf_files/file_basic_aggregation.vcf.gz b/tests/resources/vcf_files/file_basic_aggregation.vcf.gz new file mode 100644 index 00000000..1047ea59 Binary files /dev/null and b/tests/resources/vcf_files/file_basic_aggregation.vcf.gz differ diff --git a/tests/resources/vcf_files/file_no_aggregation.vcf.gz b/tests/resources/vcf_files/file_no_aggregation.vcf.gz new file mode 100644 index 00000000..74ba40a8 Binary files /dev/null and b/tests/resources/vcf_files/file_no_aggregation.vcf.gz differ diff --git a/tests/test_eload_utils.py b/tests/test_eload_utils.py index 58843ad3..ea38b800 100644 --- a/tests/test_eload_utils.py +++ b/tests/test_eload_utils.py @@ -43,9 +43,15 @@ def test_detect_vcf_aggregation(self): assert detect_vcf_aggregation( os.path.join(self.resources_folder, 'vcf_files', 'file_basic_aggregation.vcf') ) == 'basic' + assert detect_vcf_aggregation( + os.path.join(self.resources_folder, 'vcf_files', 'file_basic_aggregation.vcf.gz') + ) == 'basic' assert detect_vcf_aggregation( os.path.join(self.resources_folder, 'vcf_files', 'file_no_aggregation.vcf') ) == 'none' + assert detect_vcf_aggregation( + os.path.join(self.resources_folder, 'vcf_files', 'file_no_aggregation.vcf.gz') + ) == 'none' assert detect_vcf_aggregation( os.path.join(self.resources_folder, 'vcf_files', 'file_undetermined_aggregation.vcf') ) is None diff --git a/tests/test_vep_utils.py b/tests/test_vep_utils.py index 2feba964..7e8153a1 100644 --- a/tests/test_vep_utils.py +++ b/tests/test_vep_utils.py @@ -154,7 +154,7 @@ def test_get_species_and_assembly(self): def test_get_releases(self): with get_ftp_connection(ensembl_ftp_url) as ftp: - assert get_releases(ftp, 'pub', current_only=True) == {112: 'pub/release-112'} + assert get_releases(ftp, 'pub', current_only=True) == {113: 'pub/release-113'} with get_ftp_connection(ensembl_genome_ftp_url) as ftp: assert get_releases(ftp, ensembl_genome_dirs[0], current_only=True) == {59: 'ensemblgenomes/pub/plants/release-59'}