Skip to content

Commit

Permalink
Fix aggregation check (#223)
Browse files Browse the repository at this point in the history
* Allow aggregation checks to be performed without opening the file with pysam

* add tests

* Add exception catching

* Update Ensembl version
  • Loading branch information
tcezard authored Sep 11, 2024
1 parent eba7b2a commit 9b1360c
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 12 deletions.
58 changes: 48 additions & 10 deletions eva_submission/eload_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import gzip
import os
import re
import urllib
Expand Down Expand Up @@ -211,7 +212,53 @@ def detect_vcf_aggregation(vcf_file):
in every line checked.
Otherwise it returns None meaning that the aggregation type could not be determined.
"""

try:
samples, af_in_info, gt_in_format = _assess_vcf_aggregation_with_pysam(vcf_file)
except Exception:
logger.error(f"Pysam Failed to open and read {vcf_file}")
try:
samples, af_in_info, gt_in_format = _assess_vcf_aggregation_manual(vcf_file)
except Exception:
logger.error(f"Manual parsing Failed to open or read {vcf_file}")
return None
if len(samples) > 0 and gt_in_format:
return 'none'
elif len(samples) == 0 and af_in_info:
return 'basic'
else:
logger.error(f'Aggregation type could not be detected for {vcf_file}')
return None


def _assess_vcf_aggregation_manual(vcf_file):
try:
if vcf_file.endswith('.gz'):
open_file = gzip.open(vcf_file, 'rt')
else:
open_file = open(vcf_file, 'r')

nb_line_checked = 0
max_line_check = 10
gt_in_format = True
af_in_info = True
samples = []
for line in open_file:
sp_line = line.strip().split('\t')
if line.startswith('#CHROM'):
if len(sp_line) > 9:
samples = sp_line[9:]
if not line.startswith('#'):
gt_in_format = gt_in_format and len(sp_line) > 8 and 'GT' in sp_line[8]
af_in_info = af_in_info and (sp_line[7].find('AF=') or (sp_line[7].find('AC=') and sp_line[7].find('AN=')))
if nb_line_checked >= max_line_check:
break
return samples, af_in_info, gt_in_format
finally:
open_file.close()


def _assess_vcf_aggregation_with_pysam(vcf_file):
with pysam.VariantFile(vcf_file, 'r') as vcf_in:
samples = list(vcf_in.header.samples)
# check that the first 10 lines have genotypes for all the samples present and if they have allele frequency
Expand All @@ -225,16 +272,7 @@ def detect_vcf_aggregation(vcf_file):
nb_line_checked += 1
if nb_line_checked >= max_line_check:
break
except Exception:
logger.error(f"Pysam Failed to open and read {vcf_file}")
return None
if len(samples) > 0 and gt_in_format:
return 'none'
elif len(samples) == 0 and af_in_info:
return 'basic'
else:
logger.error(f'Aggregation type could not be detected for {vcf_file}')
return None
return samples, af_in_info, gt_in_format


def create_assembly_report_from_fasta(assembly_fasta_path):
Expand Down
2 changes: 1 addition & 1 deletion eva_submission/samples_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_samples_from_vcf_manual(vcf_file):
try:
for line in open_file:
if line.startswith('#CHROM'):
sp_line = line.split('\t')
sp_line = line.strip().split('\t')
if len(sp_line) > 9:
return sp_line[9:]
finally:
Expand Down
Binary file not shown.
Binary file added tests/resources/vcf_files/file_no_aggregation.vcf.gz
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/test_eload_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,15 @@ def test_detect_vcf_aggregation(self):
assert detect_vcf_aggregation(
os.path.join(self.resources_folder, 'vcf_files', 'file_basic_aggregation.vcf')
) == 'basic'
assert detect_vcf_aggregation(
os.path.join(self.resources_folder, 'vcf_files', 'file_basic_aggregation.vcf.gz')
) == 'basic'
assert detect_vcf_aggregation(
os.path.join(self.resources_folder, 'vcf_files', 'file_no_aggregation.vcf')
) == 'none'
assert detect_vcf_aggregation(
os.path.join(self.resources_folder, 'vcf_files', 'file_no_aggregation.vcf.gz')
) == 'none'
assert detect_vcf_aggregation(
os.path.join(self.resources_folder, 'vcf_files', 'file_undetermined_aggregation.vcf')
) is None
Expand Down
2 changes: 1 addition & 1 deletion tests/test_vep_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_get_species_and_assembly(self):

def test_get_releases(self):
with get_ftp_connection(ensembl_ftp_url) as ftp:
assert get_releases(ftp, 'pub', current_only=True) == {112: 'pub/release-112'}
assert get_releases(ftp, 'pub', current_only=True) == {113: 'pub/release-113'}

with get_ftp_connection(ensembl_genome_ftp_url) as ftp:
assert get_releases(ftp, ensembl_genome_dirs[0], current_only=True) == {59: 'ensemblgenomes/pub/plants/release-59'}
Expand Down

0 comments on commit 9b1360c

Please sign in to comment.