From 9b3af64b6972309019125e0c5f7b4d8c775d1460 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Thu, 5 Sep 2024 13:31:15 +0100 Subject: [PATCH] EVA-3650 - Run new statistic calculation step (#220) * integrate new statistics calculation in ingestion * Refactor QC for submission_qc_checks.py * Fix copy/paste error * Add missing test files * Apply suggestions from code review Co-authored-by: April Shen * Add fixes from review comments and caching * Fix tests * Update eva_submission/submission_qc_checks.py Co-authored-by: April Shen --------- Co-authored-by: April Shen --- eva_submission/nextflow/accession_and_load.nf | 54 +- eva_submission/submission_qc_checks.py | 828 ++++++++++-------- .../eloads/ELOAD_103/.ELOAD_103_config.yml | 11 + ...000003205.1_eva_remapped.vcf_ingestion.log | 9 + .../00_logs/GCA_000003205.1_vcf_extractor.log | 17 + ...795.2_backpropagate_to_GCA_000003205.1.log | 12 + .../00_logs/accessioning.test2.vcf.gz.log | 21 + .../00_logs/pipeline.test2.vcf.gz.log | 5 + tests/test_submission_qc.py | 457 +++++----- 9 files changed, 791 insertions(+), 623 deletions(-) create mode 100644 tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_eva_remapped.vcf_ingestion.log create mode 100644 tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_vcf_extractor.log create mode 100644 tests/resources/projects/PRJEB33333/00_logs/GCA_000247795.2_backpropagate_to_GCA_000003205.1.log create mode 100644 tests/resources/projects/PRJEB33333/00_logs/accessioning.test2.vcf.gz.log create mode 100644 tests/resources/projects/PRJEB33333/00_logs/pipeline.test2.vcf.gz.log diff --git a/eva_submission/nextflow/accession_and_load.nf b/eva_submission/nextflow/accession_and_load.nf index 68d6a693..5c9f2382 100644 --- a/eva_submission/nextflow/accession_and_load.nf +++ b/eva_submission/nextflow/accession_and_load.nf @@ -145,7 +145,8 @@ workflow { .groupTuple(by: [3, 4, 8]) // group by analysis_accession, db_name, aggregation .map{tuple(it[3], it[4], it[8], it[1])} // analysis_accession, db_name, aggregation, grouped normalised_vcf_files - calculate_statistics_vcf(stats_ch, load_variants_vcf.out.variant_load_complete.collect()) + calculate_variant_statistics_vcf(stats_ch, load_variants_vcf.out.variant_load_complete.collect()) + calculate_study_statistics_vcf(stats_ch, load_variants_vcf.out.variant_load_complete.collect()) if (!is_human_study) { vcf_files_dbname = Channel.fromPath(params.valid_vcfs) @@ -390,14 +391,14 @@ process run_vep_on_variants { /* - * Calculate statistics using eva-pipeline. + * Calculate variant statistics using eva-pipeline. */ -process calculate_statistics_vcf { +process calculate_variant_statistics_vcf { label 'long_time', 'med_mem' clusterOptions { - return "-o $params.logs_dir/statistics.${analysis_accession}.log \ - -e $params.logs_dir/statistics.${analysis_accession}.err" + return "-o $params.logs_dir/variant.statistics.${analysis_accession}.log \ + -e $params.logs_dir/variant.statistics.${analysis_accession}.err" } when: @@ -409,12 +410,51 @@ process calculate_statistics_vcf { val variant_load_complete output: - val true, emit: statistics_calc_complete + val true, emit: variant_statistics_calc_complete script: def pipeline_parameters = "" - pipeline_parameters += " --spring.batch.job.names=calculate-statistics-job" + pipeline_parameters += " --spring.batch.job.names=variant-stats-job" + + pipeline_parameters += " --input.vcf.aggregation=" + aggregation.toString().toUpperCase() + pipeline_parameters += " --input.vcf=" + file(vcf_files[0]).toRealPath().toString() // If there are multiple file only use the first + pipeline_parameters += " --input.vcf.id=" + analysis_accession.toString() + + pipeline_parameters += " --spring.data.mongodb.database=" + db_name.toString() + + """ + java -Xmx${task.memory.toGiga()-1}G -jar $params.jar.eva_pipeline --spring.config.location=file:$params.load_job_props --parameters.path=$params.load_job_props $pipeline_parameters + """ +} + + +/* + * Calculate study statistics using eva-pipeline. + */ +process calculate_study_statistics_vcf { + label 'long_time', 'med_mem' + + clusterOptions { + return "-o $params.logs_dir/study.statistics.${analysis_accession}.log \ + -e $params.logs_dir/study.statistics.${analysis_accession}.err" + } + + when: + // Statistics calculation is not required for Already aggregated analysis/study + aggregation.toString() == "none" + + input: + tuple val(analysis_accession), val(db_name), val(aggregation), val(vcf_files) + val variant_load_complete + + output: + val true, emit: study_statistics_calc_complete + + script: + def pipeline_parameters = "" + + pipeline_parameters += " --spring.batch.job.names=file-stats-job" pipeline_parameters += " --input.vcf.aggregation=" + aggregation.toString().toUpperCase() pipeline_parameters += " --input.vcf=" + file(vcf_files[0]).toRealPath().toString() // If there are multiple file only use the first diff --git a/eva_submission/submission_qc_checks.py b/eva_submission/submission_qc_checks.py index d2368e90..98ac0b62 100644 --- a/eva_submission/submission_qc_checks.py +++ b/eva_submission/submission_qc_checks.py @@ -2,6 +2,7 @@ import os from collections import defaultdict from ftplib import FTP +from functools import cached_property, lru_cache from pathlib import Path import requests @@ -18,6 +19,94 @@ logger = logging_config.get_logger(__name__) +job_launched_and_completed_text_map = { + 'accession': ( + {'Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] launched'}, + {'Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] completed'} + ), + 'variant_load': ( + {'Job: [FlowJob: [name=genotyped-vcf-job]] launched', + 'Job: [FlowJob: [name=aggregated-vcf-job]] launched'}, + {'Job: [FlowJob: [name=genotyped-vcf-job]] completed', + 'Job: [FlowJob: [name=aggregated-vcf-job]] completed'} + ), + 'load_vcf': ( + {'Job: [FlowJob: [name=load-vcf-job]] launched'}, + {'Job: [FlowJob: [name=load-vcf-job]] completed'} + ), + 'annotate_variants': ( + {'Job: [FlowJob: [name=annotate-variants-job]] launched'}, + {'Job: [FlowJob: [name=annotate-variants-job]] completed'} + ), + 'calculate_statistics': ( + {'Job: [FlowJob: [name=calculate-statistics-job]] launched'}, + {'Job: [FlowJob: [name=calculate-statistics-job]] completed'} + ), + 'variant-stats': ( + {'Job: [FlowJob: [name=variant-stats-job]] launched'}, + {'Job: [FlowJob: [name=variant-stats-job]] completed'} + ), + 'study-stats': ( + {'Job: [FlowJob: [name=study-stats-job]] launched'}, + {'Job: [FlowJob: [name=study-stats-job]] completed'} + ), + 'acc_import': ( + {'Job: [SimpleJob: [name=accession-import-job]] launched'}, + {'Job: [SimpleJob: [name=accession-import-job]] completed'} + ), + 'clustering': ( + {'Job: [SimpleJob: [name=STUDY_CLUSTERING_JOB]] launched'}, + {'Job: [SimpleJob: [name=STUDY_CLUSTERING_JOB]] completed'} + ), + 'clustering_qc': ( + {'Job: [SimpleJob: [name=NEW_CLUSTERED_VARIANTS_QC_JOB]] launched'}, + {'Job: [SimpleJob: [name=NEW_CLUSTERED_VARIANTS_QC_JOB]] completed'} + ), + 'vcf_extractor': ( + {'Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] launched'}, + {'Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] completed'} + ), + 'remapping_ingestion': ( + {'Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] launched'}, + {'Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] completed'} + ), + 'backpropagation': ( + {'Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] launched'}, + {'Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] completed'} + ) +} + + +@lru_cache(maxsize=None) +def _did_job_complete_successfully_from_log(file_path, job_type): + with open(file_path, 'r') as f: + job_status = 'FAILED' + job_launched_str, job_completed_str = job_launched_and_completed_text_map[job_type] + for line in f: + if any(text in line for text in job_launched_str): + job_status = "" + if any(text in line for text in job_completed_str): + job_status = line.split(" ")[-1].replace("[", "").replace("]", "").strip() + if job_status == 'COMPLETED': + return True + elif job_status == 'FAILED': + return False + else: + logger.error(f'Could not determine status of {job_type} job in file {file_path}') + return False + + +def _get_failed_job_or_step_name(file_name): + with open(file_name, 'r') as f: + job_name = 'job name could not be retrieved' + for line in f: + if 'Encountered an error executing step' in line: + job_name = line[line.index("Encountered an error executing step"): line.rindex("in job")] \ + .strip().split(" ")[-1] + + return job_name + + class EloadQC(Eload): def __init__(self, eload_number, config_object: EloadConfig = None): super().__init__(eload_number, config_object) @@ -28,101 +117,48 @@ def __init__(self, eload_number, config_object: EloadConfig = None): self.path_to_logs_dir = os.path.join(self.path_to_data_dir, '00_logs') self.taxonomy = self.eload_cfg.query('submission', 'taxonomy_id') self.analyses = self.eload_cfg.query('brokering', 'analyses') - self.job_launched_and_completed_text_map = { - 'accession': ( - {'Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] launched'}, - {'Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] completed'} - ), - 'variant_load': ( - {'Job: [FlowJob: [name=genotyped-vcf-job]] launched', - 'Job: [FlowJob: [name=aggregated-vcf-job]] launched'}, - {'Job: [FlowJob: [name=genotyped-vcf-job]] completed', - 'Job: [FlowJob: [name=aggregated-vcf-job]] completed'} - ), - 'load_vcf': ( - {'Job: [FlowJob: [name=load-vcf-job]] launched'}, - {'Job: [FlowJob: [name=load-vcf-job]] completed'} - ), - 'annotate_variants': ( - {'Job: [FlowJob: [name=annotate-variants-job]] launched'}, - {'Job: [FlowJob: [name=annotate-variants-job]] completed'} - ), - 'calculate_statistics': ( - {'Job: [FlowJob: [name=calculate-statistics-job]] launched'}, - {'Job: [FlowJob: [name=calculate-statistics-job]] completed'} - ), - 'acc_import': ( - {'Job: [SimpleJob: [name=accession-import-job]] launched'}, - {'Job: [SimpleJob: [name=accession-import-job]] completed'} - ), - 'clustering': ( - {'Job: [SimpleJob: [name=STUDY_CLUSTERING_JOB]] launched'}, - {'Job: [SimpleJob: [name=STUDY_CLUSTERING_JOB]] completed'} - ), - 'clustering_qc': ( - {'Job: [SimpleJob: [name=NEW_CLUSTERED_VARIANTS_QC_JOB]] launched'}, - {'Job: [SimpleJob: [name=NEW_CLUSTERED_VARIANTS_QC_JOB]] completed'} - ), - 'vcf_extractor': ( - {'Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] launched'}, - {'Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] completed'} - ), - 'remapping_ingestion': ( - {'Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] launched'}, - {'Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] completed'} - ), - 'backpropagation': ( - {'Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] launched'}, - {'Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] completed'} - ) - } - def check_if_study_appears(self): - url = f"https://wwwdev.ebi.ac.uk/eva/webservices/rest/v1/studies/{self.project_accession}/summary" - try: - json_response = self.get_result_from_webservice(url) - except HTTPError as e: - logger.error(str(e)) - json_response = {} - if self.check_if_study_present_in_response(json_response, 'id'): - self._study_check_result = "PASS" - else: - self._study_check_result = "FAIL" + @cached_property + def vcf_files(self): + vcf_files = [] + for analysis_data in self.analyses.values(): + for v_files in analysis_data['vcf_files'].values(): + vcf_files.append(os.path.basename(v_files['output_vcf_file'])) + return vcf_files - return f""" - pass: {self._study_check_result}""" + @cached_property + def analysis_to_file_names(self): + analysis_to_file_names = {} + for analysis_alias, analysis_accession in self.eload_cfg.query('brokering', 'ena', 'ANALYSIS').items(): + # Find the files associated with this analysis + analysis_to_file_names[analysis_accession] = [ + os.path.basename(f) for f in self.analyses.get(analysis_alias).get('vcf_files') + ] + return analysis_to_file_names + + ### + # Helper methods + ### - def check_if_study_appears_in_variant_browser(self, species_name): + def _check_if_study_appears_in_variant_browser(self, species_name): url = f"https://wwwdev.ebi.ac.uk/eva/webservices/rest/v1/meta/studies/list?species={species_name}" try: - json_response = self.get_result_from_webservice(url) + json_response = self._get_result_from_webservice(url) except HTTPError as e: logger.error(str(e)) json_response = {} - if self.check_if_study_present_in_response(json_response, 'studyId'): + if self._check_if_study_present_in_response(json_response, 'studyId'): return True else: return False - def check_if_study_appears_in_metadata(self): - missing_assemblies = [] - for analysis_data in self.analyses.values(): - species_name = self.get_species_name(analysis_data['assembly_accession']) - if not self.check_if_study_appears_in_variant_browser(species_name): - missing_assemblies.append(f"{species_name}({analysis_data['assembly_accession']})") - - self._study_metadata_check_result = "PASS" if not missing_assemblies else "FAIL" - return f""" - pass: {self._study_metadata_check_result} - missing assemblies: {missing_assemblies if missing_assemblies else None}""" - @retry(tries=3, delay=2, backoff=1.5, jitter=(1, 3)) - def get_result_from_webservice(self, url): + def _get_result_from_webservice(self, url): response = requests.get(url) response.raise_for_status() return response.json() - def check_if_study_present_in_response(self, res, key): + def _check_if_study_present_in_response(self, res, key): if any(res) and 'response' in res and len(res['response']) > 0: for response in res['response']: if response['numTotalResults'] >= 1: @@ -131,36 +167,145 @@ def check_if_study_present_in_response(self, res, key): return True return False - def get_species_name(self, assembly): + def _get_species_name(self, assembly): with get_metadata_connection_handle(self.profile, self.private_config_xml_file) as pg_conn: query = f"""select concat(t.taxonomy_code, '_',a.assembly_code) from evapro.taxonomy t join evapro.assembly a on a.taxonomy_id = t.taxonomy_id where t.taxonomy_id = {self.taxonomy} and assembly_accession='{assembly}'""" return get_all_results_for_query(pg_conn, query)[0][0] - def get_browsable_files_for_study(self): + def _get_browsable_files_for_study(self): with get_metadata_connection_handle(self.profile, self.private_config_xml_file) as pg_conn: query = f"select filename from evapro.browsable_file where project_accession='{self.project_accession}'" return [filename for filename, in get_all_results_for_query(pg_conn, query)] - def check_all_browsable_files_are_available_in_ftp(self, vcf_files): + @retry(tries=3, delay=2, backoff=1.5, jitter=(1, 3)) + def _get_files_from_ftp(self, project_accession): + ftp = FTP('ftp.ebi.ac.uk', timeout=600) + ftp.login() + ftp.cwd(f'pub/databases/eva/{project_accession}') + return ftp.nlst() + + def _check_if_variants_were_skipped_in_log(self, file_path): + with open(file_path, 'r') as f: + variants_skipped = -1 + for line in f: + if "Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] launched" in line: + variants_skipped = None + if 'lines in the original VCF were skipped' in line: + variants_skipped = line.strip().split(":")[-1].strip().split(" ")[0].strip() + + return variants_skipped + + def _check_multiple_logs(self, search_unit, log_patterns, job_types): + """ + Go through the list of provided logs and search for the given job types. + It returns a positive result if at least one if these jobs is found to pass, similar to the any() function. + The search_unit is group for which this search is perform, typically a file name or analysis accession + Returns a tuple with the test result as boolean and the last error message if none of the jobs are found. + """ + assert len(log_patterns) == len(job_types) + any_pass = False + last_error = f'No log checked for {search_unit}' + for log_pattern, job_type in zip(log_patterns, job_types): + check_pass, last_error = self._find_log_and_check_job(search_unit, log_pattern, job_type) + any_pass = any_pass or check_pass + if any_pass: + break + return any_pass, last_error + + def _find_log_and_check_job(self, search_unit, log_file_pattern, job_type): + """ + Find a log file using the provided log_file_pattern and check if the specified job_type was run successfully. + The search_unit is group for which this search is perform, typically a file name or analysis accession + Returns a tuple with the test result as boolean and optional error message + """ + log_files = glob.glob(os.path.join(self.path_to_logs_dir, log_file_pattern)) + report_text = "" + if log_files: + # check if job completed successfully + if not _did_job_complete_successfully_from_log(log_files[0], job_type): + report_text += f"{job_type} failed job/step : {_get_failed_job_or_step_name(log_files[0])}" + job_passed = False + else: + job_passed = True + else: + report_text += f"{job_type} error : No {job_type} log file found for {search_unit}" + job_passed = False + return job_passed, report_text + + ### + # Reporting methods + ### + + @staticmethod + def _report_for_human(): + result = 'N/A - Human Taxonomy' + report = f"""Success: {result}""" + return result, report + + @staticmethod + def _report_for_log(failed_unit): + """Create a result string and a detailed report based on the error reported in failed unit""" + result = "PASS" if not failed_unit else "FAIL" + report = f"""Success: {result}""" + if failed_unit: + report += f""" + Errors:""" + for unit, value in failed_unit.items(): + report += f""" + {unit} - {value}""" + return result, report + + ### + # Check methods + ### + + def check_if_study_appears(self): + url = f"https://wwwdev.ebi.ac.uk/eva/webservices/rest/v1/studies/{self.project_accession}/summary" try: - files_in_ftp = self.get_files_from_ftp(self.project_accession) + json_response = self._get_result_from_webservice(url) + except HTTPError as e: + logger.error(str(e)) + json_response = {} + if self._check_if_study_present_in_response(json_response, 'id'): + result = "PASS" + else: + result = "FAIL" + + report = f"""Success: {result}""" + return result, report + + def check_if_study_appears_in_metadata(self): + missing_assemblies = [] + for analysis_data in self.analyses.values(): + species_name = self._get_species_name(analysis_data['assembly_accession']) + if not self._check_if_study_appears_in_variant_browser(species_name): + missing_assemblies.append(f"{species_name}({analysis_data['assembly_accession']})") + + result = "PASS" if not missing_assemblies else "FAIL" + report = f"""Success: {result} + missing assemblies: {missing_assemblies if missing_assemblies else None}""" + return result, report + + def check_all_browsable_files_are_available_in_ftp(self): + try: + files_in_ftp = self._get_files_from_ftp(self.project_accession) except Exception as e: logger.error(f"Error fetching files from ftp for study {self.project_accession}. Exception {e}") - self._ftp_check_result = "FAIL" - return f""" - Error: Error fetching files from ftp for study {self.project_accession}""" + result = "FAIL" + report = f"""Error: Error fetching files from ftp for study {self.project_accession}""" + return result, report if not files_in_ftp: logger.error(f"No file found in ftp for study {self.project_accession}") - self._ftp_check_result = "FAIL" - return f""" - Error: No files found in FTP for study {self.project_accession}""" + result = "FAIL" + report = f"""Error: No files found in FTP for study {self.project_accession}""" + return result, report missing_files = [] - for file in vcf_files: + for file in self.vcf_files: no_ext_file, _ = os.path.splitext(file) if file not in files_in_ftp: missing_files.append(file) @@ -173,204 +318,131 @@ def check_all_browsable_files_are_available_in_ftp(self, vcf_files): no_ext_accessioned_file, _ = os.path.splitext(accessioned_file) if accessioned_file not in files_in_ftp: missing_files.append(accessioned_file) - if f'{accessioned_file}.csi' not in files_in_ftp and f'{no_ext_accessioned_file}.csi' not in files_in_ftp: + if f'{accessioned_file}.csi' not in files_in_ftp and \ + f'{no_ext_accessioned_file}.csi' not in files_in_ftp: missing_files.append(f'{accessioned_file}.csi or {no_ext_accessioned_file}.csi') - self._ftp_check_result = "PASS" if not missing_files else "FAIL" - return f""" - pass: {self._ftp_check_result} - missing files: {missing_files if missing_files else None}""" - - @retry(tries=3, delay=2, backoff=1.5, jitter=(1, 3)) - def get_files_from_ftp(self, project_accession): - ftp = FTP('ftp.ebi.ac.uk', timeout=600) - ftp.login() - ftp.cwd(f'pub/databases/eva/{project_accession}') - return ftp.nlst() - - def _did_job_complete_successfully_from_log(self, file_path, job_type): - with open(file_path, 'r') as f: - job_status = 'FAILED' - job_launched_str, job_completed_str = self.job_launched_and_completed_text_map[job_type] - for line in f: - if any(str in line for str in job_launched_str): - job_status = "" - if any(str in line for str in job_completed_str): - job_status = line.split(" ")[-1].replace("[", "").replace("]", "").strip() - if job_status == 'COMPLETED': - return True - elif job_status == 'FAILED': - return False - else: - logger.error(f'Could not determine status of {job_type} job in file {file_path}') - return False - - def check_if_variants_were_skipped(self, file_path): - with open(file_path, 'r') as f: - variants_skipped = -1 - for line in f: - if "Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] launched" in line: - variants_skipped = None - if 'lines in the original VCF were skipped' in line: - variants_skipped = line.strip().split(":")[-1].strip().split(" ")[0].strip() - - return variants_skipped - - def get_failed_job_or_step_name(self, file_name): - with open(file_name, 'r') as f: - job_name = 'job name could not be retrieved' - for line in f: - if 'Encountered an error executing step' in line: - job_name = line[line.index("Encountered an error executing step"): line.rindex("in job")] \ - .strip().split(" ")[-1] + result = "PASS" if not missing_files else "FAIL" + report = f"""Success: {result} + Missing files: {missing_files if missing_files else None}""" + return result, report - return job_name + def check_if_accessioning_completed_successfully(self): + # No accessioning check is required for human + if self.taxonomy == 9606: + return self._report_for_human() - def check_if_accessioning_completed_successfully(self, vcf_files): failed_files = {} - for file in vcf_files: + for file in self.vcf_files: accessioning_log_files = glob.glob(f"{self.path_to_logs_dir}/accessioning.*{file}*.log") if accessioning_log_files: # check if accessioning job completed successfully - if not self._did_job_complete_successfully_from_log(accessioning_log_files[0], 'accession'): + if not _did_job_complete_successfully_from_log(accessioning_log_files[0], 'accession'): failed_files[ - file] = f"failed job/step : {self.get_failed_job_or_step_name(accessioning_log_files[0])}" + file] = f"failed job/step : {_get_failed_job_or_step_name(accessioning_log_files[0])}" else: failed_files[file] = f"Accessioning Error : No accessioning file found for {file}" - self._accessioning_job_check_result = "PASS" if not failed_files else "FAIL" - report = f""" - pass: {self._accessioning_job_check_result}""" + result = "PASS" if not failed_files else "FAIL" + report = f"""Success: {result}""" if failed_files: report += f""" - failed_files:""" + failed_files:""" for file, value in failed_files.items(): report += f""" - {file} - {value}""" + {file} - {value}""" - return report + return result, report - def check_if_variant_load_completed_successfully(self, vcf_files): - failed_files = defaultdict(dict) - for file_name in vcf_files: - self._find_log_and_check_job( - file_name, f"pipeline.*{file_name}*.log", "variant_load", failed_files - ) - self._find_log_and_check_job( - file_name, f"load_variants.*{file_name}*.log", "load_vcf", failed_files - ) - self._find_log_and_check_job( - file_name, f"acc_import.*{file_name}*.log", "acc_import", failed_files - ) - self._load_vcf_job_check_result = "PASS" - self._acc_import_job_check_result = "PASS" - if failed_files: - for file_name in list(failed_files): - errors = failed_files[file_name] - if 'load_vcf' in errors and 'variant_load' in errors: - self._load_vcf_job_check_result = "FAIL" - elif 'load_vcf' in errors and 'variant_load' not in errors: - # We can remove the load_vcf error because it is covered by variant_load - errors.pop('load_vcf') - if 'acc_import' in errors: - self._acc_import_job_check_result = "FAIL" - if not errors: - # If there are no more error we can remove the file completely - failed_files.pop(file_name) - - failed_analysis = defaultdict(dict) - analysis_to_file_names = {} + def check_if_variant_load_completed_successfully(self): + failed_files = {} + for file_name in self.vcf_files: + file_pass, last_error = self._check_multiple_logs( + file_name, + [f"pipeline.*{file_name}*.log", f"load_variants.*{file_name}*.log"], + ["variant_load", "load_vcf"]) + if not file_pass: + failed_files[file_name] = last_error + return self._report_for_log(failed_files) + + def check_if_acc_load_completed_successfully(self): + failed_files = {} + for file_name in self.vcf_files: + file_pass, last_error = self._check_multiple_logs( + file_name, + [f"pipeline.*{file_name}*.log", f"acc_import.*{file_name}*.log"], + ["variant_load", "acc_import"]) + if not file_pass: + failed_files[file_name] = last_error + return self._report_for_log(failed_files) + + def check_if_vep_completed_successfully(self): + failed_analysis = {} + any_vep_run = False for analysis_alias, analysis_accession in self.eload_cfg.query('brokering', 'ena', 'ANALYSIS').items(): - # Find the files associated with this analysis - analysis_to_file_names[analysis_accession] = [ - os.path.basename(f) for f in self.analyses.get(analysis_alias).get('vcf_files') - ] # annotation only happens if a VEP cache can be found assembly_accession = self.eload_cfg.query('brokering', 'analyses', analysis_alias, 'assembly_accession') if self.eload_cfg.query('ingestion', 'vep', assembly_accession, 'cache_version') is not None: - self._find_log_and_check_job( - analysis_accession, f"annotation.*{analysis_accession}*.log", "annotate_variants", failed_analysis - ) - # Statistics is only run if the aggregation is set to none - if self.eload_cfg.query('ingestion', 'aggregation', analysis_accession, ret_default='none') == 'none': - self._find_log_and_check_job( - analysis_accession, f"statistics.*{analysis_accession}*.log", "calculate_statistics", failed_analysis - ) - - self._annotation_job_check_result = "PASS" - self._statistics_job_check_result = "PASS" - if failed_analysis: - for analysis_accession in list(failed_analysis): - errors = failed_analysis[analysis_accession] - # Check that the variant_load step didn't run the annotation and calculate statistics - variant_load_error = any( - 'variant_load' in failed_files.get(f, {}) for f in analysis_to_file_names[analysis_accession] - ) - if 'annotate_variants' in errors and variant_load_error: - self._annotation_job_check_result = "FAIL" - elif 'annotate_variants' in errors and not variant_load_error: - # We can remove the annotate_variants error because it is covered by variant_load - errors.pop('annotate_variants') - if 'calculate_statistics' in errors and variant_load_error: - self._statistics_job_check_result = "FAIL" - elif 'calculate_statistics' in errors and not variant_load_error: - # We can remove the calculate_statistics error because it is covered by variant_load - errors.pop('calculate_statistics') - if not errors: - # If there are no more error we can remove the analysis completely - failed_analysis.pop(analysis_accession) - - report = f""" - vcf load result: {self._load_vcf_job_check_result} - annotation result: {self._annotation_job_check_result} - statistics result: {self._statistics_job_check_result} - accession import result: {self._acc_import_job_check_result}""" - if failed_files: - # For the report the variant_load does not needs to be reported because any new run will be done - # with the new load_vcf method. Remove the variant_load from the failed files - for file_name in failed_files: - failed_files[file_name].pop('variant_load', None) - report += f""" - Failed Files:""" - for file_name, error_txt in failed_files.items(): - report += f""" - {file_name}: - {error_txt.get("load_vcf", "")} - {error_txt.get("acc_import", "")}""" - if failed_analysis: - report += f""" - Failed Analysis:""" - for analysis_accession, error_txt in failed_analysis.items(): - report += f""" - {analysis_accession}: - {error_txt.get('annotate_variants', "")} - {error_txt.get('calculate_statistics', "")}""" - return report - - def _find_log_and_check_job(self, search_unit, log_file_pattern, job_type, failure_dict=None): - log_files = glob.glob(os.path.join(self.path_to_logs_dir, log_file_pattern)) - report_text = "" - if log_files: - # check if job completed successfully - if not self._did_job_complete_successfully_from_log(log_files[0], job_type): - report_text += f"{job_type} failed job/step : {self.get_failed_job_or_step_name(log_files[0])}" - job_passed = False - else: - job_passed = True + any_vep_run = True + logs_to_check = [] + jobs_to_check = [] + for file_name in self.analysis_to_file_names[analysis_accession]: + logs_to_check.append(f"pipeline.*{file_name}*.log") + jobs_to_check.append("variant_load") + logs_to_check.append(f"annotation.*{analysis_accession}*.log") + jobs_to_check.append("annotate_variants") + analysis_pass, last_error = self._check_multiple_logs(analysis_accession, logs_to_check, jobs_to_check) + if not analysis_pass: + failed_analysis[analysis_accession] = last_error + if any_vep_run: + return self._report_for_log(failed_analysis) else: - report_text += f"{job_type} error : No {job_type} log file found for {search_unit}" - job_passed = False - if not job_passed and failure_dict is not None: - failure_dict[search_unit][job_type] = report_text - return job_passed, report_text + return 'SKIP', f"""annotation result - SKIPPED""" - def check_if_variants_were_skipped_while_accessioning(self, vcf_files): + def check_if_variant_statistic_completed_successfully(self): + failed_analysis = {} + for analysis_alias, analysis_accession in self.eload_cfg.query('brokering', 'ena', 'ANALYSIS').items(): + logs_to_check = [] + jobs_to_check = [] + for file_name in self.analysis_to_file_names[analysis_accession]: + logs_to_check.append(f"pipeline.*{file_name}*.log") + jobs_to_check.append("variant_load") + logs_to_check.extend([ + f"statistics.*{analysis_accession}*.log", + f"variant.statistics.{analysis_accession}.log" + ]) + jobs_to_check.extend(["calculate_statistics", "variant-stats"]) + analysis_pass, last_error = self._check_multiple_logs(analysis_accession, logs_to_check, jobs_to_check) + if not analysis_pass: + failed_analysis[analysis_accession] = last_error + return self._report_for_log(failed_analysis) + + def study_statistic_check_report(self): + failed_analysis = {} + for analysis_alias, analysis_accession in self.eload_cfg.query('brokering', 'ena', 'ANALYSIS').items(): + logs_to_check = [] + jobs_to_check = [] + for file_name in self.analysis_to_file_names[analysis_accession]: + logs_to_check.append(f"pipeline.*{file_name}*.log") + jobs_to_check.append("variant_load") + logs_to_check.extend( + [f"statistics.*{analysis_accession}*.log", f"study.statistics.{analysis_accession}.log"]) + jobs_to_check.extend(["calculate_statistics", "study-stats"]) + analysis_pass, last_error = self._check_multiple_logs(analysis_accession, logs_to_check, jobs_to_check) + if not analysis_pass: + failed_analysis[analysis_accession] = last_error + return self._report_for_log(failed_analysis) + + def check_if_variants_were_skipped_while_accessioning(self): + # No accessioning check is required for human + if self.taxonomy == 9606: + return self._report_for_human() failed_files = {} - for file in vcf_files: + for file in self.vcf_files: accessioning_log_files = glob.glob(f"{self.path_to_logs_dir}/accessioning.*{file}*.log") if accessioning_log_files: # check if any variants were skippped while accessioning - variants_skipped = self.check_if_variants_were_skipped(accessioning_log_files[0]) + variants_skipped = self._check_if_variants_were_skipped_in_log(accessioning_log_files[0]) if variants_skipped: if variants_skipped == -1: failed_files[file] = f"could not retrieve skipped variants count" @@ -379,29 +451,32 @@ def check_if_variants_were_skipped_while_accessioning(self, vcf_files): else: failed_files[file] = f"Accessioning Error : No accessioning file found for {file}" - self._variants_skipped_accessioning_check_result = "PASS" if not failed_files else "PASS with Warning (Manual Check Required)" - report = f""" - pass: {self._variants_skipped_accessioning_check_result}""" + result = "PASS" if not failed_files else "PASS with Warning (Manual Check Required)" + report = f"""Success: {result}""" if failed_files: report += f""" - failed_files:""" + Failures:""" for file, value in failed_files.items(): report += f""" - {file} - {value}""" + {file} - {value}""" - return report - - def check_if_browsable_files_entered_correctly_in_db(self, vcf_files): - browsable_files_from_db = self.get_browsable_files_for_study() - missing_files = set(vcf_files) - set(browsable_files_from_db) - self._browsable_files_check_result = "PASS" if len(missing_files) == 0 else "FAIL" + return result, report - return f""" - pass : {self._browsable_files_check_result} - expected files: {vcf_files} - missing files: {missing_files if missing_files else 'None'}""" + def check_if_browsable_files_entered_correctly_in_db(self): + browsable_files_from_db = self._get_browsable_files_for_study() + missing_files = set(self.vcf_files) - set(browsable_files_from_db) + result = "PASS" if len(missing_files) == 0 else "FAIL" + report = f"""Success : {result} + Expected files: {self.vcf_files} + Missing files: {missing_files if missing_files else 'None'}""" + return result, report - def clustering_check_report(self, target_assembly): + def clustering_check_report(self): + target_assembly = self.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') + if not target_assembly: + result = 'DID NOT RUN' + report = """N/A""" + return result, report clustering_check_pass, clustering_error = self._find_log_and_check_job( target_assembly, f'{target_assembly}_clustering.log', 'clustering' ) @@ -410,17 +485,20 @@ def clustering_check_report(self, target_assembly): ) if clustering_check_pass and clustering_qc_check_pass: - self._clustering_check_result = 'PASS' + result = 'PASS' else: - self._clustering_check_result = 'FAIL' + result = 'FAIL' - return f"""Clustering Job: {'PASS' if clustering_check_pass else "FAIL"} - {clustering_error if not clustering_check_pass else ""} - Clustering QC Job: {'PASS' if clustering_qc_check_pass else "FAIL"} - {clustering_qc_error if not clustering_qc_check_pass else ""} - """ + report = f"""Clustering Job: {'PASS' if clustering_check_pass else "FAIL"} - {clustering_error if not clustering_check_pass else "No error"} + Clustering QC Job: {'PASS' if clustering_qc_check_pass else "FAIL"} - {clustering_qc_error if not clustering_qc_check_pass else "No error"}""" + return result, report - def remapping_check_report(self, target_assembly): + def remapping_check_report(self): + target_assembly = self.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') + if not target_assembly: + result = 'DID NOT RUN' + report = """N/A""" + return result, report asm_res = defaultdict(dict) for analysis_data in self.analyses.values(): assembly_accession = analysis_data['assembly_accession'] @@ -440,32 +518,36 @@ def remapping_check_report(self, target_assembly): asm_res[assembly_accession]['remapping_ingestion_result'] = remapping_ingestion_result asm_res[assembly_accession]['remapping_ingestion_error'] = remapping_ingestion_error - self._remapping_check_result = 'PASS' + result = 'PASS' - report = f"""remapping result of assemblies:""" + report_lines = [] for asm, res in asm_res.items(): vcf_ext_res = res['vcf_extractor_result'] vcf_ext_err = 'No Error' if res['vcf_extractor_error'] == "" else res['vcf_extractor_error'] remap_ingest_res = res['remapping_ingestion_result'] - remap_ingest_err = 'No Error' if res['remapping_ingestion_error'] == "" else res['remapping_ingestion_error'] + remap_ingest_err = 'No Error' if res['remapping_ingestion_error'] == "" \ + else res['remapping_ingestion_error'] if vcf_ext_res == 'FAIL' or remap_ingest_res == 'FAIL': - self._remapping_check_result = 'FAIL' - - report += f""" - {asm}: - - vcf_extractor_result : {vcf_ext_res} - {vcf_ext_err} - - remapping_ingestion_result: {remap_ingest_res} - {remap_ingest_err} - """ + result = 'FAIL' - return report + report_lines.append(f"""Source assembly {asm}: + - vcf_extractor_result : {vcf_ext_res} - {vcf_ext_err} + - remapping_ingestion_result: {remap_ingest_res} - {remap_ingest_err}""") + return result, '\n '.join(report_lines) - def backpropagation_check_report(self, target_assembly): + def backpropagation_check_report(self): + target_assembly = self.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') + if not target_assembly: + result = 'DID NOT RUN' + report = """N/A""" + return result, report asm_res = defaultdict(dict) for analysis_data in self.analyses.values(): assembly_accession = analysis_data['assembly_accession'] if assembly_accession != target_assembly: backpropagation_pass, backpropagation_error = self._find_log_and_check_job( - assembly_accession, f"{target_assembly}_backpropagate_to_{assembly_accession}.log", "backpropagation" + assembly_accession, f"{target_assembly}_backpropagate_to_{assembly_accession}.log", + "backpropagation" ) asm_res[assembly_accession]['result'] = 'PASS' if backpropagation_pass else 'FAIL' asm_res[assembly_accession]['error'] = backpropagation_error @@ -473,126 +555,104 @@ def backpropagation_check_report(self, target_assembly): asm_res[assembly_accession]['result'] = "SKIP" asm_res[assembly_accession]['error'] = "" - self._backpropagation_check_result = 'PASS' + result = 'PASS' - report = f"""backpropagation result of assemblies:""" - for asm, result in asm_res.items(): - res = result['result'] - err = 'No Error' if result['error'] == '' else result['error'] + report_lines = [] + for asm, bckp_result in asm_res.items(): + res = bckp_result['result'] + err = 'No Error' if bckp_result['error'] == '' else bckp_result['error'] if res == 'FAIL': - self._backpropagation_check_result = 'FAIL' - report += f""" - {asm}: {res} - {err}""" - - return report + result = 'FAIL' + report_lines.append(f"""Backpropagation result to {asm}: {res} - {err}""") - def check_if_remapping_and_clustering_finished_successfully(self): - target_assembly = self.eload_cfg.query('ingestion', 'remap_and_cluster', 'target_assembly') - if not target_assembly: - self._remapping_check_result = "FAIL" - self._clustering_check_result = "FAIL" - self._backpropagation_check_result = "FAIL" - return f""" - clustering check: {self._clustering_check_result} - remapping check: {self._remapping_check_result} - backpropagation check: {self._backpropagation_check_result} - Remapping and clustering have not run for this study (or eload configuration file is missing taxonomy) - Note: This results might not be accurate for older studies. It is advisable to checks those manually - """ - else: - clustering_check_report = self.clustering_check_report(target_assembly) - remapping_check_report = self.remapping_check_report(target_assembly) - backpropagation_check_report = self.backpropagation_check_report(target_assembly) - return f""" - clustering check: {self._clustering_check_result} - {clustering_check_report} - remapping check: {self._remapping_check_result} - {remapping_check_report} - backpropagation check: {self._backpropagation_check_result} - {backpropagation_check_report} - """ + return result, '\n '.join(report_lines) def run_qc_checks_for_submission(self): - """Collect information from different qc methods and write the report.""" - vcf_files = [] - for analysis_data in self.analyses.values(): - for v_files in analysis_data['vcf_files'].values(): - vcf_files.append(os.path.basename(v_files['output_vcf_file'])) - - browsable_files_report = self.check_if_browsable_files_entered_correctly_in_db(vcf_files) + """Collect information from different qc methods format and write the report.""" + browsable_files_result, browsable_files_report = self.check_if_browsable_files_entered_correctly_in_db() # No accessioning check is required for human - if self.taxonomy != 9606: - accessioning_job_report = self.check_if_accessioning_completed_successfully(vcf_files) - variants_skipped_report = self.check_if_variants_were_skipped_while_accessioning(vcf_files) - else: - self._accessioning_job_check_result = 'N/A - Human Taxonomy' - self._variants_skipped_accessioning_check_result = 'N/A - Human Taxonomy' - accessioning_job_report = f""" - pass: {self._accessioning_job_check_result}""" - variants_skipped_report = f""" - pass: {self._variants_skipped_accessioning_check_result}""" - - variant_load_report = self.check_if_variant_load_completed_successfully(vcf_files) + accessioning_job_result, accessioning_job_report = self.check_if_accessioning_completed_successfully() + variants_skipped_result, variants_skipped_report = self.check_if_variants_were_skipped_while_accessioning() - remapping_and_clustering_report = self.check_if_remapping_and_clustering_finished_successfully() + variant_load_result, variant_load_report = self.check_if_variant_load_completed_successfully() + annotation_result, annotation_report = self.check_if_vep_completed_successfully() + variant_statistic_result, variant_statistic_report = self.check_if_variant_statistic_completed_successfully() + study_statistic_result, study_statistic_report = self.check_if_variant_statistic_completed_successfully() + acc_import_result, acc_import_report = self.check_if_acc_load_completed_successfully() - ftp_report = self.check_all_browsable_files_are_available_in_ftp(vcf_files) + clustering_check_result, clustering_check_report = self.clustering_check_report() + remapping_check_result, remapping_check_report = self.remapping_check_report() + backpropagation_check_result, backpropagation_check_report = self.backpropagation_check_report() + ftp_check_result, ftp_check_report = self.check_all_browsable_files_are_available_in_ftp() - study_report = self.check_if_study_appears() + study_check_result, study_check_report = self.check_if_study_appears() - study_metadata_report = self.check_if_study_appears_in_metadata() + study_metadata_check_result, study_metadata_check_report = self.check_if_study_appears_in_metadata() report = f""" QC Result Summary: ------------------ - Browsable files check: {self._browsable_files_check_result} - Accessioning job check: {self._accessioning_job_check_result} - Variants Skipped accessioning check: {self._variants_skipped_accessioning_check_result} + Browsable files check: {browsable_files_result} + Accessioning job check: {accessioning_job_result} + Variants Skipped accessioning check: {variants_skipped_result} Variant load and Accession Import check: - Variant load check: {self._load_vcf_job_check_result} - Annotation check: {self._annotation_job_check_result} - Statistics check: {self._statistics_job_check_result} - Accession Import check: {self._acc_import_job_check_result} + Variant load check: {variant_load_result} + Annotation check: {annotation_result} + Variant Statistics check: {variant_statistic_result} + Study Statistics check: {study_statistic_result} + Accession Import check: {acc_import_result} Remapping and Clustering Check: - Clustering check: {self._clustering_check_result} - Remapping check: {self._remapping_check_result} - Back-propogation check: {self._backpropagation_check_result} - FTP check: {self._ftp_check_result} - Study check: {self._study_check_result} - Study metadata check: {self._study_metadata_check_result} + Remapping check: {remapping_check_result} + Clustering check: {clustering_check_result} + Back-propogation check: {backpropagation_check_result} + FTP check: {ftp_check_result} + Study check: {study_check_result} + Study metadata check: {study_metadata_check_result} + + QC Details: ---------------------------------- - Browsable files check: - {browsable_files_report} + {browsable_files_report} --------------------------------- - Accessioning job check: - {accessioning_job_report} + {accessioning_job_report} ---------------------------------- - Variants skipped check: - {variants_skipped_report} + {variants_skipped_report} ---------------------------------- - Variant load check: - {variant_load_report} + {variant_load_report} ---------------------------------- - - Remapping and Clustering check: - {remapping_and_clustering_report} + Annotation check: + {annotation_report} + ---------------------------------- + Variant Statistics check: + {variant_statistic_report} + ---------------------------------- + Study Statistics check: + {study_statistic_report} + ---------------------------------- + Accession Import check: + {acc_import_report} + ---------------------------------- + Remapping Check: + {remapping_check_report} + ---------------------------------- + Clustering check: + {clustering_check_report} + ---------------------------------- + Backpropagation check: + {backpropagation_check_report} ---------------------------------- - FTP check: - {ftp_report} + {ftp_check_report} ---------------------------------- - Study check: - {study_report} + {study_check_report} ---------------------------------- - Study metadata check: - {study_metadata_report} + {study_metadata_check_report} ---------------------------------- """ diff --git a/tests/resources/eloads/ELOAD_103/.ELOAD_103_config.yml b/tests/resources/eloads/ELOAD_103/.ELOAD_103_config.yml index 6bb3025a..0bc6f7fb 100644 --- a/tests/resources/eloads/ELOAD_103/.ELOAD_103_config.yml +++ b/tests/resources/eloads/ELOAD_103/.ELOAD_103_config.yml @@ -9,9 +9,19 @@ brokering: csi: tests/resources/eloads/ELOAD_103/18_brokering/ena/test1.vcf.gz.csi index: tests/resources/eloads/ELOAD_103/18_brokering/ena/test1.vcf.gz.tbi output_vcf_file: tests/resources/eloads/ELOAD_103/18_brokering/ena/test1.vcf.gz + Analysis alias test2: + assembly_accession: GCA_000003205.1 + assembly_fasta: fasta.fa + assembly_report: assembly_report.txt + vcf_files: + tests/resources/eloads/ELOAD_103/18_brokering/ena/test2.vcf.gz: + csi: tests/resources/eloads/ELOAD_103/18_brokering/ena/test2.vcf.gz.csi + index: tests/resources/eloads/ELOAD_103/18_brokering/ena/test2.vcf.gz.tbi + output_vcf_file: tests/resources/eloads/ELOAD_103/18_brokering/ena/test2.vcf.gz ena: ANALYSIS: Analysis alias test: ERZ2499196 + Analysis alias test2: ERZ2499197 PROJECT: PRJEB33333 hold_date: 2021-01-04 ingestion: @@ -26,6 +36,7 @@ submission: assembly_report: assembly_report.txt vcf_files: - tests/resources/eloads/ELOAD_103/18_brokering/ena/test1.vcf.gz + - tests/resources/eloads/ELOAD_103/18_brokering/ena/test2.vcf.gz scientific_name: Equus caballus taxonomy_id: 9796 validation: diff --git a/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_eva_remapped.vcf_ingestion.log b/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_eva_remapped.vcf_ingestion.log new file mode 100644 index 00000000..63547898 --- /dev/null +++ b/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_eva_remapped.vcf_ingestion.log @@ -0,0 +1,9 @@ + +2023-04-15 23:11:31.163 INFO 1111439 --- [ main] u.a.e.eva.remapping.ingest.Application : Started Application in 15.137 seconds (JVM running for 19.408) +2023-04-15 23:11:32.288 INFO 1111439 --- [ main] .e.r.i.r.IngestRemappedCommandLineRunner : Running job 'INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB' with parameters: +2023-04-15 23:11:32.450 INFO 1111439 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] launched with the following parameters: [] +2023-04-15 23:11:32.502 INFO 1111439 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [STORE_REMAPPING_METADATA_STEP] +2023-04-15 23:11:33.071 INFO 1111439 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [INGEST_REMAPPED_VARIANTS_FROM_VCF_STEP] +2023-04-15 23:12:05.681 INFO 1111439 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : INGEST_REMAPPED_VARIANTS_FROM_VCF_STEP: Items read = 10572, items written = 10572 +2023-04-15 23:12:05.681 INFO 1111439 --- [ main] u.a.e.e.r.i.b.l.RemappingIngestCounts : Step INGEST_REMAPPED_VARIANTS_FROM_VCF_STEP finished: Items (remapped ss) read = 10572, ss ingested = 1, ss skipped (duplicate) = 10569, ss discarded from db = 2 +2023-04-15 23:12:05.688 INFO 1111439 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=INGEST_REMAPPED_VARIANTS_FROM_VCF_JOB]] completed with the following parameters: [] and the following status: [COMPLETED] \ No newline at end of file diff --git a/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_vcf_extractor.log b/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_vcf_extractor.log new file mode 100644 index 00000000..6f08bf03 --- /dev/null +++ b/tests/resources/projects/PRJEB33333/00_logs/GCA_000003205.1_vcf_extractor.log @@ -0,0 +1,17 @@ +2023-04-15 21:23:21.566 INFO 2255770 --- [ main] u.a.e.eva.remapping.source.Application : Started Application in 53.64 seconds (JVM running for 58.383) +2023-04-15 21:23:21.891 INFO 2255770 --- [ main] ionRemappingJobLauncherCommandLineRunner : Running job 'EXPORT_SUBMITTED_VARIANTS_JOB' with parameters: +2023-04-15 21:23:22.049 INFO 2255770 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] launched with the following parameters: [] +2023-04-15 21:23:22.091 INFO 2255770 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [EXPORT_EVA_SUBMITTED_VARIANTS_STEP] +2023-04-15 21:23:42.753 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_EVA_SUBMITTED_VARIANTS_STEP: Items read = 1000, items written = 1000 +2023-04-15 21:23:42.811 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_EVA_SUBMITTED_VARIANTS_STEP: Items read = 2000, items written = 2000 +2023-04-15 21:23:43.594 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_EVA_SUBMITTED_VARIANTS_STEP: Items read = 26000, items written = 26000 +2023-04-15 21:23:43.621 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_EVA_SUBMITTED_VARIANTS_STEP: Items read = 27000, items written = 27000 +2023-04-15 21:23:43.634 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_EVA_SUBMITTED_VARIANTS_STEP: Items read = 27408, items written = 27408 +2023-04-15 21:23:43.634 INFO 2255770 --- [ main] u.a.e.e.r.s.b.l.ExcludeVariantsListener : Processors filtered out 0 variants (possible reason is that they were named variants) +2023-04-15 21:23:43.634 WARN 2255770 --- [ main] u.a.e.e.r.s.b.l.ExcludeVariantsListener : Processors skipped 0 variants because the start position is greater than the chromosome end +2023-04-15 21:23:43.703 INFO 2255770 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [EXPORT_DBSNP_SUBMITTED_VARIANTS_STEP] +2023-04-15 21:23:43.707 INFO 2255770 --- [ main] SubmittedVariantMongoReaderConfiguration : Injecting DbsnpSubmittedVariantMongoReader with parameters: uk.ac.ebi.eva.remapping.source.parameters.InputParameters@29f7cefd55361"] }, "tax" : 9915 }, Fields: { }, Sort: { } +2023-04-15 21:33:52.932 INFO 2255770 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : EXPORT_DBSNP_SUBMITTED_VARIANTS_STEP: Items read = 0, items written = 0 +2023-04-15 21:33:52.932 INFO 2255770 --- [ main] u.a.e.e.r.s.b.l.ExcludeVariantsListener : Processors filtered out 0 variants (possible reason is that they were named variants) +2023-04-15 21:33:52.932 WARN 2255770 --- [ main] u.a.e.e.r.s.b.l.ExcludeVariantsListener : Processors skipped 0 variants because the start position is greater than the chromosome end +2023-04-15 21:33:52.940 INFO 2255770 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=EXPORT_SUBMITTED_VARIANTS_JOB]] completed with the following parameters: [] and the following status: [COMPLETED] \ No newline at end of file diff --git a/tests/resources/projects/PRJEB33333/00_logs/GCA_000247795.2_backpropagate_to_GCA_000003205.1.log b/tests/resources/projects/PRJEB33333/00_logs/GCA_000247795.2_backpropagate_to_GCA_000003205.1.log new file mode 100644 index 00000000..06ee7717 --- /dev/null +++ b/tests/resources/projects/PRJEB33333/00_logs/GCA_000247795.2_backpropagate_to_GCA_000003205.1.log @@ -0,0 +1,12 @@ +2023-04-24 07:45:36.133 INFO 1094758 --- [ main] u.a.e.e.a.clustering.Application : Started Application in 41.564 seconds (JVM running for 43.354) +2023-04-24 07:45:36.267 INFO 1094758 --- [ main] .a.e.e.a.c.r.ClusteringCommandLineRunner : Running job 'BACK_PROPAGATE_NEW_RS_JOB' with parameters: +2023-04-24 07:45:36.316 INFO 1094758 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] launched with the following parameters: [ +2023-04-24 07:45:36.355 INFO 1094758 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [BACK_PROPAGATE_NEW_RS_STEP] +2023-04-24 07:46:58.119 INFO 1094758 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : BACK_PROPAGATE_NEW_RS_STEP: Items read = 28800, items written = 28800 +2023-04-24 07:46:58.229 INFO 1094758 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : BACK_PROPAGATE_NEW_RS_STEP: Items read = 28811, items written = 28811 +2023-04-24 07:46:58.235 INFO 1094758 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Count{id=null, process='clustering', identifier='{"assembly":"GCA_000247795.2","projects":["PRJEB55361"]}', metric='submitted_variants_ss_split', count=0} +2023-04-24 07:46:58.236 INFO 1094758 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Count{id=null, process='clustering', identifier='{"assembly":"GCA_000247795.2","projects":["PRJEB55361"]}', metric='submitted_variants_deprecated', count=0} +2023-04-24 07:46:58.745 INFO 1094758 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Metric Count successfully saved In DB +2023-04-24 07:46:58.753 INFO 1094758 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=BACK_PROPAGATE_NEW_RS_JOB]] completed with the following parameters: [] and the following status: [COMPLETED] +2023-04-24 07:46:58.759 INFO 1094758 --- [ main] o.s.b.f.support.DisposableBeanAdapter : Destroy method 'close' on bean with name 'REMAPPED_RS_READER' threw an exception: java.lang.NullPointerException +2023-04-24 07:46:58.760 INFO 1094758 --- [ main] o.s.b.f.support.DisposableBeanAdapter : Destroy method 'close' on bean with name 'QC_REMAPPED_SS_READER' threw an exception: java.lang.NullPointerException \ No newline at end of file diff --git a/tests/resources/projects/PRJEB33333/00_logs/accessioning.test2.vcf.gz.log b/tests/resources/projects/PRJEB33333/00_logs/accessioning.test2.vcf.gz.log new file mode 100644 index 00000000..680c2bbf --- /dev/null +++ b/tests/resources/projects/PRJEB33333/00_logs/accessioning.test2.vcf.gz.log @@ -0,0 +1,21 @@ +2023-04-15 21:13:02.919 INFO 2638664 --- [ main] u.a.e.e.accession.pipeline.Application : Started Application in 30.494 seconds (JVM running for 32.778) +2023-04-15 21:13:07.048 INFO 2638664 --- [ main] EvaAccessionJobLauncherCommandLineRunner : Running job 'CREATE_SUBSNP_ACCESSION_JOB' with parameters: +2023-04-15 21:13:07.101 INFO 2638664 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] launched with the following parameters: [] +2023-04-15 21:13:07.141 INFO 2638664 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [CREATE_SUBSNP_ACCESSION_STEP] +2023-04-15 21:13:08.311 INFO 2638664 --- [ main] .a.p.c.b.p.VariantProcessorConfiguration : Injecting VariantProcessor with parameters: uk.ac.ebi.eva.accession.pipeline.parameters.InputParameters@54c5a2ff +2023-04-15 21:13:09.860 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 100, items written = 100 +2023-04-15 21:13:10.620 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 200, items written = 200 +2023-04-15 21:13:11.776 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 300, items written = 300 +023-04-15 21:17:06.632 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 27100, items written = 27100 +2023-04-15 21:17:07.320 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 27200, items written = 27200 +2023-04-15 21:17:07.777 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 27300, items written = 27300 +2023-04-15 21:17:08.503 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 27400, items written = 27400 +2023-04-15 21:17:08.628 INFO 2638664 --- [ main] u.a.e.e.a.c.b.l.GenericProgressListener : CREATE_SUBSNP_ACCESSION_STEP: Items read = 27408, items written = 27408 +2023-04-15 21:17:08.635 INFO 2638664 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Count{id=null, process='accessioning_warehouse_ingestion', identifier='{"assembly":"GCA_000003205.6","study":"PRJEB55361"}', metric='duplicate_variants', count=0} +2023-04-15 21:17:08.635 INFO 2638664 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Count{id=null, process='accessioning_warehouse_ingestion', identifier='{"assembly":"GCA_000003205.6","study":"PRJEB55361"}', metric='discarded_variants', count=0} +2023-04-15 21:17:09.154 INFO 2638664 --- [ main] u.a.e.eva.metrics.metric.MetricCompute : Metric Count successfully saved In DB +2023-04-15 21:17:09.199 INFO 2638664 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [BUILD_REPORT_STEP] +2023-04-15 21:17:10.402 INFO 2638664 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [CHECK_SUBSNP_ACCESSION_STEP] +2023-04-15 21:17:10.969 INFO 2638664 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [SimpleJob: [name=CREATE_SUBSNP_ACCESSION_JOB]] completed with the following parameters: [] and the following status: [COMPLETED] +2023-04-15 21:17:10.982 INFO 2638664 --- [ main] org.mongodb.driver.connection : Closed connection [connectionId{localValue:2}] to mongos-hx-eva-pro-001.ebi.ac.uk:27017 because the pool has been closed. +2023-04-15 21:17:10.983 INFO 2638664 --- [ main] j.LocalContainerEntityManagerFactoryBean : Closing JPA EntityManagerFactory for persistence unit 'default' diff --git a/tests/resources/projects/PRJEB33333/00_logs/pipeline.test2.vcf.gz.log b/tests/resources/projects/PRJEB33333/00_logs/pipeline.test2.vcf.gz.log new file mode 100644 index 00000000..cfa7627b --- /dev/null +++ b/tests/resources/projects/PRJEB33333/00_logs/pipeline.test2.vcf.gz.log @@ -0,0 +1,5 @@ +2023-04-24 10:47:07.917 INFO 384460 --- [ main] uk.ac.ebi.eva.pipeline.Application : Started Application in 5.971 seconds (JVM running for 7.569) +2023-04-24 10:51:39.462 INFO 384460 --- [ main] .EvaPipelineJobLauncherCommandLineRunner : Running job 'genotyped-vcf-job' with parameters: +2023-04-24 10:51:39.777 INFO 384460 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [FlowJob: [name=genotyped-vcf-job]] launched with the following parameters: [] +2023-04-24 10:51:39.883 INFO 384460 --- [ main] o.s.batch.core.job.SimpleStepHandler : Executing step: [load-variants-step] +2023-04-24 10:52:01.350 INFO 384460 --- [ main] o.s.b.c.l.support.SimpleJobLauncher : Job: [FlowJob: [name=genotyped-vcf-job]] completed with the following parameters: [] and the following status: [COMPLETED] \ No newline at end of file diff --git a/tests/test_submission_qc.py b/tests/test_submission_qc.py index 8a8beb41..10fdea06 100644 --- a/tests/test_submission_qc.py +++ b/tests/test_submission_qc.py @@ -78,18 +78,23 @@ def test_submission_qc_checks_passed(self): patch('eva_submission.submission_qc_checks.FTP.cwd'), \ patch('eva_submission.submission_qc_checks.FTP.nlst') as m_ftp_nlst, \ patch('eva_submission.submission_qc_checks.requests.get') as m_get, \ - patch('eva_submission.submission_qc_checks.get_all_results_for_query') as m_get_browsable_files: - m_get_browsable_files.side_effect = [[['test1.vcf.gz'], ['test2.vcf.gz']], [[['Homo Sapiens']]]] + patch('eva_submission.submission_qc_checks.get_all_results_for_query') as m_get_all_results_for_query: + m_get_all_results_for_query.side_effect = [[['test1.vcf.gz'], ['test2.vcf.gz']], [['ecaballus_30']], [['ecaballus_30']]] + json_with_id = { + "response": [{"numResults": 1, "numTotalResults": 1, "result": [{"id": "PRJEB33333"}]}] + } + json_with_project_id = { + "response": [{"numResults": 1, "numTotalResults": 1, "result": [{"studyId": "PRJEB33333"}]}] + } m_get.side_effect = [ - self._mock_response(json_data={ - "response": [{"numResults": 1, "numTotalResults": 1, "result": [{"id": "PRJEB33333"}]}] - }), - self._mock_response(json_data={ - "response": [{"numResults": 1, "numTotalResults": 1, "result": [{"studyId": "PRJEB33333"}]}] - }) + self._mock_response(json_data=json_with_id), + self._mock_response(json_data=json_with_project_id), + self._mock_response(json_data=json_with_project_id) ] m_ftp_nlst.return_value = ['test1.vcf.gz', 'test1.vcf.gz.csi', 'test1.vcf.csi', 'test1.accessioned.vcf.gz', - 'test1.accessioned.vcf.gz.csi', 'test1.accessioned.vcf.csi'] + 'test1.accessioned.vcf.gz.csi', 'test1.accessioned.vcf.csi', 'test2.vcf.gz', + 'test2.vcf.gz.csi', 'test2.vcf.csi', 'test2.accessioned.vcf.gz', + 'test2.accessioned.vcf.gz.csi', 'test2.accessioned.vcf.csi'] self.assertEqual(self.expected_report_of_eload_103(), self.eload.run_qc_checks_for_submission()) def test_submission_qc_checks_missing_files(self): @@ -114,6 +119,12 @@ def test_submission_qc_checks_missing_files(self): 'test1.accessioned.vcf.gz.csi', 'test1.accessioned.vcf.csi'] self.assertEqual(self.expected_report_of_eload_104(), self.eload.run_qc_checks_for_submission()) + def test_check_if_variant_load_completed_successfully(self): + self.eload = EloadQC(103) + result, report = self.eload.check_if_variant_load_completed_successfully() + assert result == 'PASS' + assert report == 'Success: PASS' + def expected_report_of_eload_101(self): return """ QC Result Summary: @@ -124,82 +135,80 @@ def expected_report_of_eload_101(self): Variant load and Accession Import check: Variant load check: FAIL Annotation check: FAIL - Statistics check: FAIL + Variant Statistics check: FAIL + Study Statistics check: FAIL Accession Import check: FAIL Remapping and Clustering Check: - Clustering check: FAIL - Remapping check: FAIL - Back-propogation check: FAIL + Remapping check: DID NOT RUN + Clustering check: DID NOT RUN + Back-propogation check: DID NOT RUN FTP check: FAIL Study check: FAIL Study metadata check: FAIL + + QC Details: ---------------------------------- - Browsable files check: - - pass : FAIL - expected files: ['test1.vcf.gz', 'test2.vcf.gz'] - missing files: {'test2.vcf.gz'} + Success : FAIL + Expected files: ['test1.vcf.gz', 'test2.vcf.gz'] + Missing files: {'test2.vcf.gz'} --------------------------------- - Accessioning job check: - - pass: FAIL - failed_files: - test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz - test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz + Success: FAIL + failed_files: + test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz + test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz ---------------------------------- - Variants skipped check: - - pass: PASS with Warning (Manual Check Required) - failed_files: - test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz - test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz + Success: PASS with Warning (Manual Check Required) + Failures: + test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz + test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz ---------------------------------- - Variant load check: - - vcf load result: FAIL - annotation result: FAIL - statistics result: FAIL - accession import result: FAIL - Failed Files: - test1.vcf.gz: - load_vcf error : No load_vcf log file found for test1.vcf.gz - acc_import error : No acc_import log file found for test1.vcf.gz - test2.vcf.gz: - load_vcf error : No load_vcf log file found for test2.vcf.gz - acc_import error : No acc_import log file found for test2.vcf.gz - Failed Analysis: - ERZ2499196: - annotate_variants error : No annotate_variants log file found for ERZ2499196 - calculate_statistics error : No calculate_statistics log file found for ERZ2499196 + Success: FAIL + Errors: + test1.vcf.gz - load_vcf error : No load_vcf log file found for test1.vcf.gz + test2.vcf.gz - load_vcf error : No load_vcf log file found for test2.vcf.gz ---------------------------------- - - Remapping and Clustering check: - - clustering check: FAIL - remapping check: FAIL - backpropagation check: FAIL - Remapping and clustering have not run for this study (or eload configuration file is missing taxonomy) - Note: This results might not be accurate for older studies. It is advisable to checks those manually - + Annotation check: + Success: FAIL + Errors: + ERZ2499196 - annotate_variants error : No annotate_variants log file found for ERZ2499196 + ---------------------------------- + Variant Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Study Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Accession Import check: + Success: FAIL + Errors: + test1.vcf.gz - acc_import error : No acc_import log file found for test1.vcf.gz + test2.vcf.gz - acc_import error : No acc_import log file found for test2.vcf.gz + ---------------------------------- + Remapping Check: + N/A + ---------------------------------- + Clustering check: + N/A + ---------------------------------- + Backpropagation check: + N/A ---------------------------------- - FTP check: - - Error: No files found in FTP for study PRJEB11111 + Error: No files found in FTP for study PRJEB11111 ---------------------------------- - Study check: - - pass: FAIL + Success: FAIL ---------------------------------- - Study metadata check: - - pass: FAIL + Success: FAIL missing assemblies: ["['Homo Sapiens'](GCA_000001000.1)"] ---------------------------------- """ @@ -214,83 +223,81 @@ def expected_report_of_eload_102(self): Variant load and Accession Import check: Variant load check: FAIL Annotation check: FAIL - Statistics check: FAIL + Variant Statistics check: FAIL + Study Statistics check: FAIL Accession Import check: FAIL Remapping and Clustering Check: - Clustering check: FAIL - Remapping check: FAIL - Back-propogation check: FAIL + Remapping check: DID NOT RUN + Clustering check: DID NOT RUN + Back-propogation check: DID NOT RUN FTP check: FAIL Study check: FAIL Study metadata check: FAIL + + QC Details: ---------------------------------- - Browsable files check: - - pass : PASS - expected files: ['test1.vcf.gz', 'test2.vcf.gz'] - missing files: None + Success : PASS + Expected files: ['test1.vcf.gz', 'test2.vcf.gz'] + Missing files: None --------------------------------- - Accessioning job check: - - pass: FAIL - failed_files: - test1.vcf.gz - failed job/step : CREATE_SUBSNP_ACCESSION_STEP - test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz + Success: FAIL + failed_files: + test1.vcf.gz - failed job/step : CREATE_SUBSNP_ACCESSION_STEP + test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz ---------------------------------- - Variants skipped check: - - pass: PASS with Warning (Manual Check Required) - failed_files: - test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz + Success: PASS with Warning (Manual Check Required) + Failures: + test2.vcf.gz - Accessioning Error : No accessioning file found for test2.vcf.gz ---------------------------------- - Variant load check: - - vcf load result: FAIL - annotation result: FAIL - statistics result: FAIL - accession import result: FAIL - Failed Files: - test1.vcf.gz: - load_vcf error : No load_vcf log file found for test1.vcf.gz - acc_import failed job/step : accession-import-step - test2.vcf.gz: - load_vcf error : No load_vcf log file found for test2.vcf.gz - acc_import error : No acc_import log file found for test2.vcf.gz - Failed Analysis: - ERZ2499196: - annotate_variants error : No annotate_variants log file found for ERZ2499196 - calculate_statistics error : No calculate_statistics log file found for ERZ2499196 + Success: FAIL + Errors: + test1.vcf.gz - load_vcf error : No load_vcf log file found for test1.vcf.gz + test2.vcf.gz - load_vcf error : No load_vcf log file found for test2.vcf.gz ---------------------------------- - - Remapping and Clustering check: - - clustering check: FAIL - remapping check: FAIL - backpropagation check: FAIL - Remapping and clustering have not run for this study (or eload configuration file is missing taxonomy) - Note: This results might not be accurate for older studies. It is advisable to checks those manually - + Annotation check: + Success: FAIL + Errors: + ERZ2499196 - annotate_variants error : No annotate_variants log file found for ERZ2499196 + ---------------------------------- + Variant Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Study Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Accession Import check: + Success: FAIL + Errors: + test1.vcf.gz - acc_import failed job/step : accession-import-step + test2.vcf.gz - acc_import error : No acc_import log file found for test2.vcf.gz + ---------------------------------- + Remapping Check: + N/A + ---------------------------------- + Clustering check: + N/A + ---------------------------------- + Backpropagation check: + N/A ---------------------------------- - FTP check: - - pass: FAIL - missing files: ['test1.vcf.gz', 'test1.accessioned.vcf.gz', 'test2.vcf.gz', 'test2.vcf.gz.csi or test2.vcf.csi', 'test2.accessioned.vcf.gz', 'test2.accessioned.vcf.gz.csi or test2.accessioned.vcf.csi'] + Success: FAIL + Missing files: ['test1.vcf.gz', 'test1.accessioned.vcf.gz', 'test2.vcf.gz', 'test2.vcf.gz.csi or test2.vcf.csi', 'test2.accessioned.vcf.gz', 'test2.accessioned.vcf.gz.csi or test2.accessioned.vcf.csi'] ---------------------------------- - Study check: - - pass: FAIL + Success: FAIL ---------------------------------- - Study metadata check: - - pass: FAIL - missing assemblies: ["[\'Homo Sapiens\'](GCA_000001000.1)"] + Success: FAIL + missing assemblies: ["['Homo Sapiens'](GCA_000001000.1)"] ---------------------------------- """ @@ -303,77 +310,71 @@ def expected_report_of_eload_103(self): Variants Skipped accessioning check: PASS Variant load and Accession Import check: Variant load check: PASS - Annotation check: PASS - Statistics check: PASS + Annotation check: SKIP + Variant Statistics check: PASS + Study Statistics check: PASS Accession Import check: PASS Remapping and Clustering Check: - Clustering check: PASS Remapping check: PASS + Clustering check: PASS Back-propogation check: PASS FTP check: PASS Study check: PASS Study metadata check: PASS + + QC Details: ---------------------------------- - Browsable files check: - - pass : PASS - expected files: ['test1.vcf.gz'] - missing files: None + Success : PASS + Expected files: ['test1.vcf.gz', 'test2.vcf.gz'] + Missing files: None --------------------------------- - Accessioning job check: - - pass: PASS + Success: PASS ---------------------------------- - Variants skipped check: - - pass: PASS + Success: PASS ---------------------------------- - Variant load check: - - vcf load result: PASS - annotation result: PASS - statistics result: PASS - accession import result: PASS + Success: PASS ---------------------------------- - - Remapping and Clustering check: - - clustering check: PASS - Clustering Job: PASS - - Clustering QC Job: PASS - - - remapping check: PASS - remapping result of assemblies: - GCA_000003205.6: - - vcf_extractor_result : PASS - No Error - - remapping_ingestion_result: PASS - No Error - - backpropagation check: PASS - backpropagation result of assemblies: - GCA_000003205.6: PASS - No Error - + Annotation check: + annotation result - SKIPPED + ---------------------------------- + Variant Statistics check: + Success: PASS + ---------------------------------- + Study Statistics check: + Success: PASS + ---------------------------------- + Accession Import check: + Success: PASS + ---------------------------------- + Remapping Check: + Source assembly GCA_000003205.6: + - vcf_extractor_result : PASS - No Error + - remapping_ingestion_result: PASS - No Error + Source assembly GCA_000003205.1: + - vcf_extractor_result : PASS - No Error + - remapping_ingestion_result: PASS - No Error + ---------------------------------- + Clustering check: + Clustering Job: PASS - No error + Clustering QC Job: PASS - No error + ---------------------------------- + Backpropagation check: + Backpropagation result to GCA_000003205.6: PASS - No Error + Backpropagation result to GCA_000003205.1: PASS - No Error ---------------------------------- - FTP check: - - pass: PASS - missing files: None + Success: PASS + Missing files: None ---------------------------------- - Study check: - - pass: PASS + Success: PASS ---------------------------------- - Study metadata check: - - pass: PASS + Success: PASS missing assemblies: None ---------------------------------- """ @@ -388,88 +389,80 @@ def expected_report_of_eload_104(self): Variant load and Accession Import check: Variant load check: FAIL Annotation check: FAIL - Statistics check: FAIL + Variant Statistics check: FAIL + Study Statistics check: FAIL Accession Import check: FAIL Remapping and Clustering Check: - Clustering check: FAIL Remapping check: FAIL + Clustering check: FAIL Back-propogation check: FAIL FTP check: PASS Study check: PASS Study metadata check: PASS + + QC Details: ---------------------------------- - Browsable files check: - - pass : PASS - expected files: ['test1.vcf.gz'] - missing files: None + Success : PASS + Expected files: ['test1.vcf.gz'] + Missing files: None --------------------------------- - Accessioning job check: - - pass: FAIL - failed_files: - test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz + Success: FAIL + failed_files: + test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz ---------------------------------- - Variants skipped check: - - pass: PASS with Warning (Manual Check Required) - failed_files: - test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz + Success: PASS with Warning (Manual Check Required) + Failures: + test1.vcf.gz - Accessioning Error : No accessioning file found for test1.vcf.gz ---------------------------------- - Variant load check: - - vcf load result: FAIL - annotation result: FAIL - statistics result: FAIL - accession import result: FAIL - Failed Files: - test1.vcf.gz: - load_vcf error : No load_vcf log file found for test1.vcf.gz - acc_import error : No acc_import log file found for test1.vcf.gz - Failed Analysis: - ERZ2499196: - annotate_variants error : No annotate_variants log file found for ERZ2499196 - calculate_statistics error : No calculate_statistics log file found for ERZ2499196 - ---------------------------------- - - Remapping and Clustering check: - - clustering check: FAIL - Clustering Job: FAIL - clustering error : No clustering log file found for GCA_000247795.2 - Clustering QC Job: FAIL - clustering_qc error : No clustering_qc log file found for GCA_000247795.2 - - remapping check: FAIL - remapping result of assemblies: - GCA_000003205.6: - - vcf_extractor_result : FAIL - vcf_extractor error : No vcf_extractor log file found for GCA_000003205.6 - - remapping_ingestion_result: FAIL - remapping_ingestion error : No remapping_ingestion log file found for GCA_000003205.6 - - backpropagation check: FAIL - backpropagation result of assemblies: - GCA_000003205.6: FAIL - backpropagation error : No backpropagation log file found for GCA_000003205.6 - + Success: FAIL + Errors: + test1.vcf.gz - load_vcf error : No load_vcf log file found for test1.vcf.gz + ---------------------------------- + Annotation check: + Success: FAIL + Errors: + ERZ2499196 - annotate_variants error : No annotate_variants log file found for ERZ2499196 + ---------------------------------- + Variant Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Study Statistics check: + Success: FAIL + Errors: + ERZ2499196 - variant-stats error : No variant-stats log file found for ERZ2499196 + ---------------------------------- + Accession Import check: + Success: FAIL + Errors: + test1.vcf.gz - acc_import error : No acc_import log file found for test1.vcf.gz + ---------------------------------- + Remapping Check: + Source assembly GCA_000003205.6: + - vcf_extractor_result : FAIL - vcf_extractor error : No vcf_extractor log file found for GCA_000003205.6 + - remapping_ingestion_result: FAIL - remapping_ingestion error : No remapping_ingestion log file found for GCA_000003205.6 + ---------------------------------- + Clustering check: + Clustering Job: FAIL - clustering error : No clustering log file found for GCA_000247795.2 + Clustering QC Job: FAIL - clustering_qc error : No clustering_qc log file found for GCA_000247795.2 + ---------------------------------- + Backpropagation check: + Backpropagation result to GCA_000003205.6: FAIL - backpropagation error : No backpropagation log file found for GCA_000003205.6 ---------------------------------- - FTP check: - - pass: PASS - missing files: None + Success: PASS + Missing files: None ---------------------------------- - Study check: - - pass: PASS + Success: PASS ---------------------------------- - Study metadata check: - - pass: PASS + Success: PASS missing assemblies: None ---------------------------------- """ \ No newline at end of file