From 034b837027788b7ca2920ed0e0da0ebc1fa6cd07 Mon Sep 17 00:00:00 2001 From: Michael Franklin Date: Mon, 25 Sep 2023 15:19:03 +1000 Subject: [PATCH] Resolve main conflicts (#556) --- metamist/audit/audithelper.py | 25 +++++------------------- scripts/find_sequence_files_to_delete.py | 19 ++++++------------ scripts/parse_existing_cohort.py | 4 ++-- scripts/sync_seqr.py | 8 ++------ 4 files changed, 15 insertions(+), 41 deletions(-) diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index 72fb56561..54f7318b6 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -1,36 +1,21 @@ # pylint: disable=no-member import csv -from collections import defaultdict import logging +import os +from collections import defaultdict from typing import Any from cloudpathlib import AnyPath from cpg_utils.cloud import get_path_components_from_gcp_path + from metamist.parser.cloudhelper import CloudHelper class AuditHelper(CloudHelper): """General helper class for bucket auditing""" - EXCLUDED_SGS = [ - 'CPG11783', # acute-care, no FASTQ data - 'CPG13409', # perth-neuro, coverage ~0x - 'CPG243717', # validation, NA12878_KCCG low coverage https://main-web.populationgenomics.org.au/validation/qc/cram/multiqc.html, - 'CPG246645', # ag-hidden, eof issue https://batch.hail.populationgenomics.org.au/batches/97645/jobs/440 - 'CPG246678', # ag-hidden, diff fastq size https://batch.hail.populationgenomics.org.au/batches/97645/jobs/446 - 'CPG261792', # rdp-kidney misformated fastq - https://batch.hail.populationgenomics.org.au/batches/378736/jobs/43 - # acute care fasq parsing errors https://batch.hail.populationgenomics.org.au/batches/379303/jobs/24 - 'CPG259150', - 'CPG258814', - 'CPG258137', - 'CPG258111', - 'CPG258012', - # ohmr4 cram parsing in align issues - 'CPG261339', - 'CPG261347', - # IBMDX truncated sample? https://batch.hail.populationgenomics.org.au/batches/422181/jobs/99 - 'CPG265876', - ] + # + EXCLUDED_SGS = os.getenv('SM_AUDIT_HELPER_EXCLUDED_SGS', '').split(',') @staticmethod def get_gcs_bucket_subdirs_to_search(paths: list[str]) -> defaultdict[str, list]: diff --git a/scripts/find_sequence_files_to_delete.py b/scripts/find_sequence_files_to_delete.py index 4a1ade899..700152183 100644 --- a/scripts/find_sequence_files_to_delete.py +++ b/scripts/find_sequence_files_to_delete.py @@ -11,14 +11,15 @@ # pylint: disable=W0703 -import os import asyncio import logging +import os from collections import defaultdict from typing import Any import click from google.cloud import storage + from metamist.apis import ProjectApi from metamist.graphql import query_async @@ -33,17 +34,7 @@ projapi = ProjectApi() # TODO: fetch this from metamist -CPG_SEQUENCING_GROUP_IDS_TO_SKIP = { - 'CPG11783', # acute-care, no FASTQ data - 'CPG255232', # perth-neuro: new - 'CPG255240', # perth-neuro: new - 'CPG253328', # perth-neuro, contamination rate 32% - 'CPG13409', # perth-neuro, coverage ~0x - 'CPG243717', - 'CPG246645', # ag-hidden, eof issue https://batch.hail.populationgenomics.org.au/batches/97645/jobs/440 - 'CPG246678', # ag-hidden, diff fastq size https://batch.hail.populationgenomics.org.au/batches/97645/jobs/446 - 'CPG246561', # ag-hidden, coverage ~0x https://main-web.populationgenomics.org.au/ag-hidden/qc/cram/multiqc.html -} +CPG_SEQUENCING_GROUP_IDS_TO_SKIP = {} def get_bucket_name_from_path(path_input): @@ -183,7 +174,9 @@ def find_existing_files_in_bucket( async def find_files_to_delete( - sequencing_types_to_remove: list[str], projects_to_ignore: list[str], output_path: str + sequencing_types_to_remove: list[str], + projects_to_ignore: list[str], + output_path: str, ): """ Get all the sequences across all the projects from metamist diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py index 1e85ffca4..7c6e322e7 100644 --- a/scripts/parse_existing_cohort.py +++ b/scripts/parse_existing_cohort.py @@ -90,8 +90,8 @@ def sequence_meta_map(): def fastq_file_name_to_sample_id(filename: str) -> str: """ - HG3FMDSX3_2_220208_FD02700641_Homo-sapiens_AACGAGGCCG-ATCCAGGTAT_R_220208_BINKAN1_PROPHECY_M002_R1. - -> 220208_FD02700641 + SOMEID1_2_220101_SAMPLEID_Homo-sapiens_AACGAGGCCG-ATCCAGGTAT_R_220101_FIRLAS1_PROJECT_M002_R1. + -> 220101_SAMPLEID """ return '_'.join(filename.split('_')[2:4]) diff --git a/scripts/sync_seqr.py b/scripts/sync_seqr.py index 12be0fd93..380217aa5 100644 --- a/scripts/sync_seqr.py +++ b/scripts/sync_seqr.py @@ -14,11 +14,7 @@ import yaml from cloudpathlib import AnyPath -from metamist.apis import ( - AnalysisApi, - ProjectApi, - SeqrApi, -) +from metamist.apis import AnalysisApi, ProjectApi, SeqrApi from metamist.graphql import query_async from metamist.model.analysis_query_model import AnalysisQueryModel from metamist.model.analysis_status import AnalysisStatus @@ -59,7 +55,7 @@ ), } -SGS_TO_IGNORE = {'CPG227355', 'CPG227397'} +SGS_TO_IGNORE = {} BASE, SEQR_AUDIENCE = ENVS[ENVIRONMENT] url_individuals_sync = '/api/project/sa/{projectGuid}/individuals/sync'