From 034b837027788b7ca2920ed0e0da0ebc1fa6cd07 Mon Sep 17 00:00:00 2001
From: Michael Franklin <illusional@users.noreply.github.com>
Date: Mon, 25 Sep 2023 15:19:03 +1000
Subject: [PATCH] Resolve main conflicts (#556)

---
 metamist/audit/audithelper.py            | 25 +++++-------------------
 scripts/find_sequence_files_to_delete.py | 19 ++++++------------
 scripts/parse_existing_cohort.py         |  4 ++--
 scripts/sync_seqr.py                     |  8 ++------
 4 files changed, 15 insertions(+), 41 deletions(-)

diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py
index 72fb56561..54f7318b6 100644
--- a/metamist/audit/audithelper.py
+++ b/metamist/audit/audithelper.py
@@ -1,36 +1,21 @@
 # pylint: disable=no-member
 import csv
-from collections import defaultdict
 import logging
+import os
+from collections import defaultdict
 from typing import Any
 
 from cloudpathlib import AnyPath
 from cpg_utils.cloud import get_path_components_from_gcp_path
+
 from metamist.parser.cloudhelper import CloudHelper
 
 
 class AuditHelper(CloudHelper):
     """General helper class for bucket auditing"""
 
-    EXCLUDED_SGS = [
-        'CPG11783',  # acute-care, no FASTQ data
-        'CPG13409',  # perth-neuro, coverage ~0x
-        'CPG243717',  # validation, NA12878_KCCG low coverage https://main-web.populationgenomics.org.au/validation/qc/cram/multiqc.html,
-        'CPG246645',  # ag-hidden, eof issue  https://batch.hail.populationgenomics.org.au/batches/97645/jobs/440
-        'CPG246678',  # ag-hidden, diff fastq size  https://batch.hail.populationgenomics.org.au/batches/97645/jobs/446
-        'CPG261792',  # rdp-kidney misformated fastq - https://batch.hail.populationgenomics.org.au/batches/378736/jobs/43
-        # acute care fasq parsing errors https://batch.hail.populationgenomics.org.au/batches/379303/jobs/24
-        'CPG259150',
-        'CPG258814',
-        'CPG258137',
-        'CPG258111',
-        'CPG258012',
-        # ohmr4 cram parsing in align issues
-        'CPG261339',
-        'CPG261347',
-        # IBMDX truncated sample? https://batch.hail.populationgenomics.org.au/batches/422181/jobs/99
-        'CPG265876',
-    ]
+    # Comma-separated SG IDs from SM_AUDIT_HELPER_EXCLUDED_SGS; blanks dropped so unset -> []
+    EXCLUDED_SGS = [sg for sg in os.getenv('SM_AUDIT_HELPER_EXCLUDED_SGS', '').split(',') if sg]
 
     @staticmethod
     def get_gcs_bucket_subdirs_to_search(paths: list[str]) -> defaultdict[str, list]:
diff --git a/scripts/find_sequence_files_to_delete.py b/scripts/find_sequence_files_to_delete.py
index 4a1ade899..700152183 100644
--- a/scripts/find_sequence_files_to_delete.py
+++ b/scripts/find_sequence_files_to_delete.py
@@ -11,14 +11,15 @@
 
 # pylint: disable=W0703
 
-import os
 import asyncio
 import logging
+import os
 from collections import defaultdict
 from typing import Any
 
 import click
 from google.cloud import storage
+
 from metamist.apis import ProjectApi
 from metamist.graphql import query_async
 
@@ -33,17 +34,7 @@
 projapi = ProjectApi()
 
 # TODO: fetch this from metamist
-CPG_SEQUENCING_GROUP_IDS_TO_SKIP = {
-    'CPG11783',  # acute-care, no FASTQ data
-    'CPG255232',  # perth-neuro: new
-    'CPG255240',  # perth-neuro: new
-    'CPG253328',  # perth-neuro, contamination rate 32%
-    'CPG13409',  # perth-neuro, coverage ~0x
-    'CPG243717',
-    'CPG246645',  # ag-hidden, eof issue  https://batch.hail.populationgenomics.org.au/batches/97645/jobs/440
-    'CPG246678',  # ag-hidden, diff fastq size  https://batch.hail.populationgenomics.org.au/batches/97645/jobs/446
-    'CPG246561',  # ag-hidden, coverage ~0x https://main-web.populationgenomics.org.au/ag-hidden/qc/cram/multiqc.html
-}
+CPG_SEQUENCING_GROUP_IDS_TO_SKIP: set[str] = set()  # keep a set: a bare {} is an empty dict
 
 
 def get_bucket_name_from_path(path_input):
@@ -183,7 +174,9 @@ def find_existing_files_in_bucket(
 
 
 async def find_files_to_delete(
-    sequencing_types_to_remove: list[str], projects_to_ignore: list[str], output_path: str
+    sequencing_types_to_remove: list[str],
+    projects_to_ignore: list[str],
+    output_path: str,
 ):
     """
     Get all the sequences across all the projects from metamist
diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 1e85ffca4..7c6e322e7 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -90,8 +90,8 @@ def sequence_meta_map():
 
 def fastq_file_name_to_sample_id(filename: str) -> str:
     """
-    HG3FMDSX3_2_220208_FD02700641_Homo-sapiens_AACGAGGCCG-ATCCAGGTAT_R_220208_BINKAN1_PROPHECY_M002_R1.
-    -> 220208_FD02700641
+    SOMEID1_2_220101_SAMPLEID_Homo-sapiens_AACGAGGCCG-ATCCAGGTAT_R_220101_FIRLAS1_PROJECT_M002_R1.
+    -> 220101_SAMPLEID
     """
     return '_'.join(filename.split('_')[2:4])
 
diff --git a/scripts/sync_seqr.py b/scripts/sync_seqr.py
index 12be0fd93..380217aa5 100644
--- a/scripts/sync_seqr.py
+++ b/scripts/sync_seqr.py
@@ -14,11 +14,7 @@
 import yaml
 from cloudpathlib import AnyPath
 
-from metamist.apis import (
-    AnalysisApi,
-    ProjectApi,
-    SeqrApi,
-)
+from metamist.apis import AnalysisApi, ProjectApi, SeqrApi
 from metamist.graphql import query_async
 from metamist.model.analysis_query_model import AnalysisQueryModel
 from metamist.model.analysis_status import AnalysisStatus
@@ -59,7 +55,7 @@
     ),
 }
 
-SGS_TO_IGNORE = {'CPG227355', 'CPG227397'}
+SGS_TO_IGNORE: set[str] = set()  # keep a set: a bare {} is an empty dict
 BASE, SEQR_AUDIENCE = ENVS[ENVIRONMENT]
 
 url_individuals_sync = '/api/project/sa/{projectGuid}/individuals/sync'