Merge pull request #618 from populationgenomics/dev

Release
populationgenomics · Nov 21, 2023 · a684651 · a684651
2 parents f8acb63 + 4422f3e
commit a684651
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 10 deletions.
diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
@@ -30,6 +30,11 @@
 Additionally, the reads-column is not provided for existing-cohort csvs.
 This information is derived from the fluidX id pulled from the filename.
 
+Additional Options:
+--allow-missing-files:
+Set this flag to parse manifests with missing data and generate warnings instead of raising errors.
+This allows the script to proceed even if some data is missing.
+
 """
 
 import csv
@@ -105,12 +110,15 @@ def __init__(
         search_locations,
         batch_number,
         include_participant_column,
+        allow_missing_files,
     ):
         if include_participant_column:
             participant_column = Columns.PARTICIPANT_COLUMN
         else:
             participant_column = Columns.EXTERNAL_ID
 
+        self.allow_missing_files = allow_missing_files
+
         super().__init__(
             project=project,
             search_locations=search_locations,
@@ -134,7 +142,9 @@ def _get_dict_reader(self, file_pointer, delimiter: str):
         return reader
 
     async def get_read_filenames(
-        self, sample_id: Optional[str], row: SingleRow
+        self,
+        sample_id: Optional[str],
+        row: SingleRow,
     ) -> List[str]:
         """
         We don't have fastq urls in a manifest, so overriding this method to take
@@ -149,7 +159,11 @@ async def get_read_filenames(
         ]
 
         if not read_filenames:
-            raise ValueError(f'No read files found for {sample_id}')
+            if not self.allow_missing_files:
+                raise ValueError(f'No read files found for {sample_id}')
+
+            logger.warning(f'No read files found for {sample_id}')
+
         return read_filenames
 
     def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]:
@@ -205,6 +219,12 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
 @click.option(
     '--include-participant-column', 'include_participant_column', is_flag=True
 )
+@click.option(
+    '--allow-missing-files',
+    'allow_missing_files',
+    is_flag=True,
+    help='Set this flag to parse/ingest sequencing groups with missing reads',
+)
 @click.argument('manifests', nargs=-1)
 @run_as_sync
 async def main(
@@ -215,6 +235,7 @@ async def main(
     confirm=True,
     dry_run=False,
     include_participant_column=False,
+    allow_missing_files=False,
 ):
     """Run script from CLI arguments"""
 
@@ -223,6 +244,7 @@ async def main(
         search_locations=search_locations,
         batch_number=batch_number,
         include_participant_column=include_participant_column,
+        allow_missing_files=allow_missing_files,
     )
 
     for manifest_path in manifests:

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
@@ -1,13 +1,12 @@
 from datetime import datetime
 from io import StringIO
+from test.testbase import DbIsolatedTest, run_as_sync
 from unittest.mock import patch
 
-from test.testbase import run_as_sync, DbIsolatedTest
-
 from db.python.layers import ParticipantLayer
-from scripts.parse_existing_cohort import ExistingCohortParser
-from models.models import ParticipantUpsertInternal, SampleUpsertInternal
 from metamist.parser.generic_parser import ParsedParticipant
+from models.models import ParticipantUpsertInternal, SampleUpsertInternal
+from scripts.parse_existing_cohort import Columns, ExistingCohortParser
 
 
 class TestExistingCohortParser(DbIsolatedTest):
@@ -45,6 +44,7 @@ async def test_single_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -115,6 +115,7 @@ async def test_no_header(self):
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -141,7 +142,7 @@ async def test_no_header(self):
     #     Tests case where the fastq's in the storage do not match the ingested samples.
     #     """
     #     mock_graphql_query.side_effect = self.run_graphql_query_async
-    #
+
     #     rows = [
     #         'HEADER',
     #         '""',
@@ -153,15 +154,16 @@ async def test_no_header(self):
     #         batch_number='M01',
     #         search_locations=[],
     #         project=self.project_name,
+    #         allow_missing_files=False,
     #     )
-    #
+
     #     parser.filename_map = {
     #         'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
     #         'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
     #     }
-    #
+
     #     file_contents = '\n'.join(rows)
-    #
+
     #     with self.assertRaises(ValueError):
     #         await parser.parse_manifest(
     #             StringIO(file_contents), delimiter='\t', dry_run=True
@@ -214,6 +216,7 @@ async def test_existing_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -232,3 +235,48 @@ async def test_existing_row(
         self.assertEqual(0, summary['assays']['update'])
 
         return
+
+    @run_as_sync
+    async def test_get_read_filenames_no_reads_fail(self):
+        """Test ValueError is raised when allow_missing_files is False and sequencing groups have no reads"""
+
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
+
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=False,
+        )
+        parser.filename_map = {}
+
+        with self.assertRaises(ValueError):
+            # this will raise a ValueError because the allow_missing_files=False,
+            # and there are no matching reads in the filename map
+            await parser.get_read_filenames(sample_id='', row=single_row)
+
+    @run_as_sync
+    async def test_get_read_filenames_no_reads_pass(self):
+        """Test when allow_missing_files is True and records with missing fastqs, no ValueError is raised"""
+
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
+
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=True,
+        )
+        parser.filename_map = {}
+
+        with self.assertLogs(level='INFO') as cm:
+            read_filenames = await parser.get_read_filenames(
+                sample_id='', row=single_row
+            )
+
+        self.assertEqual(len(cm.output), 1)
+        self.assertIn('No read files found for ', cm.output[0])
+
+        self.assertEqual(len(read_filenames), 0)