diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py index 7c6e322e7..34d64f370 100644 --- a/scripts/parse_existing_cohort.py +++ b/scripts/parse_existing_cohort.py @@ -30,6 +30,11 @@ Additionally, the reads-column is not provided for existing-cohort csvs. This information is derived from the fluidX id pulled from the filename. +Additional Options: +--allow-missing-files: +Set this flag to parse manifests with missing data and generate warnings instead of raising errors. +This allows the script to proceed even if some data is missing. + """ import csv @@ -105,12 +110,15 @@ def __init__( search_locations, batch_number, include_participant_column, + allow_missing_files, ): if include_participant_column: participant_column = Columns.PARTICIPANT_COLUMN else: participant_column = Columns.EXTERNAL_ID + self.allow_missing_files = allow_missing_files + super().__init__( project=project, search_locations=search_locations, @@ -134,7 +142,9 @@ def _get_dict_reader(self, file_pointer, delimiter: str): return reader async def get_read_filenames( - self, sample_id: Optional[str], row: SingleRow + self, + sample_id: Optional[str], + row: SingleRow, ) -> List[str]: """ We don't have fastq urls in a manifest, so overriding this method to take @@ -149,7 +159,11 @@ async def get_read_filenames( ] if not read_filenames: - raise ValueError(f'No read files found for {sample_id}') + if not self.allow_missing_files: + raise ValueError(f'No read files found for {sample_id}') + + logger.warning(f'No read files found for {sample_id}') + return read_filenames def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]: @@ -205,6 +219,12 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]): @click.option( '--include-participant-column', 'include_participant_column', is_flag=True ) +@click.option( + '--allow-missing-files', + 'allow_missing_files', + is_flag=True, + help='Set this flag to parse/ingest sequencing groups with 
missing reads', +) @click.argument('manifests', nargs=-1) @run_as_sync async def main( @@ -215,6 +235,7 @@ async def main( confirm=True, dry_run=False, include_participant_column=False, + allow_missing_files=False, ): """Run script from CLI arguments""" @@ -223,6 +244,7 @@ async def main( search_locations=search_locations, batch_number=batch_number, include_participant_column=include_participant_column, + allow_missing_files=allow_missing_files, ) for manifest_path in manifests: diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py index 6f9b67fb9..d8e755bf7 100644 --- a/test/test_parse_existing_cohort.py +++ b/test/test_parse_existing_cohort.py @@ -1,13 +1,12 @@ from datetime import datetime from io import StringIO +from test.testbase import DbIsolatedTest, run_as_sync from unittest.mock import patch -from test.testbase import run_as_sync, DbIsolatedTest - from db.python.layers import ParticipantLayer -from scripts.parse_existing_cohort import ExistingCohortParser -from models.models import ParticipantUpsertInternal, SampleUpsertInternal from metamist.parser.generic_parser import ParsedParticipant +from models.models import ParticipantUpsertInternal, SampleUpsertInternal +from scripts.parse_existing_cohort import Columns, ExistingCohortParser class TestExistingCohortParser(DbIsolatedTest): @@ -45,6 +44,7 @@ async def test_single_row( batch_number='M01', search_locations=[], project=self.project_name, + allow_missing_files=False, ) parser.filename_map = { @@ -115,6 +115,7 @@ async def test_no_header(self): batch_number='M01', search_locations=[], project=self.project_name, + allow_missing_files=False, ) parser.filename_map = { @@ -141,7 +142,7 @@ async def test_no_header(self): # Tests case where the fastq's in the storage do not match the ingested samples. 
# """ # mock_graphql_query.side_effect = self.run_graphql_query_async - # + # rows = [ # 'HEADER', # '""', @@ -153,15 +154,16 @@ async def test_no_header(self): # batch_number='M01', # search_locations=[], # project=self.project_name, + # allow_missing_files=False, # ) - # + # parser.filename_map = { # 'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq', # 'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq', # } - # + # file_contents = '\n'.join(rows) - # + # with self.assertRaises(ValueError): # await parser.parse_manifest( # StringIO(file_contents), delimiter='\t', dry_run=True @@ -214,6 +216,7 @@ async def test_existing_row( batch_number='M01', search_locations=[], project=self.project_name, + allow_missing_files=False, ) parser.filename_map = { @@ -232,3 +235,48 @@ async def test_existing_row( self.assertEqual(0, summary['assays']['update']) return + + @run_as_sync + async def test_get_read_filenames_no_reads_fail(self): + """Test ValueError is raised when allow_missing_files is False and sequencing groups have no reads""" + + single_row = {Columns.MANIFEST_FLUID_X: ''} + + parser = ExistingCohortParser( + include_participant_column=False, + batch_number='M01', + search_locations=[], + project=self.project_name, + allow_missing_files=False, + ) + parser.filename_map = {} + + with self.assertRaises(ValueError): + # this will raise a ValueError because allow_missing_files=False, + # and there are no matching reads in the filename map + await parser.get_read_filenames(sample_id='', row=single_row) + + @run_as_sync + async def test_get_read_filenames_no_reads_pass(self): + """Test that no ValueError is raised when allow_missing_files is True and records have missing fastqs""" + + single_row = 
{Columns.MANIFEST_FLUID_X: ''} + + parser = ExistingCohortParser( + include_participant_column=False, + batch_number='M01', + search_locations=[], + project=self.project_name, + allow_missing_files=True, + ) + parser.filename_map = {} + + with self.assertLogs(level='INFO') as cm: + read_filenames = await parser.get_read_filenames( + sample_id='', row=single_row + ) + + self.assertEqual(len(cm.output), 1) + self.assertIn('No read files found for ', cm.output[0]) + + self.assertEqual(len(read_filenames), 0)