From dd0e6f786c6c5ae01ba851ffb247a9d89fb05382 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 07:32:19 +1000
Subject: [PATCH 01/14] Test that fails if warning is set to true and
 ValueError is raised

---
 test/test_parse_existing_cohort.py | 109 ++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 33 deletions(-)

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 6f9b67fb9..c457c44fc 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -134,39 +134,40 @@ async def test_no_header(self):
     #   to exclude absolute paths (as absolute paths are NOT in the file map).
     #   I don't know what needs to change to fix this test, except maybe
     #   that the EC parser shouldn't return absolute paths
-    # @run_as_sync
-    # @patch('metamist.parser.generic_parser.query_async')
-    # async def test_missing_fastqs(self, mock_graphql_query):
-    #     """
-    #     Tests case where the fastq's in the storage do not match the ingested samples.
-    #     """
-    #     mock_graphql_query.side_effect = self.run_graphql_query_async
-    #
-    #     rows = [
-    #         'HEADER',
-    #         '""',
-    #         'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\tParticipant ID\t',
-    #         'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\tPID123',
-    #     ]
-    #     parser = ExistingCohortParser(
-    #         include_participant_column=False,
-    #         batch_number='M01',
-    #         search_locations=[],
-    #         project=self.project_name,
-    #     )
-    #
-    #     parser.filename_map = {
-    #         'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
-    #         'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
-    #     }
-    #
-    #     file_contents = '\n'.join(rows)
-    #
-    #     with self.assertRaises(ValueError):
-    #         await parser.parse_manifest(
-    #             StringIO(file_contents), delimiter='\t', dry_run=True
-    #         )
-    #     return
+    @run_as_sync
+    @patch('metamist.parser.generic_parser.query_async')
+    async def test_missing_fastqs(self, mock_graphql_query):
+        """
+        Tests case where the fastq's in the storage do not match the ingested samples.
+        """
+        mock_graphql_query.side_effect = self.run_graphql_query_async
+
+        rows = [
+            'HEADER',
+            '""',
+            'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\tParticipant ID\t',
+            'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\tPID123',
+        ]
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            warning_flag=False,
+        )
+
+        parser.filename_map = {
+            'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
+            'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
+        }
+
+        file_contents = '\n'.join(rows)
+
+        with self.assertRaises(ValueError):
+            await parser.parse_manifest(
+                StringIO(file_contents), delimiter='\t', dry_run=True
+            )
+        return
 
     @run_as_sync
     @patch('metamist.parser.generic_parser.query_async')
@@ -232,3 +233,45 @@ async def test_existing_row(
         self.assertEqual(0, summary['assays']['update'])
 
         return
+
+    @run_as_sync
+    @patch('metamist.parser.generic_parser.query_async')
+    @patch(
+        'metamist.parser.generic_metadata_parser.GenericMetadataParser.get_read_filenames',
+        return_value=[],
+    )
+    async def test_parse_cohort_with_warning(
+        self, mock_graphql_query, mock_get_read_filenames
+    ):
+        """Test when warning_flag is True and records with missing fastqs, no ValueError is raised"""
+
+        mock_graphql_query.side_effect = self.run_graphql_query_async
+        # mock_get_read_filenames.return_value = []
+
+        rows = [
+            'HEADER',
+            '""',
+            'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\t',
+            'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\t',
+        ]
+
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            warning_flag=True,
+        )
+
+        file_contents = '\n'.join(rows)
+
+        try:
+            await parser.parse_manifest(
+                StringIO(file_contents), delimiter='\t', dry_run=True
+            )
+        except ValueError:
+            self.fail("ValueError was raised")
+
+        mock_get_read_filenames.assert_called()
+
+        return

From 83f21744f12a5d2c5bda920ea6347a58a6841449 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 07:33:28 +1000
Subject: [PATCH 02/14] Added warning flag to ignore missing data when parsing

---
 .gitignore                       |  4 ++++
 scripts/parse_existing_cohort.py | 13 +++++++++++--
 web/package-lock.json            |  4 ++--
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index d5fa95d5a..78ba17012 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,7 @@ web/src/__generated__
 
 # pulumi config files
 Pulumi*.yaml
+
+venv/
+scraps.ipynb
+db/mariadb-java-client-3.0.3.jar.1
\ No newline at end of file
diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 7c6e322e7..64a580caf 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -105,12 +105,15 @@ def __init__(
         search_locations,
         batch_number,
         include_participant_column,
+        warning_flag,
     ):
         if include_participant_column:
             participant_column = Columns.PARTICIPANT_COLUMN
         else:
             participant_column = Columns.EXTERNAL_ID
 
+        self.warning_flag = warning_flag
+
         super().__init__(
             project=project,
             search_locations=search_locations,
@@ -134,13 +137,17 @@ def _get_dict_reader(self, file_pointer, delimiter: str):
         return reader
 
     async def get_read_filenames(
-        self, sample_id: Optional[str], row: SingleRow
+        self,
+        sample_id: Optional[str],
+        row: SingleRow,
     ) -> List[str]:
         """
         We don't have fastq urls in a manifest, so overriding this method to take
         urls from a bucket listing.
         """
 
+        warning_flag = self.warning_flag
+
         read_filenames = [
             filename
             for filename, path in self.filename_map.items()
@@ -148,8 +155,10 @@ async def get_read_filenames(
             and any(filename.endswith(ext) for ext in READS_EXTENSIONS)
         ]
 
-        if not read_filenames:
+        if not read_filenames and not warning_flag:
             raise ValueError(f'No read files found for {sample_id}')
+        else:
+            logger.warning(f'No read files found for {sample_id}')
         return read_filenames
 
     def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]:
diff --git a/web/package-lock.json b/web/package-lock.json
index 5d81f4805..3ee6c289b 100644
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -1,12 +1,12 @@
 {
     "name": "metamist",
-    "version": "6.3.0",
+    "version": "6.5.0",
     "lockfileVersion": 3,
     "requires": true,
     "packages": {
         "": {
             "name": "metamist",
-            "version": "6.3.0",
+            "version": "6.5.0",
             "dependencies": {
                 "@apollo/client": "^3.7.3",
                 "@emotion/react": "^11.10.4",

From cbd08af315bfe35b3633ac6d76381f7898fce102 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 07:45:11 +1000
Subject: [PATCH 03/14] Commented out test_missing_fastqs again for consistency
 with dev branch

---
 test/test_parse_existing_cohort.py | 68 +++++++++++++++---------------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index c457c44fc..1b32b0d64 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -134,40 +134,40 @@ async def test_no_header(self):
     #   to exclude absolute paths (as absolute paths are NOT in the file map).
     #   I don't know what needs to change to fix this test, except maybe
     #   that the EC parser shouldn't return absolute paths
-    @run_as_sync
-    @patch('metamist.parser.generic_parser.query_async')
-    async def test_missing_fastqs(self, mock_graphql_query):
-        """
-        Tests case where the fastq's in the storage do not match the ingested samples.
-        """
-        mock_graphql_query.side_effect = self.run_graphql_query_async
-
-        rows = [
-            'HEADER',
-            '""',
-            'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\tParticipant ID\t',
-            'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\tPID123',
-        ]
-        parser = ExistingCohortParser(
-            include_participant_column=False,
-            batch_number='M01',
-            search_locations=[],
-            project=self.project_name,
-            warning_flag=False,
-        )
-
-        parser.filename_map = {
-            'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
-            'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
-        }
-
-        file_contents = '\n'.join(rows)
-
-        with self.assertRaises(ValueError):
-            await parser.parse_manifest(
-                StringIO(file_contents), delimiter='\t', dry_run=True
-            )
-        return
+    # @run_as_sync
+    # @patch('metamist.parser.generic_parser.query_async')
+    # async def test_missing_fastqs(self, mock_graphql_query):
+    #     """
+    #     Tests case where the fastq's in the storage do not match the ingested samples.
+    #     """
+    #     mock_graphql_query.side_effect = self.run_graphql_query_async
+
+    #     rows = [
+    #         'HEADER',
+    #         '""',
+    #         'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\tParticipant ID\t',
+    #         'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\tPID123',
+    #     ]
+    #     parser = ExistingCohortParser(
+    #         include_participant_column=False,
+    #         batch_number='M01',
+    #         search_locations=[],
+    #         project=self.project_name,
+    #         warning_flag=False,
+    #     )
+
+    #     parser.filename_map = {
+    #         'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
+    #         'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
+    #     }
+
+    #     file_contents = '\n'.join(rows)
+
+    #     with self.assertRaises(ValueError):
+    #         await parser.parse_manifest(
+    #             StringIO(file_contents), delimiter='\t', dry_run=True
+    #         )
+    #     return
 
     @run_as_sync
     @patch('metamist.parser.generic_parser.query_async')

From 9207b6735778dbee4a1ea7d8cca03a80a4028b96 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 08:12:18 +1000
Subject: [PATCH 04/14] .gitignore updated

---
 .gitignore | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 78ba17012..eb675baf5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 db/postgres*.jar
 .vscode/
 env/
+venv/
 __pycache__/
 *.pyc
 .DS_Store
@@ -60,7 +61,3 @@ web/src/__generated__
 
 # pulumi config files
 Pulumi*.yaml
-
-venv/
-scraps.ipynb
-db/mariadb-java-client-3.0.3.jar.1
\ No newline at end of file

From b4a86a0cf493acaf1022248576b4f2e4def5069b Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 08:44:36 +1000
Subject: [PATCH 05/14] added warning_flag to cmd flags and updated doc string

---
 scripts/parse_existing_cohort.py   | 13 +++++++++++++
 test/test_parse_existing_cohort.py |  5 ++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 64a580caf..a2473e428 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -30,6 +30,11 @@
 Additionally, the reads-column is not provided for existing-cohort csvs.
 This information is derived from the fluidX id pulled from the filename.
 
+Additional Options:
+--warning-flag:
+Set this flag to parse manifests with missing data and generate warnings instead of raising errors.
+This allows the script to proceed even if some data is missing.
+
 """
 
 import csv
@@ -214,6 +219,12 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
 @click.option(
     '--include-participant-column', 'include_participant_column', is_flag=True
 )
+@click.option(
+    '--warning-flag',
+    'warning_flag',
+    is_flag=True,
+    help='Set this flag to parse manifests with missing data',
+)
 @click.argument('manifests', nargs=-1)
 @run_as_sync
 async def main(
@@ -224,6 +235,7 @@ async def main(
     confirm=True,
     dry_run=False,
     include_participant_column=False,
+    warning_flag=False,
 ):
     """Run script from CLI arguments"""
 
@@ -232,6 +244,7 @@ async def main(
         search_locations=search_locations,
         batch_number=batch_number,
         include_participant_column=include_participant_column,
+        warning_flag=warning_flag,
     )
 
     for manifest_path in manifests:
diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 1b32b0d64..6edf6f331 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -45,6 +45,7 @@ async def test_single_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            warning_flag=False,
         )
 
         parser.filename_map = {
@@ -115,6 +116,7 @@ async def test_no_header(self):
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            warning_flag=False,
         )
 
         parser.filename_map = {
@@ -215,6 +217,7 @@ async def test_existing_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            warning_flag=False,
         )
 
         parser.filename_map = {
@@ -270,7 +273,7 @@ async def test_parse_cohort_with_warning(
                 StringIO(file_contents), delimiter='\t', dry_run=True
             )
         except ValueError:
-            self.fail("ValueError was raised")
+            self.fail('ValueError was raised')
 
         mock_get_read_filenames.assert_called()
 

From 89c005850064dfaf929714e4c6d3707f47f59426 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 08:52:34 +1000
Subject: [PATCH 06/14] fixed linting issue

---
 scripts/parse_existing_cohort.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index a2473e428..46b97f637 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -162,8 +162,9 @@ async def get_read_filenames(
 
         if not read_filenames and not warning_flag:
             raise ValueError(f'No read files found for {sample_id}')
-        else:
-            logger.warning(f'No read files found for {sample_id}')
+
+        logger.warning(f'No read files found for {sample_id}')
+
         return read_filenames
 
     def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]:

From ebb53793ab6eaba31557b65e656a8942c5b8d925 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 09:42:35 +1000
Subject: [PATCH 07/14] removed a commented out bit of code

---
 test/test_parse_existing_cohort.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 6edf6f331..1633ef888 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -249,7 +249,6 @@ async def test_parse_cohort_with_warning(
         """Test when warning_flag is True and records with missing fastqs, no ValueError is raised"""
 
         mock_graphql_query.side_effect = self.run_graphql_query_async
-        # mock_get_read_filenames.return_value = []
 
         rows = [
             'HEADER',

From 01c61392febd32d536eec88d49293656c92d526c Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 13:52:29 +1000
Subject: [PATCH 08/14] fixed testing error and renamed warning_flag to
 allow_missing_files to be more obvious

---
 scripts/parse_existing_cohort.py   | 23 +++++-----
 test/test_parse_existing_cohort.py | 67 ++++++++++++++----------------
 2 files changed, 43 insertions(+), 47 deletions(-)

diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 46b97f637..4f96570d6 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -31,7 +31,7 @@
 This information is derived from the fluidX id pulled from the filename.
 
 Additional Options:
---warning-flag:
+--allow-missing-files:
 Set this flag to parse manifests with missing data and generate warnings instead of raising errors.
 This allows the script to proceed even if some data is missing.
 
@@ -110,14 +110,14 @@ def __init__(
         search_locations,
         batch_number,
         include_participant_column,
-        warning_flag,
+        allow_missing_files,
     ):
         if include_participant_column:
             participant_column = Columns.PARTICIPANT_COLUMN
         else:
             participant_column = Columns.EXTERNAL_ID
 
-        self.warning_flag = warning_flag
+        self.allow_missing_files = allow_missing_files
 
         super().__init__(
             project=project,
@@ -151,8 +151,6 @@ async def get_read_filenames(
         urls from a bucket listing.
         """
 
-        warning_flag = self.warning_flag
-
         read_filenames = [
             filename
             for filename, path in self.filename_map.items()
@@ -160,10 +158,11 @@ async def get_read_filenames(
             and any(filename.endswith(ext) for ext in READS_EXTENSIONS)
         ]
 
-        if not read_filenames and not warning_flag:
-            raise ValueError(f'No read files found for {sample_id}')
+        if not read_filenames:
+            if not self.allow_missing_files:
+                raise ValueError(f'No read files found for {sample_id}')
 
-        logger.warning(f'No read files found for {sample_id}')
+            logger.warning(f'No read files found for {sample_id}')
 
         return read_filenames
 
@@ -221,8 +220,8 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
     '--include-participant-column', 'include_participant_column', is_flag=True
 )
 @click.option(
-    '--warning-flag',
-    'warning_flag',
+    '--allow-missing-files',
+    'allow_missing_files',
     is_flag=True,
     help='Set this flag to parse manifests with missing data',
 )
@@ -236,7 +235,7 @@ async def main(
     confirm=True,
     dry_run=False,
     include_participant_column=False,
-    warning_flag=False,
+    allow_missing_files=False,
 ):
     """Run script from CLI arguments"""
 
@@ -245,7 +244,7 @@ async def main(
         search_locations=search_locations,
         batch_number=batch_number,
         include_participant_column=include_participant_column,
-        warning_flag=warning_flag,
+        allow_missing_files=allow_missing_files,
     )
 
     for manifest_path in manifests:
diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 1633ef888..67839bc5e 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -1,13 +1,12 @@
 from datetime import datetime
 from io import StringIO
+from test.testbase import DbIsolatedTest, run_as_sync
 from unittest.mock import patch
 
-from test.testbase import run_as_sync, DbIsolatedTest
-
 from db.python.layers import ParticipantLayer
-from scripts.parse_existing_cohort import ExistingCohortParser
-from models.models import ParticipantUpsertInternal, SampleUpsertInternal
 from metamist.parser.generic_parser import ParsedParticipant
+from models.models import ParticipantUpsertInternal, SampleUpsertInternal
+from scripts.parse_existing_cohort import Columns, ExistingCohortParser
 
 
 class TestExistingCohortParser(DbIsolatedTest):
@@ -45,7 +44,7 @@ async def test_single_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
-            warning_flag=False,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -116,7 +115,7 @@ async def test_no_header(self):
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
-            warning_flag=False,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -155,7 +154,7 @@ async def test_no_header(self):
     #         batch_number='M01',
     #         search_locations=[],
     #         project=self.project_name,
-    #         warning_flag=False,
+    #         allow_missing_files=False,
     #     )
 
     #     parser.filename_map = {
@@ -217,7 +216,7 @@ async def test_existing_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
-            warning_flag=False,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -238,42 +237,40 @@ async def test_existing_row(
         return
 
     @run_as_sync
-    @patch('metamist.parser.generic_parser.query_async')
-    @patch(
-        'metamist.parser.generic_metadata_parser.GenericMetadataParser.get_read_filenames',
-        return_value=[],
-    )
-    async def test_parse_cohort_with_warning(
-        self, mock_graphql_query, mock_get_read_filenames
-    ):
-        """Test when warning_flag is True and records with missing fastqs, no ValueError is raised"""
-
-        mock_graphql_query.side_effect = self.run_graphql_query_async
+    async def test_get_read_filenames_no_reads_fail(self):
+        """Test when allow_missing_files is False and records with missing fastqs, ValueError is raised"""
 
-        rows = [
-            'HEADER',
-            '""',
-            'Application\tExternal ID\tSample Concentration (ng/ul)\tVolume (uL)\tSex\tSample/Name\tReference Genome\t',
-            'App\tEXTID1234\t100\t100\tFemale\t220405_FLUIDX1234\thg38\t',
-        ]
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
 
         parser = ExistingCohortParser(
             include_participant_column=False,
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
-            warning_flag=True,
+            allow_missing_files=False,
         )
+        parser.filename_map = {}
 
-        file_contents = '\n'.join(rows)
+        with self.assertRaises(ValueError):
+            # this will raise a ValueError because the allow_missing_files=False,
+            # and there are no matching reads in the filename map
+            await parser.get_read_filenames(sample_id='', row=single_row)
 
-        try:
-            await parser.parse_manifest(
-                StringIO(file_contents), delimiter='\t', dry_run=True
-            )
-        except ValueError:
-            self.fail('ValueError was raised')
+    @run_as_sync
+    async def test_get_read_filenames_no_reads_pass(self):
+        """Test when allow_missing_files is True and records with missing fastqs, no ValueError is raised"""
 
-        mock_get_read_filenames.assert_called()
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
 
-        return
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=True,
+        )
+        parser.filename_map = {}
+
+        read_filenames = await parser.get_read_filenames(sample_id='', row=single_row)
+
+        self.assertEqual(len(read_filenames), 0)

From 778585d6a21a2b92af16742ea9a9d3bd574e9b2d Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 15:19:41 +1000
Subject: [PATCH 09/14] added check for warning log in
 test_get_read_filenames_no_reads_pass as well as length of read_filenames.
 Also removed the venv/ entry from .gitignore - probably best for a separate
 PR

---
 .gitignore                         |  1 -
 test/test_parse_existing_cohort.py | 10 +++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index eb675baf5..d5fa95d5a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,6 @@
 db/postgres*.jar
 .vscode/
 env/
-venv/
 __pycache__/
 *.pyc
 .DS_Store
diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 67839bc5e..c3617a87b 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -2,6 +2,8 @@
 from io import StringIO
 from test.testbase import DbIsolatedTest, run_as_sync
 from unittest.mock import patch
+import sys
+import logging
 
 from db.python.layers import ParticipantLayer
 from metamist.parser.generic_parser import ParsedParticipant
@@ -271,6 +273,12 @@ async def test_get_read_filenames_no_reads_pass(self):
         )
         parser.filename_map = {}
 
-        read_filenames = await parser.get_read_filenames(sample_id='', row=single_row)
+        with self.assertLogs(level='INFO') as cm:
+            read_filenames = await parser.get_read_filenames(
+                sample_id='', row=single_row
+            )
+
+        self.assertEqual(len(cm.output), 1)
+        self.assertIn('No read files found for ', cm.output[0])
 
         self.assertEqual(len(read_filenames), 0)

From d85ba2dd15d12943ca33bfada7adb685079b806c Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 15:21:30 +1000
Subject: [PATCH 10/14] removed unused imports

---
 test/test_parse_existing_cohort.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index c3617a87b..6521cb17d 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -2,8 +2,6 @@
 from io import StringIO
 from test.testbase import DbIsolatedTest, run_as_sync
 from unittest.mock import patch
-import sys
-import logging
 
 from db.python.layers import ParticipantLayer
 from metamist.parser.generic_parser import ParsedParticipant

From 44779bca049a7badf93381e1acc42b589b8a5dc6 Mon Sep 17 00:00:00 2001
From: michael-harper <109899932+michael-harper@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:23:33 +1000
Subject: [PATCH 11/14] Update scripts/parse_existing_cohort.py

Co-authored-by: Vivian Bakiris <79084890+vivbak@users.noreply.github.com>
---
 scripts/parse_existing_cohort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py
index 4f96570d6..34d64f370 100644
--- a/scripts/parse_existing_cohort.py
+++ b/scripts/parse_existing_cohort.py
@@ -223,7 +223,7 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
     '--allow-missing-files',
     'allow_missing_files',
     is_flag=True,
-    help='Set this flag to parse manifests with missing data',
+    help='Set this flag to parse/ingest sequencing groups with missing reads',
 )
 @click.argument('manifests', nargs=-1)
 @run_as_sync

From 9f0909cd109f894282035ee29ad9d769faa2ee06 Mon Sep 17 00:00:00 2001
From: michael-harper <109899932+michael-harper@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:24:00 +1000
Subject: [PATCH 12/14] Update test/test_parse_existing_cohort.py

Co-authored-by: Vivian Bakiris <79084890+vivbak@users.noreply.github.com>
---
 test/test_parse_existing_cohort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py
index 6521cb17d..d8e755bf7 100644
--- a/test/test_parse_existing_cohort.py
+++ b/test/test_parse_existing_cohort.py
@@ -238,7 +238,7 @@ async def test_existing_row(
 
     @run_as_sync
     async def test_get_read_filenames_no_reads_fail(self):
-        """Test when allow_missing_files is False and records with missing fastqs, ValueError is raised"""
+        """Test ValueError is raised when allow_missing_files is False and sequencing groups have no reads"""
 
         single_row = {Columns.MANIFEST_FLUID_X: ''}
 

From 2aec9dca5ac970813923e1e96df192f2c925ef07 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 15:35:28 +1000
Subject: [PATCH 13/14] de-bumping metamist version

---
 web/package-lock.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/web/package-lock.json b/web/package-lock.json
index 3ee6c289b..0989b7ee7 100644
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -1,12 +1,12 @@
 {
     "name": "metamist",
-    "version": "6.5.0",
+    "version": "6.3.0",
     "lockfileVersion": 3,
     "requires": true,
     "packages": {
         "": {
             "name": "metamist",
-            "version": "6.5.0",
+            "version": "6.3.0",
             "dependencies": {
                 "@apollo/client": "^3.7.3",
                 "@emotion/react": "^11.10.4",
@@ -12140,4 +12140,4 @@
             }
         }
     }
-}
+}
\ No newline at end of file

From 74b9031ec4bb074ba26ee08b627eee1914cb49a9 Mon Sep 17 00:00:00 2001
From: Michael Harper <michael.harper@populationgenomics.org.au>
Date: Mon, 20 Nov 2023 15:50:17 +1000
Subject: [PATCH 14/14] fixed linting issue

---
 web/package-lock.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/package-lock.json b/web/package-lock.json
index 0989b7ee7..5d81f4805 100644
--- a/web/package-lock.json
+++ b/web/package-lock.json
@@ -12140,4 +12140,4 @@
             }
         }
     }
-}
\ No newline at end of file
+}