Merge pull request #616 from populationgenomics/update-existing-cohort-parser-with-skip-flag

Update existing cohort parser with warning flag
vivbak authored Nov 20, 2023
2 parents bdb637d + 74b9031 commit 4422f3e
Showing 2 changed files with 80 additions and 10 deletions.
26 changes: 24 additions & 2 deletions scripts/parse_existing_cohort.py
@@ -30,6 +30,11 @@
 Additionally, the reads-column is not provided for existing-cohort csvs.
 This information is derived from the fluidX id pulled from the filename.
+
+Additional Options:
+    --allow-missing-files:
+        Set this flag to parse manifests with missing data and generate warnings instead of raising errors.
+        This allows the script to proceed even if some data is missing.
 """

 import csv
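For orientation, a run that downgrades missing reads to warnings would look something like the line below. This is a sketch only: the manifest name and the bracketed options are placeholders, not values taken from this PR.

    python scripts/parse_existing_cohort.py --allow-missing-files [other options] manifest.csv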
@@ -105,12 +110,15 @@ def __init__(
         search_locations,
         batch_number,
         include_participant_column,
+        allow_missing_files,
     ):
         if include_participant_column:
             participant_column = Columns.PARTICIPANT_COLUMN
         else:
             participant_column = Columns.EXTERNAL_ID
 
+        self.allow_missing_files = allow_missing_files
+
         super().__init__(
             project=project,
             search_locations=search_locations,
@@ -134,7 +142,9 @@ def _get_dict_reader(self, file_pointer, delimiter: str):
         return reader
 
     async def get_read_filenames(
-        self, sample_id: Optional[str], row: SingleRow
+        self,
+        sample_id: Optional[str],
+        row: SingleRow,
     ) -> List[str]:
         """
         We don't have fastq urls in a manifest, so overriding this method to take
@@ -149,7 +159,11 @@ async def get_read_filenames(
         ]
 
         if not read_filenames:
-            raise ValueError(f'No read files found for {sample_id}')
+            if not self.allow_missing_files:
+                raise ValueError(f'No read files found for {sample_id}')
+
+            logger.warning(f'No read files found for {sample_id}')
+
         return read_filenames
 
     def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]:
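The behavioural change above is small but easy to get wrong, so here is a minimal, self-contained sketch of the warn-or-raise pattern. The helper name find_reads, its signature, and the filename_map shape are illustrative assumptions, not code from this PR:

    import logging

    logger = logging.getLogger(__name__)

    def find_reads(filename_map: dict, fluidx_id: str, allow_missing_files: bool) -> list:
        # Assumption: filename_map maps read basenames to full paths,
        # loosely mirroring the parser's filename_map used above
        matches = [path for name, path in filename_map.items() if fluidx_id and fluidx_id in name]
        if not matches:
            if not allow_missing_files:
                raise ValueError(f'No read files found for {fluidx_id}')
            logger.warning(f'No read files found for {fluidx_id}')
        return matches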
@@ -205,6 +219,12 @@ def get_existing_external_sequence_ids(self, participant_map: dict[str, dict]):
 @click.option(
     '--include-participant-column', 'include_participant_column', is_flag=True
 )
+@click.option(
+    '--allow-missing-files',
+    'allow_missing_files',
+    is_flag=True,
+    help='Set this flag to parse/ingest sequencing groups with missing reads',
+)
 @click.argument('manifests', nargs=-1)
 @run_as_sync
 async def main(
@@ -215,6 +235,7 @@ async def main(
     confirm=True,
     dry_run=False,
     include_participant_column=False,
+    allow_missing_files=False,
 ):
     """Run script from CLI arguments"""

@@ -223,6 +244,7 @@
         search_locations=search_locations,
         batch_number=batch_number,
         include_participant_column=include_participant_column,
+        allow_missing_files=allow_missing_files,
     )
 
     for manifest_path in manifests:
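Since the whole change hinges on a boolean click flag, a tiny self-contained demo of the is_flag pattern may help. This is a generic sketch of the mechanism, not the script's actual CLI:

    import click

    @click.command()
    @click.option('--allow-missing-files', 'allow_missing_files', is_flag=True)
    def demo(allow_missing_files):
        # is_flag options default to False and become True only when
        # the flag is present on the command line
        click.echo(f'allow_missing_files={allow_missing_files}')

    if __name__ == '__main__':
        demo()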
64 changes: 56 additions & 8 deletions test/test_parse_existing_cohort.py
@@ -1,13 +1,12 @@
 from datetime import datetime
 from io import StringIO
+from test.testbase import DbIsolatedTest, run_as_sync
 from unittest.mock import patch
 
-from test.testbase import run_as_sync, DbIsolatedTest
-
 from db.python.layers import ParticipantLayer
-from scripts.parse_existing_cohort import ExistingCohortParser
-from models.models import ParticipantUpsertInternal, SampleUpsertInternal
 from metamist.parser.generic_parser import ParsedParticipant
+from models.models import ParticipantUpsertInternal, SampleUpsertInternal
+from scripts.parse_existing_cohort import Columns, ExistingCohortParser
 
 
 class TestExistingCohortParser(DbIsolatedTest):
@@ -45,6 +44,7 @@ async def test_single_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -115,6 +115,7 @@ async def test_no_header(self):
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -141,7 +142,7 @@ async def test_no_header(self):
     # Tests the case where the fastqs in storage do not match the ingested samples.
     # """
     # mock_graphql_query.side_effect = self.run_graphql_query_async
-    #
+
     # rows = [
     #     'HEADER',
     #     '""',
@@ -153,15 +154,16 @@ async def test_no_header(self):
     #     batch_number='M01',
     #     search_locations=[],
     #     project=self.project_name,
+    #     allow_missing_files=False,
     # )
-    #
+
     # parser.filename_map = {
     #     'HG3F_2_220405_FLUIDXMISTMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R1.fastq',
     #     'HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq': '/path/to/HG3F_2_220405_FLUIDXMISMATCH1234_Homo-sapiens_AAC-TAT_R_220208_VB_BLAH_M002_R2.fastq',
     # }
-    #
+
     # file_contents = '\n'.join(rows)
-    #
+
     # with self.assertRaises(ValueError):
     #     await parser.parse_manifest(
     #         StringIO(file_contents), delimiter='\t', dry_run=True
@@ -214,6 +216,7 @@ async def test_existing_row(
             batch_number='M01',
             search_locations=[],
             project=self.project_name,
+            allow_missing_files=False,
         )
 
         parser.filename_map = {
@@ -232,3 +235,48 @@
         self.assertEqual(0, summary['assays']['update'])
 
         return
+
+    @run_as_sync
+    async def test_get_read_filenames_no_reads_fail(self):
+        """Test that a ValueError is raised when allow_missing_files is False and a sequencing group has no reads"""
+
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
+
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=False,
+        )
+        parser.filename_map = {}
+
+        with self.assertRaises(ValueError):
+            # this raises a ValueError because allow_missing_files=False
+            # and there are no matching reads in the filename map
+            await parser.get_read_filenames(sample_id='', row=single_row)
+
+    @run_as_sync
+    async def test_get_read_filenames_no_reads_pass(self):
+        """Test that no ValueError is raised when allow_missing_files is True and records are missing fastqs"""
+
+        single_row = {Columns.MANIFEST_FLUID_X: ''}
+
+        parser = ExistingCohortParser(
+            include_participant_column=False,
+            batch_number='M01',
+            search_locations=[],
+            project=self.project_name,
+            allow_missing_files=True,
+        )
+        parser.filename_map = {}
+
+        with self.assertLogs(level='INFO') as cm:
+            read_filenames = await parser.get_read_filenames(
+                sample_id='', row=single_row
+            )
+
+        self.assertEqual(len(cm.output), 1)
+        self.assertIn('No read files found for ', cm.output[0])
+
+        self.assertEqual(len(read_filenames), 0)
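A note on the logging assertion above: assertLogs(level='INFO') captures the logger.warning call because WARNING sits above the INFO threshold. To exercise just these two tests locally, something like the following should work (an assumption about the repo's test tooling, not something stated in this PR):

    python -m unittest -k get_read_filenames test.test_parse_existing_cohort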
