From 9199dab65cb1d3ad1e0d4c731ba89ef6a770b637 Mon Sep 17 00:00:00 2001 From: EddieLF <34049565+EddieLF@users.noreply.github.com> Date: Fri, 16 Feb 2024 16:11:18 +1100 Subject: [PATCH] Rna ingestion updates (#676) * Updating parsers with library type etc * Add new sample map column defaults, update sg meta on RNA ingest * Update parser column default, update rna sg meta, update parsing prints * Change reference to 'sequence' to 'assay' * Change reference to 'library_type' to 'sequencing_library' * Change another reference to 'library_type' to 'sequencing_library' * Add required fields for exomes, more tidying, use tabulate * Add tabulate to requirements * Fix parse_manifest return for tests, update script referencing group_assays fn * Remove required exome fields, re-add default analysis status 'completed' * Update parser testing with RNA examples * Remove added external_family_id from ParsedParticipant class * Refactor parser to use SequencingDefualts class * Collapse sequencing defaults into new class * Update setup.py and resolve merge conflicts * isort linting * Fix typehint for read and ref files function return --- metamist/parser/generic_metadata_parser.py | 209 ++++++++++++++------- metamist/parser/generic_parser.py | 191 ++++++++++++++----- metamist/parser/sample_file_map_parser.py | 103 +++++++--- scripts/parse_existing_cohort.py | 6 +- scripts/parse_ont_sheet.py | 38 ++-- scripts/parse_sample_file_map.py | 18 +- scripts/parse_vcgs_manifest.py | 6 +- setup.py | 1 + test/test_parse_existing_cohort.py | 4 +- test/test_parse_file_map.py | 172 ++++++++++++++++- 10 files changed, 569 insertions(+), 179 deletions(-) diff --git a/metamist/parser/generic_metadata_parser.py b/metamist/parser/generic_metadata_parser.py index d89104df0..3c9758770 100644 --- a/metamist/parser/generic_metadata_parser.py +++ b/metamist/parser/generic_metadata_parser.py @@ -4,15 +4,17 @@ import re import shlex from functools import reduce -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import click from metamist.parser.generic_parser import ( # noqa + DefaultSequencing, GenericParser, GroupedRow, ParsedAnalysis, ParsedAssay, + ParsedSample, ParsedSequencingGroup, SingleRow, run_as_sync, @@ -92,16 +94,22 @@ def __init__( seq_type_column: Optional[str] = None, seq_technology_column: Optional[str] = None, seq_platform_column: Optional[str] = None, + seq_facility_column: Optional[str] = None, + seq_library_column: Optional[str] = None, + read_end_type_column: Optional[str] = None, + read_length_column: Optional[str] = None, gvcf_column: Optional[str] = None, meta_column: Optional[str] = None, seq_meta_column: Optional[str] = None, batch_number: Optional[str] = None, reference_assembly_location_column: Optional[str] = None, default_reference_assembly_location: Optional[str] = None, - default_sequencing_type='genome', default_sample_type=None, - default_sequencing_technology='short-read', - default_sequencing_platform='illumina', + default_sequencing=DefaultSequencing( + seq_type='genome', technology='short-read', platform='illumina' + ), + default_read_end_type: Optional[str] = None, + default_read_length: Optional[str | int] = None, allow_extra_files_in_search_path=False, **kwargs, ): @@ -109,10 +117,10 @@ def __init__( path_prefix=None, search_paths=search_locations, project=project, - default_sequencing_type=default_sequencing_type, default_sample_type=default_sample_type, - default_sequencing_technology=default_sequencing_technology, - default_sequencing_platform=default_sequencing_platform, + default_sequencing=default_sequencing, + default_read_end_type=default_read_end_type, + default_read_length=default_read_length, **kwargs, ) @@ -130,6 +138,10 @@ def __init__( self.seq_type_column = seq_type_column self.seq_technology_column = seq_technology_column self.seq_platform_column = seq_platform_column + self.seq_facility_column = seq_facility_column + self.seq_library_column = seq_library_column + self.read_end_type_column = read_end_type_column + self.read_length_column = read_length_column self.reference_assembly_location_column = reference_assembly_location_column self.default_reference_assembly_location = default_reference_assembly_location @@ -168,14 +180,14 @@ def get_sequencing_types(self, row: GroupedRow) -> list[str]: if isinstance(row, dict): return [self.get_sequencing_type(row)] return [ - str(r.get(self.seq_type_column, self.default_sequencing_type)) for r in row + str(r.get(self.seq_type_column, self.default_sequencing.seq_type)) for r in row ] def get_sequencing_technology(self, row: SingleRow) -> str: """Get assay technology for single row""" value = ( row.get(self.seq_technology_column, None) - or self.default_sequencing_technology + or self.default_sequencing.technology ) value = value.lower() @@ -187,7 +199,7 @@ def get_sequencing_technology(self, row: SingleRow) -> str: def get_sequencing_platform(self, row: SingleRow) -> str: """Get sequencing platform for single row""" value = ( - row.get(self.seq_platform_column, None) or self.default_sequencing_platform + row.get(self.seq_platform_column, None) or self.default_sequencing.platform ) value = value.lower() @@ -195,7 +207,7 @@ def get_sequencing_platform(self, row: SingleRow) -> str: def get_sequencing_type(self, row: SingleRow) -> str: """Get assay type from row""" - value = row.get(self.seq_type_column, None) or self.default_sequencing_type + value = row.get(self.seq_type_column, None) or self.default_sequencing.seq_type value = value.lower() if value == 'wgs': @@ -204,9 +216,47 @@ def get_sequencing_type(self, row: SingleRow) -> str: value = 'exome' elif 'mt' in value: value = 'mtseq' + elif 'polya' in value or 'mrna' in value: + value = 'polyarna' + elif 'total' in value: + value = 'totalrna' + elif 'single' in value: + value = 'singlecellrna' return str(value) + def get_sequencing_facility(self, row: SingleRow) -> str: + """Get sequencing facility from row""" + value = ( + row.get(self.seq_facility_column, None) or self.default_sequencing.facility + ) + value = str(value) if value else None + return value + + def get_sequencing_library(self, row: SingleRow) -> str: + """Get sequencing library from row""" + value = ( + row.get(self.seq_library_column, None) or self.default_sequencing.library + ) + value = str(value) if value else None + return value + + def get_read_end_type(self, row: SingleRow) -> str: + """Get read end type from row""" + value = ( + row.get(self.read_end_type_column, None) or self.default_read_end_type + ) + value = str(value).lower() if value else None + return value + + def get_read_length(self, row: SingleRow) -> int: + """Get read length from row""" + value = ( + row.get(self.read_length_column, None) or self.default_read_length + ) + value = int(value) if value else None + return value + def get_assay_id(self, row: GroupedRow) -> Optional[dict[str, str]]: """Get external assay ID from row. Needs to be implemented per parser. NOTE: To be re-thought after assay group changes are applied""" @@ -515,6 +565,7 @@ async def get_participant_meta_from_group(self, rows: GroupedRow): async def get_sequencing_group_meta( self, sequencing_group: ParsedSequencingGroup ) -> dict: + """Get sequencing group metadata from variant (vcf) files""" meta: dict[str, Any] = {} if not sequencing_group.sample.external_sid: @@ -552,6 +603,55 @@ async def get_sequencing_group_meta( return meta + async def get_read_and_ref_files_and_checksums(self, sample_id: str, rows: GroupedRow) -> ( + Tuple[List[str], List[str], Set[str]]): + """Get read filenames and checksums from rows.""" + read_filenames: List[str] = [] + read_checksums: List[str] = [] + reference_assemblies: set[str] = set() + for r in rows: + _rfilenames = await self.get_read_filenames(sample_id=sample_id, row=r) + read_filenames.extend(_rfilenames) + if self.checksum_column and self.checksum_column in r: + checksums = await self.get_checksums_from_row(sample_id, r, _rfilenames) + if not checksums: + checksums = [None] * len(_rfilenames) + read_checksums.extend(checksums) + if self.reference_assembly_location_column: + ref = r.get(self.reference_assembly_location_column) + if ref: + reference_assemblies.add(ref) + return read_filenames, read_checksums, reference_assemblies + + async def parse_cram_assays(self, sample: ParsedSample, reference_assemblies: set[str]) -> dict[str, Any]: + """Parse CRAM assays""" + if len(reference_assemblies) > 1: + # sorted for consistent testing + str_ref_assemblies = ', '.join(sorted(reference_assemblies)) + raise ValueError( + f'Multiple reference assemblies were defined for {sample.external_sid}: {str_ref_assemblies}' + ) + if len(reference_assemblies) == 1: + ref = next(iter(reference_assemblies)) + else: + ref = self.default_reference_assembly_location + if not ref: + raise ValueError( + f'Reads type for {sample.external_sid!r} is CRAM, but a reference ' + f'is not defined, please set the default reference assembly path' + ) + + ref_fp = self.file_path(ref) + secondary_files = ( + await self.create_secondary_file_objects_by_potential_pattern( + ref_fp, ['.fai'] + ) + ) + cram_reference = await self.create_file_object( + ref_fp, secondary_files=secondary_files + ) + return {'reference_assembly' : cram_reference} + async def get_assays_from_group( self, sequencing_group: ParsedSequencingGroup ) -> list[ParsedAssay]: @@ -566,27 +666,9 @@ async def get_assays_from_group( assays = [] - read_filenames: List[str] = [] - read_checksums: List[str] = [] - reference_assemblies: set[str] = set() - - for r in rows: - _rfilenames = await self.get_read_filenames( - sample_id=sample.external_sid, row=r - ) - read_filenames.extend(_rfilenames) - if self.checksum_column and self.checksum_column in r: - checksums = await self.get_checksums_from_row( - sample.external_sid, r, _rfilenames - ) - if not checksums: - checksums = [None] * len(_rfilenames) - read_checksums.extend(checksums) - - if self.reference_assembly_location_column: - ref = r.get(self.reference_assembly_location_column) - if ref: - reference_assemblies.add(ref) + read_filenames, read_checksums, reference_assemblies = await self.get_read_and_ref_files_and_checksums( + sample.external_sid, rows + ) # strip in case collaborator put "file1, file2" full_read_filenames: List[str] = [] @@ -614,37 +696,32 @@ async def get_assays_from_group( # collapsed_assay_meta['reads'] = reads[reads_type] if reads_type == 'cram': - if len(reference_assemblies) > 1: - # sorted for consistent testing - str_ref_assemblies = ', '.join(sorted(reference_assemblies)) - raise ValueError( - f'Multiple reference assemblies were defined for {sample.external_sid}: {str_ref_assemblies}' - ) - if len(reference_assemblies) == 1: - ref = next(iter(reference_assemblies)) - else: - ref = self.default_reference_assembly_location - - if not ref: - raise ValueError( - f'Reads type for {sample.external_sid!r} is CRAM, but a reference ' - f'is not defined, please set the default reference assembly path' - ) - - ref_fp = self.file_path(ref) - secondary_files = ( - await self.create_secondary_file_objects_by_potential_pattern( - ref_fp, ['.fai'] - ) + collapsed_assay_meta.update( + await self.parse_cram_assays(sample, reference_assemblies) ) - cram_reference = await self.create_file_object( - ref_fp, secondary_files=secondary_files - ) - collapsed_assay_meta['reference_assembly'] = cram_reference if self.batch_number is not None: collapsed_assay_meta['batch'] = self.batch_number + if sequencing_group.sequencing_type in ['exome', 'polyarna', 'totalrna', 'singlecellrna']: + rows = sequencing_group.rows + # Exome / RNA should have facility and library, allow missing for exomes - for now + if self.get_sequencing_facility(rows[0]): + collapsed_assay_meta['sequencing_facility'] = self.get_sequencing_facility(rows[0]) + if self.get_sequencing_library(rows[0]): + collapsed_assay_meta['sequencing_library'] = self.get_sequencing_library(rows[0]) + # RNA requires read end type and length as well + if sequencing_group.sequencing_type != 'exome': + collapsed_assay_meta['read_end_type'] = self.get_read_end_type(rows[0]) + collapsed_assay_meta['read_length'] = self.get_read_length(rows[0]) + # if any of the above fields are not set for an RNA assay, raise an error + if not all(collapsed_assay_meta.values()): + raise ValueError( + f'Not all required fields were set for RNA sample {sample.external_sid}:\n' + f'{collapsed_assay_meta}\n' + f'Use the default value arguments if they are not present in the manifest.' + ) + for read in reads[reads_type]: assays.append( ParsedAssay( @@ -653,8 +730,8 @@ async def get_assays_from_group( # as we grab all reads, and then determine assay # grouping from there. rows=sequencing_group.rows, - internal_seq_id=None, - external_seq_ids={}, + internal_assay_id=None, + external_assay_ids={}, # unfortunately hard to break them up by row in the current format # assay_status=self.get_assay_status(rows), assay_type='sequencing', @@ -667,6 +744,8 @@ async def get_assays_from_group( }, ) ) + if not sequencing_group.meta: + sequencing_group.meta = self.get_sequencing_group_meta_from_assays(assays) return assays @@ -803,9 +882,9 @@ async def main( if not manifests: raise ValueError('Expected at least 1 manifest') - extra_seach_paths = [m for m in manifests if m.startswith('gs://')] - if extra_seach_paths: - search_path = list(set(search_path).union(set(extra_seach_paths))) + extra_search_paths = [m for m in manifests if m.startswith('gs://')] + if extra_search_paths: + search_path = list(set(search_path).union(set(extra_search_paths))) participant_meta_map: Dict[Any, Any] = {} sample_meta_map: Dict[Any, Any] = {} @@ -839,7 +918,9 @@ async def main( reported_gender_column=reported_gender_column, karyotype_column=karyotype_column, default_sample_type=default_sample_type, - default_sequencing_type=default_assay_type, + default_sequencing=DefaultSequencing( + seq_type='genome', technology='short-read', platform='illumina' + ), search_locations=search_path, ) for manifest in manifests: diff --git a/metamist/parser/generic_parser.py b/metamist/parser/generic_parser.py index 9bee4fe70..8dc2de5d7 100644 --- a/metamist/parser/generic_parser.py +++ b/metamist/parser/generic_parser.py @@ -28,6 +28,7 @@ ) from cloudpathlib import AnyPath +from tabulate import tabulate from metamist.apis import AnalysisApi, AssayApi, ParticipantApi, SampleApi from metamist.graphql import gql, query_async @@ -65,6 +66,7 @@ + GVCF_EXTENSIONS + VCF_EXTENSIONS ) +RNA_SEQ_TYPES = ['polyarna', 'totalrna', 'singlecellrna'] # construct rmatch string to capture all fastq patterns rmatch_str = ( @@ -259,7 +261,7 @@ def to_sm(self) -> SampleUpsert: class ParsedSequencingGroup: - """Class for holding sequence metadata grouped by type""" + """Class for holding sequencing group metadata""" def __init__( self, @@ -268,8 +270,8 @@ def __init__( internal_seqgroup_id: int | None, external_seqgroup_id: str | None, sequencing_type: str, - sequence_technology: str, - sequence_platform: str | None, + sequencing_technology: str, + sequencing_platform: str | None, meta: dict[str, Any] | None, ): self.sample = sample @@ -278,8 +280,8 @@ def __init__( self.internal_seqgroup_id = internal_seqgroup_id self.external_seqgroup_id = external_seqgroup_id self.sequencing_type = sequencing_type - self.sequencing_technology = sequence_technology - self.sequencing_platform = sequence_platform + self.sequencing_technology = sequencing_technology + self.sequencing_platform = sequencing_platform self.meta = meta self.assays: list[ParsedAssay] = [] @@ -292,7 +294,7 @@ def to_sm(self) -> SequencingGroupUpsert: technology=self.sequencing_technology, platform=self.sequencing_platform, meta=self.meta, - assays=[sq.to_sm() for sq in self.assays or []], + assays=[a.to_sm() for a in self.assays or []], id=self.internal_seqgroup_id, ) @@ -304,16 +306,16 @@ def __init__( self, group: ParsedSequencingGroup, rows: GroupedRow, - internal_seq_id: int | None, - external_seq_ids: dict[str, str], + internal_assay_id: int | None, + external_assay_ids: dict[str, str], assay_type: str | None, meta: dict[str, Any] | None, ): self.sequencing_group = group self.rows = rows - self.internal_id = internal_seq_id - self.external_ids = external_seq_ids + self.internal_id = internal_assay_id + self.external_ids = external_assay_ids self.assay_type = assay_type self.meta = meta @@ -350,7 +352,7 @@ def __init__( def to_sm(self): """To SM model""" if not self.sequencing_group.internal_seqgroup_id: - raise ValueError('Sequence group ID must be filled in by now') + raise ValueError('Sequencing group ID must be filled in by now') return Analysis( status=AnalysisStatus(self.status), type=str(self.type), @@ -360,9 +362,26 @@ def to_sm(self): ) +class DefaultSequencing: + """Groups default sequencing information""" + def __init__( + self, + seq_type: str = 'genome', # seq_type because `type` is a built-in + technology: str = 'short-read', + platform: str = 'illumina', + facility: str = None, + library: str = None, + ): + self.seq_type = seq_type + self.technology = technology + self.platform = platform + self.facility = facility + self.library = library + + def chunk(iterable: Iterable[T], chunk_size=50) -> Iterator[List[T]]: """ - Chunk a sequence by yielding lists of `chunk_size` + Chunk an iterable by yielding lists of `chunk_size` """ chnk: List[T] = [] for element in iterable: @@ -398,23 +417,23 @@ def wrapper(*args, **kwargs): class GenericParser( CloudHelper ): # pylint: disable=too-many-public-methods,too-many-arguments - """Parser for VCGS manifest""" + """Parser for ingesting rows of metadata""" def __init__( # pylint: disable=too-many-arguments self, path_prefix: Optional[str], search_paths: list[str], project: str, - default_sequencing_type='genome', - default_sequencing_technology='short-read', - default_sequencing_platform: str | None = None, - default_sample_type=None, - default_analysis_type='qc', - default_analysis_status='completed', - skip_checking_gcs_objects=False, + default_sample_type: str = None, + default_sequencing: DefaultSequencing = DefaultSequencing(), + default_read_end_type: str = None, + default_read_length: str | int = None, + default_analysis_type: str = None, + default_analysis_status: str = 'completed', key_map: Dict[str, str] = None, - ignore_extra_keys=False, required_keys: Set[str] = None, + ignore_extra_keys=False, + skip_checking_gcs_objects=False, verbose=True, ): self.path_prefix = path_prefix @@ -430,12 +449,12 @@ def __init__( # pylint: disable=too-many-arguments self.project = project - self.default_sequencing_type: str = default_sequencing_type - self.default_sequencing_technology: str = default_sequencing_technology - self.default_sequencing_platform: Optional[str] = default_sequencing_platform + self.default_sequencing = default_sequencing + self.default_read_end_type: Optional[str] = default_read_end_type + self.default_read_length: Optional[str] = default_read_length self.default_sample_type: Optional[str] = default_sample_type - self.default_analysis_type: str = default_analysis_type - self.default_analysis_status: str = default_analysis_status + self.default_analysis_type: Optional[str] = default_analysis_type + self.default_analysis_status: Optional[str] = default_analysis_status # gs specific self.default_bucket = None @@ -500,8 +519,7 @@ async def parse_manifest( # pylint: disable=too-many-branches ) -> Any: """ Parse manifest from iterable (file pointer / String.IO) - - Returns a dict mapping external sample ID to CPG sample ID + Returns a summary of the parsed records. """ rows = await self.file_pointer_to_rows( file_pointer=file_pointer, delimiter=delimiter @@ -509,7 +527,13 @@ async def parse_manifest( # pylint: disable=too-many-branches return await self.from_json(rows, confirm, dry_run) async def from_json(self, rows, confirm=False, dry_run=False): - """Parse passed rows""" + """ + Asynchronously parse rows of data, adding chunks of participants, samples, sequencing groups, assays, and analyses. + + Groups rows of participants by their IDs. For each participant, group samples by their IDs. + If no participants are present, groups samples by their IDs. + For each sample, gets its sequencing groups by their keys. For each sequencing group, groups assays and analyses. + """ await self.validate_rows(rows) # one participant with no value @@ -536,7 +560,9 @@ async def from_json(self, rows, confirm=False, dry_run=False): sequencing_groups: list[ParsedSequencingGroup] = [] for schunk in chunk(samples): - seq_groups_for_chunk = await asyncio.gather(*map(self.group_assays, schunk)) + seq_groups_for_chunk = await asyncio.gather( + *map(self.get_sample_sequencing_groups, schunk) + ) for sample, seqgroups in zip(schunk, seq_groups_for_chunk): sample.sequencing_groups = seqgroups @@ -581,6 +607,7 @@ async def from_json(self, rows, confirm=False, dry_run=False): if dry_run: logger.info('Dry run, so returning without inserting / updating metadata') + self.prepare_detail(samples) return summary, (participants if participants else samples) if confirm: @@ -601,7 +628,10 @@ async def from_json(self, rows, confirm=False, dry_run=False): [s.to_sm() for s in samples], ) - print(json.dumps(result, indent=2)) + if self.verbose: + logger.info(json.dumps(result, indent=2)) + else: + self.prepare_detail(samples) def _get_dict_reader(self, file_pointer, delimiter: str): """ @@ -662,6 +692,29 @@ def prepare_summary( return summary + def prepare_detail(self, samples: list[ParsedSample]): + """Uses tabulate to print a detailed summary of the samples being inserted / updated""" + sample_participants = {} + for sample in samples: + sample_participants[sample.external_sid] = sample.participant.external_pid if sample.participant else None + sample_sequencing_groups = {sample.external_sid: sample.sequencing_groups for sample in samples} + + details = [] + for sample, participant in sample_participants.items(): + for sg in sample_sequencing_groups[sample]: + sg_details = { + 'Participant': participant if participant else '', + 'Sample': sample, + 'Sequencing Type': sg.sequencing_type, + 'Assays': sum(1 for a in sg.assays if not a.internal_id), + } + details.append(sg_details) + + headers = ['Participant', 'Sample', 'Sequencing Type', 'Assays'] + table = list(list(detail.values()) for detail in details) + + print(tabulate(table, headers=headers, tablefmt='grid')) + def prepare_message( self, summary, @@ -681,32 +734,38 @@ def prepare_message( header = f'Processing samples: {external_sample_ids}' assays_count: dict[str, int] = defaultdict(int) + assays_types_count: dict[str, int] = defaultdict(int) sequencing_group_counts: dict[str, int] = defaultdict(int) - for s in assays: - assays_count[str(s.assay_type)] += 1 + for a in assays: + assays_count[str(a.meta.get('sequencing_type'))] += 1 + assays_types_count[str(a.assay_type)] += 1 for sg in sequencing_groups: sequencing_group_counts[str(sg.sequencing_type)] += 1 - str_seq_count = ', '.join(f'{k}={v}' for k, v in assays_count.items()) + str_assay_count = ', '.join(f'{k}={v}' for k, v in assays_count.items()) + str_assay_types_count = ', '.join(f'{k}={v}' for k, v in assays_types_count.items()) str_seqg_count = ', '.join( f'{k}={v}' for k, v in sequencing_group_counts.items() ) message = f"""\ + + {self.project}: {header} - Sequence types: {str_seq_count} - Sequence group types: {str_seqg_count} + Assays count: {str_assay_count} + Assays types count: {str_assay_types_count} + Sequencing group count: {str_seqg_count} Adding {summary['participants']['insert']} participants Adding {summary['samples']['insert']} samples - Adding {summary['sequencing_groups']['insert']} sequence groups + Adding {summary['sequencing_groups']['insert']} sequencing groups Adding {summary['assays']['insert']} assays - Adding {summary['analyses']['insert']} analysis + Adding {summary['analyses']['insert']} analyses Updating {summary['participants']['update']} participants Updating {summary['samples']['update']} samples - Updating {summary['sequencing_groups']['update']} sequence groups + Updating {summary['sequencing_groups']['update']} sequencing groups Updating {summary['assays']['update']} assays """ return message @@ -905,7 +964,7 @@ async def group_samples( ) -> list[ParsedSample]: """ From a set of rows, group (by calling self.get_sample_id) - and parse sample other sample values. + and parse samples and their values. """ samples = [] for sid, sample_rows in group_by(rows, self.get_sample_id).items(): @@ -928,7 +987,7 @@ async def get_sample_meta_from_group(self, rows: GroupedRow) -> dict: def get_sequencing_group_key(self, row: SingleRow) -> Hashable: """ - Get a key to group sequencing rows by. + Get a key to group sequencing group rows by. """ if seq_group_id := self.get_sequencing_group_id(row): return seq_group_id @@ -945,10 +1004,10 @@ def get_sequencing_group_key(self, row: SingleRow) -> Hashable: return tuple(v for _, v in keys) - async def group_assays(self, sample: ParsedSample) -> list[ParsedSequencingGroup]: + async def get_sample_sequencing_groups(self, sample: ParsedSample) -> list[ParsedSequencingGroup]: """ - From a set of rows, group (by calling self.get_sequencing_group_key) - and parse sequencing group other sequencing group values. + From a set of samples, group (by calling self.get_sequencing_group_key) + and parse sequencing groups and their values. """ sequencing_groups = [] for seq_rows in group_by(sample.rows, self.get_sequencing_group_key).values(): @@ -960,8 +1019,8 @@ async def group_assays(self, sample: ParsedSample) -> list[ParsedSequencingGroup internal_seqgroup_id=None, external_seqgroup_id=self.get_sequencing_group_id(seq_rows[0]), sequencing_type=seq_type, - sequence_technology=seq_tech, - sequence_platform=seq_platform, + sequencing_technology=seq_tech, + sequencing_platform=seq_platform, meta={}, sample=sample, rows=seq_rows, @@ -998,6 +1057,23 @@ async def get_assays_from_group( return list[ParsedAssay] (does not have to equal number of rows). """ + def get_sequencing_group_meta_from_assays(self, assays: list[ParsedAssay]) -> dict: + """ + From a list of assays, get any relevant sequencing group meta + """ + meta = {} + for assay in assays: + if assay.meta.get('sequencing_type') == 'exome': + keys = ('sequencing_facility', 'sequencing_library') + elif assay.meta.get('sequencing_type') in RNA_SEQ_TYPES: + keys = ('sequencing_facility', 'sequencing_library', 'read_end_type', 'read_length') + else: + continue + for key in keys: + if assay.meta.get(key) is not None: + meta[key] = assay.meta[key] + return meta + def get_sample_type(self, row: GroupedRow) -> str: """Get sample type from row""" return self.default_sample_type @@ -1014,15 +1090,31 @@ def get_sequencing_group_id(self, row: SingleRow) -> str | None: def get_sequencing_type(self, row: SingleRow) -> str: """Get sequence types from row""" - return self.default_sequencing_type + return self.default_sequencing.seq_type def get_sequencing_technology(self, row: SingleRow) -> str: """Get sequencing technology from row""" - return self.default_sequencing_technology + return self.default_sequencing.technology def get_sequencing_platform(self, row: SingleRow) -> str | None: """Get sequencing platform from row""" - return self.default_sequencing_platform + return self.default_sequencing.platform + + def get_sequencing_facility(self, row: SingleRow) -> str | None: + """Get sequencing facility from row""" + return self.default_sequencing.facility + + def get_sequencing_library(self, row: SingleRow) -> str | None: + """Get library type from row""" + return self.default_sequencing.library + + def get_read_end_type(self, row: SingleRow) -> str | None: + """Get read end type from row""" + return self.default_read_end_type + + def get_read_length(self, row: SingleRow) -> str | None: + """Get read length from row""" + return self.default_read_length def get_analysis_type(self, sample_id: str, row: GroupedRow) -> str: """Get analysis type from row""" @@ -1293,6 +1385,7 @@ def parse_fastqs_structure(fastqs) -> List[List[str]]: invalid_fastq_groups = [grp for grp in fastq_groups.values() if len(grp) != 2] if invalid_fastq_groups: + # TODO: implement handling for single-ended reads raise ValueError(f'Invalid fastq group {invalid_fastq_groups}') sorted_groups = sorted( diff --git a/metamist/parser/sample_file_map_parser.py b/metamist/parser/sample_file_map_parser.py index 077aca912..e095d8036 100644 --- a/metamist/parser/sample_file_map_parser.py +++ b/metamist/parser/sample_file_map_parser.py @@ -6,13 +6,17 @@ import click from metamist.parser.generic_metadata_parser import GenericMetadataParser, run_as_sync -from metamist.parser.generic_parser import SingleRow +from metamist.parser.generic_parser import DefaultSequencing, SingleRow PARTICIPANT_COL_NAME = 'individual_id' SAMPLE_ID_COL_NAME = 'sample_id' READS_COL_NAME = 'filenames' SEQ_TYPE_COL_NAME = 'type' CHECKSUM_COL_NAME = 'checksum' +SEQ_FACILITY_COL_NAME = 'sequencing_facility' +SEQ_LIBRARY_COL_NAME = 'sequencing_library' +READ_END_TYPE_COL_NAME = 'read_end_type' +READ_LENGTH_COL_NAME = 'read_length' KeyMap = { PARTICIPANT_COL_NAME: [ @@ -28,6 +32,10 @@ SAMPLE_ID_COL_NAME: ['sample_id', 'sample', 'sample id'], READS_COL_NAME: ['filename', 'filenames', 'files', 'file'], SEQ_TYPE_COL_NAME: ['type', 'types', 'sequencing type', 'sequencing_type'], + SEQ_FACILITY_COL_NAME: ['facility', 'sequencing facility', 'sequencing_facility'], + SEQ_LIBRARY_COL_NAME: ['library', 'library_prep', 'library prep', 'library type', 'library_type', 'sequencing_library', 'sequencing library'], + READ_END_TYPE_COL_NAME: ['read_end_type', 'read end type', 'read_end_types', 'read end types', 'end type', 'end_type', 'end_types', 'end types'], + READ_LENGTH_COL_NAME: ['length', 'read length', 'read_length', 'read lengths', 'read_lengths'], CHECKSUM_COL_NAME: ['md5', 'checksum'], } @@ -41,6 +49,10 @@ - 'Filenames' - ['Type'] - 'Checksum' +- ['Sequencing Facility'] - needed for exome & rna samples +- ['Library Type'] - needed for exome & rna samples +- ['End Type'] - needed for rna samples +- ['Read Length'] - needed for rna samples e.g. Sample ID Filenames @@ -63,6 +75,13 @@ Apollo sample_id004 sample_id004.filename-R1.fastq.gz Apollo sample_id004 sample_id004.filename-R2.fastq.gz +Example with optional columns for RNA samples +e.g. + Individual ID Sample ID Filenames Type Facility Library End Type Read Length + Hera sample_id001 sample_id001_TSStrtRNA_R1.fastq.gz,sample_id001_TSStrtRNA_R2.fastq.gz totalrna VCGS TSStrtRNA paired 151 + Hestia sample_id002 sample_id002_TSStrmRNA_R1.fastq.gz,sample_id002_TSStrmRNA_R2.fastq.gz polyarna VCGS TSStrmRNA paired 151 + + This format is useful for ingesting filenames for the seqr loading pipeline """ @@ -78,12 +97,15 @@ def __init__( self, search_locations: List[str], project: str, - default_sequencing_type='genome', default_sample_type='blood', - default_sequencing_technology='short-read', - default_sequencing_platform='illumina', + default_sequencing=DefaultSequencing( + seq_type='genome', technology='short-read', platform='illumina' + ), + default_read_end_type: str = None, + default_read_length: str | int = None, allow_extra_files_in_search_path=False, default_reference_assembly_location: str | None = None, + verbose=True, ): super().__init__( search_locations=search_locations, @@ -93,9 +115,14 @@ def __init__( reads_column=READS_COL_NAME, checksum_column=CHECKSUM_COL_NAME, seq_type_column=SEQ_TYPE_COL_NAME, - default_sequencing_type=default_sequencing_type, + seq_facility_column=SEQ_FACILITY_COL_NAME, + seq_library_column=SEQ_LIBRARY_COL_NAME, + read_end_type_column=READ_END_TYPE_COL_NAME, + read_length_column=READ_LENGTH_COL_NAME, default_sample_type=default_sample_type, - default_sequencing_technology=default_sequencing_technology, + default_sequencing=default_sequencing, + default_read_end_type=default_read_end_type, + default_read_length=default_read_length, default_reference_assembly_location=default_reference_assembly_location, participant_meta_map={}, sample_meta_map={}, @@ -103,6 +130,7 @@ def __init__( qc_meta_map={}, allow_extra_files_in_search_path=allow_extra_files_in_search_path, key_map=KeyMap, + verbose=verbose, ) def get_sample_id(self, row: SingleRow) -> str: @@ -127,10 +155,19 @@ def get_info() -> tuple[str, str]: help='The metamist project to import manifest into', ) @click.option('--default-sample-type', default='blood') -@click.option('--default-sequence-type', default='wgs') -@click.option('--default-sequence-technology', default='short-read') +@click.option('--default-sequencing-type', default='wgs') +@click.option('--default-sequencing-technology', default='short-read') +@click.option('--default-sequencing-facility', default=None) +@click.option('--default-sequencing-library', default=None) +@click.option('--default-read-end-type', default=None) +@click.option('--default-read-length', default=None) @click.option( - '--confirm', is_flag=True, help='Confirm with user input before updating server' + '--default-reference-assembly', + required=False, + help=( + 'CRAMs require a reference assembly to realign. ' + 'This must be provided if any of the reads are crams' + ), ) @click.option( '--search-path', @@ -139,52 +176,62 @@ def get_info() -> tuple[str, str]: help='Search path to search for files within', ) @click.option( - '--dry-run', is_flag=True, help='Just prepare the run, without comitting it' -) -@click.option( - '--allow-extra-files-in-search_path', + '--allow-extra-files-in-search-path', is_flag=True, help='By default, this parser will fail if there are crams, bams, fastqs ' 'in the search path that are not covered by the sample map.', ) @click.option( - '--default-reference-assembly', - required=False, - help=( - 'CRAMs require a reference assembly to realign. ' - 'This must be provided if any of the reads are crams' - ), + '--confirm', is_flag=True, help='Confirm with user input before updating server' ) +@click.option( + '--dry-run', is_flag=True, help='Just prepare the run, without comitting it' +) +@click.option('--verbose', '-v', is_flag=True, help='Verbose output') @click.argument('manifests', nargs=-1) @run_as_sync -async def main( +async def main( # pylint: disable=too-many-arguments manifests, search_path: List[str], project, default_sample_type='blood', default_sequencing_type='genome', - default_sequence_technology='short-read', + default_sequencing_technology='short-read', + default_sequencing_platform='illumina', + default_sequencing_facility: str = None, + default_sequencing_library: str = None, + default_read_end_type: str = None, + default_read_length: str = None, default_reference_assembly: str = None, + allow_extra_files_in_search_path=False, confirm=False, dry_run=False, - allow_extra_files_in_search_path=False, + verbose=False, ): """Run script from CLI arguments""" if not manifests: raise ValueError('Expected at least 1 manifest') - extra_seach_paths = [m for m in manifests if m.startswith('gs://')] - if extra_seach_paths: - search_path = list(set(search_path).union(set(extra_seach_paths))) + extra_search_paths = [m for m in manifests if m.startswith('gs://')] + if extra_search_paths: + search_path = list(set(search_path).union(set(extra_search_paths))) parser = SampleFileMapParser( project=project, default_sample_type=default_sample_type, - default_sequencing_type=default_sequencing_type, - default_sequencing_technology=default_sequence_technology, + default_sequencing=DefaultSequencing( + seq_type=default_sequencing_type, + technology=default_sequencing_technology, + platform=default_sequencing_platform, + facility=default_sequencing_facility, + library=default_sequencing_library, + ), + default_read_end_type=default_read_end_type, + default_read_length=default_read_length, + default_reference_assembly_location=default_reference_assembly, search_locations=search_path, allow_extra_files_in_search_path=allow_extra_files_in_search_path, - default_reference_assembly_location=default_reference_assembly, + verbose=verbose, ) for manifest in manifests: logger.info(f'Importing {manifest}') diff --git a/scripts/parse_existing_cohort.py b/scripts/parse_existing_cohort.py index 1df35063c..e0665e1ca 100644 --- a/scripts/parse_existing_cohort.py +++ b/scripts/parse_existing_cohort.py @@ -49,7 +49,7 @@ SingleRow, run_as_sync, ) -from metamist.parser.generic_parser import READS_EXTENSIONS +from metamist.parser.generic_parser import READS_EXTENSIONS, DefaultSequencing logger = logging.getLogger(__file__) logger.addHandler(logging.StreamHandler()) @@ -132,7 +132,9 @@ def __init__( assay_meta_map=Columns.sequence_meta_map(), batch_number=batch_number, allow_extra_files_in_search_path=True, - default_sequencing_type=sequencing_type, + default_sequencing=DefaultSequencing( + seq_type=sequencing_type, + ) ) def _get_dict_reader(self, file_pointer, delimiter: str): diff --git a/scripts/parse_ont_sheet.py b/scripts/parse_ont_sheet.py index 61865a122..29fe0c2ba 100644 --- a/scripts/parse_ont_sheet.py +++ b/scripts/parse_ont_sheet.py @@ -6,7 +6,11 @@ import click from metamist.parser.generic_metadata_parser import GenericMetadataParser, run_as_sync -from metamist.parser.generic_parser import ParsedSample, ParsedSequencingGroup +from metamist.parser.generic_parser import ( + DefaultSequencing, + ParsedSample, + ParsedSequencingGroup, +) logger = logging.getLogger(__file__) logger.addHandler(logging.StreamHandler()) @@ -37,10 +41,12 @@ def __init__( self, search_locations: List[str], project: str, - default_sequencing_type='genome', - default_sequencing_technology='long-read', - default_sequencing_platform='oxford-nanopore', default_sample_type='blood', + default_sequencing=DefaultSequencing( + seq_type='genome', + technology='long-read', + platform='oxford-nanopore', + ), allow_extra_files_in_search_path=False, ): sequence_meta_map = { @@ -66,9 +72,7 @@ def __init__( sample_name_column=Columns.SAMPLE_ID, reads_column=Columns.PASS_FASTQ_FILENAME, default_sample_type=default_sample_type, - default_sequencing_type=default_sequencing_type, - default_sequencing_technology=default_sequencing_technology, - default_sequencing_platform=default_sequencing_platform, + default_sequencing=default_sequencing, participant_meta_map={}, sample_meta_map={}, assay_meta_map=sequence_meta_map, @@ -93,8 +97,8 @@ def parse_fastqs_structure(fastqs) -> List[List[str]]: """ return [fastqs] - async def group_assays(self, sample: ParsedSample) -> list[ParsedSequencingGroup]: - sequencing_groups = await super().group_assays(sample) + async def get_sample_sequencing_groups(self, sample: ParsedSample) -> list[ParsedSequencingGroup]: + sequencing_groups = await super().get_sample_sequencing_groups(sample) for sequencing_group in sequencing_groups: failed_fastqs: list[str] = [] @@ -128,7 +132,7 @@ async def group_assays(self, sample: ParsedSample) -> list[ParsedSequencingGroup help='The metamist project to import manifest into', ) @click.option('--default-sample-type', default='blood') -@click.option('--default-sequence-type', default='genome') +@click.option('--default-sequencing-type', default='genome') @click.option('--default-sequencing-technology', default='long-read') @click.option('--default-sequencing-platform', default='oxford-nanopore') @click.option( @@ -167,16 +171,18 @@ async def main( if not manifests: raise ValueError('Expected at least 1 manifest') - extra_seach_paths = [m for m in manifests if m.startswith('gs://')] - if extra_seach_paths: - search_path = list(set(search_path).union(set(extra_seach_paths))) + extra_search_paths = [m for m in manifests if m.startswith('gs://')] + if extra_search_paths: + search_path = list(set(search_path).union(set(extra_search_paths))) parser = OntParser( project=project, default_sample_type=default_sample_type, - default_sequencing_type=default_sequencing_type, - default_sequencing_platform=default_sequencing_platform, - default_sequencing_technology=default_sequencing_technology, + default_sequencing=DefaultSequencing( + seq_type=default_sequencing_type, + technology=default_sequencing_technology, + platform=default_sequencing_platform, + ), search_locations=search_path, allow_extra_files_in_search_path=allow_extra_files_in_search_path, ) diff --git a/scripts/parse_sample_file_map.py b/scripts/parse_sample_file_map.py index ccc5059a3..e4e4ca7bb 100755 --- a/scripts/parse_sample_file_map.py +++ b/scripts/parse_sample_file_map.py @@ -5,7 +5,7 @@ import click -from metamist.parser.generic_metadata_parser import run_as_sync +from metamist.parser.generic_metadata_parser import DefaultSequencing, run_as_sync from metamist.parser.sample_file_map_parser import SampleFileMapParser __DOC = """ @@ -36,7 +36,7 @@ ) @click.option('--default-sample-type', default='blood') @click.option('--default-sequencing-type', default='wgs') -@click.option('--default-sequence-technology', default='short-read') +@click.option('--default-sequencing-technology', default='short-read') @click.option( '--confirm', is_flag=True, help='Confirm with user input before updating server' ) @@ -67,7 +67,7 @@ async def main( project, default_sample_type='blood', default_sequencing_type='wgs', - default_sequence_technology='short-read', + default_sequencing_technology='short-read', confirm=False, dry_run=False, allow_extra_files_in_search_path=False, @@ -77,15 +77,17 @@ async def main( if not manifests: raise ValueError('Expected at least 1 manifest') - extra_seach_paths = [m for m in manifests if m.startswith('gs://')] - if extra_seach_paths: - search_path = list(set(search_path).union(set(extra_seach_paths))) + extra_search_paths = [m for m in manifests if m.startswith('gs://')] + if extra_search_paths: + search_path = list(set(search_path).union(set(extra_search_paths))) parser = SampleFileMapParser( project=project, default_sample_type=default_sample_type, - default_sequencing_type=default_sequencing_type, - default_sequencing_technology=default_sequence_technology, + default_sequencing=DefaultSequencing( + seq_type=default_sequencing_type, + technology=default_sequencing_technology, + ), search_locations=search_path, allow_extra_files_in_search_path=allow_extra_files_in_search_path, default_reference_assembly_location=ref, diff --git a/scripts/parse_vcgs_manifest.py b/scripts/parse_vcgs_manifest.py index 491640322..78c4691a6 100644 --- a/scripts/parse_vcgs_manifest.py +++ b/scripts/parse_vcgs_manifest.py @@ -114,14 +114,14 @@ def get_sequencing_type(self, row: SingleRow) -> SequenceType: types = [types] if len(types) <= 0: if ( - self.default_sequencing_type is None - or self.default_sequencing_type.lower() == 'none' + self.default_sequencing.seq_type is None + or self.default_sequencing.seq_type.lower() == 'none' ): raise ValueError( f"Couldn't detect sequence type for sample {sample_id}, and " 'no default was available.' ) - return SequenceType(self.default_sequencing_type) + return SequenceType(self.default_sequencing.seq_type) if len(types) > 1: raise ValueError( f'Multiple library types for same sample {sample_id}, ' diff --git a/setup.py b/setup.py index 1c1d579c0..d49473b98 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ # for get id-token 'cpg-utils >= 4.9.4', 'gql[aiohttp,requests]', + 'tabulate >= 0.9.0' ], entry_points={ 'metamist_parser': [ diff --git a/test/test_parse_existing_cohort.py b/test/test_parse_existing_cohort.py index 8fb169803..a40989857 100644 --- a/test/test_parse_existing_cohort.py +++ b/test/test_parse_existing_cohort.py @@ -299,7 +299,7 @@ async def test_genome_sequencing_type(self): allow_missing_files=True, sequencing_type='genome', ) - self.assertEqual(parser.default_sequencing_type, 'genome') + self.assertEqual(parser.default_sequencing.seq_type, 'genome') @run_as_sync async def test_exome_sequencing_type(self): @@ -314,7 +314,7 @@ async def test_exome_sequencing_type(self): allow_missing_files=True, sequencing_type='exome', ) - self.assertEqual(parser.default_sequencing_type, 'exome') + self.assertEqual(parser.default_sequencing.seq_type, 'exome') @run_as_sync @patch('metamist.parser.generic_parser.query_async') diff --git a/test/test_parse_file_map.py b/test/test_parse_file_map.py index 22e7b700f..ff1aa5046 100644 --- a/test/test_parse_file_map.py +++ b/test/test_parse_file_map.py @@ -2,7 +2,7 @@ from test.testbase import DbIsolatedTest, run_as_sync from unittest.mock import patch -from metamist.parser.generic_parser import ParsedParticipant +from metamist.parser.generic_parser import DefaultSequencing, ParsedParticipant from metamist.parser.sample_file_map_parser import SampleFileMapParser @@ -25,7 +25,7 @@ async def test_single_row_fastq(self, mock_graphql_query): parser = SampleFileMapParser( search_locations=[], project=self.project_name, - default_sequencing_technology='short-read', + default_sequencing=DefaultSequencing(), ) fs = ['.filename-R1.fastq.gz', '.filename-R2.fastq.gz'] parser.filename_map = {k: 'gs://BUCKET/FAKE/' + k for k in fs} @@ -91,7 +91,7 @@ async def test_to_external(self, mock_graphql_query): parser = SampleFileMapParser( search_locations=[], project=self.project_name, - default_sequencing_technology='short-read', + default_sequencing=DefaultSequencing(), ) fs = ['.filename-R1.fastq.gz', '.filename-R2.fastq.gz'] parser.filename_map = {k: 'gs://BUCKET/FAKE/' + k for k in fs} @@ -148,7 +148,7 @@ async def test_two_rows_with_provided_checksums(self, mock_graphql_query): self.maxDiff = None self.assertDictEqual({}, participants[0].samples[0].meta) - expected_sequence1_reads = [ + expected_assay1_reads = [ { 'location': 'gs://BUCKET/FAKE/.filename-R1.fastq.gz', 'basename': '.filename-R1.fastq.gz', @@ -168,11 +168,11 @@ async def test_two_rows_with_provided_checksums(self, mock_graphql_query): ] self.assertListEqual( - expected_sequence1_reads, + expected_assay1_reads, participants[0].samples[0].sequencing_groups[0].assays[0].meta['reads'], ) - expected_sequence2_reads = [ + expected_assay2_reads = [ { 'location': 'gs://BUCKET/FAKE/.filename-R1.fastq.gz', 'basename': '.filename-R1.fastq.gz', @@ -191,6 +191,164 @@ async def test_two_rows_with_provided_checksums(self, mock_graphql_query): }, ] self.assertListEqual( - expected_sequence2_reads, + expected_assay2_reads, participants[1].samples[0].sequencing_groups[0].assays[0].meta['reads'], ) + + @run_as_sync + @patch('metamist.parser.generic_parser.query_async') + async def test_valid_rna_rows(self, mock_graphql_query): + """ + Test importing a single row of rna data + """ + + mock_graphql_query.side_effect = self.run_graphql_query_async + + rows = [ + 'Sample ID\tFilenames\tType\tfacility\tlibrary\tend_type\tread_length', + '\t.filename-R1.fastq.gz,.filename-R2.fastq.gz\tpolyarna\tVCGS\tTSStrmRNA\tpaired\t151', + '\t.filename-R1.fastq.gz\ttotalrna\tVCGS\tTSStrtRNA\tpaired\t151', + '\t.filename-R2.fastq.gz\ttotalrna\tVCGS\tTSStrtRNA\tpaired\t151', + ] + + parser = SampleFileMapParser( + search_locations=[], + # doesn't matter, we're going to mock the call anyway + project=self.project_name, + ) + fs = [ + '.filename-R1.fastq.gz', + '.filename-R2.fastq.gz', + '.filename-R1.fastq.gz', + '.filename-R2.fastq.gz', + ] + parser.filename_map = {k: 'gs://BUCKET/FAKE/' + k for k in fs} + parser.skip_checking_gcs_objects = True + + file_contents = '\n'.join(rows) + summary, samples = await parser.parse_manifest( + StringIO(file_contents), delimiter='\t', dry_run=True + ) + + self.assertEqual(0, summary['participants']['insert']) + self.assertEqual(0, summary['participants']['update']) + self.assertEqual(2, summary['samples']['insert']) + self.assertEqual(0, summary['samples']['update']) + self.assertEqual(2, summary['assays']['insert']) + self.assertEqual(0, summary['assays']['update']) + self.maxDiff = None + + self.assertEqual('polyarna', samples[0].sequencing_groups[0].sequencing_type) + expected_sg1_meta = { + 'sequencing_facility': 'VCGS', + 'sequencing_library': 'TSStrmRNA', + 'read_end_type': 'paired', + 'read_length': 151 + } + self.assertDictEqual( + expected_sg1_meta, + samples[0].sequencing_groups[0].meta, + ) + + self.assertEqual('totalrna', samples[1].sequencing_groups[0].sequencing_type) + expected_sg2_meta = { + 'sequencing_facility': 'VCGS', + 'sequencing_library': 'TSStrtRNA', + 'read_end_type': 'paired', + 'read_length': 151 + } + self.assertDictEqual( + expected_sg2_meta, + samples[1].sequencing_groups[0].meta, + ) + + @run_as_sync + @patch('metamist.parser.generic_parser.query_async') + async def test_invalid_rna_row(self, mock_graphql_query): + """ + Test importing a single row of rna data + """ + + mock_graphql_query.side_effect = self.run_graphql_query_async + + rows = [ + 'Sample ID\tFilenames\tType', + '\t.filename-R1.fastq.gz,.filename-R2.fastq.gz\tpolyarna' + ] + + parser = SampleFileMapParser( + search_locations=[], + # doesn't matter, we're going to mock the call anyway + project=self.project_name, + ) + fs = [ + '.filename-R1.fastq.gz', + '.filename-R2.fastq.gz', + ] + parser.filename_map = {k: 'gs://BUCKET/FAKE/' + k for k in fs} + parser.skip_checking_gcs_objects = True + + file_contents = '\n'.join(rows) + with self.assertRaises( + ValueError + ): + _, _ = await parser.parse_manifest( + StringIO(file_contents), delimiter='\t', dry_run=True + ) + + @run_as_sync + @patch('metamist.parser.generic_parser.query_async') + async def test_rna_row_with_default_field_values(self, mock_graphql_query): + """ + Test importing a single row of rna data + """ + + mock_graphql_query.side_effect = self.run_graphql_query_async + + rows = [ + 'Sample ID\tFilenames\tType', + '\t.filename-R1.fastq.gz,.filename-R2.fastq.gz\tpolyarna' + ] + + parser = SampleFileMapParser( + search_locations=[], + # doesn't matter, we're going to mock the call anyway + project=self.project_name, + default_sequencing=DefaultSequencing( + facility='VCGS', + library='TSStrmRNA' + ), + default_read_end_type='paired', + default_read_length=151 + ) + fs = [ + '.filename-R1.fastq.gz', + '.filename-R2.fastq.gz', + ] + parser.filename_map = {k: 'gs://BUCKET/FAKE/' + k for k in fs} + parser.skip_checking_gcs_objects = True + + file_contents = '\n'.join(rows) + summary, samples = await parser.parse_manifest( + StringIO(file_contents), delimiter='\t', dry_run=True + ) + + self.assertEqual(0, summary['participants']['insert']) + self.assertEqual(0, summary['participants']['update']) + self.assertEqual(1, summary['samples']['insert']) + self.assertEqual(0, summary['samples']['update']) + self.assertEqual(1, summary['assays']['insert']) + self.assertEqual(0, summary['assays']['update']) + self.maxDiff = None + + self.assertEqual('polyarna', samples[0].sequencing_groups[0].sequencing_type) + expected_sg1_meta = { + 'sequencing_facility': 'VCGS', + 'sequencing_library': 'TSStrmRNA', + 'read_end_type': 'paired', + 'read_length': 151 + } + self.assertDictEqual( + expected_sg1_meta, + samples[0].sequencing_groups[0].meta, + )