From a4025aa6e9b33271bb9301b04c014bbdb8d4feca Mon Sep 17 00:00:00 2001
From: EddieLF
Date: Wed, 16 Aug 2023 12:24:57 +1000
Subject: [PATCH] Update VCGS filename parser, linting fixes

---
 scripts/parse_redcap__mcri-lrp.py | 140 +++++++++++++++++++-----------
 scripts/redcap_parsing_utils.py   |  76 ++++++++++------
 2 files changed, 136 insertions(+), 80 deletions(-)

diff --git a/scripts/parse_redcap__mcri-lrp.py b/scripts/parse_redcap__mcri-lrp.py
index 73de78ab1..b868b0c67 100644
--- a/scripts/parse_redcap__mcri-lrp.py
+++ b/scripts/parse_redcap__mcri-lrp.py
@@ -1,13 +1,10 @@
 """ Parses acute-care redcap dump ingest into metamist """
+# pylint: disable=too-many-locals
 import csv
 import tempfile
 
 import click
 
-from sample_metadata.apis import FamilyApi, ImportApi
-from sample_metadata.parser.generic_metadata_parser import run_as_sync
-from sample_metadata.parser.sample_file_map_parser import SampleFileMapParser
-
 from redcap_parsing_utils import (
     PEDFILE_HEADERS,
     INDIVIDUAL_METADATA_HEADERS,
@@ -17,10 +14,17 @@
     find_fastq_pairs,
 )
 
-PROJECT = "mcri-lrp"
+from metamist.apis import FamilyApi, ImportApi
+from metamist.parser.generic_metadata_parser import run_as_sync
+from metamist.parser.sample_file_map_parser import SampleFileMapParser
+
+
+PROJECT = 'mcri-lrp'
 UPLOAD_BUCKET_SEARCH_PATH = 'gs://cpg-mcri-lrp-main-upload/'
 
+
 def get_individual_sample_ids(individual_id, column_keys, row):
+    """Converts a row of redcap data into a dictionary of sample_id to individual_id"""
     sample_id_to_individual_id = {}
     for key in column_keys:
         if row[key].strip():
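Note: the body of get_individual_sample_ids is cut off by the hunk context above. Its job in this script is to invert redcap's per-row sample ID columns into a sample-to-individual lookup, as the update() calls later in the diff show. A minimal sketch of the intended behaviour, assuming `row` is a csv.DictReader row; the loop body here is a plausible reconstruction, not the exact code in the repo:

    def get_individual_sample_ids(individual_id, column_keys, row):
        """Map each non-empty sample ID found in column_keys to individual_id."""
        sample_id_to_individual_id = {}
        for key in column_keys:
            if row[key].strip():
                sample_id_to_individual_id[row[key].strip()] = individual_id
        return sample_id_to_individual_id

    # Hypothetical redcap row, using column names taken from this script:
    row = {'vcgs_exome_id': 'SAMPLE01', 'stored_dna_sampleno': '', 'vcgs_genome_id': 'SAMPLE02'}
    ids = get_individual_sample_ids(
        'FAM01-A', ['vcgs_exome_id', 'stored_dna_sampleno', 'vcgs_genome_id'], row
    )
    assert ids == {'SAMPLE01': 'FAM01-A', 'SAMPLE02': 'FAM01-A'}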
@@ -35,8 +39,8 @@ def get_individual_sample_ids(individual_id, column_keys, row):
     '-f',
     '--facility',
     type=click.Choice(list(map(lambda x: x.name, Facility)), case_sensitive=False),
-    default="VCGS",
-    help="Facility to use for fastq file parsing.",
+    default='VCGS',
+    help='Facility to use for fastq file parsing.',
 )
 @click.argument('redcap_csv')
 @run_as_sync
@@ -57,7 +61,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
 
     # Sanity check: use redcap auto generated filename to ensure this file is from
     # the acute-care project
-    assert 'AustralianLeukodystr' in redcap_csv, "Is this an LRP redcap csv?"
+    assert 'AustralianLeukodystr' in redcap_csv, 'Is this an LRP redcap csv?'
 
     # Prepare temp out files
     pedfile = tempfile.NamedTemporaryFile(mode='w')
@@ -76,7 +80,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
     filemap_writer.writeheader()
 
     # Parse rows into family units
-    print("Parsing redcap csv")
+    print('Parsing redcap csv')
     samples_by_individual_id = {}
     with open(redcap_csv) as csvfile:
         # The columns in redcap reports, and even their relative order can change
@@ -94,64 +98,93 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
             if not row['pt_study_id']:
                 continue
 
             proband_id = row['pt_study_id']
-            family_id = proband_id.rsplit('-',1)[0]
+            family_id = proband_id.rsplit('-', 1)[0]
             paternal_id = f'{family_id}-2'
             maternal_id = f'{family_id}-1'
 
-            proband_sample_id_columns = ['vcgs_exome_id','stored_dna_sampleno', 'vcgs_genome_id', 'research_rna_program_id']
-            maternal_sample_id_columns = ['maternal_vcgs_exome_id','bio_mat_sam', 'maternal_vcgs_genome_id']
-            paternal_sample_id_columns = ['paternal_vcgs_exome_id','bio_pat_sam', 'bio_pat_sam', 'paternal_vcgs_genome_id']
+            proband_sample_id_columns = [
+                'vcgs_exome_id',
+                'stored_dna_sampleno',
+                'vcgs_genome_id',
+                'research_rna_program_id',
+            ]
+            maternal_sample_id_columns = [
+                'maternal_vcgs_exome_id',
+                'bio_mat_sam',
+                'maternal_vcgs_genome_id',
+            ]
+            paternal_sample_id_columns = [
+                'paternal_vcgs_exome_id',
+                'bio_pat_sam',
+                'paternal_vcgs_genome_id',
+            ]
 
             # collect all of the different sample ids per individual
-            samples_by_individual_id.update(get_individual_sample_ids(proband_id, proband_sample_id_columns, row))
-            samples_by_individual_id.update(get_individual_sample_ids(paternal_id, paternal_sample_id_columns, row))
-            samples_by_individual_id.update(get_individual_sample_ids(maternal_id, maternal_sample_id_columns, row))
+            samples_by_individual_id.update(
+                get_individual_sample_ids(proband_id, proband_sample_id_columns, row)
+            )
+            samples_by_individual_id.update(
+                get_individual_sample_ids(paternal_id, paternal_sample_id_columns, row)
+            )
+            samples_by_individual_id.update(
+                get_individual_sample_ids(maternal_id, maternal_sample_id_columns, row)
+            )
 
             # Write proband pedfile line
-            ped_writer.writerow({
-                'Family ID': family_id,
-                'Individual ID': proband_id,
-                'Paternal ID': paternal_id or "0",
-                'Maternal ID': maternal_id or "0",
-                'Sex': row['dem_sex'],
-                'Affected Status': "2",
-                'Notes': "",
-            })
+            ped_writer.writerow(
+                {
+                    'Family ID': family_id,
+                    'Individual ID': proband_id,
+                    'Paternal ID': paternal_id or '0',
+                    'Maternal ID': maternal_id or '0',
+                    'Sex': row['dem_sex'],
+                    'Affected Status': '2',
+                    'Notes': '',
+                }
+            )
 
             # Write maternal pedfile line
             if maternal_id and proband_id.endswith('-A'):
-                ped_writer.writerow({
-                    'Family ID': family_id,
-                    'Individual ID': maternal_id,
-                    'Paternal ID': "0",
-                    'Maternal ID': "0",
-                    'Sex': "2",
-                    'Affected Status': "1",
-                    'Notes': "",
-                })
+                ped_writer.writerow(
+                    {
+                        'Family ID': family_id,
+                        'Individual ID': maternal_id,
+                        'Paternal ID': '0',
+                        'Maternal ID': '0',
+                        'Sex': '2',
+                        'Affected Status': '1',
+                        'Notes': '',
+                    }
+                )
 
             # Write paternal pedfile line
             if paternal_id and proband_id.endswith('-A'):
-                ped_writer.writerow({
-                    'Family ID': family_id,
-                    'Individual ID': paternal_id,
-                    'Paternal ID': "0",
-                    'Maternal ID': "0",
-                    'Sex': "1",
-                    'Affected Status': "1",
-                    'Notes': "",
-                })
+                ped_writer.writerow(
+                    {
+                        'Family ID': family_id,
+                        'Individual ID': paternal_id,
+                        'Paternal ID': '0',
+                        'Maternal ID': '0',
+                        'Sex': '1',
+                        'Affected Status': '1',
+                        'Notes': '',
+                    }
+                )
 
             # Write individual metadata line (proband only)
-            ind_writer.writerow({
-                'Family ID': family_id,
-                'Individual ID': proband_id,
-                'Individual Notes': row['pt_comments'] + '\n\n' + row['dx_comments'],
-            })
+            ind_writer.writerow(
+                {
+                    'Family ID': family_id,
+                    'Individual ID': proband_id,
+                    'Individual Notes': row['pt_comments']
+                    + '\n\n'
+                    + row['dx_comments'],
+                }
+            )
 
     # Save ped and individual files to disk then write to api
-    print("Saving pedfile:")
+    print('Saving pedfile:')
     pedfile.flush()
     with open(pedfile.name) as p:
         print(p.read())
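The ID scheme in the hunk above: redcap's pt_study_id is the proband ID, the family ID is everything before the final '-', and parents are keyed '-1' (mother) and '-2' (father); parent rows are only written when the proband ID ends in '-A'. A small sketch with an invented study ID, showing the rows the three ped_writer.writerow() calls produce (Sex: 1=male, 2=female; Affected Status: 1=unaffected, 2=affected):

    # 'A0123-A' is a made-up pt_study_id for illustration.
    proband_id = 'A0123-A'
    family_id = proband_id.rsplit('-', 1)[0]  # 'A0123'
    maternal_id = f'{family_id}-1'            # 'A0123-1'
    paternal_id = f'{family_id}-2'            # 'A0123-2'

    # Resulting pedfile rows (Family, Individual, Paternal, Maternal, Sex, Affected):
    #   A0123  A0123-A  A0123-2  A0123-1  <dem_sex>  2
    #   A0123  A0123-1  0        0        2          1
    #   A0123  A0123-2  0        0        1          1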
@@ -166,7 +200,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
     )
     print('API response:', response)
 
-    print("\nSaving Individual metadatafile:")
+    print('\nSaving Individual metadatafile:')
     ind_file.flush()
     with open(ind_file.name) as f:
         print(f.read())
@@ -183,7 +217,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
 
     # Find fastqs in upload bucket
     if not search_path:
-        print("Search_path not provided so skipping file mapping")
+        print('Search_path not provided so skipping file mapping')
     else:
         # Find all fastqs in search path
         found_files = False
@@ -200,14 +234,14 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
                 ),
                 'Type': fastq_pair[0].seq_type.value,
             }
-            ## temporarily skip RNA
+            # temporarily skip RNA
             if 'RNA' in fastq_pair[0].seq_type.value:
                 continue
             filemap_writer.writerow(row)
             found_files = True
 
         if found_files:
-            print("\nSaving filemap")
+            print('\nSaving filemap')
             filemap_file.flush()
             with open(filemap_file.name) as f:
                 print(f.read())
diff --git a/scripts/redcap_parsing_utils.py b/scripts/redcap_parsing_utils.py
index 61b8469b8..d797606d7 100644
--- a/scripts/redcap_parsing_utils.py
+++ b/scripts/redcap_parsing_utils.py
@@ -1,3 +1,4 @@
+# pylint: disable=unused-variable
 from enum import Enum
 from collections import defaultdict
 from cloudpathlib import CloudPath
@@ -26,23 +27,25 @@
     'Description',
     'Coded Phenotype',
 ]
-FILEMAP_HEADERS = ["Individual ID", "Sample ID", "Filenames", "Type"]
+FILEMAP_HEADERS = ['Individual ID', 'Sample ID', 'Filenames', 'Type']
 
 
 class Facility(str, Enum):
-    GARVAN = "Garvan"
-    VCGS = "VCGS"
-    NSWPath = "NSWPath"
-    QPath = "QPath"
+    """Sequencing facilities"""
+
+    GARVAN = 'Garvan'
+    VCGS = 'VCGS'
+    NSWPath = 'NSWPath'
+    QPath = 'QPath'
 
 
 class SeqType(Enum):
     """Sequence types"""
-    GENOME = "genome"
-    EXOME = "exome"
-    TRNA = "totalRNA"
-    MRNA = "mRNA"
+
+    GENOME = 'genome'
+    EXOME = 'exome'
+    TRNA = 'totalRNA'
+    MRNA = 'mRNA'
 
 
 # VCGS library types that appear in fastq names mapped to metamist
@@ -64,7 +67,7 @@ class SeqType(Enum):
 }
 
 
-class FacilityFastq:
+class FacilityFastq:  # pylint: disable=too-many-instance-attributes
     """
     Object for parsing filenames of fastqs prepared by known sequence providers
    """
@@ -83,7 +86,7 @@ def __init__(self, path, facility: Facility) -> None:
         elif facility == Facility.QPath:
             self.parse_qpath_fastq_name()
         else:
-            assert False, f"Facility {facility} not supported"
+            raise AssertionError(f'Facility {facility} not supported')
 
     def parse_vcgs_fastq_name(self):
         """
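The else branch above now raises unconditionally: a bare `assert False` is stripped when Python runs with -O, so the old guard could silently fall through on an unknown facility. Construction then looks roughly like this (the bucket path and filename are invented, and the library-type field must match a key of the elided VCGS library-type mapping for the sequence type to resolve):

    from cloudpathlib import CloudPath

    from redcap_parsing_utils import Facility, FacilityFastq

    # Hypothetical upload path; the filename follows the modern 9-field
    # VCGS layout parsed in the next hunk, with 'TSqNano' standing in for
    # whatever library types the elided mapping actually contains.
    path = CloudPath(
        'gs://cpg-mcri-lrp-main-upload/'
        '220101_A00692_0123_ML001_SAMPLE01-A_LP001_TSqNano_L001_R1.fastq.gz'
    )
    fq = FacilityFastq(path, Facility.VCGS)
    # fq.sample_id == 'SAMPLE01'; fq.read_pair_prefix ends before the _R1 field.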
@@ -104,20 +107,37 @@ def parse_vcgs_fastq_name(self):
         """
         # Sanity checks
         assert self.path.match('*.fastq.gz')
-        assert len(self.path.name.split('_')) == 9
 
         base = self.path.name.rsplit('.', 2)[0]
-        (
-            date,
-            a,
-            b,
-            library_id,
-            sample_id,
-            library_prep_id,
-            library_type,
-            lane,
-            read,
-        ) = base.split('_')
+        if len(self.path.name.split('_')) == 8:
+            # pre ~2018 format
+            (
+                a,
+                b,
+                library_id,
+                sample_id,
+                library_prep_id,
+                library_type,
+                lane,
+                read,
+            ) = base.split('_')
+        elif len(self.path.name.split('_')) == 9:
+            # modern VCGS format
+            (
+                date,
+                a,
+                b,
+                library_id,
+                sample_id,
+                library_prep_id,
+                library_type,
+                lane,
+                read,
+            ) = base.split('_')
+        else:
+            raise AssertionError(
+                f'Unexpected number of fields in filename {self.path.name}'
+            )
 
         self.sample_id = sample_id.rsplit('-', 1)[0]
         self.read_pair_prefix = base.rsplit('_', 1)[0]
@@ -154,7 +174,9 @@ def parse_garvan_fastq_name(self):
             # logic: Garvan do not provide exome seq service
             self.seq_type = SeqType.GENOME
         else:
-            assert False, f"Sample type {self.sample_type} not supported for GARVAN."
+            raise AssertionError(
+                f'Sample type {self.sample_type} not supported for GARVAN.'
+            )
 
     def parse_nswpath_fastq_name(self):
         """
@@ -229,14 +251,14 @@ def find_fastq_pairs(
     read_pairs = defaultdict(list)
 
     if recursive:
-        assert False, "rglob not supported by CloudPathLib?"
+        raise AssertionError('rglob not supported by CloudPathLib?')
         # glob/rglob not supported by CloudPathLib?
         # AttributeError: type object '_CloudPathSelectable' has no attribute '_scandir'. Did you mean: 'scandir'?
         # falling back to non-recursive implementation using iterdir
-        # fastq_iter = search_dir.rglob("*.fastq.gz")
-    # else:
-    #     fastq_iter = search_dir.glob("*.fastq.gz")
+        # fastq_iter = search_dir.rglob('*.fastq.gz')
+    # else:
+    #     fastq_iter = search_dir.glob('*.fastq.gz')
     # for f in fastq_iter:
 
     for f in search_dir.iterdir():
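Taken together, the parser changes mean find_fastq_pairs can group both old- and new-style VCGS uploads. A self-contained sketch of the two filename layouts and the prefix-based pairing (filenames invented; the real function groups FacilityFastq objects by read_pair_prefix rather than raw strings):

    from collections import defaultdict

    names = [
        # 8 fields, pre ~2018: a_b_library_sample_prep_type_lane_read
        'A00692_0123_ML001_SAMPLE01-A_LP001_TSqNano_L001_R1.fastq.gz',
        'A00692_0123_ML001_SAMPLE01-A_LP001_TSqNano_L001_R2.fastq.gz',
        # 9 fields, modern: date_a_b_library_sample_prep_type_lane_read
        '220101_A00692_0456_ML002_SAMPLE02-B_LP002_TSqNano_L001_R1.fastq.gz',
        '220101_A00692_0456_ML002_SAMPLE02-B_LP002_TSqNano_L001_R2.fastq.gz',
    ]

    read_pairs = defaultdict(list)
    for name in names:
        base = name.rsplit('.', 2)[0]          # strip '.fastq.gz'
        fields = base.split('_')
        assert len(fields) in (8, 9), name
        sample_id = fields[-5].rsplit('-', 1)[0]   # 5th-from-last in both layouts
        read_pair_prefix = base.rsplit('_', 1)[0]  # drop the trailing R1/R2 field
        read_pairs[read_pair_prefix].append((sample_id, name))

    # Each prefix now holds exactly one R1/R2 pair.
    assert all(len(pair) == 2 for pair in read_pairs.values())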