Update VCGS filename parser, linting fixes
EddieLF committed Aug 16, 2023
1 parent da7d673 commit a4025aa
Showing 2 changed files with 136 additions and 80 deletions.
140 changes: 87 additions & 53 deletions scripts/parse_redcap__mcri-lrp.py
@@ -1,13 +1,10 @@
""" Parses acute-care redcap dump ingest into metamist """
# pylint: disable=too-many-locals

import csv
import tempfile
import click

from sample_metadata.apis import FamilyApi, ImportApi
from sample_metadata.parser.generic_metadata_parser import run_as_sync
from sample_metadata.parser.sample_file_map_parser import SampleFileMapParser

from redcap_parsing_utils import (
PEDFILE_HEADERS,
INDIVIDUAL_METADATA_HEADERS,
@@ -17,10 +14,17 @@
find_fastq_pairs,
)

PROJECT = "mcri-lrp"
from metamist.apis import FamilyApi, ImportApi
from metamist.parser.generic_metadata_parser import run_as_sync
from metamist.parser.sample_file_map_parser import SampleFileMapParser


PROJECT = 'mcri-lrp'
UPLOAD_BUCKET_SEARCH_PATH = 'gs://cpg-mcri-lrp-main-upload/'


def get_individual_sample_ids(individual_id, column_keys, row):
"""Converts a row of redcap data into a dictionary of sample_id to sequencing_group_id"""
sample_id_to_individual_id = {}
for key in column_keys:
if row[key].strip():
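
The rest of this helper is collapsed in the diff above; a minimal sketch of the full function, under the assumption that the hidden lines simply store each stripped sample ID against the individual:

    def get_individual_sample_ids(individual_id, column_keys, row):
        """Map each non-empty sample ID in the given columns to individual_id."""
        sample_id_to_individual_id = {}
        for key in column_keys:
            if row[key].strip():
                # assumed body: key the stripped sample ID to this individual
                sample_id_to_individual_id[row[key].strip()] = individual_id
        return sample_id_to_individual_id
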
@@ -35,8 +39,8 @@ def get_individual_sample_ids(individual_id, column_keys, row):
'-f',
'--facility',
type=click.Choice(list(map(lambda x: x.name, Facility)), case_sensitive=False),
default="VCGS",
help="Facility to use for fastq file parsing.",
default='VCGS',
help='Facility to use for fastq file parsing.',
)
@click.argument('redcap_csv')
@run_as_sync
@@ -57,7 +61,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):

# Sanity check: use the redcap auto-generated filename to ensure this file is
# from the LRP project
assert 'AustralianLeukodystr' in redcap_csv, "Is this an LRP redcap csv?"
assert 'AustralianLeukodystr' in redcap_csv, 'Is this an LRP redcap csv?'

# Prepare temp out files
pedfile = tempfile.NamedTemporaryFile(mode='w')
@@ -76,7 +80,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
filemap_writer.writeheader()

# Parse rows into family units
print("Parsing redcap csv")
print('Parsing redcap csv')
samples_by_individual_id = {}
with open(redcap_csv) as csvfile:
# The columns in redcap reports, and even their relative order can change
@@ -94,64 +98,94 @@
if not row['pt_study_id']:
continue
proband_id = row['pt_study_id']
family_id = proband_id.rsplit('-',1)[0]
family_id = proband_id.rsplit('-', 1)[0]

paternal_id = f'{family_id}-2'
maternal_id = f'{family_id}-1'

proband_sample_id_columns = ['vcgs_exome_id','stored_dna_sampleno', 'vcgs_genome_id', 'research_rna_program_id']
maternal_sample_id_columns = ['maternal_vcgs_exome_id','bio_mat_sam', 'maternal_vcgs_genome_id']
paternal_sample_id_columns = ['paternal_vcgs_exome_id','bio_pat_sam', 'bio_pat_sam', 'paternal_vcgs_genome_id']
proband_sample_id_columns = [
'vcgs_exome_id',
'stored_dna_sampleno',
'vcgs_genome_id',
'research_rna_program_id',
]
maternal_sample_id_columns = [
'maternal_vcgs_exome_id',
'bio_mat_sam',
'maternal_vcgs_genome_id',
]
paternal_sample_id_columns = [
'paternal_vcgs_exome_id',
'bio_pat_sam',
'paternal_vcgs_genome_id',
]

# collect all of the different sample ids per individual
samples_by_individual_id.update(get_individual_sample_ids(proband_id, proband_sample_id_columns, row))
samples_by_individual_id.update(get_individual_sample_ids(paternal_id, paternal_sample_id_columns, row))
samples_by_individual_id.update(get_individual_sample_ids(maternal_id, maternal_sample_id_columns, row))
samples_by_individual_id.update(
get_individual_sample_ids(proband_id, proband_sample_id_columns, row)
)
samples_by_individual_id.update(
get_individual_sample_ids(paternal_id, paternal_sample_id_columns, row)
)
samples_by_individual_id.update(
get_individual_sample_ids(maternal_id, maternal_sample_id_columns, row)
)

# Write proband pedfile line
ped_writer.writerow({
'Family ID': family_id,
'Individual ID': proband_id,
'Paternal ID': paternal_id or "0",
'Maternal ID': maternal_id or "0",
'Sex': row['dem_sex'],
'Affected Status': "2",
'Notes': "",
})
ped_writer.writerow(
{
'Family ID': family_id,
'Individual ID': proband_id,
'Paternal ID': paternal_id or '0',
'Maternal ID': maternal_id or '0',
'Sex': row['dem_sex'],
'Affected Status': '2',
'Notes': '',
}
)

# Write maternal pedfile line
if maternal_id and proband_id.endswith('-A'):
ped_writer.writerow({
'Family ID': family_id,
'Individual ID': maternal_id,
'Paternal ID': "0",
'Maternal ID': "0",
'Sex': "2",
'Affected Status': "1",
'Notes': "",
})
ped_writer.writerow(
{
'Family ID': family_id,
'Individual ID': maternal_id,
'Paternal ID': '0',
'Maternal ID': '0',
'Sex': '2',
'Affected Status': '1',
'Notes': '',
}
)

# Write paternal pedfile line
if paternal_id and proband_id.endswith('-A'):
ped_writer.writerow({
'Family ID': family_id,
'Individual ID': paternal_id,
'Paternal ID': "0",
'Maternal ID': "0",
'Sex': "1",
'Affected Status': "1",
'Notes': "",
})
ped_writer.writerow(
{
'Family ID': family_id,
'Individual ID': paternal_id,
'Paternal ID': '0',
'Maternal ID': '0',
'Sex': '1',
'Affected Status': '1',
'Notes': '',
}
)

# Write individual metadata line (proband only)
ind_writer.writerow({
'Family ID': family_id,
'Individual ID': proband_id,
'Individual Notes': row['pt_comments'] + '\n\n' + row['dx_comments'],
})
ind_writer.writerow(
{
'Family ID': family_id,
'Individual ID': proband_id,
'Individual Notes': row['pt_comments']
+ '\n\n'
+ row['dx_comments'],
}
)

# Save ped and individual files to disk then write to api
print("Saving pedfile:")
print('Saving pedfile:')
pedfile.flush()
with open(pedfile.name) as p:
print(p.read())
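
The pedfile written above follows the standard six-column PED layout plus a Notes column. A minimal, self-contained sketch of the writing pattern with invented IDs (the exact PEDFILE_HEADERS values and the tab delimiter are assumptions, since the writer setup is collapsed in this diff):

    import csv
    import tempfile

    # assumed to match PEDFILE_HEADERS imported from redcap_parsing_utils
    PEDFILE_HEADERS = ['Family ID', 'Individual ID', 'Paternal ID',
                       'Maternal ID', 'Sex', 'Affected Status', 'Notes']

    pedfile = tempfile.NamedTemporaryFile(mode='w')
    ped_writer = csv.DictWriter(pedfile, fieldnames=PEDFILE_HEADERS, delimiter='\t')
    ped_writer.writeheader()
    ped_writer.writerow({
        'Family ID': 'FAM01', 'Individual ID': 'FAM01-A',
        'Paternal ID': 'FAM01-2', 'Maternal ID': 'FAM01-1',
        'Sex': '1', 'Affected Status': '2', 'Notes': '',
    })
    pedfile.flush()
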
@@ -166,7 +200,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
)
print('API response:', response)

print("\nSaving Individual metadatafile:")
print('\nSaving Individual metadatafile:')
ind_file.flush()
with open(ind_file.name) as f:
print(f.read())
@@ -183,7 +217,7 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):

# Find fastqs in upload bucket
if not search_path:
print("Search_path not provided so skipping file mapping")
print('Search_path not provided so skipping file mapping')
else:
# Find all fastqs in search path
found_files = False
@@ -200,14 +234,14 @@ async def main(redcap_csv: str, search_path: str, facility: str, dry_run: bool):
),
'Type': fastq_pair[0].seq_type.value,
}
## temporarily skip RNA
# temporarily skip RNA
if 'RNA' in fastq_pair[0].seq_type.value:
continue
filemap_writer.writerow(row)
found_files = True

if found_files:
print("\nSaving filemap")
print('\nSaving filemap')
filemap_file.flush()
with open(filemap_file.name) as f:
print(f.read())
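Based on the visible click options, the script takes a facility flag and the redcap report path, for example:

    python scripts/parse_redcap__mcri-lrp.py --facility VCGS AustralianLeukodystr_report.csv

main() also accepts search_path and dry_run, but their option declarations are collapsed in this diff, so their flag spellings are not shown here.
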
76 changes: 49 additions & 27 deletions scripts/redcap_parsing_utils.py
@@ -1,3 +1,4 @@
# pylint: disable=unused-variable
from enum import Enum
from collections import defaultdict
from cloudpathlib import CloudPath
@@ -26,23 +27,25 @@
'Description',
'Coded Phenotype',
]
FILEMAP_HEADERS = ["Individual ID", "Sample ID", "Filenames", "Type"]
FILEMAP_HEADERS = ['Individual ID', 'Sample ID', 'Filenames', 'Type']


class Facility(str, Enum):
GARVAN = "Garvan"
VCGS = "VCGS"
NSWPath = "NSWPath"
QPath = "QPath"
"""Sequencing facilities"""

GARVAN = 'Garvan'
VCGS = 'VCGS'
NSWPath = 'NSWPath'
QPath = 'QPath'


class SeqType(Enum):
"""Sequence types"""

GENOME = "genome"
EXOME = "exome"
TRNA = "totalRNA"
MRNA = "mRNA"
GENOME = 'genome'
EXOME = 'exome'
TRNA = 'totalRNA'
MRNA = 'mRNA'


# VCGS library types that appear in fastq names mapped to metamist
@@ -64,7 +67,7 @@ class SeqType(Enum):
}
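
The entries of this library-type mapping are collapsed above; a hypothetical illustration of how it would be used, with an invented dict name and invented keys (only SeqType itself is defined in this file):

    # hypothetical name and entries, for illustration only
    VCGS_SEQTYPE_BY_LIBRARY_TYPE = {
        'TSqNano': SeqType.GENOME,
        'SSQXTCRE': SeqType.EXOME,
    }
    seq_type = VCGS_SEQTYPE_BY_LIBRARY_TYPE['TSqNano']  # -> SeqType.GENOME
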


class FacilityFastq:
class FacilityFastq: # pylint: disable=too-many-instance-attributes
"""
Object for parsing filenames of fastqs prepared by known sequence providers
"""
@@ -83,7 +86,7 @@ def __init__(self, path, facility: Facility) -> None:
elif facility == Facility.QPath:
self.parse_qpath_fastq_name()
else:
assert False, f"Facility {facility} not supported"
raise AssertionError(f'Facility {facility} not supported')

def parse_vcgs_fastq_name(self):
"""
@@ -104,20 +107,37 @@ def parse_vcgs_fastq_name(self):
"""
# Sanity checks
assert self.path.match('*.fastq.gz')
assert len(self.path.name.split('_')) == 9

base = self.path.name.rsplit('.', 2)[0]
(
date,
a,
b,
library_id,
sample_id,
library_prep_id,
library_type,
lane,
read,
) = base.split('_')
if len(self.path.name.split('_')) == 8:
# pre ~2018 format
(
a,
b,
library_id,
sample_id,
library_prep_id,
library_type,
lane,
read,
) = base.split('_')
elif len(self.path.name.split('_')) == 9:
# modern VCGS format
(
date,
a,
b,
library_id,
sample_id,
library_prep_id,
library_type,
lane,
read,
) = base.split('_')
else:
raise AssertionError(
f'Unexpected number of fields in filename {self.path.name}'
)

self.sample_id = sample_id.rsplit('-', 1)[0]
self.read_pair_prefix = base.rsplit('_', 1)[0]
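
To make the two branches concrete, a hedged walk-through with an invented nine-field filename (the real VCGS naming scheme is documented in the collapsed docstring above, so every field value here is illustrative only):

    # invented modern-format (9-field) name, for illustration only
    name = '220101_AHFY77DSX2_S1_LIB123_SAMPLE01-XY_PREP1_TSqNano_L001_R1.fastq.gz'
    base = name.rsplit('.', 2)[0]              # strips '.fastq.gz'
    fields = base.split('_')                   # 9 fields -> modern format branch
    assert len(fields) == 9
    sample_id = fields[4].rsplit('-', 1)[0]    # 'SAMPLE01'
    read_pair_prefix = base.rsplit('_', 1)[0]  # everything before the read token
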
@@ -154,7 +174,9 @@ def parse_garvan_fastq_name(self):
# logic: Garvan does not provide an exome sequencing service
self.seq_type = SeqType.GENOME
else:
assert False, f"Sample type {self.sample_type} not supported for GARVAN."
raise AssertionError(
f'Sample type {self.sample_type} not supported for GARVAN.'
)

def parse_nswpath_fastq_name(self):
"""
@@ -229,14 +251,14 @@ def find_fastq_pairs(
read_pairs = defaultdict(list)

if recursive:
assert False, "rglob not supported by CloudPathLib?"
raise AssertionError('rglob not supported by CloudPathLib?')
# glob/rglob not supported by CloudPathLib?
# AttributeError: type object '_CloudPathSelectable' has no attribute '_scandir'. Did you mean: 'scandir'?
# falling back to non-recursive implementation using iterdir

# fastq_iter = search_dir.rglob("*.fastq.gz")
# fastq_iter = search_dir.rglob('*.fastq.gz')
# else:
# fastq_iter = search_dir.glob("*.fastq.gz")
# fastq_iter = search_dir.glob('*.fastq.gz')

# for f in fastq_iter:
for f in search_dir.iterdir():
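
The body of the iterdir() loop is collapsed above; a minimal, self-contained sketch of the pairing idea it implements, grouping read 1/read 2 files by their shared filename prefix:

    from collections import defaultdict

    names = ['S1_L001_R1.fastq.gz', 'S1_L001_R2.fastq.gz', 'S2_L001_R1.fastq.gz']
    read_pairs = defaultdict(list)
    for name in names:
        # drop the extension, then the trailing read token (R1/R2)
        prefix = name.rsplit('.', 2)[0].rsplit('_', 1)[0]
        read_pairs[prefix].append(name)
    # read_pairs == {'S1_L001': [R1 and R2 files], 'S2_L001': [R1 file only]}
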
