Skip to content

Commit b22076e

Browse files
committed
Updated version of rdnow redcap parser
1 parent a4025aa commit b22076e

File tree

1 file changed

+356
-0
lines changed

1 file changed

+356
-0
lines changed

scripts/parse_redcap__rdnow_v2.py

+356
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
""" Parses acute-care redcap dump ingest into metamist """
2+
3+
import csv
4+
import tempfile
5+
import click
6+
7+
from redcap_parsing_utils import (
8+
PEDFILE_HEADERS,
9+
INDIVIDUAL_METADATA_HEADERS,
10+
FAMILY_METADATA_HEADERS,
11+
FILEMAP_HEADERS,
12+
Facility,
13+
find_fastq_pairs,
14+
)
15+
16+
from metamist.apis import FamilyApi
17+
from metamist.parser.generic_metadata_parser import run_as_sync
18+
from metamist.parser.sample_file_map_parser import SampleFileMapParser
19+
20+
21+
PROJECT = 'rdnow'
22+
UPLOAD_BUCKET_SEARCH_PATH = 'gs://cpg-rdnow-main-upload/'
23+
FACILITY = 'VCGS'
24+
25+
26+
def get_individual_sample_ids(individual_id, column_keys, row):
27+
"""
28+
Extracts sample IDs from a row of redcap data retuning a dictionary of sample_ids by
29+
individual id.
30+
"""
31+
sample_id_to_individual_id = {}
32+
for key in column_keys:
33+
if row[key].strip():
34+
sample_id_to_individual_id[row[key].strip()] = individual_id
35+
return sample_id_to_individual_id
36+
37+
38+
def convert_sex(sex_str: str) -> int:
39+
"""converts from male/female to 1/2"""
40+
if sex_str.lower() in ['male', '1']:
41+
return 1
42+
if sex_str.lower() in ['female', '2']:
43+
return 2
44+
return 0
45+
46+
47+
def process_family_row(row: dict):
48+
"""
49+
Extract the pedigree information from the proband and relatives 1-10
50+
Assumptions include:
51+
- rel1 is mum
52+
- rel2 is dad
53+
"""
54+
# Handle this if and when we run into it (ask cas)
55+
if row['rdn_rel_existing_proband'] != '0':
56+
assert False, f'Handle 2nd fam member: {row}'
57+
58+
family_id = row['rdn_family_id']
59+
proband_id = row['rdn_individual_id']
60+
individuals = []
61+
62+
# Proband
63+
mother_id = row['rdn_rel_1_study_id'] if row['rdn_rel_1_study_id'] else 0
64+
father_id = row['rdn_rel_2_study_id'] if row['rdn_rel_2_study_id'] else 0
65+
66+
individuals.append(
67+
{
68+
'Family ID': family_id,
69+
'Individual ID': proband_id,
70+
'Paternal ID': father_id,
71+
'Maternal ID': mother_id,
72+
'Sex': convert_sex(row['rdn_sex']),
73+
'Affected Status': 2,
74+
}
75+
)
76+
77+
# Add an individual for each defined relative
78+
for i in range(1, 9):
79+
# skip if ID not set
80+
if not row[f'rdn_rel_{i}_study_id']:
81+
continue
82+
83+
# Construct parent IDs
84+
if row[f'rdn_rel{i}_mother'] and row[f'rdn_rel{i}_mother'].strip() != '-1':
85+
mother_int = row[f'rdn_rel{i}_mother']
86+
if mother_int == '0':
87+
mother_id = proband_id
88+
else:
89+
mother_id = row[f'rdn_rel_{mother_int}_study_id']
90+
else:
91+
mother_id = 0
92+
93+
if row[f'rdn_rel{i}_father'] and row[f'rdn_rel{i}_father'].strip() != '-1':
94+
father_int = row[f'rdn_rel{i}_father']
95+
if father_int == '0':
96+
father_id = proband_id
97+
else:
98+
father_id = row[f'rdn_rel_{father_int}_study_id']
99+
else:
100+
father_id = 0
101+
102+
individuals.append(
103+
{
104+
'Family ID': family_id,
105+
'Individual ID': row[f'rdn_rel_{i}_study_id'],
106+
'Paternal ID': father_id,
107+
'Maternal ID': mother_id,
108+
'Sex': convert_sex(row[f'rdn_rel_{i}_sex']),
109+
'Affected Status': row[f'rdn_rel_{i}_affected'],
110+
}
111+
)
112+
113+
return proband_id, individuals
114+
115+
116+
def process_sample_row(row: dict, family_row: dict, proband_id: str):
117+
"""
118+
Extract all sample IDs for each individual. Returns a pair of dicts (DNA, RNA)
119+
mapping each sampleID to the matching individual (sample_id, tissue tuple
120+
for the RNA samples).
121+
122+
"""
123+
individuals_by_sample_id__dna = {}
124+
individuals_by_sample_id__rna = {}
125+
126+
# Proband DNA samples
127+
if sample_id := row['rdn_th_seq_id_v2'].strip():
128+
individuals_by_sample_id__dna[sample_id] = proband_id
129+
130+
if sample_id := row['rdn_transfer_id'].strip():
131+
individuals_by_sample_id__dna[sample_id] = proband_id
132+
133+
# Proband RNA
134+
if sample_id := row['rdn_rna_ps_id'].strip():
135+
recorded_tissue = row['rdn_th_seq_specimen_v2'].strip()
136+
137+
# Use some assumed knowledge to determine the sequenced tissue
138+
if recorded_tissue == 'blood':
139+
tissue = 'lcl'
140+
elif recorded_tissue == 'skin':
141+
tissue = 'fibroblast'
142+
elif recorded_tissue == 'other':
143+
if 'lymphoblast' in row['rdn_th_seq_spec_oth_v2'].lower():
144+
tissue = 'lcl'
145+
else:
146+
tissue = row['rdn_th_seq_spec_oth_v2'].strip()
147+
else:
148+
tissue = row['rdn_th_seq_specimen_v2'].strip()
149+
150+
individuals_by_sample_id__rna[sample_id] = (proband_id, tissue)
151+
152+
# Process relatives (DNA only)
153+
for i in range(1, 9):
154+
individual_id = family_row[f'rdn_rel_{i}_study_id'].strip()
155+
156+
if individual_id and f'rdn_th_seq_id_rel{i}_v2' in family_row:
157+
if sample_id := row[f'rdn_th_seq_id_rel{i}_v2'].strip():
158+
print(f"found {sample_id}")
159+
160+
# Handle some mess in how they have set up this redcap
161+
if sample_id == 'N/A':
162+
continue
163+
if len(sample_id.split()) > 1:
164+
for x in sample_id.split():
165+
if 'PS' in x:
166+
sample_id = x
167+
break
168+
169+
individuals_by_sample_id__dna[sample_id] = individual_id
170+
171+
return individuals_by_sample_id__dna, individuals_by_sample_id__rna
172+
173+
174+
@click.command()
175+
@click.option('-p', '--search_path')
176+
@click.option('-d', '--dry_run', is_flag=True, default=False)
177+
@click.option('-t', '--test_dataset', is_flag=True, default=False)
178+
@click.argument('redcap_csv')
179+
@run_as_sync
180+
async def main(
181+
redcap_csv: str, search_path: str, dry_run: bool, test_dataset: bool, facility: str = FACILITY
182+
):
183+
"""
184+
Parse a custom formatted CSV dump from the acute-care Redcap database
185+
and import metadata into metamist.
186+
187+
Due to the structure of the redcap database, this script must infer some
188+
fields based on our knowledge of the project.
189+
190+
Prepares temporary pedigree, individual and family metadata files then uses
191+
metamist api to trigger uploads.
192+
193+
"""
194+
195+
# Sanity check: use redcap auto generated filename to ensure this file is from
196+
# the RDNow project
197+
assert 'RDNowParticipantData-' in redcap_csv, "Is this an RDNow redcap csv?"
198+
199+
if test_dataset:
200+
# use test version of the dataset
201+
project = PROJECT + '-test'
202+
203+
# Prepare temp out files
204+
pedfile = tempfile.NamedTemporaryFile(mode='w')
205+
ind_file = tempfile.NamedTemporaryFile(mode='w')
206+
fam_file = tempfile.NamedTemporaryFile(mode='w')
207+
filemap_file = tempfile.NamedTemporaryFile(mode='w')
208+
209+
fam_writer = csv.DictWriter(fam_file, fieldnames=FAMILY_METADATA_HEADERS)
210+
ped_writer = csv.DictWriter(pedfile, fieldnames=PEDFILE_HEADERS, delimiter=',')
211+
ind_writer = csv.DictWriter(ind_file, fieldnames=INDIVIDUAL_METADATA_HEADERS)
212+
filemap_writer = csv.DictWriter(filemap_file, fieldnames=FILEMAP_HEADERS)
213+
214+
ind_writer.writeheader()
215+
ped_writer.writeheader()
216+
fam_writer.writeheader()
217+
filemap_writer.writeheader()
218+
219+
# Parse rows into family units
220+
print("Parsing redcap csv")
221+
with open(redcap_csv) as csvfile:
222+
# The columns in redcap reports, and even their relative order can change
223+
# without warning. To protect against this we explicitly use the column headers.
224+
reader = csv.DictReader(csvfile, delimiter='\t')
225+
226+
individuals_by_sample_id__dna = {}
227+
individuals_by_sample_id__rna = {}
228+
family_row = None
229+
proband_id = None
230+
for row in reader:
231+
# Process family header row
232+
if row['rdn_individual_id']:
233+
family_row = row
234+
proband_id, individuals = process_family_row(row)
235+
236+
# write individuals to pedfile
237+
for individual in individuals:
238+
ped_writer.writerow(individual)
239+
240+
else:
241+
# Process sample row
242+
dna_sample_map, rna_sample_map = process_sample_row(
243+
row, family_row, proband_id
244+
)
245+
individuals_by_sample_id__dna.update(dna_sample_map)
246+
individuals_by_sample_id__rna.update(rna_sample_map)
247+
248+
# Save ped and individual files to disk then write to api
249+
print('Saving pedfile:')
250+
pedfile.flush()
251+
with open(pedfile.name) as p:
252+
print(p.read())
253+
254+
if not dry_run:
255+
with open(pedfile.name) as p:
256+
response = FamilyApi().import_pedigree(
257+
file=p,
258+
has_header=True,
259+
project=project,
260+
create_missing_participants=True,
261+
)
262+
print('API response:', response)
263+
264+
# TODO: parse fields to extract individual metadata
265+
# print('\nSaving Individual metadatafile:')
266+
# ind_file.flush()
267+
# with open(ind_file.name) as f:
268+
# print(f.read())
269+
270+
# if not dry_run:
271+
# with open(ind_file.name) as f:
272+
# response = ImportApi().import_individual_metadata_manifest(
273+
# file=f,
274+
# project=project,
275+
# )
276+
# print('API response:', response)
277+
278+
print('\n\nDNA sample map:', individuals_by_sample_id__dna, '\n\n')
279+
print('\n\nRNA sample map:', individuals_by_sample_id__rna, '\n\n')
280+
281+
# Find fastqs in upload bucket
282+
if not search_path:
283+
print('Search_path not provided so skipping file mapping')
284+
else:
285+
# Find all fastqs in search path
286+
found_files = False
287+
for sample_id, fastq_pairs in find_fastq_pairs(
288+
search_path, Facility(facility), recursive=False
289+
).items():
290+
291+
# handle concatinated sample id
292+
if len(sample_id.split('-')) > 1:
293+
for x in sample_id.split('-'):
294+
if 'PS' in x:
295+
sample_id = x
296+
break
297+
298+
print('checking:', sample_id)
299+
# Process DNA samples
300+
if sample_id in individuals_by_sample_id__dna:
301+
for fastq_pair in fastq_pairs:
302+
row = {
303+
'Individual ID': individuals_by_sample_id__dna[sample_id],
304+
'Sample ID': sample_id,
305+
'Filenames': ','.join(
306+
sorted([x.path.name for x in fastq_pair])
307+
),
308+
'Type': fastq_pair[0].seq_type.value,
309+
}
310+
# temporarily skip RNA
311+
if 'RNA' in fastq_pair[0].seq_type.value:
312+
continue
313+
filemap_writer.writerow(row)
314+
found_files = True
315+
316+
# Process RNA samples
317+
if sample_id in individuals_by_sample_id__rna:
318+
for fastq_pair in fastq_pairs:
319+
row = {
320+
'Individual ID': individuals_by_sample_id__rna[sample_id][0],
321+
'Sample ID': sample_id,
322+
'Filenames': ','.join(
323+
sorted([x.path.name for x in fastq_pair])
324+
),
325+
'Type': 'transcriptome',
326+
}
327+
filemap_writer.writerow(row)
328+
found_files = True
329+
330+
if found_files:
331+
print('\nSaving filemap')
332+
filemap_file.flush()
333+
with open(filemap_file.name) as f:
334+
print(f.read())
335+
336+
parser = SampleFileMapParser(
337+
project=project,
338+
search_locations=[search_path],
339+
allow_extra_files_in_search_path=True,
340+
)
341+
342+
resp = await parser.from_manifest_path(
343+
manifest=filemap_file.name,
344+
confirm=True,
345+
dry_run=dry_run,
346+
)
347+
print(resp)
348+
else:
349+
print(
350+
'WARNING: did not find any read files to ingest. Remember I do not do recursive search'
351+
)
352+
353+
354+
if __name__ == '__main__':
355+
# pylint: disable=no-value-for-parameter
356+
main()

0 commit comments

Comments
 (0)