|
""" Parses RDNow redcap dump and ingests it into metamist """
| 2 | + |
| 3 | +import csv |
| 4 | +import tempfile |
| 5 | +import click |
| 6 | + |
| 7 | +from redcap_parsing_utils import ( |
| 8 | + PEDFILE_HEADERS, |
| 9 | + INDIVIDUAL_METADATA_HEADERS, |
| 10 | + FAMILY_METADATA_HEADERS, |
| 11 | + FILEMAP_HEADERS, |
| 12 | + Facility, |
| 13 | + find_fastq_pairs, |
| 14 | +) |
| 15 | + |
| 16 | +from metamist.apis import FamilyApi |
| 17 | +from metamist.parser.generic_metadata_parser import run_as_sync |
| 18 | +from metamist.parser.sample_file_map_parser import SampleFileMapParser |
| 19 | + |
| 20 | + |
# Metamist project (dataset) the parsed metadata is ingested into
PROJECT = 'rdnow'
# Bucket searched for fastq files when building the file map
UPLOAD_BUCKET_SEARCH_PATH = 'gs://cpg-rdnow-main-upload/'
# Default sequencing facility; must be a valid redcap_parsing_utils.Facility value
FACILITY = 'VCGS'
| 24 | + |
| 25 | + |
def get_individual_sample_ids(individual_id, column_keys, row):
    """
    Collect sample IDs for one individual from a row of redcap data.

    Each column in `column_keys` is read from `row`; every non-blank value
    (after stripping surrounding whitespace) becomes a key in the returned
    dict, mapped to `individual_id`.
    """
    return {
        sample_id: individual_id
        for key in column_keys
        if (sample_id := row[key].strip())
    }
| 36 | + |
| 37 | + |
def convert_sex(sex_str: str) -> int:
    """Map a male/female (or '1'/'2') string to ped-style 1/2; anything else -> 0."""
    lookup = {'male': 1, '1': 1, 'female': 2, '2': 2}
    return lookup.get(sex_str.lower(), 0)
| 45 | + |
| 46 | + |
def _relative_parent_id(row: dict, i: int, parent: str, proband_id: str):
    """
    Resolve the study ID of relative `i`'s parent (`parent` is 'mother' or 'father').

    Redcap encodes the parent as an index: blank or '-1' means not recorded
    (returns 0), '0' points at the proband, and any other value n points at
    relative n's study ID column.
    """
    raw = row[f'rdn_rel{i}_{parent}']
    if not raw or raw.strip() == '-1':
        return 0
    if raw == '0':
        return proband_id
    return row[f'rdn_rel_{raw}_study_id']


def process_family_row(row: dict):
    """
    Extract the pedigree information from the proband and relatives.

    Assumptions include:
        - rel1 is mum
        - rel2 is dad

    Returns:
        (proband_id, individuals) where individuals is a list of pedigree
        dicts (one for the proband plus one per defined relative).

    Raises:
        ValueError: if the row references an already-existing proband record
            (rdn_rel_existing_proband != '0') — not yet supported.
    """
    # Handle this if and when we run into it (ask cas).
    # Raise instead of `assert False` so the guard survives `python -O`.
    if row['rdn_rel_existing_proband'] != '0':
        raise ValueError(f'Handle 2nd fam member: {row}')

    family_id = row['rdn_family_id']
    proband_id = row['rdn_individual_id']
    individuals = []

    # Proband: parents are assumed to be relative 1 (mum) and relative 2 (dad)
    mother_id = row['rdn_rel_1_study_id'] if row['rdn_rel_1_study_id'] else 0
    father_id = row['rdn_rel_2_study_id'] if row['rdn_rel_2_study_id'] else 0

    individuals.append(
        {
            'Family ID': family_id,
            'Individual ID': proband_id,
            'Paternal ID': father_id,
            'Maternal ID': mother_id,
            'Sex': convert_sex(row['rdn_sex']),
            'Affected Status': 2,  # proband is affected by definition
        }
    )

    # Add an individual for each defined relative
    # NOTE(review): only relatives 1-8 are processed (original docstring said
    # 1-10) — confirm the redcap report never defines rel 9/10 columns.
    for i in range(1, 9):
        # skip if ID not set
        if not row[f'rdn_rel_{i}_study_id']:
            continue

        individuals.append(
            {
                'Family ID': family_id,
                'Individual ID': row[f'rdn_rel_{i}_study_id'],
                'Paternal ID': _relative_parent_id(row, i, 'father', proband_id),
                'Maternal ID': _relative_parent_id(row, i, 'mother', proband_id),
                'Sex': convert_sex(row[f'rdn_rel_{i}_sex']),
                # NOTE(review): relatives keep the raw redcap string here while
                # the proband uses int 2 — confirm downstream accepts both.
                'Affected Status': row[f'rdn_rel_{i}_affected'],
            }
        )

    return proband_id, individuals
| 114 | + |
| 115 | + |
def process_sample_row(row: dict, family_row: dict, proband_id: str):
    """
    Extract all sample IDs for each individual from a redcap sample row.

    Returns a pair of dicts (DNA, RNA): the DNA dict maps sample ID to
    individual ID; the RNA dict maps sample ID to an (individual ID, tissue)
    tuple.
    """
    dna_by_sample_id = {}
    rna_by_sample_id = {}

    # Proband DNA samples (two possible columns)
    for dna_column in ('rdn_th_seq_id_v2', 'rdn_transfer_id'):
        proband_sample = row[dna_column].strip()
        if proband_sample:
            dna_by_sample_id[proband_sample] = proband_id

    # Proband RNA
    rna_sample = row['rdn_rna_ps_id'].strip()
    if rna_sample:
        recorded_tissue = row['rdn_th_seq_specimen_v2'].strip()
        other_tissue = row['rdn_th_seq_spec_oth_v2']

        # Use some assumed knowledge to determine the sequenced tissue
        if recorded_tissue == 'blood':
            tissue = 'lcl'
        elif recorded_tissue == 'skin':
            tissue = 'fibroblast'
        elif recorded_tissue == 'other':
            tissue = 'lcl' if 'lymphoblast' in other_tissue.lower() else other_tissue.strip()
        else:
            tissue = recorded_tissue

        rna_by_sample_id[rna_sample] = (proband_id, tissue)

    # Process relatives (DNA only)
    for i in range(1, 9):
        relative_id = family_row[f'rdn_rel_{i}_study_id'].strip()
        seq_column = f'rdn_th_seq_id_rel{i}_v2'
        if not relative_id or seq_column not in family_row:
            continue

        sample_id = row[seq_column].strip()
        if not sample_id:
            continue
        print(f"found {sample_id}")

        # Handle some mess in how they have set up this redcap
        if sample_id == 'N/A':
            continue
        tokens = sample_id.split()
        if len(tokens) > 1:
            for token in tokens:
                if 'PS' in token:
                    sample_id = token
                    break

        dna_by_sample_id[sample_id] = relative_id

    return dna_by_sample_id, rna_by_sample_id
| 172 | + |
| 173 | + |
@click.command()
@click.option('-p', '--search_path')
@click.option('-d', '--dry_run', is_flag=True, default=False)
@click.option('-t', '--test_dataset', is_flag=True, default=False)
@click.argument('redcap_csv')
@run_as_sync
async def main(
    redcap_csv: str, search_path: str, dry_run: bool, test_dataset: bool, facility: str = FACILITY
):
    """
    Parse a custom formatted CSV dump from the RDNow Redcap database
    and import metadata into metamist.

    Due to the structure of the redcap database, this script must infer some
    fields based on our knowledge of the project.

    Prepares temporary pedigree, individual and family metadata files then uses
    metamist api to trigger uploads.
    """

    # Sanity check: use redcap auto generated filename to ensure this file is
    # from the RDNow project. Raise instead of assert so the check is not
    # stripped under `python -O`.
    if 'RDNowParticipantData-' not in redcap_csv:
        raise ValueError('Is this an RDNow redcap csv?')

    # BUGFIX: `project` was previously assigned only inside the test_dataset
    # branch, so every normal run crashed with NameError at the first API call.
    project = f'{PROJECT}-test' if test_dataset else PROJECT

    # Prepare temp out files (newline='' as the csv module recommends for
    # file objects handed to csv writers)
    pedfile = tempfile.NamedTemporaryFile(mode='w', newline='')
    ind_file = tempfile.NamedTemporaryFile(mode='w', newline='')
    fam_file = tempfile.NamedTemporaryFile(mode='w', newline='')
    filemap_file = tempfile.NamedTemporaryFile(mode='w', newline='')

    fam_writer = csv.DictWriter(fam_file, fieldnames=FAMILY_METADATA_HEADERS)
    ped_writer = csv.DictWriter(pedfile, fieldnames=PEDFILE_HEADERS, delimiter=',')
    ind_writer = csv.DictWriter(ind_file, fieldnames=INDIVIDUAL_METADATA_HEADERS)
    filemap_writer = csv.DictWriter(filemap_file, fieldnames=FILEMAP_HEADERS)

    ind_writer.writeheader()
    ped_writer.writeheader()
    fam_writer.writeheader()
    filemap_writer.writeheader()

    # Parse rows into family units
    print("Parsing redcap csv")
    with open(redcap_csv) as csvfile:
        # The columns in redcap reports, and even their relative order can change
        # without warning. To protect against this we explicitly use the column headers.
        reader = csv.DictReader(csvfile, delimiter='\t')

        individuals_by_sample_id__dna = {}
        individuals_by_sample_id__rna = {}
        family_row = None
        proband_id = None
        for row in reader:
            # Process family header row (the row that carries the individual ID)
            if row['rdn_individual_id']:
                family_row = row
                proband_id, individuals = process_family_row(row)

                # write individuals to pedfile
                for individual in individuals:
                    ped_writer.writerow(individual)

            else:
                # Process sample row (belongs to the most recent family row)
                dna_sample_map, rna_sample_map = process_sample_row(
                    row, family_row, proband_id
                )
                individuals_by_sample_id__dna.update(dna_sample_map)
                individuals_by_sample_id__rna.update(rna_sample_map)

    # Save ped and individual files to disk then write to api
    print('Saving pedfile:')
    pedfile.flush()
    with open(pedfile.name) as p:
        print(p.read())

    if not dry_run:
        with open(pedfile.name) as p:
            response = FamilyApi().import_pedigree(
                file=p,
                has_header=True,
                project=project,
                create_missing_participants=True,
            )
        print('API response:', response)

    # TODO: parse fields to extract individual metadata
    # print('\nSaving Individual metadatafile:')
    # ind_file.flush()
    # with open(ind_file.name) as f:
    #     print(f.read())

    # if not dry_run:
    #     with open(ind_file.name) as f:
    #         response = ImportApi().import_individual_metadata_manifest(
    #             file=f,
    #             project=project,
    #         )
    #         print('API response:', response)

    print('\n\nDNA sample map:', individuals_by_sample_id__dna, '\n\n')
    print('\n\nRNA sample map:', individuals_by_sample_id__rna, '\n\n')

    # Find fastqs in upload bucket
    if not search_path:
        print('Search_path not provided so skipping file mapping')
    else:
        # Find all fastqs in search path
        found_files = False
        for sample_id, fastq_pairs in find_fastq_pairs(
            search_path, Facility(facility), recursive=False
        ).items():

            # handle concatenated sample id (e.g. 'X-PS123'): keep the PS token
            if len(sample_id.split('-')) > 1:
                for x in sample_id.split('-'):
                    if 'PS' in x:
                        sample_id = x
                        break

            print('checking:', sample_id)
            # Process DNA samples
            if sample_id in individuals_by_sample_id__dna:
                for fastq_pair in fastq_pairs:
                    filemap_row = {
                        'Individual ID': individuals_by_sample_id__dna[sample_id],
                        'Sample ID': sample_id,
                        'Filenames': ','.join(
                            sorted([x.path.name for x in fastq_pair])
                        ),
                        'Type': fastq_pair[0].seq_type.value,
                    }
                    # temporarily skip RNA
                    if 'RNA' in fastq_pair[0].seq_type.value:
                        continue
                    filemap_writer.writerow(filemap_row)
                    found_files = True

            # Process RNA samples
            if sample_id in individuals_by_sample_id__rna:
                for fastq_pair in fastq_pairs:
                    filemap_row = {
                        'Individual ID': individuals_by_sample_id__rna[sample_id][0],
                        'Sample ID': sample_id,
                        'Filenames': ','.join(
                            sorted([x.path.name for x in fastq_pair])
                        ),
                        'Type': 'transcriptome',
                    }
                    filemap_writer.writerow(filemap_row)
                    found_files = True

        if found_files:
            print('\nSaving filemap')
            filemap_file.flush()
            with open(filemap_file.name) as f:
                print(f.read())

            parser = SampleFileMapParser(
                project=project,
                search_locations=[search_path],
                allow_extra_files_in_search_path=True,
            )

            resp = await parser.from_manifest_path(
                manifest=filemap_file.name,
                confirm=True,
                dry_run=dry_run,
            )
            print(resp)
        else:
            print(
                'WARNING: did not find any read files to ingest. Remember I do not do recursive search'
            )
| 352 | + |
| 353 | + |
if __name__ == '__main__':
    # pylint: disable=no-value-for-parameter
    # click parses sys.argv and supplies all of main()'s parameters
    main()
0 commit comments