52
52
53
53
DEFAULT_SAMPLES_N = 10
54
54
55
- SG_ID_QUERY = gql (
56
- """
57
- query getSGIds($project: String!) {
58
- project(name: $project) {
59
- samples{
60
- id
61
- externalId
62
- sequencingGroups {
63
- id
64
- }
65
- }
66
- }
67
- }
68
- """
69
- )
70
-
71
55
QUERY_ALL_DATA = gql (
72
56
"""
73
57
query getAllData($project: String!, $sids: [String!]) {
157
141
"""
158
142
)
159
143
144
+ SG_ID_QUERY = gql (
145
+ """
146
+ query getSGIds($project: String!) {
147
+ project(name: $project) {
148
+ samples{
149
+ id
150
+ externalId
151
+ sequencingGroups {
152
+ id
153
+ }
154
+ }
155
+ }
156
+ }
157
+ """
158
+ )
159
+
160
+ PARTICIPANT_QUERY = """
161
+ query ($project: String!) {
162
+ project (externalId: $project) {
163
+ participants {
164
+ id
165
+ externalId
166
+ }
167
+ }
168
+ }
169
+ """
170
+
160
171
161
172
@click .command ()
162
173
@click .option (
@@ -221,6 +232,7 @@ def main(
221
232
_additional_families : list [str ] = list (additional_families )
222
233
_additional_samples : list [str ] = list (additional_samples )
223
234
235
+ # 1. Determine the sids to be moved into -test.
224
236
specific_sids = _get_sids_for_families (
225
237
project ,
226
238
families_n ,
@@ -233,29 +245,26 @@ def main(
233
245
234
246
specific_sids = specific_sids + _additional_samples
235
247
236
- # 1. Query all the SGIDS
248
+ # 2. Get all sids in project.
237
249
sid_output = query (SG_ID_QUERY , variables = {'project' : project })
238
250
all_sids = [sid ['id' ] for sid in sid_output .get ('project' ).get ('samples' )]
239
251
240
- # 2 . Subtract the specific_sgs from all the sgs
252
+ # 3. Subtract the specific sids from all the sids in the project
241
253
sgids_after_inclusions = list (set (all_sids ) - set (specific_sids ))
242
- # 3 . Randomly select from the remaining sgs
254
+ # 4. Randomly select from the remaining sids
243
255
random_sgs : list [str ] = []
244
256
random .seed (42 ) # for reproducibility
245
257
if (samples_n - len (specific_sids )) > 0 :
246
258
random_sgs = random .sample (
247
259
sgids_after_inclusions , samples_n - len (specific_sids )
248
260
)
249
- # 4 . Add the specific_sgs to the randomly selected sgs
261
+ # 5. Add the specific sids to the randomly selected sids
250
262
final_subset_sids = specific_sids + random_sgs
251
- # 5 . Query all the samples from the selected sgs
263
+ # 6. Query all the data for the selected sids
252
264
original_project_subset_data = query (
253
265
QUERY_ALL_DATA , {'project' : project , 'sids' : final_subset_sids }
254
266
)
255
267
256
- # Populating test project
257
- target_project = project + '-test'
258
-
259
268
# Pull Participant Data
260
269
participant_data = []
261
270
participant_ids : list = []
@@ -265,6 +274,9 @@ def main(
265
274
participant_data .append (participant )
266
275
participant_ids .append (participant .get ('externalId' ))
267
276
277
+ # Name of the test project to populate
278
+ target_project = project + '-test'
279
+
268
280
# Parse Families & Participants
269
281
if skip_ped :
270
282
# If no family data is available, only the participants should be transferred.
@@ -275,7 +287,7 @@ def main(
275
287
276
288
else :
277
289
family_ids = transfer_families (project , target_project , participant_ids )
278
- transfer_ped (project , target_project , family_ids )
290
+ upserted_participant_map = transfer_ped (project , target_project , family_ids )
279
291
280
292
existing_data = query (EXISTING_DATA_QUERY , {'project' : target_project })
281
293
@@ -304,7 +316,7 @@ def transfer_samples_sgs_assays(
304
316
_existing_sg = _get_existing_sg (
305
317
existing_data , s .get ('externalId' ), sg .get ('type' )
306
318
)
307
- _existing_sgid = _existing_sg .get ('id' , None )
319
+ _existing_sgid = _existing_sg .get ('id' ) if _existing_sg else None
308
320
for assay in sg .get ('assays' ):
309
321
_existing_assay : dict [str , str ] = {}
310
322
if _existing_sgid :
@@ -314,9 +326,12 @@ def transfer_samples_sgs_assays(
314
326
_existing_sgid ,
315
327
assay .get ('type' ),
316
328
)
329
+ existing_assay_id = (
330
+ _existing_assay .get ('id' ) if _existing_assay else None
331
+ )
317
332
assay_upsert = AssayUpsert (
318
333
type = assay .get ('type' ),
319
- id = _existing_assay . get ( 'id' , None ) ,
334
+ id = existing_assay_id ,
320
335
external_ids = assay .get ('externalIds' ) or {},
321
336
# sample_id=self.s,
322
337
meta = assay .get ('meta' ),
@@ -380,13 +395,12 @@ def transfer_analyses(
380
395
_existing_sg = _get_existing_sg (
381
396
existing_data , s .get ('externalId' ), sg .get ('type' )
382
397
)
383
- _existing_sgid = _existing_sg .get ('id' , None )
398
+ _existing_sgid = _existing_sg .get ('id' ) if _existing_sg else None
384
399
for analysis in sg ['analyses' ]:
385
400
if analysis ['type' ] not in ['cram' , 'gvcf' ]:
386
401
# Currently the create_test_subset script only handles crams or gvcf files.
387
402
continue
388
403
389
- _existing_analysis_id = None
390
404
_existing_analysis : dict = {}
391
405
if _existing_sgid :
392
406
_existing_analysis = _get_existing_analysis (
@@ -395,7 +409,9 @@ def transfer_analyses(
395
409
_existing_sgid ,
396
410
analysis ['type' ],
397
411
)
398
- _existing_analysis_id = _existing_analysis .get ('id' , None )
412
+ _existing_analysis_id = (
413
+ _existing_analysis .get ('id' ) if _existing_analysis else None
414
+ )
399
415
if _existing_analysis_id :
400
416
am = AnalysisUpdateModel (
401
417
type = analysis ['type' ],
@@ -443,11 +459,12 @@ def _get_existing_sg(
443
459
if not sg_type and not sg_id :
444
460
raise ValueError ('Must provide sg_type or sg_id when getting exsisting sg' )
445
461
sample = _get_existing_sample (existing_data , sample_id )
446
- for sg in sample .get ('sequencingGroups' ):
447
- if sg_id and sg .get ('id' ) == sg_id :
448
- return sg
449
- if sg_type and sg .get ('type' ) == sg_type :
450
- return sg
462
+ if sample :
463
+ for sg in sample .get ('sequencingGroups' ):
464
+ if sg_id and sg .get ('id' ) == sg_id :
465
+ return sg
466
+ if sg_type and sg .get ('type' ) == sg_type :
467
+ return sg
451
468
452
469
return None
453
470
@@ -564,20 +581,13 @@ def transfer_families(
564
581
565
582
def transfer_ped (
566
583
initial_project : str , target_project : str , family_ids : list [int ]
567
- ) -> list [str ]:
584
+ ) -> dict [str , int ]:
568
585
"""Pull pedigree from the input project, and copy to target_project"""
569
586
ped_tsv = fapi .get_pedigree (
570
587
initial_project ,
571
588
export_type = 'tsv' ,
572
589
internal_family_ids = family_ids ,
573
590
)
574
- ped_json = fapi .get_pedigree (
575
- initial_project ,
576
- export_type = 'json' ,
577
- internal_family_ids = family_ids ,
578
- )
579
-
580
- external_participant_ids = [ped ['individual_id' ] for ped in ped_json ]
581
591
tmp_ped_tsv = 'tmp_ped.tsv'
582
592
# Work-around as import_pedigree takes a file.
583
593
with open (tmp_ped_tsv , 'w' ) as tmp_ped :
@@ -591,7 +601,14 @@ def transfer_ped(
591
601
create_missing_participants = True ,
592
602
)
593
603
594
- return external_participant_ids
604
+ # Get map of external participant id to internal
605
+ participant_output = query (PARTICIPANT_QUERY , {'project' : target_project })
606
+ participant_map = {
607
+ participant ['externalId' ]: participant ['id' ]
608
+ for participant in participant_output .get ('project' ).get ('participants' )
609
+ }
610
+
611
+ return participant_map
595
612
596
613
597
614
def transfer_participants (
0 commit comments