Skip to content

Commit 8c7d0ff

Browse files
committed
Fixes + Cleanup
1 parent b04f6ae commit 8c7d0ff

File tree

1 file changed

+61
-44
lines changed

1 file changed

+61
-44
lines changed

scripts/create_test_subset.py

+61-44
Original file line numberDiff line numberDiff line change
@@ -52,22 +52,6 @@
5252

5353
DEFAULT_SAMPLES_N = 10
5454

55-
SG_ID_QUERY = gql(
56-
"""
57-
query getSGIds($project: String!) {
58-
project(name: $project) {
59-
samples{
60-
id
61-
externalId
62-
sequencingGroups {
63-
id
64-
}
65-
}
66-
}
67-
}
68-
"""
69-
)
70-
7155
QUERY_ALL_DATA = gql(
7256
"""
7357
query getAllData($project: String!, $sids: [String!]) {
@@ -157,6 +141,33 @@
157141
"""
158142
)
159143

144+
SG_ID_QUERY = gql(
145+
"""
146+
query getSGIds($project: String!) {
147+
project(name: $project) {
148+
samples{
149+
id
150+
externalId
151+
sequencingGroups {
152+
id
153+
}
154+
}
155+
}
156+
}
157+
"""
158+
)
159+
160+
PARTICIPANT_QUERY = """
161+
query ($project: String!) {
162+
project (externalId: $project) {
163+
participants {
164+
id
165+
externalId
166+
}
167+
}
168+
}
169+
"""
170+
160171

161172
@click.command()
162173
@click.option(
@@ -221,6 +232,7 @@ def main(
221232
_additional_families: list[str] = list(additional_families)
222233
_additional_samples: list[str] = list(additional_samples)
223234

235+
# 1. Determine the sids to be moved into -test.
224236
specific_sids = _get_sids_for_families(
225237
project,
226238
families_n,
@@ -233,29 +245,26 @@ def main(
233245

234246
specific_sids = specific_sids + _additional_samples
235247

236-
# 1. Query all the SGIDS
248+
# 2. Get all sids in project.
237249
sid_output = query(SG_ID_QUERY, variables={'project': project})
238250
all_sids = [sid['id'] for sid in sid_output.get('project').get('samples')]
239251

240-
# 2. Subtract the specific_sgs from all the sgs
252+
# 3. Subtract the specific_sgs from all the sgs
241253
sgids_after_inclusions = list(set(all_sids) - set(specific_sids))
242-
# 3. Randomly select from the remaining sgs
254+
# 4. Randomly select from the remaining sgs
243255
random_sgs: list[str] = []
244256
random.seed(42) # for reproducibility
245257
if (samples_n - len(specific_sids)) > 0:
246258
random_sgs = random.sample(
247259
sgids_after_inclusions, samples_n - len(specific_sids)
248260
)
249-
# 4. Add the specific_sgs to the randomly selected sgs
261+
# 5. Add the specific_sgs to the randomly selected sgs
250262
final_subset_sids = specific_sids + random_sgs
251-
# 5. Query all the samples from the selected sgs
263+
# 6. Query all the samples from the selected sgs
252264
original_project_subset_data = query(
253265
QUERY_ALL_DATA, {'project': project, 'sids': final_subset_sids}
254266
)
255267

256-
# Populating test project
257-
target_project = project + '-test'
258-
259268
# Pull Participant Data
260269
participant_data = []
261270
participant_ids: list = []
@@ -265,6 +274,9 @@ def main(
265274
participant_data.append(participant)
266275
participant_ids.append(participant.get('externalId'))
267276

277+
# Populating test project
278+
target_project = project + '-test'
279+
268280
# Parse Families & Participants
269281
if skip_ped:
270282
# If no family data is available, only the participants should be transferred.
@@ -275,7 +287,7 @@ def main(
275287

276288
else:
277289
family_ids = transfer_families(project, target_project, participant_ids)
278-
transfer_ped(project, target_project, family_ids)
290+
upserted_participant_map = transfer_ped(project, target_project, family_ids)
279291

280292
existing_data = query(EXISTING_DATA_QUERY, {'project': target_project})
281293

@@ -304,7 +316,7 @@ def transfer_samples_sgs_assays(
304316
_existing_sg = _get_existing_sg(
305317
existing_data, s.get('externalId'), sg.get('type')
306318
)
307-
_existing_sgid = _existing_sg.get('id', None)
319+
_existing_sgid = _existing_sg.get('id') if _existing_sg else None
308320
for assay in sg.get('assays'):
309321
_existing_assay: dict[str, str] = {}
310322
if _existing_sgid:
@@ -314,9 +326,12 @@ def transfer_samples_sgs_assays(
314326
_existing_sgid,
315327
assay.get('type'),
316328
)
329+
existing_assay_id = (
330+
_existing_assay.get('id') if _existing_assay else None
331+
)
317332
assay_upsert = AssayUpsert(
318333
type=assay.get('type'),
319-
id=_existing_assay.get('id', None),
334+
id=existing_assay_id,
320335
external_ids=assay.get('externalIds') or {},
321336
# sample_id=self.s,
322337
meta=assay.get('meta'),
@@ -380,13 +395,12 @@ def transfer_analyses(
380395
_existing_sg = _get_existing_sg(
381396
existing_data, s.get('externalId'), sg.get('type')
382397
)
383-
_existing_sgid = _existing_sg.get('id', None)
398+
_existing_sgid = _existing_sg.get('id') if _existing_sg else None
384399
for analysis in sg['analyses']:
385400
if analysis['type'] not in ['cram', 'gvcf']:
386401
# Currently the create_test_subset script only handles crams or gvcf files.
387402
continue
388403

389-
_existing_analysis_id = None
390404
_existing_analysis: dict = {}
391405
if _existing_sgid:
392406
_existing_analysis = _get_existing_analysis(
@@ -395,7 +409,9 @@ def transfer_analyses(
395409
_existing_sgid,
396410
analysis['type'],
397411
)
398-
_existing_analysis_id = _existing_analysis.get('id', None)
412+
_existing_analysis_id = (
413+
_existing_analysis.get('id') if _existing_analysis else None
414+
)
399415
if _existing_analysis_id:
400416
am = AnalysisUpdateModel(
401417
type=analysis['type'],
@@ -443,11 +459,12 @@ def _get_existing_sg(
443459
if not sg_type and not sg_id:
444460
raise ValueError('Must provide sg_type or sg_id when getting exsisting sg')
445461
sample = _get_existing_sample(existing_data, sample_id)
446-
for sg in sample.get('sequencingGroups'):
447-
if sg_id and sg.get('id') == sg_id:
448-
return sg
449-
if sg_type and sg.get('type') == sg_type:
450-
return sg
462+
if sample:
463+
for sg in sample.get('sequencingGroups'):
464+
if sg_id and sg.get('id') == sg_id:
465+
return sg
466+
if sg_type and sg.get('type') == sg_type:
467+
return sg
451468

452469
return None
453470

@@ -564,20 +581,13 @@ def transfer_families(
564581

565582
def transfer_ped(
566583
initial_project: str, target_project: str, family_ids: list[int]
567-
) -> list[str]:
584+
) -> dict[str, int]:
568585
"""Pull pedigree from the input project, and copy to target_project"""
569586
ped_tsv = fapi.get_pedigree(
570587
initial_project,
571588
export_type='tsv',
572589
internal_family_ids=family_ids,
573590
)
574-
ped_json = fapi.get_pedigree(
575-
initial_project,
576-
export_type='json',
577-
internal_family_ids=family_ids,
578-
)
579-
580-
external_participant_ids = [ped['individual_id'] for ped in ped_json]
581591
tmp_ped_tsv = 'tmp_ped.tsv'
582592
# Work-around as import_pedigree takes a file.
583593
with open(tmp_ped_tsv, 'w') as tmp_ped:
@@ -591,7 +601,14 @@ def transfer_ped(
591601
create_missing_participants=True,
592602
)
593603

594-
return external_participant_ids
604+
# Get map of external participant id to internal
605+
participant_output = query(PARTICIPANT_QUERY, {'project': target_project})
606+
participant_map = {
607+
participant['externalId']: participant['id']
608+
for participant in participant_output.get('project').get('participants')
609+
}
610+
611+
return participant_map
595612

596613

597614
def transfer_participants(

0 commit comments

Comments (0)