From daca8ccf3ccb6324edd4c9d14d572a1e71404cb6 Mon Sep 17 00:00:00 2001 From: vivbak Date: Sun, 9 Jun 2024 20:00:24 +1000 Subject: [PATCH] Add support for cohorts in create_test_subset script --- scripts/create_test_subset.py | 84 ++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/scripts/create_test_subset.py b/scripts/create_test_subset.py index 250abf284..6605c2347 100755 --- a/scripts/create_test_subset.py +++ b/scripts/create_test_subset.py @@ -166,34 +166,65 @@ """ ) +COHORT_QUERY = gql( + """ + query CohortQuery($project: String!) { + project(name: $project) { + cohorts { + id + sequencingGroups { + sample { + id + } + } + } + } + } + """ +) + def main( project: str, samples_n: int, families_n: int, + cohort_samples_n: int, additional_families: set[str], additional_samples: set[str], + cohorts: set[str], skip_ped: bool = True, ): """ Script creates a test subset for a given project. - A new project with a prefix -test is created, and for any files in sample/meta, + A new project with a suffix -test is created, and for any files in sample/meta, sequence/meta, or analysis/output a copy in the -test namespace is created. """ - - if not any([additional_families, additional_samples, samples_n, families_n]): + if not any( + [additional_families, additional_samples, samples_n, families_n, cohorts] + ): raise ValueError('Come on, what exactly are you asking for?') + if cohorts and not cohort_samples_n: + raise ValueError( + 'You must specify the number of samples to transfer from the cohort.' + ) + # for reproducibility logger.info('Setting random seed to 42') random.seed(42) - # 1. Find and SG IDs to be moved by Family ID -test. + # 1. Find SG IDs to be moved by Family ID to -test. if families_n or additional_families: additional_samples.update( get_sids_for_families(project, families_n, additional_families) ) + # 1.5 Find SG IDs to be moved by Cohort ID to -test. + if cohorts: + additional_samples.update( + get_sids_for_cohorts(project, cohorts, cohort_samples_n) + ) + # 2. Get all sample IDs and their SG IDs in project. logger.info(f'Querying all samples in {project}') sid_output = query(SG_ID_QUERY, variables={'project': project}) @@ -206,7 +237,7 @@ def main( ) # 4. Query all the samples from the selected sgs - logger.info(f'Transfering {len(additional_samples)} samples. Querying metadata.') + logger.info(f'Transferring {len(additional_samples)} samples. Querying metadata.') original_project_subset_data = query( QUERY_ALL_DATA, {'project': project, 'sids': list(additional_samples)} ) @@ -627,6 +658,30 @@ def get_sids_for_families( return included_sids +def get_sids_for_cohorts( + project: str, cohorts: set[str], cohort_samples_n: int +) -> set[str]: + """Returns cohort_samples_n specific samples for given cohort IDs.""" + + cohort_sid_output = query(COHORT_QUERY, {'project': project}) + + all_cohort_groups = cohort_sid_output.get('project', {}).get('cohorts', []) + + all_cohorts_sample_ids_subset: set[str] = set() + for cohort in all_cohort_groups: + sids_for_cohort: list[str] = [] + if cohort.get('id') in cohorts: + seq_groups = cohort.get('sequencingGroups', []) + for seq_group in seq_groups: + sample = seq_group.get('sample') + sids_for_cohort.append(sample['id']) + all_cohorts_sample_ids_subset.update( + random.sample(sids_for_cohort, cohort_samples_n) + ) + + return all_cohorts_sample_ids_subset + + def transfer_families( initial_project: str, target_project: str, internal_participant_ids: list[int] ) -> list[int]: @@ -852,8 +907,16 @@ def file_exists(path: str) -> bool: parser.add_argument( '--project', required=True, help='The sample-metadata project ($DATASET)' ) - parser.add_argument('-n', type=int, help='# Random Samples to copy', default=0) + parser.add_argument( + '-n', type=int, help='# Random Samples to copy', default=DEFAULT_SAMPLES_N + ) parser.add_argument('-f', type=int, help='# Random families to copy', default=0) + parser.add_argument( + '-nsamples-cohort', + type=int, + help='# Random samples to copy from each cohort', + default=0, + ) # Flag to be used when there isn't available pedigree/family information. parser.add_argument( '--skip-ped', @@ -874,6 +937,13 @@ def file_exists(path: str) -> bool: type=str, default={}, ) + parser.add_argument( + '--cohorts', + nargs='+', + help='Cohorts to take random samples from.', + type=str, + default={}, + ) parser.add_argument( '--noninteractive', action='store_true', help='Skip interactive confirmation' ) @@ -886,7 +956,9 @@ def file_exists(path: str) -> bool: project=args.project, samples_n=args.n, families_n=args.f, + cohort_samples_n=args.nsamples_cohort, additional_samples=set(args.samples), additional_families=set(args.families), skip_ped=args.skip_ped, + cohorts=set(args.cohorts), )