From 654d1be6da47ee5fabf2cf8e3760a4dfe78ac6a9 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Mon, 18 Sep 2023 14:51:04 +1000 Subject: [PATCH] clean up selection logic --- scripts/new_test_subset.py | 48 ++++++++++++++------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/scripts/new_test_subset.py b/scripts/new_test_subset.py index 03d88494c..dc78d6fbf 100644 --- a/scripts/new_test_subset.py +++ b/scripts/new_test_subset.py @@ -661,20 +661,28 @@ def main( raise SystemExit() random.seed(42) # for reproducibility + + # end goal of this initial step is having a collection of SGs we want to transfer + # first process the specific SG/Families + # sample_set here will contain SG IDs, or be empty + # if we manually specify families, add their corresponding IDs to that set + if family_set: + sample_set |= set( + chain.from_iterable( + v for k, v in main_sgs_by_family.items() if k in family_set + ) + ) + if families_n: - # if there are additional samples specified, find the corresponding families - # these specifically requested families & samples are copied over in addition - # to the random selection families_n number of families - if sample_set: - family_set |= {metamist_main_content[sgid]['family'] for sgid in sample_set} - - # family_set is ones we definitely want to include - # we need to make sure we don't include them in the random selection + # find the family IDs of all SGs we specifically want to transfer + family_set |= {metamist_main_content[sgid]['family'] for sgid in sample_set} + + # family_set is ones we definitely want to include. 
Do a size-weighted + # subset of the remaining families, removing specified ones from consideration family_set |= get_random_families_from_fam_sg_dict( main_sgs_by_family, family_set, families_n ) - # update the set of chosen samples (which can be empty) # with all the SGIDs from the selected families # chain.from_iterable flattens a generator of all SG ID sets # across all families into a single set @@ -684,33 +692,13 @@ def main( ) ) - elif samples_n: - # if there are additional samples specified, find the corresponding families - # these specifically requested families & samples are copied over in addition - # to the random selection families_n number of families - if family_set: - # we already have this data in query result - sample_set |= set( - chain.from_iterable( - v for k, v in main_sgs_by_family.items() if k in family_set - ) - ) - + if samples_n: # top up the selected SGIDs with random selections # resulting SG IDs we want to copy into the test project sample_set |= set( random.sample(set(metamist_main_content.keys()) - sample_set, samples_n) ) - # maybe we only want to copy specific samples or families... - else: - if sample_set: - sample_set |= set( - chain.from_iterable( - v for k, v in main_sgs_by_family.items() if k in family_set - ) - ) - logging.info(f'Subset to {len(sample_set)} samples') # Populating test project