Skip to content

Commit

Permalink
clean up selection logic
Browse files Browse the repository at this point in the history
  • Loading branch information
MattWellie committed Sep 18, 2023
1 parent 51946dd commit 654d1be
Showing 1 changed file with 18 additions and 30 deletions.
48 changes: 18 additions & 30 deletions scripts/new_test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,20 +661,28 @@ def main(
raise SystemExit()

random.seed(42) # for reproducibility

# end goal of this initial step is having a collection of SGs we want to transfer
# first process the specific SG/Families
# sample_set here will contain SG IDs, or be empty
# if we manually specify families, add their corresponding IDs to that set
if family_set:
sample_set |= set(
chain.from_iterable(
v for k, v in main_sgs_by_family.items() if k in family_set
)
)

if families_n:
# if there are additional samples specified, find the corresponding families
# these specifically requested families & samples are copied over in addition
# to the random selection of families_n families
if sample_set:
family_set |= {metamist_main_content[sgid]['family'] for sgid in sample_set}

# family_set is ones we definitely want to include
# we need to make sure we don't include them in the random selection
# find the family IDs of all SGs we specifically want to transfer
family_set |= {metamist_main_content[sgid]['family'] for sgid in sample_set}

# family_set holds the families we definitely want to include. Do a size-weighted
# subset of the remaining families, removing specified ones from consideration
family_set |= get_random_families_from_fam_sg_dict(
main_sgs_by_family, family_set, families_n
)

# update the set of chosen samples (which can be empty)
# with all the SGIDs from the selected families
# chain.from_iterable flattens a generator of all SG ID sets
# across all families into a single set
Expand All @@ -684,33 +692,13 @@ def main(
)
)

elif samples_n:
# if there are additional samples specified, find the corresponding families
# these specifically requested families & samples are copied over in addition
# to the random selection of families_n families
if family_set:
# we already have this data in query result
sample_set |= set(
chain.from_iterable(
v for k, v in main_sgs_by_family.items() if k in family_set
)
)

if samples_n:
# top up the selected SGIDs with random selections
# resulting SG IDs we want to copy into the test project
sample_set |= set(
random.sample(set(metamist_main_content.keys()) - sample_set, samples_n)
)

# maybe we only want to copy specific samples or families...
else:
if sample_set:
sample_set |= set(
chain.from_iterable(
v for k, v in main_sgs_by_family.items() if k in family_set
)
)

logging.info(f'Subset to {len(sample_set)} samples')

# Populating test project
Expand Down

0 comments on commit 654d1be

Please sign in to comment.