Skip to content

Commit

Permalink
diploSHIC preds (#105)
Browse files Browse the repository at this point in the history
* adding diploshic preds

* added predictions

* training param tweak

* cpu training

* fixing edited sequence length; changes ds training params

* tweak DAG to make diploshic retrain
  • Loading branch information
andrewkern authored Jul 12, 2023
1 parent 2d0b547 commit eb16df2
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 16 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@ logs_slurm/*
stairwayplot/*
workflows/.snakemake/*
.tests/*

train/
test/
trainingSets/

4 changes: 2 additions & 2 deletions workflows/diploshic.snake
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ testSampleNumber = params['testSampleNumber'] #the number of simulations to crea
sampleSize = params['sampleSize'] #the number of individuals in our population sample
numSites = params['numSites'] #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u = params['u'] #per-site mutation rate (used to calculate mean theta)
gensPerYear = params['gensPerYear'] #number of generations per year
maxSoftSweepInitFreq = params['maxSoftSweepInitFreq'] #maximum initial selected frequency for soft sweeps
tauHigh = params['tauHigh'] #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
rhoOverTheta = params['rhoOverTheta'] #crossover rate over mut rate (used to calculate mean rho)
Expand Down Expand Up @@ -152,7 +151,8 @@ rule train_classifier:
"trained_model.json",
"trained_model.weights.hdf5"
run:
cmd = f"{diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10"
# cpu training below
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=100"
shell(cmd)

rule clean:
Expand Down
21 changes: 10 additions & 11 deletions workflows/diploshic_params.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
#some parameters that should probably be moved to a .yaml file
trainSampleNumber: 10 #the number of simulation replicates we want to generate for each file in our training set
testSampleNumber: 10 #the number of simulations to create for each file in the test set
sampleSize: 10 #the number of individuals in our population sample
numSites: 100_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 3.27e-9 #per-site mutation rate (used to calculate mean theta)
gensPerYear: 10.0 #number of generations per year
trainSampleNumber: 20 #the number of simulation replicates we want to generate for each file in our training set
testSampleNumber: 20 #the number of simulations to create for each file in the test set
sampleSize: 20 #the number of individuals in our population sample
numSites: 110_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
u: 1e-8 #per-site mutation rate (used to calculate mean theta)
maxSoftSweepInitFreq: 0.1 #maximum initial selected frequency for soft sweeps
tauHigh: 0.05 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
tauHigh: 0.02 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
rhoOverTheta: 1.0 #crossover rate over mut rate (used to calculate mean rho)

ne0: 100_000
ne0: 10_000
thetaMean: 4*N0*u*numSites
rhoMean: thetaMean * rhoOverTheta

seln_coeff_max: 0.005
seln_coeff_min: 0.0001
totSimRuns: 10
seln_coeff_max: 0.05
seln_coeff_min: 0.001
totSimRuns: 20
98 changes: 96 additions & 2 deletions workflows/sweep_simulate.snake
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,18 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
tss = tss.delete_intervals(del_intervals)
tss = tss.trim()
tss.write_vcf(fh_vcf)
fh_vcf.close()
# write seqlen of shortened ts
with open(output[2], 'w') as f:
f.write(str(int(tss.sequence_length)))
# write anc file
seq = list("A" * int(tss.sequence_length))
with open(output[3], "w") as f:
f.write(">1\n")
for v in tss.variants():
seq[int(v.site.position)] = v.alleles[0]
f.write(''.join(seq) + "\n")

# Computing tsstats diversity
divs, windows = compute_stats(ts, sample_sets, num_subwins=num_subwins)
assert divs.shape == (num_subwins,len(target_pops))
Expand All @@ -423,6 +435,49 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
fh.close()



#######################
# Import diploshic rules
#
##########################

module diploshic_workflow:
snakefile:
"diploshic.snake"
config:
"diploshic_params.yaml"


use rule * from diploshic_workflow as diploshic_*


#### diploshic helpers ##########

def write_diploshic_sampleToPopFile(ts, filename, key):
samp_dict = demo_sample_size_dict[key]
with open(filename, "w") as f:
count = 0
for pop in samp_dict:
for i in range(samp_dict[pop]):
f.write(f"tsk_{count}\t{pop}\n")
count += 1
f.close()


def write_diploshic_files(ts_path,sl_path, anc_path):
ts = tskit.load(ts_path)
with open(sl_path,'r') as f:
seq_len = f.read().strip()
seq = list("A" * int(seq_len))
with open(anc_path, "w") as f:
f.write(">1\n")
for v in ts.variants():
seq[int(v.site.position)] = v.alleles[0]
f.write(''.join(seq) + "\n")
f.close()



# ###############################################################################
# GENERAL RULES
# ###############################################################################
Expand Down Expand Up @@ -463,10 +518,15 @@ trees_outs = [file_prefix+".trees" for file_prefix in neutral_outs_prefix + bgs_
stats_outs = [file_prefix+".stats.tsv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
vcf_outs = [file_prefix+".vcf" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
annot_outs = [output_dir + f'/simulated_data/sweeps/annot_overlap_{annot}_{chrom}_{config["num_windows"]}.tsv' for annot in annotation_list]
anc_outs = [file_prefix+".diploshic.ancFile" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
fv_outs = [file_prefix+".diploshic.fv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
pred_outs = [file_prefix+".diploshic.preds" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]

rule all:
input:
boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs
rules.diploshic_all.input,
boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs
default_target: True

rule write_rec:
input:
Expand Down Expand Up @@ -646,7 +706,10 @@ rule get_stats:
input: output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees'
output:
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.stats.tsv",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf"
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen",
output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",

resources: time=3000, mem_mb=2000
run:
target_pops = None
Expand Down Expand Up @@ -717,3 +780,34 @@ rule get_sw_stats:
run:
dump_results(input, output)
"""


rule diploshic_fvs:
input:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen',
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf',
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile',

output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.fv'
run:
with open(input[0],'r') as f:
seq_len = f.read().strip()
cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --ancFileName {input[2]}"
shell(cmd)

rule diploshic_pred:
input:
rules.diploshic_fvs.output,
rules.diploshic_train_classifier.output
output:
output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.preds'

run:
cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.hdf5 {input[0]} {output[0]}"
shell(cmd)





0 comments on commit eb16df2

Please sign in to comment.