diff --git a/.gitignore b/.gitignore
index 0f11b4c..a99e4b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,5 +12,7 @@
 logs_slurm/*
 stairwayplot/*
 workflows/.snakemake/*
 .tests/*
-
+train/
+test/
+trainingSets/
diff --git a/workflows/diploshic.snake b/workflows/diploshic.snake
index f49dd94..e76e9d0 100644
--- a/workflows/diploshic.snake
+++ b/workflows/diploshic.snake
@@ -25,7 +25,6 @@ testSampleNumber = params['testSampleNumber'] #the number of simulations to crea
 sampleSize = params['sampleSize'] #the number of individuals in our population sample
 numSites = params['numSites'] #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
 u = params['u'] #per-site mutation rate (used to calculate mean theta)
-gensPerYear = params['gensPerYear'] #number of generations per year
 maxSoftSweepInitFreq = params['maxSoftSweepInitFreq'] #maximum initial selected frequency for soft sweeps
 tauHigh = params['tauHigh'] #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
 rhoOverTheta = params['rhoOverTheta'] #crossover rate over mut rate (used to calculate mean rho)
@@ -152,7 +151,8 @@ rule train_classifier:
         "trained_model.json",
         "trained_model.weights.hdf5"
     run:
-        cmd = f"{diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10"
+        # cpu training below
+        cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=100"
         shell(cmd)
 
 rule clean:
diff --git a/workflows/diploshic_params.yaml b/workflows/diploshic_params.yaml
index 1f84419..25a31d8 100644
--- a/workflows/diploshic_params.yaml
+++ b/workflows/diploshic_params.yaml
@@ -1,18 +1,17 @@
 #some parameters that should probably be moved to a .yaml file
-trainSampleNumber: 10 #the number of simulation replicates we want to generate for each file in our training set
-testSampleNumber: 10 #the number of simulations to create for each file in the test set
-sampleSize: 10 #the number of individuals in our population sample
-numSites: 100_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
-u: 3.27e-9 #per-site mutation rate (used to calculate mean theta)
-gensPerYear: 10.0 #number of generations per year
+trainSampleNumber: 20 #the number of simulation replicates we want to generate for each file in our training set
+testSampleNumber: 20 #the number of simulations to create for each file in the test set
+sampleSize: 20 #the number of individuals in our population sample
+numSites: 110_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
+u: 1e-8 #per-site mutation rate (used to calculate mean theta)
 maxSoftSweepInitFreq: 0.1 #maximum initial selected frequency for soft sweeps
-tauHigh: 0.05 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
+tauHigh: 0.02 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
 rhoOverTheta: 1.0 #crossover rate over mut rate (used to calculate mean rho)
-ne0: 100_000
+ne0: 10_000
 thetaMean: 4*N0*u*numSites
 rhoMean: thetaMean * rhoOverTheta
-seln_coeff_max: 0.005
-seln_coeff_min: 0.0001
-totSimRuns: 10
+seln_coeff_max: 0.05
+seln_coeff_min: 0.001
+totSimRuns: 20
diff --git a/workflows/sweep_simulate.snake b/workflows/sweep_simulate.snake
index 6c8fc89..5792240 100644
--- a/workflows/sweep_simulate.snake
+++ b/workflows/sweep_simulate.snake
@@ -411,6 +411,18 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
         tss = tss.delete_intervals(del_intervals)
         tss = tss.trim()
         tss.write_vcf(fh_vcf)
+        fh_vcf.close()
+        # write seqlen of shortened ts
+        with open(output[2], 'w') as f:
+            f.write(str(int(tss.sequence_length)))
+        # write anc file
+        seq = list("A" * int(tss.sequence_length))
+        with open(output[3], "w") as f:
+            f.write(">1\n")
+            for v in tss.variants():
+                seq[int(v.site.position)] = v.alleles[0]
+            f.write(''.join(seq) + "\n")
+
     # Computing tsstats diversity
     divs, windows = compute_stats(ts, sample_sets, num_subwins=num_subwins)
     assert divs.shape == (num_subwins,len(target_pops))
@@ -423,6 +435,49 @@
     fh.close()
 
 
+
+#######################
+# Import diploshic rules
+#
+##########################
+
+module diploshic_workflow:
+    snakefile:
+        "diploshic.snake"
+    config:
+        "diploshic_params.yaml"
+
+
+use rule * from diploshic_workflow as diploshic_*
+
+
+#### diploshic helpers ##########
+
+def write_diploshic_sampleToPopFile(ts, filename, key):
+    samp_dict = demo_sample_size_dict[key]
+    with open(filename, "w") as f:
+        count = 0
+        for pop in samp_dict:
+            for i in range(samp_dict[pop]):
+                f.write(f"tsk_{count}\t{pop}\n")
+                count += 1
+    f.close()
+
+
+def write_diploshic_files(ts_path,sl_path, anc_path):
+    ts = tskit.load(ts_path)
+    with open(sl_path,'r') as f:
+        seq_len = f.read().strip()
+    seq = list("A" * int(seq_len))
+    with open(anc_path, "w") as f:
+        f.write(">1\n")
+        for v in ts.variants():
+            seq[int(v.site.position)] = v.alleles[0]
+        f.write(''.join(seq) + "\n")
+    f.close()
+
+
+
 # ###############################################################################
 # GENERAL RULES
 # ###############################################################################
@@ -463,10 +518,15 @@ trees_outs = [file_prefix+".trees" for file_prefix in neutral_outs_prefix + bgs_
 stats_outs = [file_prefix+".stats.tsv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 vcf_outs = [file_prefix+".vcf" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 annot_outs = [output_dir + f'/simulated_data/sweeps/annot_overlap_{annot}_{chrom}_{config["num_windows"]}.tsv' for annot in annotation_list]
+anc_outs = [file_prefix+".diploshic.ancFile" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
+fv_outs = [file_prefix+".diploshic.fv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
+pred_outs = [file_prefix+".diploshic.preds" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 
 rule all:
     input:
-        boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs + rules.diploshic_all.input,
+        boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs
+    default_target: True
 
 rule write_rec:
     input:
@@ -646,7 +706,10 @@ rule get_stats:
     input: output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees'
     output: output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.stats.tsv",
-            output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf"
+            output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf",
+            output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen",
+            output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
+
     resources: time=3000, mem_mb=2000
     run:
         target_pops = None
@@ -717,3 +780,34 @@
     run:
         dump_results(input, output)
 """
+
+
+rule diploshic_fvs:
+    input:
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen',
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf',
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile',
+
+    output:
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.fv'
+    run:
+        with open(input[0],'r') as f:
+            seq_len = f.read().strip()
+        cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --ancFileName {input[2]}"
+        shell(cmd)
+
+rule diploshic_pred:
+    input:
+        rules.diploshic_fvs.output,
+        rules.diploshic_train_classifier.output
+    output:
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.preds'
+
+    run:
+        cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.hdf5 {input[0]} {output[0]}"
+        shell(cmd)
+
+
+
+
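
A minimal stand-alone sketch (not part of the patch) of what the two new rules at the end of sweep_simulate.snake do for one simulated window: diploshic_fvs turns the per-simulation VCF plus the ancestral-allele FASTA into a diploSHIC feature vector, and diploshic_pred classifies it with the trained network on CPU. The prefix path below is a hypothetical example; in the workflow the paths come from the Snakemake wildcards and the trained_model files from the diploshic_train_classifier rule.

import os
import subprocess

# Hypothetical example prefix; the real paths are filled in by Snakemake wildcards.
prefix = "output/simulated_data/sweeps/neutral/sim_chr1_0_110000"

# sequence length of the trimmed tree sequence, as written by the get_stats rule
with open(f"{prefix}.seqlen") as f:
    seq_len = int(f.read().strip())

# feature vectors from the simulated VCF and ancestral-allele FASTA (as in rule diploshic_fvs)
subprocess.run(
    f"diploSHIC fvecVcf haploid {prefix}.vcf 1 {seq_len} "
    f"{prefix}.diploshic.fv --ancFileName {prefix}.diploshic.ancFile",
    shell=True, check=True,
)

# CPU-only prediction with the trained network (as in rule diploshic_pred)
subprocess.run(
    f"diploSHIC predict trained_model.json trained_model.weights.hdf5 "
    f"{prefix}.diploshic.fv {prefix}.diploshic.preds",
    shell=True, check=True,
    env={**os.environ, "CUDA_VISIBLE_DEVICES": ""},
)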