diploSHIC preds (#105)

* adding diploshic preds * added predictions * training param tweak * cpu training * fixing edited sequence length; changes ds training params * tweak DAG to make diploshic retrain
popsim-consortium · Jul 12, 2023 · eb16df2 · eb16df2
1 parent 2d0b547
commit eb16df2
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 16 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,5 +12,7 @@ logs_slurm/*
 stairwayplot/*
 workflows/.snakemake/*
 .tests/*
-
+train/
+test/
+trainingSets/
 
diff --git a/workflows/diploshic.snake b/workflows/diploshic.snake
@@ -25,7 +25,6 @@ testSampleNumber = params['testSampleNumber'] #the number of simulations to crea
 sampleSize = params['sampleSize'] #the number of individuals in our population sample
 numSites = params['numSites'] #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
 u = params['u'] #per-site mutation rate (used to calculate mean theta)
-gensPerYear = params['gensPerYear'] #number of generations per year
 maxSoftSweepInitFreq = params['maxSoftSweepInitFreq'] #maximum initial selected frequency for soft sweeps
 tauHigh = params['tauHigh'] #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
 rhoOverTheta = params['rhoOverTheta'] #crossover rate over mut rate (used to calculate mean rho)
@@ -152,7 +151,8 @@ rule train_classifier:
         "trained_model.json",
         "trained_model.weights.hdf5"
     run:
-        cmd = f"{diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=10"
+        # cpu training below
+        cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && {diploSHIC_exec} train trainingSets/ trainingSets/ trained_model --epochs=100"
         shell(cmd)
 
 rule clean:

diff --git a/workflows/diploshic_params.yaml b/workflows/diploshic_params.yaml
@@ -1,18 +1,17 @@
 #some parameters that should probably be moved to a .yaml file
-trainSampleNumber: 10 #the number of simulation replicates we want to generate for each file in our training set
-testSampleNumber: 10 #the number of simulations to create for each file in the test set
-sampleSize: 10 #the number of individuals in our population sample
-numSites: 100_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
-u: 3.27e-9 #per-site mutation rate (used to calculate mean theta)
-gensPerYear: 10.0 #number of generations per year
+trainSampleNumber: 20 #the number of simulation replicates we want to generate for each file in our training set
+testSampleNumber: 20 #the number of simulations to create for each file in the test set
+sampleSize: 20 #the number of individuals in our population sample
+numSites: 110_000 #total number of sites in our simulated window (i.e. S/HIC's subwindow size * 11)
+u: 1e-8 #per-site mutation rate (used to calculate mean theta)
 maxSoftSweepInitFreq: 0.1 #maximum initial selected frequency for soft sweeps
-tauHigh: 0.05 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
+tauHigh: 0.02 #maximum FIXATION (not mutation) time (in units of 4N generations ago) in the past
 rhoOverTheta: 1.0 #crossover rate over mut rate (used to calculate mean rho)
 
-ne0: 100_000
+ne0: 10_000
 thetaMean: 4*N0*u*numSites
 rhoMean: thetaMean * rhoOverTheta
 
-seln_coeff_max: 0.005
-seln_coeff_min: 0.0001
-totSimRuns: 10
+seln_coeff_max: 0.05
+seln_coeff_min: 0.001
+totSimRuns: 20
diff --git a/workflows/sweep_simulate.snake b/workflows/sweep_simulate.snake
@@ -411,6 +411,18 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
         tss = tss.delete_intervals(del_intervals)
         tss = tss.trim()
     tss.write_vcf(fh_vcf)
+    fh_vcf.close()
+    # write seqlen of shortened ts
+    with open(output[2], 'w') as f:
+        f.write(str(int(tss.sequence_length)))
+    # write anc file
+    seq = list("A" * int(tss.sequence_length))
+    with open(output[3], "w") as f:
+        f.write(">1\n")
+        for v in tss.variants():
+            seq[int(v.site.position)] = v.alleles[0]
+        f.write(''.join(seq) + "\n")
+
     # Computing tsstats diversity
     divs, windows = compute_stats(ts, sample_sets, num_subwins=num_subwins)
     assert divs.shape == (num_subwins,len(target_pops))
@@ -423,6 +435,49 @@ def dump_results(input, output, params_dict, target_pops, num_subwins=1):
     fh.close()
 
 
+
+#######################
+# Import diploshic rules
+#
+##########################
+
+module diploshic_workflow:
+    snakefile:
+        "diploshic.snake"
+    config:
+        "diploshic_params.yaml"
+
+
+use rule * from diploshic_workflow as diploshic_*
+
+
+#### diploshic helpers ##########
+
+def write_diploshic_sampleToPopFile(ts, filename, key):
+    samp_dict = demo_sample_size_dict[key]
+    with open(filename, "w") as f:
+        count = 0
+        for pop in samp_dict:
+            for i in range(samp_dict[pop]):
+                f.write(f"tsk_{count}\t{pop}\n")
+                count += 1
+    f.close()
+
+
+def write_diploshic_files(ts_path,sl_path, anc_path):
+    ts = tskit.load(ts_path)
+    with open(sl_path,'r') as f:
+        seq_len = f.read().strip()
+    seq = list("A" * int(seq_len))
+    with open(anc_path, "w") as f:
+        f.write(">1\n")
+        for v in ts.variants():
+            seq[int(v.site.position)] = v.alleles[0]
+        f.write(''.join(seq) + "\n")
+    f.close()
+
+
+
 # ###############################################################################
 # GENERAL RULES
 # ###############################################################################
@@ -463,10 +518,15 @@ trees_outs = [file_prefix+".trees" for file_prefix in neutral_outs_prefix + bgs_
 stats_outs = [file_prefix+".stats.tsv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 vcf_outs = [file_prefix+".vcf" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 annot_outs = [output_dir + f'/simulated_data/sweeps/annot_overlap_{annot}_{chrom}_{config["num_windows"]}.tsv' for annot in annotation_list]
+anc_outs = [file_prefix+".diploshic.ancFile" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
+fv_outs = [file_prefix+".diploshic.fv" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
+pred_outs = [file_prefix+".diploshic.preds" for file_prefix in neutral_outs_prefix + bgs_outs_prefix + sw_outs_prefix]
 
 rule all:
     input:
-        boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs
+        rules.diploshic_all.input,
+        boundary_outs + trees_outs + stats_outs + vcf_outs + [output_dir + f'/simulated_data/sweeps/all_sims.stats.tsv', output_dir+f'/simulated_data/sweeps/rec_map_{chrom}_{config["num_windows"]}.tsv'] + annot_outs +anc_outs + fv_outs + pred_outs
+    default_target: True
 
 rule write_rec:
     input:
@@ -646,7 +706,10 @@ rule get_stats:
     input: output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.trees'
     output:
         output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.stats.tsv", 
-        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf"
+        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf",
+        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen",
+        output_dir + "/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile",
+
     resources: time=3000, mem_mb=2000
     run:
         target_pops = None
@@ -717,3 +780,34 @@ rule get_sw_stats:
     run:
         dump_results(input, output)
 """
+
+
+rule diploshic_fvs:
+   input: 
+       output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.seqlen',
+       output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.vcf',
+       output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.ancFile',
+
+   output:
+       output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.fv'
+   run:
+        with open(input[0],'r') as f:
+            seq_len = f.read().strip()
+        cmd = f"diploSHIC fvecVcf haploid {input[1]} 1 {int(seq_len)} {output[0]} --ancFileName {input[2]}"    
+        shell(cmd)    
+
+rule diploshic_pred:
+    input:
+        rules.diploshic_fvs.output,
+        rules.diploshic_train_classifier.output
+    output:
+        output_dir + '/simulated_data/sweeps/{middle}/sim_{chrom}_{left}_{right}.diploshic.preds'
+
+    run:
+        cmd = f"export CUDA_VISIBLE_DEVICES=\"\" && diploSHIC predict trained_model.json trained_model.weights.hdf5 {input[0]} {output[0]}" 
+        shell(cmd)   
+
+
+
+
+