Merge branch 'develop' of ssh://github.com/phylo42/PEWO into develop

phylo42 · Nov 20, 2021 · 089c643 · 089c643
2 parents 596b7ae + 577d981
commit 089c643
Show file tree

Hide file tree

Showing 32 changed files with 1,735 additions and 172 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,12 +5,20 @@ logs
 run
 #snakemake temporary files
 .snakemake
+__pycache__
 #images/results built by workflows
 *.svg
 *.pdf
 *.csv
 *.tsv
 #directories built by demos
-demos/HIV_accuracy_test/run
-demos/16SrRNA_accuracy_test/run
-demos/16SrRNA_resource_test/run
+examples/1_fast_test_of_accuracy_procedure/run
+examples/2_placement_accuracy_for_a_bacterial_taxonomy/run
+examples/3_placement_accuracy_for_HIV_genomes/run
+examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_12S/run
+examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_16S/run
+examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_cob/run
+examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_cox1/run
+examples/5_CPU_RAM_requirements_evaluation/run
+examples/6_placement_likelihood/run
+
diff --git a/README.md b/README.md
diff --git a/config.yaml b/config.yaml
@@ -16,8 +16,8 @@ workdir: /home/nikolai/dev/pewo/run_accuracy1
 #states used in analysis, either '0' for nucleotides or '1' for amino acids
 states: 0
 
-#which software to test, at least one of : epa, epang, pplacer, rappas, apples
-test_soft: [epa, epang, rappas, pplacer]
+#which software to test, at least one of : epa, epang, pplacer, rappas, apples, appspam
+test_soft: [epa, epang, rappas, pplacer,appspam]
 
 # READ GENERATION
 # Read lengths to generate
@@ -192,6 +192,32 @@ config_apples:
   #!warning, be sure to set criteria VALUES as UPPER CASE
   criteria: [MLSE,ME]
 
+### APP-SPAM
+###############################
+
+config_appspam:
+
+  #appspam calculates phylogenetic distances between all query and reference sequences based on
+  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
+  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)
+
+  #List of placement heuristics to test.
+  #Possible values are:
+  # MINDIST   : Above reference with smallest phylogenetic distance.
+  # SPAMCOUNT : Above reference with most filtered spaced word matches.
+  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
+  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
+  # APPLES    : Our calculated distances are used as input matrix for APPLES.
+  mode: [LCACOUNT]
+
+  #List of weights for the pattern to be tested (number of match positions).
+  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
+  w: [8, 12]
+
+  #Number of patterns from which spaced words are generated.
+  #At the moment, 1 is strongly recommended.
+  pattern: [1]
+
 
 ########################################################################################################################
 # OPTIONS COMMON TO ALL SOFTWARE

diff --git a/envs/environment.yaml b/envs/environment.yaml
@@ -7,6 +7,7 @@ dependencies:
   - rappas
   - epa-ng
   - pplacer
+  - appspam
   - dendropy
   - _libgcc_mutex=0.1=conda_forge
   - _openmp_mutex=4.5=0_gnu

diff --git a/eval_accuracy.smk b/eval_accuracy.smk
@@ -47,6 +47,8 @@ include:
 #distance-based placements, e.g.: apples
 include:
     "rules/placement/apples.smk"
+include:
+    "rules/placement/appspam.smk"
 #results evaluation and plots
 include:
     "rules/op/operate_nodedistance.smk"

diff --git a/eval_resources.smk b/eval_resources.smk
@@ -46,6 +46,8 @@ include:
 #distance-based placements, e.g.: apples
 include:
     "rules/placement/apples.smk"
+include:
+    "rules/placement/appspam.smk"
 #results and plots
 include:
     "rules/op/operate_plots.smk"

diff --git a/examples/1_fast_test_of_accuracy_procedure/README.md b/examples/1_fast_test_of_accuracy_procedure/README.md
@@ -3,12 +3,11 @@
 ## Overview
 
 This demo measures placement accuracy in terms of Node Distance (ND)
-and expected Node Distance (eND)for a reference dataset
+and expected Node Distance (eND) for a reference dataset
 of 150 16S-rRNA barcodes.
 
-EPA-ng, PPlacer and RAPPAS are run using only their default parameters.
-Only 2 pruning are launched, to rapidly produce results in less than
-20 minutes.
+EPA-ng, PPlacer and RAPPAS are run using their default parameters.
+Only two prunings are launched to yield results in less than 20 minutes.
 
 A better analysis would require >50 prunings to generate a wide
 range of topologies (1 leaf pruned, large clades pruned, ...).
@@ -17,32 +16,32 @@ range of topologies (1 leaf pruned, large clades pruned, ...).
 ## How to launch
 
 Download pipeline.
-```
+``` bash
 git clone --recursive https://github.com/phylo42/PEWO.git
 cd PEWO
 ```
 
 Execute installation script
-```
+``` bash
 chmod u+x INSTALL.sh
 ./INSTALL.sh
 ```
 
 After installation, load the environment.
-```
+``` bash
 conda activate PEWO
 ```
 
 Test workflow before launch.
-```
+``` bash
 snakemake -np \
 --snakefile eval_accuracy.smk \
 --config workdir=`pwd`/examples/1_fast_test_of_accuracy_procedure/run \
 --configfile examples/1_fast_test_of_accuracy_procedure/config.yaml
 ```
 
 Execute workflow, using 1 CPU core.
-```
+``` bash
 snakemake -p --cores 1 \
 --snakefile eval_accuracy.smk \
 --config workdir=`pwd`/examples/1_fast_test_of_accuracy_procedure/run \
@@ -51,16 +50,16 @@ snakemake -p --cores 1 \
 
 ## Comments
 
-In this example, 'workdir' and 'query_user' config flags are set
+In this example, `workdir` and `query_user` config flags are set
 dynamically, as it is required they are passed as absolute paths.
 You could also set them manually by editing the config.yaml file
 before launch.
 
 Raw results will be written in
-'examples/1_fast_test_of_accuracy_procedure/run'.
+`examples/1_fast_test_of_accuracy_procedure/run`.
 
 Results summaries and plots will be written in
-'examples/1_fast_test_of_accuracy_procedure/run'.
+`examples/1_fast_test_of_accuracy_procedure/run`.
 
 See PEWO wiki for a more detailed explanation of the results:
 https://github.com/phylo42/PEWO/wiki/IV.-Tutorials-and-results-interpretation
diff --git a/examples/1_fast_test_of_accuracy_procedure/config.yaml b/examples/1_fast_test_of_accuracy_procedure/config.yaml
@@ -198,6 +198,31 @@ config_apples:
   #!warning, be sure to set criteria VALUES as UPPER CASE
   criteria: [MLSE]
 
+### APP-SPAM
+###############################
+
+config_appspam:
+
+  #appspam calculates phylogenetic distances between all query and reference sequences based on
+  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
+  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)
+
+  #List of placement heuristics to test.
+  #Possible values are:
+  # MINDIST   : Above reference with smallest phylogenetic distance.
+  # SPAMCOUNT : Above reference with most filtered spaced word matches.
+  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
+  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
+  # APPLES    : Our calculated distances are used as input matrix for APPLES.
+  mode: [LCACOUNT]
+
+  #List of weights for the pattern to be tested (number of match positions).
+  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
+  w: [8, 12]
+
+  #Number of patterns from which spaced words are generated.
+  #At the moment, 1 is strongly recommended.
+  pattern: [1]
 
 ########################################################################################################################
 # OPTIONS COMMON TO ALL SOFTWARE

diff --git a/examples/2_placement_accuracy_for_a_bacterial_taxonomy/README.md b/examples/2_placement_accuracy_for_a_bacterial_taxonomy/README.md
@@ -3,39 +3,38 @@
 ## Overview
 
 This demo measures placement accuracy in terms of Node Distance (ND)
-and expected Node Distance (eND)for a reference dataset
+and expected Node Distance (eND) for a reference dataset
 of 150 16S-rRNA barcodes.
 
 EPA-ng, PPlacer, RAPPAS and Apples are tested.
 
-Only 10 pruning are launched and for a set of parameters in each program.
+Only 10 prunings are executed, with a set of parameters tested for each program.
 This analysis will require around 2 hours of computation.
 
 A better analysis would require >50 prunings to generate a wide
 range of topologies (1 leaf pruned, large clades pruned, ...).
 
 
-## How to launch
-
-Download pipeline.
-```
+## How to run the pipeline
+Download the pipeline.
+``` bash
 git clone --recursive https://github.com/phylo42/PEWO.git
 cd PEWO
 ```
 
-Execute installation script
-```
+Execute the installation script
+``` bash
 chmod u+x INSTALL.sh
 ./INSTALL.sh
 ```
 
-After installation, load environement.
-```
+After installation, load the environment.
+``` bash
 conda activate PEWO
 ```
 
-Test workflow before launch.
-```
+Test workflow before execution.
+``` bash
 snakemake -np \
 --snakefile eval_accuracy.smk \
 --config workdir=`pwd`/examples/2_placement_accuracy_for_a_bacterial_taxonomy/run \
@@ -44,24 +43,24 @@ snakemake -np \
 
 Execute workflow, using 2 CPU cores and 8 GB of RAM.
 ```
-snakemake -p --cores 2 --resources mem_mb=8000\
+snakemake -p --cores 2 --resources mem_mb=8000 \
 --snakefile eval_accuracy.smk \
 --config workdir=`pwd`/examples/2_placement_accuracy_for_a_bacterial_taxonomy/run \
 --configfile examples/2_placement_accuracy_for_a_bacterial_taxonomy/config.yaml
 ```
 
 ## Comments
 
-In this example, 'workdir' and 'query_user' config flags are set
+In this example, `workdir` and `query_user` config flags are set
 dynamically, as it is required they are passed as absolute paths.
-You could also set them manually by editing the config.yaml file
-before launch.
+You could also set them manually by editing the `config.yaml` file
+before execution.
 
 Raw results will be written in
-'examples/2_placement_accuracy_for_a_bacterial_taxonomy/run'.
+`examples/2_placement_accuracy_for_a_bacterial_taxonomy/run`.
 
 Results summaries and plots will be written in
-'examples/2_placement_accuracy_for_a_bacterial_taxonomy/run'.
+`examples/2_placement_accuracy_for_a_bacterial_taxonomy/run`.
 
 See PEWO wiki for a more detailed explanation of the results:
 https://github.com/phylo42/PEWO/wiki/IV.-Tutorials-and-results-interpretation
diff --git a/examples/2_placement_accuracy_for_a_bacterial_taxonomy/config.yaml b/examples/2_placement_accuracy_for_a_bacterial_taxonomy/config.yaml
@@ -198,6 +198,32 @@ config_apples:
   #!warning, be sure to set criteria VALUES as UPPER CASE
   criteria: [MLSE,ME,HYBRID]
 
+### APP-SPAM
+###############################
+
+config_appspam:
+
+  #appspam calculates phylogenetic distances between all query and reference sequences based on
+  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
+  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)
+
+  #List of placement heuristics to test.
+  #Possible values are:
+  # MINDIST   : Above reference with smallest phylogenetic distance.
+  # SPAMCOUNT : Above reference with most filtered spaced word matches.
+  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
+  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
+  # APPLES    : Our calculated distances are used as input matrix for APPLES.
+  mode: [LCACOUNT]
+
+  #List of weights for the pattern to be tested (number of match positions).
+  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
+  w: [8, 12]
+
+  #Number of patterns from which spaced words are generated.
+  #At the moment, 1 is strongly recommended.
+  pattern: [1]
+
 
 ########################################################################################################################
 # OPTIONS COMMON TO ALL SOFTWARE

diff --git a/examples/3_placement_accuracy_for_HIV_genomes/config.yaml b/examples/3_placement_accuracy_for_HIV_genomes/config.yaml
@@ -198,6 +198,31 @@ config_apples:
   #!warning, be sure to set criteria VALUES as UPPER CASE
   criteria: [MLSE]
 
+### APP-SPAM
+###############################
+
+config_appspam:
+
+  #appspam calculates phylogenetic distances between all query and reference sequences based on
+  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
+  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)
+
+  #List of placement heuristics to test.
+  #Possible values are:
+  # MINDIST   : Above reference with smallest phylogenetic distance.
+  # SPAMCOUNT : Above reference with most filtered spaced word matches.
+  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
+  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
+  # APPLES    : Our calculated distances are used as input matrix for APPLES.
+  mode: [LCACOUNT]
+
+  #List of weights for the pattern to be tested (number of match positions).
+  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
+  w: [8, 12]
+
+  #Number of patterns from which spaced words are generated.
+  #At the moment, 1 is strongly recommended.
+  pattern: [1]
 
 ########################################################################################################################
 # OPTIONS COMMON TO ALL SOFTWARE

diff --git a/examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_12S/config_12S.yaml b/examples/4_search_for_most_accurate_taxonomic_marker/coleoptera_12S/config_12S.yaml
@@ -198,6 +198,32 @@ config_apples:
   #!warning, be sure to set criteria VALUES as UPPER CASE
   criteria: [MLSE]
 
+### APP-SPAM
+###############################
+
+config_appspam:
+
+  #appspam calculates phylogenetic distances between all query and reference sequences based on
+  #filtered spaced word matches. The placement position is determined with different heuristics (mode).
+  #(Blanke, Morgenstern, 2020 ; https://doi.org/10.1101/2020.10.19.344986)
+
+  #List of placement heuristics to test.
+  #Possible values are:
+  # MINDIST   : Above reference with smallest phylogenetic distance.
+  # SPAMCOUNT : Above reference with most filtered spaced word matches.
+  # LCADIST   : LCA of two leaves with smallest phylogenetic distances.
+  # LCACOUNT  : LCA of two leaves with most filtered spaced word matches.
+  # APPLES    : Our calculated distances are used as input matrix for APPLES.
+  mode: [LCACOUNT]
+
+  #List of weights for the pattern to be tested (number of match positions).
+  #Larger values tend to result in shorter running times. w between [8, 16] recommended, use 12 as default.
+  w: [8, 12]
+
+  #Number of patterns from which spaced words are generated.
+  #At the moment, 1 is strongly recommended.
+  pattern: [1]
+
 
 ########################################################################################################################
 # OPTIONS COMMON TO ALL SOFTWARE