Skip to content

Commit

Permalink
new gtdb database (#518)
Browse files Browse the repository at this point in the history
* new gtdbtk

* less memory for gtdbtk

* new gtdb database

* gtdb uses predicted genes with prodigal

* infer gene dir from flag

* use gtdb tk >2.1.1

* export gtdb env variable

* gtdb in conda env

* download gtdb data myself

* localrule gtdb download

* don't download cat db

* 2 rules gtdb download

* dram_concat with pandas

* Revert "dram_concat with pandas"

This reverts commit 876076e.

* add changelog v2.10
  • Loading branch information
SilasK authored Jul 26, 2022
1 parent 011455a commit 591446d
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 22 deletions.
2 changes: 1 addition & 1 deletion atlas/workflow/envs/gtdbtk.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ channels:
- bioconda
- defaults
dependencies:
- gtdbtk =1.5.*
- gtdbtk >=2.1.1, < 3
33 changes: 18 additions & 15 deletions atlas/workflow/rules/download.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import os
ZENODO_ARCHIVE = "1134890"
EGGNOG_VERSION = "5"

GTDB_DATA_URL = "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_v2_data.tar.gz"

def md5(fname):
# https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
Expand All @@ -24,7 +25,7 @@ CHECKM_ARCHIVE = "checkm_data_v1.0.9.tar.gz"
CAT_DIR = os.path.join(DBDIR, "CAT")
CAT_flag_downloaded = os.path.join(CAT_DIR, "downloaded")
EGGNOG_DIR = os.path.join(DBDIR, "EggNOG_V5")
GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_V06")
GTDBTK_DATA_PATH = os.path.join(DBDIR, "GTDB_V07")
CONDAENV = "../envs"

# note: saving OG_fasta.tar.gz in order to not create secondary "success" file
Expand Down Expand Up @@ -164,22 +165,23 @@ rule initialize_checkm:
shell:
"checkm data setRoot {params.database_dir} &> {log} "


rule download_cat_db:
localrules: download_gtdb
rule download_gtdb:
output:
touch(CAT_flag_downloaded),
params:
db_folder=CAT_DIR,
resources:
mem=config.get("large_mem", 250),
threads: config.get("large_threads", 16)
temp(f"{GTDBTK_DATA_PATH}/gtdb_data.tar.gz")
conda:
"%s/cat.yaml" % CONDAENV
"../envs/gtdbtk.yaml"
threads: 1
resources:
time=int(config.get("runtime", {"long": 10})["long"]),
log:
"logs/download/gtdbtk.log",
shell:
" CAT prepare -d {params.db_folder} -t {params.db_folder} --existing --nproc {threads}"
' wget {GTDB_DATA_URL} -O {output} &> {log} '


rule download_gtdb:
rule extract_gtdb:
input:
rules.download_gtdb.output
output:
touch(os.path.join(GTDBTK_DATA_PATH, "downloaded_success")),
conda:
Expand All @@ -190,8 +192,9 @@ rule download_gtdb:
log:
"logs/download/gtdbtk.log",
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"download-db.sh &> {log};"
' tar -xzvf {input} -C "{GTDBTK_DATA_PATH}" --strip 1 2> {log} '
' echo "Set the GTDBTK_DATA_PATH environment variable to {GTDBTK_DATA_PATH} " >> {log}'
" conda env config vars set GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} "


onsuccess:
Expand Down
13 changes: 7 additions & 6 deletions atlas/workflow/rules/gtdbtk.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ gtdb_dir = "genomes/taxonomy/gtdb"

rule identify:
input:
dir=genome_dir,
flag=rules.download_gtdb.output,
genes_flag= "genomes/annotations/genes/predicted"
output:
directory(f"{gtdb_dir}/identify"),
threads: config["threads"]
Expand All @@ -15,10 +15,12 @@ rule identify:
f"{gtdb_dir}/gtdbtk.log",
params:
outdir=gtdb_dir,
extension="fasta",
extension="faa",
gene_dir = lambda wc, input: os.path.abspath(os.path.dirname(input.genes_flag))
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk identify --genome_dir {input.dir} --out_dir {params.outdir} "
"gtdbtk identify "
"--genes --genome_dir {params.gene_dir} "
" --out_dir {params.outdir} "
"--extension {params.extension} "
"--cpus {threads} &> {log[0]}"

Expand All @@ -37,7 +39,6 @@ checkpoint align:
params:
outdir=gtdb_dir,
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
"--cpus {threads} &> {log[0]}"

Expand All @@ -61,9 +62,9 @@ rule classify:
outdir=gtdb_dir,
extension="fasta",
shell:
"GTDBTK_DATA_PATH={GTDBTK_DATA_PATH} ; "
"gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} "
"--out_dir {params.outdir} "
" --tmpdir {resources.tmpdir} "
"--extension {params.extension} "
"--cpus {threads} &> {log[0]}"

Expand Down
4 changes: 4 additions & 0 deletions docs/usage/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@

# Change log

## [2.10](https://github.com/metagenome-atlas/atlas/compare/v2.9.1...v2.10.0)

### Features
* GTDB version 207
* Low memory taxonomic annotation


## [2.9](https://github.com/metagenome-atlas/atlas/compare/v2.8.2...v2.9.0)
Expand Down

0 comments on commit 591446d

Please sign in to comment.