Skip to content

Commit

Permalink
Support for checkm2 (#607)
Browse files Browse the repository at this point in the history
* add blank file for bin quality

* Added BUSCO and GUNC support.

* specify versions

* specify different pachs for busco and gunc

* dont' run busco offline

* add new busco rule parser script WIP

* remove checkm from genomes remove also checkm tree

* adapt drep for busco

* rename genome quality rather than

* add busco to config.yaml

* Revert "dont' run busco offline"

This reverts commit b7767a4.

* run busco offline

* fixing error with download

* black

* bug fixing

* bin report adapted to busco

* busco WIP

* busco v 5.4

* check passed

* bug fixes for genome

* testing busco

* add checkm2 download

* add checkm2 predict

* checkm2 corrections

* build index for genecatalog map each reads individ

* delete index when no longer used

* print rpkm

* usejni in config

* set build explicitly

* formating

* using bwa mem for genecatalog

* Revert "using bwa mem for genecatalog"

This reverts commit b157856.

* specify not interleaved for genes

* bbmap cat first

* use minimap for genecatalog

* cat reads independent of minimap

* use minimap outside of wrapper

* format

* tested

* formating

* Dead end at gen counts 1GB/sample

* add gene info rule

* parquet with gene nrs not string

* median coverage for genes

* use median fold

* unnecessary log file

* skip error when only QC reads

* remove error code

* " --rerun-triggers mtime "

* galah parameters in genecatalog

* remove junk

* need source links for renaming

* parse samples with underscores

* add minimap mapping for strains

* minimap maps paired end reads

* remove crap code

* remove crap code

* formating

* set checkm2 as default, update conda

* formating

* update bin report

* removed redundant merge checkm

* tested checkm2

* formated

* add binning and assembly to CircleCI

* add checkm2 to download

* do not circelci conda envs

* dont use checkm for filtered bins

---------

Co-authored-by: Matija <[email protected]>
Co-authored-by: silas kieser <[email protected]>
  • Loading branch information
3 people authored Feb 3, 2023
1 parent 2d09bd7 commit c0b97a7
Show file tree
Hide file tree
Showing 24 changed files with 636 additions and 369 deletions.
40 changes: 17 additions & 23 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,6 @@ jobs:
command: |
source activate ./atlasenv
test/test_assembly.sh --resources mem=$MEM --jobs=$N_THREADS --restart-times=2
# --omit-from build_qc_report build_assembly_report
- store_artifacts:
path: test/Test_assembly/logs
destination: assembly_logs
Expand All @@ -190,6 +188,7 @@ jobs:
source activate ./atlasenv
WD='test/Test_assembly'
atlas run genecatalog --omit-from combine_egg_nogg_annotations -w $WD --resources mem=$MEM --jobs=$N_THREADS --restart-times=2
binning:
<<: *defaults
environment:
Expand All @@ -199,22 +198,18 @@ jobs:
steps:
- attach_workspace:
at: /root/project/
- run: tar -cf conda_envs.tar atlas/envs
- restore_cache:
keys:
- conda-environements-{{ checksum "conda_envs.tar" }}
- conda-environements-
- run:
name: test binning
name: run binning
command: |
source activate ./atlasenv
test/test_binning.sh --resources mem=$MEM java_mem=$MEM --jobs=$N_THREADS --restart-times=2 --omit-from get_bins
WD='test/Test_assembly'
atlas run binning -w $WD --resources mem=$MEM --jobs=$N_THREADS --restart-times=2 --config final_binner=metabat
- store_test_results:
path: example_data/binning/reports
path: test/Test_assembly/reports/bin_report_metabat.html
- store_artifacts:
path: example_data/binning/reports
destination: binning_results
path: test/Test_assembly/reports/bin_report_metabat.html
destination: bin_report

#
#
Expand Down Expand Up @@ -251,19 +246,18 @@ workflows:
- genome_quantify:
requires:
- build_and_dryrun

# - sra_init:
# requires:
# - build_and_dryrun
# - get_example_data
- get_example_data
- assembly:
requires:
- build_and_dryrun
- get_example_data
- binning:
requires:
- assembly
# - getenvs:
# requires:
# - build_and_dryrun
# - assembly:
# requires:
# - build_and_dryrun
# - get_example_data
# - binning:

# - sra_init:
# requires:
# - build_and_dryrun
# - get_example_data
1 change: 0 additions & 1 deletion atlas/atlas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from email.policy import default
import os, sys
from .color_logger import logger

Expand Down
8 changes: 5 additions & 3 deletions atlas/color_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
# root logger
logger = logging.getLogger()

# logger= logging


grey = "\x1b[38;21m"
green = "\x1b[32;21m"
yellow = "\x1b[33;21m"
Expand Down Expand Up @@ -89,3 +86,8 @@ def handle_exception(exc_type, exc_value, exc_traceback):

# Install exception handler
sys.excepthook = handle_exception

# root logger
logger = logging.getLogger()

# logger= logging
3 changes: 3 additions & 0 deletions atlas/make_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def make_default_config():

config["maximum_counted_map_sites"] = MAXIMUM_COUNTED_MAP_SITES

config["bin_quality_asesser"] = "checkm"
# gene cluster
config["genecatalog"] = {
"source": "genomes",
Expand Down Expand Up @@ -150,6 +151,8 @@ def make_default_config():
"megabin_penalty": 0.5,
}

config["gunc_database"] = "gtdb"

config["cobining_min_contig_length"] = 2000
config["cobining_min_bin_size"] = 200 * 1000
config["semibin_options"] = " --max-node 1 --max-edges 200 "
Expand Down
1 change: 0 additions & 1 deletion atlas/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,6 @@ rule binning:
sample=SAMPLES,
),
expand("reports/bin_report_{binner}.html", binner=config["final_binner"]),
"genomes/all_bins/checkm_all_bins.tsv",
"finished_assembly",
output:
temp(touch("finished_binning")),
Expand Down
4 changes: 3 additions & 1 deletion atlas/workflow/config/default_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@ genome_dereplication:
length: 0
centrality: 1

genome_aligner: "minimap"
genome_aligner: "minimap"

bin_quality_asesser: checkm2 #[ checkm2, busco, checkm]
6 changes: 6 additions & 0 deletions atlas/workflow/envs/busco.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- busco=5.4
6 changes: 6 additions & 0 deletions atlas/workflow/envs/checkm2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- checkm2>=1.0.1, <1.1
6 changes: 6 additions & 0 deletions atlas/workflow/envs/gunc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- gunc=1.0
62 changes: 50 additions & 12 deletions atlas/workflow/report/bin_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,48 @@ def make_plots(bin_table):

# Prepare data
df = pd.read_table(bin_table)
df.index = df["Bin Id"]
df = df.join(tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA"))

df["Quality Score"] = df.eval("Completeness - 5* Contamination")
if snakemake.config["bin_quality_asesser"].lower() == "busco":

df["Bin Id"] = df["Input_file"].str.replace(".fasta", "", regex=False)

logging.info("No taxonomic information available, use busco Dataset")

lineage_name = "Dataset"
hover_data = [
"Scores_archaea_odb10",
"Scores_bacteria_odb10",
"Scores_eukaryota_odb10",
]
size_name = None

elif snakemake.config["bin_quality_asesser"].lower() == "checkm":

df = df.join(
tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA")
)

lineage_name = "phylum"
size_name = "Genome size (Mbp)"
hover_data = ["genus"]

elif snakemake.config["bin_quality_asesser"].lower() == "checkm2":

df["Bin Id"] = df.index

lineage_name = "Translation_Table_Used"
hover_data = [
"Completeness_Model_Used",
"Coding_Density",
"Contig_N50",
"GC_Content",
"Additional_Notes",
]
size_name = "Genome_Size"
else:
raise Exception(f"bin_quality_asesser in the config file not understood")

df.index = df["Bin Id"]

div[
"QualityScore"
Expand All @@ -63,9 +101,9 @@ def make_plots(bin_table):
data_frame=df,
y="Completeness",
x="Contamination",
color="phylum",
size="Genome size (Mbp)",
hover_data=["genus"],
color=lineage_name,
size=size_name,
hover_data=hover_data,
hover_name="Bin Id",
)
fig.update_yaxes(range=(50, 102))
Expand All @@ -75,10 +113,10 @@ def make_plots(bin_table):
## By sample
fig = px.strip(
data_frame=df,
y="Quality Score",
y="Quality_score",
x="Sample",
color="phylum",
hover_data=["genus"],
color=lineage_name,
hover_data=hover_data,
hover_name="Bin Id",
)
fig.update_yaxes(range=(50, 102))
Expand All @@ -87,9 +125,9 @@ def make_plots(bin_table):
# By Phylum
fig = px.strip(
data_frame=df,
y="Quality Score",
x="phylum",
hover_data=["genus"],
y="Quality_score",
x=lineage_name,
hover_data=hover_data,
hover_name="Bin Id",
)
fig.update_yaxes(range=(50, 102))
Expand Down
2 changes: 1 addition & 1 deletion atlas/workflow/report/template_bin_report.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

<h1>Bin Report for Binner {binner}</h1>

<p>Genome completeness and contamination, and taxonomy were estimated unsing CheckM.</p>
<p>Genome completeness and contamination, and taxonomy were estimated using CheckM2. </p>
<p>Bins might represent the same species assembled from different samples. During the De-replication step only the genome with the highest quality will be selected as the representative for the species/cluster.</p>
<p>For all the information see the table '{div[input_file]}'</p>

Expand Down
Loading

0 comments on commit c0b97a7

Please sign in to comment.