Skip to content

Commit

Permalink
Genome dereplication using skani (#663)
Browse files Browse the repository at this point in the history
* add_skani

* add target_rule

* skani run

* added skani 2 parquet

* skani2 parquet tested

* species clustering with skani

* add info about translation table to cluter species

* rename genomes with skani

* drop dRep

* new bin report ugly

* formating

* use upercase names as in checkm2

* use upper case for quality headers

* fix upercase

* fix report

* formating

* remove drep option from config

* make pandas2 ready

* formatig

* make output of binning, genomes and genecatalog temp

* recalibrate quality

* formating

* move raw bin_finfo to subfolder

* path in raw bins
  • Loading branch information
SilasK authored Jun 15, 2023
1 parent a119b0a commit da97853
Show file tree
Hide file tree
Showing 21 changed files with 770 additions and 406 deletions.
17 changes: 9 additions & 8 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ include: "rules/download.smk" # contains hard coded variables
include: "rules/qc.smk"
include: "rules/assemble.smk"
include: "rules/binning.smk"
include: "rules/derep.smk"
include: "rules/genomes.smk"
include: "rules/dram.smk"
include: "rules/genecatalog.smk"
Expand Down Expand Up @@ -290,7 +291,7 @@ rule genecatalog:
"Genecatalog/clustering/orf_info.parquet",
get_gene_catalog_input(),
output:
touch("finished_genecatalog"),
temp(touch("finished_genecatalog")),


rule strains:
Expand Down Expand Up @@ -331,7 +332,7 @@ rule genomes:
*get_genome_annotations(),
"finished_binning",
output:
touch("finished_genomes"),
temp(touch("finished_genomes")),


rule quantify_genomes:
Expand All @@ -342,15 +343,15 @@ rule quantify_genomes:

rule binning:
input:
expand(
"{sample}/binning/{binner}/cluster_attribution.tsv",
binner=config["final_binner"],
sample=SAMPLES,
"Binning/{binner}/genome_similarities.parquet".format(
binner=config["final_binner"]
),
expand("Binning/{binner}/report.html", binner=config["final_binner"]),
"Binning/{binner}/bins2species.tsv".format(binner=config["final_binner"]),
"Binning/{binner}/bin_info.tsv".format(binner=config["final_binner"]),
expand("reports/bin_report_{binner}.html", binner=config["final_binner"]),
"finished_assembly",
output:
touch("finished_binning"),
temp(touch("finished_binning")),


rule assembly_one_sample:
Expand Down
7 changes: 0 additions & 7 deletions workflow/config/template_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,6 @@ gunc_database: "progenomes" # 'progenomes' or 'gtdb'
genome_dereplication:
ANI: 0.95 ## Genome dreplication threshold 0.95 is more or less species
overlap: 0.2 # See more on https://drep.readthedocs.io/en/latest/module_descriptions.html
greedy_clustering: "auto" # Add options for greedy clustering 'auto' when using more than 5k bins
score:
completeness: 1
contamination: 5
N50: 0.5
length: 0
centrality: 1

rename_mags_contigs: true #Rename contigs of representative MAGs

Expand Down
11 changes: 0 additions & 11 deletions workflow/envs/dRep.yaml

This file was deleted.

6 changes: 6 additions & 0 deletions workflow/envs/skani.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- skani=0.1
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@ channels:
- bioconda
- defaults
dependencies:
- galah
- python=3.11
- pandas=2
- networkx=3.1
- scipy=1.10
110 changes: 64 additions & 46 deletions workflow/report/bin_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,58 +41,58 @@ def handle_exception(exc_type, exc_value, exc_traceback):
from utils.taxonomy import tax2table


def make_plots(bin_table):
def make_plots(bin_info):
div = {}

div["input_file"] = bin_table
div["input_file"] = f"{bin_info} and {snakemake.input.bins2species}"

# Prepare data
df = pd.read_table(bin_table)
df = pd.read_table(bin_info, index_col=0)
df["Bin Id"] = df.index # need it also as column

if snakemake.config["bin_quality_asesser"].lower() == "busco":
df["Bin Id"] = df["Input_file"].str.replace(".fasta", "", regex=False)
# add species info
bin2species = pd.read_table(snakemake.input.bins2species, index_col=0)
df = df.join(bin2species)

logging.info("No taxonomic information available, use busco Dataset")
logging.info(df.head())

lineage_name = "Dataset"
hover_data = [
"Scores_archaea_odb10",
"Scores_bacteria_odb10",
"Scores_eukaryota_odb10",
]
size_name = None
logging.info(bin2species.head())

elif snakemake.config["bin_quality_asesser"].lower() == "checkm":
df = df.join(
tax2table(df["Taxonomy (contained)"], remove_prefix=True).fillna("NA")
)

lineage_name = "phylum"
size_name = "Genome size (Mbp)"
hover_data = ["genus"]
# calculate number of genomes/bins
st = pd.DataFrame(columns=["Bins", "Species"])

elif snakemake.config["bin_quality_asesser"].lower() == "checkm2":
df["Bin Id"] = df.index
def add_stats(name, d):
st.loc[name, "Bins"] = d.shape[0]
st.loc[name, "Species"] = d.Representative.unique().shape[0]

lineage_name = "Translation_Table_Used"
hover_data = [
"Completeness_Model_Used",
"Coding_Density",
"Contig_N50",
"GC_Content",
"Additional_Notes",
]
size_name = "Genome_Size"
else:
raise Exception(f"bin_quality_asesser in the config file not understood")

df.index = df["Bin Id"]
add_stats("All", df)

df.eval("Quality_score = Completeness - 5* Contamination", inplace=True)
div[
"QualityScore"
] = "<p>Quality score is calculated as: Completeness - 5 x Contamination.</p>"
add_stats("Quality score >50 ", df.query("Quality_score>50"))
add_stats("Good quality", df.query("Completeness>90 & Contamination <5"))
add_stats("Quality score >90 ", df.query("Quality_score>90"))

div["table"] = st.to_html()

logging.info(df.describe())

# Bin Id Completeness completeness_general Contamination completeness_specific completeness_model_used translation_table_used coding_density contig_n50 average_gene_length genome_size gc_content total_coding_sequences additional_notes quality_score sample Ambigious_bases Length_contigs Length_scaffolds N50 N_contigs N_scaffolds logN50
hover_data = [
"Completeness_Model_Used",
"Coding_Density",
"N50",
"GC_Content",
]
size_name = "Genome_Size"

lineage_name = "Species"

# 2D plot

logging.info("make 2d plot")
fig = px.scatter(
data_frame=df,
y="Completeness",
Expand All @@ -106,36 +106,54 @@ def make_plots(bin_table):
fig.update_xaxes(range=(-0.2, 10.1))
div["2D"] = fig.to_html(**HTML_PARAMS)

## By sample
fig = px.strip(
data_frame=df,
y="Quality_score",
x="Sample",
# 2D plot

logging.info("make 2d plot species")
fig = px.scatter(
data_frame=df.loc[df.Representative.unique()],
y="Completeness",
x="Contamination",
color=lineage_name,
size=size_name,
hover_data=hover_data,
hover_name="Bin Id",
)
fig.update_yaxes(range=(50, 102))
div["bySample"] = fig.to_html(**HTML_PARAMS)
fig.update_xaxes(range=(-0.2, 10.1))
div["2Dsp"] = fig.to_html(**HTML_PARAMS)

# By Phylum
## By sample
logging.info("plot by sample")
fig = px.strip(
data_frame=df,
y="Quality_score",
x=lineage_name,
x="Sample",
color=lineage_name,
hover_data=hover_data,
hover_name="Bin Id",
)
fig.update_yaxes(range=(50, 102))
div["byPhylum"] = fig.to_html(**HTML_PARAMS)
div["bySample"] = fig.to_html(**HTML_PARAMS)

# # By species
# logging.info("plot by species")
# fig = px.strip(
# data_frame=df,
# y="Quality_score",
# x=lineage_name,
# hover_data=hover_data,
# hover_name="Bin Id",
# )
# fig.update_yaxes(range=(50, 102))
# div["byPhylum"] = fig.to_html(**HTML_PARAMS)

return div


# main


div = make_plots(bin_table=snakemake.input.bin_table)
div = make_plots(bin_info=snakemake.input.bin_info)


make_html(
Expand Down
52 changes: 32 additions & 20 deletions workflow/report/template_bin_report.html
Original file line number Diff line number Diff line change
@@ -1,40 +1,52 @@
<html>
<head>

<title>{binner} Metagenome-Atlas - Bin Report</title>
<head>

<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<title>{binner} Metagenome-Atlas - Bin Report</title>

<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css">
<style type="text/css">
{css_content}
</style>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>

</head>
<body>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css">
<style type="text/css">
{css_content}
</style>

</head>

<body>


<div class="container" id="notebook-container">

<h1>Bin Report for Binner {binner}</h1>

<p>Genome completeness and contamination, and taxonomy were estimated unsing CheckM2. </p>
<p>Bins might represent the same species assembled from different samples. During the De-replication step only the gneome with the highest quality will be selected as representative for the species/cluster.</p>
<p>For all the information see the table '{div[input_file]}'</p>
{div[QualityScore]}
<p>For all the information see the file {div[input_file]}</p>

<h2>Number of genomes<h/2></h>
{div[table]}

<p>"Good quality" refers to the standard of Completeness &gt; 90&percnt; and Contamination &lt; 5&percnt;. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score &gt; 90. </p>

<h2>Quality for all bins<h/2>
{div[2D]}


<h2>Quality for Species representatives<h/2>
{div[2Dsp]}



{div[2D]}
<h2>Quality score by Sample <h/2>



<h2>Quality score by Sample and phylum<h/2>
{div[bySample]}

{div[QualityScore]}

{div[bySample]}

{div[byPhylum]}

</div>
</body>

</div>
</body>
</html>
</html>
Loading

0 comments on commit da97853

Please sign in to comment.