Skip to content

Commit

Permalink
updated formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
tpall committed Oct 5, 2020
1 parent 596cb9c commit e3c4785
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 270 deletions.
44 changes: 24 additions & 20 deletions rules/assembly.smk
Original file line number Diff line number Diff line change
@@ -1,44 +1,48 @@

rule concatenate:
input:
rules.merge.output.out, rules.qtrim.output.out
rules.merge.output.out,
rules.qtrim.output.out,
output:
temp("output/{group}/{run}/concatenated.fq.gz")
temp("output/{group}/{run}/concatenated.fq.gz"),
resources:
runtime = 120,
mem_mb = 4000
runtime=120,
mem_mb=4000,
shell:
"cat {input} > {output}"


rule assembly:
input:
se = lambda wildcards: expand("output/{{group}}/{run}/concatenated.fq.gz", run = groups[wildcards.group])
output:
contigs = temp("output/{group}/assembly/final.contigs.fa")
input:
se=lambda wildcards: expand(
"output/{{group}}/{run}/concatenated.fq.gz", run=groups[wildcards.group]
),
output:
contigs=temp("output/{group}/assembly/final.contigs.fa"),
params:
extra = lambda wildcards, resources: f"--presets meta-large --min-contig-len 1000 --verbose -m {resources.mem_mb * 1048576}"
extra=(
lambda wildcards, resources: f"--presets meta-large --min-contig-len 1000 --verbose -m {resources.mem_mb * 1048576}"
),
threads: 8
log:
"output/{group}/log/assembly.log"
shadow:
log:
"output/{group}/log/assembly.log",
shadow:
"minimal"
resources:
runtime = lambda wildcards, input: round(600 + 0.06 * input.size_mb),
mem_mb = lambda wildcards, input: round(20000 + 2.22 * input.size_mb)
runtime=lambda wildcards, input: round(600 + 0.06 * input.size_mb),
mem_mb=lambda wildcards, input: round(20000 + 2.22 * input.size_mb),
wrapper:
f"{WRAPPER_PREFIX}/v0.2/assembly/megahit"
f"{WRAPPER_PREFIX}/v0.2/assembly/megahit"


rule fix_fasta:
input:
rules.assembly.output.contigs
input:
rules.assembly.output.contigs,
output:
"output/{group}/contigs-fixed.fa"
"output/{group}/contigs-fixed.fa",
params:
lambda wildcards: wildcards.group
lambda wildcards: wildcards.group,
conda:
f"{WRAPPER_PREFIX}/v0.2/subset_fasta/environment.yaml"
script:
"../scripts/fix_fasta.py"

173 changes: 92 additions & 81 deletions rules/blast.smk
Original file line number Diff line number Diff line change
@@ -1,185 +1,196 @@

RANKS_OF_INTEREST = ["superkingdom", "order", "family", "genus", "species"]
VIRUSES_TAXID = 10239


# Creates to required outputs viruses.taxids and negative.taxids.
# Output directory can be changed.
# Additional negative taxids (all listed taxids except viruses) can be added via params.
# Shadow=full ensures that only required outputs will be saved.
# Shadow=full ensures that only required outputs will be saved.
rule get_virus_taxids:
output:
f"output/blast/{VIRUSES_TAXID}.taxids"
output:
f"output/blast/{VIRUSES_TAXID}.taxids",
params:
taxid = VIRUSES_TAXID
taxid=VIRUSES_TAXID,
conda:
f"{WRAPPER_PREFIX}/v0.2/blast/query/environment.yaml"
resources:
runtime = 120
runtime=120,
shell:
"(get_species_taxids.sh -t {params.taxid} > {output} || true)"
"(get_species_taxids.sh -t {params.taxid} > {output} || true)"


rule megablast_virus:
input:
query = "output/{group}/splits/repmaskedgood_{n}.fa",
taxidlist = rules.get_virus_taxids.output[0]
query="output/{group}/splits/repmaskedgood_{n}.fa",
taxidlist=rules.get_virus_taxids.output[0],
output:
out = temp("output/{group}/splits/megablast-virus_{n}.tsv")
out=temp("output/{group}/splits/megablast-virus_{n}.tsv"),
params:
program = "blastn",
task = "megablast",
db = "nt_v5",
word_size = 16,
evalue = 1e-6,
outfmt = "'6 qseqid sacc staxid pident length qstart qend sstart send evalue'"
program="blastn",
task="megablast",
db="nt_v5",
word_size=16,
evalue=1e-6,
outfmt="'6 qseqid sacc staxid pident length qstart qend sstart send evalue'",
threads: 8
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 26000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=26000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/query"


# Filter blastn hits for the cutoff value.
rule parse_megablast_virus:
input:
query = rules.megablast_virus.input.query,
blast_result = rules.megablast_virus.output.out
query=rules.megablast_virus.input.query,
blast_result=rules.megablast_virus.output.out,
output:
mapped = temp("output/{group}/splits/megablast-virus_{n}_mapped.tsv"),
unmapped = temp("output/{group}/splits/megablast-virus_{n}_unmapped.fa")
mapped=temp("output/{group}/splits/megablast-virus_{n}_mapped.tsv"),
unmapped=temp("output/{group}/splits/megablast-virus_{n}_unmapped.fa"),
params:
e_cutoff = 1e-6,
outfmt = rules.megablast_virus.params.outfmt
e_cutoff=1e-6,
outfmt=rules.megablast_virus.params.outfmt,
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 4000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=4000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/parse"


# Blastn, megablast and blastx input, output, and params keys must match commandline blast option names.
# Blastn, megablast and blastx input, output, and params keys must match commandline blast option names.
# Please see https://www.ncbi.nlm.nih.gov/books/NBK279684/#appendices.Options_for_the_commandline_a for all available options.
# Blast against nt virus database.
rule blastn_virus:
input:
query = rules.parse_megablast_virus.output.unmapped,
taxidlist = "output/blast/10239.taxids"
query=rules.parse_megablast_virus.output.unmapped,
taxidlist="output/blast/10239.taxids",
output:
out = temp("output/{group}/splits/blastn-virus_{n}.tsv")
out=temp("output/{group}/splits/blastn-virus_{n}.tsv"),
params:
program = "blastn",
db = "nt_v5",
word_size = 11,
evalue = 1e-6,
outfmt = rules.megablast_virus.params.outfmt
program="blastn",
db="nt_v5",
word_size=11,
evalue=1e-6,
outfmt=rules.megablast_virus.params.outfmt,
threads: 4
resources:
runtime = lambda wildcards, attempt: 840 + (attempt * 120),
mem_mb = 26000
runtime=lambda wildcards, attempt: 840 + (attempt * 120),
mem_mb=26000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/query"


# Filter blastn hits for the cutoff value.
rule parse_blastn_virus:
input:
query = rules.blastn_virus.input.query,
blast_result = rules.blastn_virus.output.out
query=rules.blastn_virus.input.query,
blast_result=rules.blastn_virus.output.out,
output:
mapped = temp("output/{group}/splits/blastn-virus_{n}_mapped.tsv"),
unmapped = temp("output/{group}/splits/blastn-virus_{n}_unmapped.fa")
mapped=temp("output/{group}/splits/blastn-virus_{n}_mapped.tsv"),
unmapped=temp("output/{group}/splits/blastn-virus_{n}_unmapped.fa"),
params:
e_cutoff = 1e-6,
outfmt = rules.megablast_virus.params.outfmt
e_cutoff=1e-6,
outfmt=rules.megablast_virus.params.outfmt,
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 4000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=4000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/parse"


rule megablast_nt:
input:
query = rules.parse_blastn_virus.output.unmapped
query=rules.parse_blastn_virus.output.unmapped,
output:
out = temp("output/{group}/splits/megablast-nt_{n}.tsv")
out=temp("output/{group}/splits/megablast-nt_{n}.tsv"),
params:
program = "blastn",
task = "megablast",
db = "nt_v5",
word_size = 16,
evalue = 1e-6,
outfmt = rules.megablast_virus.params.outfmt
program="blastn",
task="megablast",
db="nt_v5",
word_size=16,
evalue=1e-6,
outfmt=rules.megablast_virus.params.outfmt,
threads: 8
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 96000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=96000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/query"


# Filter blastn hits for the cutoff value.
rule parse_megablast_nt:
input:
query = rules.megablast_nt.input.query,
blast_result = rules.megablast_nt.output.out
query=rules.megablast_nt.input.query,
blast_result=rules.megablast_nt.output.out,
output:
mapped = temp("output/{group}/splits/megablast-nt_{n}_mapped.tsv"),
unmapped = temp("output/{group}/splits/megablast-nt_{n}_unmapped.fa")
mapped=temp("output/{group}/splits/megablast-nt_{n}_mapped.tsv"),
unmapped=temp("output/{group}/splits/megablast-nt_{n}_unmapped.fa"),
params:
e_cutoff = 1e-6,
outfmt = rules.megablast_virus.params.outfmt
e_cutoff=1e-6,
outfmt=rules.megablast_virus.params.outfmt,
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 4000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=4000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/parse"


# Filter sequences by division id.
# Saves hits with division id
BLAST = ["megablast-virus", "blastn-virus", "megablast-nt"]


rule classify_all:
input:
expand("output/{{group}}/splits/{blastresult}_{{n}}_mapped.tsv", blastresult = BLAST)
expand(
"output/{{group}}/splits/{blastresult}_{{n}}_mapped.tsv", blastresult=BLAST
),
output:
temp("output/{group}/splits/all_{n}.csv")
temp("output/{group}/splits/all_{n}.csv"),
params:
pp_sway = 1,
ranks_of_interest = RANKS_OF_INTEREST,
dbfile = TAXON_DB
pp_sway=1,
ranks_of_interest=RANKS_OF_INTEREST,
dbfile=TAXON_DB,
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 8000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=8000,
wrapper:
f"{WRAPPER_PREFIX}/v0.2/blast/taxonomy"


# Split classification results into viruses and non-viral
rule filter_viruses:
input:
expand("output/{{group}}/splits/all_{n}.csv", n = N)
expand("output/{{group}}/splits/all_{n}.csv", n=N),
output:
viral = "output/{group}/viruses.csv",
non_viral = "output/{group}/non-viral.csv"
viral="output/{group}/viruses.csv",
non_viral="output/{group}/non-viral.csv",
params:
ranks = RANKS_OF_INTEREST
ranks=RANKS_OF_INTEREST,
resources:
runtime = lambda wildcards, attempt: attempt * 120,
mem_mb = 8000
runtime=lambda wildcards, attempt: attempt * 120,
mem_mb=8000,
run:
tab = concatenate_tables(input, sep = ",", cols_to_integer = params.ranks)
tab = concatenate_tables(input, sep=",", cols_to_integer=params.ranks)
mask = tab.superkingdom == VIRUSES_TAXID
mask = mask.fillna(False)
vir = tab[mask]
non_vir = tab[~mask]
vir.to_csv(output.viral, index = False)
non_vir.to_csv(output.non_viral, index = False)
vir.to_csv(output.viral, index=False)
non_vir.to_csv(output.non_viral, index=False)


# Merge unassigned sequences
rule merge_unassigned:
input:
expand("output/{{group}}/splits/{blastresult}_{n}_unmapped.fa", blastresult = BLAST, n = N)
expand(
"output/{{group}}/splits/{blastresult}_{n}_unmapped.fa",
blastresult=BLAST,
n=N,
),
output:
"output/{group}/unassigned.fa"
"output/{group}/unassigned.fa",
shell:
"cat {input} > {output}"
Loading

0 comments on commit e3c4785

Please sign in to comment.