#!/usr/bin/env nextflow
/**
* main.nf: Nextflow workflow for Pfitmap/GTDB
*
 * The GTDB genomes are expected to be downloaded and annotated beforehand.
 *
 * The workflow starts from a set of annotated genomes in the form of faa.gz files (--inputfaas)
 * and, optionally, gff.gz files (--inputgffs), plus a set of hmm profiles (--hmms). The protein
 * sequences are searched with HMMER using the hmm files, and each hit is subsequently classified
 * according to the profile it fits best. The classification uses a table describing the hierarchy
 * of hmm profiles (--profiles_hierarchy; see --help).
*
* Requirements:
* directory with faa.gz files
* directory with all hmm profiles to be run
* file describing the hmm profile hierarchy
*
* Processing steps:
* Concatenate all faa.gz files into a single one
* Optionally, concatenate all gff.gz files into a single one
* Perform an hmmsearch of all hmm profiles on all the proteomes
 *  Concatenate the provided metadata files for archaeal and bacterial genomes
 *  (--gtdb_arc_metadata, --gtdb_bac_metadata) into a single metadata file
* Classify the hits
*
*/
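/**
 * Example invocation, with illustrative paths that need to be adapted to your setup:
 *
 *   nextflow run main.nf \
 *     --inputfaas genomes/faas --inputgffs genomes/gffs \
 *     --hmms hmms --profiles_hierarchy profiles_hierarchy.tsv \
 *     --gtdb_arc_metadata arc_metadata.tsv --gtdb_bac_metadata bac_metadata.tsv \
 *     --outputdir results
 */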
// Parameters
params.help = false
params.inputfaas = null
params.inputgffs = null
params.hmms = null
params.profiles_hierarchy = null
params.dbsource = 'GTDB:GTDB:latest'
params.hmm_mincov = 0.7
params.gtdb_arc_metadata = null
params.gtdb_bac_metadata = null
params.outputdir = null
params.featherprefix = 'pfitmap-gtdb'
params.max_cpus = 2
params.max_time = "240.h"
def helpMessage() {
log.info """
Usage:
The typical command for running the pipeline is as follows:
nextflow run main.nf --inputfaas path/to/faa_dir --hmms path/to/hmm_dir --profiles_hierarchy path/to/file --gtdb_arc_metadata path/to/file --gtdb_bac_metadata path/to/file --outputdir path/to/results
Mandatory arguments:
--inputfaas path/to/faa_dir         Path of directory containing annotated genomes in the format faa.gz
--gtdb_bac_metadata path/to/file    Path of tsv file with the metadata for bacterial genomes
--gtdb_arc_metadata path/to/file    Path of tsv file with the metadata for archaeal genomes
--hmms path/to/hmm_dir              Path of directory with HMM profile files
--profiles_hierarchy path/to/file   Path of tsv file with hmm profile names and hierarchy information (see the README.md file for more details)
--outputdir path/to/results         Path of the results directory
Optional parameters:
--inputgffs path/to/gff_dir         Path of directory containing annotations in the format gff.gz; the table parts of all files are output as a single concatenated file
--hmm_mincov value                  Minimum covered fraction of the hmm profile/query required for a hit (default = 0.7)
--dbsource db:db:release            Database source in the format db:db:release, where db is the name of the database and release is the release number or name (default = GTDB:GTDB:latest)
--featherprefix prefix              Prefix for generated feather files (default = "pfitmap-gtdb")
--max_cpus value                    Maximum number of CPU cores to be used (default = 2)
--max_time value                    Maximum time per process (default = 240.h, i.e. 10 days)
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
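// Check that the mandatory parameters were given and are well formed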
if (! ( params.dbsource =~ /.+:.+:.+/ ) ) {
    error "Error in dbsource format\ndbsource should be in the format db:db:release\nSee more using --help"
}
if( !params.inputfaas ) {
error "Missing inputfaas parameter\n[Parameter error] Please specify the parameter --inputfaas\nSee more using --help"
}
if( !params.profiles_hierarchy ) {
error "Missing profiles_hierarchy parameter\n[Parameter error] Please specify the parameter --profiles_hierarchy\nSee more using --help"
}
if( !params.gtdb_arc_metadata ) {
error "Missing gtdb_arc_metadata parameter\n[Parameter error] Please specify the parameter --gtdb_arc_metadata\nSee more using --help"
}
if( !params.gtdb_bac_metadata ) {
    error "Missing gtdb_bac_metadata parameter\n[Parameter error] Please specify the parameter --gtdb_bac_metadata\nSee more using --help"
}
if( !params.hmms ) {
    error "Missing hmms parameter\n[Parameter error] Please specify the parameter --hmms\nSee more using --help"
}
if( !params.outputdir ) {
    error "Missing outputdir parameter\n[Parameter error] Please specify the parameter --outputdir\nSee more using --help"
}
// Create channels to start processing
genome_faas = Channel.fromPath(params.inputfaas, checkIfExists : true)
if ( params.inputgffs ) {
genome_gffs = Channel.fromPath(params.inputgffs, checkIfExists : true)
}
else {
genome_gffs = Channel.empty()
}
hmm_files = Channel.fromPath("$params.hmms/*.hmm")
profiles_hierarchy = Channel.fromPath(params.profiles_hierarchy, checkIfExists : true)
dbsource = Channel.value(params.dbsource)
hmm_mincov = Channel.value(params.hmm_mincov)
featherprefix = Channel.value(params.featherprefix)
gtdb_arc_metadata = Channel.fromPath(params.gtdb_arc_metadata, checkIfExists : true)
gtdb_bac_metadata = Channel.fromPath(params.gtdb_bac_metadata, checkIfExists : true)
results = params.outputdir
// Print a personalized error message when the workflow fails,
// e.g. when one of the input files is missing
workflow.onError {
    println ""
    println "---------------- Error Message -----------------"
    println " The workflow stopped with the following error:"
    println " ${workflow.errorReport}"
    println " If a file is reported missing, please verify that it exists"
    println "------------------------------------------------"
    println ""
}
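/**
 * Concatenate all faa.gz files into a single fasta file, appending the genome
 * accession (derived from the file name) to each sequence header, and record
 * the processed files in processed_genome_faas.txt.
 */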
process join_faas {
publishDir "$results/genomes", mode: "copy"
input:
file genome_dir from genome_faas
output:
file 'all_genomes.faa' into all_genome_faas_hmmsearch_ch
file 'all_genomes.faa' into all_genome_faas_classify_ch
file 'processed_genome_faas.txt' into processed_genome_faas_ch
script:
"""
for f in \$(find ${genome_dir}/ -name '*.faa.gz'); do
a=\$(basename \$f | sed 's/\\..*//')
echo "\$a: \$f" >> processed_genome_faas.txt
gunzip -c \$f | sed "/^>/s/\$/ [\$a]/" >> all_genomes.faa
done
"""
}
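/**
 * Concatenate the table parts of all gff.gz files into a single tab-separated
 * file, adding the genome accession as a last column.
 */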
process join_gffs {
publishDir "$results/genomes", mode: "copy"
input:
file genome_dir from genome_gffs
output:
file 'all_genomes.gff.gz' into all_genome_gffs_ch
script:
"""
echo "seqname source feature start end score strand frame attribute genome_accno" > all_genomes.gff
for f in \$(find ${genome_dir}/ -name '*.gff.gz'); do
a=\$(basename \$f | sed 's/\\..*//')
gunzip -c \$f | sed "/^/s/\$/ \$a/" | grep '\t' >> all_genomes.gff
done
gzip all_genomes.gff
"""
}
/**
* Run the hmmsearches.
*
* This is now called *once* but should be called once per hmm file. I haven't yet found
* a way of duplicating the channel with the faa file though.
*/
process hmmsearch {
publishDir "$results/hmmsearch", mode: 'copy'
cpus params.max_cpus
input:
file genome from all_genome_faas_hmmsearch_ch
file hmms from hmm_files.collect()
output:
file ("*.tblout") into tblout_ch
file ("*.domtblout") into domtblout_ch
file ("*.hmmout.gz") into hmmout_ch
shell:
"""
for h in *.hmm; do
    hmmsearch --cpu ${task.cpus} \\
        --tblout \$(basename \$h .hmm).tblout \\
        --domtblout \$(basename \$h .hmm).domtblout \\
        \$h $genome | gzip -c > \$(basename \$h .hmm).hmmout.gz
done
"""
}
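/**
 * Concatenate the archaeal and bacterial GTDB metadata files into a single
 * table, keeping the header line only from the archaeal file.
 */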
process getMetadata {
publishDir "$results/metadata", mode: 'copy'
input:
file arc_metadata from gtdb_arc_metadata
file bac_metadata from gtdb_bac_metadata
output:
file 'gtdb_metadata.tsv' into gtdbmetadata_ch
script:
"""
cat $arc_metadata > gtdb_metadata.tsv
grep -v '^accession' $bac_metadata >> gtdb_metadata.tsv
"""
}
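/**
 * Classify the hmmsearch hits with pf-classify.r, using the profile hierarchy
 * and the GTDB metadata, and write the results as a single tsv table plus
 * feather files.
 */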
process pfClassify {
publishDir "$results/classification", mode: 'copy'
input:
val hmm_mincov from hmm_mincov
val featherprefix from featherprefix
val dbsource from dbsource
file profiles_hierarchy from profiles_hierarchy
file "gtdb_metadata.tsv" from gtdbmetadata_ch
file tblouts from tblout_ch
file domtblouts from domtblout_ch
file genome_faas from all_genome_faas_classify_ch
output:
file "gtdb.tsv.gz" into gtdb_tsv_ch
file "gtdb.pf-classify.warnings.txt" into gtdb_classify_warnings_ch
file "missing_genomes.txt" into gtdb_classify_missing_ch
file "*.feather" into feather_files
script:
"""
pf-classify.r --hmm_mincov=${hmm_mincov} --dbsource=${dbsource} \\
    --gtdbmetadata=gtdb_metadata.tsv --profilehierarchies=$profiles_hierarchy \\
    --singletable=gtdb.tsv.gz --seqfaa=${genome_faas} --featherprefix=${featherprefix} \\
    --missing=missing_genomes.txt \\
    *.tblout *.domtblout > gtdb.pf-classify.warnings.txt 2>&1
"""
}
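// Print a summary message when the workflow finishes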
workflow.onComplete {
    println ""
    println "*** Workflow finished, status: ${ workflow.success ? 'OK' : 'failed' }. Make sure you check $results/classification/gtdb.pf-classify.warnings.txt. ***"
    println ""
}