-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathHybPhyloMaker1_rawprocess.sh
359 lines (341 loc) · 14.8 KB
/
HybPhyloMaker1_rawprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=12:0:0
#PBS -l select=1:ncpus=2:mem=4gb:scratch_local=80gb
#PBS -j oe
#PBS -N HybPhyloMaker1_rawprocess
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -pe mthread 2
#$ -q mThC.q
#$ -l mres=4G,h_data=4G,h_vmem=4G
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker1_rawprocess
#$ -o HybPhyloMaker1_rawprocess.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * https://github.com/tomas-fer/HybPhyloMaker *
# * Script 01 - Raw data processing *
# * v.1.8.0b *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2025 *
# * [email protected] *
# * based on Weitemier et al. (2014), Applications in Plant Science 2(9): 1400042*
# ********************************************************************************
#Complete path and set configuration for selected location
#Detect where the job runs (MetaCentrum PBS, Hydra SGE, or a local machine),
#read settings.cfg, derive $path (data) and $source (HybSeqSource) and load software modules.
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker1 is running on MetaCentrum...\n"
#settings for MetaCentrum
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
#$server and $data come from settings.cfg; paths are absolute on MetaCentrum storage
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Move to scratch (node-local work space provided by PBS)
cd $SCRATCHDIR
#Add necessary modules
module add bowtie2-2.2.4
module add samtools
module add bam2fastq-1.1.0
module add trimmomatic-0.32
module add fastuniq-1.1
module add jdk-8
#module add perl-5.10.1
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker1 is running on Hydra...\n"
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
#paths are relative to the submission directory here
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir01
cd workdir01
#Add necessary modules
module load bioinformatics/bowtie2/2.2.9
module load bioinformatics/samtools/1.3
module load bioinformatics/bam2fastq/1.1.0
#module load bioinformatics/trimmomatic/0.33
module load bioinformatics/fastuniq/1.1
module load java/1.7
else
echo -e "\nHybPhyloMaker1 is running locally...\n"
#settings for local run
#set variables from settings.cfg; all required tools must already be on PATH
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir01
cd workdir01
fi
#Guard against clobbering previous results: refuse to run if a non-empty workdir01
#or an existing $path/20filtered output folder is found.
#$location is presumably set in settings.cfg; "1" appears to mean MetaCentrum - TODO confirm
if [[ ! $location == "1" ]]; then
#we are inside workdir01 here, so ../workdir01 refers to this directory itself
if [ "$(ls -A ../workdir01)" ]; then
echo -e "Directory 'workdir01' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n"
#rm -d only removes an empty directory; errors are silenced on purpose
rm -d ../workdir01/ 2>/dev/null
exit 3
else
if [ -d "$path/20filtered" ]; then
echo -e "Directory '$path/20filtered' already exists. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
fi
fi
else
#MetaCentrum runs in scratch, so only the output folder needs checking
if [ -d "$path/20filtered" ]; then
echo -e "Directory '$path/20filtered' already exists. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
fi
fi
#Write log
#Create the run log (HPM1.log) recording where and when the job ran and the main settings.
logname=HPM1
echo -e "HybPhyloMaker1: raw reads filtering and summary" > ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "Job run on MetaCentrum: $PBS_JOBID" >> ${logname}.log
echo -e "From: $PBS_O_HOST" >> ${logname}.log
echo -e "Host: $HOSTNAME" >> ${logname}.log
echo -e "$PBS_NUM_NODES node(s) with $PBS_NCPUS core(s)" >> ${logname}.log
#Convert requested memory to MB and GB ($PBS_RESC_MEM is presumably in bytes - TODO confirm)
memM=$(bc <<< "scale=2; $(echo $PBS_RESC_MEM) / 1024 / 1024 ")
memG=$(bc <<< "scale=2; $(echo $PBS_RESC_MEM) / 1024 / 1024 / 1024 ")
#awk prints 1 only when memG < 1, so memory is reported in Mb below 1 Gb and in Gb otherwise
if (( $(echo $memG 1 | awk '{if ($1 < $2) print 1;}') )); then
echo -e "Memory: $memM Mb" >> ${logname}.log
else
echo -e "Memory: $memG Gb" >> ${logname}.log
fi
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "run on Hydra: $HOSTNAME" >> ${logname}.log
else
echo -e "local run: "`hostname`"/"`whoami` >> ${logname}.log
fi
echo -e "\nBegin:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
#Log selected settings from settings.cfg
echo -e "\nSettings" >> ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
#the unquoted backticks deliberately word-split so printf gets separate name/value fields
printf "%-25s %s\n" `echo -e "\nServer:\t$server"` >> ${logname}.log
fi
for set in data adapterfile; do
#${!set} is indirect expansion: the value of the variable whose name is stored in $set
printf "%-25s %s\n" `echo -e "${set}:\t" ${!set}` >> ${logname}.log
done
#Test data structure
#Verify that $path/10rawreads exists, that SamplesFileNames.txt lists one sample folder
#per line, and that every listed folder contains paired R1/R2 fastq.gz files.
#On any failure the (empty) workdir is removed and the script exits with status 3.
echo -en "Testing input data structure..."
if [ -d "$path/10rawreads" ]; then #Test if 10rawreads folder exists
if [ -f "$path/10rawreads/SamplesFileNames.txt" ]; then #Test if SamplesFileNames.txt exists
#Copy SamplesFileNames.txt and modify it
cp $path/10rawreads/SamplesFileNames.txt .
#Add LF at the end of last line in SamplesFileNames.txt if missing
sed -i.bak '$a\' SamplesFileNames.txt
#Delete empty lines from SamplesFileNames.txt (if any)
sed -i.bak2 '/^$/d' SamplesFileNames.txt
for sample in $(cat SamplesFileNames.txt); do
if [ ! -d "$path/10rawreads/$sample" ]; then #Test if each samples-specific folder exists
echo -e "Directory $sample does not exist.\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
else
#NOTE(review): the glob inside [ -f ... ] assumes exactly one R1/R2 pair per sample;
#multiple matching files would make the test fail with a syntax error - verify inputs
if [ ! -f "$path/10rawreads/$sample/${sample}_"*"R1"*".fastq.gz" ] || [ ! -f "$path/10rawreads/$sample/${sample}_"*"R2"*".fastq.gz" ]; then #Test if FASTQ.gz files exist
echo -e "Proper fastq.gz files missing in $sample folder...\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
fi
fi
done
else
#fix: use 'echo -e' so the trailing '\n' is interpreted (plain 'echo' printed it literally,
#unlike every other error message in this script)
echo -e "List of samples (SamplesFileNames.txt) is missing. Should be in 10rawreads...\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
fi
else
echo -e "Folder 10rawreads does not exist within your homedir.\n"
rm -d ../workdir01/ 2>/dev/null
exit 3
fi
echo -e "OK for running HybPhyloMaker...\n"
#Copy raw reads folders and sample list to scratch/workdir
cp -r $path/10rawreads/* .
#Copy necessary files (helper script, PhiX reference, adapter file, Trimmomatic jar)
cp $source/fastq2fasta.pl .
cp $source/PhiX.fsa .
cp $source/${adapterfile} .
cp $source/trimmomatic-0.33.jar .
#Make a new folder for results
mkdir $path/20filtered
#Create a reference PhiX index (for filtering out Illumina PhiX spike-in reads)
echo -en "Creating PhiX reference..."
bowtie2-build PhiX.fsa phiX.index 1>buildPhiX.log
#Add LF at the end of last line in SamplesFileNames.txt if missing
#(the normalization must be repeated: 'cp -r' above overwrote the cleaned copy with the original list)
sed -i.bak '$a\' SamplesFileNames.txt
#Delete empty lines from SamplesFileNames.txt (if any)
sed -i.bak2 '/^$/d' SamplesFileNames.txt
#A loop to process all samples in folders named as specified in SamplesFileNames.txt
#For each sample: unzip raw reads, remove PhiX with bowtie2/samtools/bam2fastq,
#adapter/quality trim with Trimmomatic, deduplicate paired reads with FastUniq,
#then compress results and copy them to $path/20filtered/<sample>.
numberfiles=$(cat SamplesFileNames.txt | wc -l)
calculating=0
for file in $(cat SamplesFileNames.txt)
do
calculating=$((calculating + 1))
echo -e "\nProcessing sample $file ($calculating out of $numberfiles)"
#Make a new folder for results
mkdir $path/20filtered/$file
#Go to sample folder
cd $file
#Unzip all fastq.gz files (and store only unzipped files)
echo "Unzipping..."
gunzip *.fastq.gz
#Get variables (read1 and read2) from list of fastq files in the folder
#(relies on alphabetical sort order placing the R1 file before the R2 file)
read1=$(ls *.fastq | sed -n 1p)
read2=$(ls *.fastq | sed -n 2p)
#Map the Hyb-Seq reads of each accession to the phiX.index
echo "Removing PhiX..."
bowtie2 -x ../phiX.index -1 $read1 -2 $read2 -S $file.sam >bowtie2.log 2>&1
rm *.fastq
#Convert .sam (resulting file of Bowtie 2) to .bam with SAMtools
samtools view -bT ../PhiX.fsa $file.sam > $file.bam 2>/dev/null
rm $file.sam
#Remove the PhiX reads; bam2fastq replaces '#' in -o with the read number,
#producing $file-noPhiX_1.fq and $file-noPhiX_2.fq
bam2fastq -o $file-noPhiX#.fq --no-aligned $file.bam >PhiX-removal.log 2>&1
rm $file.bam
#The resulting files need to be compressed for the next step of quality trimming
gzip $file-noPhiX_1.fq
gzip $file-noPhiX_2.fq
#Adapter and general quality trimming
#Trimmomatic PE produces paired surviving (-1P/-2P) and unpaired "orphan" (-1U/-2U) reads;
#which java/jar invocation is used depends on $location (1=MetaCentrum, 2=Hydra, else local)
echo "Adapter removal & quality trimming..."
if [[ $location == "1" ]]; then
java -jar /software/trimmomatic-0.32/dist/jar/trimmomatic-0.32.jar PE -phred33 $file-noPhiX_1.fq.gz $file-noPhiX_2.fq.gz $file-1P $file-1U $file-2P $file-2U ILLUMINACLIP:../${adapterfile}:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:20 MINLEN:36 >Trimmomatic.log 2>&1
#trimmomatic PE -phred33 $file-noPhiX_1.fq.gz $file-noPhiX_2.fq.gz $file-1P $file-1U $file-2P $file-2U ILLUMINACLIP:../NEBNext-PE.fa:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:20 MINLEN:36 >Trimmomatic.log 2>&1
elif [[ $location == "2" ]]; then
java -d64 -server -XX:MaxHeapSize=4g -jar ../trimmomatic-0.33.jar PE -phred33 $file-noPhiX_1.fq.gz $file-noPhiX_2.fq.gz $file-1P $file-1U $file-2P $file-2U ILLUMINACLIP:../${adapterfile}:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:20 MINLEN:36 >Trimmomatic.log 2>&1
else
java -jar ../trimmomatic-0.33.jar PE -phred33 $file-noPhiX_1.fq.gz $file-noPhiX_2.fq.gz $file-1P $file-1U $file-2P $file-2U ILLUMINACLIP:../${adapterfile}:2:30:10 LEADING:20 TRAILING:20 SLIDINGWINDOW:5:20 MINLEN:36 >Trimmomatic.log 2>&1
fi
#Convert .fastq files to .fasta (disabled legacy fastx_collapser-based workflow)
# perl ../fastq2fasta.pl -a $file-1P
# perl ../fastq2fasta.pl -a $file-1U
# perl ../fastq2fasta.pl -a $file-2P
# perl ../fastq2fasta.pl -a $file-2U
rm $file-noPhiX_1.fq.gz
rm $file-noPhiX_2.fq.gz
#Combine the four Trimmomatic output files in one file
# cat $file-1P.fa $file-1U.fa $file-2P.fa $file-2U.fa > $file-all.fa
# rm $file-1P* $file-1U* $file-2P* $file-2U*
#Remove duplicate reads
echo "Removing duplicates..."
# fastx_collapser -v -i $file-all.fa -o $file-all-no-dups.fas >duplicate-removal.log 2>&1
#list the two paired files ($file-1P and $file-2P) as FastUniq input
ls $file-?P > fastqlist.txt
fastuniq -i fastqlist.txt -t q -o ${file}-1P_no-dups.fastq -p ${file}-2P_no-dups.fastq
#Count number of duplicates (a FASTQ record is 4 lines, hence the division by 4)
#number of unpaired reads - UNFINISHED!!!
u1=`expr $(cat ${file}-1U | wc -l) / 4`
u2=`expr $(cat ${file}-2U | wc -l) / 4`
#paired-read number before duplicate removal
a=`expr $(cat ${file}-1P | wc -l) / 4`
echo "Nr. of reads before duplicate removal:" >> duplicate-removal-fastuniq.log
before=`expr $a + $a + $u1 + $u2`
echo $before >> duplicate-removal-fastuniq.log
#paired-read number after duplicate removal (this log is parsed by the summary step later)
echo "Nr. of reads after duplicate removal:" >> duplicate-removal-fastuniq.log
b=`expr $(cat ${file}-1P_no-dups.fastq | wc -l) / 4`
after=`expr $b + $b + $u1 + $u2`
echo $after >> duplicate-removal-fastuniq.log
#gzip trimmed and deduplicated reads
gzip ${file}-1P
gzip ${file}-1U
gzip ${file}-2P
gzip ${file}-2U
gzip ${file}-1P_no-dups.fastq
gzip ${file}-2P_no-dups.fastq
#Copy results from scratch to storage
#(on MetaCentrum $path is absolute; elsewhere it is relative to workdir01's parent, hence ../)
if [[ $location == "1" ]]; then
cp ${file}-1P_no-dups.fastq.gz ${path}/20filtered/${file}
cp ${file}-2P_no-dups.fastq.gz ${path}/20filtered/${file}
cp ${file}-1P.gz ${path}/20filtered/${file}
cp ${file}-1U.gz ${path}/20filtered/${file}
cp ${file}-2P.gz ${path}/20filtered/${file}
cp ${file}-2U.gz ${path}/20filtered/${file}
# cp $file-1U ${path}/20filtered/${file}
# cp $file-2U ${path}/20filtered/${file}
# cp $file-all-no-dups.fas ${path}/20filtered/${file}
# cp $file-all.fa ${path}/20filtered/${file}
cp *.log $path/20filtered/$file
else
cp ${file}-1P_no-dups.fastq.gz ../${path}/20filtered/${file}
cp ${file}-2P_no-dups.fastq.gz ../${path}/20filtered/${file}
cp ${file}-1P.gz ../${path}/20filtered/${file}
cp ${file}-1U.gz ../${path}/20filtered/${file}
cp ${file}-2P.gz ../${path}/20filtered/${file}
cp ${file}-2U.gz ../${path}/20filtered/${file}
# cp $file-1U ../${path}/20filtered/${file}
# cp $file-2U ../${path}/20filtered/${file}
# cp $file-all-no-dups.fas ../${path}/20filtered/${file}
# cp $file-all.fa ../$path/20filtered/$file
cp *.log ../$path/20filtered/$file
fi
# rm $file-all-no-dups.fas $file-all.fa
#NOTE(review): these renames give gzipped files a plain .fastq name, and the renamed files
#are never copied out (the sample folder is deleted during the summary step) - verify intent
mv ${file}-1U.gz ${file}-1U_no-dups.fastq
mv ${file}-2U.gz ${file}-2U_no-dups.fastq
cd ..
done
echo -e "\nGenerating file for download and for import to Geneious...\n"
#Collect all deduplicated *no-dups.fastq.gz files from the sample subfolders into
#'for_Geneious' (for easy import to Geneious)
mkdir $path/20filtered/for_Geneious
find $path/20filtered/ -name '*no-dups.fastq.gz' -exec cp -t $path/20filtered/for_Geneious/ {} +
#Pack and combine the collected files into one archive for easy download
#(-C changes into the folder so the archive contains only relative names)
tar cfz $path/20filtered/for_Geneious/$data-no-dups.tar.gz -C $path/20filtered/for_Geneious/ . 2>/dev/null
#Delete copies of fastq files in 'for Geneious' folder (only the tar.gz archive is kept)
rm $path/20filtered/for_Geneious/*.fastq.gz
#Summary of basic reads processing (nr. reads, PhiX filtering, quality trimming, duplicate removal)
#Produce tab-separated table
echo -e "Creating summary table..."
#Write headers (number of pairs, number of reads, number of reads after PhiX removal, percentage od PhiX reads,
#quality trimming - both surviving, forward only surviving, reverse only surviving, reads after quality trimming,
#percentage of trimmed reads, reads after duplicate removal, percentage of duplicated reads)
echo -e "Sample no.\tGenus\tSpecies\tNr. of pairs\tNr. of reads\tNr. of reads without PhiX\t% PhiX reads\tBoth surviving\tForward only surviving\tReverse only surviving\tNr. reads after quality trimming\t% quality trimmed reads\tNr. reads without duplicates\t% duplicates" > reads_summary.txt
#Per-sample summary: parse the log files left in each sample's work folder and append
#one tab-separated row per sample to reads_summary.txt; each folder is deleted afterwards.
cat SamplesFileNames.txt | while read line
do
echo "Processing $line"
#Sample names are apparently expected to be of the form Genus-species_number - TODO confirm
number=$(cut -d'_' -f2 <<< $line)
genus=$(cut -d'-' -f1 <<< $line)
species=$(cut -d'_' -f1 <<< $line | cut -d'-' -f2)
cd $line
#Number of pairs: first number on first line in bowtie2.log
npairs=`cat bowtie2.log | head -n 1 | cut -d' ' -f1`
#Number of reads: first number on third line in PhiX-removal.log
nreads=`cat PhiX-removal.log | head -n 3 | tail -n 1 | cut -d' ' -f1`
#Number of reads after removal of PhiX reads: first number on fourth line in PhiX-removal.log
withoutPhiX=`cat PhiX-removal.log | head -n 4 | tail -n 1 | cut -d' ' -f1`
#Percentage of PhiX reads: first number on last line in bowtie2.log
percPhiX=`cat bowtie2.log | tail -n 1 | cut -d'%' -f1`
#Number of both surviving reads after Trimmomatic (positional fields of the summary line)
bs=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f7`
#Number of forward only surviving reads after Trimmomatic
fos=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f12`
#Number of reverse only surviving reads after Trimmomatic
ros=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f17`
# Counting number of reads after quality trimming (surviving pairs count twice)
let aftertrimming="(2 * $bs) + $fos + $ros"
#Calculate percentage of quality trimmed reads (bc used for fixed-point arithmetic)
perctrimm=$(echo -e "scale=4;100 - 100 * ($aftertrimming / $withoutPhiX)" | bc)
#Number of reads after duplicate removal (last line of the log written by the main loop)
withoutdups=`cat duplicate-removal-fastuniq.log | tail -n 1`
#withoutdups=`cat duplicate-removal.log | head -n 2 | tail -n 1 | cut -d' ' -f2`
#Calculate percentage of duplicate reads removed
percdups=$(echo -e "scale=4;100 - 100 * ($withoutdups / $aftertrimming)" | bc)
cd ..
echo -e "$number\t$genus\t$species\t$npairs\t$nreads\t$withoutPhiX\t$percPhiX\t$bs\t$fos\t$ros\t$aftertrimming\t$perctrimm\t$withoutdups\t$percdups" >> reads_summary.txt
#Delete the processed sample folder from the work directory
rm -r $line
done
#Copy the final summary table to the permanent storage
cp reads_summary.txt $path/20filtered/
#Close the run log with a timestamp and copy it to the permanent storage as well
echo -e "\nEnd:" $(date '+%A %d-%m-%Y %X') >> ${logname}.log
cp ${logname}.log $path/20filtered/
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
	#MetaCentrum run: empty the scratch directory
	#(guarded so an unset/empty $SCRATCHDIR cannot expand the glob to '/*')
	if [[ -n $SCRATCHDIR ]]; then
		rm -rf $SCRATCHDIR/*
	fi
else
	#Hydra/local run: step out of and remove the working directory
	cd ..
	rm -r workdir01
fi
echo -e "\nScript HybPhyloMaker1 finished...\n"