-
Notifications
You must be signed in to change notification settings - Fork 0
/
jb_SSR_fq2SeqLengths.sh
executable file
·78 lines (61 loc) · 3.43 KB
/
jb_SSR_fq2SeqLengths.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/bin/bash -l
# Jimmy ([email protected]) - 20141002
# Take Merged FastQ file and split by F/R primer and create histogram of seq lengths
if [ "$#" != "2" ]; then
echo "Usage: jb_SSR_ngs_separatePrimerGrep.sh [Merged fastQ.gz] [primer list]"
exit 0
fi
## Primer list must be in the format:
## SampleName Forward_seq Reverse_seq Est_Amplicon_Size
## (separared by tabs)
# File IO and naming
file=$1
primers=$2
fname=$(basename $file .fastq.gz)
gammytools=/home/users/jbreen/personal_git/gammytools
fqCon=$gammytools/jb_fastq2fasta.py
plotH=$gammytools/jb_plot_histogram.py
# Create fasta from fastq and Remove newline in fastafile
zcat $file | python $fqCon | awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > "$fname"_new.fa
while read line
do
# Pipe in variables
Fseq=$(echo $line | awk '{print $2}')
Rseq_rev=$(echo $line | awk '{print $3}' | tr ACGT TGCA | rev)
#Rseq=$(echo $line | awk '{print $3}')
#Fseq_rev=$(echo $line | awk '{print $2}' | tr ACGT TGCA | rev)
name=$(echo $line | awk '{print $1}')
#regex=$(echo $line | awk '{print $4}')
# Look for 100% matches in Forward direction and remove primer sequence
grep "^"$Fseq".*"$Rseq_rev"$" "$fname"_new.fa | awk 'length($0) > 60' >> "$fname"_"$name".seq.list
#sed -e "s/^$Fseq//g" -e "s/$Rseq_rev$//g" >> "$fname"_"$name".seq.list
# Give a summary of the number of files found
echo -e "\n" Sample "$name" >> "$fname"_summary_list.txt
wc -l "$fname"_"$name".seq.list >> "$fname"_summary_list.txt
# Look for 100% matches in Reverse direction and remove primer sequence
#grep "^"$Rseq".*"$Fseq_rev"$" "$fname"_new.fa | \
# sed -e "s/^$Rseq//g" -e "s/$Fseq_rev$//g" | tr ACGT TGCA | rev >> "$fname"_"$name".seq.list
# Give a summary of the number of files found
#echo -e "\nTotal Sequences for "$name" (including Reverse Reads)" >> "$fname"_summary_list.txt
#wc -l "$fname"_"$name".seq.list >> "$fname"_summary_list.txt
# Remove sequences that dont contain the SSR
# awk -v ref="$regex" 'match($0, ref) {print $0}' "$fname"_"$name".seq.list > "$fname"_"$name".seqflt.list
# echo -e "\nTotal Sequences for "$name" that actually contain the SSR" >> "$fname"_summary_list.txt
# wc -l "$fname"_"$name".seqflt.list >> "$fname"_summary_list.txt
# Print fasta and align using clustal-omega
#awk '/^/{print ">seq"(++i)}!/>/' "$fname"_"$name".seqflt.list > "$fname"_"$name".fasta
awk '/^/{print ">seq"(++i)}!/>/' "$fname"_"$name".seq.list > "$fname"_"$name".fasta
clustalo -i "$fname"_"$name".fasta --output-order=tree-order | \
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' > "$fname"_"$name".aligned.fa
# Make length histogram
#awk '{print length}' "$fname"_"$name".seqflt.list | sort | \
awk '{print length}' "$fname"_"$name".seq.list | sort | \
uniq -c | sort -k2,2n -r | awk '{print $2"\t"$1}' > "$fname"_"$name".seq.list.histogram
# Plot histogram with separate code
python $plotH "$fname"_"$name".seq.list.histogram > "$fname"_"$name".seq.list.histogram.pdf
# Clean up after yourself
rm "$fname"_"$name".seq.list
rm "$fname"_"$name".seq.list.histogram
#rm "$fname"_"$name".seqflt.list
done < $primers
rm "$fname"_new.fa