-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind.dupes.bash
executable file
·132 lines (117 loc) · 3.79 KB
/
find.dupes.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/bin/bash
set -e
function usage {
echo "
Given a FastA file, identifies repetitive regions, and outputs a BED file
usage: `basename $0` -r reference.fasta -o maskedRegions.bed [-l <int>] [-i <float>] [-t <dir>]
-r [default: ./reference.fasta] FastA formatted input file
-o [default: ./maskedRegions.bed] BED formatted output file
-i [default: 99] Minimum percent identity required between two
repeat regions
-l [default: 249] Minimum repeat length to find (in bp)
-t [default: ./tmp/] Temporary directory where intermediate files
are written to and removed
Paths can be absolute or relative.
"
}
# Defaults
PERC_ID=99
LEN=249
OUTFILE="$PWD/maskedRegions.bed"
REF="$PWD/reference.fasta"
TMP="$PWD/tmp"
nopts=$#
for ((i=1 ; i <= nopts ; i++)); do
case "$1" in
-i | --percid)
PERC_ID="$2"
echo " min percent identity required: $2"
shift 2
;;
-l | --len)
LEN="$2"
echo " min length: $2"
shift 2
;;
-o | --out)
OUTFILE="$2"
echo " output file: $2"
shift 2
;;
-r | --ref)
REF=`readlink -f $2`
echo " reference genome: $2"
shift 2
;;
-t | --temp)
TMP="$2"
echo " temporary directory: $2"
shift 2
;;
-h | --help)
usage
exit 1
;;
\?)
echo " ERROR: $2 is not a valid argument"
usage
exit 1
;;
esac
done
# Input file requirements and dependency check
[[ ! -e "$REF" || ! -s "$REF" ]] && { echo "ERROR: $REF cannot be read"; exit 1; }
command -v nucmer >/dev/null 2>&1 || { echo 'ERROR: nucmer not found' >&2; exit 1; }
command -v show-coords >/dev/null 2>&1 || { echo 'ERROR: show-coords not found' >&2; exit 1; }
command -v bedtools >/dev/null 2>&1 || { echo 'ERROR: bedtools not found' >&2; exit 1; }
command -v sortBed >/dev/null 2>&1 || { echo 'ERROR: sortBed not found' >&2; exit 1; }
# Create tmp dir (if absent)
if [ ! -d "$TMP" ]; then
mkdir -p "$TMP"
else
echo "WARNING: Temporary directory $TMP already exists."
echo 'Files present in the specified tmp dir might disrupt analysis.'
fi
# Primary system commands
cd "$TMP" #cannot specific outpath in nucmer, so change into tmp dir
nucmer --nosimplify --maxmatch -p DUPES "$REF" "$REF" 2> /dev/null
show-coords -I "$PERC_ID" -L "$LEN" -r -l -o -T DUPES.delta > filtered.coords
# Parse nucmer output into BED format
[[ -a DUPES.bed ]] && rm DUPES.bed
while IFS=$'\t' read -r -a line; do
if [ ${line[0]} -gt ${line[1]} ]; then
l0=${line[0]}
l1=${line[1]}
line[0]=$l1
line[1]=$l0
fi
if [ ${line[2]} -gt ${line[3]} ]; then
l2=${line[2]}
l3=${line[3]}
line[2]=$l3
line[3]=$l2
fi
echo -e "${line[9]}\t${line[0]}\t${line[1]}" >> DUPES.bed
echo -e "${line[9]}\t${line[2]}\t${line[3]}" >> DUPES.bed
done < <(grep '^[1-9]' filtered.coords)
# Sort and remove duplicate lines
sort -t$'\t' -V -k 1,3 DUPES.bed | sortBed > DUPES_sorted.bed
awk '{if(++dup[$0]==1) {print $0}}' DUPES_sorted.bed > DUPES_sorted_deduped.bed
# Remove full-length chromosome matches; handles >1 contig
while read -r -a deflines; do
CHROM_ID=${deflines[1]}
END_POS=${deflines[3]}
# awk -v chrom="${CHROM_ID}" -v end="${END_POS}" '!/chrom\t1\tend/' DUPES_sorted_deduped.bed > tmp.bed
# mv tmp.bed DUPES_sorted_deduped.bed
TAB=$'\t'
sed -i "/${CHROM_ID}${TAB}1${TAB}${END_POS}/d" DUPES_sorted_deduped.bed
done < <(grep '^>' DUPES.delta)
# Merge common overlapping regions
bedtools merge -i DUPES_sorted_deduped.bed > maskedRegions.unsorted.bed
sort -t$'\t' -V -k 1,3 maskedRegions.unsorted.bed > maskedRegions.bed
REGIONS=$(wc -l maskedRegions.bed | awk '{print $1}')
SITES=$(awk -F'\t' 'BEGIN{SUM=0}; {SUM+=$3-$2}; END{print SUM}' maskedRegions.bed)
echo "found $REGIONS regions and $SITES total sites to mask"
# Cleanup
mv -v maskedRegions.bed "$OUTFILE"
rm -r "$TMP"