-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.sh
142 lines (124 loc) · 4.98 KB
/
main.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/bin/bash
### input files
FILE_READ="input/read.fastq" #http://gage.cbcb.umd.edu/data/Staphylococcus_aureus/
FILE_TARGET="input/target20k.txt"
PAM="NGG"
GRNA_LEN=20
CAND_LEN=23
K=22
ROUTINE0=0
ROUTINE1=0
########################
# #
# Filenames (serially) #
# #
########################
JF_MER_COUNT_FILE="output/mer_$(echo $K)_reads.jf"
TARGET_KMER_LIST="output/target_$(echo $K)mers"
TARGET_KMER_WITH_COUNT="output/target_$(echo $K)mers_with_count"
TARGET_KMER_ONLY="output/target_$(echo $K)mers"
COUNT_IN_READS="output/count_in_reads_$(echo $K)"
HIST_OUTPUT="output/hist_output_data"
##R2
CANDIDATES="output/candidates"
if [ $ROUTINE0 -eq 1 ]; then
################################################
# #
# Subroutine 0: From read, gen all k-mers (jf) #
# #
################################################
jellyfish count -m $K -s 100M -o $JF_MER_COUNT_FILE -t 20 -C $FILE_READ
fi
if [ $ROUTINE1 -eq 1 ]; then
## ROUTINE 1: determining the target-coverage
#~ ---------------------------------------------------------------------------
#~ determining the target-coverage
#~ ---------------------------------------------------------------------------
#~ comment: we will break the target in small k-mers, and determine histogram
#~ only from those k-mer counts
#~ ---------------------------------------------------------------------------
#~ smallK = 4/5/6 (something that makes sense, will discuss with sir later)
#~ split target into smallK-spectrum. List the small mers in a file F --(I already coded this)--
#~ call jellyfish with to count smallK-mers
#~ for each mer listed in F:
#~ count = queryJellyfish(mer)
#~ write count in another file F2
#~ from all integers listed in F2, determine the histogram --(I already coded this)--
#~ from histogram, see the first peak to determine the target-coverage
#~ ---------------------------------------------------------------------------
###############################################
# #
# Subroutine 1: Generate all k-mers of target #
# #
###############################################
#usage: Arguments
# arg 1 - k
# arg 2 - input file (target)
# arg 3 - output filename (with all k-mers)
# outputs list of all canonical k-mers
python generate_k_spectrum.py $K $FILE_TARGET $TARGET_KMER_LIST
# sort target-kmers to show counts, and remove whitespace -> Entry: count KMER
sort $TARGET_KMER_LIST | uniq --count | sed 's/^[ \t]*//;s/[ \t]*$//' > $TARGET_KMER_WITH_COUNT
cut -d' ' -f2 < $TARGET_KMER_WITH_COUNT > $TARGET_KMER_ONLY
#############################################################
# #
# Subroutine 2: Query read-kmer with target k-mers #
# #
#############################################################
# start with a blank file
echo -n "" > $COUNT_IN_READS
while read -r line
do
kmer="$line"
result=$(jellyfish query $JF_MER_COUNT_FILE $kmer)
#result=${result:($k+1)}
echo $result >> $COUNT_IN_READS
done < "$TARGET_KMER_ONLY"
#####################################
# #
# Subroutine 3: Generate histogram #
# #
#####################################
# add a header to the file
(echo "NUM_KMER NUM_REPEAT" && cat $COUNT_IN_READS) > tmp_filename1
grep '[^[:space:]]' tmp_filename1 > $COUNT_IN_READS
rm tmp_filename1
python histogram_generator.py $COUNT_IN_READS $HIST_OUTPUT
python histogram_plotter.py $K $HIST_OUTPUT
echo "h,sello"
fi
#~ ## manually find the target coverage
#~ target_coverage=20
echo "hello: Routine 2"
## ROUTINE 2: determining the target-coverage
#~ --------------------------------------------------------------------------
#~ determining the best candidate where target-coverage is known
#~ --------------------------------------------------------------------------
#~ target_coverage = (determined beforehand, from a histogram analysis)
#~ candidates = set of all possible candidates
#~ for each candidate C in candidates:
#~ q = candidateCountInTarget(target, c)
#~ k = len(C)
#~ call jellyfish with this value of k
#~ p = queryJellyfish(C)
#~ r = p / (q * target_coverage)
#~ append <C, r> tuple in a list, sat L
#~ Output the C with minimum r in L
#~ --------------------------------------------------------------------------
JF_MER_CANDI_LEN="output/jf_mer_candi_len.jf"
TARGET_COVERAGE=20
python list_all_candidates.py $FILE_TARGET $PAM $GRNA_LEN $CANDIDATES
#jellyfish count -m $CAND_LEN -s 100M -o $JF_MER_CANDI_LEN -t 20 -C $FILE_READ
FINAL_CAND_R="final_cand_r"
echo -n "" > $FINAL_CAND_R
while read -r line
do
q=$(echo "$line" | cut -d' ' -f2)
kmer=$(echo "$line" | cut -d' ' -f1)
result=$(jellyfish query $JF_MER_CANDI_LEN $kmer)
#result=${result:($k+1)}
p=$(echo "$result" | cut -d' ' -f2 )
r=$(python tc_calc.py $p $q $TARGET_COVERAGE)
echo "$r $kmer" >> $FINAL_CAND_R
done < "$CANDIDATES"
sort -g $FINAL_CAND_R | sed 's/^[ \t]*//;s/[ \t]*$//' > sorted