-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path05c_PostQCMichigan.sh
executable file
·223 lines (183 loc) · 6.4 KB
/
05c_PostQCMichigan.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/bin/bash
#SBATCH --export=ALL # export all environment variables to the batch job.
#SBATCH -p mrcq # submit to the serial queue
#SBATCH --time=24:00:00 # Maximum wall time for the job.
#SBATCH -A Research_Project-MRC190311 # research project to submit under.
#SBATCH --nodes=1 # specify number of nodes.
#SBATCH --mem=100G
#SBATCH --ntasks-per-node=16 # specify number of processors per node
#SBATCH --mail-type=END # send email at job completion
#SBATCH --output=05cPostQCMichigan.o
#SBATCH --error=05cPostQCMichigan.e
#SBATCH --job-name=PostQCMichigan
## output files for use with Michegan Imputation Server
# this script is for deal with the output file from Michegan Imputation Server
# So, please run the imputation on Michegan first and download the zip file for each chr into the ${IMPUTEDIR}/chrzip folder
# keep the passcode that Michegan server send to you!
## EXECUTION
# sbatch ./05b_PostQCMichigan.sh <reference panel> <passcode>
## REQUIRES the following variables in config file
# ${IMPUTEDIR}, ${RAWDATADIR}/${FILEPREFIX}, ${SCRIPTDIR}
## REQUIRES the following software
# plink1.9, plink2, GCTA, R
## INPUT
# chr.zip under the ${IMPUTEDIR}/chrzip folder
## OUTPUT
# data_filtered_Michigan.bim, data_filtered_Michigan.bed, data_filtered_Michigan.fam data_filtered_Michigan.info
echo "checking the arguments for config file----------------------------------------------------------------------------"
datapeth=$1
if [ -z "$1" ]
then
echo "No argument supplied"
echo "Please input the paht of the data folder as the first argument"
exit 1 # fail
fi
echo "running the PostQCSanger at $datapeth"
source ${datapeth}/config
touch "$logfile_05c"
exec > >(tee "$logfile_05c") 2>&1
module purge
module load R/4.2.1-foss-2022a
module load p7zip
mv 05cPostQCMichigan.o ${JOBSDIR}/05cPostQCMichigan.o
mv 05cPostQCMichigan.e ${JOBSDIR}/05cPostQCMichigan.e
# function for unzip all the zip file
un7zip () {
passcode=$1
for chr in $(seq 1 22); do
7z x -p"${passcode}" ./chr_$chr.zip;
gzip -d chr${chr}.info.gz
done
if [ -s chr_X.zip ]
then
7z x -p"${passcode}" ./chr_X.zip
gzip -d chrX.info.gz
mv chrX.dose.vcf.gz chr23.dose.vcf.gz
mv chrX.info chr23.info
else
echo "Warning: There is no chrX."
fi
}
echo "checking the arguments--------------------------------------------------"
panel=$2
if [ -z "$2" ];
then
echo "No argument supplied"
echo "Please input the argument either 1000G nor HRC"
exit
else
cd ${IMPUTEDIR}/ImputationOutputMichigan${panel} || exit 1
md5sum -c results.md5
fi
# unzip the result
if [ -z "$3" ];
then
echo "No passcode supplied"
echo "Skip the setp for unziping the chr.zip"
else
passcode=$3
echo "The passcode is" ${passcode}
echo "chr.zip files are being unzipped"
un7zip ${passcode}
fi
echo 'runing PostQC for for imputed data from Michigan------------------------'
if [ -s chr_X.zip ]
then
chrNum=23
# chr23 will fail on --make-pgen if there is no sex info
awk '{print $1,$2,$5}' ${RAWDATADIR}/${FILEPREFIX}.fam > sex.info
else
chrNum=22
echo "Warnning: There is no SEX CHR!!!!!!!!!!!!!!!!!!!!!!!!"
fi
for i in $(seq 1 $chrNum)
do
# convert the vcf.gz into pgen formats
if [[ $i -eq 23 && $panel == "1000G" ]];
then
# chrX in present in the input files, needs sex.info
# Human chrX pseudoautosomal variant(s) appear in chrX on 1000G, needs split-par or merge-par
${PLINK2} --vcf chr${i}.dose.vcf.gz \
--make-pgen \
--update-sex sex.info \
--out data_chr${i}_dose \
--split-par 'hg19'
elif [[ $i -eq 23 && $panel == "HRC" ]];
then
${PLINK2} --vcf chr${i}.dose.vcf.gz \
--make-pgen \
--update-sex sex.info \
--out data_chr${i}_dose
else
${PLINK2} --vcf chr${i}.dose.vcf.gz \
--make-pgen \
--out data_chr${i}_dose
fi
# Keep SNPs with MAF>0.01, R2>0.8. hwe 1e-5
${PLINK2} --pfile data_chr${i}_dose \
--extract-if-info "R2>0.8" \
--maf 0.01 \
--hwe 1e-5 \
--make-pgen \
--out data_chr${i}_filtered_temp2 \
--keep-allele-order
# Remove duplicate variants
${PLINK2} --pfile data_chr${i}_filtered_temp2 \
--rm-dup exclude-mismatch \
--make-pgen \
--out data_chr${i}_filtered \
--keep-allele-order
# Convert `pvar/pgen/psam` to `bim/bed/fam` format
${PLINK2} --pfile data_chr${i}_filtered \
--make-bed \
--out data_chr${i}_filtered
#Reformat pvar file to id maf r2
awk -v x='#CHROM' '$0~x,EOF''{print $3,$7}' < data_chr${i}_filtered.pvar | perl -pe 's/;/ /g'|awk '{print $1,$3,$4}' |perl -pe 's/ID/ID MAF R2/g' |perl -pe 's/MAF=//g'| perl -pe 's/R2=//g' > data_chr${i}_filtered_temp4.info
sed 's/X/23/g' data_chr${i}_filtered_temp4.info > data_chr${i}_filtered.info
done
# Merge them into one dataset
echo "Merge--------------------------------------------------"
rm -f mergefile.txt
touch mergefile.txt
for i in $(seq 2 $chrNum)
do
if [ -s "data_chr${i}_filtered.bed" ]; then
echo "data_chr${i}_filtered" >> mergefile.txt
fi
done
${PLINK2} --bfile data_chr1_filtered \
--pmerge-list mergefile.txt \
--make-bed \
--out data_filtered_Michigan_temp
awk '{print $2}' data_filtered_Michigan_temp.bim > oldidchrX.txt
sed 's/X/23/g' oldidchrX.txt > newidchr23.txt
paste oldidchrX.txt newidchr23.txt > updatechrid.txt
# make sure the all the chrX convert to chr23
${PLINK}/plink --bfile data_filtered_Michigan_temp \
--make-bed \
--update-name updatechrid.txt 2 1 \
--out data_filtered_Michigan
# The FIDs in the fam file are set to 0.
cp data_filtered_Michigan.fam data_filtered_Michigan.fam.orig
awk '{print $2,$2,$3,$4,$5,$6}' < data_filtered_Michigan.fam.orig > data_filtered_Michigan.fam
# correct the FID and IID for fam file
Rscript ${DATADIR}/4_Resources/correctFIDIID.r \
data_filtered_Michigan.fam \
${RAWDATADIR}/${FILEPREFIX}.fam
# Combine info files into a single file
cp data_chr1_filtered.info data_filtered_Michigan.info
for i in $(seq 1 $chrNum)
do
if [ -s "data_chr${i}_filtered.bed" ]; then
awk ' NR>1 {print $0}' < data_chr${i}_filtered.info | cat >> data_filtered_Michigan.info
fi
done
# clean up redundant files
rm data_chr*_filtered_temp*
# rm data_chr*_filtered.p*
rm data_filtered_Michigan.fam.orig
rm data_filtered_Michigan_temp* oldidchrX.txt newidchr23.txt updatechrid.txt
rm data_chr*_filtered.info
# check whether the number of variants is the same
echo "The number of variants in bim file is" $(wc -l data_filtered_Michigan.bim)
echo "The number of variants in info file is" $(wc -l data_filtered_Michigan.info)