-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathexampleMySQL.sh
executable file
·226 lines (184 loc) · 8.07 KB
/
exampleMySQL.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#!/bin/bash
# Your usage of the data returned by the public MySQL server is
# subject to same conditions as laid out on the Ensembl website.
#
# Copyright [2020-21] EMBL-European Bioinformatics Institute
# documentation about Ensembl schemas can be found at
# http://www.ensembl.org/info/docs/api/index.html
# set server details
SERVER=mysql-eg-publicsql.ebi.ac.uk
USER=anonymous
PORT=4157
# get Ensembl Plants current release number from FTP server
# Note: wget is used, this can be modified to use alternatives ie curl
FTPSERVER="ftp://ftp.ensemblgenomes.org/pub"
DIV=plants
SUMFILE="${FTPSERVER}/${DIV}/current/summary.txt"
RELEASE=$(wget --quiet -O - $SUMFILE | \
perl -lne 'if(/Release (\d+) of Ensembl/){ print $1 }')
# work out Ensembl Genomes release
EGRELEASE=$((RELEASE - 53));
# alternatively set other EG release number
# EGRELEASE=
echo "EGRELEASE=${EGRELEASE}"
echo
# stop here if just a test
if [[ $# -ge 0 ]] && [[ $1 = "test" ]]; then
mysql --host $SERVER --user $USER --port $PORT -e "show databases"
exit 0
fi
## S1) Check currently supported Ensembl Genomes (EG)/non-vertebrates core schemas,
# Note: includes non-plants as well
mysql --host $SERVER --user $USER --port $PORT \
-e "show databases" | grep "core_${EGRELEASE}_${RELEASE}"
# The following API script can also be used:
# https://github.com/Ensembl/ensembl-metadata/blob/master/misc_scripts/get_list_databases_for_division.pl
## S2) Count protein-coding genes of a particular species
SPECIES=arabidopsis_thaliana
SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \
-e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}")
mysql --host $SERVER --user $USER --port $PORT \
$SPECIESCORE -e "SELECT COUNT(*) FROM gene WHERE biotype='protein_coding'"
## S3) Get stable_ids of transcripts used in Compara analyses
# Canonical transcripts are used in the gene tree analysis,
# which usually are the longest translations with no stop codons.
# This file can be combined to that obtained in recipe F3 to
# obtain the sequences
mysql --host $SERVER --user $USER --port $PORT \
"ensembl_compara_plants_${EGRELEASE}_${RELEASE}" \
-e "SELECT sm.stable_id \
FROM seq_member sm, gene_member gm, genome_db gdb \
WHERE sm.seq_member_id = gm.canonical_member_id \
AND sm.genome_db_id = gdb.genome_db_id \
AND gdb.name = '$SPECIES' \
LIMIT 10"
## S4) Get variants significantly associated to phenotypes
# Variation schema documented at
# http://www.ensembl.org/info/docs/api/variation/variation_schema.html
SPECIESVAR=$(mysql --host $SERVER --user $USER --port $PORT \
-e "show databases" | grep "${SPECIES}_variation_${EGRELEASE}_${RELEASE}")
mysql --host $SERVER --user $USER --port $PORT \
$SPECIESVAR<<SQL
SELECT f.object_id, s.name, f.seq_region_start,
f.seq_region_end, p.description
FROM phenotype p
JOIN phenotype_feature f ON p.phenotype_id = f.phenotype_id
JOIN seq_region s ON f.seq_region_id = s.name
WHERE f.type = 'Variation' AND f.is_significant=1 LIMIT 10
SQL
## S5) Get Triticum aestivum homeologous genes across A,B & D subgenomes
# Compara schema is described at
# https://m.ensembl.org/info/docs/api/compara/compara_schema.html
# find out the correct method_link_species_set_id in this release
MLSSID=$(mysql --host $SERVER --user $USER --port $PORT \
ensembl_compara_plants_${EGRELEASE}_$RELEASE -Nb -e \
"SELECT method_link_species_set_id \
FROM method_link_species_set \
WHERE name LIKE 'T%aes%homoeologues'")
# actually retrieve the homeologues using the MLSSID retrieved above
# remove LIMIT 10 if you want the complete set
mysql --host $SERVER --user $USER --port $PORT \
ensembl_compara_plants_${EGRELEASE}_$RELEASE <<SQL
SELECT
homology_member.homology_id, cigar_line, perc_cov, perc_id,
perc_pos, gene_member.stable_id as genes, gene_member.genome_db_id
FROM
homology_member
INNER JOIN homology USING (homology_id)
INNER JOIN gene_member USING (gene_member_id)
WHERE method_link_species_set_id = ${MLSSID}
LIMIT 10
SQL
## S6) Count the number of whole-genome alignments of all genomes
# Compara schema is described at
# https://m.ensembl.org/info/docs/api/compara/compara_schema.html
mysql --host $SERVER --user $USER --port $PORT \
ensembl_compara_plants_${EGRELEASE}_$RELEASE <<SQL
SELECT
genome_db.name,
SUM(type = "LASTZ_NET") AS n_lastz,
SUM(type = "SYNTENY") AS n_syntenies,
SUM(type IN ("EPO", "EPO_LOW_COVERAGE")) AS n_multiple
FROM
genome_db
JOIN species_set USING (genome_db_id)
JOIN method_link_species_set USING (species_set_id)
JOIN method_link USING (method_link_id)
WHERE
genome_component IS NULL
AND genome_db.name != "ancestral_sequences"
GROUP BY genome_db_id
ORDER BY genome_db.name
SQL
## S7) Extract all the mutations and consequences for a selected wheat line
# The variation name contains the name of the line where the muation is present.
# Variation schema documented at
# http://www.ensembl.org/info/docs/api/variation/variation_schema.html
SPECIES=triticum_aestivum
LINE="Cadenza1441"
SPECIESVAR=$(mysql --host $SERVER --user $USER --port $PORT \
-e "show databases" | grep "${SPECIES}_variation_${EGRELEASE}_${RELEASE}")
mysql --host $SERVER --user $USER --port $PORT \
$SPECIESVAR <<SQL
SELECT
seq_region.name as CHR,
variation_feature.seq_region_start as POS,
variation_feature.allele_string,
variation.name as ID,
transcript_variation.feature_stable_id,
transcript_variation.consequence_types,
transcript_variation.sift_prediction,
transcript_variation.sift_score,
variation_set.name as variation_set
FROM variation
JOIN variation_set_variation
ON variation.variation_id =variation_set_variation.variation_id
JOIN variation_set
ON variation_set.variation_set_id = variation_set_variation.variation_set_id
JOIN variation_feature
ON variation_feature.variation_id = variation.variation_id
JOIN transcript_variation
ON transcript_variation.variation_feature_id = variation_feature.variation_feature_id
JOIN seq_region
ON variation_feature.seq_region_id = seq_region.seq_region_id
WHERE
variation.name LIKE "${LINE}%"
SQL
## S8) Get FASTA of repeated sequences from selected species
# This recipe first interrogates the MySQL server (BED file) and
# then the FTP server (genome FASTA file) and finally produces a
# FASTA file with repeated sequences. Uses MINLEN to skip short repeats
# and skips repeats starting on some coordinate.
# Note: requires wget, sort, gzip and bedtools
MINLEN=90
SPECIES=oryza_sativa
SPECIESCORE=$(mysql --host $SERVER --user $USER --port $PORT \
-e "show databases" | grep "${SPECIES}_core_${EGRELEASE}_${RELEASE}")
BEDFILE=${SPECIES}.repeats.bed
FASTANAME="*${SPECIES^}*.dna.toplevel.fa.gz"
FASTA=${SPECIES}.toplevel.fasta
REPFASTA=${SPECIES}.repeats.fasta
mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
"SELECT sr.name,(r.seq_region_start-1),r.seq_region_end,rc.repeat_class \
FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
WHERE r.seq_region_id=sr.seq_region_id \
AND r.repeat_consensus_id=rc.repeat_consensus_id \
AND (r.seq_region_end-r.seq_region_start+1) > ${MINLEN}" | \
sort -u -k1,1 -k2,2n > $BEDFILE
# similar to recipe F4
URL="${FTPSERVER}/${DIV}/current/fasta/${SPECIES}/dna/${FASTANAME}"
wget -c $URL -O- | gunzip > $FASTA
bedtools getfasta -name -fi $FASTA -bed $BEDFILE > $REPFASTA
## S9) Get GFF of repeated sequences from selected species
# This recipe first interrogates the MySQL server and produces a GFF file
# with repeat sequences. Uses MINLEN to skip short repeats.
# Note: requires wget, sort and perl
mysql --host $SERVER --user $USER --port $PORT $SPECIESCORE -Nb -e \
"SELECT sr.name,rc.repeat_class,'Repeat',r.seq_region_start, \
r.seq_region_end,r.score,r.seq_region_strand,0,rc.repeat_name \
FROM repeat_feature r JOIN seq_region sr JOIN repeat_consensus rc \
WHERE r.seq_region_id=sr.seq_region_id \
AND r.repeat_consensus_id=rc.repeat_consensus_id \
AND (r.seq_region_end-r.seq_region_start+1) > 90" | sort -k1,1 -k4,4n | \
perl -lane 'if($F[5] eq "NULL"){ $F[5]="."}; if($F[6]==1){ $F[6]="+" } else {$F[6]="-" }; $F[8]="Note=\"$F[8]\";"; print join("\t",@F)'
: