# -*- coding: utf-8 -*-
"""
Created on Tue Mar 04 10:41:28 2014

@author: aitor
"""
import difflib
import json
import os
import sys

import mysql.connector

# Make the parent directory importable so model.dbconnection can be found
lib_path = os.path.abspath('../')
sys.path.append(lib_path)

base_dir = os.path.dirname(os.path.abspath(__file__))

def load_config():
    from model.dbconnection import dbconfig
    return dbconfig

config = load_config()
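
# Returns every distinct value of person.name in the database.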
def get_complete_names():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(name) FROM person")
    result = [name[0] for name in cursor]
    cursor.close()
    cnx.close()
    return result

# Checks for similar (likely duplicated) names using fuzzy string matching.
def check_similar_names():
    print 'Getting names'
    names = get_complete_names()
    print 'Total names:', len(names)
    # Minimum similarity ratio between two strings to count them as duplicates
    threshold_ratio = 0.90
    repeated = []
    total = float(len(names))
    # Compare every pair of names exactly once (i < j)
    for i, str_1 in enumerate(names):
        if i % 100 == 0:
            print 'Name percentage processed:', (i / total) * 100
        for j in range(i + 1, len(names)):
            str_2 = names[j]
            if difflib.SequenceMatcher(None, str_1, str_2).ratio() > threshold_ratio:
                repeated.append((str_1, str_2))
    with open(base_dir + "/cache/repeated_names.json", "w") as outfile:
        json.dump(repeated, outfile)
    return repeated
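
# Returns every distinct thesis title in the database.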
def get_thesis_names():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(title) FROM thesis")
    result = [title[0] for title in cursor]
    cursor.close()
    cnx.close()
    return result
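
# For each distinct title, collects the ids of all thesis rows sharing it;
# groups with more than one id are duplicates. Results are cached to JSON.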
def get_repeated_thesis_ids(distinct_titles):
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    repeated_ids = []
    total = float(len(distinct_titles))
    for i, title in enumerate(distinct_titles):
        if i % 100 == 0:
            print 'Getting repeated ids:', (i / total) * 100
        cursor.execute("SELECT id FROM thesis WHERE title = %s", (title,))
        thesis_ids = [thesis_id[0] for thesis_id in cursor]
        if len(thesis_ids) > 1:
            print 'Repeated ids', thesis_ids
            repeated_ids.append(thesis_ids)
    cursor.close()
    cnx.close()
    with open(base_dir + "/cache/repeated_thesis_ids.json", "w") as outfile:
        json.dump(repeated_ids, outfile)
    return repeated_ids

def check_repeated_thesis():
    print 'Getting titles'
    distinct_titles = get_thesis_names()
    print 'Distinct titles: ', len(distinct_titles)
    repeated_ids = get_repeated_thesis_ids(distinct_titles)
    print repeated_ids
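
# Deletes duplicated thesis rows using the cached id groups, keeping only the
# row with the highest id in each group.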
def delete_repeated_thesis():
    with open(base_dir + "/cache/repeated_thesis_ids.json", "r") as infile:
        repeated_ids = json.load(infile)
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    deleted = 0
    for id_group in repeated_ids:
        id_group.sort()
        # Keep the last (highest) id, delete the rest
        to_delete = id_group[:-1]
        for thesis_id in to_delete:
            print 'Deleting:', thesis_id
            cursor.execute("DELETE FROM thesis WHERE id = %s", (thesis_id,))
            deleted += 1
    # mysql.connector does not autocommit, so the deletes must be committed
    cnx.commit()
    print 'Deleted thesis:', deleted
    cursor.close()
    cnx.close()

def get_person_ids():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(id) FROM person")
    result = [person_id[0] for person_id in cursor]
    cursor.close()
    cnx.close()
    return result
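
# A person is "unused" when no thesis lists them as author, advisor or panel
# member, so the row can be removed safely. Results are cached to JSON.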
def get_unused_person_ids(person_ids):
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    unused_ids = []
    total = float(len(person_ids))
    for i, person_id in enumerate(person_ids):
        if i % 100 == 0:
            print 'Getting unused persons:', (i / total) * 100
        cursor.execute("SELECT COUNT(id) FROM thesis WHERE author_id = %s", (person_id,))
        author = cursor.fetchone()[0] > 0
        cursor.execute("SELECT COUNT(thesis_id) FROM advisor WHERE person_id = %s", (person_id,))
        advisor = cursor.fetchone()[0] > 0
        cursor.execute("SELECT COUNT(thesis_id) FROM panel_member WHERE person_id = %s", (person_id,))
        panel = cursor.fetchone()[0] > 0
        if not (author or advisor or panel):
            unused_ids.append(person_id)
    cursor.close()
    cnx.close()
    with open(base_dir + "/cache/unused_person_ids.json", "w") as outfile:
        json.dump(unused_ids, outfile)
    return unused_ids

def check_unused_person_ids():
    print 'Getting all person ids'
    person_ids = get_person_ids()
    print 'Distinct ids: ', len(person_ids)
    unused_ids = get_unused_person_ids(person_ids)
    print unused_ids
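
# Deletes the cached unused person ids, clearing any advisor/panel_member rows
# first as a safety measure so no references are left dangling.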
def nuke_unused_persons():
    with open(base_dir + "/cache/unused_person_ids.json", "r") as infile:
        unused_ids = json.load(infile)
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    deleted = 0
    for person_id in unused_ids:
        cursor.execute("DELETE FROM advisor WHERE person_id = %s", (person_id,))
        cursor.execute("DELETE FROM panel_member WHERE person_id = %s", (person_id,))
        cursor.execute("DELETE FROM person WHERE id = %s", (person_id,))
        deleted += 1
    cnx.commit()
    print 'Deleted persons:', deleted
    cursor.close()
    cnx.close()
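
# Note: runs the same query as get_complete_names(); kept as a separate helper.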
def get_distinct_names():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(name) FROM person")
    result = [name[0] for name in cursor]
    cursor.close()
    cnx.close()
    return result

def get_distinct_first_names():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT DISTINCT(first_name) FROM person")
    result = [name[0] for name in cursor]
    cursor.close()
    cnx.close()
    return result

# Returns a {person_id: first_name} mapping for every person.
def get_first_names_ids():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    cursor.execute("SELECT id, first_name FROM person")
    names = {}
    for person_id, first_name in cursor:
        names[person_id] = first_name
    cursor.close()
    cnx.close()
    return names
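
# Maps each distinct full name to the list of person ids that share it; names
# with more than one id are candidates for merging. Results are cached to JSON.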
def get_same_name_ids(distinct_names):
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    name_ids = {}
    total = float(len(distinct_names))
    for i, name in enumerate(distinct_names):
        if i % 50 == 0:
            print 'Getting name ids:', (i / total) * 100
        try:
            cursor.execute("SELECT id FROM person WHERE name = %s", (name,))
            name_ids[name] = [person_id[0] for person_id in cursor]
        except mysql.connector.Error:
            print 'Problem with name:', name
    with open(base_dir + "/cache/person_name_ids.json", "w") as outfile:
        json.dump(name_ids, outfile)
    cursor.close()
    cnx.close()
    return name_ids

def check_repeated_name_ids():
    print 'Getting distinct names'
    distinct_names = get_distinct_names()
    print 'Distinct names: ', len(distinct_names)
    name_ids = get_same_name_ids(distinct_names)
    print name_ids
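
# Merges every id group that shares a name into a single person: the first id
# in the group survives, all thesis/advisor/panel_member references to the
# other ids are repointed to it, and the redundant person rows are deleted.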
def merge_names():
    print 'Merging all ids of the same names'
    with open(base_dir + "/cache/person_name_ids.json", "r") as infile:
        name_ids = json.load(infile)
    print 'Total names:', len(name_ids)
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    total = float(len(name_ids))
    for i, name in enumerate(name_ids):
        if i % 50 == 0:
            print 'Merging:', (i / total) * 100
        id_group = name_ids[name]
        if len(id_group) > 1:
            base_id = id_group[0]
            dying_ids = id_group[1:]
            for dying_id in dying_ids:
                try:
                    cursor.execute("UPDATE advisor SET person_id = %s WHERE person_id = %s", (base_id, dying_id))
                except mysql.connector.errors.IntegrityError:
                    print 'Already an advisor for that thesis'
                cursor.execute("UPDATE thesis SET author_id = %s WHERE author_id = %s", (base_id, dying_id))
                try:
                    cursor.execute("UPDATE panel_member SET person_id = %s WHERE person_id = %s", (base_id, dying_id))
                except mysql.connector.errors.IntegrityError:
                    print 'Already a panel member for that thesis'
                cursor.execute("DELETE FROM person WHERE id = %s", (dying_id,))
    cnx.commit()
    cursor.close()
    cnx.close()
    print 'The great merge has ended, all hail the new clean database'

# All UNESCO codes must be six digits long (format: ######); two- and
# four-digit codes are padded with trailing zeros.
def clean_unesco_codes():
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    # Second connection so updates can be issued while the first cursor iterates
    cnx2 = mysql.connector.connect(**config)
    cursor2 = cnx2.cursor()
    cursor.execute("SELECT id, code FROM descriptor")
    for desc_id, code in cursor:
        original_code = code
        if len(str(code)) == 2:
            code = code * 10000
        elif len(str(code)) == 4:
            code = code * 100
        if code != original_code:
            cursor2.execute("UPDATE descriptor SET code = %s WHERE id = %s", (code, desc_id))
    cnx2.commit()
    cursor.close()
    cursor2.close()
    cnx.close()
    cnx2.close()
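
# Assigns person.gender by looking up each first name in the name_genders
# dictionary bundled in the cache package; unknown or empty names get 'None'.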
def set_genders():
    from cache import name_genders
    names = get_first_names_ids()
    cnx = mysql.connector.connect(**config)
    cursor = cnx.cursor()
    for cont, person_id in enumerate(names):
        first_name = names[person_id]
        if cont % 100 == 0:
            print 'Setting genders:', (float(cont) / len(names)) * 100
            sys.stdout.flush()
        if first_name != '':
            # Look up only the first word of a possibly compound first name
            name = first_name.split(' ')[0]
            try:
                gender = name_genders[name]
                cursor.execute("UPDATE person SET gender = %s WHERE id = %s", (gender, person_id))
            except KeyError:
                cursor.execute("UPDATE person SET gender = 'None' WHERE id = %s", (person_id,))
                print 'Name does not exist:', name
        else:
            cursor.execute("UPDATE person SET gender = 'None' WHERE id = %s", (person_id,))
    cnx.commit()
    cursor.close()
    cnx.close()
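
# Intended pipeline order, inferred from the JSON caches each step consumes:
# every check_* call writes the cache that its destructive counterpart reads,
# so a check_* step must run before the matching delete/nuke/merge step.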
if __name__ == '__main__':
    # check_repeated_thesis()
    # delete_repeated_thesis()
    # check_unused_person_ids()
    # nuke_unused_persons()
    # check_repeated_name_ids()
    # check_similar_names()
    # merge_names()
    # clean_unesco_codes()
    set_genders()