forked from OpenDataDayBilbao/teseo2014
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcache.py
454 lines (395 loc) · 13.5 KB
/
cache.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 11:29:20 2014
@author: aitor
"""
# Own stuff imports
import gender
import os, sys
# Append the parent of the *current working directory* to the import path
# (abspath('../') is resolved against the CWD, not against this file).
# NOTE(review): presumably this is what makes project packages such as
# `model` importable -- confirm.
lib_path = os.path.abspath('../')
sys.path.append(lib_path)
# Library imports
import pickle
import difflib
# Directory containing this file; every cache/ path below is built from it.
base_dir = os.path.dirname(os.path.abspath(__file__))
def load_config():
    """Return the database connection settings.

    The returned ``dbconfig`` object is unpacked as keyword arguments into
    ``mysql.connector.connect`` by the cache builders in this module. The
    import happens at call time rather than when this module is imported.
    """
    import model.dbconnection as dbconnection
    return dbconnection.dbconfig
# config = {
# 'user': 'foo',
# 'password': 'bar',
# 'host': '127.0.0.1',
# 'database': 'teseo',
# }
#
# with open('pass.config', 'r') as inputfile:
# for i, line in enumerate(inputfile):
# if i == 0:
# config['user'] = line
# elif i == 1:
# config['password'] = line
# elif i > 1:
# break
#
# return config
def get_university_ids():
    """Rebuild the university cache file from the database.

    Despite its name this function returns nothing: it reads every row of
    the ``university`` table and pickles a ``{id: name}`` dict to
    ``cache/university_ids.p`` under ``base_dir``. ``load_university_ids``
    reads that file back.
    """
    import mysql.connector

    cnx = mysql.connector.connect(**load_config())
    try:
        cursor_unis = cnx.cursor()
        try:
            cursor_unis.execute("SELECT id, name FROM university")
            result = {row[0]: row[1] for row in cursor_unis}
        finally:
            cursor_unis.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()

    cache_path = os.path.join(base_dir, "cache", "university_ids.p")
    with open(cache_path, "wb") as outfile:
        pickle.dump(result, outfile)
def load_university_ids():
    """Load the ``{university id: name}`` cache written by get_university_ids.

    Returns:
        The unpickled content of ``cache/university_ids.p``, or the empty
        string ``""`` when the file is missing or cannot be unpickled (a
        message is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "university_ids.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/university_ids.p")
        return ""
def save_thesis_ids():
    """Rebuild the thesis id cache file from the database.

    Reads every ``id`` of the ``thesis`` table, removes duplicates and
    pickles the ids as a list to ``cache/thesis_ids.p`` under ``base_dir``.
    Returns nothing.
    """
    import mysql.connector

    cnx = mysql.connector.connect(**load_config())
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute("SELECT id FROM thesis")
            # Collected through a set so repeated ids collapse to one.
            unique_ids = {row[0] for row in cursor}
        finally:
            cursor.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()

    cache_path = os.path.join(base_dir, "cache", "thesis_ids.p")
    with open(cache_path, "wb") as outfile:
        pickle.dump(list(unique_ids), outfile)
def load_thesis_ids():
    """Load the list of thesis ids written by save_thesis_ids.

    Returns:
        The unpickled content of ``cache/thesis_ids.p``, or the empty
        string ``""`` when the file is missing or cannot be unpickled (a
        message is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "thesis_ids.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/thesis_ids.p")
        return ""
def save_descriptors():
    """Rebuild the descriptor cache file from the database.

    Reads ``id`` and ``text`` of every row in the ``descriptor`` table and
    pickles a ``{id: text}`` dict to ``cache/descriptors.p`` under
    ``base_dir``. Returns nothing.
    """
    import mysql.connector

    cnx = mysql.connector.connect(**load_config())
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute("SELECT id, text FROM descriptor")
            result = {row[0]: row[1] for row in cursor}
        finally:
            cursor.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()

    cache_path = os.path.join(base_dir, "cache", "descriptors.p")
    with open(cache_path, "wb") as outfile:
        pickle.dump(result, outfile)
def load_descriptors():
    """Load the ``{descriptor id: text}`` cache written by save_descriptors.

    Returns:
        The unpickled content of ``cache/descriptors.p``, or the empty
        string ``""`` when the file is missing or cannot be unpickled (a
        message is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "descriptors.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/descriptors.p")
        return ""
def get_names():
    """Return the distinct first words of all ``person.first_name`` values.

    Only the part of each first name before the first space is kept, so
    compound names contribute their leading word. Rows whose ``first_name``
    is NULL or empty are skipped (a NULL used to crash on ``.split``).

    Returns:
        A list of unique strings, in no particular order.
    """
    import mysql.connector

    cnx = mysql.connector.connect(**load_config())
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute("SELECT DISTINCT(first_name) FROM person")
            result = {row[0].split(' ')[0] for row in cursor if row[0]}
        finally:
            cursor.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()
    return list(result)
def save_name_genders():
    """Infer a gender for every first name and cache the outcome.

    The names returned by ``get_names`` are sent to ``gender.getGenders`` in
    chunks of at most 50. Per the original inline note each returned item is
    ``(gender, prob, count)``. Every name is stored in the
    ``{name: gender}`` cache; names whose gender is the string ``'None'`` or
    whose probability is below 0.6 are additionally collected as "bad".

    Two pickle files are written under ``base_dir``: ``cache/genders.p``
    (the dict) and ``cache/badnames.p`` (the list of bad names).

    Chunking fix: the previous arithmetic processed nothing at all when
    there were fewer than 50 names, and folded the remainder into the last
    chunk (up to 99 names in one request). Chunks are now plain slices of
    ``chunk_size`` with a shorter final one.

    Returns:
        The list of names with an unknown or low-confidence gender.
    """
    name_pool = get_names()
    result = {}
    bad_names = []
    chunk_size = 50
    # Ceiling division, so a partial final chunk is counted.
    total_chunks = (len(name_pool) + chunk_size - 1) // chunk_size

    for j, start in enumerate(range(0, len(name_pool), chunk_size)):
        print('*******Chunk %s / %s' % (j, total_chunks))
        names = name_pool[start:start + chunk_size]
        gender_list = gender.getGenders(names)  # gender, prob, count
        for i, name in enumerate(names):
            # Indexing (rather than zip) keeps the IndexError if the service
            # returns fewer results than names were sent.
            infered_gender = gender_list[i][0]
            prob = float(gender_list[i][1])
            print('%s %s %s' % (name, infered_gender, prob))
            if infered_gender == 'None' or prob < 0.6:
                bad_names.append(name)
            result[name] = infered_gender

    with open(os.path.join(base_dir, "cache", "genders.p"), "wb") as outfile:
        pickle.dump(result, outfile)
    with open(os.path.join(base_dir, "cache", "badnames.p"), "wb") as outfile:
        pickle.dump(bad_names, outfile)
    return bad_names
def load_genders():
    """Load the ``{first name: gender}`` cache written by save_name_genders.

    Returns:
        The unpickled content of ``cache/genders.p``, or the empty string
        ``""`` when the file is missing or cannot be unpickled (a message
        is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "genders.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/genders.p")
        return ""
def get_complete_names():
    """Return every distinct ``person.name`` value from the database.

    Returns:
        A list with one entry per distinct name, in the order the database
        returned them.
    """
    import mysql.connector

    cnx = mysql.connector.connect(**load_config())
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute("SELECT DISTINCT(name) FROM person")
            result = [row[0] for row in cursor]
        finally:
            cursor.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()
    return result
def check_similar_names():
    """Find pairs of person names that are suspiciously similar.

    Every unordered pair of names from ``get_complete_names`` is compared
    with ``difflib.SequenceMatcher``; pairs whose ratio is strictly above
    0.8 are printed, collected, and pickled to ``cache/repeated.p`` under
    ``base_dir``. A progress line is printed every 10000 comparisons.

    The work is quadratic in the number of names. To keep it affordable the
    cheap upper bounds ``real_quick_ratio()`` and ``quick_ratio()`` are
    checked first; because both are upper bounds on ``ratio()``, the set of
    reported pairs is unchanged. NULL/empty names are skipped because
    ``SequenceMatcher`` cannot handle ``None``.

    Returns:
        A list of ``(name_a, name_b)`` tuples.
    """
    from itertools import combinations

    print('Getting names')
    names = [name for name in get_complete_names() if name]
    print('Total names: %s' % len(names))
    # min similarity ratio between strings
    threshold_ratio = 0.8
    repeated = []
    # combinations() yields pairs in the same (i, j > i) order as the
    # nested index loops it replaces.
    for count, (str_1, str_2) in enumerate(combinations(names, 2)):
        if count % 10000 == 0:
            print('Similar %s' % count)
        matcher = difflib.SequenceMatcher(None, str_1, str_2)
        if (matcher.real_quick_ratio() > threshold_ratio
                and matcher.quick_ratio() > threshold_ratio
                and matcher.ratio() > threshold_ratio):
            print('Similar %s %s' % (str_1, str_2))
            repeated.append((str_1, str_2))

    with open(os.path.join(base_dir, "cache", "repeated.p"), "wb") as outfile:
        pickle.dump(repeated, outfile)
    return repeated
def load_descriptor_codes():
    """Load the ``{descriptor text: code}`` cache.

    The file is written by ``save_descriptor_codes``.

    Returns:
        The unpickled content of ``cache/descriptor_codes.p``, or the empty
        string ``""`` when the file is missing or cannot be unpickled (a
        message is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "descriptor_codes.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/descriptor_codes.p")
        return ""
def load_codes_descriptor():
    """Load the ``{descriptor code: text}`` cache.

    The file is written by ``save_descriptor_codes``.

    Returns:
        The unpickled content of ``cache/codes_descriptor.p``, or the empty
        string ``""`` when the file is missing or cannot be unpickled (a
        message is printed in that case).
    """
    cache_path = os.path.join(base_dir, "cache", "codes_descriptor.p")
    try:
        # NOTE: pickle is only safe on trusted, locally generated files.
        with open(cache_path, "rb") as infile:
            return pickle.load(infile)
    except Exception:
        # Best effort, as before, but no longer a bare ``except:`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print("No cache file created: /cache/codes_descriptor.p")
        return ""
def save_descriptor_codes():
    """Rebuild the two descriptor code lookup caches from the database.

    Reads ``code`` and ``text`` of every row in the ``descriptor`` table and
    writes two pickle files under ``base_dir``:

    * ``cache/descriptor_codes.p`` -- ``{text: code}``
    * ``cache/codes_descriptor.p`` -- ``{code: text}``

    Codes are stored as strings. Any code shorter than six characters is
    printed together with its text, but is still stored. When several rows
    share a text (or a code), the last row read wins. Returns nothing.
    """
    import mysql.connector

    descriptor = {}
    codes = {}
    cnx = mysql.connector.connect(**load_config())
    try:
        cursor = cnx.cursor()
        try:
            cursor.execute("SELECT code, text FROM descriptor")
            for raw_code, text in cursor:
                code = str(raw_code)
                if len(code) < 6:
                    # Diagnostic output for unexpectedly short codes.
                    print(code)
                    print(text)
                descriptor[text] = code
                codes[code] = text
        finally:
            cursor.close()
    finally:
        # The connection used to be left open; release it even on errors.
        cnx.close()

    with open(os.path.join(base_dir, "cache", "descriptor_codes.p"), "wb") as outfile:
        pickle.dump(descriptor, outfile)
    with open(os.path.join(base_dir, "cache", "codes_descriptor.p"), "wb") as outfile:
        pickle.dump(codes, outfile)
def regenerate_cache_files():
    """Rebuild every pickle cache, printing a banner before each step.

    Steps run in a fixed order: universities, thesis ids, descriptors,
    name genders, descriptor codes.
    """
    steps = (
        ('Creating universities id cache', get_university_ids),
        ('Creating thesis id cache', save_thesis_ids),
        ('Creating descriptor cache', save_descriptors),
        ('Creating gender cache', save_name_genders),
        # Same banner text as the descriptor step above; kept verbatim.
        ('Creating descriptor cache', save_descriptor_codes),
    )
    for banner, rebuild in steps:
        print(banner)
        rebuild()
####################DATA#############################
# Module-level caches, loaded from the pickle files when this module is
# imported. Each loader returns the unpickled cache, or "" (after printing a
# message) when its cache file could not be read.
# List of thesis ids (see save_thesis_ids).
thesis_ids = load_thesis_ids()
# {descriptor id: descriptor text} (see save_descriptors).
descriptors = load_descriptors()
# {first name: inferred gender} (see save_name_genders).
name_genders = load_genders()
# {descriptor text: descriptor code} (see save_descriptor_codes).
descriptor_codes = load_descriptor_codes()
# {descriptor code: descriptor text} (see save_descriptor_codes).
codes_descriptor = load_codes_descriptor()
# Maps a university name to the region it is assigned to.
# Region spellings must be identical across entries, otherwise grouping by
# region splits one region in two. Corrected entries:
#   * ALMERIA was listed under Extremadura; it is in Andalucía.
#   * POLITECNICA DE CARTAGENA was listed under Andalucía; Cartagena is in
#     Murcia.
#   * ZARAGOZA used 'Aragon' while SAN JORGE used 'Aragón'; both now use the
#     accented form.
university_locations = {
    u'SANTIAGO DE COMPOSTELA': u'Galicia',
    u'AUT\xd3NOMA DE BARCELONA': u'Cataluña',
    u'UNIVERSITAT DE VAL\xc8NCIA (ESTUDI GENERAL)': u'Valencia',
    u'COMPLUTENSE DE MADRID': u'Madrid',
    u'OVIEDO': u'Asturias',
    u'AUT\xd3NOMA DE MADRID': u'Madrid',
    u'PA\xcdS VASCO/EUSKAL HERRIKO UNIBERTSITATEA': u'País Vasco',
    u'GRANADA': u'Andalucía',
    u'NACIONAL DE EDUCACI\xd3N A DISTANCIA': u'Madrid',
    u'BURGOS': u'Castilla y León',
    u'NAVARRA': u'Navarra',
    u'ALICANTE': u'Valencia',
    u'ROVIRA I VIRGILI': u'Cataluña',
    u'POLIT\xc9CNICA DE VALENCIA': u'Valencia',
    u'SEVILLA': u'Andalucía',
    u'EXTREMADURA': u'Extremadura',
    u'ZARAGOZA': u'Aragón',
    u'POMPEU FABRA': u'Cataluña',
    u'POLIT\xc9CNICA DE MADRID': u'Madrid',
    u'M\xc1LAGA': u'Andalucía',
    u'POLIT\xc9CNICA DE CATALUNYA': u'Cataluña',
    u'MIGUEL HERN\xc1NDEZ DE ELCHE': u'Valencia',
    u'RIOJA': u'La Rioja',
    u'CARLOS III DE MADRID': u'Madrid',
    u'GIRONA': u'Cataluña',
    u'BARCELONA': u'Cataluña',
    u'VIGO': u'Galicia',
    u'SALAMANCA': u'Castilla y León',
    u'MURCIA': u'Murcia',
    u'P\xdaBLICA DE NAVARRA': u'Navarra',
    u'VALLADOLID': u'Castilla y León',
    u'PALMAS DE GRAN CANARIA': u'Islas Canarias',
    u'ALMER\xcdA': u'Andalucía',
    u'LA LAGUNA': u'Islas Canarias',
    u'LLEIDA': u'Cataluña',
    u'C\xd3RDOBA': u'Andalucía',
    u'C\xc1DIZ': u'Andalucía',
    u'ILLES BALEARS': u'Islas Baleares',
    u'ABAT OLIBA CEU': u'Cataluña',
    u'ALCAL\xc1': u'Madrid',
    u'DEUSTO': u'País Vasco',
    u'EUROPEA DE MADRID': u'Madrid',
    u'CANTABRIA': u'Cantabria',
    u'JA\xc9N': u'Andalucía',
    u'PONTIFICIA DE SALAMANCA': u'Castilla y León',
    u'REY JUAN CARLOS': u'Madrid',
    u'LE\xd3N': u'Castilla y León',
    u'RAM\xd3N LLULL': u'Cataluña',
    u'POLIT\xc9CNICA DE CARTAGENA': u'Murcia',
    u'PONTIFICIA COMILLAS': u'Madrid',
    u'CASTILLA-LA MANCHA': u'Castilla La Mancha',
    u'JAUME I DE CASTELL\xd3N': u'Valencia',
    u'CAT\xd3LICA DE VALENCIA SAN VICENTE M\xc1RTIR': u'Valencia',
    u'A CORU\xd1A': u'Galicia',
    u'PABLO DE OLAVIDE': u'Andalucía',
    u'SAN PABLO-CEU': u'Madrid',
    u'HUELVA': u'Andalucía',
    u'CARDENAL HERRERA-CEU': u'Valencia',
    u'OBERTA DE CATALUNYA': u'Cataluña',
    u'CAT\xd3LICA SAN ANTONIO': u'Murcia',
    u'INTERNACIONAL DE CATALUNYA': u'Cataluña',
    u'ANTONIO DE NEBRIJA': u'Madrid',
    u'MONDRAG\xd3N UNIBERTSITATEA': u'País Vasco',
    u'FRANCISCO DE VITORIA': u'Madrid',
    u'CAMILO JOS\xc9 CELA': u'Madrid',
    u'IE UNIVERSITY': u'Madrid',
    u'INTERNACIONAL MEN\xc9NDEZ PELAYO': u'Madrid',
    u'VIC': u'Cataluña',
    u'INTERNACIONAL DE VALENCIA': u'Valencia',
    u'ALFONSO X EL SABIO': u'Madrid',
    u'A DISTANCIA DE MADRID': u'Madrid',
    u'CAT\xd3LICA SANTA TERESA DE JES\xdaS DE \xc1VILA': u'Castilla y León',
    u'SAN JORGE': u'Aragón',
    u'INTERNACIONAL DE ANDALUC\xcdA': u'Andalucía',
    u'EUROPEA MIGUEL DE CERVANTES': u'Castilla y León',
    u'INTERNACIONAL DE LA RIOJA': u'La Rioja',
    u'EUROPEA DE CANARIAS': u'Islas Canarias',
    u'TECNOLOGÍA Y EMPRESA': u'Madrid',
    u'INTERNACIONAL DE BURGOS': u'Castilla y León',
}
# Maps a university name to its ownership type: 'public' or 'private'.
university_types = {
    u'OVIEDO': 'public',
    u'AUTÓNOMA DE BARCELONA': 'public',
    u'BURGOS': 'public',
    u'GRANADA': 'public',
    u'INTERNACIONAL DE VALENCIA': 'private',
    u'ZARAGOZA': 'public',
    u'UNIVERSITAT DE VALÈNCIA (ESTUDI GENERAL)': 'public',
    u'RAMÓN LLULL': 'private',
    u'ILLES BALEARS': 'public',
    u'FRANCISCO DE VITORIA': 'private',
    u'GIRONA': 'public',
    u'EUROPEA MIGUEL DE CERVANTES': 'private',
    u'ROVIRA I VIRGILI': 'public',
    u'PONTIFICIA DE SALAMANCA': 'private',
    u'EUROPEA DE CANARIAS': 'private',
    u'VALLADOLID': 'public',
    u'MONDRAGÓN UNIBERTSITATEA': 'private',
    u'EUROPEA DE MADRID': 'private',
    u'IE UNIVERSITY': 'private',
    u'LEÓN': 'public',
    u'EXTREMADURA': 'public',
    u'PÚBLICA DE NAVARRA': 'public',
    u'POLITÉCNICA DE CARTAGENA': 'public',
    u'SAN PABLO-CEU': 'private',
    u'ALFONSO X EL SABIO': 'private',
    u'COMPLUTENSE DE MADRID': 'public',
    u'INTERNACIONAL MENÉNDEZ PELAYO': 'public',
    u'SEVILLA': 'public',
    u'PALMAS DE GRAN CANARIA': 'public',
    u'A DISTANCIA DE MADRID': 'private',
    u'CÁDIZ': 'public',
    u'POMPEU FABRA': 'public',
    u'ALICANTE': 'public',
    u'JAÉN': 'public',
    u'PONTIFICIA COMILLAS': 'private',
    u'POLITÉCNICA DE VALENCIA': 'public',
    u'NACIONAL DE EDUCACIÓN A DISTANCIA': 'public',
    u'CÓRDOBA': 'public',
    u'LLEIDA': 'public',
    u'HUELVA': 'public',
    u'CASTILLA-LA MANCHA': 'public',
    u'JAUME I DE CASTELLÓN': 'public',
    u'SAN JORGE': 'private',
    u'POLITÉCNICA DE MADRID': 'public',
    u'LA LAGUNA': 'public',
    u'INTERNACIONAL DE BURGOS': 'private',
    u'CATÓLICA SANTA TERESA DE JESÚS DE ÁVILA': 'private',
    u'BARCELONA': 'public',
    u'RIOJA': 'public',
    u'PAÍS VASCO/EUSKAL HERRIKO UNIBERTSITATEA': 'public',
    u'CAMILO JOSÉ CELA': 'private',
    u'OBERTA DE CATALUNYA': 'private',
    u'INTERNACIONAL DE CATALUNYA': 'private',
    u'SANTIAGO DE COMPOSTELA': 'public',
    u'MIGUEL HERNÁNDEZ DE ELCHE': 'public',
    u'NAVARRA': 'private',
    u'CARDENAL HERRERA-CEU': 'private',
    u'ABAT OLIBA CEU': 'private',
    u'VIC': 'private',
    u'MÁLAGA': 'public',
    u'SALAMANCA': 'public',
    u'CARLOS III DE MADRID': 'public',
    u'ALMERÍA': 'public',
    u'INTERNACIONAL DE ANDALUCÍA': 'public',
    u'MURCIA': 'public',
    u'AUTÓNOMA DE MADRID': 'public',
    u'REY JUAN CARLOS': 'public',
    u'A CORUÑA': 'public',
    u'CATÓLICA SAN ANTONIO': 'private',
    u'PABLO DE OLAVIDE': 'public',
    u'DEUSTO': 'private',
    u'ALCALÁ': 'public',
    u'CANTABRIA': 'public',
    u'VIGO': 'public',
    u'POLITÉCNICA DE CATALUNYA': 'public',
    u'INTERNACIONAL DE LA RIOJA': 'private',
    u'ANTONIO DE NEBRIJA': 'private',
    u'CATÓLICA DE VALENCIA SAN VICENTE MÁRTIR': 'private',
    u'TECNOLOGÍA Y EMPRESA': 'private',
}
# {university id: name}, read from cache/university_ids.p at import time
# ("" when that cache file could not be read).
university_ids = load_university_ids()

# The cache files must be generated before this module is first used;
# running the file as a script rebuilds all of them.
if __name__ == "__main__":
    regenerate_cache_files()