-
Notifications
You must be signed in to change notification settings - Fork 1
/
concepts_createNetwork.py
127 lines (100 loc) · 3.98 KB
/
concepts_createNetwork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import pandas as pd
import numpy as np
import json
from nltk.tokenize import word_tokenize
import re
import geopandas as gpd
import networkx as nx
import matplotlib.pyplot as plt
import collections
from sklearn.cluster import KMeans
from PIL import Image
import time
from datetime import datetime, timedelta, date
from os import listdir
from os.path import isfile, join
from geopandas import GeoDataFrame
from shapely.geometry import Point
import pickle
import math
from timeit import default_timer as timer
# The database table name to process is given here (could be passed as argv[1]).
doFilter = True          # if True, only related words above the frequency percentile enter the graph
filtertag = '_f'         # filename tag marking filtered output
tagfile = "3"            # run/version tag appended to all input/output filenames
db_name_table = 'PostsMadCar'  # str(argv[1])
datapath = '/home/davidpastor/Narrativas/MadCar/'
m_database = 'twitterdb'
# List of seed keywords (Spanish energy/climate vocabulary) that anchor the co-occurrence network.
keywords_list = ['descarbonización','descarbonizacion','clima','climático','climatico','combustible', 'CO2', 'climática', 'climatica', 'transición energética', 'renovable', 'energía', 'energia', 'energético', 'energética', 'energetico', 'energetica']
# NOTE(review): credentials hard-coded in source — move to env vars / config file.
m_user = 'david'
m_pass = 'password'
address = '127.0.0.1:3306'   # removed dead first assignment ('192.168.0.154') that was immediately overwritten
encoding = 'utf-8'
path_dicts = 'Tweets/'
# Word -> document-frequency dictionary produced by the preprocessing step.
with open(datapath + path_dicts + 'distFreq' + db_name_table + tagfile + '.pkl', 'rb') as f:
    dict_distFreqPost = pickle.load(f)
path_dfs = 'Tweets/'
# Preprocessed posts; must contain a 'tokens_text' column (list of tokens per post).
dfProcessed = pd.read_pickle(datapath + path_dfs + db_name_table + 'Processed' + tagfile + '.pkl')
print('Table loaded')
start = timer()
# Collect word frequencies to derive a percentile-based filtering threshold.
# Renamed the list from `f` to `freqs`: the old name shadowed the file handle
# opened above and was later clobbered by a scalar inside the main loop.
freqs = []   # frequencies of every word in the corpus
fin = []     # frequencies of the seed keywords (kept for inspection)
fout = []    # frequencies of all other words (kept for inspection)
for w, w_freq in dict_distFreqPost.items():
    freqs.append(w_freq)
    if w in keywords_list:
        fin.append(w_freq)
    else:
        fout.append(w_freq)
Gu = nx.Graph()
# Only words at or above the 95th frequency percentile survive filtering.
th = np.percentile(freqs, 95)
print(th)
# Build the keyword co-occurrence graph: for every post, connect each seed
# keyword found in the post to every other token of that post. Edge weights
# count co-occurrences (one increment per related-token occurrence per post).
#
# The original code duplicated the whole node/edge update block in the
# doFilter and not-doFilter branches; merged here into a single code path.
# The 'sin filtro' diagnostic used to print once per related word — now once.
if not doFilter:
    print('sin filtro')
for index, row in dfProcessed.iterrows():
    tokens_list = row['tokens_text']
    for keyword in keywords_list:
        if keyword not in tokens_list:
            continue
        # Frequencies default to 1 for words missing from the dictionary.
        fk = dict_distFreqPost.get(keyword, 1)
        for relatedword in tokens_list:
            if relatedword == keyword:
                continue
            fr = dict_distFreqPost.get(relatedword, 1)
            # When filtering, skip related words below the percentile threshold.
            if doFilter and fr < th:
                continue
            if not Gu.has_node(keyword):
                Gu.add_node(keyword, freq=fk)
            if not Gu.has_node(relatedword):
                Gu.add_node(relatedword, freq=fr)
            if Gu.has_edge(keyword, relatedword):
                # Flow: accumulate co-occurrence count on the existing edge.
                Gu[keyword][relatedword]['weight'] += 1
            else:
                Gu.add_edge(keyword, relatedword, weight=1)
# Report elapsed time and persist the network in two formats:
# GEXF for Gephi-style visualization, and a pickled graph for reuse in Python.
end = timer()
print(end - start)
print('Finished. Writing to file')
path_graphs = 'Tweets/'
out_base = datapath + path_graphs + db_name_table
nx.write_gexf(Gu, out_base + 'NetworkGraph' + tagfile + filtertag + '.gexf')
with open(out_base + 'Net' + tagfile + filtertag + '.cnf', 'wb') as out_file:
    pickle.dump(Gu, out_file, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved')