-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reviewing_latent_topics.py
140 lines (107 loc) · 4.13 KB
/
Reviewing_latent_topics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
'''
Created on 8 dec. 2016
@author: tewdewildt
'''
#import csvkit
import csv
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import re
from six import iteritems
import pickle
import logging
import gensim
from gensim import corpora, models, similarities
from pprint import pprint
# Configure the root logger once so that INFO-level records are emitted
# (gensim reports its progress through the logging module).
# Note: logging.basicConfig() only has an effect while the root logger has no
# handlers; any later basicConfig() call is ignored unless force=True is
# passed (Python 3.8+).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
#csv.field_size_limit(500 * 1024 * 1024)
'''
Import data from the csv file and put it into a dictionary of columns.
The CSV file should include columns named 'Abstract', 'Author Keywords',
'Title' and 'DOI'. One row should be one article.
'''
# Column-oriented view of the export: data[header] is the list of that
# column's values, one entry per row, in file order.
data = {}
# newline='' leaves newline handling to the csv module, as its documentation
# requires, so line breaks embedded in quoted fields are parsed correctly.
with open('./data/scopus.csv', encoding="utf8", errors='ignore', newline='') as infile:
    # restval='' fills the missing cells of short rows with an empty string
    # instead of None, so the text columns can always be concatenated later.
    reader = csv.DictReader(infile, restval='')
    for row in reader:
        for header, value in row.items():
            data.setdefault(header, []).append(value)
'''
Combine abstract, author keywords and title of every article into one
cleaned list of tokens (text_of_articles[i] belongs to row i of the CSV).
'''
tokenizer = RegexpTokenizer(r'\w+')
nltk.download('stopwords')
# A frozenset gives constant-time membership tests. It is stored under its own
# name so the imported nltk `stopwords` module is not overwritten by its data.
english_stopwords = frozenset(stopwords.words('english'))
# Stemming is currently disabled; the stemmer is kept so it can be re-enabled.
p_stemmer = PorterStemmer()

# Patterns are compiled once instead of on every article.
_PLACEHOLDER_RE = re.compile(r'\[no abstract available\]', re.IGNORECASE)
_QUOTE_OR_BRACKET_RE = re.compile(r'["\[]')
_URL_RE = re.compile(r'http\S+')
_NON_WORD_RE = re.compile(r'\W+')


def _tokenize_article(text):
    """Return the cleaned, lower-cased tokens of one article's text.

    Steps, in order: drop the "[No abstract available]" placeholder, replace
    double quotes and opening brackets by spaces, remove URLs, collapse all
    non-word characters to single spaces, lower-case, remove digits, tokenize,
    and finally drop English stop words and tokens of length <= 1.
    """
    # The placeholder has to be removed as a whole phrase and before the
    # brackets are replaced. str.strip() is not suitable here: its argument is
    # a set of characters to trim from both ends, not a substring.
    text = _PLACEHOLDER_RE.sub(' ', text)
    text = _QUOTE_OR_BRACKET_RE.sub(' ', text)
    text = _URL_RE.sub('', text)
    text = _NON_WORD_RE.sub(' ', text)
    text = text.lower()
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Filtering into a new list avoids removing items from a list while
    # iterating over it, which silently skips the element after each removal.
    return [
        token for token in tokenizer.tokenize(text)
        if len(token) > 1 and token not in english_stopwords
    ]


text_of_articles = [
    # `or ''` guards against None cells coming from incomplete CSV rows.
    _tokenize_article(' '.join(part or '' for part in parts))
    for parts in zip(data['Abstract'], data['Author Keywords'], data['Title'])
]
'''
Make a topic model
'''
# Write the log records of the model creation to a file.
# An explicit FileHandler is used because logging.basicConfig() does nothing
# once the root logger already has a handler (it was configured at the top of
# this script), so a second basicConfig(filename=...) call would never create
# the log file.
log_file_handler = logging.FileHandler('../save/log_topic_model_creation.log', mode='w')
log_file_handler.setLevel(logging.INFO)
log_file_handler.setFormatter(
    logging.Formatter(fmt='%(asctime)s : %(levelname)s : %(message)s',
                      datefmt='%m-%d %H:%M'))
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
root_logger.addHandler(log_file_handler)

number_topics_to_be_found = 5
number_passes = 5
num_words_to_show = 10  # not used further down in this script

# Map every distinct token to an integer id and persist the mapping.
dictionary = corpora.Dictionary(text_of_articles)
dictionary.compactify()  # remove gaps in id sequence after words that were removed
dictionary.save('../save/gensim_dictionary.dict')

# Bag-of-words representation of every article, stored in Matrix Market format.
corpus = [dictionary.doc2bow(text) for text in text_of_articles]
corpora.MmCorpus.serialize('../save/gensim_corpus.mm', corpus)

# The LDA model is trained on the tf-idf weighted corpus, not on raw counts.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=number_topics_to_be_found,
    passes=number_passes,
)
lda.save('../save/gensim_LDA_model.lda')
'''
Build a dictionary {DOI: tokens} holding only the articles that are related
to at least one of the selected topics.
'''
topics_of_interest = [0, 1]
threshold = 0.25
doc_lda = lda[corpus_tfidf]
dict_articles_selected_topics = {}
for article_index, topic_weights in enumerate(doc_lda):
    # An article qualifies as soon as one topic of interest reaches the
    # threshold weight in its topic distribution.
    is_relevant = any(
        topic_id in topics_of_interest and weight >= threshold
        for topic_id, weight in topic_weights
    )
    if is_relevant:
        # Keyed by DOI: a later article with the same DOI value replaces an
        # earlier one.
        dict_articles_selected_topics[data['DOI'][article_index]] = text_of_articles[article_index]
'''
From the articles on the selected topics, keep those that mention the value
of interest (here: justice) and print their DOIs.
'''
justice = ['equity', 'inequity', 'fair', 'unfair', 'justice', 'injustice', 'impartial', 'unbiased', 'objectivity', 'lawful', 'unlawful', 'egalitarian', 'inegalitarian', 'distributive', 'fairness', 'justness', 'impartiality', 'equitable']
justice_terms = set(justice)
# An article is kept when its token list shares at least one word with the
# justice vocabulary.
dict_articles_selected_topics_selected_values = {
    doi: tokens
    for doi, tokens in dict_articles_selected_topics.items()
    if not justice_terms.isdisjoint(tokens)
}
for doi in dict_articles_selected_topics_selected_values:
    print(doi)