import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import os
import pandas.io.json as pd_json
from collections import Counter,defaultdict
import string
import textstat
import statistics
import pickle
import torch
from transformers import *
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from tqdm import tqdm
import difflib
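
# Feature preparation for the ICLR 2017 parsed-pdf / review JSON data: build a TF-IDF model over
# abstracts, map author domains to affiliations, extract per-paper features, and attach
# accept/reject labels.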
def load_data_files_into_raw_df(data_path, data_split_category, email_institute_affiliation_mapper, tfidf_matrix, university_score_df):
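    """Featurize every parsed-pdf JSON file in the given data split.

    Extracts abstract, author, reference and section features for each paper, using the
    precomputed tfidf_matrix, the domain -> affiliation mapper and the per-institute
    score table (university_score_df), and returns them as a DataFrame.
    """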
    stop_words = stopwords.words('english')
    punct_removal_table = {ord(char): None for char in string.punctuation}
    MODELS = {"BertModel": (BertModel, BertTokenizer, 'bert-base-uncased'),
              "OpenAIGPTModel": (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
              "GPT2Model": (GPT2Model, GPT2Tokenizer, 'gpt2'),
              "TransfoXLModel": (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
              "XLNetModel": (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
              "XLMModel": (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
              "DistilBertModel": (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
              "RobertaModel": (RobertaModel, RobertaTokenizer, 'roberta-base'),
              "XLMRobertaModel": (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base')}
    model_class, tokenizer_class, pretrained_weights = MODELS['BertModel']
    #Load pretrained model/tokenizer
    print("LOADING TRANSFORMER MODEL ............")
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)
    nlp = pipeline('feature-extraction')
    print("TRANSFORMER MODEL LOADED!")
    with open(os.path.join(data_path, 'WordsFromTitleofTop200Papers.txt'), encoding="utf8") as f:
        top_200_titles_words_counter = Counter([word for word in f.read().translate(punct_removal_table).lower().split() if word not in stop_words])
    #KEEP THE TOP 5% MOST FREQUENT TITLE WORDS
    top_200_titles_vocab = [key for key, value in top_200_titles_words_counter.most_common(int(0.05 * len(top_200_titles_words_counter)))]
    directory_in_string = data_path + '/iclr_2017/' + str(data_split_category) + '/parsed_pdfs'
    #directory_in_string = data_path + '/iclr_2017/train/parsed_pdfs'
    #review_directory_in_string = data_path + '/iclr_2017/train/reviews'
    review_directory_in_string = data_path + '/iclr_2017/' + str(data_split_category) + '/reviews'
    directory_content = os.fsencode(directory_in_string)
    list_of_file_dicts, paper_data_df = [], pd.DataFrame()
    file_number = 0
    print("FEATURIZING DATA ............")
    for file in tqdm(os.listdir(directory_content)):
        filename = os.fsdecode(file)
        file_dict = defaultdict()
        #print("\nOpening file : ", filename)
        with open(os.path.join(directory_in_string, filename), encoding="utf8") as file:
            data = json.load(file)
            file_dict['paper_id'] = data['name']
            paper_metadata = data['metadata']
            ################################ ABSTRACT ###################################################
            #print("Extracting abstract features....")
            abstract_text = paper_metadata['abstractText'].translate(punct_removal_table).lower()
            #1. BERT et al. encoding
            '''input_ids = torch.tensor([tokenizer.encode(abstract_text)]).unsqueeze(0)
            outputs = model(input_ids)
            last_hidden_states = outputs[0]  # The last hidden state is the first element of the output tuple
            file_dict['attention_based_encoding'] = last_hidden_states[0][0]'''
            file_dict['feature_extraction_encoding'] = nlp(abstract_text)[0][0]  #DistilBERT feature-extraction pipeline
            #2. TFIDF ENCODING
            file_dict['tfidf_encoding'] = tfidf_matrix.toarray()[file_number]
            file_number = file_number + 1
            #title = paper_metadata['title'].lower()
            #3. WHETHER THE ABSTRACT SHARES MORE THAN 2 WORDS WITH THE TOP 200 PAPER TITLES
            if len(set(abstract_text.split()).intersection(set(top_200_titles_vocab))) > 2:
                file_dict['words_from_top_200_title'] = True
            else:
                file_dict['words_from_top_200_title'] = False
            #4. ABSTRACT LENGTH
            file_dict['abstract_length'] = textstat.lexicon_count(abstract_text, removepunct=True)
            #5. ABSTRACT COMPLEXITY
            flesch = 1 / textstat.flesch_reading_ease(abstract_text)
            dale_chall = textstat.dale_chall_readability_score(abstract_text)
            file_dict['abstract_complexity'] = (flesch + dale_chall) / 2
            #6. ABSTRACT NOVELTY (punctuation was stripped above, so "state-of-the-art" collapses to
            #   "stateoftheart"; match the phrases as substrings rather than against the word set)
            if any(phrase in abstract_text for phrase in ('outperforms', 'state of the art', 'stateoftheart')):
                file_dict['abstract_novelty'] = True
            else:
                file_dict['abstract_novelty'] = False
            ################################ TITLE ###################################################
            ################################ AUTHORS ###################################################
            #print("Extracting authors features.....")
            #1. NUMBER OF AUTHORS
            reported_num_of_authors = len(paper_metadata['authors'])
            if reported_num_of_authors == 0:
                file_dict['number_of_authors'] = 2  #fallback when no authors were parsed (roughly an average count)
            else:
                file_dict['number_of_authors'] = reported_num_of_authors
            #2. AUTHOR AFFILIATION SCORE
            author_emails = paper_metadata['emails']
            author_institutes = [email.split('@')[1] for email in author_emails]
            research_strength_score = 0
            if len(author_institutes) != 0:
                for institute in author_institutes:
                    closest_matches = difflib.get_close_matches(institute, email_institute_affiliation_mapper.keys())
                    if len(closest_matches) == 0:
                        #if there are no close matches, the domain is not a university from our list,
                        #which means it is either an unknown university or a company.
                        #Either way it is safe to give it a HIGH SCORE - even if the paper is not from a
                        #strong source, the other features will compensate.
                        research_strength_score = research_strength_score + 60  #DECISION!
                    else:
                        affiliated_unis_from_string_match = [email_institute_affiliation_mapper[match] for match in closest_matches]
                        sub_score = 0
                        for affiliated_uni in affiliated_unis_from_string_match:
                            try:
                                sub_score = sub_score + university_score_df.loc[university_score_df.institute == affiliated_uni, 'count'].values[0]
                            except Exception as e:
                                #if this raises, the match is a university, but it does not fall in our
                                #list of 100 universities with CSRankings scores; it is most likely not
                                #a top university, so it is safe to give it a LOW SCORE
                                sub_score = sub_score + 6  #DECISION!
                        closest_matches_avg_score = sub_score / len(affiliated_unis_from_string_match)
                        research_strength_score = research_strength_score + closest_matches_avg_score
                file_dict['research_strength_score'] = research_strength_score / len(author_institutes)
            else:
                file_dict['research_strength_score'] = 0
            ################################ AUTHORS ###################################################
            ################################ REFERENCES ###################################################
            #print("Extracting references features.....")
            references_list, ref_mentions_list = paper_metadata['references'], paper_metadata['referenceMentions']
            #1. NUMBER OF REFERENCES
            file_dict['num_of_references'] = len(references_list)
            #2. MOST RECENT REFERENCE YEAR (0 if the paper has no parsed references)
            ref_years_list = [ref_dict['year'] for ref_dict in references_list]
            file_dict['most_recent_ref_year'] = max(ref_years_list) if ref_years_list else 0
            #3. AVG LENGTH OF REF MENTION
            if len(ref_mentions_list) != 0:
                file_dict['avg_len_of_ref_mention'] = statistics.mean([ref_dict['endOffset'] - ref_dict['startOffset'] for ref_dict in ref_mentions_list])
            else:
                file_dict['avg_len_of_ref_mention'] = 0
            #4. NUMBER OF RECENT REFERENCES (a reference counts as recent if it is less than 4 years old)
            file_dict['num_of_recent_references'] = sum([1 for year in ref_years_list if paper_metadata['year'] - year < 4])
            ################################ REFERENCES ###################################################
            ################################ CONTENT ###################################################
            #print("Extracting content features.....")
            #content housekeeping
            sections = paper_metadata['sections']
            if sections is None:
                continue
            else:
                section_content = ''
                for section in sections:
                    section_content = section_content + " " + section['text'].translate(punct_removal_table).lower()
                file_dict['contains_githib_link'], file_dict['contains_appendix'] = False, False
                #1. NUMBER OF SECTIONS
                file_dict['number_of_sections'] = len(sections)
                #2. CONTAINS GITHUB LINK
                for section in sections:
                    if 'github' in section['text'].lower():
                        file_dict['contains_githib_link'] = True
                        break
                #3. READABILITY
                flesch_score, dale_chall_score = 0, 0
                for section in sections:
                    flesch_score = flesch_score + textstat.flesch_reading_ease(section['text'])
                    dale_chall_score = dale_chall_score + textstat.dale_chall_readability_score(section['text'])
                flesch_score, dale_chall_score = flesch_score / file_dict['number_of_sections'], dale_chall_score / file_dict['number_of_sections']
                file_dict['content_complexity'] = ((1 / flesch_score) + dale_chall_score) / 2
                #4. CONTAINS APPENDIX
                for section in sections:
                    if section['heading'] is not None:
                        if 'APPENDIX' in section['heading'] or section['heading'].split()[0] in set(string.ascii_uppercase):
                            file_dict['contains_appendix'] = True
                            break
                #5. NUMBER OF UNIQUE WORDS (split into words first; counting the raw string would count unique characters)
                file_dict['number_of_unique_words'] = len(Counter(section_content.split()))
            ################################ CONTENT ###################################################
        list_of_file_dicts.append(file_dict)
    features_df = pd.DataFrame(list_of_file_dicts)
    return features_df

def collect_data_labels(features_df, data_path, data_split_category):
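    """Read each paper's review JSON and attach its accept/reject label to features_df."""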
print("COLLECTING DATA LABELS ............")
accepted_values = []
review_directory_in_string = data_path + '/iclr_2017/' + str(data_split_category) + '/reviews'
for index, row in tqdm(features_df.iterrows()):
filename = row['paper_id'].replace(".pdf","") + '.json'
with open(os.path.join(review_directory_in_string, filename),encoding="utf8") as file:
data = json.load(file)
accepted_values.append(data['accepted'])
features_df["accepted"] = accepted_values
return features_df
def build_and_save_tfidf_model(data_path,data_split_category):
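    """Fit a TF-IDF vectorizer on the abstracts of the given data split and return the matrix and feature names."""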
    #collect corpus
    punct_removal_table = {ord(char): None for char in string.punctuation}
    corpus = []
    directory_in_string = data_path + '/iclr_2017/' + str(data_split_category) + '/parsed_pdfs'
    directory_content = os.fsencode(directory_in_string)
    for file in tqdm(os.listdir(directory_content)):
        filename = os.fsdecode(file)
        with open(os.path.join(directory_in_string, filename), encoding="utf8") as file:
            paper_metadata = json.load(file)['metadata']
            abstract_text = paper_metadata['abstractText'].translate(punct_removal_table)
            corpus.append(abstract_text)
    stop_words = stopwords.words('english')
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=stop_words)
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    return X, feature_names

def build_affiliation_dictionary(author_university_df,university_score_df):
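    """Build a mapping from domains (derived from author homepage URLs) to a single affiliation.

    Domains associated with more than one affiliation are treated as generic (e.g. gmail.com) and skipped.
    """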
    interim_mapper, mapper = {}, {}
    for index, row in author_university_df.iterrows():
        #columns: name, affiliation, homepage, scholarid
        link = row['homepage']
        try:
            if "http" in link:
                base = link.split("/")[2]
            else:
                base = link.split("/")[0]
            if "www" in base:
                base = ".".join(base.split(".")[1:])
            if base in interim_mapper:
                if row["affiliation"] not in interim_mapper[base]:
                    interim_mapper[base][row["affiliation"]] = 1
                else:
                    interim_mapper[base][row["affiliation"]] += 1
            else:
                interim_mapper[base] = {}
                interim_mapper[base][row["affiliation"]] = 1
        except Exception as e:
            print("Homepage : ", row['homepage'], " defaulted!")
    # for base in interim_mapper:
    #     if len(interim_mapper[base].keys()) > 1:
    #         print(base, interim_mapper[base])
    #interim_mapper now maps each domain to the affiliations it was associated with and how often.
    #Domains referring to many affiliations are generic ids like gmail.com and shouldn't be mapped;
    #here only domains with a single affiliation are kept, mapped to their most frequently referenced affiliation.
    for base in interim_mapper:
        if len(interim_mapper[base].keys()) < 2:
            #find the most frequently referenced affiliation
            max_count = 0
            max_ff = ""
            for aff in interim_mapper[base]:
                if interim_mapper[base][aff] > max_count:
                    max_count = interim_mapper[base][aff]
                    max_ff = aff
            mapper[base] = max_ff
    '''with open("processed_data/affiliation_dict", "wb") as output_file:
        pickle.dump(mapper, output_file)'''
    return mapper

#def read_reviews(filename):
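
# Example end-to-end usage (a minimal sketch; the CSV paths below are hypothetical placeholders,
# and university_score_df is assumed to contain the 'institute' and 'count' columns used above):
#
#     data_path, split = 'data', 'train'
#     author_university_df = pd.read_csv('csrankings_authors.csv')   # hypothetical input file
#     university_score_df = pd.read_csv('university_scores.csv')     # hypothetical input file
#     tfidf_matrix, feature_names = build_and_save_tfidf_model(data_path, split)
#     affiliation_mapper = build_affiliation_dictionary(author_university_df, university_score_df)
#     features_df = load_data_files_into_raw_df(data_path, split, affiliation_mapper, tfidf_matrix, university_score_df)
#     labeled_df = collect_data_labels(features_df, data_path, split)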