-
Notifications
You must be signed in to change notification settings - Fork 0
/
classify_data_citation.py
426 lines (336 loc) · 12.8 KB
/
classify_data_citation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
#!/usr/bin/env python
# coding: utf-8
# In[2]:
# Basic Data Manipulation
import pandas as pd
import numpy as np
import re
# Information Extraction
from openie import StanfordOpenIE
# Classify
from nltk.corpus import verbnet as vn
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
from spacy.symbols import dobj, obj, pobj, acomp, ccomp, pcomp, xcomp, conj, acomp, ccomp, pcomp, xcomp, advmod, amod
from spacy.symbols import neg, det, aux, prep, poss, nsubj, nsubjpass, csubj, csubjpass, det, prt
from spacy.symbols import VERB, AUX, DET, ADP, ADV, ADJ, NOUN, PRON, PROPN, PART
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex
# In[3]:
### Extract
# ref: https://github.com/philipperemy/stanford-openie-python
def extract_triple(text):
    """Annotate ``text`` with Stanford OpenIE and return the client's result.

    Args:
        text: the sentence (or passage) handed to ``client.annotate``.

    Returns:
        Whatever ``StanfordOpenIE.annotate`` returns for ``text``; the rest
        of this file treats it as a list of dicts with ``subject``,
        ``relation`` and ``object`` keys.
    """
    # NOTE(review): a new StanfordOpenIE client is constructed on every call;
    # if client start-up is expensive, consider sharing one client - confirm.
    with StanfordOpenIE() as openie_client:
        triples = openie_client.annotate(text)
    return triples
# In[4]:
### Select
# for similar jsons, keep the shortest json
# include triples with and without the dataset_prediction (for possible coref)
# Helper: count the "longest" object
def long_stuff(compare_stuff):
    """Return the "longest" string in ``compare_stuff``.

    Length is measured by the number of space characters, i.e. roughly the
    number of words, not the number of characters.

    Args:
        compare_stuff: a non-empty list of strings.

    Returns:
        The single string containing the most spaces. On ties the earliest
        such string in the list wins.

    Raises:
        ValueError: if ``compare_stuff`` is empty.
    """
    # max() keeps the first maximal element, so no intermediate DataFrame is
    # needed just to locate the winner.
    return max(compare_stuff, key=lambda item: item.count(" "))
# later, we can extract more info from the long object
# Helper: keep only the strings that are not a substring of another string in the list
def no_subset(A):
    """Drop every string that is a proper substring of another string in ``A``.

    Args:
        A: a list of strings (duplicates allowed).

    Returns:
        A list of the distinct strings of ``A`` that are not contained in any
        other, different string of ``A``. The result keeps first-appearance
        order, so it is reproducible from run to run (iterating a ``set`` of
        strings is not, because of hash randomization).
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique = list(dict.fromkeys(A))
    return [
        candidate
        for candidate in unique
        if not any(candidate in other and candidate != other for other in unique)
    ]
# Helper: Using rules to select tri:
def select_triple(triplets):
    """Reduce overlapping OpenIE triples to the main (longest) ones.

    Selection happens in two stages:

    1. Keep only subjects that are not a substring of another subject; for
       each kept subject keep only relations that are not a substring of
       another relation of that subject; for each (subject, relation) pair
       keep the object with the most words.
    2. Among the triples selected in stage 1, merge those that share both
       subject and object, keeping the relation with the most words.

    Args:
        triplets: list of dicts, each with ``"subject"``, ``"relation"`` and
            ``"object"`` string values.

    Returns:
        List of dicts with the same three keys. Triples appear in the order
        their (subject, object) pair is first produced by stage 1. An empty
        input yields an empty list.
    """
    # Stage 1: longest subject -> distinct relations -> longest object.
    selected = []
    subjects = no_subset([tri["subject"] for tri in triplets])
    for sub in subjects:
        tri_for_this_sub = [tri for tri in triplets if tri["subject"] == sub]
        relations = no_subset([tri["relation"] for tri in tri_for_this_sub])
        for rel in relations:
            candidate_objects = [
                tri["object"] for tri in tri_for_this_sub if tri["relation"] == rel
            ]
            selected.append(
                {"subject": sub, "relation": rel, "object": long_stuff(candidate_objects)}
            )

    # Stage 2: group by the exact (subject, object) pair. Keying a dict on the
    # pair guarantees that only triples which really share both values are
    # merged; triples with different subjects/objects are never lumped
    # together or dropped.
    relations_by_pair = {}
    for tri in selected:
        pair = (tri["subject"], tri["object"])
        relations_by_pair.setdefault(pair, []).append(tri["relation"])

    return [
        {"subject": subject, "relation": long_stuff(pair_relations), "object": object_text}
        for (subject, object_text), pair_relations in relations_by_pair.items()
    ]
# In[6]:
### Classify
# Subjects and Objects
# identify "I" and "we"
# detect (probably not exhaustively yet) a dataset mention, in part or in full
def find_author(text):
    """Tell whether ``text`` contains a first-person pronoun ("I" or "we").

    The text is lower-cased and split on whitespace; a pronoun only counts
    when it is a whole whitespace-delimited token (so "we," does not match).

    Args:
        text: the string to inspect.

    Returns:
        True if "i" or "we" is one of the tokens, otherwise False.
    """
    tokens = text.lower().split()
    return any(pronoun in tokens for pronoun in ("i", "we"))
# Regex fragments for generic words that signal a dataset mention.
# Raw strings are used so that ``\s`` reaches the regex engine as written
# instead of being an invalid string escape sequence.
data_keywords = [
    r'data',
    r'data\s*(?:set|base)s?',
    r'corp(us|ora)',
    r'tree\s*bank',
    r'(?:train|test|validation|testing|trainings?)\s*(?:set)',
    r'collections?',
    r'benchmarks?',
    r'surveys?',
    r'samples?',
    r'stud(y|ies)',
]
# One case-insensitive pattern matching any keyword as a whole word.
data_pattern = re.compile(r'\b(' + '|'.join(data_keywords) + r')\b', flags=re.IGNORECASE)
def find_dataset(text,data_name):
    """Tell whether ``text`` appears to mention a dataset.

    Two checks are tried in order:

    1. any whitespace-delimited token of ``data_name`` occurs verbatim
       (case-sensitive) among the whitespace-delimited tokens of ``text``;
    2. ``text`` matches the module-level ``data_pattern`` of generic dataset
       keywords (case-insensitive).

    Args:
        text: the string to inspect.
        data_name: the predicted dataset name (dataset_prediction).

    Returns:
        True if either check succeeds, otherwise False.
    """
    text_tokens = text.split()
    # A single shared token is enough to count as a hit.
    if any(name_token in text_tokens for name_token in data_name.split()):
        return True
    # Fall back to the generic data-citation keywords.
    return data_pattern.search(text) is not None
# In[22]:
# Relations
# ref: https://verbs.colorado.edu/verb-index/VerbNet_Guidelines.pdf
# https://docs.google.com/spreadsheets/d/18kn2z2df-M4ncUmoHPGqbs5nJyGL-k9R0d2slXdT830/edit?usp=sharing
# Load the VerbNet class table from disk.
verb_class_df = pd.read_csv("VerbNet_LF.csv")
# Lookup built from that table: the "Verb Class" column is discarded, rows are
# indexed by "Class Number", and the "Verb Type" column is taken as a dict
# (Class Number -> Verb Type).
verb_class = (
    verb_class_df
    .drop("Verb Class", axis=1)
    .set_index("Class Number")
    .to_dict()['Verb Type']
)
def get_classes(classids):
    """Map long VerbNet class ids to their parent-level class numbers.

    For an id such as ``"give-13.1-1"`` the parent-level class is the integer
    that directly follows the first ``-<digits>`` in the id (here ``13``).

    Args:
        classids: iterable of VerbNet class id strings.

    Returns:
        A set of ints, one per distinct parent-level class. Ids without a
        ``-<digits>`` part are skipped rather than raising.
    """
    classes = set()
    for classid in classids:
        # Skip the leading verb name and read the digits right after a hyphen.
        found = re.search(r"-(\d+)", classid)
        if found is None:
            # Not a "<verb>-<number>..." id; nothing to classify.
            continue
        classes.add(int(found.group(1)))
    return classes
def get_rel_classes(rel_nlp):
    """Collect the parent-level VerbNet classes of every token in a relation.

    Args:
        rel_nlp: iterable of tokens exposing ``lemma_`` (the caller passes
            the spaCy-parsed relation text).

    Returns:
        The union, over all tokens, of ``get_classes`` applied to the VerbNet
        class ids that ``vn.classids`` reports for the token's lemma.
    """
    collected = set()
    for tok in rel_nlp:
        collected.update(get_classes(vn.classids(tok.lemma_)))
    return collected
# In[8]:
# AEO Categories (tentative)
# ref: https://github.com/lizhouf/semantic_triplets/blob/main/scr/add_aeo.py
# clauses are characterized by:
# - temporal organization (the order in which the subject narrates events and actions in the story),
# - evaluative description (personal assessments made by the narrator), and
# - contextual orientation (usually information provided by the narrator that helps orient the listener)
# ref: Labov and Waletsky 1997 Labov, William, and Joshua Waletzky. 1997. “Narrative Analysis: Oral Versions of Personal Experience.” Journal of Narrative & Life History 7 (1–4): 3–38.
# We have:
# - Active Agency
# - Passive Agency
# - Possible Agency
# - Evaluative Description
# - Contextual Orientation
# Example Key Words
# Lemmas that flag an evaluative relation (used by get_cat together with an
# adjective-only object).
evaluation_verbs = [
    "feel",
    "smell",
    "taste",
    "look",
    "hear",
    "see",
    "think",
    "know",
]
# Lemmas that flag a contextual-orientation relation.
orientation_verbs = [
    "remember",
    "bear",
    "grow",
    "belong",
]
# Lemmas that flag a possible (imagined / modal) agency relation.
imagine_verbs = [
    "want",
    "should",
    "would",
    "could",
    "can",
    "might",
    "may",
]
def get_cat(this_rel, this_obj):
    """Assign an AEO category to a (relation, object) pair.

    Args:
        this_rel: iterable of tokens of the relation (a spaCy Doc/Span in
            this file); ``lemma_``, ``dep``, ``pos`` and ``tag_`` are read.
        this_obj: iterable of tokens of the object; ``lemma_`` and ``pos``
            are read.

    Returns:
        One of ``"Evaluation"``, ``"Agency_Possible"``, ``"Orientation"``,
        ``"Agency_Passive"``, ``"Agency_Active"``; or ``""`` when the relation
        contains "be" but none of the "be" sub-rules applies.

    A token lacking one of the attributes is treated as simply not matching
    the corresponding rule (attributes are read with ``getattr`` defaults
    instead of blanket ``except`` clauses, so unrelated errors still surface).
    """
    # --- gather the rule components from the relation ---------------------
    rel_has_evaluation = False
    rel_has_orientation = False
    rel_has_imagine = False
    rel_has_be = False
    rel_has_have = False
    rel_has_to = False
    rel_has_neg = False
    rel_has_VBG = False
    rel_num_verb = 0

    for token in this_rel:
        lemma = getattr(token, "lemma_", None)
        if lemma in evaluation_verbs:
            rel_has_evaluation = True
        if lemma in imagine_verbs:
            rel_has_imagine = True
        if lemma in orientation_verbs:
            rel_has_orientation = True
        if lemma == "be":
            rel_has_be = True
        if lemma == "have":
            rel_has_have = True
        if lemma == "to":
            rel_has_to = True

        if getattr(token, "dep", None) == neg:
            rel_has_neg = True

        # Count verbs and auxiliaries together.
        if getattr(token, "pos", None) in (VERB, AUX):
            rel_num_verb += 1

        if getattr(token, "tag_", None) == "VBG":
            rel_has_VBG = True

    # --- gather the rule components from the object -----------------------
    # "no" anywhere in the object counts as a negation.
    obj_has_no = any(getattr(token, "lemma_", None) == "no" for token in this_obj)

    # The object counts as adjectival only if an adjective is seen and no
    # noun / pronoun / proper noun is reached; the first such nominal token
    # resets the flag and stops the scan.
    obj_is_adj = False
    for token in this_obj:
        token_pos = getattr(token, "pos", None)
        if token_pos == ADJ:
            obj_is_adj = True
        if token_pos in (NOUN, PRON, PROPN):
            obj_is_adj = False
            break

    # --- decide (first matching rule wins) --------------------------------
    # fixed words
    if rel_has_evaluation and obj_is_adj:
        return "Evaluation"
    if rel_has_imagine:
        return "Agency_Possible"
    if rel_has_orientation:
        return "Orientation"
    # negation
    if rel_has_neg or obj_has_no:
        return "Orientation"
    # have / have to
    if rel_has_have:
        return "Agency_Passive" if rel_has_to else "Orientation"
    # be
    if rel_has_be:
        if obj_is_adj:
            return "Evaluation"
        if rel_has_VBG:
            return "Agency_Active"
        if rel_num_verb > 1:
            return "Agency_Passive"
        if rel_num_verb == 1:
            return "Orientation"
        # NOTE(review): "be" present but no VERB/AUX token counted - the
        # category is left empty here; confirm whether a default is wanted.
        return ""
    # none of the above: plain active agency
    return "Agency_Active"
# In[9]:
### Import and Manipulate Data
# Load the sentence-level predictions produced earlier in the pipeline.
citations = pd.read_csv("/nfs/turbo/hrg/data_detection/outputs_pipeline/4_pubs_sents_preds_ids.csv")
# Keep only the rows that carry a dataset prediction, renumbered from 0.
citations_true = citations.loc[citations["dataset_prediction"].notna()].reset_index(drop=True)
# In[23]:
# %%capture
# %timeit
# Initiate result df
# Build the result table: one row per selected triple of every sentence that
# has a dataset prediction.
result_columns = ["paper_id","sentence_text","dataset_prediction",
                  "subject","relation","object",
                  "subject_category","relation_categories","object_category","AEO_category"]

# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end; growing a DataFrame one ``.loc`` row at a time is needlessly slow
# for large inputs.
result_rows = []
for paper_id, sentence_text, dataset_prediction in zip(
        citations_true.paper_id,
        citations_true.sentence_text,
        citations_true.dataset_prediction):
    # Extract triples from the sentence, then keep only the main ones
    triples = extract_triple(sentence_text)
    triples_selected = select_triple(triples)

    for triple in triples_selected:
        sub = str(triple["subject"])  # ensure string, not int or bool
        rel = str(triple["relation"])
        rel_nlp = nlp(rel)
        obj = str(triple["object"])

        # Subject / object: "author" takes precedence over "dataset"
        Subject_Cat = ("author" if find_author(sub)
                       else "dataset" if find_dataset(sub, dataset_prediction)
                       else "other")
        Object_Cat = ("author" if find_author(obj)
                      else "dataset" if find_dataset(obj, dataset_prediction)
                      else "other")
        # Relation: set of parent-level VerbNet classes
        Relation_Cats = get_rel_classes(rel_nlp)
        # AEO category from the parsed relation and object
        AEO_Cat = get_cat(rel_nlp, nlp(obj))

        result_rows.append([paper_id, sentence_text, dataset_prediction,
                            sub, rel, obj,
                            Subject_Cat, Relation_Cats, Object_Cat, AEO_Cat])

df = pd.DataFrame(result_rows, columns=result_columns)
# In[24]:
# Write the classified triples to a CSV file in the working directory;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv("ICPSR_bib_data_citation_rhetoric_v01.csv",index=False)
# We are more interested in the columns with at least one author or dataset.