-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcalculate_mention_overlap.py
111 lines (96 loc) · 3.61 KB
/
calculate_mention_overlap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gzip
import json
import shlex
from pathlib import Path
from argparse import ArgumentParser
import tqdm
# Reads through all lines of a (possibly gzip-compressed) N-Triples file and
# gathers the surface forms (labels) in a dictionary whose values are the
# number of entities carrying that surface form. This assumes that an entity
# never has the same label twice; otherwise it would be counted more than once.
def compute_mention_overlap(
    filename: str,
    output_file_name=None,
    lang=None,
    filtered_relations=None,
    *,
    total_lines=15290824,
    show_progress=True,
):
    """Count, per surface form, how many triples of the dump carry it.

    If ``output_file_name`` names an existing file, the dictionary is loaded
    from that JSON file and the dump is not read at all. Otherwise the dump
    is scanned line by line and, when ``output_file_name`` is given, the
    resulting dictionary is written there as JSON.

    Args:
        filename: Path of the triple file; a name ending in ``gz`` is opened
            with ``gzip``.
        output_file_name: Optional JSON cache for the mention dictionary.
        lang: If set, only literals ending in ``@<lang>`` are considered.
        filtered_relations: Container of predicates (written exactly as in
            the file, e.g. ``<http://...#label>``) whose triples are counted.
            ``None`` counts every predicate.
        total_lines: Expected number of lines, only used as the progress-bar
            total.
        show_progress: Set to ``False`` to run without a progress bar.

    Returns:
        A ``(mention_dictionary, identifiers)`` tuple. ``mention_dictionary``
        maps each surface form to its number of occurrences; ``identifiers``
        is the set of subjects that contributed at least one surface form,
        or ``None`` when the dictionary came from the cache file.
    """
    if output_file_name and Path(output_file_name).exists():
        with open(output_file_name, encoding="utf-8") as cached:
            return json.load(cached), None

    lang_suffix = "@" + lang if lang else None
    mention_dictionary = {}
    identifiers = set()
    progress = tqdm.tqdm(total=total_lines) if show_progress else None
    opener = gzip.open if filename.endswith("gz") else open
    try:
        with opener(filename, "rt", encoding="utf-8") as source:
            for line in source:
                if progress is not None:
                    progress.update(1)
                if not line.strip():
                    # Blank lines carry no triple.
                    continue
                # Escape apostrophes so shlex does not treat them as the
                # start of a single-quoted string.
                # NOTE(review): inside double-quoted literals the inserted
                # backslash appears to survive tokenisation, so such labels
                # may be stored with a literal "\'" -- confirm if the label
                # text itself matters downstream.
                split_line = shlex.split(line.replace("'", "\\'"))
                relation = split_line[1]
                label = split_line[2]
                if lang_suffix and not label.endswith(lang_suffix):
                    continue
                # Strip the language tag, but only if there is one; a bare
                # slice with rfind() == -1 would chop the last character.
                tag_start = label.rfind("@")
                if tag_start != -1:
                    label = label[:tag_start]
                if filtered_relations is None or relation in filtered_relations:
                    identifiers.add(split_line[0])
                    mention_dictionary[label] = (
                        mention_dictionary.get(label, 0) + 1
                    )
    finally:
        if progress is not None:
            progress.close()

    if output_file_name:
        with open(output_file_name, "w", encoding="utf-8") as out:
            json.dump(mention_dictionary, out, indent=4)
    return mention_dictionary, identifiers
def calculate_overlap(mention_dictionary: dict, identifiers=None):
    """Summarise how ambiguous the surface forms in ``mention_dictionary`` are.

    Args:
        mention_dictionary: Maps each surface form to the number of entities
            that carry it.
        identifiers: Optional collection of entity identifiers; only its
            size is reported. ``None`` or an empty collection yields 0.

    Returns:
        A dictionary with
        ``num_identifiers`` (size of ``identifiers``),
        ``exact_match`` / ``two_match`` (surface forms carried by exactly
        one / two entities),
        ``three-ten`` / ``eleven_hundred`` (surface forms carried by 3-10 /
        11-100 entities),
        ``rest`` (surface forms carried by more than 100 entities) and
        ``details`` (the full histogram: entity count -> number of surface
        forms with that count, ordered by entity count).
    """
    # Histogram over the ambiguity of the surface forms. Every surface form
    # contributes exactly one to the bucket of its entity count; the label
    # text itself is irrelevant here (and is generally not a number).
    number_of_entities = {}
    for shared_by in mention_dictionary.values():
        number_of_entities[shared_by] = number_of_entities.get(shared_by, 0) + 1

    def bucket(low, high):
        # Surface forms whose entity count lies in [low, high].
        return sum(number_of_entities.get(i, 0) for i in range(low, high + 1))

    return {
        "num_identifiers": len(identifiers) if identifiers else 0,
        "exact_match": number_of_entities.get(1, 0),
        "two_match": number_of_entities.get(2, 0),
        "three-ten": bucket(3, 10),
        "eleven_hundred": bucket(11, 100),
        "rest": sum(
            count for shared_by, count in number_of_entities.items()
            if shared_by > 100
        ),
        "details": dict(sorted(number_of_entities.items())),
    }
if __name__ == "__main__":
    # Predicates whose objects are treated as surface forms of an entity.
    filtered_relations = [
        "<http://www.w3.org/2000/01/rdf-schema#label>",
        "<http://www.w3.org/2004/02/skos/core#altLabel>",
        "<http://schema.org/alternateName>",
    ]

    parser = ArgumentParser(
        description="Compute how many entities share each label in a KG dump."
    )
    parser.add_argument(
        "filename",
        type=str,
        help="triple file to scan; a name ending in 'gz' is read via gzip",
    )
    parser.add_argument(
        "mention_dictionary_filename",
        type=str,
        help="JSON file for the mention dictionary; reused if it already exists",
    )
    parser.add_argument(
        "results_filename",
        type=str,
        help="JSON file the overlap statistics are written to",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="en",
        help="language tag a label must carry to be counted (default: en)",
    )
    args = parser.parse_args()

    mention_dictionary, identifiers = compute_mention_overlap(
        args.filename,
        output_file_name=args.mention_dictionary_filename,
        lang=args.lang,
        filtered_relations=filtered_relations,
    )
    results = calculate_overlap(mention_dictionary, identifiers)
    with open(args.results_filename, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)