-
Notifications
You must be signed in to change notification settings - Fork 1
/
rule-stats.py
executable file
·61 lines (52 loc) · 1.62 KB
/
rule-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python3
import sys
import utils
from collections import Counter
# Name of the book to process; it selects every input and output file below.
book = sys.argv[1]

# Three CoNLL-U views loaded through the project-local helper. Later code
# indexes each entry as [0] (used as a 1-based index into `cg3`) and [1]
# (CoNLL-U text handed to word_lines).
# NOTE(review): the exact return shape of utils.load_conllu is not visible
# in this file - confirm against utils.
parsed = utils.load_conllu(f'temp/merged/{book}.conllu')
checked = utils.load_conllu(f'data/checked/{book}.conllu')
manual = utils.load_conllu(f'data/manual/{book}.conllu')

# CG3 output split into blank-line-separated blocks. The encoding is given
# explicitly so that reading does not depend on the locale default, which
# can fail on non-ASCII corpus text.
with open(f'temp/parsed-cg3/{book}.txt', encoding='utf-8') as fin:
    cg3 = fin.read().strip().split('\n\n')

# Per-rule tallies, keyed by the full rule tag (e.g. 'SETPARENT:...').
right = Counter()
wrong = Counter()
def get_rules(cg: str):
    """Collect the SETPARENT/MAP rule tags found on each tab-indented line.

    Args:
        cg: One block of CG3 output as a single string.

    Returns:
        A list with one entry per line of ``cg`` that starts with a tab, in
        order. Each entry is the list of whitespace-separated words on that
        line that start with ``SETPARENT:`` or ``MAP:`` (possibly empty).
        Lines that are empty or do not start with a tab contribute nothing.
    """
    rule_prefixes = ('SETPARENT:', 'MAP:')
    ret = []
    for line in cg.splitlines():
        # Covers both the empty line and the "first char is not a tab" case.
        if not line.startswith('\t'):
            continue
        # One entry per tab-indented line, even when no rule tag is present,
        # so that positions in the result stay aligned with those lines.
        ret.append([w for w in line.split() if w.startswith(rule_prefixes)])
    return ret
# Every rule tag recorded for a sentence present in `checked` is tallied as
# right, without comparing individual words.
for sent_key in checked:
    # parsed[...][0] is used as a 1-based position into the CG3 blocks.
    cg_block = cg3[parsed[sent_key][0] - 1]
    for line_rules in get_rules(cg_block):
        right.update(line_rules)
def word_lines(block: str):
    """Yield the 7th and 8th columns of each plain token line in ``block``.

    A line is used only if it has exactly ten tab-separated columns and its
    first column is numeric; comment lines and IDs such as ``1-2`` or ``1.1``
    are therefore skipped. In CoNLL-U the two yielded columns are HEAD and
    DEPREL.

    Args:
        block: CoNLL-U text for one sentence.

    Yields:
        ``(head, deprel)`` tuples of strings, in line order.
    """
    for line in block.splitlines():
        cols = line.split('\t')
        if len(cols) != 10:
            continue
        # Non-numeric IDs (ranges, decimals) fail isnumeric() and are skipped.
        if not cols[0].isnumeric():
            continue
        yield cols[6], cols[7]
# Score rules word by word against the manually annotated sentences: a
# SETPARENT rule is right when the parsed head column equals the manual one,
# and a MAP rule is right when the relation column does.
for k in manual:
    # parsed[k][0] is used as a 1-based position into the CG3 blocks.
    rules = get_rules(cg3[parsed[k][0] - 1])
    p = parsed[k][1]
    m = manual[k][1]
    # rules[i] is taken to belong to the i-th token line.
    # NOTE(review): this assumes exactly one tab-indented CG3 line per
    # token - confirm against the CG3 output.
    for i, (pl, ml) in enumerate(zip(word_lines(p), word_lines(m))):
        # Classify by prefix, not by substring: a tag such as
        # 'SETPARENT:12:REMAP' contains 'MAP' and would otherwise be counted
        # as both a head rule and a relation rule.
        head = [r for r in rules[i] if r.startswith('SETPARENT:')]
        rel = [r for r in rules[i] if r.startswith('MAP:')]
        (right if pl[0] == ml[0] else wrong).update(head)
        (right if pl[1] == ml[1] else wrong).update(rel)
# Write one tab-separated row per rule tag seen in either tally:
# tag, right count, wrong count. Rows are sorted by tag; a tag missing from
# one Counter reads as 0 there.
with open(f'temp/stats/{book}.txt', 'w') as fout:
    seen = set(right) | set(wrong)
    fout.writelines(
        f'{k}\t{right[k]}\t{wrong[k]}\n' for k in sorted(seen)
    )