-
Notifications
You must be signed in to change notification settings - Fork 0
/
visualise.py
96 lines (87 loc) · 3.83 KB
/
visualise.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
'''Simple visualisation tool, creating html page from brat stand off files.
Takes two compulsory arguments:
1) the name of the plain text file
2) the name of the matching brat format annotation file
see <http://brat.nlplab.org/standoff.html> for more on file formats
And two optional arguments:
prev - the name of an html file that logically precedes the viewed page
next - the name of an html file that logically succeeds the viewed page
Note: HTML special characters are not escaped so relies on UTF-8 for
accurate rendering.
David King <[email protected]>
For ViBRANT <http://vbrant.eu//>
January 2013
License: GPLv2 <http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt>
'''
import re
# Matches one text-bound annotation line of a brat stand-off file:
#   ID <TAB> type <SEP> start_offset <SEP> end_offset <TAB> covered text
# Raw strings keep the regex escapes (\S, \d, \W) away from Python's own
# string-escape processing.
extract_stand_off = re.compile(
    r'(?P<ID>\S+)\t'
    r'(?P<type>\S+)\W'
    r'(?P<start_offset>\d+)\W'
    r'(?P<end_offset>\d+)\t'
    # the covered text may contain spaces, so capture the rest of the line
    # (up to the line ending) rather than only its first word
    r'(?P<text>\S[^\r\n]*)')
# Fixed opening of every generated page: XHTML 1.0 Strict doctype, a head
# declaring UTF-8 content and a style rule giving <dfn> elements a chartreuse
# background, then the opening <body> and <div> tags.
top = ''.join((
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" ',
    '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n',
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n',
    '<head>\n',
    '<title>Visualisation</title>\n',
    '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n',
    '<style type="text/css">\n',
    'dfn { background-color: chartreuse; }\n',
    '</style>\n',
    '</head>\n',
    '<body>\n',
    '<div>\n',
))

# Fixed closing of every generated page, matching the tags opened by `top`.
tail = ''.join((
    '</div>\n',
    '</body>\n',
    '</html>\n',
))
def mark_up(text_file, ann_file, prev=None, next=None):
    '''Build an html page showing a text with its brat annotations marked.

    Each usable annotation becomes a <dfn> element wrapping the annotated
    stretch of the text, with the annotation type as its title attribute.

    text_file - name of the UTF-8 plain text file
    ann_file  - name of the matching brat stand-off annotation file
    prev      - optional name of the html file preceding this page; when
                given, a "prev" button linking to it is added
    next      - optional name of the html file following this page; when
                given, a "next" button linking to it is added
                (the name shadows the builtin but is kept for callers)

    Returns the complete page as one string.

    Annotation lines that do not match extract_stand_off are ignored.
    Annotations are applied in offset order; where two overlap, the part
    already marked by the earlier one is not repeated.
    HTML special characters in the text are not escaped.
    '''
    # newline='\n' switches off newline translation, so character offsets
    # are counted against the text exactly as it is stored
    with open(text_file, 'r', encoding='utf-8', newline='\n') as f:
        text = f.read()
    with open(ann_file, 'r', encoding='utf-8', newline='\n') as f:
        annotations = f.readlines()

    # collect (start, end, type) for every line that parses as an annotation
    spans = []
    for annotation in annotations:
        stand_off = extract_stand_off.search(annotation)
        if stand_off is None:
            # not a text-bound annotation line, nothing to mark up
            continue
        spans.append((int(stand_off.group('start_offset')),
                      int(stand_off.group('end_offset')),
                      stand_off.group('type')))
    # the file need not list annotations in text order
    spans.sort(key=lambda span: (span[0], span[1]))

    if not spans:
        # the loop below is then a no-op and the page is just the plain text
        print('no annotations to apply in {:s}'.format(ann_file))

    body_list = []
    cursor = 0  # offset of the first character not yet copied to the page
    for start, end, label in spans:
        # clip to what has not been emitted yet so overlaps never repeat text
        start = max(start, cursor)
        if end <= start:
            continue
        body_list.append(text[cursor:start])
        # take the marked text from the document itself: it is complete even
        # when the annotation covers several words
        body_list.append('<dfn title="{:s}">{:s}</dfn>'
                         .format(label, text[start:end]))
        cursor = end
    # remaining text from the last annotation to the end
    body_list.append(text[cursor:])

    # optional navigation links, added whether or not there are annotations
    if prev is not None:
        body_list.append('<a href="' + prev + '"><button>prev</button></a>')
    if next is not None:
        body_list.append('<a href="' + next + '"><button>next</button></a>')

    # finish off building the page, keeping the line breaks of the text
    body = ''.join(body_list)
    body = body.replace('\n', '<br/>\n')
    return top + body + tail
if __name__ == '__main__':
    import os

    # Directory holding matching .ann/.txt pairs; each generated .html page
    # is written next to the pair it was built from.
    source_dir = '../bca_03_aves_v4_ocr/'
    suffix = '.ann'
    for ann_file in os.listdir(source_dir):
        # require the real extension, not just a name ending in "ann"
        if not ann_file.endswith(suffix):
            continue
        stem = ann_file[:-len(suffix)]
        txt_path = os.path.join(source_dir, stem + '.txt')
        if not os.path.exists(txt_path):
            # annotation file without its text: nothing to visualise
            continue
        html_page = mark_up(txt_path, os.path.join(source_dir, ann_file))
        if html_page is None:
            # check before opening so no empty html file is left behind
            continue
        html_path = os.path.join(source_dir, stem + '.html')
        with open(html_path, 'w', encoding='utf-8', newline='\n') as f:
            f.write(html_page)