lexicalise.py (forked from ppke-nlpg/gut-besser-chunker)
#!/usr/bin/python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-
# Lexicalise the input and generate training data for a tagger: WORD POS CHUNK -> WORD POS (WORD-POS-CHUNK|POS-CHUNK)
# Parameters:
#  a) data in 'WORD POS CHUNK' format (CoNLL-2000 format with any chunk type)
#  b) W_s set: one entry per line (everything after the first column is ignored)
#  c) lexicalisation type (default: Full):
#     c1) Full: if the word is in W_s, then word+POS+chunk, else POS+chunk
#         (as described in Molina, Antonio, and Ferran Pla. "Shallow Parsing using Specialized HMMs."
#         The Journal of Machine Learning Research 2 (2002): 595-613.)
#     c2) Just words: if the word is in W_s, then word+POS+chunk, else the line is left unchanged
#     c3) None: nothing is done (the input is copied unchanged)
# Output: three columns per token, i.e. word word+POS word+POS+chunk, word POS POS+chunk, or word POS chunk
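#
# Illustrative example (hypothetical tokens; the membership test below suggests that W_s entries
# have the form word-chunktype, e.g. 'dog-NP'):
#   Full, 'dog-NP' in W_s:    dog NN I-NP  ->  dog dog+NN dog+NN+I-NP
#   Full, word not in W_s:    cat NN I-NP  ->  cat NN NN+I-NP
#   Just words, not in W_s:   cat NN I-NP  ->  cat NN I-NP
#   None:                     every line is copied unchanged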
import sys
if len(sys.argv) < 3:
    print('USAGE: {0} data W_Set-file [--just-words|--none]'.format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)
just_words = False
do_nothing = False
if len(sys.argv) > 3 and sys.argv[3] in {'--just-words', '--none'}:
    if sys.argv[3] == '--just-words':
        just_words = True
    else:
        do_nothing = True
# Read the data from stdin when '-' is given, otherwise open the named file
FP_origin = sys.stdin if sys.argv[1] == '-' else open(sys.argv[1], encoding='UTF-8')
with FP_origin, open(sys.argv[2], encoding='UTF-8') as FP_lex:  # e.g. train.iob2.wch
    if do_nothing:
        for line_origin in FP_origin:
            print(line_origin, end='')
    else:
        # ----- Build the set of selected words (lex) -----
        lex = {line_lex.strip().split()[0] for line_lex in FP_lex if len(line_lex) > 1}
        # ----- Build the new training file ---------------
        for line_origin in FP_origin:
            words_origin = line_origin.strip().split()
            if len(words_origin) == 0:  # blank line
                print()
            else:
                words_origin_ch = words_origin[2].split('-')
                if words_origin_ch[0] != 'O' and '{0}-{1}'.format(words_origin[0], words_origin_ch[1]) in lex:
                    print(words_origin[0],                                                  # Word
                          words_origin[0] + '+' + words_origin[1],                          # Word+POS-tag
                          words_origin[0] + '+' + words_origin[1] + '+' + words_origin[2])  # Word+POS-tag+IOB-label
                elif just_words:
                    print(words_origin[0],  # Word
                          words_origin[1],  # POS-tag
                          words_origin[2])  # IOB-label
                else:
                    print(words_origin[0],                           # Word
                          words_origin[1],                           # POS-tag
                          words_origin[1] + '+' + words_origin[2])   # POS-tag+IOB-label
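# Example invocations (hypothetical file names, assuming the script is executable):
#   ./lexicalise.py train.iob2 train.iob2.wch > train.lex                 # full lexicalisation
#   cat test.iob2 | ./lexicalise.py - train.iob2.wch --just-words         # read the data from stdin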