Skip to content

Commit ae80aa4

Browse files
authored
Kalmyk parser implementation -- ispras/lingvodoc-react#1119 (#1503)
* use postgres:ready if is * fix * image: 'docker_api:latest' * fix * dirty fix * docker.ini.template * minor * PGDATA * added hfst parser * added hfst parser * pretty output * correct output * lib * file dynamical name * fix * fix * More correct translation * Fix * Revert "Fix" This reverts commit 6150886. * Fix * /opt/hfst * moved * fixed requirement * docker-compose-proxy.yml * Cleanup * Catch when no gloss * Indicate markups with 'gr: Unknown' in any proposed variants with yellow color on OdtMarkupModal/PropertiesView * fix
1 parent c1676cd commit ae80aa4

File tree

11 files changed

+26394
-13
lines changed

11 files changed

+26394
-13
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""Kalmyk parser added

Revision ID: 0fc45203d6ab
Revises: 6e02e6fdf0f9
Create Date: 2024-01-11 12:20:38.119574

"""
from alembic import op

# revision identifiers, used by Alembic.
revision = '0fc45203d6ab'
down_revision = '6e02e6fdf0f9'
branch_labels = None
depends_on = None


def upgrade():
    """Register the Kalmyk HFST parser in public.parser.

    Inserts a single row (client_id=1, object_id=12) whose ``method``
    column names the dispatch function ``hfst_kalmyk``.
    """
    op.execute('''
    INSERT INTO public.parser(additional_metadata, created_at, object_id, client_id, name, parameters, method)
    VALUES(null, '2024-01-11 12:20:38', 12, 1, 'Парсер калмыцкого языка (hfst)', '[]',
    'hfst_kalmyk');
    ''')


def downgrade():
    """Remove the parser row created by upgrade().

    Schema-qualified as public.parser to match upgrade(), so both
    directions act on the same table regardless of search_path.
    """
    op.execute('''
    DELETE FROM public.parser WHERE method = 'hfst_kalmyk';
    ''')

aux_scripts/compile_xfst.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/python3
"""CGI endpoint that accepts LEXC + XFST uploads and compiles them with HFST.

A POST carrying both a LEXC and a RULES file writes them into TMP,
appends a ``save stack`` directive to the rules (so compilation emits
the transducer stack as STACK_FILENAME), copies both into DIR and
compiles the rules file.  Any other request renders the upload form.
"""
from hfst_dev import compile_xfst_file
import cgi
import cgitb
import os
import shutil

STACK_FILENAME = 'rules.xfst.hfst'
DIR = '/var/www/cgi-bin/xal/'
TMP = '/var/www/tmp/'

cgitb.enable(format='text')
POST = cgi.FieldStorage()

print('Content-Type: text/html; charset=utf-8')
print('')

try:
    # SECURITY: uploaded filenames are untrusted -- keep only the basename
    # so a crafted name like '../../etc/passwd' cannot escape TMP/DIR.
    LF = os.path.basename(POST['LEXC'].filename or '')
    RF = os.path.basename(POST['RULES'].filename or '')
except KeyError:
    # One or both upload fields are missing: fall through to the form.
    LF = ''
    RF = ''

if LF != '' and RF != '':
    # Persist the lexicon upload; 'with' guarantees flush+close.
    with open(TMP + LF, 'wb') as lexc_file:
        lexc_file.write(POST['LEXC'].file.read())
    # Persist the rules upload, then append the directive that makes the
    # compiled transducer stack get saved under STACK_FILENAME.
    with open(TMP + RF, 'wb') as rules_file:
        rules_file.write(POST['RULES'].file.read())
    with open(TMP + RF, 'a+') as rules_file:
        rules_file.write('\nsave stack ' + STACK_FILENAME)
    shutil.copyfile(TMP + LF, DIR + LF)
    shutil.copyfile(TMP + RF, DIR + RF)
    compile_xfst_file(DIR + RF)
    print('XFST compiled!')
else:
    print('''
<form method="post" enctype="multipart/form-data">
<input type="file" name="LEXC"><br/>
<input type="file" name="RULES"><br/>
<input type="submit" value="COMPILE!">
</form>
''')

docker/docker-compose-proxy.yml

+2
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ services:
7979
volumes:
8080
- ./frontend/dist:/dist
8181
- /opt/apertium:/opt/apertium
82+
- /opt/hfst:/opt/hfst
8283
- ./sock:/sock
8384
- /api/build/
8485
- ../:/api
@@ -102,6 +103,7 @@ services:
102103
volumes:
103104
- ./frontend-proxy/dist:/dist
104105
- /opt/apertium:/opt/apertium
106+
- /opt/hfst:/opt/hfst
105107
- ./sock-proxy:/sock
106108
- /api/build/
107109
- ../:/api

docker/docker-compose.yml

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ services:
4848
volumes:
4949
- ./frontend/dist:/dist
5050
- /opt/apertium:/opt/apertium
51+
- /opt/hfst:/opt/hfst
5152
- ./sock:/sock
5253
- /api/build/
5354
- ../:/api

lingvodoc/schema/gql_parserresult.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -299,10 +299,10 @@ def mutate(root, info, **args):
299299

300300
def get_parser_result_for_text(text, parse_method, apertium_path):
    """Dispatch *text* to the ParseMethods entry named *parse_method*.

    Apertium-based methods additionally receive the apertium
    installation path; every other method (timarkh, hfst, ...) is
    called with the text alone.
    """
    method = getattr(ParseMethods, parse_method)
    # 'x in s' is the idiomatic spelling of s.find(x) != -1.
    if "apertium" in parse_method:
        result = method(text, apertium_path)
    else:
        result = method(text)
    return result
307307

308308

lingvodoc/static/parsers/hfst/xal/lexicon.lexc

+25,882
Large diffs are not rendered by default.

lingvodoc/static/parsers/hfst/xal/rules.xfst

+351
Large diffs are not rendered by default.
Binary file not shown.

lingvodoc/utils/creation.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -552,11 +552,10 @@ def create_parser_result(
552552
r = requests.post(url=dedoc_url, files=files, data=data)
553553
dedoc_output = re.sub(r"(<sub>.*?</sub>)", "", r.content.decode('utf-8'))
554554

555-
if parser.method.find("timarkh") != -1:
556-
result = parse_method(dedoc_output, **arguments)
557-
558-
elif parser.method.find("apertium") != -1:
555+
if parser.method.find("apertium") != -1:
559556
result = parse_method(dedoc_output, apertium_path, **arguments)
557+
else:
558+
result = parse_method(dedoc_output, **arguments)
560559

561560
dbparserresult = ParserResult(client_id=client_id, object_id=object_id,
562561
parser_object_id=parser_object_id, parser_client_id=parser_client_id,

lingvodoc/utils/doc_parser.py

+77-6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from uniparser_moksha import MokshaAnalyzer
88
from uniparser_komi_zyrian import KomiZyrianAnalyzer
99
from nltk.tokenize import RegexpTokenizer
10+
from hfst_dev import HfstTransducer
11+
from lxml.html import fromstring
1012
import csv
1113
import os
1214
import tempfile
@@ -24,7 +26,7 @@ def print_to_str(*args, **kwargs):
2426

2527

2628
span_id_counter = 0
27-
def generate_html_wrap(word, ana_tag_list, lang=""):
29+
def generate_html_wrap(word, ana_tag_list, lang="", extra_state=""):
2830

2931
json_list = list()
3032
for ana_tag in ana_tag_list:
@@ -40,16 +42,16 @@ def generate_html_wrap(word, ana_tag_list, lang=""):
4042

4143
global span_id_counter
4244
span_id_counter += 1
43-
wrap = "<span class=\"unverified\"" + " id=" + str(span_id_counter) + ">"
45+
wrap = f"<span class=\"unverified {extra_state}\"" + " id=" + str(span_id_counter) + ">"
4446
for attr_json in json_list:
4547
span_id_counter += 1
4648
encoded_attrs = ((json.dumps(attr_json, ensure_ascii=False)).encode('utf8')).decode()
47-
wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"
49+
wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"
4850

4951
if lang == 'udm' and 'nom' in encoded_attrs:
5052
flag = True
5153
span_id_counter += 1
52-
wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"
54+
wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"
5355

5456
wrap += word + "</span>"
5557
return wrap
@@ -68,8 +70,9 @@ def insert_parser_output_to_text(text, parser_output, lang=""):
6870
if text[match_index-len(ESC_PAT):match_index] == ESC_PAT and text[match_index+len(word):match_index+len(word)+len(ESC_PAT)] == ESC_PAT:
6971
continue
7072
result_list.append(text[search_start_index:match_index])
71-
if (len(w_tag.contents) > 1):
72-
result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang))
73+
if len(w_tag.contents) > 1:
74+
extra_state = "broken" if any([a.get('gr') == "Unknown" for a in w_tag.find_all('ana')]) else ""
75+
result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang, extra_state=extra_state))
7376
search_start_index = match_index + len(word)
7477
result_list.append(text[search_start_index:])
7578
result = "".join(result_list)
@@ -358,6 +361,72 @@ def trans(elem):
358361

359362
return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)
360363

364+
def hfst_parser(dedoc_output, lang, debug_flag=False):
    """Analyze dedoc-extracted text with the HFST transducer for *lang*.

    Loads ``/opt/hfst/<lang>/lexicon.lexc`` and the compiled
    ``rules.xfst.hfst`` stack, looks each word of *dedoc_output* up in
    the inverted transducer, and builds the ``'<w><ana ...></ana>word</w>'``
    markup that insert_parser_output_to_text() merges back into the
    original document.  Analyses lacking a '+' separator are tagged
    ``gr="Unknown"`` (highlighted downstream); words with no analysis at
    all get an empty <ana> tag.
    """
    if debug_flag:
        with open("dedoc_output", 'w') as f:
            print(dedoc_output, file=f)

    parser_path = f"/opt/hfst/{lang}"

    with open(f"{parser_path}/lexicon.lexc", 'r') as f:
        lexicon = f.read()

    xfst = HfstTransducer.read_from_file(f"{parser_path}/rules.xfst.hfst")
    # The compiled stack maps analysis -> surface form; invert to analyze.
    xfst.invert()

    # BUGFIX: the previous classes [.|!|?|...] and [,| |:|"|-|*] misused
    # '|' (alternation has no meaning inside [...]) and the '"|-|*' run made
    # '-' a range operator, so a hyphen was never actually a delimiter.
    sent_regex = re.compile(r'[.!?]')
    word_regex = re.compile(r'[, :"*|-]')

    words = 0
    analyzed = 0
    parser_list = []

    # Strip the html markup dedoc produced; analyze plain text only.
    text = fromstring(dedoc_output).text_content()
    sentences = filter(None, (t.strip() for t in sent_regex.split(text)))
    for s in sentences:
        wordlist = filter(None, (t.strip() for t in word_regex.split(s)))
        for w in wordlist:
            words += 1
            lookup = xfst.lookup(w)
            if len(lookup) == 0:
                # Retry case-folded: sentence-initial capitals etc.
                lookup = xfst.lookup(w.lower())
            if len(lookup) > 0:
                analyzed += 1
                section = "'<w>"
                for lkp in (l[0] for l in lookup):

                    if '+' in lkp:
                        plus_pos = lkp.index('+')
                        lex = lkp[:plus_pos]
                        gr = lkp[plus_pos + 1:].replace('+', ',')
                    else:
                        lex = lkp
                        gr = "Unknown"

                    # Pull the Russian translation from the '! ...' comment of
                    # the matching lexicon entry.  lex/w are escaped so regex
                    # metacharacters in surface forms cannot corrupt the search.
                    lex_re = re.escape(lex)
                    if ((xln := re.search(f"[\r\n]{lex_re}:{re.escape(w)} .*!([^0].*)[\r\n]", lexicon)) or
                            (xln := re.search(f"[\r\n]{lex_re}:{re.escape(w.lower())} .*!([^0].*)[\r\n]", lexicon)) or
                            (xln := re.search(f"[\r\n]{lex_re}:.*!([^0].*)[\r\n]", lexicon))):
                        xln = xln.group(1)
                    else:
                        xln = "Unknown"

                    # NOTE(review): lex/gr/trans_ru are emitted unquoted; a
                    # translation containing spaces will not survive HTML
                    # attribute parsing -- consider quoting these values.
                    section += f'<ana lex={lex} gr={gr} parts="" gloss="" trans_ru={xln}></ana>'
                section += f"{w}</w>'"
                parser_list.append(section)
            else:
                parser_list.append(f'\'<w><ana lex="" gr="" parts="" gloss=""></ana>{w}</w>\'')

    parser_output = ", ".join(parser_list)

    if debug_flag:
        with open("parser_output", 'w') as f:
            print(parser_output, file=f)
        # Guard: empty input would otherwise raise ZeroDivisionError here.
        if words:
            print(f"Analyzed per word: {analyzed / words}")

    return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)
361430

362431
def timarkh_udm(dedoc_output):
    # Thin wrapper: parse dedoc output with the Timarkh uniparser for Udmurt.
    return timarkh_uniparser(dedoc_output, 'udm')
@@ -392,3 +461,5 @@ def apertium_bak(dedoc_output, apertium_path):
392461
def apertium_rus(dedoc_output, apertium_path):
    # Thin wrapper: parse dedoc output with the Apertium pipeline for Russian.
    return apertium_parser(dedoc_output, apertium_path, 'rus')
394463

464+
def hfst_kalmyk(dedoc_output):
    # Thin wrapper: run the HFST parser with the Kalmyk ('xal') transducer.
    return hfst_parser(dedoc_output, 'xal')

server-requirements-1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ graphene==2.0.1
3333
graphql-core==2.0
3434
graphql-relay==0.4.5
3535
gunicorn==19.7.1
36+
hfst_dev==3.15.0.10b0
3637
imagesize==1.1.0
3738
iso8601==0.1.11
3839
jdcal==1.4.1

0 commit comments

Comments
 (0)