 from uniparser_moksha import MokshaAnalyzer
 from uniparser_komi_zyrian import KomiZyrianAnalyzer
 from nltk.tokenize import RegexpTokenizer
+from hfst_dev import HfstTransducer
+from lxml.html import fromstring
 import csv
 import os
 import tempfile
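
The two new imports serve the HFST branch added further down: hfst_dev supplies the transducer object, and lxml.html strips dedoc's markup before tokenisation. A minimal sketch of the lxml call (the sample markup is made up):

    from lxml.html import fromstring

    # text_content() drops the tags and returns only the concatenated text nodes.
    plain = fromstring("<p>word1 <b>word2</b></p>").text_content()  # -> 'word1 word2'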
@@ -24,7 +26,7 @@ def print_to_str(*args, **kwargs):
 
 
 span_id_counter = 0
-def generate_html_wrap(word, ana_tag_list, lang=""):
+def generate_html_wrap(word, ana_tag_list, lang="", extra_state=""):
 
     json_list = list()
     for ana_tag in ana_tag_list:
@@ -40,16 +42,16 @@ def generate_html_wrap(word, ana_tag_list, lang=""):
 
     global span_id_counter
     span_id_counter += 1
-    wrap = "<span class=\"unverified\"" + " id=" + str(span_id_counter) + ">"
+    wrap = f"<span class=\"unverified {extra_state}\"" + " id=" + str(span_id_counter) + ">"
     for attr_json in json_list:
         span_id_counter += 1
         encoded_attrs = ((json.dumps(attr_json, ensure_ascii=False)).encode('utf8')).decode()
-        wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"
+        wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs + "</span>"
 
         if lang == 'udm' and 'nom' in encoded_attrs:
             flag = True
             span_id_counter += 1
-            wrap += "<span class=\"result\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"
+            wrap += f"<span class=\"result {extra_state}\"" + " id=" + str(span_id_counter) + ">" + encoded_attrs.replace('nom', 'acc0') + "</span>"
 
     wrap += word + "</span>"
     return wrap
@@ -68,8 +70,9 @@ def insert_parser_output_to_text(text, parser_output, lang=""):
         if text[match_index - len(ESC_PAT):match_index] == ESC_PAT and text[match_index + len(word):match_index + len(word) + len(ESC_PAT)] == ESC_PAT:
             continue
         result_list.append(text[search_start_index:match_index])
-        if (len(w_tag.contents) > 1):
-            result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang))
+        if len(w_tag.contents) > 1:
+            extra_state = "broken" if any([a.get('gr') == "Unknown" for a in w_tag.find_all('ana')]) else ""
+            result_list.append(generate_html_wrap(word, w_tag.contents[0:-1], lang=lang, extra_state=extra_state))
         search_start_index = match_index + len(word)
     result_list.append(text[search_start_index:])
     result = "".join(result_list)
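
A minimal sketch of the new "broken" flag, assuming the <w> tags are BeautifulSoup elements (which the contents/find_all calls above suggest); the sample analysis is made up:

    from bs4 import BeautifulSoup

    w_tag = BeautifulSoup(
        '<w><ana lex="" gr="Unknown" parts="" gloss="" trans_ru="Unknown"></ana>word1</w>',
        'html.parser').w
    # Any <ana> whose gr attribute is "Unknown" marks the whole word as broken, so
    # generate_html_wrap() emits class="unverified broken" / class="result broken" spans for it.
    extra_state = "broken" if any(a.get('gr') == "Unknown" for a in w_tag.find_all('ana')) else ""
    print(extra_state)  # -> broken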
@@ -358,6 +361,72 @@ def trans(elem):
 
     return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)
 
+def hfst_parser(dedoc_output, lang, debug_flag=False):
+
+    if debug_flag:
+        with open("dedoc_output", 'w') as f:
+            print(dedoc_output, file=f)
+
+    parser_path = f"/opt/hfst/{lang}"
+
+    with open(f"{parser_path}/lexicon.lexc", 'r') as f:
+        lexicon = f.read()
+
+    xfst = HfstTransducer.read_from_file(f"{parser_path}/rules.xfst.hfst")
+    xfst.invert()
+
+    sent_regex = re.compile(r'[.|!|?|...]')
+    word_regex = re.compile(r'[,| |:|"|-|*]')
+
+    words = 0
+    analyzed = 0
+    parser_list = []
+
+    # remove html tags from dedoc_output
+    text = fromstring(dedoc_output).text_content()
+    sentences = filter(lambda t: t, [t.strip() for t in sent_regex.split(text)])
+    for s in sentences:
+        wordlist = filter(lambda t: t, [t.strip() for t in word_regex.split(s)])
+        for w in wordlist:
+            words = words + 1
+            lookup = xfst.lookup(w)
+            if len(lookup) == 0:
+                lookup = xfst.lookup(w.lower())
+            if len(lookup) > 0:
+                analyzed = analyzed + 1
+                section = "'<w>"
+                for lkp in map(lambda l: l[0], lookup):
+
+                    if '+' in lkp:
+                        plus_pos = lkp.index('+')
+                        lex = lkp[:plus_pos]
+                        gr = lkp[plus_pos + 1:].replace('+', ',')
+                    else:
+                        lex = lkp
+                        gr = "Unknown"
+
+                    # Get translation
+                    if ((xln := re.search(f"[\r\n]{lex}:{w}.*!([^0].*)[\r\n]", lexicon)) or
+                            (xln := re.search(f"[\r\n]{lex}:{w.lower()}.*!([^0].*)[\r\n]", lexicon)) or
+                            (xln := re.search(f"[\r\n]{lex}:.*!([^0].*)[\r\n]", lexicon))):
+                        xln = xln.group(1)
+                    else:
+                        xln = "Unknown"
+
+                    section += f'<ana lex={lex} gr={gr} parts="" gloss="" trans_ru={xln}></ana>'
+                section += f"{w}</w>'"
+                parser_list.append(section)
+            else:
+                parser_list.append(f'\'<w><ana lex="" gr="" parts="" gloss=""></ana>{w}</w>\'')
+
+    parser_output = ", ".join(parser_list)
+
+    if debug_flag:
+        with open("parser_output", 'w') as f:
+            print(parser_output, file=f)
+        print(f"Analyzed per word: {analyzed / words}")
+
+    return insert_parser_output_to_text(dedoc_output, parser_output, lang=lang)
 
 def timarkh_udm(dedoc_output):
     return timarkh_uniparser(dedoc_output, 'udm')
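
For reference, a minimal sketch (not part of the patch) of the pseudo-XML entries hfst_parser assembles before handing them to insert_parser_output_to_text; the word and the analyser tags are hypothetical:

    # Assumed analyser output format: 'lemma+TAG+TAG...'; the real tags depend on the lexc/xfst sources.
    lkp, w = "ger+N+Sg+Nom", "ger"
    plus_pos = lkp.index('+')
    lex = lkp[:plus_pos]                       # 'ger'
    gr = lkp[plus_pos + 1:].replace('+', ',')  # 'N,Sg,Nom'
    entry = f"'<w><ana lex={lex} gr={gr} parts=\"\" gloss=\"\" trans_ru=Unknown></ana>{w}</w>'"
    # -> '<w><ana lex=ger gr=N,Sg,Nom parts="" gloss="" trans_ru=Unknown></ana>ger</w>'
    # Words with no analysis are appended as '<w><ana lex="" gr="" parts="" gloss=""></ana>word</w>' instead.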
@@ -392,3 +461,5 @@ def apertium_bak(dedoc_output, apertium_path):
 def apertium_rus(dedoc_output, apertium_path):
     return apertium_parser(dedoc_output, apertium_path, 'rus')
 
+def hfst_kalmyk(dedoc_output):
+    return hfst_parser(dedoc_output, 'xal')
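
A hedged usage sketch of the new wrapper, assuming a Kalmyk lexicon.lexc and rules.xfst.hfst are installed under /opt/hfst/xal as parser_path expects; dedoc_html stands in for the HTML produced earlier in the pipeline:

    # Hypothetical call site: annotate a Kalmyk document that dedoc has already converted to HTML.
    annotated_html = hfst_kalmyk(dedoc_html)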