Skip to content

Commit

Permalink
Work with urj-Cyrl generation
Browse files Browse the repository at this point in the history
and filtering
  • Loading branch information
rueter committed Aug 28, 2024
1 parent d214d90 commit 1a1c1ec
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 18 deletions.
9 changes: 7 additions & 2 deletions src/fst/morphology/Makefile.modifications-local.am
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,14 @@

if HAVE_SHARED_URJ_CYRL
generated_files/urj-Cyrl-$(GLANG)-propernouns.lexc: \
$(gt_SHARED_urj_Cyrl)/src/fst/stems/urj-Cyrl-propernouns.lexc
$(gt_SHARED_urj_Cyrl)/src/fst/stems/urj-Cyrl-propernouns.lexc \
$(top_srcdir)/src/fst/scripts/urj-$(GTLANG)-conversion.pl
$(AM_V_at)$(MKDIR_P) generated_files
$(AM_V_GEN)cp -f $< $@
$(AM_V_GEN)rm -f $@ && \
echo "" >> $@ && \
echo "! <--- Dump from URJ-CYRL -->" >> $@ && \
echo "" >> $@ && \
$(top_srcdir)/src/fst/scripts/urj-$(GTLANG)-conversion.pl $< >> $@
else
generated_files/mul-$(GLANG)-%.lexc:
$(AM_V_at)$(MKDIR_P) generated_files
Expand Down
32 changes: 16 additions & 16 deletions src/fst/morphology/affixes/propernouns.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ LEXICON CYRL-A_SUR !!= * @CODE@
! +Sem/Sur: N-ava_01 ;
! +Sem/Sur: ENDLEX ;

LEXICON PROP_KIT_SUR !!= * @CODE@
LEXICON PROP_KAL_SUR !!= * @CODE@
+Sem/Sur: N-ava_01 ;
+Sem/Sur: ENDLEX ;

Expand All @@ -177,9 +177,9 @@ LEXICON PROP_LAK_MAL !!= * @CODE@
+Sem/Mal: N-ava_01 ;
+Sem/Mal: ENDLEX ;

LEXICON PROP_KIT_MAL !!= * @CODE@
+Sem/Mal: N-ava_01 ;
+Sem/Mal: ENDLEX ;
!LEXICON PROP_KIT_MAL !!= * @CODE@
! +Sem/Mal: N-ava_01 ;
! +Sem/Mal: ENDLEX ;

LEXICON PROP_OSH_MAL !!= * @CODE@
+Sem/Mal: N-ava_01 ;
Expand Down Expand Up @@ -214,9 +214,9 @@ LEXICON PROP_KAL_FEM !!= * @CODE@
!LEXICON PROP_LAK_FEM
! +Sem/Fem: N-ava_01 ;

LEXICON PROP_KIT_FEM !!= * @CODE@
+Sem/Fem: N-ava_01 ;
+Sem/Fem: ENDLEX ;
!LEXICON PROP_KIT_FEM !!= * @CODE@
! +Sem/Fem: N-ava_01 ;
! +Sem/Fem: ENDLEX ;

LEXICON PROP_OSH_FEM !!= * @CODE@
+Sem/Fem: N-ava_01 ;
Expand All @@ -242,20 +242,20 @@ LEXICON LEXC_PROP_KUDO_FEM !!= * @CODE@

!! PLACE NAMES FROM TEMPLATE

LEXICON PROP-PLC_KAL !!= * @CODE@
+Sem/Plc: N-ava_01 ;
+Sem/Plc: ENDLEX ;
!LEXICON PROP-PLC_KAL !!= * @CODE@
! +Sem/Plc: N-ava_01 ;
! +Sem/Plc: ENDLEX ;

LEXICON PROP-PLC_KIT !!= * @CODE@
+Sem/Plc: N-ava_01 ;
+Sem/Plc: ENDLEX ;
!LEXICON PROP-PLC_KIT !!= * @CODE@
! +Sem/Plc: N-ava_01 ;
! +Sem/Plc: ENDLEX ;

!LEXICON PROP-PLC_OSH
! +Sem/Plc: N-ava_01 ;

LEXICON PROP-PLC_KUDO !!= * @CODE@
+Sem/Plc: N-ava_01 ;
+Sem/Plc: ENDLEX ;
!LEXICON PROP-PLC_KUDO !!= * @CODE@
! +Sem/Plc: N-ava_01 ;
! +Sem/Plc: ENDLEX ;



Expand Down
45 changes: 45 additions & 0 deletions src/fst/scripts/urj-mhr-conversion.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/perl -w
#
# urj-mhr-conversion.pl
# Convert names in the URJ propernoun lexicon to mhr.

use strict;
use utf8;
use feature 'unicode_strings';
# Compile-time I/O setup: this block is executed as soon as it has been
# parsed, before the main filter loop is run.
BEGIN {
# Enable autoflush on the currently selected output handle.
$| = 1;
# Decode everything read from STDIN as UTF-8.
binmode(STDIN, ':encoding(UTF-8)');
# Encode everything written to STDOUT as UTF-8.
binmode(STDOUT, ':encoding(UTF-8)');
# NOTE(review): the `use open qw( :encoding(UTF-8) :std )` line that follows
# this block also targets the standard handles -- confirm that both are
# needed and that the encoding layer is not being applied twice.
}
use open qw( :encoding(UTF-8) :std );

# Ordered rewrite rules, each given as [ pattern, literal replacement ].
# Every rule is applied globally to each input line, in the order listed.
my @rewrite_rules = (
    # Continuation lexicon substitutions:
    # (disabled) [ qr/ C-FI-NEN/, 'nen LONDON' ],
    [ qr/ PROP_KAL_PLC/,  ' PROP-PLC_' ],
    [ qr/ PROP_KUDO_PLC/, ' PROP-PLC_' ],
    [ qr/ PROP_OSH_PLC/,  ' PROP-PLC_' ],
    [ qr/ PROP-PLC_KEL1/, ' PROP-PLC_' ],
    [ qr/ PROP-PLC_VELE/, ' PROP-PLC_' ],
    [ qr/ PROP-PLC_KUDO/, ' PROP-PLC_' ],
    [ qr/ PROP_RUS_JA/,   ' PROP-PLC_' ],
    [ qr/ PROP_KUDO /,    ' PROP_ ' ],
    [ qr/ PROP_KAL /,     ' PROP_ ' ],
    [ qr/ PROP_OSH /,     ' PROP_ ' ],

    # Loanwords with a compound border over identical vowels.
    [ qr/Hjarteelva/,     'Hjarte-elva' ],

    # Names with Inari Saami inflection: prefix the entry with "!".
    [ qr/^Aanaar\+/,      '!Aanaar+' ],

    # sme special symbols.
    [ qr/b9/,             'b' ],
);

# Filter loop: read each line from the files named on the command line
# (or from STDIN when none are given), run it through the rule table and
# print the result to STDOUT.
while (defined(my $entry = <>)) {
    for my $rule (@rewrite_rules) {
        my ($pattern, $replacement) = @{$rule};
        $entry =~ s/$pattern/$replacement/g;
    }

    print $entry;
}

0 comments on commit 1a1c1ec

Please sign in to comment.