Skip to content

Commit

Permalink
Add urj-mdf-conversion.pl
Browse files Browse the repository at this point in the history
Still looking for how to get it to work.
I was unable to draw any conclusions from lang-mns
  • Loading branch information
rueter committed May 25, 2023
1 parent 0fe0e87 commit 9490db1
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 3 deletions.
12 changes: 9 additions & 3 deletions src/fst/affixes/propernouns.lexc
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,16 @@ LEXICON PROP-VOC_KIAJ

LEXICON PROP_KAL
: N_KAL ;

LEXICON PROP_KIAL1
: N_KIAL1 ;

!2023-05-25 these will be removed once src/scripts/urj-mdf-conversion.pl works
! Temporary pass-through lexicons: each has a single entry that adds no
! tag or surface string and only names a continuation lexicon.
! The targets mirror the substitutions in the conversion script
! (PROP-PLC_KEL1 -> N_KIAL1, PROP-PLC_VELE -> N_TISHE).
LEXICON PROP-PLC_KEL1
N_KIAL1 ;
LEXICON PROP-PLC_VELE
N_TISHE ;


LEXICON PROP_OSH
: N_OSH ;

Expand Down Expand Up @@ -203,8 +209,8 @@ LEXICON PROP_KUDO_FEM
LEXICON PROP-PLC_KAL !!≈ * **@CODE@** ending in other consonant
+Sem/Plc: NMN_KAL ;

LEXICON PROP-PLC_KIAL1
+Sem/Plc: N_KIAL1 ;
!LEXICON PROP-PLC_KIAL1
!+Sem/Plc: N_KIAL1 ;

LEXICON PROP-PLC_KIT !!≈ * **@CODE@** ending in t
+Sem/Plc: NMN_KAL ;
Expand Down
38 changes: 38 additions & 0 deletions src/scripts/urj-mdf-conversion.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/perl -w
#
# urj-mdf-conversion.pl
# Convert names in the URJ propernoun lexicon to mdf.

use strict;
use utf8;
use feature 'unicode_strings';
# Compile-time I/O setup: a BEGIN block runs as soon as it has been parsed,
# i.e. before the main loop of the script starts executing.
BEGIN {
# Enable autoflush on the currently selected output handle (STDOUT by default).
$| = 1;
# Decode input read from STDIN, and encode output written to STDOUT, as UTF-8.
binmode(STDIN, ':encoding(UTF-8)');
binmode(STDOUT, ':encoding(UTF-8)');
}
use open qw( :encoding(UTF-8) :std );

# Filter loop: read each line from the files named on the command line
# (or from STDIN when none are given), apply the substitutions below in
# order, and write the resulting line to STDOUT.
while (defined(my $entry = <>)) {
    # Continuation lexicon substitutions:
    # (unused example) s/ C-FI-NEN/nen LONDON/g ;
    $entry =~ s/ PROP-PLC_KEL1/ N_KIAL1/g;
    $entry =~ s/ PROP-PLC_VELE/ N_TISHE/g;
    $entry =~ s/ PROP_VELE/ N_TISHE/g;

    # Loanwords with a compound border between identical vowels:
    # insert a hyphen at the border.
    $entry =~ s/Hjarteelva/Hjarte-elva/g;

    # Names with Inari Saami inflection: put "!" in front of a line
    # that starts with "Aanaar+".
    $entry =~ s/^Aanaar\+/!Aanaar+/g;

    # sme special symbols: reduce "b9" to plain "b".
    $entry =~ s/b9/b/g;

    print $entry;
}

77 changes: 77 additions & 0 deletions src/scripts/urj-mdf-conversion.pl~
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/perl -w
#
# urj-mns-conversion.pl
# Convert names in the URJ propernoun lexicon to mns.

use strict;
use utf8;
use feature 'unicode_strings';
# Compile-time I/O setup: a BEGIN block runs as soon as it has been parsed,
# i.e. before the main loop of the script starts executing.
BEGIN {
# Enable autoflush on the currently selected output handle (STDOUT by default).
$| = 1;
# Decode input read from STDIN, and encode output written to STDOUT, as UTF-8.
binmode(STDIN, ':encoding(UTF-8)');
binmode(STDOUT, ':encoding(UTF-8)');
}
use open qw( :encoding(UTF-8) :std );

# Filter loop: read each line from the files named on the command line
# (or from STDIN when none are given), apply the substitutions below in
# order, and write the resulting line to STDOUT.
#
# Order matters: where a rule with a captured preceding letter is followed
# by a general rule for the same lexicon name, the specific rule must run
# first, because the general rule would otherwise consume the match.
while (defined(my $entry = <>)) {
    # Continuation lexicon substitutions:
    # (unused example) s/ C-FI-NEN/nen LONDON/g ;
    $entry =~ s/ PROP_KIT_FEM/ N_VCH/g;
    $entry =~ s/ PROP_LAK_MAL/ N_VCH/g;
    $entry =~ s/ PropNameMaleDer-Y-0Evich/ N_VCS/g;
    $entry =~ s/ PROP_OSH_SUR/ N_VCH/g;
    $entry =~ s/ Deriv-RUS-AN_SURMAL/ N_VCH/g;
    $entry =~ s/ CYRL-L_SUR/ N_VCH/g;
    $entry =~ s/ Deriv-RUS-IJ_SURMAL/ий N_VCS/g;
    $entry =~ s/ Deriv-RUS-YJ_SURMAL/ый N_VCS/g;
    $entry =~ s/ Deriv-RUS-KIJ_SURMAL/ий N_VCS/g;
    $entry =~ s/ Deriv-RUS-OJ_SURMAL/ой N_VCS/g;
    $entry =~ s/ CYRL-K_SUR/ N_VCH/g;
    $entry =~ s/ CYRL-SIBILANT_SUR/ N_VCH/g;
    $entry =~ s/ Deriv-RUS-IN_SURMAL/ N_VCH/g;
    $entry =~ s/ Deriv-RUS-V_SURMAL/ N_VCH/g;
    $entry =~ s/ CYRL-T_SUR/ N_VCH/g;
    $entry =~ s/ CYRL-CONS_SUR/ N_VCH/g;
    $entry =~ s/(ы|и) CYRL-VOW_SUR/$1 N_VI/g;    # ordering 1: before the general CYRL-VOW_SUR rule
    $entry =~ s/ CYRL-VOW_SUR/ N_VO/g;
    $entry =~ s/ CYRL-A_SUR/ N_VO/g;
    $entry =~ s/ PROP_OSH_MAL/ N_VCH/g;
    $entry =~ s/ PROP_KIT_MAL/ N_VCH/g;
    $entry =~ s/ PROP_KUDO_PATRFEM/ N_VO/g;
    $entry =~ s/ PROP-PLC_KEL1/ N_VCS/g;
    $entry =~ s/ PROP-PLC_VELE/ N_VO/g;
    $entry =~ s/ PROP_VELE/ N_VO/g;
    $entry =~ s/ PROP_KUDO_MAL/ N_VO/g;
    $entry =~ s/ PropNameMaleDer-Ovich/ N_VCS/g;
    $entry =~ s/ PROP_OSH_PATRMAL/ N_VCH/g;
    $entry =~ s/ PropNameMaleDer-IJ-Y0Evich/ий N_VCS/g;
    $entry =~ s/ PropNameMaleDer-J-0Evich/й N_VCS/g;
    $entry =~ s/ PropNameMaleDer-IJ-I0Evich/ий N_VCS/g;
    $entry =~ s/ PropNameMaleDer-I-YEvich/и N_VI/g;
    $entry =~ s/(ь|й) PROP_KAL_MAL/$1 N_VCS/g;
    $entry =~ s/ PROP_KAL_MAL/ N_VCH/g;
    $entry =~ s/(ь|й) PROP_KAL_FEM/$1 N_VCS/g;
    $entry =~ s/ PROP_KAL_FEM/ N_VCH/g;
    $entry =~ s/ PROP_OSH_FEM/ N_VCH/g;
    $entry =~ s/ PROP_KUDO_FEM/ N_VO/g;
    $entry =~ s/ PROP-PLC_KIT/ N_VCH/g;
    $entry =~ s/(ь|й) PROP-PLC_KAL/$1 N_VCS/g;
    $entry =~ s/ PROP-PLC_KAL/ N_VCH/g;
    $entry =~ s/ PROP-PLC_KUDO/ N_VO/g;
    $entry =~ s/ PROP_KIT_SUR/ N_VCH/g;

    # Loanwords with a compound border between identical vowels:
    # insert a hyphen at the border.
    $entry =~ s/Hjarteelva/Hjarte-elva/g;

    # Names with Inari Saami inflection: put "!" in front of a line
    # that starts with "Aanaar+".
    $entry =~ s/^Aanaar\+/!Aanaar+/g;

    # sme special symbols: reduce "b9" to plain "b".
    $entry =~ s/b9/b/g;

    print $entry;
}

0 comments on commit 9490db1

Please sign in to comment.