Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/giellalt/lang-srs
Browse files Browse the repository at this point in the history
  • Loading branch information
coxchristopher committed May 1, 2024
2 parents a692f18 + cadabe6 commit 44d5973
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 22 deletions.
4 changes: 2 additions & 2 deletions .gut/delta.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
template = "https://github.com/giellalt/template-lang-und"
rev_id = 172
template_sha = "ca311e6ba41f16538c7268df70c7048082eaadb2"
rev_id = 175
template_sha = "bf3ac2ead0081366d7a999df6f804fc6662bbe30"

[replacements]
__REPO__ = "lang-srs"
Expand Down
6 changes: 6 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ banner:
# 'nothing to be done for test'. By forwarding test to check we work around it.
test: check

# Run the 'dev' target in each of the tool subdirectories listed below.
# 'dev' names a task rather than a file, so it is declared phony; otherwise a
# file called 'dev' in this directory would stop the recipe from running.
.PHONY: dev
dev:
	$(MAKE) dev -C tools/tokenisers
	$(MAKE) dev -C tools/grammarcheckers
	$(MAKE) dev -C tools/tts

# Remove html tables created by some of the developer tools:
clean-local:
rm -f *.html
31 changes: 14 additions & 17 deletions m4/giella-macros.m4
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ AC_MSG_RESULT([$GIELLA_CORE])
###############################################################
### This is the version of the Giella Core that we require. ###
### UPDATE AS NEEDED.
_giella_core_min_version=0.22.0
_giella_core_min_version=0.23.0
# GIELLA_CORE/GTCORE env. variable, required by the infrastructure to find scripts:
AC_ARG_VAR([GIELLA_CORE], [directory for the Giella infra core scripts and other required resources])
Expand Down Expand Up @@ -162,21 +162,7 @@ AS_IF([test "x$enable_yamltests" = "xcheck"],
AM_CONDITIONAL([CAN_YAML_TEST], [test "x$enable_yamltests" != xno])
################ LXML or pip ################
AS_IF([test "x$enable_grammarchecker" != "xno"],
[AM_PATH_PYTHON([3.5],, [:])
AX_PYTHON_MODULE(lxml)
AX_PYTHON_MODULE(pip)
AC_MSG_CHECKING([whether we can use lxml])
AS_IF([test "x$HAVE_PYMOD_LXML" != "xyes"],
AS_IF([test "x$HAVE_PYMOD_PIP" != "xno"],
AC_MSG_RESULT(no)
AC_MSG_WARN([lxml or pip is needed for grammarcheckers]),
AC_MSG_RESULT([no but using pip])),
AC_MSG_RESULT(yes))])
AM_CONDITIONAL([CAN_LXML], [test "x$HAVE_PYMOD_LXML" != xno])
AM_CONDITIONAL([CAN_PIP], [test "x$HAVE_PYMOD_LXML" != xno])
################ Generated documentation ################
# Check for awk with required feature:
AC_CACHE_CHECK([for awk that supports gensub], [ac_cv_path_GAWK],
Expand Down Expand Up @@ -676,6 +662,18 @@ AS_IF([test "x$enable_grammarchecker" = "xyes" -a "x$gt_prog_vislcg3" = "xno"],
AS_IF([test "x$enable_ci" = "xyes" -a "x$enableval" = "x"], [enable_grammarchecker=no])
AM_CONDITIONAL([WANT_GRAMCHECK], [test "x$enable_grammarchecker" != xno])
enableval=''
################ gtgramtool for grammarchecking ################
# Look for gtgramtool on PATH. GTGRAMTOOL holds the full path when found and
# the literal word false otherwise.
AC_PATH_PROG([GTGRAMTOOL], [gtgramtool], [false])
# gtgramtool is only mandatory when the grammar checker is enabled. Each
# branch of the inner test reports its own result so that the checking line
# is always completed and nothing is printed when the check is not run.
AS_IF([test "x$enable_grammarchecker" != "xno"],
      [AX_PYTHON_MODULE(pip)
       AC_MSG_CHECKING([whether we have gtgramtool])
       AS_IF([test "x$GTGRAMTOOL" = "xfalse"],
             [AC_MSG_RESULT([no])
              AC_MSG_ERROR([gtgramtool is needed for --enable grammarchecker.
on debian/ubuntu: sudo apt update; sudo apt install pipx; pipx ensurepath
on macbrew: brew install pipx; pipx ensurepath
then: pipx install git+https://github.com/divvun/giellaltgramtools
])],
             [AC_MSG_RESULT([yes])])])
# Enable all spellers - default is 'no'
AC_ARG_ENABLE([spellers],
Expand Down Expand Up @@ -1048,7 +1046,6 @@ cd ..
git clone [email protected]:giellalt/$gt_SHARED_FAILS
cd $gt_SHARED_FAILS
./autogen.sh && ./configure && make])])
AC_MSG_WARN([January 2024: the lexc files and fsts have been moved up to src/fst/morphology])
]) # gt_PRINT_FOOTER
# vim: set ft=config:
160 changes: 160 additions & 0 deletions src/fst/morphology/incoming/srs-bound-demo.xfscript
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Demo script on morpheme boundary marking

# Compile each lexc source and save the resulting network under its own name,
# so that the pieces can be combined in the regular expressions further down.
read lexc ../../morphology/stems/verb_stems.lexc
define Stems

read lexc ../../morphology/affixes/verb_inner_affixes.lexc
define InnerAffixes

read lexc ../../morphology/affixes/verb_middle_affixes.lexc
define MiddleAffixes

read lexc ../../morphology/affixes/verb_outer_affixes.lexc
define OuterAffixes

read lexc ../../morphology/affixes/verb_oblique_affixes.lexc
define ObliqueAffixes

read lexc ../../morphology/affixes/postverbal_affixes.lexc
define PostverbalAffixes

# Mark prefix type (insert corresponding flags to the
# original prefixes we know to be present from the lexical entry).
# ab=cd_ef.gh => ab@P.PREFIX.OUTER@=cd@P.PREFIX.MIDDLE@_ef@P.PREFIX.INNER@.gh

define MarkPrefixes ~$["@P.PREFIX.OUTER@"|"@P.PREFIX.INNER@"|"@P.PREFIX.MIDDLE@"] .o.
"=" -> "@P.PREFIX.OUTER@" "=" ,
"_" -> "@P.PREFIX.MIDDLE@" "_" ,
"." -> "@P.PREFIX.INNER@" ".";

# Insert . (inner), _ (middle), and = (outer) if missing in the intermediate rep:
# tsiy > =_.tsiy
# ts'á=zíd > ts'á=_.zíd
# tsí=di.tł'á > tsí=_di.tł'á
# gu.blah > =_gu.blah
# gu_blah > =gu_.blah

# (1) No . > insert . after last marker (= or _), or in the beginning if none exists
# (2) No _ > insert _ (a) after =, if one exists, or (b) beginning
# (3) No = > insert at beginning

define InsInner [..] -> "." || "_" _ ~$"." .#. .o.
[..] -> "." || "=" _ ~$"." .#. .o.
[..] -> "." || .#. _ ~$"." .#. ;

define InsMiddle [..] -> "_" || "=" _ ~$["_"] .#. .o.
[..] -> "_" || .#. _ ~$["_"] .#. ;

define InsOuter [..] -> "=" || .#. _ ~$["="] .#. ;

# Some lexical entries include a "^L" immediately before the inner-prefix
# boundary marker "." (e.g., xá=_^L.ʔò "take it [solid object] out", or
# ta=_di^L.ʔò "lift/pick/hold it [solid object] up"), with or without
# any other inner prefixes present. In these (rare) cases, Tsuut'ina uses
# middle prefix TAMA allomorphs, rather than outer or inner prefix forms, e.g.
#
# 2SG xáaʔò "you (sg.) will carry it (solid object) past"
# (not *xaniʔò or something similar if 0-IPFV outer or no
# preceding prefix allomorphs were used)
# 1PL xáasaàʔò "we will carry it (solid object) past"
# (not *xaàʔò or something similar if 0-IPFV inner prefix
# allomorphs were used)
#
# 2SG tadiʔò "you (sg.) will lift it (solid object) up"
# (not *tadiniʔò if 0-IPFV outer or no preceding prefix
# allomorphs were used)
# 1PL tadìsaàʔò "we will lift it (solid object) up"
# (not *tadaàʔò or something similar if 0-IPFV inner
# prefix allomorphs were used)
#
# This rule therefore turns @P.PREFIX.INNER@ into @P.PREFIX.MIDDLE@ to ensure
# that middle-prefix allomorphs are used, then sets an additional flag
# (@P.LOWTONE.ON@) to help differentiate between this situation and all other
# middle-prefix contexts (for use in 'affixes/verb_inner_affixes.lexc').
define LInnerPrefixAllomorphs "@P.PREFIX.INNER@" ->
"@P.PREFIX.MIDDLE@" "@P.LOWTONE.ON@" || "^L" _ ;

# Some middle and inner prefixes (e.g., middle íH- conative/half-transitive,
# inner ná-) appear with outer prefix TAMA chunk allomorphs. In order to
# get this right, we append the symbol "^O" after the vowel in the lexical
# entry (e.g., í^H^O-, ná^O-), then turn that into a flag that requires outer
# prefix allomorphs.
define RequireOuterAllomorphs "^O" "@P.PREFIX.INNER@" -> "@P.PREFIX.OUTER@" .o.
"^O" "@P.PREFIX.MIDDLE@" -> "@P.PREFIX.OUTER@" .o.
"^O" -> "@P.PREFIX.OUTER@";

# We temporarily keep the boundary symbol for inner affixes ("."), middle
# affixes ("_"), and outer affixes ("=") in place so that we can target
# morphophonology in each position more easily (especially when aiming to drop
# the "weak" /i/ vowels that appear in inner lexical prefixes).
read regex [Stems PostverbalAffixes] .o.
MarkPrefixes .o.
InsInner .o. InsMiddle .o. InsOuter .o.
LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o.
"." -> "." "@P.BOUND.INN-L@" InnerAffixes "@P.BOUND.INN-R@" ,
"_" -> "_" "@P.BOUND.MID-L@" MiddleAffixes "@P.BOUND.MID-R@" ,
"=" -> "=" "@P.BOUND.OUT-L@" OuterAffixes "@P.BOUND.OUT-R@" ;
define WordForms;

# Rewrite rule for resurrecting the prefix boundary markers: every
# @P.BOUND.*-L@ / @P.BOUND.*-R@ flag symbol is replaced by a visible bracket,
# ( ) for inner, [ ] for middle and < > for outer.

define ShowBoundaries [ "@P.BOUND.INN-L@" -> "(" ,
"@P.BOUND.MID-L@" -> "[" ,
"@P.BOUND.OUT-L@" -> "<" ,
"@P.BOUND.INN-R@" -> ")" ,
"@P.BOUND.MID-R@" -> "]" ,
"@P.BOUND.OUT-R@" -> ">"
];

# define SimplifyBoundaries [ [ "<" "<" -> "<" , ">" ">" -> ">" , "[" "[" -> "[" , "]" "]" -> "]" , "(" "(" -> "(" , ")" ")" -> ")" ]
# ];

# define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ]
# .o. "=" "_" "." -> "="
# .o. "_" "." -> "_"
# .o. "=" "_" -> "="
# .o. [ "=" | "_" | "." ] -> 0 || .#. _
# ];

# Concatenate ObliqueAffixes and the other inflectional FSTs.
read regex WordForms .o. [..] -> ObliqueAffixes || .#. _;

twosided flag-diacritics
define Grammar;

# Morphophonology

source ../../morphology/phonology.xfscript
define MorphoPhonology;

read lexc ../../morphology/affixes/verb_tags.lexc
define Tags;

# To prevent morphophonology from being tripped up by intervening flags
set flag-is-epsilon ON

# regex Grammar Tags;

# Tentative code for dealing with Morphophonology as read from a separate
# file. Though we would want to figure out a way to compose each rewrite rule
# with the morphological component one-by-one for faster compilation.

regex [Grammar Tags] .o. MorphoPhonology ;
define VerbModel

# regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing;

# Make flags visible, so that they can be converted to explicit boundary markers
set flag-is-epsilon OFF

# Output boundary markers based on flags

regex VerbModel .o. ShowBoundaries ;
# define VerbModelWithBound

# Make flags invisible again, so that they will not intervene in removing excessive boundary markers
set flag-is-epsilon ON

# regex VerbModelWithBound .o. SimplifyBoundaries ;

twosided flag-diacritics
33 changes: 30 additions & 3 deletions src/fst/morphology/verb_lexicon.xfscript.in
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,25 @@ read regex [Stems PostverbalAffixes] .o.
MarkPrefixes .o.
InsInner .o. InsMiddle .o. InsOuter .o.
LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o.
"." -> "." InnerAffixes ,
"_" -> "_" MiddleAffixes ,
"=" -> "=" OuterAffixes;
"." -> "." InnerAffixes "@P.PREFIX.INNER@" ,
"_" -> "_" MiddleAffixes "@P.PREFIX.MIDDLE@" ,
"=" -> "=" OuterAffixes "@P.PREFIX.OUTER@" ;
define WordForms;

# Rewrite rule for resurrecting the prefix boundary markers: each
# @P.PREFIX.*@ flag symbol is replaced by the boundary character of its
# position, "." for inner, "_" for middle and "=" for outer.

define ShowBoundaries [ "@P.PREFIX.INNER@" -> "." ,
"@P.PREFIX.MIDDLE@" -> "_" ,
"@P.PREFIX.OUTER@" -> "="
];

# Tidy up the boundary markers. The steps are composed in this order:
# collapse a doubled marker into a single one, reduce the sequences "=_.",
# "_." and "=_" to their first marker, and finally delete a marker that
# stands at the very beginning of the string.
define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ]
.o. "=" "_" "." -> "="
.o. "_" "." -> "_"
.o. "=" "_" -> "="
.o. [ "=" | "_" | "." ] -> 0 || .#. _
];

# Concatenate ObliqueAffixes and the other inflectional FSTs.
read regex WordForms .o. [..] -> ObliqueAffixes || .#. _;

Expand All @@ -125,9 +139,22 @@ set flag-is-epsilon ON
# with the morphological component one-by-one for faster compilation.

regex [Grammar Tags] .o. MorphoPhonology ;
define VerbModel

# regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing;

# Make flags visible, so that they can be converted to explicit boundary markers
set flag-is-epsilon OFF

# Output boundary markers based on flags

regex VerbModel .o. ShowBoundaries ;
define VerbModelWithBound

# Make flags invisible again, so that they will not intervene in removing excessive boundary markers
set flag-is-epsilon ON

regex VerbModelWithBound .o. SimplifyBoundaries ;

#eliminate flag TAMA
#eliminate flag SUBJECTNUMBER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,19 @@ checked_lemmas=checked_lemmas.txt
--exclude "(CmpN/Only|ShCmp|\+Cmp\/SplitR| Rreal | R | Rnoun |\+V\+|NOT-TO-LEMMATEST)" \
$source_files > $lemmas

# Without the speller archive there is nothing to test. Stop here: if the run
# continued, the speller call would produce no report and the rejected-lemma
# check further down would find nothing to complain about.
# Exit status 77 is the Automake convention for a skipped test.
# NOTE(review): switch to a hard failure if a missing speller should break
# 'make check' instead of being skipped.
if ! test -f "$speller_dir/${GIELLA_LANG}.zhfst" ; then
    echo "missing $speller_dir/${GIELLA_LANG}.zhfst"
    exit 77
fi

####### Start testing: #######

# Feed the lemma list to the speller and keep its full report.
# $ospell is left unquoted on purpose: it may hold a command plus options.
# The path arguments are quoted so that spaces or glob characters in a
# directory name cannot split or expand them.
$ospell "$speller_dir/${GIELLA_LANG}.zhfst" < "$lemmas" > "$checked_lemmas"

# Collect the report lines for lemmas that the speller did not accept.
grep 'is NOT in the lexicon' "$checked_lemmas" > "$rejected_lemmas"

# A non-empty reject list fails the test; print the first entries as a hint.
if [ -s "$rejected_lemmas" ] ; then
    head "$rejected_lemmas"
    echo "see $rejected_lemmas for more"
    exit 1
fi

Expand Down

0 comments on commit 44d5973

Please sign in to comment.