diff --git a/.gut/delta.toml b/.gut/delta.toml index 14f3690b..603073f7 100644 --- a/.gut/delta.toml +++ b/.gut/delta.toml @@ -1,6 +1,6 @@ template = "https://github.com/giellalt/template-lang-und" -rev_id = 172 -template_sha = "ca311e6ba41f16538c7268df70c7048082eaadb2" +rev_id = 175 +template_sha = "bf3ac2ead0081366d7a999df6f804fc6662bbe30" [replacements] __REPO__ = "lang-srs" diff --git a/Makefile.am b/Makefile.am index 64ee2342..1cb40b3f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -20,6 +20,12 @@ banner: # 'nothing to be done for test'. By forwarding test to check we work around it. test: check +# recurse all make devs just +dev: + $(MAKE) dev -C tools/tokenisers + $(MAKE) dev -C tools/grammarcheckers + $(MAKE) dev -C tools/tts + # Remove html tables created by some of the developer tools: clean-local: rm -f *.html diff --git a/m4/giella-macros.m4 b/m4/giella-macros.m4 index f740727b..4cec2dba 100644 --- a/m4/giella-macros.m4 +++ b/m4/giella-macros.m4 @@ -88,7 +88,7 @@ AC_MSG_RESULT([$GIELLA_CORE]) ############################################################### ### This is the version of the Giella Core that we require. ### ### UPDATE AS NEEDED. -_giella_core_min_version=0.22.0 +_giella_core_min_version=0.23.0 # GIELLA_CORE/GTCORE env. 
variable, required by the infrastructure to find scripts: AC_ARG_VAR([GIELLA_CORE], [directory for the Giella infra core scripts and other required resources]) @@ -162,21 +162,7 @@ AS_IF([test "x$enable_yamltests" = "xcheck"], AM_CONDITIONAL([CAN_YAML_TEST], [test "x$enable_yamltests" != xno]) -################ LXML or pip ################ -AS_IF([test "x$enable_grammarchecker" != "xno"], - [AM_PATH_PYTHON([3.5],, [:]) - AX_PYTHON_MODULE(lxml) - AX_PYTHON_MODULE(pip) - AC_MSG_CHECKING([whether we can use lxml]) - AS_IF([test "x$HAVE_PYMOD_LXML" != "xyes"], - AS_IF([test "x$HAVE_PYMOD_PIP" != "xno"], - AC_MSG_RESULT(no) - AC_MSG_WARN([lxml or pip is needed for grammarcheckers]), - AC_MSG_RESULT([no but using pip])), - AC_MSG_RESULT(yes))]) - -AM_CONDITIONAL([CAN_LXML], [test "x$HAVE_PYMOD_LXML" != xno]) -AM_CONDITIONAL([CAN_PIP], [test "x$HAVE_PYMOD_LXML" != xno]) + ################ Generated documentation ################ # Check for awk with required feature: AC_CACHE_CHECK([for awk that supports gensub], [ac_cv_path_GAWK], @@ -676,6 +662,19 @@ AS_IF([test "x$enable_grammarchecker" = "xyes" -a "x$gt_prog_vislcg3" = "xno"], AS_IF([test "x$enable_ci" = "xyes" -a "x$enableval" = "x"], [enable_grammarchecker=no]) AM_CONDITIONAL([WANT_GRAMCHECK], [test "x$enable_grammarchecker" != xno]) enableval='' +################ gtgramtool for grammarchecking ################ +# Only checked when the grammar checker is enabled: configure aborts if gtgramtool was not found and reports yes otherwise. +AC_PATH_PROG([GTGRAMTOOL], [gtgramtool], [false]) +AS_IF([test "x$enable_grammarchecker" != "xno"], + [AX_PYTHON_MODULE(pip) + AC_MSG_CHECKING([whether we have gtgramtool]) + AS_IF([test "x$GTGRAMTOOL" = xfalse], + [AC_MSG_ERROR([gtgramtool is needed for --enable-grammarchecker. + on debian/ubuntu: sudo apt update; sudo apt install pipx; pipx ensurepath + on macbrew: brew install pipx; pipx ensurepath + then: pipx install git+https://github.com/divvun/giellaltgramtools + ])], + [AC_MSG_RESULT(yes)])]) # Enable all spellers - default is 'no' AC_ARG_ENABLE([spellers], @@ -1048,7 +1047,6 @@ cd .. 
git clone git@github.com:giellalt/$gt_SHARED_FAILS cd $gt_SHARED_FAILS ./autogen.sh && ./configure && make])]) -AC_MSG_WARN([January 2024: the lexc files and fsts have been moved up to src/fst/morphology]) ]) # gt_PRINT_FOOTER # vim: set ft=config: diff --git a/src/fst/morphology/incoming/srs-bound-demo.xfscript b/src/fst/morphology/incoming/srs-bound-demo.xfscript new file mode 100644 index 00000000..08567b88 --- /dev/null +++ b/src/fst/morphology/incoming/srs-bound-demo.xfscript @@ -0,0 +1,160 @@ +# Demo script on morpheme boundary marking + +read lexc ../../morphology/stems/verb_stems.lexc +define Stems + +read lexc ../../morphology/affixes/verb_inner_affixes.lexc +define InnerAffixes + +read lexc ../../morphology/affixes/verb_middle_affixes.lexc +define MiddleAffixes + +read lexc ../../morphology/affixes/verb_outer_affixes.lexc +define OuterAffixes + +read lexc ../../morphology/affixes/verb_oblique_affixes.lexc +define ObliqueAffixes + +read lexc ../../morphology/affixes/postverbal_affixes.lexc +define PostverbalAffixes + +# Mark prefix type (insert corresponding flags to the +# original prefixes we know to be present from the lexical entry. +# ab=cd_ef.gh => ab@P.PREFIX.OUTER@=cd@P.PREFIX.MIDDLE@_ef@P.PREFIX.INNER@.gh + +define MarkPrefixes ~$["@P.PREFIX.OUTER@"|"@P.PREFIX.INNER@"|"@P.PREFIX.MIDDLE@"] .o. + "=" -> "@P.PREFIX.OUTER@" "=" , + "_" -> "@P.PREFIX.MIDDLE@" "_" , + "." -> "@P.PREFIX.INNER@" "."; + +# Insert . (inner), _ (middle), and = (outer) if missing in the intermediate rep: +# tsiy > =_.tsiy +# ts'á=zíd > ts'á=_.zíd +# tsí=di.tł'á > tsí=_di.tł'á +# gu.blah > =_gu.blah +# gu_blah > =gu_.blah + +# (1) No . > insert . after last marker (= or _), or in the beginning if none exists +# (2) No _ > insert _ (a) after =, if one exists, or (b) beginning +# (3) No = > insert at beginning + +define InsInner [..] -> "." || "_" _ ~$"." .#. .o. + [..] -> "." || "=" _ ~$"." .#. .o. + [..] -> "." || .#. _ ~$"." .#. ; + +define InsMiddle [..] 
-> "_" || "=" _ ~$["_"] .#. .o. + [..] -> "_" || .#. _ ~$["_"] .#. ; + +define InsOuter [..] -> "=" || .#. _ ~$["="] .#. ; + +# Some lexical entries include a "^L" immediately before the inner-prefix +# boundary marker "." (e.g., xá=_^L.ʔò "take it [solid object] out", or +# ta=_di^L.ʔò "lift/pick/hold it [solid object] up"), with or without +# any other inner prefixes present. In these (rare) cases, Tsuut'ina uses +# middle prefix TAMA allomorphs, rather than outer or inner prefix forms, e.g. +# +# 2SG xáaʔò "you (sg.) will carry it (solid object) past" +# (not *xaniʔò or something similar if 0-IPFV outer or no +# preceding prefix allomorphs were used) +# 1PL xáasaàʔò "we will carry it (solid object) past" +# (not *xaàʔò or something similar if 0-IPFV inner prefix +# allomorphs were used) +# +# 2SG tadiʔò "you (sg.) will lift it (solid object) up" +# (not *tadiniʔò if 0-IPFV outer or no preceding prefix +# allomorphs were used) +# 1PL tadìsaàʔò "we will lift it (solid object) up" +# (not *tadaàʔò or something similar if 0-IPFV inner +# prefix allomorphs were used) +# +# This rule therefore turns @P.PREFIX.INNER@ into @P.PREFIX.MIDDLE@ to ensure +# that middle-prefix allomorphs are used, then sets an additional flag (@P. +# LOWTONE.ON@) to help differentiate between this situation and all other +# middle-prefix contexts (for use in 'affixes/verb_inner_affixes.lexc'). +define LInnerPrefixAllomorphs "@P.PREFIX.INNER@" -> + "@P.PREFIX.MIDDLE@" "@P.LOWTONE.ON@" || "^L" _ ; + +# Some middle and inner prefixes (e.g., middle íH- conative/half-transitive, +# inner ná-) appear with outer prefix TAMA chunk allomorphs. In order to +# get this right, we append the symbol "^O" after the vowel in the lexical +# entry (e.g., í^H^O-, ná^O-), then turn that into a flag that requires outer +# prefix allomorphs. +define RequireOuterAllomorphs "^O" "@P.PREFIX.INNER@" -> "@P.PREFIX.OUTER@" .o. + "^O" "@P.PREFIX.MIDDLE@" -> "@P.PREFIX.OUTER@" .o. 
+ "^O" -> "@P.PREFIX.OUTER@"; + +# We temporarily keep the boundary symbol for inner affixes ("."), middle +# affixes ("_"), and outer affixes ("=") in place so that we can target +# morphophonology in each position more easily (especially when aiming to drop +# the "weak" /i/ vowels that appear in inner lexical prefixes). +read regex [Stems PostverbalAffixes] .o. + MarkPrefixes .o. + InsInner .o. InsMiddle .o. InsOuter .o. + LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o. + "." -> "." "@P.BOUND.INN-L@" InnerAffixes "@P.BOUND.INN-R@" , + "_" -> "_" "@P.BOUND.MID-L@" MiddleAffixes "@P.BOUND.MID-R@" , + "=" -> "=" "@P.BOUND.OUT-L@" OuterAffixes "@P.BOUND.OUT-R@" ; +define WordForms; + +# Rewrite rule for resurrecting the prefix boundary markers + +define ShowBoundaries [ "@P.BOUND.INN-L@" -> "(" , + "@P.BOUND.MID-L@" -> "[" , + "@P.BOUND.OUT-L@" -> "<" , + "@P.BOUND.INN-R@" -> ")" , + "@P.BOUND.MID-R@" -> "]" , + "@P.BOUND.OUT-R@" -> ">" +]; + +# define SimplifyBoundaries [ [ "<" "<" -> "<" , ">" ">" -> ">" , "[" "[" -> "[" , "]" "]" -> "]" , "(" "(" -> "(" , ")" ")" -> ")" ] +# ]; + +# define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ] +# .o. "=" "_" "." -> "=" +# .o. "_" "." -> "_" +# .o. "=" "_" -> "=" +# .o. [ "=" | "_" | "." ] -> 0 || .#. _ +# ]; + +# Concatenate ObliqueAffixes and the other inflectional FSTs. +read regex WordForms .o. [..] -> ObliqueAffixes || .#. _; + +twosided flag-diacritics +define Grammar; + +# Morphophonology + +source ../../morphology/phonology.xfscript +define MorphoPhonology; + +read lexc ../../morphology/affixes/verb_tags.lexc +define Tags; + +# Prevent morphophonology from being tripped up by intervening flags +set flag-is-epsilon ON + +# regex Grammar Tags; + +# Tentative code for dealing with Morphophonology as read from a separate +# file. Though we would want to figure a way to compose each rewrite rule +# with the morphological component one-by-one for faster compilation. 
+ +regex [Grammar Tags] .o. MorphoPhonology ; +define VerbModel + +# regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing; + +# Make flags visible, so that they can be converted to explicit boundary markers +set flag-is-epsilon OFF + +# Output boundary markers based on flags + +regex VerbModel .o. ShowBoundaries ; +# define VerbModelWithBound + +# Make flags invisible again, so that they will not intervene in removing excessive boundary markers +set flag-is-epsilon ON + +# regex VerbModelWithBound .o. SimplifyBoundaries ; + +twosided flag-diacritics diff --git a/src/fst/morphology/verb_lexicon.xfscript.in b/src/fst/morphology/verb_lexicon.xfscript.in index 43b743dd..7a534d1d 100644 --- a/src/fst/morphology/verb_lexicon.xfscript.in +++ b/src/fst/morphology/verb_lexicon.xfscript.in @@ -96,11 +96,25 @@ read regex [Stems PostverbalAffixes] .o. MarkPrefixes .o. InsInner .o. InsMiddle .o. InsOuter .o. LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o. - "." -> "." InnerAffixes , - "_" -> "_" MiddleAffixes , - "=" -> "=" OuterAffixes; + "." -> "." InnerAffixes "@P.PREFIX.INNER@" , + "_" -> "_" MiddleAffixes "@P.PREFIX.MIDDLE@" , + "=" -> "=" OuterAffixes "@P.PREFIX.OUTER@" ; define WordForms; +# Rewrite rule for resurrecting the prefix boundary markers + +define ShowBoundaries [ "@P.PREFIX.INNER@" -> "." , + "@P.PREFIX.MIDDLE@" -> "_" , + "@P.PREFIX.OUTER@" -> "=" +]; + +define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ] +.o. "=" "_" "." -> "=" +.o. "_" "." -> "_" +.o. "=" "_" -> "=" +.o. [ "=" | "_" | "." ] -> 0 || .#. _ +]; + # Concatenate ObliqueAffixes and the other inflectional FSTs. read regex WordForms .o. [..] -> ObliqueAffixes || .#. 
_; @@ -125,9 +139,22 @@ set flag-is-epsilon ON # the the morphological component one-by-one for faster compilation. regex [Grammar Tags] .o. MorphoPhonology ; +define VerbModel # regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing; +# Make flags visible, so that they can be converted to explicit boundary markers +set flag-is-epsilon OFF + +# Output boundary markers based on flags + +regex VerbModel .o. ShowBoundaries ; +define VerbModelWithBound + +# Make flags invisible again, so that they will not intervene in removing excessive boundary markers +set flag-is-epsilon ON + +regex VerbModelWithBound .o. SimplifyBoundaries ; #eliminate flag TAMA #eliminate flag SUBJECTNUMBER diff --git a/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in b/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in index eceb38a2..40e857fb 100755 --- a/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in +++ b/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in @@ -32,6 +32,10 @@ checked_lemmas=checked_lemmas.txt --exclude "(CmpN/Only|ShCmp|\+Cmp\/SplitR| Rreal | R | Rnoun |\+V\+|NOT-TO-LEMMATEST)" \ $source_files > $lemmas +if ! test -f $speller_dir/${GIELLA_LANG}.zhfst ; then + echo missing $speller_dir/${GIELLA_LANG}.zhfst +fi + ####### Start testing: ####### $ospell $speller_dir/${GIELLA_LANG}.zhfst < $lemmas > $checked_lemmas @@ -39,6 +43,8 @@ $ospell $speller_dir/${GIELLA_LANG}.zhfst < $lemmas > $checked_lemmas grep 'is NOT in the lexicon' $checked_lemmas > $rejected_lemmas if [ -s $rejected_lemmas ] ; then + head $rejected_lemmas + echo see $rejected_lemmas for more exit 1 fi