Merge branch 'main' of https://github.com/giellalt/lang-srs

giellalt · May 1, 2024 · 44d5973 · 44d5973
2 parents a692f18 + cadabe6
commit 44d5973
Show file tree

Hide file tree

Showing 6 changed files with 218 additions and 22 deletions.
diff --git a/.gut/delta.toml b/.gut/delta.toml
@@ -1,6 +1,6 @@
 template = "https://github.com/giellalt/template-lang-und"
-rev_id = 172
-template_sha = "ca311e6ba41f16538c7268df70c7048082eaadb2"
+rev_id = 175
+template_sha = "bf3ac2ead0081366d7a999df6f804fc6662bbe30"
 
 [replacements]
 __REPO__ = "lang-srs"

diff --git a/Makefile.am b/Makefile.am
@@ -20,6 +20,12 @@ banner:
 # 'nothing to be done for test'. By forwarding test to check we work around it.
 test: check
 
+# Recurse into the tools subdirectories listed below and run their 'dev'
+# target. Declared phony so a file or directory named 'dev' cannot mask it.
+.PHONY: dev
+dev:
	$(MAKE) dev -C tools/tokenisers
	$(MAKE) dev -C tools/grammarcheckers
	$(MAKE) dev -C tools/tts
+
 # Remove html tables created by some of the developer tools:
 clean-local:
 	rm -f *.html
diff --git a/m4/giella-macros.m4 b/m4/giella-macros.m4
@@ -88,7 +88,7 @@ AC_MSG_RESULT([$GIELLA_CORE])
 ###############################################################
 ### This is the version of the Giella Core that we require. ###
 ### UPDATE AS NEEDED.
-_giella_core_min_version=0.22.0
+_giella_core_min_version=0.23.0
 
 # GIELLA_CORE/GTCORE env. variable, required by the infrastructure to find scripts:
 AC_ARG_VAR([GIELLA_CORE], [directory for the Giella infra core scripts and other required resources])
@@ -162,21 +162,7 @@ AS_IF([test "x$enable_yamltests" = "xcheck"],
 
 AM_CONDITIONAL([CAN_YAML_TEST], [test "x$enable_yamltests" != xno])
 
-################ LXML or pip ################
-AS_IF([test "x$enable_grammarchecker" != "xno"],
-     [AM_PATH_PYTHON([3.5],, [:])
-     AX_PYTHON_MODULE(lxml)
-     AX_PYTHON_MODULE(pip)
-     AC_MSG_CHECKING([whether we can use lxml])
-     AS_IF([test "x$HAVE_PYMOD_LXML" != "xyes"],
-           AS_IF([test "x$HAVE_PYMOD_PIP" != "xno"],
-                 AC_MSG_RESULT(no)
-                 AC_MSG_WARN([lxml or pip is needed for grammarcheckers]),
-                 AC_MSG_RESULT([no but using pip])),
-           AC_MSG_RESULT(yes))])
-
-AM_CONDITIONAL([CAN_LXML], [test "x$HAVE_PYMOD_LXML" != xno])
-AM_CONDITIONAL([CAN_PIP], [test "x$HAVE_PYMOD_LXML" != xno])
+
 ################ Generated documentation ################
 # Check for awk with required feature:
 AC_CACHE_CHECK([for awk that supports gensub], [ac_cv_path_GAWK],
@@ -676,6 +662,18 @@ AS_IF([test "x$enable_grammarchecker" = "xyes" -a "x$gt_prog_vislcg3" = "xno"],
 AS_IF([test "x$enable_ci" = "xyes" -a "x$enableval" = "x"], [enable_grammarchecker=no])
 AM_CONDITIONAL([WANT_GRAMCHECK], [test "x$enable_grammarchecker" != xno])
 enableval=''
+################ gtgramtool for grammarchecking ################
+# Locate gtgramtool on PATH; GTGRAMTOOL is set to false when it is not found.
+# When the grammar checker is enabled, a missing gtgramtool is a hard error.
+# Both AS_IF branches are quoted so that the yes/no result belongs to the
+# inner test and is only printed after the matching checking message.
+AC_PATH_PROG([GTGRAMTOOL], [gtgramtool], [false])
+AS_IF([test "x$enable_grammarchecker" != "xno"],
+    [AX_PYTHON_MODULE(pip)
+    AC_MSG_CHECKING([whether we have gtgramtool])
+    AS_IF([test "x$GTGRAMTOOL" = xfalse],
+        [AC_MSG_RESULT([no])
+        AC_MSG_ERROR([gtgramtool is needed for --enable-grammarchecker.
+        on debian/ubuntu: sudo apt update; sudo apt install pipx; pipx ensurepath
+        on macbrew: brew install pipx; pipx ensurepath
+        then: pipx install git+https://github.com/divvun/giellaltgramtools
+      ])],
+        [AC_MSG_RESULT([yes])])])
 
 # Enable all spellers - default is 'no'
 AC_ARG_ENABLE([spellers],
@@ -1048,7 +1046,6 @@ cd ..
 git clone [email protected]:giellalt/$gt_SHARED_FAILS
 cd $gt_SHARED_FAILS
 ./autogen.sh && ./configure && make])])
-AC_MSG_WARN([January 2024: the lexc files and fsts have been moved up to src/fst/morphology])
 ]) # gt_PRINT_FOOTER
 
 # vim: set ft=config:
diff --git a/src/fst/morphology/incoming/srs-bound-demo.xfscript b/src/fst/morphology/incoming/srs-bound-demo.xfscript
@@ -0,0 +1,160 @@
+# Demo script on morpheme boundary marking
+
+read lexc ../../morphology/stems/verb_stems.lexc
+define Stems
+
+read lexc ../../morphology/affixes/verb_inner_affixes.lexc
+define InnerAffixes
+
+read lexc ../../morphology/affixes/verb_middle_affixes.lexc
+define MiddleAffixes
+
+read lexc ../../morphology/affixes/verb_outer_affixes.lexc
+define OuterAffixes
+
+read lexc ../../morphology/affixes/verb_oblique_affixes.lexc
+define ObliqueAffixes
+
+read lexc ../../morphology/affixes/postverbal_affixes.lexc
+define PostverbalAffixes
+
+# Mark prefix types: insert the corresponding flag before each boundary marker
+# of the original prefixes we know to be present from the lexical entry, e.g.
+# ab=cd_ef.gh => ab@P.PREFIX.OUTER@=cd@P.PREFIX.MIDDLE@_ef@P.PREFIX.INNER@.gh
+
+define MarkPrefixes ~$["@P.PREFIX.OUTER@"|"@P.PREFIX.INNER@"|"@P.PREFIX.MIDDLE@"] .o.
+                  "=" -> "@P.PREFIX.OUTER@" "=" , 
+                  "_" -> "@P.PREFIX.MIDDLE@" "_" , 
+                  "." -> "@P.PREFIX.INNER@" ".";
+
+# Insert . (inner), _ (middle), and = (outer) if missing in the intermediate rep:
+# tsiy > =_.tsiy
+# ts'á=zíd > ts'á=_.zíd
+# tsí=di.tł'á > tsí=_di.tł'á
+# gu.blah > =_gu.blah
+# gu_blah > =gu_.blah
+
+# (1) No . > insert . after last marker (= or _), or in the beginning if none exists
+# (2) No _ > insert _ (a) after =, if one exists, or (b) beginning
+# (3) No = > insert at beginning
+
+define InsInner  [..] -> "." || "_" _ ~$"." .#. .o. 
+              [..] -> "." || "=" _ ~$"." .#. .o. 
+              [..] -> "." || .#. _ ~$"." .#. ;
+
+define InsMiddle [..] -> "_" || "=" _ ~$["_"] .#. .o. 
+              [..] -> "_" || .#. _ ~$["_"] .#. ; 
+
+define InsOuter  [..] -> "=" || .#. _ ~$["="] .#. ;
+
+# Some lexical entries include a "^L" immediately before the inner-prefix
+# boundary marker "." (e.g., xá=_^L.ʔò "take it [solid object] out", or
+# ta=_di^L.ʔò "lift/pick/hold it [solid object] up"), with or without
+# any other inner prefixes present.  In these (rare) cases, Tsuut'ina uses
+# middle prefix TAMA allomorphs, rather than outer or inner prefix forms, e.g.
+#
+#   2SG  xáaʔò      "you (sg.) will carry it (solid object) past"
+#                   (not *xaniʔò or something similar if 0-IPFV outer or no
+#                    preceding prefix allomorphs were used)
+#   1PL  xáasaàʔò   "we will carry it (solid object) past"
+#                   (not *xaàʔò or something similar if 0-IPFV inner prefix
+#                    allomorphs were used)
+#
+#   2SG  tadiʔò     "you (sg.) will lift it (solid object) up"
+#                   (not *tadiniʔò if 0-IPFV outer or no preceding prefix
+#                    allomorphs were used)
+#   1PL  tadìsaàʔò  "we will lift it (solid object) up"
+#                   (not *tadaàʔò or something similar if 0-IPFV inner
+#                    prefix allomorphs were used)
+#
+# This rule therefore turns @P.PREFIX.INNER@ into @P.PREFIX.MIDDLE@ to ensure
+# that middle-prefix allomorphs are used, then sets an additional flag
+# (@P.LOWTONE.ON@) to help differentiate between this situation and all other
+# middle-prefix contexts (for use in 'affixes/verb_inner_affixes.lexc').
+define LInnerPrefixAllomorphs "@P.PREFIX.INNER@" -> 
+    "@P.PREFIX.MIDDLE@" "@P.LOWTONE.ON@" || "^L" _ ;
+
+# Some middle and inner prefixes (e.g., middle íH- conative/half-transitive,
+# inner ná-) appear with outer prefix TAMA chunk allomorphs. In order to
+# get this right, we append the symbol "^O" after the vowel in the lexical
+# entry (e.g., í^H^O-, ná^O-), then turn that into a flag that requires outer
+# prefix allomorphs.
+define RequireOuterAllomorphs "^O" "@P.PREFIX.INNER@" -> "@P.PREFIX.OUTER@" .o.
+                              "^O" "@P.PREFIX.MIDDLE@" -> "@P.PREFIX.OUTER@" .o.
+                              "^O" -> "@P.PREFIX.OUTER@";
+
+# We temporarily keep the boundary symbol for inner affixes ("."), middle
+# affixes ("_"), and outer affixes ("=") in place so that we can target
+# morphophonology in each position more easily (especially when aiming to drop
+# the "weak" /i/ vowels that appear in inner lexical prefixes).
+read regex [Stems PostverbalAffixes] .o. 
+            MarkPrefixes .o. 
+            InsInner .o. InsMiddle .o. InsOuter .o. 
+            LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o.
+            "." -> "." "@P.BOUND.INN-L@" InnerAffixes "@P.BOUND.INN-R@" ,
+            "_" -> "_" "@P.BOUND.MID-L@" MiddleAffixes "@P.BOUND.MID-R@" ,
+            "=" -> "=" "@P.BOUND.OUT-L@" OuterAffixes "@P.BOUND.OUT-R@" ;
+define WordForms;
+
+# Rewrite rule for resurrecting the prefix boundary markers
+
+define ShowBoundaries [ "@P.BOUND.INN-L@" -> "(" , 
+    "@P.BOUND.MID-L@" -> "[" ,
+    "@P.BOUND.OUT-L@" -> "<" ,
+    "@P.BOUND.INN-R@" -> ")" , 
+    "@P.BOUND.MID-R@" -> "]" ,
+    "@P.BOUND.OUT-R@" -> ">"
+];
+
+# define SimplifyBoundaries [ [ "<" "<" -> "<" , ">" ">" -> ">" , "[" "[" -> "[" , "]" "]" -> "]" , "(" "(" -> "(" , ")" ")" -> ")" ]
+# ];
+
+# define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ]
+# .o. "=" "_" "." -> "="
+# .o. "_" "." -> "_"
+# .o. "=" "_" -> "="
+# .o. [ "=" | "_" | "." ] -> 0 || .#. _
+# ];
+
+# Concatenate ObliqueAffixes and the other inflectional FSTs.
+read regex WordForms .o. [..] -> ObliqueAffixes || .#. _;
+
+twosided flag-diacritics
+define Grammar;
+
+# Morphophonology
+
+source ../../morphology/phonology.xfscript
+define MorphoPhonology;
+
+read lexc ../../morphology/affixes/verb_tags.lexc
+define Tags;
+
+# Prevent the morphophonology from being tripped up by intervening flags
+set flag-is-epsilon ON
+
+# regex Grammar Tags;
+
+# Tentative code for dealing with Morphophonology as read from a separate
+# file, though we would want to figure out a way to compose each rewrite rule
+# with the morphological component one-by-one for faster compilation.
+
+regex [Grammar Tags] .o. MorphoPhonology ;
+define VerbModel
+
+# regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing;
+
+# Make flags visible, so that they can be converted to explicit boundary markers
+set flag-is-epsilon OFF
+
+# Output boundary markers based on flags
+
+regex VerbModel .o. ShowBoundaries ;
+# define VerbModelWithBound
+
+# Make flags invisible again, so that they will not intervene in removing excessive boundary markers
+set flag-is-epsilon ON
+
+# regex VerbModelWithBound .o. SimplifyBoundaries ;
+
+twosided flag-diacritics
diff --git a/src/fst/morphology/verb_lexicon.xfscript.in b/src/fst/morphology/verb_lexicon.xfscript.in
@@ -96,11 +96,25 @@ read regex [Stems PostverbalAffixes] .o.
             MarkPrefixes .o. 
             InsInner .o. InsMiddle .o. InsOuter .o. 
             LInnerPrefixAllomorphs .o. RequireOuterAllomorphs .o.
-            "." -> "." InnerAffixes ,
-            "_" -> "_" MiddleAffixes ,
-            "=" -> "=" OuterAffixes;
+            "." -> "." InnerAffixes "@P.PREFIX.INNER@" ,
+            "_" -> "_" MiddleAffixes "@P.PREFIX.MIDDLE@" ,
+            "=" -> "=" OuterAffixes "@P.PREFIX.OUTER@" ;
 define WordForms;
 
+# Rewrite rule for resurrecting the prefix boundary markers
+
+define ShowBoundaries [ "@P.PREFIX.INNER@" -> "." , 
+    "@P.PREFIX.MIDDLE@" -> "_" ,
+    "@P.PREFIX.OUTER@" -> "="
+];
+
+define SimplifyBoundaries [ [ "=" "=" -> "=" , "_" "_" -> "_" , "." "." -> "." ]
+.o. "=" "_" "." -> "="
+.o. "_" "." -> "_"
+.o. "=" "_" -> "="
+.o. [ "=" | "_" | "." ] -> 0 || .#. _
+];
+
 # Concatenate ObliqueAffixes and the other inflectional FSTs.
 read regex WordForms .o. [..] -> ObliqueAffixes || .#. _;
 
@@ -125,9 +139,22 @@ set flag-is-epsilon ON
 # the the morphological component one-by-one for faster compilation.
 
 regex [Grammar Tags] .o. MorphoPhonology ;
+define VerbModel
 
 # regex [Grammar Tags] .o. deletePrefixI .o. hToneSpreading .o. hToneSpreadingCleanup .o. deleteBoundarySymbol .o. uBeforeA .o. aBeforeI .o. lowABeforeI .o. iBeforeA .o. iBeforeO .o. uBeforeO .o. lInitialStemsSbjPl2 .o. lInitialStemsSbjPl2Cleanup .o. slDissimilation .o. zhDevoicing .o. zDevoicing;
 
+# Make flags visible, so that they can be converted to explicit boundary markers
+set flag-is-epsilon OFF
+
+# Output boundary markers based on flags
+
+regex VerbModel .o. ShowBoundaries ;
+define VerbModelWithBound
+
+# Make flags invisible again, so that they will not intervene in removing excessive boundary markers
+set flag-is-epsilon ON
+
+regex VerbModelWithBound .o. SimplifyBoundaries ;
 
 #eliminate flag TAMA
 #eliminate flag SUBJECTNUMBER

diff --git a/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in b/test/tools/spellcheckers/fstbased/desktop/hfst/accept-all-lemmas.sh.in
@@ -32,13 +32,19 @@ checked_lemmas=checked_lemmas.txt
     --exclude "(CmpN/Only|ShCmp|\+Cmp\/SplitR| Rreal | R | Rnoun |\+V\+|NOT-TO-LEMMATEST)" \
     $source_files > $lemmas
 
+if ! test -f $speller_dir/${GIELLA_LANG}.zhfst ; then
+    echo missing $speller_dir/${GIELLA_LANG}.zhfst
+fi
+
 ####### Start testing: #######
 
 $ospell $speller_dir/${GIELLA_LANG}.zhfst < $lemmas > $checked_lemmas
 
 grep 'is NOT in the lexicon' $checked_lemmas > $rejected_lemmas
 
 if [ -s $rejected_lemmas ] ; then
+    head $rejected_lemmas
+    echo see $rejected_lemmas for more
     exit 1
 fi