CLDR-18197 kbd: update spec to mention abnf

- add keyboard abnf and sample files and automated tests
unicode-org · Jan 2, 2025 · afa4af5 · afa4af5
1 parent 9bbbc77
commit afa4af5
Show file tree

Hide file tree

Showing 6 changed files with 274 additions and 1 deletion.
diff --git a/.github/workflows/keyboard.yml b/.github/workflows/keyboard.yml
@@ -37,5 +37,7 @@ jobs:
         run: npm install -g @keymanapp/kmc
       - name: Compile Keyboards
         run: kmc --error-reporting build keyboards/3.0/*.xml
+      - name: Check ABNF
+        run: bash tools/scripts/keyboard-abnf-tests/check-keyboard-abnf.sh
       - name: Run Kbd Charts
         run: 'cd docs/charts/keyboards && npm ci && npm run build'
diff --git a/docs/ldml/tr35-keyboards.md b/docs/ldml/tr35-keyboards.md
@@ -121,6 +121,7 @@ The LDML specification is divided into the following parts:
     * [Additional Features](#additional-features)
     * [Disallowed Regex Features](#disallowed-regex-features)
     * [Replacement syntax](#replacement-syntax)
+    * [Transform ABNF](#transform-abnf)
   * [Element: reorder](#element-reorder)
     * [Using `<import>` with `<reorder>` elements](#using-import-with-reorder-elements)
     * [Example Post-reorder transforms](#example-post-reorder-transforms)
@@ -2412,6 +2413,33 @@ Used in the `to=`
 
     Emits the named mark. Also see [Markers](#markers).
 
+#### Transform ABNF
+
+The grammar for the transform rules is formally defined
+using the ABNF notation [[STD68](https://www.rfc-editor.org/info/std68)],
+including the modifications found in [RFC 7405](https://www.rfc-editor.org/rfc/rfc7405).
+
+RFC 7405 extends ABNF with support for case-sensitive string literals.
+Some ABNF tools are only compatible with the specification found in
+[RFC 5234](https://www.rfc-editor.org/rfc/rfc5234).
+
+The ABNF files are located in the `keyboards/abnf` directory in the CLDR source directory:
+
+ * `transform-from-required.abnf`
+    This is an ABNF for the `<transform from="…"/>` attribute.  The `from=` attribute MUST match this ABNF. Not all strings which match this ABNF are valid:
+    * It is an error if there are more than 9 capture groups
+    * Unicode codepoints and escaped characters may not be invalid or unpaired surrogates
+    * The CLDR repository may have additional requirements on the repertoire, such as requiring all characters to be in a published Unicode version and disallowing private-use characters.
+    * It is an error to reference variables that are not defined.
+
+ * `transform-to-required.abnf`
+    This is an ABNF for the `<transform to="…"/>` attribute.  The `to=` attribute MUST match this ABNF. Not all strings which match this ABNF are valid:
+
+    * It is an error if a capture group is referenced that is not present in the match string.
+    * It is an error if the to= string has the `$[1:…]` set format but there is not exactly one capture group with a set variable on the from= side. See [Replacement syntax](#replacement-syntax).
+    * Unicode codepoints and escaped characters may not be invalid or unpaired surrogates
+    * The CLDR repository may have additional requirements on the repertoire, such as requiring all characters to be in a published Unicode version and disallowing private-use characters.
+
 * * *
 
 ### Element: reorder
@@ -2873,6 +2901,7 @@ The following are the design principles for the IDs.
 
 * * *
 
+
 © 2024–2024 Unicode, Inc.
 This publication is protected by copyright, and permission must be obtained from Unicode, Inc.
 prior to any reproduction, modification, or other use not permitted by the [Terms of Use](https://www.unicode.org/copyright.html).
@@ -2885,4 +2914,4 @@ The authors, contributors, and publishers have taken care in the preparation of
 but make no express or implied representation or warranty of any kind and assume no responsibility or liability for errors or omissions or for consequential or incidental damages that may arise therefrom.
 This publication is provided “AS-IS” without charge as a convenience to users.
 
-Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the United States and other countries.
+Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the United States and other countries.
diff --git a/keyboards/abnf/transform-from-required.abnf b/keyboards/abnf/transform-from-required.abnf
@@ -0,0 +1,135 @@
+; Copyright (c) 2025 Unicode, Inc.
+; For terms of use, see http://www.unicode.org/copyright.html
+; SPDX-License-Identifier: Unicode-3.0
+; CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
+
+; This is an ABNF grammar for the CLDR Keyboard spec transform match syntax.
+; Note that there are sample matching/failing data files in tools/scripts/keyboard-abnf-tests/
+
+; An entire <transform from="..." /> string.
+; Note that the empty string is not a match.
+; Also note that a string may match this ABNF but be invalid according to the spec - which see.
+
+from-match        = start-context atoms / atoms
+
+start-context  = "^"
+
+; an empty match is not allowed.
+atoms             = atom *(disjunction atom / atom)
+
+disjunction = "|"
+
+atom = quark quantifier / quark
+
+quark  = non-group / group
+
+non-group = simple-matcher / codepointseq / variable
+
+variable = string-variable / set-variable
+
+string-variable = "${" var-id "}"
+set-variable = "$[" var-id "]"
+
+group = capturing-group / non-capturing-group
+
+quantifier    =  bounded-quantifier / optional-quantifier
+
+; one or more space-separated hex codepoints in a single escape, e.g. \u{1234 5678}
+codepointseq           = backslash "u" "{" cphexseq "}"
+; exactly one hex codepoint, e.g. \u{1234}; this is what a char-range edge uses,
+; so it must not accept a space-separated sequence.
+codepoint           = backslash "u" "{" cphex "}"
+
+bounded-quantifier = "{" DIGIT "," DIGIT "}"
+optional-quantifier =  "?"
+
+non-capturing-group = "(" "?" ":" atoms ")"
+
+; a capturing group may not contain other capturing groups.
+capturing-group = "(" catoms ")"
+
+; capturing atoms can't include any groups
+catoms = catom *(catom)
+; capturing atoms can't include any groups.
+; The quantified alternative comes first, matching the order used by `atom`:
+; a matcher that tries alternatives in order would otherwise accept the bare
+; cquark and never consume a following quantifier.
+catom = cquark quantifier / cquark
+
+; capturing atoms can't include groups
+cquark = non-group
+
+; multiple hex codepoints
+cphexseq = cphex *(SP cphex)
+
+; one hex codepoint (1-6 digits)
+cphex =  1*6LHEXDIG
+
+simple-matcher      = text-char / class / match-any-codepoint / match-marker
+
+match-any-codepoint = "."
+
+match-marker = match-any-marker / match-named-marker
+match-any-marker = "\m{.}"
+match-named-marker = "\m{" marker-id "}"
+
+; marker id is nmtoken, but may be UAX31 in the future.
+marker-id = NMTOKEN
+; variable ID
+var-id = 1*32IDCHAR
+
+class = fixed-class / set-class
+
+fixed-class = backslash fixed-class-char
+
+fixed-class-char = "s" / "S" / "t" / "r" / "n" / "f" / "v" / backslash / "$" / "d" / "w" / "D" / "W" / "0"
+
+set-class = "[" set-negator set-members "]"
+set-members = set-member *(set-member)
+set-member = text-char / char-range / match-marker
+char-range = range-edge "-" range-edge
+range-edge = codepoint / range-char
+set-negator = "^" / ""
+
+; Restrictions on characters in various contexts
+
+; normal text
+text-char         = content-char / ws / escaped-char / "-" / ":"
+; text in a range sequence
+range-char        = content-char / ws / escaped-char / "."/ "|" / "{" / "}"
+; group for everything BUT syntax chars.
+content-char      = ASCII-CTRLS / ASCII-PUNCT / ALPHA / DIGIT / NON-ASCII
+
+; Character escapes
+escaped-char = backslash ( backslash / "{" / "|" / "}" )
+
+backslash    = %x5C ; U+005C REVERSE SOLIDUS "\"
+ws = SP / HTAB / CR / LF / %x3000
+
+IDCHAR = ALPHA / DIGIT / "_"
+ASCII-CTRLS        = %x01-08       ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
+                  / %x0B-0C        ; omit CR (%x0D)
+                  / %x0E-1F        ; omit SP (%x20)
+ASCII-PUNCT        = %x21-23       ; omit DOLLAR
+                  / %x25-27        ; omit () * +
+                  / %x2C           ; omit . (%x2E) and - (%x2D)
+                  / %x2F           ; skip over digits and :
+                  / %x3B-3E        ; omit ? 3f
+                  / %x5F           ; omit upper A-Z and [\]^
+                  / %x60           ; omit a-z {|}
+                  / %x7E-7F        ; just for completeness
+; %x7E-7F are ASCII and are already covered by ASCII-PUNCT, so start at %x80.
+NON-ASCII =         %x80-D7FF      ; omit surrogates
+                  / %xE000-10FFFF  ; that's the rest. (TODO: omit other non-characters)
+
+; from STD-68 (core rules)
+DIGIT          =  %x30-39             ; 0-9
+ALPHA          =  %x41-5A / %x61-7A   ; A-Z / a-z
+SP             =  %x20
+HTAB           =  %x09                ; horizontal tab
+LF             =  %x0A                ; linefeed
+CR             =  %x0D                ; carriage return
+HEXDIG         =  DIGIT / "A" / "B" / "C" / "D" / "E" / "F"
+; like HEXDIG but lowercase also
+LHEXDIG         =  HEXDIG / "a" / "b" / "c" / "d" / "e" / "f"
+
+; from XML
+NAMESTARTCHAR   =   	":" / ALPHA / "_" / %xC0-D6 / %xD8-F6 / %xF8-2FF / %x370-37D / %x37F-1FFF / %x200C-200D / %x2070-218F / %x2C00-2FEF / %x3001-D7FF / %xF900-FDCF / %xFDF0-FFFD / %x10000
+;           TODO: / %x10000-EFFFE
+NAMECHAR	   =   	NAMESTARTCHAR / "-" / "." / DIGIT / %xB7 / %x0300-036F / %x203F-2040
+; NAME	   =   	NAMESTARTCHAR *(NAMECHAR)
+NMTOKEN	   =   	1*NAMECHAR
+; NMTOKENS	   =   	NMTOKEN *(SP NMTOKEN)
diff --git a/tools/scripts/keyboard-abnf-tests/check-keyboard-abnf.sh b/tools/scripts/keyboard-abnf-tests/check-keyboard-abnf.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+ABNF_DIR=keyboards/abnf
+TEST_DIR=tools/scripts/keyboard-abnf-tests
+abnf_check="npx --package=abnf abnf_check"
+abnf_test="npx --package=abnf abnf_test"
+
+echo "-- checking ABNF --"
+
+for abnf in ${ABNF_DIR}/*.abnf; do
+    echo Validating ${abnf}
+    ${abnf_check} ${abnf} || exit 1
+done
+
+echo "-- running test suites --"
+
+for abnf in ${ABNF_DIR}/*.abnf; do
+    echo Testing ${abnf}
+    base=$(basename ${abnf} .abnf)
+    SUITEDIR=${TEST_DIR}/${base}.d
+    if [[ -d ${SUITEDIR} ]];
+    then
+        echo "  Test suite ${SUITEDIR}"
+        for testf in ${SUITEDIR}/*.pass.txt; do
+            start=$(basename ${testf} .pass.txt)
+            echo "   Testing PASS ${testf} for ${start}"
+            while IFS="" read -r str || [ -n "$str" ]
+            do
+                if echo "${str}" | grep -v -q '^#'; then
+                    echo "# '${str}'"
+                    (${abnf_test} ${abnf} -t "${str}") 2>&1 >/dev/null || exit 1
+                fi
+            done <${testf}
+        done
+        for testf in ${SUITEDIR}/*.fail.txt; do
+            start=$(basename ${testf} .fail.txt)
+            echo "   Testing FAIL ${testf} for ${start}"
+            while IFS="" read -r str || [ -n "$str" ]
+            do
+                if echo "${str}" | grep -v -q '^#'; then
+                    echo "# '${str}'"
+                    (${abnf_test} ${abnf} -t "${str}") 2>&1 > /dev/null && (echo ERROR should have failed ; exit 1)
+                fi
+            done <${testf}
+        done
+    else
+        echo "  Warning: ${SUITEDIR} did not exist"
+    fi
+    # npx --package=abnf abnf_check ${abnf} || exit 1
+done
+
+echo "All OK"
+exit 0
+
diff --git a/tools/scripts/keyboard-abnf-tests/transform-from-required.d/from-match.fail.txt b/tools/scripts/keyboard-abnf-tests/transform-from-required.d/from-match.fail.txt
@@ -0,0 +1,29 @@
+# this is a comment
+# a capturing group may not contain another group (capturing or non-capturing)
+(?:foo(?:bar(baz(?:bat))?))
+(?:foo(?:bar(baz(bat))?))
+((a|b|c)|(d|e|f))
+# disallowed features
+# empty string
+
+# props
+\p{Greek}
+# backreferences
+([abc])-\1 \k<something>
+# unbounded quantifiers
+.*
+(abc)*
+e(abc)+g
+(abc){1,}
+(abc){0,}
+(abc)*?
+(abc)+?
+# named capture groups
+(?<something>)
+# Assertions
+\b
+\B
+(?<!abc)
+# end marker
+Foo$
+^Foo$
diff --git a/tools/scripts/keyboard-abnf-tests/transform-from-required.d/from-match.pass.txt b/tools/scripts/keyboard-abnf-tests/transform-from-required.d/from-match.pass.txt
@@ -0,0 +1,24 @@
+#This is a comment.
+abc
+abc 𐒵
+#def?
+(def)?
+ab(cd\u{1234}ef){2,3}
+\u{1234} \u{012A} \u{22} \u{012a} \u{1234A} \u{123 456}
+(?:foo(?:bar(baz)?))
+([abc])([def\m{w}])
+(?:thismatches)
+(?:[abc]([def]))|(?:[ghi])
+abc|def
+# NO: nested!
+##(?:foo(?:bar(baz(?:bat))?))
+##(?:foo(?:bar(baz(bat))?))
+\m{q}:
+#\m{q}L
+#\m{q}।
+\m{q}ড\m{.}
+#\m{q}ঢ
+#\m{q}ত
+#\m{q}য
+#\m{q}র
+#\m{q}ল