Skip to content

Commit

Permalink
Add ABNF snippets for language tag and media type (#437)
Browse files Browse the repository at this point in the history
Extracted/derived from the RFCs referenced in the GEDCOM spec

Signed-off-by: Dave Thaler <[email protected]>
Co-authored-by: Dave Thaler <[email protected]>
  • Loading branch information
dthaler and dthaler2 authored Feb 22, 2024
1 parent a9bc003 commit 2b63597
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 0 deletions.
6 changes: 6 additions & 0 deletions build/extract-grammars.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ def get_paths():
header = line
if '{' in header: header = header[:header.find('{')]
header = header.strip('# \n\r\t')
with open('languagetag.abnf') as f:
abnf.append(f.read())
with open('mediatype.abnf') as f:
abnf.append(f.read())
with open('core.abnf') as f:
abnf.append(f.read())
with open(join(dst,'grammar.abnf'), 'w') as f:
f.write('''; This document is in ABNF, see <https://tools.ietf.org/html/std68>
; This document uses RFC 7405 to add case-sensitive literals to ABNF.
Expand Down
7 changes: 7 additions & 0 deletions extracted-files/core.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
; Core Rules extracted from RFC 5234 section B.1
ALPHA = %x41-5A / %x61-7A ; A-Z / a-z
;DIGIT = %x30-39 ; 0-9
SP = %x20
HTAB = %x09 ; horizontal tab
DQUOTE = %x22 ; " (Double Quote)
VCHAR = %x21-7E ; visible (printing) characters
72 changes: 72 additions & 0 deletions extracted-files/languagetag.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
; ABNF derived from RFC 5646 section 2.1
Language-Tag = langtag ; normal language tags
             / privateuse ; private use tag
             / grandfathered ; grandfathered tags
langtag = language
          ["-" script]
          ["-" region]
          *("-" variant)
          *("-" extension)
          ["-" privateuse]

language = 2*3ALPHA ; shortest ISO 639 code
           ["-" extlang] ; sometimes followed by
           ; extended language subtags
         / 4ALPHA ; or reserved for future use
         / 5*8ALPHA ; or registered language subtag

extlang = 3ALPHA ; selected ISO 639 codes
          *2("-" 3ALPHA) ; permanently reserved

script = 4ALPHA ; ISO 15924 code

region = 2ALPHA ; ISO 3166-1 code
       / 3digit ; UN M.49 code

variant = 5*8alphanum ; registered variants
        / (digit 3alphanum)

extension = singleton 1*("-" (2*8alphanum))

; Single alphanumerics
; "x" reserved for private use
singleton = digit ; 0 - 9
          / %x41-57 ; A - W
          / %x59-5A ; Y - Z
          / %x61-77 ; a - w
          / %x79-7A ; y - z

privateuse = "x" 1*("-" (1*8alphanum))

grandfathered = irregular ; non-redundant tags registered
              / regular ; during the RFC 3066 era

irregular = "en-GB-oed" ; irregular tags do not match
          / "i-ami" ; the 'langtag' production and
          / "i-bnn" ; would not otherwise be
          / "i-default" ; considered 'well-formed'
          / "i-enochian" ; These tags are all valid,
          / "i-hak" ; but most are deprecated
          / "i-klingon" ; in favor of more modern
          / "i-lux" ; subtags or subtag
          / "i-mingo" ; combination
          / "i-navajo"
          / "i-pwn"
          / "i-tao"
          / "i-tay"
          / "i-tsu"
          / "sgn-BE-FR"
          / "sgn-BE-NL"
          / "sgn-CH-DE"

regular = "art-lojban" ; these tags match the 'langtag'
        / "cel-gaulish" ; production, but their subtags
        / "no-bok" ; are not extended language
        / "no-nyn" ; or variant subtags: their meaning
        / "zh-guoyu" ; is defined by their registration
        / "zh-hakka" ; and all of these are deprecated
        / "zh-min" ; in favor of a more modern
        / "zh-min-nan" ; subtag or sequence of subtags
        / "zh-xiang"

alphanum = (ALPHA / digit) ; letters and numbers
40 changes: 40 additions & 0 deletions extracted-files/mediatype.abnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
; ABNF derived from RFC 2045 section 5.1
type = discrete-type / composite-type
discrete-type = "text" / "image" / "audio" / "video" /
                "application" / extension-token
composite-type = "message" / "multipart" / extension-token
extension-token = ietf-token / x-token
ietf-token = type-name
x-token = "x-" token
subtype = extension-token / iana-token
iana-token = subtype-name

; ABNF derived from RFC 6838 section 4.2
type-name = restricted-name
subtype-name = restricted-name

restricted-name = restricted-name-first *126restricted-name-chars
restricted-name-first = ALPHA / digit
restricted-name-chars = ALPHA / digit / "!" / "#" /
                        "$" / "&" / "-" / "^" / "_"
restricted-name-chars =/ "." ; Characters before first dot always
; specify a facet name
restricted-name-chars =/ "+" ; Characters after last plus always
; specify a structured syntax suffix

; ABNF derived from RFC 9110 section 5.6
parameters = *( OWS ";" OWS [ parameter ] )
parameter = parameter-name "=" parameter-value
parameter-name = token
parameter-value = ( token / quoted-string )
token = 1*tchar
tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*"
      / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~"
      / digit / ALPHA
      ; any VCHAR, except delimiters
OWS = *( SP / HTAB )
; optional whitespace
quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE
qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text
obs-text = %x80-FF
quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text )

0 comments on commit 2b63597

Please sign in to comment.