From 768c8f1a4cba056ec791a87af78127ca24a01303 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Erik=20K=C3=B6rner?=
Date: Thu, 11 Jul 2024 22:19:37 +0200
Subject: [PATCH] Add CoNNL-U language support, see #3790

---
 components.json                               |   4 +
 components/prism-conllu.js                    | 183 ++++++
 examples/prism-conllu.html                    | 119 ++++
 tests/languages/conllu/index_feature.test     |  97 +++
 tests/languages/conllu/issue3790.test         | 151 +++++
 .../conllu/morphological_annotation.test      | 135 ++++
 .../languages/conllu/sentence_boundaries.test | 582 ++++++++++++++++++
 .../conllu/sentence_separator_feature.test    | 122 ++++
 .../conllu/syntactical_annotation.test        | 189 ++++++
 9 files changed, 1582 insertions(+)
 create mode 100644 components/prism-conllu.js
 create mode 100644 examples/prism-conllu.html
 create mode 100644 tests/languages/conllu/index_feature.test
 create mode 100644 tests/languages/conllu/issue3790.test
 create mode 100644 tests/languages/conllu/morphological_annotation.test
 create mode 100644 tests/languages/conllu/sentence_boundaries.test
 create mode 100644 tests/languages/conllu/sentence_separator_feature.test
 create mode 100644 tests/languages/conllu/syntactical_annotation.test

diff --git a/components.json b/components.json
index b353778e90..e0eb7433fe 100644
--- a/components.json
+++ b/components.json
@@ -342,6 +342,10 @@
 			"alias": "conc",
 			"owner": "jasontatton"
 		},
+		"conllu": {
+			"title": "CoNLL-U",
+			"owner": "Querela"
+		},
 		"csp": {
 			"title": "Content-Security-Policy",
 			"owner": "ScottHelme"
diff --git a/components/prism-conllu.js b/components/prism-conllu.js
new file mode 100644
index 0000000000..f425a0ec57
--- /dev/null
+++ b/components/prism-conllu.js
@@ -0,0 +1,183 @@
+(function (Prism) {
+
+	/**
+	 * Prism grammar for CoNLL-U, the tab-separated annotation format of
+	 * Universal Dependencies (https://universaldependencies.org/format.html).
+	 *
+	 * The grammar itself only distinguishes comment lines, the blank line
+	 * between two sentences and word lines (an ID followed by generic `value`
+	 * fields). The `after-tokenize` hook below turns every `value` into its
+	 * column-specific token type (`form`, `lemma`, `upos`, ...).
+	 */
+	Prism.languages.conllu = {
+		// Comment lines. Anchored to the start of a line so that a "#" inside a
+		// word line (e.g. as FORM or LEMMA) does not start a comment.
+		'comment': {
+			pattern: /^#[^\n]*/m,
+			inside: {
+				// "# key = value" metadata
+				'metadata': {
+					pattern: /\w+\s*=\s*.*/,
+					inside: {
+						'key': {
+							pattern: /\w+(?=\s*=)/,
+							alias: 'property',
+						},
+						'value': {
+							pattern: /(\s*=\s*)\S.*$/,
+							lookbehind: true,
+							alias: 'string',
+						},
+						'operator': /=/,
+					},
+				},
+				'punctuation': /^#/,
+			},
+		},
+		// Separator between two sentence blocks: the line break is consumed by the
+		// lookbehind group, so the token itself is empty.
+		'sentence-separator': {
+			pattern: /(\r?\n)(?=\r?\n)/,
+			lookbehind: true,
+		},
+		// Word lines
+		'token': {
+			pattern: /.+/,
+			inside: {
+				'id': {
+					pattern: /^\d+(?:[.-]\d+)?/,
+					alias: 'number',
+				},
+				// form / lemma / upos / xpos / feats / head / deprel / deps / misc;
+				// retyped by position in the `after-tokenize` hook below
+				'value': {
+					pattern: /^(\t)[^\t]*(?=\t|$)/,
+					lookbehind: true,
+				},
+			},
+		},
+	};
+
+	const featKeyExp = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/;
+
+	// Grammar for the "|"-separated Key=Value lists of the FEATS and MISC columns.
+	// `punctuation` comes first: it splits the list at "|" so that the anchored
+	// `feature` pattern is matched against one pair at a time.
+	const featsGrammar = {
+		'punctuation': /\|/,
+		'feature': {
+			pattern: RegExp('^' + featKeyExp.source + '=.*$'),
+			inside: {
+				'key': {
+					pattern: RegExp(featKeyExp.source + '(?==)'),
+					alias: 'property',
+				},
+				'value': [
+					{
+						pattern: /(=)(?:yes|no)$/i,
+						lookbehind: true,
+						alias: 'boolean',
+					},
+					{
+						// values are deliberately not validated
+						pattern: /(=).+$/,
+						lookbehind: true,
+						alias: 'string',
+					},
+				],
+				'operator': /=/,
+			},
+		},
+	};
+
+	// Grammar for the "|"-separated head:relation pairs of the DEPS column.
+	const depsGrammar = {
+		'punctuation': /\|/,
+		'dep': {
+			pattern: /^\S+$/,
+			inside: {
+				// a word ID or the decimal ID of an empty node, e.g. "5.1"
+				'head': {
+					pattern: /^\d+(?:\.\d+)?(?=:)/,
+					alias: 'number',
+				},
+				'punctuation': /^:/,
+				// everything after the first colon, subtypes included ("conj:and")
+				'relation': {
+					pattern: /.+/,
+					alias: 'symbol',
+				},
+			},
+		},
+	};
+
+	// The nine columns following the ID, in order. `alias` is an extra class for
+	// the field; `inside` is the grammar used to tokenize the field's content.
+	const columns = [
+		{ type: 'form' },
+		{ type: 'lemma' },
+		{ type: 'upos', alias: 'symbol' },
+		{ type: 'xpos', alias: 'symbol' },
+		{ type: 'feats', inside: featsGrammar },
+		{ type: 'head', alias: 'number' },
+		{ type: 'deprel', alias: 'symbol' },
+		{ type: 'deps', inside: depsGrammar },
+		{ type: 'misc', inside: featsGrammar },
+	];
+
+	// Assign a column-specific type to every `value` field of a word line.
+	Prism.hooks.add('after-tokenize', function (env) {
+		if (env.language !== 'conllu') {
+			return;
+		}
+
+		for (const row of env.tokens) {
+			// plain strings, comments and sentence separators have no fields
+			if (typeof row === 'string' || row.type !== 'token' || !Array.isArray(row.content)) {
+				continue;
+			}
+
+			// only `value` tokens are fields; this skips the ID and the tabs
+			const fields = row.content.filter(function (part) {
+				return typeof part !== 'string' && part.type === 'value';
+			});
+
+			fields.forEach(function (field, index) {
+				// normalize the alias to an array so that classes can be appended
+				if (field.alias === undefined) {
+					field.alias = [];
+				} else if (typeof field.alias === 'string') {
+					field.alias = [field.alias];
+				}
+
+				const isUnspecified = field.content === '_';
+				if (isUnspecified) {
+					field.alias.push('unspecified');
+				}
+
+				// fields beyond the ninth column keep the generic `value` type
+				const column = columns[index];
+				if (column === undefined) {
+					return;
+				}
+
+				// keep "value" as a class and switch to the column-specific type
+				field.alias.push(field.type);
+				field.type = column.type;
+
+				if (column.alias !== undefined) {
+					field.alias.push(column.alias);
+				} else if (column.inside === undefined) {
+					// plain text columns without a grammar of their own
+					field.alias.push('string');
+				}
+
+				// "_" marks an unspecified value and is left as is
+				if (column.inside !== undefined && !isUnspecified) {
+					field.content = Prism.tokenize(field.content, column.inside);
+				}
+			});
+		}
+	});
+
+}(Prism));
diff --git a/examples/prism-conllu.html b/examples/prism-conllu.html
new file mode 100644
index 0000000000..6883093226
--- /dev/null
+++ b/examples/prism-conllu.html
@@ -0,0 +1,119 @@
+

Full details can be found at Universal Dependencies - Format.

+ +

Comments

+ +
# sent_id = 2
+# text = I have no clue.
+# or a simple string
+ +

Full Example

+ +
# sent_id = 2
+# text = I have no clue.
+1	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1	2	nsubj	_	_
+2	have	have	VERB	VBP	Number=Sing|Person=1|Tense=Pres	0	root	_	_
+3	no	no	DET	DT	PronType=Neg	4	det	_	_
+4	clue	clue	NOUN	NN	Number=Sing	2	obj	_	SpaceAfter=No
+5	.	.	PUNCT	.	_	2	punct	_	_
+ +

Words, Tokens and Empty Nodes

+ +
1-2	vámonos	_
+1	vamos	ir
+2	nos	nosotros
+3-4	al	_
+3	a	a
+4	el	el
+5	mar	mar
+ +
1	Sue	Sue
+2	likes	like
+3	coffee	coffee
+4	and	and
+5	Bill	Bill
+5.1	likes	like
+6	tea	tea
+ +
1	nosotros	nosotros
+2	vamos	ir
+3-4	al	_
+3	a	a
+4	el	el
+5	mar	mar
+6	y	y
+7	vosotros	vosotros
+7.1	vais	ir
+8-9	al	_
+8	a	a
+9	el	el
+10	parque	parque
+ +

Morphological Annotation

+ +
1	Då	då	ADV	AB	_
+2	var	vara	VERB	VB.PRET.ACT	Tense=Past|Voice=Act
+3	han	han	PRON	PN.UTR.SIN.DEF.NOM	Case=Nom|Definite=Def|Gender=Com|Number=Sing
+4	elva	elva	NUM	RG.NOM	Case=Nom|NumType=Card
+5	år	år	NOUN	NN.NEU.PLU.IND.NOM	Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
+6	.	.	PUNCT	DL.MAD	_
+ +

Syntactic Annotation

+ +
1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj
+2	buy	buy	VERB	VBP	Number=Plur|Person=3|Tense=Pres	0	root	0:root
+3	and	and	CCONJ	CC	_	4	cc	4:cc
+4	sell	sell	VERB	VBP	Number=Plur|Person=3|Tense=Pres	2	conj	0:root|2:conj
+5	books	book	NOUN	NNS	Number=Plur	2	obj	2:obj|4:obj
+6	.	.	PUNCT	.	_	2	punct	2:punct
+ +

Untokenized Text

+ +
# text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“).
+# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”).
+1	Er	er	PRON	…	_
+2	arbeitet	arbeiten	VERB	…	_
+3-4	fürs		_	_	…	_
+3	für	für	ADP	…	_
+4	das	der	DET	…	_
+5	FBI	FBI	PROPN	…	_
+6	(	(	PUNCT	…	SpaceAfter=No
+7	deutsch	deutsch	ADV	…	_
+8	etwa		etwa		ADV	…	SpaceAfter=No
+9	:	:	PUNCT	…	_
+10	„	„	PUNCT	…	SpaceAfter=No
+11	Bundesamt	Bundesamt	NOUN	…	_
+12	für	für	ADP	…	_
+13	Ermittlung	Ermittlung	NOUN	…	SpaceAfter=No
+14	“	“	PUNCT	…	SpaceAfter=No
+15	)	)	PUNCT	…	SpaceAfter=No
+16	.	.	PUNCT	…	_
+ +

Sentence Boundaries and Comments

+ +
# sent_id = 1
+# text = They buy and sell books.
+1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj	_
+2	buy	buy	VERB	VBP	Number=Plur|Person=3|Tense=Pres	0	root	0:root	_
+3	and	and	CCONJ	CC	_	4	cc	4:cc	_
+4	sell	sell	VERB	VBP	Number=Plur|Person=3|Tense=Pres	2	conj	0:root|2:conj	_
+5	books	book	NOUN	NNS	Number=Plur	2	obj	2:obj|4:obj	SpaceAfter=No
+6	.	.	PUNCT	.	_	2	punct	2:punct	_
+
+# sent_id = 2
+# text = I have no clue.
+1	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1	2	nsubj	_	_
+2	have	have	VERB	VBP	Number=Sing|Person=1|Tense=Pres	0	root	_	_
+3	no	no	DET	DT	PronType=Neg	4	det	_	_
+4	clue	clue	NOUN	NN	Number=Sing	2	obj	_	SpaceAfter=No
+5	.	.	PUNCT	.	_	2	punct	_	_
+
+# sent_id = panc0.s4
+# text = तत् यथानुश्रूयते।
+# translit = tat yathānuśrūyate.
+# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
+# text_en = This is what is heard.
+1	तत्	तद्	DET	_	Case=Nom|…|PronType=Dem	3	nsubj	_	Translit=tat|LTranslit=tad|Gloss=it
+2-3	यथानुश्रूयते	_	_	_	_	_	_	_	SpaceAfter=No
+2	यथा	यथा	ADV	_	PronType=Rel	3	advmod	_	Translit=yathā|LTranslit=yathā|Gloss=how
+3	अनुश्रूयते	अनु-श्रु	VERB	_	Mood=Ind|…|Voice=Pass	0	root	_	Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
+4	।	।	PUNCT	_	_	3	punct	_	Translit=.|LTranslit=.|Gloss=.
diff --git a/tests/languages/conllu/index_feature.test b/tests/languages/conllu/index_feature.test new file mode 100644 index 0000000000..41d7ea1587 --- /dev/null +++ b/tests/languages/conllu/index_feature.test @@ -0,0 +1,97 @@ +1-2 vámonos _ +1 vamos ir +2 nos nosotros +3-4 al _ +3 a a +4 el el +5 mar mar + +1 Sue Sue +2 likes like +3 coffee coffee +4 and and +5 Bill Bill +5.1 likes like +6 tea tea + +---------------------------------------------------- + +[ + ["token", [ + ["id", "1-2"], + ["form", "vámonos"], + ["lemma", "_"] + ]], + ["token", [ + ["id", "1"], + ["form", "vamos"], + ["lemma", "ir"] + ]], + ["token", [ + ["id", "2"], + ["form", "nos"], + ["lemma", "nosotros"] + ]], + ["token", [ + ["id", "3-4"], + ["form", "al"], + ["lemma", "_"] + ]], + ["token", [ + ["id", "3"], + ["form", "a"], + ["lemma", "a"] + ]], + ["token", [ + ["id", "4"], + ["form", "el"], + ["lemma", "el"] + ]], + ["token", [ + ["id", "5"], + ["form", "mar"], + ["lemma", "mar"] + ]], + ["sentence-separator", ""], + ["token", [ + ["id", "1"], + ["form", "Sue"], + ["lemma", "Sue"] + ]], + ["token", [ + ["id", "2"], + ["form", "likes"], + ["lemma", "like"] + ]], + ["token", [ + ["id", "3"], + ["form", "coffee"], + ["lemma", "coffee"] + ]], + ["token", [ + ["id", "4"], + ["form", "and"], + ["lemma", "and"] + ]], + ["token", [ + ["id", "5"], + ["form", "Bill"], + ["lemma", "Bill"] + ]], + ["token", [ + ["id", "5.1"], + ["form", "likes"], + ["lemma", "like"] + ]], + ["token", [ + ["id", "6"], + ["form", "tea"], + ["lemma", "tea"] + ]] +] + +---------------------------------------------------- + +Testing indexing schemes. + +https://universaldependencies.org/format.html diff --git a/tests/languages/conllu/issue3790.test b/tests/languages/conllu/issue3790.test new file mode 100644 index 0000000000..0b5bfd467c --- /dev/null +++ b/tests/languages/conllu/issue3790.test @@ -0,0 +1,151 @@ +# sent_id = 2 +# text = I have no clue. +# is a normal comment allowed? 
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ +3 no no DET DT PronType=Neg 4 det _ _ +4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No +5 . . PUNCT . _ 2 punct _ _ + +---------------------------------------------------- + +[ + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "2"] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "text"], + ["operator", "="], + ["value", "I have no clue."] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + " is a normal comment allowed?\r" + ]], + ["token", [ + ["id", "1"], + ["form", "I"], + ["lemma", "I"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "2"], + ["form", "have"], + ["lemma", "have"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "3"], + ["form", "no"], + ["lemma", "no"], + ["upos", "DET"], + ["xpos", "DT"], + ["feats", [ + ["feature", [ + ["key", "PronType"], + ["operator", "="], + ["value", "Neg"] + ]] + ]], + ["head", "4"], + ["deprel", "det"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "4"], + ["form", "clue"], + ["lemma", 
"clue"], + ["upos", "NOUN"], + ["xpos", "NN"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]] + ]], + ["head", "2"], + ["deprel", "obj"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "SpaceAfter"], + ["operator", "="], + ["value", "No"] + ]] + ]] + ]], + ["token", [ + ["id", "5"], + ["form", "."], + ["lemma", "."], + ["upos", "PUNCT"], + ["xpos", "."], + ["feats", "_"], + ["head", "2"], + ["deprel", "punct"], + ["deps", "_"], + ["misc", "_"] + ]] +] + +---------------------------------------------------- + +Language feature request. diff --git a/tests/languages/conllu/morphological_annotation.test b/tests/languages/conllu/morphological_annotation.test new file mode 100644 index 0000000000..ffea3ace4d --- /dev/null +++ b/tests/languages/conllu/morphological_annotation.test @@ -0,0 +1,135 @@ +1 Då då ADV AB _ +2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act +3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing +4 elva elva NUM RG.NOM Case=Nom|NumType=Card +5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur +6 . . 
PUNCT DL.MAD _ + +---------------------------------------------------- + +[ + ["token", [ + ["id", "1"], + ["form", "Då"], + ["lemma", "då"], + ["upos", "ADV"], + ["xpos", "AB"], + ["feats", "_"] + ]], + ["token", [ + ["id", "2"], + ["form", "var"], + ["lemma", "vara"], + ["upos", "VERB"], + ["xpos", "VB.PRET.ACT"], + ["feats", [ + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Past"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Voice"], + ["operator", "="], + ["value", "Act"] + ]] + ]] + ]], + ["token", [ + ["id", "3"], + ["form", "han"], + ["lemma", "han"], + ["upos", "PRON"], + ["xpos", "PN.UTR.SIN.DEF.NOM"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Definite"], + ["operator", "="], + ["value", "Def"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gender"], + ["operator", "="], + ["value", "Com"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]] + ]] + ]], + ["token", [ + ["id", "4"], + ["form", "elva"], + ["lemma", "elva"], + ["upos", "NUM"], + ["xpos", "RG.NOM"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "NumType"], + ["operator", "="], + ["value", "Card"] + ]] + ]] + ]], + ["token", [ + ["id", "5"], + ["form", "år"], + ["lemma", "år"], + ["upos", "NOUN"], + ["xpos", "NN.NEU.PLU.IND.NOM"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Definite"], + ["operator", "="], + ["value", "Ind"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gender"], + ["operator", "="], + ["value", "Neut"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]] + ]] + ]], + ["token", [ + ["id", "6"], + ["form", 
"."], + ["lemma", "."], + ["upos", "PUNCT"], + ["xpos", "DL.MAD"], + ["feats", "_"] + ]] +] + +---------------------------------------------------- + +Example for morphological annotation. diff --git a/tests/languages/conllu/sentence_boundaries.test b/tests/languages/conllu/sentence_boundaries.test new file mode 100644 index 0000000000..08ad52c5ac --- /dev/null +++ b/tests/languages/conllu/sentence_boundaries.test @@ -0,0 +1,582 @@ +# sent_id = 1 +# text = They buy and sell books. +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _ +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _ +3 and and CCONJ CC _ 4 cc 4:cc _ +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _ +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No +6 . . PUNCT . _ 2 punct 2:punct _ + +# sent_id = 2 +# text = I have no clue. +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ +3 no no DET DT PronType=Neg 4 det _ _ +4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No +5 . . PUNCT . _ 2 punct _ _ + +# sent_id = panc0.s4 +# text = तत् यथानुश्रूयते। +# translit = tat yathānuśrūyate. +# text_fr = Voilà ce qui nous est parvenu par la tradition orale. +# text_en = This is what is heard. +1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it +2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No +2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how +3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard +4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=. 
+ +---------------------------------------------------- + +[ + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "1"] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "text"], + ["operator", "="], + ["value", "They buy and sell books."] + ]] + ]], + ["token", [ + ["id", "1"], + ["form", "They"], + ["lemma", "they"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "nsubj"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "nsubj"] + ]] + ]], + ["misc", "_"] + ]], + ["token", [ + ["id", "2"], + ["form", "buy"], + ["lemma", "buy"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "3"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", [ + ["dep", [ + ["head", "0"], + ["punctuation", ":"], + ["relation", "root"] + ]] + ]], + ["misc", "_"] + ]], + ["token", [ + ["id", "3"], + ["form", "and"], + ["lemma", "and"], + ["upos", "CCONJ"], + ["xpos", "CC"], + ["feats", "_"], + ["head", "4"], + ["deprel", "cc"], + ["deps", [ + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "cc"] + ]] + ]], + ["misc", "_"] + ]], + ["token", [ + ["id", "4"], + ["form", "sell"], + ["lemma", "sell"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]], + 
["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "3"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "2"], + ["deprel", "conj"], + ["deps", [ + ["dep", [ + ["head", "0"], + ["punctuation", ":"], + ["relation", "root"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "conj"] + ]] + ]], + ["misc", "_"] + ]], + ["token", [ + ["id", "5"], + ["form", "books"], + ["lemma", "book"], + ["upos", "NOUN"], + ["xpos", "NNS"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]] + ]], + ["head", "2"], + ["deprel", "obj"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "obj"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "obj"] + ]] + ]], + ["misc", [ + ["feature", [ + ["key", "SpaceAfter"], + ["operator", "="], + ["value", "No"] + ]] + ]] + ]], + ["token", [ + ["id", "6"], + ["form", "."], + ["lemma", "."], + ["upos", "PUNCT"], + ["xpos", "."], + ["feats", "_"], + ["head", "2"], + ["deprel", "punct"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "punct"] + ]] + ]], + ["misc", "_"] + ]], + ["sentence-separator", ""], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "2"] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "text"], + ["operator", "="], + ["value", "I have no clue."] + ]] + ]], + ["token", [ + ["id", "1"], + ["form", "I"], + ["lemma", "I"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + 
["operator", "="], + ["value", "1"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "2"], + ["form", "have"], + ["lemma", "have"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "3"], + ["form", "no"], + ["lemma", "no"], + ["upos", "DET"], + ["xpos", "DT"], + ["feats", [ + ["feature", [ + ["key", "PronType"], + ["operator", "="], + ["value", "Neg"] + ]] + ]], + ["head", "4"], + ["deprel", "det"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "4"], + ["form", "clue"], + ["lemma", "clue"], + ["upos", "NOUN"], + ["xpos", "NN"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]] + ]], + ["head", "2"], + ["deprel", "obj"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "SpaceAfter"], + ["operator", "="], + ["value", "No"] + ]] + ]] + ]], + ["token", [ + ["id", "5"], + ["form", "."], + ["lemma", "."], + ["upos", "PUNCT"], + ["xpos", "."], + ["feats", "_"], + ["head", "2"], + ["deprel", "punct"], + ["deps", "_"], + ["misc", "_"] + ]], + ["sentence-separator", ""], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "panc0.s4"] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "text"], + ["operator", "="], + ["value", "तत् यथानुश्रूयते।"] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "translit"], + ["operator", "="], + ["value", "tat yathānuśrūyate."] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + 
["key", "text_fr"], + ["operator", "="], + ["value", "Voilà ce qui nous est parvenu par la tradition orale."] + ]] + ]], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "text_en"], + ["operator", "="], + ["value", "This is what is heard."] + ]] + ]], + ["token", [ + ["id", "1"], + ["form", "तत्"], + ["lemma", "तद्"], + ["upos", "DET"], + ["xpos", "_"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + "…", + ["punctuation", "|"], + ["feature", [ + ["key", "PronType"], + ["operator", "="], + ["value", "Dem"] + ]] + ]], + ["head", "3"], + ["deprel", "nsubj"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "Translit"], + ["operator", "="], + ["value", "tat"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "LTranslit"], + ["operator", "="], + ["value", "tad"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gloss"], + ["operator", "="], + ["value", "it"] + ]] + ]] + ]], + ["token", [ + ["id", "2-3"], + ["form", "यथानुश्रूयते"], + ["lemma", "_"], + ["upos", "_"], + ["xpos", "_"], + ["feats", "_"], + ["head", "_"], + ["deprel", "_"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "SpaceAfter"], + ["operator", "="], + ["value", "No"] + ]] + ]] + ]], + ["token", [ + ["id", "2"], + ["form", "यथा"], + ["lemma", "यथा"], + ["upos", "ADV"], + ["xpos", "_"], + ["feats", [ + ["feature", [ + ["key", "PronType"], + ["operator", "="], + ["value", "Rel"] + ]] + ]], + ["head", "3"], + ["deprel", "advmod"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "Translit"], + ["operator", "="], + ["value", "yathā"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "LTranslit"], + ["operator", "="], + ["value", "yathā"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gloss"], + ["operator", "="], + ["value", "how"] + ]] + ]] + ]], + ["token", [ + ["id", "3"], + ["form", "अनुश्रूयते"], + ["lemma", "अनु-श्रु"], + ["upos", "VERB"], + ["xpos", "_"], + 
["feats", [ + ["feature", [ + ["key", "Mood"], + ["operator", "="], + ["value", "Ind"] + ]], + ["punctuation", "|"], + "…", + ["punctuation", "|"], + ["feature", [ + ["key", "Voice"], + ["operator", "="], + ["value", "Pass"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "Translit"], + ["operator", "="], + ["value", "anuśrūyate"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "LTranslit"], + ["operator", "="], + ["value", "anu-śru"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gloss"], + ["operator", "="], + ["value", "it-is-heard"] + ]] + ]] + ]], + ["token", [ + ["id", "4"], + ["form", "।"], + ["lemma", "।"], + ["upos", "PUNCT"], + ["xpos", "_"], + ["feats", "_"], + ["head", "3"], + ["deprel", "punct"], + ["deps", "_"], + ["misc", [ + ["feature", [ + ["key", "Translit"], + ["operator", "="], + ["value", "."] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "LTranslit"], + ["operator", "="], + ["value", "."] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Gloss"], + ["operator", "="], + ["value", "."] + ]] + ]] + ]] +] + +---------------------------------------------------- + +Example for sentence boundaries. 
diff --git a/tests/languages/conllu/sentence_separator_feature.test b/tests/languages/conllu/sentence_separator_feature.test new file mode 100644 index 0000000000..16e9a3fd7a --- /dev/null +++ b/tests/languages/conllu/sentence_separator_feature.test @@ -0,0 +1,122 @@ +# sent_id = 2 +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ + +# sent_id = 3 +1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _ +2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _ + +---------------------------------------------------- + +[ + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "2"] + ]] + ]], + ["token", [ + ["id", "1"], + ["form", "I"], + ["lemma", "I"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", "_"], + ["misc", "_"] + ]], + ["sentence-separator", ""], + ["comment", [ + ["punctuation", "#"], + ["metadata", [ + ["key", "sent_id"], + ["operator", "="], + ["value", "3"] + ]] + ]], + ["token", [ + ["id", "1"], + ["form", "I"], + ["lemma", "I"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", "_"], + ["misc", "_"] + ]], + ["token", [ + ["id", "2"], + ["form", "have"], + ["lemma", "have"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], 
+ ["value", "Sing"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "1"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", "_"], + ["misc", "_"] + ]] +] + +---------------------------------------------------- + +Add sentence separator token. diff --git a/tests/languages/conllu/syntactical_annotation.test b/tests/languages/conllu/syntactical_annotation.test new file mode 100644 index 0000000000..e83bbd6124 --- /dev/null +++ b/tests/languages/conllu/syntactical_annotation.test @@ -0,0 +1,189 @@ +1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj +2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root +3 and and CCONJ CC _ 4 cc 4:cc +4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj +5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj +6 . . PUNCT . _ 2 punct 2:punct + +---------------------------------------------------- + +[ + ["token", [ + ["id", "1"], + ["form", "They"], + ["lemma", "they"], + ["upos", "PRON"], + ["xpos", "PRP"], + ["feats", [ + ["feature", [ + ["key", "Case"], + ["operator", "="], + ["value", "Nom"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]] + ]], + ["head", "2"], + ["deprel", "nsubj"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "nsubj"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "nsubj"] + ]] + ]] + ]], + ["token", [ + ["id", "2"], + ["form", "buy"], + ["lemma", "buy"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "3"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + 
["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "0"], + ["deprel", "root"], + ["deps", [ + ["dep", [ + ["head", "0"], + ["punctuation", ":"], + ["relation", "root"] + ]] + ]] + ]], + ["token", [ + ["id", "3"], + ["form", "and"], + ["lemma", "and"], + ["upos", "CCONJ"], + ["xpos", "CC"], + ["feats", "_"], + ["head", "4"], + ["deprel", "cc"], + ["deps", [ + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "cc"] + ]] + ]] + ]], + ["token", [ + ["id", "4"], + ["form", "sell"], + ["lemma", "sell"], + ["upos", "VERB"], + ["xpos", "VBP"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Person"], + ["operator", "="], + ["value", "3"] + ]], + ["punctuation", "|"], + ["feature", [ + ["key", "Tense"], + ["operator", "="], + ["value", "Pres"] + ]] + ]], + ["head", "2"], + ["deprel", "conj"], + ["deps", [ + ["dep", [ + ["head", "0"], + ["punctuation", ":"], + ["relation", "root"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "conj"] + ]] + ]] + ]], + ["token", [ + ["id", "5"], + ["form", "books"], + ["lemma", "book"], + ["upos", "NOUN"], + ["xpos", "NNS"], + ["feats", [ + ["feature", [ + ["key", "Number"], + ["operator", "="], + ["value", "Plur"] + ]] + ]], + ["head", "2"], + ["deprel", "obj"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "obj"] + ]], + ["punctuation", "|"], + ["dep", [ + ["head", "4"], + ["punctuation", ":"], + ["relation", "obj"] + ]] + ]] + ]], + ["token", [ + ["id", "6"], + ["form", "."], + ["lemma", "."], + ["upos", "PUNCT"], + ["xpos", "."], + ["feats", "_"], + ["head", "2"], + ["deprel", "punct"], + ["deps", [ + ["dep", [ + ["head", "2"], + ["punctuation", ":"], + ["relation", "punct"] + ]] + ]] + ]] +] + +---------------------------------------------------- + +Example for syntactical annotation.