/*
Patch metadata preserved verbatim from the original diff (line breaks restored):

diff --git a/components.json b/components.json
index b353778e90..e0eb7433fe 100644
--- a/components.json
+++ b/components.json
@@ -342,6 +342,10 @@
   "alias": "conc",
   "owner": "jasontatton"
  },
+ "conllu": {
+  "title": "CoNLL-U",
+  "owner": "Querela"
+ },
  "csp": {
   "title": "Content-Security-Policy",
   "owner": "ScottHelme"
diff --git a/components/prism-conllu.js b/components/prism-conllu.js
new file mode 100644
index 0000000000..f425a0ec57
--- /dev/null
+++ b/components/prism-conllu.js
@@ -0,0 +1,162 @@
*/
(function (Prism) {

  /**
   * Prism grammar for CoNLL-U.
   *
   * The top-level grammar splits the input into three kinds of tokens:
   *  - `comment`: a line starting with `#`, optionally holding `key = value`
   *    metadata;
   *  - `sentence-separator`: a zero-width marker at the blank line between two
   *    sentence blocks;
   *  - `token`: any other non-empty line, split into an `id` and a sequence of
   *    tab-separated `value` fields.
   *
   * The generic `value` fields are renamed to their column role (form, lemma,
   * ...) by the `after-tokenize` hook registered further below.
   */
  Prism.languages.conllu = {
    // Anchored to the start of a line so that a `#` inside a word line (for
    // example a token whose form is "#") is not mistaken for a comment.
    // `[^\n]` is used instead of `.` so a trailing `\r` stays part of the
    // comment exactly as before.
    comment: {
      pattern: /^#[^\n]*/m,
      inside: {
        metadata: {
          pattern: /\w+\s*=\s*.*/,
          inside: {
            // Anchored: only the leading word is the key. Without the anchor a
            // value such as "a = b" would be split into a second key.
            key: {
              pattern: /^\w+(?=\s*=)/,
              alias: 'property',
            },
            value: {
              pattern: /(\s*=\s*)\S.*$/,
              lookbehind: true,
              alias: 'string',
            },
            operator: /=/,
          },
        },
        punctuation: /^#/,
      },
    },
    // Zero-width token (the line break itself is consumed by the lookbehind
    // group) emitted where one line break is directly followed by another.
    'sentence-separator': {
      pattern: /(\r?\n)(?=\r?\n)/,
      lookbehind: true,
    },
    // Word lines: everything that is neither a comment nor an empty line.
    token: {
      pattern: /.+/,
      inside: {
        // Plain index ("5"), range ("1-2") or decimal index ("5.1").
        id: {
          pattern: /^\d+(?:[.-]\d+)?/,
          alias: 'number',
        },
        // One tab-separated field; the leading tab is dropped via lookbehind.
        // Which column it is gets resolved by the hook below.
        value: {
          pattern: /^(\t)[^\t]*(?=\t|$)/,
          lookbehind: true,
        },
      },
    },
  };

  // Feature name: capitalised identifier with an optional `[layer]` suffix.
  const featKeySource = /[A-Z][A-Za-z0-9]*(?:\[[a-z0-9]+\])?/.source;

  // Sub-grammar for `Key=Value|Key=Value` lists (used for FEATS and MISC).
  // `|` is matched first, so every remaining piece is a single `Key=Value`
  // string and the `^`/`$` anchors below apply per feature.
  const featsGrammar = {
    punctuation: /\|/,
    feature: {
      pattern: new RegExp(`^${featKeySource}=.*$`),
      inside: {
        // Anchored so that an `=` inside the value cannot start a second key.
        key: {
          pattern: new RegExp(`^${featKeySource}(?==)`),
          alias: 'property',
        },
        value: [
          {
            pattern: /(=)(?:yes|no)$/i,
            lookbehind: true,
            alias: 'boolean',
          },
          {
            // Anything after the `=` is accepted as the value.
            pattern: /(=).+$/,
            lookbehind: true,
            alias: 'string',
          },
        ],
        operator: /=/,
      },
    },
  };

  // Sub-grammar for `head:relation|head:relation` lists (used for DEPS).
  const depsGrammar = {
    punctuation: /\|/,
    dep: {
      pattern: /^\S+$/,
      inside: {
        // The head may be a decimal id such as "5.1", so the fractional part
        // is accepted; the anchor keeps digits inside the relation from being
        // picked up as another head.
        head: {
          pattern: /^\d+(?:\.\d+)?(?=:)/,
          alias: 'number',
        },
        punctuation: /^:/,
        relation: {
          // Whatever is left after `head:` is taken as the relation.
          pattern: /.+/,
          alias: 'symbol',
        },
      },
    },
  };

  // Role of each `value` field by position after the id: the token type it is
  // renamed to, an optional extra alias, and an optional grammar used to
  // tokenize its content further.
  const fieldSpecs = [
    { type: 'form', alias: null, inside: null },
    { type: 'lemma', alias: null, inside: null },
    { type: 'upos', alias: 'symbol', inside: null },
    { type: 'xpos', alias: 'symbol', inside: null },
    { type: 'feats', alias: null, inside: featsGrammar },
    { type: 'head', alias: 'number', inside: null },
    { type: 'deprel', alias: 'symbol', inside: null },
    { type: 'deps', alias: null, inside: depsGrammar },
    { type: 'misc', alias: null, inside: featsGrammar },
  ];

  /**
   * Post-processes the token stream of a `conllu` highlight run: every `value`
   * field of a word line is renamed to its column role, gets `value` (plus a
   * role-specific alias, or `string` for plain text columns) added as alias,
   * and is tokenized with a nested grammar where one is configured. Fields
   * whose content is exactly `_` additionally get the `unspecified` alias and
   * are never tokenized further.
   */
  Prism.hooks.add('after-tokenize', function (env) {
    if (env.language !== 'conllu') {
      return;
    }

    for (const row of env.tokens) {
      // Only word lines carry fields; comments and separators are left alone.
      if (typeof row === 'string' || row.type !== 'token' || !Array.isArray(row.content)) {
        continue;
      }

      let column = 0;
      for (const field of row.content) {
        // Skip the raw text between fields and the leading id token.
        if (typeof field === 'string' || field.type !== 'value') {
          continue;
        }

        // Normalise the alias to an array so further aliases can be appended.
        if (field.alias === undefined) {
          field.alias = [];
        } else if (typeof field.alias === 'string') {
          field.alias = [field.alias];
        }

        const isUnspecified = field.content === '_';
        if (isUnspecified) {
          field.alias.push('unspecified');
        }

        // Fields beyond the last known column keep the generic `value` type.
        if (column < fieldSpecs.length) {
          const spec = fieldSpecs[column];

          // Keep the generic type as an alias, then switch to the role name.
          field.alias.push(field.type);
          field.type = spec.type;

          if (spec.alias !== null) {
            field.alias.push(spec.alias);
          } else if (spec.inside === null) {
            // Plain text columns without nested tokens are styled as strings.
            field.alias.push('string');
          }

          if (!isUnspecified && spec.inside !== null) {
            field.content = Prism.tokenize(field.content, spec.inside);
          }
        }

        column += 1;
      }
    }
  });

  // TODO: the role-specific token types are assigned by the hook above and so
  // are not grammar keys; consider registering never-matching placeholder
  // rules (e.g. /\b\B/) so they are listed on
  // /faq.html#how-do-i-know-which-tokens-i-can-style-for. The performance
  // impact of such rules has not been measured.

}(Prism));
/*
Patch header of the next file in the original diff, preserved verbatim:

diff --git a/examples/prism-conllu.html b/examples/prism-conllu.html
new file mode 100644
index 0000000000..6883093226
--- /dev/null
+++ b/examples/prism-conllu.html
@@ -0,0 +1,119 @@
+
*/
Full details can be found at Universal Dependencies - Format.
+ +# sent_id = 2
+# text = I have no clue.
+# or a simple string
+
+# sent_id = 2
+# text = I have no clue.
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
+3 no no DET DT PronType=Neg 4 det _ _
+4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
+5 . . PUNCT . _ 2 punct _ _
+
+1-2 vámonos _
+1 vamos ir
+2 nos nosotros
+3-4 al _
+3 a a
+4 el el
+5 mar mar
+
+1 Sue Sue
+2 likes like
+3 coffee coffee
+4 and and
+5 Bill Bill
+5.1 likes like
+6 tea tea
+
+1 nosotros nosotros
+2 vamos ir
+3-4 al _
+3 a a
+4 el el
+5 mar mar
+6 y y
+7 vosotros vosotros
+7.1 vais ir
+8-9 al _
+8 a a
+9 el el
+10 parque parque
+
+1 Då då ADV AB _
+2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act
+3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing
+4 elva elva NUM RG.NOM Case=Nom|NumType=Card
+5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
+6 . . PUNCT DL.MAD _
+
+1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj
+2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root
+3 and and CCONJ CC _ 4 cc 4:cc
+4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj
+5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj
+6 . . PUNCT . _ 2 punct 2:punct
+
+# text = Er arbeitet fürs FBI (deutsch etwa: „Bundesamt für Ermittlung“).
+# text_en = He works for the FBI (German approx: “Bundesamt für Ermittlung”).
+1 Er er PRON … _
+2 arbeitet arbeiten VERB … _
+3-4 fürs _ _ … _
+3 für für ADP … _
+4 das der DET … _
+5 FBI FBI PROPN … _
+6 ( ( PUNCT … SpaceAfter=No
+7 deutsch deutsch ADV … _
+8 etwa etwa ADV … SpaceAfter=No
+9 : : PUNCT … _
+10 „ „ PUNCT … SpaceAfter=No
+11 Bundesamt Bundesamt NOUN … _
+12 für für ADP … _
+13 Ermittlung Ermittlung NOUN … SpaceAfter=No
+14 “ “ PUNCT … SpaceAfter=No
+15 ) ) PUNCT … SpaceAfter=No
+16 . . PUNCT … _
+
+# sent_id = 1
+# text = They buy and sell books.
+1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
+2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
+3 and and CCONJ CC _ 4 cc 4:cc _
+4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
+5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
+6 . . PUNCT . _ 2 punct 2:punct _
+
+# sent_id = 2
+# text = I have no clue.
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
+3 no no DET DT PronType=Neg 4 det _ _
+4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
+5 . . PUNCT . _ 2 punct _ _
+
+# sent_id = panc0.s4
+# text = तत् यथानुश्रूयते।
+# translit = tat yathānuśrūyate.
+# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
+# text_en = This is what is heard.
+1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it
+2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No
+2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how
+3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
+4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=.
diff --git a/tests/languages/conllu/index_feature.test b/tests/languages/conllu/index_feature.test
new file mode 100644
index 0000000000..41d7ea1587
--- /dev/null
+++ b/tests/languages/conllu/index_feature.test
@@ -0,0 +1,97 @@
+1-2 vámonos _
+1 vamos ir
+2 nos nosotros
+3-4 al _
+3 a a
+4 el el
+5 mar mar
+
+1 Sue Sue
+2 likes like
+3 coffee coffee
+4 and and
+5 Bill Bill
+5.1 likes like
+6 tea tea
+
+----------------------------------------------------
+
+[
+ ["token", [
+ ["id", "1-2"],
+ ["form", "vámonos"],
+ ["lemma", "_"]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "vamos"],
+ ["lemma", "ir"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "nos"],
+ ["lemma", "nosotros"]
+ ]],
+ ["token", [
+ ["id", "3-4"],
+ ["form", "al"],
+ ["lemma", "_"]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "a"],
+ ["lemma", "a"]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "el"],
+ ["lemma", "el"]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "mar"],
+ ["lemma", "mar"]
+ ]],
+ ["sentence-separator", ""],
+ ["token", [
+ ["id", "1"],
+ ["form", "Sue"],
+ ["lemma", "Sue"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "likes"],
+ ["lemma", "like"]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "coffee"],
+ ["lemma", "coffee"]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "and"],
+ ["lemma", "and"]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "Bill"],
+ ["lemma", "Bill"]
+ ]],
+ ["token", [
+ ["id", "5.1"],
+ ["form", "likes"],
+ ["lemma", "like"]
+ ]],
+ ["token", [
+ ["id", "6"],
+ ["form", "tea"],
+ ["lemma", "tea"]
+ ]]
+]
+
+----------------------------------------------------
+
+Testing indexing schemes.
+
+https://universaldependencies.org/format.html
diff --git a/tests/languages/conllu/issue3790.test b/tests/languages/conllu/issue3790.test
new file mode 100644
index 0000000000..0b5bfd467c
--- /dev/null
+++ b/tests/languages/conllu/issue3790.test
@@ -0,0 +1,151 @@
+# sent_id = 2
+# text = I have no clue.
+# is a normal comment allowed?
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
+3 no no DET DT PronType=Neg 4 det _ _
+4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
+5 . . PUNCT . _ 2 punct _ _
+
+----------------------------------------------------
+
+[
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "2"]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text"],
+ ["operator", "="],
+ ["value", "I have no clue."]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ " is a normal comment allowed?\r"
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "I"],
+ ["lemma", "I"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "have"],
+ ["lemma", "have"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "no"],
+ ["lemma", "no"],
+ ["upos", "DET"],
+ ["xpos", "DT"],
+ ["feats", [
+ ["feature", [
+ ["key", "PronType"],
+ ["operator", "="],
+ ["value", "Neg"]
+ ]]
+ ]],
+ ["head", "4"],
+ ["deprel", "det"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "clue"],
+ ["lemma", "clue"],
+ ["upos", "NOUN"],
+ ["xpos", "NN"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "obj"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "SpaceAfter"],
+ ["operator", "="],
+ ["value", "No"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "."],
+ ["lemma", "."],
+ ["upos", "PUNCT"],
+ ["xpos", "."],
+ ["feats", "_"],
+ ["head", "2"],
+ ["deprel", "punct"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]]
+]
+
+----------------------------------------------------
+
+Language feature request.
diff --git a/tests/languages/conllu/morphological_annotation.test b/tests/languages/conllu/morphological_annotation.test
new file mode 100644
index 0000000000..ffea3ace4d
--- /dev/null
+++ b/tests/languages/conllu/morphological_annotation.test
@@ -0,0 +1,135 @@
+1 Då då ADV AB _
+2 var vara VERB VB.PRET.ACT Tense=Past|Voice=Act
+3 han han PRON PN.UTR.SIN.DEF.NOM Case=Nom|Definite=Def|Gender=Com|Number=Sing
+4 elva elva NUM RG.NOM Case=Nom|NumType=Card
+5 år år NOUN NN.NEU.PLU.IND.NOM Case=Nom|Definite=Ind|Gender=Neut|Number=Plur
+6 . . PUNCT DL.MAD _
+
+----------------------------------------------------
+
+[
+ ["token", [
+ ["id", "1"],
+ ["form", "Då"],
+ ["lemma", "då"],
+ ["upos", "ADV"],
+ ["xpos", "AB"],
+ ["feats", "_"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "var"],
+ ["lemma", "vara"],
+ ["upos", "VERB"],
+ ["xpos", "VB.PRET.ACT"],
+ ["feats", [
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Past"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Voice"],
+ ["operator", "="],
+ ["value", "Act"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "han"],
+ ["lemma", "han"],
+ ["upos", "PRON"],
+ ["xpos", "PN.UTR.SIN.DEF.NOM"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Definite"],
+ ["operator", "="],
+ ["value", "Def"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gender"],
+ ["operator", "="],
+ ["value", "Com"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "elva"],
+ ["lemma", "elva"],
+ ["upos", "NUM"],
+ ["xpos", "RG.NOM"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "NumType"],
+ ["operator", "="],
+ ["value", "Card"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "år"],
+ ["lemma", "år"],
+ ["upos", "NOUN"],
+ ["xpos", "NN.NEU.PLU.IND.NOM"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Definite"],
+ ["operator", "="],
+ ["value", "Ind"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gender"],
+ ["operator", "="],
+ ["value", "Neut"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "6"],
+ ["form", "."],
+ ["lemma", "."],
+ ["upos", "PUNCT"],
+ ["xpos", "DL.MAD"],
+ ["feats", "_"]
+ ]]
+]
+
+----------------------------------------------------
+
+Example for morphological annotation.
diff --git a/tests/languages/conllu/sentence_boundaries.test b/tests/languages/conllu/sentence_boundaries.test
new file mode 100644
index 0000000000..08ad52c5ac
--- /dev/null
+++ b/tests/languages/conllu/sentence_boundaries.test
@@ -0,0 +1,582 @@
+# sent_id = 1
+# text = They buy and sell books.
+1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
+2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root _
+3 and and CCONJ CC _ 4 cc 4:cc _
+4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj _
+5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
+6 . . PUNCT . _ 2 punct 2:punct _
+
+# sent_id = 2
+# text = I have no clue.
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
+3 no no DET DT PronType=Neg 4 det _ _
+4 clue clue NOUN NN Number=Sing 2 obj _ SpaceAfter=No
+5 . . PUNCT . _ 2 punct _ _
+
+# sent_id = panc0.s4
+# text = तत् यथानुश्रूयते।
+# translit = tat yathānuśrūyate.
+# text_fr = Voilà ce qui nous est parvenu par la tradition orale.
+# text_en = This is what is heard.
+1 तत् तद् DET _ Case=Nom|…|PronType=Dem 3 nsubj _ Translit=tat|LTranslit=tad|Gloss=it
+2-3 यथानुश्रूयते _ _ _ _ _ _ _ SpaceAfter=No
+2 यथा यथा ADV _ PronType=Rel 3 advmod _ Translit=yathā|LTranslit=yathā|Gloss=how
+3 अनुश्रूयते अनु-श्रु VERB _ Mood=Ind|…|Voice=Pass 0 root _ Translit=anuśrūyate|LTranslit=anu-śru|Gloss=it-is-heard
+4 । । PUNCT _ _ 3 punct _ Translit=.|LTranslit=.|Gloss=.
+
+----------------------------------------------------
+
+[
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "1"]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text"],
+ ["operator", "="],
+ ["value", "They buy and sell books."]
+ ]]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "They"],
+ ["lemma", "they"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "nsubj"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "nsubj"]
+ ]]
+ ]],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "buy"],
+ ["lemma", "buy"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "3"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", [
+ ["dep", [
+ ["head", "0"],
+ ["punctuation", ":"],
+ ["relation", "root"]
+ ]]
+ ]],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "and"],
+ ["lemma", "and"],
+ ["upos", "CCONJ"],
+ ["xpos", "CC"],
+ ["feats", "_"],
+ ["head", "4"],
+ ["deprel", "cc"],
+ ["deps", [
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "cc"]
+ ]]
+ ]],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "sell"],
+ ["lemma", "sell"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "3"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "conj"],
+ ["deps", [
+ ["dep", [
+ ["head", "0"],
+ ["punctuation", ":"],
+ ["relation", "root"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "conj"]
+ ]]
+ ]],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "books"],
+ ["lemma", "book"],
+ ["upos", "NOUN"],
+ ["xpos", "NNS"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "obj"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "obj"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "obj"]
+ ]]
+ ]],
+ ["misc", [
+ ["feature", [
+ ["key", "SpaceAfter"],
+ ["operator", "="],
+ ["value", "No"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "6"],
+ ["form", "."],
+ ["lemma", "."],
+ ["upos", "PUNCT"],
+ ["xpos", "."],
+ ["feats", "_"],
+ ["head", "2"],
+ ["deprel", "punct"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "punct"]
+ ]]
+ ]],
+ ["misc", "_"]
+ ]],
+ ["sentence-separator", ""],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "2"]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text"],
+ ["operator", "="],
+ ["value", "I have no clue."]
+ ]]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "I"],
+ ["lemma", "I"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "have"],
+ ["lemma", "have"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "no"],
+ ["lemma", "no"],
+ ["upos", "DET"],
+ ["xpos", "DT"],
+ ["feats", [
+ ["feature", [
+ ["key", "PronType"],
+ ["operator", "="],
+ ["value", "Neg"]
+ ]]
+ ]],
+ ["head", "4"],
+ ["deprel", "det"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "clue"],
+ ["lemma", "clue"],
+ ["upos", "NOUN"],
+ ["xpos", "NN"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "obj"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "SpaceAfter"],
+ ["operator", "="],
+ ["value", "No"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "."],
+ ["lemma", "."],
+ ["upos", "PUNCT"],
+ ["xpos", "."],
+ ["feats", "_"],
+ ["head", "2"],
+ ["deprel", "punct"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["sentence-separator", ""],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "panc0.s4"]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text"],
+ ["operator", "="],
+ ["value", "तत् यथानुश्रूयते।"]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "translit"],
+ ["operator", "="],
+ ["value", "tat yathānuśrūyate."]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text_fr"],
+ ["operator", "="],
+ ["value", "Voilà ce qui nous est parvenu par la tradition orale."]
+ ]]
+ ]],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "text_en"],
+ ["operator", "="],
+ ["value", "This is what is heard."]
+ ]]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "तत्"],
+ ["lemma", "तद्"],
+ ["upos", "DET"],
+ ["xpos", "_"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ "…",
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "PronType"],
+ ["operator", "="],
+ ["value", "Dem"]
+ ]]
+ ]],
+ ["head", "3"],
+ ["deprel", "nsubj"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "Translit"],
+ ["operator", "="],
+ ["value", "tat"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "LTranslit"],
+ ["operator", "="],
+ ["value", "tad"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gloss"],
+ ["operator", "="],
+ ["value", "it"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "2-3"],
+ ["form", "यथानुश्रूयते"],
+ ["lemma", "_"],
+ ["upos", "_"],
+ ["xpos", "_"],
+ ["feats", "_"],
+ ["head", "_"],
+ ["deprel", "_"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "SpaceAfter"],
+ ["operator", "="],
+ ["value", "No"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "यथा"],
+ ["lemma", "यथा"],
+ ["upos", "ADV"],
+ ["xpos", "_"],
+ ["feats", [
+ ["feature", [
+ ["key", "PronType"],
+ ["operator", "="],
+ ["value", "Rel"]
+ ]]
+ ]],
+ ["head", "3"],
+ ["deprel", "advmod"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "Translit"],
+ ["operator", "="],
+ ["value", "yathā"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "LTranslit"],
+ ["operator", "="],
+ ["value", "yathā"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gloss"],
+ ["operator", "="],
+ ["value", "how"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "अनुश्रूयते"],
+ ["lemma", "अनु-श्रु"],
+ ["upos", "VERB"],
+ ["xpos", "_"],
+ ["feats", [
+ ["feature", [
+ ["key", "Mood"],
+ ["operator", "="],
+ ["value", "Ind"]
+ ]],
+ ["punctuation", "|"],
+ "…",
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Voice"],
+ ["operator", "="],
+ ["value", "Pass"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "Translit"],
+ ["operator", "="],
+ ["value", "anuśrūyate"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "LTranslit"],
+ ["operator", "="],
+ ["value", "anu-śru"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gloss"],
+ ["operator", "="],
+ ["value", "it-is-heard"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "।"],
+ ["lemma", "।"],
+ ["upos", "PUNCT"],
+ ["xpos", "_"],
+ ["feats", "_"],
+ ["head", "3"],
+ ["deprel", "punct"],
+ ["deps", "_"],
+ ["misc", [
+ ["feature", [
+ ["key", "Translit"],
+ ["operator", "="],
+ ["value", "."]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "LTranslit"],
+ ["operator", "="],
+ ["value", "."]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Gloss"],
+ ["operator", "="],
+ ["value", "."]
+ ]]
+ ]]
+ ]]
+]
+
+----------------------------------------------------
+
+Example for sentence boundaries.
diff --git a/tests/languages/conllu/sentence_separator_feature.test b/tests/languages/conllu/sentence_separator_feature.test
new file mode 100644
index 0000000000..16e9a3fd7a
--- /dev/null
+++ b/tests/languages/conllu/sentence_separator_feature.test
@@ -0,0 +1,122 @@
+# sent_id = 2
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+
+# sent_id = 3
+1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj _ _
+2 have have VERB VBP Number=Sing|Person=1|Tense=Pres 0 root _ _
+
+----------------------------------------------------
+
+[
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "2"]
+ ]]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "I"],
+ ["lemma", "I"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["sentence-separator", ""],
+ ["comment", [
+ ["punctuation", "#"],
+ ["metadata", [
+ ["key", "sent_id"],
+ ["operator", "="],
+ ["value", "3"]
+ ]]
+ ]],
+ ["token", [
+ ["id", "1"],
+ ["form", "I"],
+ ["lemma", "I"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "have"],
+ ["lemma", "have"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Sing"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "1"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", "_"],
+ ["misc", "_"]
+ ]]
+]
+
+----------------------------------------------------
+
+Add sentence separator token.
diff --git a/tests/languages/conllu/syntactical_annotation.test b/tests/languages/conllu/syntactical_annotation.test
new file mode 100644
index 0000000000..e83bbd6124
--- /dev/null
+++ b/tests/languages/conllu/syntactical_annotation.test
@@ -0,0 +1,189 @@
+1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj
+2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres 0 root 0:root
+3 and and CCONJ CC _ 4 cc 4:cc
+4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 0:root|2:conj
+5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj
+6 . . PUNCT . _ 2 punct 2:punct
+
+----------------------------------------------------
+
+[
+ ["token", [
+ ["id", "1"],
+ ["form", "They"],
+ ["lemma", "they"],
+ ["upos", "PRON"],
+ ["xpos", "PRP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Case"],
+ ["operator", "="],
+ ["value", "Nom"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "nsubj"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "nsubj"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "nsubj"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "2"],
+ ["form", "buy"],
+ ["lemma", "buy"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "3"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "0"],
+ ["deprel", "root"],
+ ["deps", [
+ ["dep", [
+ ["head", "0"],
+ ["punctuation", ":"],
+ ["relation", "root"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "3"],
+ ["form", "and"],
+ ["lemma", "and"],
+ ["upos", "CCONJ"],
+ ["xpos", "CC"],
+ ["feats", "_"],
+ ["head", "4"],
+ ["deprel", "cc"],
+ ["deps", [
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "cc"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "4"],
+ ["form", "sell"],
+ ["lemma", "sell"],
+ ["upos", "VERB"],
+ ["xpos", "VBP"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Person"],
+ ["operator", "="],
+ ["value", "3"]
+ ]],
+ ["punctuation", "|"],
+ ["feature", [
+ ["key", "Tense"],
+ ["operator", "="],
+ ["value", "Pres"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "conj"],
+ ["deps", [
+ ["dep", [
+ ["head", "0"],
+ ["punctuation", ":"],
+ ["relation", "root"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "conj"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "5"],
+ ["form", "books"],
+ ["lemma", "book"],
+ ["upos", "NOUN"],
+ ["xpos", "NNS"],
+ ["feats", [
+ ["feature", [
+ ["key", "Number"],
+ ["operator", "="],
+ ["value", "Plur"]
+ ]]
+ ]],
+ ["head", "2"],
+ ["deprel", "obj"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "obj"]
+ ]],
+ ["punctuation", "|"],
+ ["dep", [
+ ["head", "4"],
+ ["punctuation", ":"],
+ ["relation", "obj"]
+ ]]
+ ]]
+ ]],
+ ["token", [
+ ["id", "6"],
+ ["form", "."],
+ ["lemma", "."],
+ ["upos", "PUNCT"],
+ ["xpos", "."],
+ ["feats", "_"],
+ ["head", "2"],
+ ["deprel", "punct"],
+ ["deps", [
+ ["dep", [
+ ["head", "2"],
+ ["punctuation", ":"],
+ ["relation", "punct"]
+ ]]
+ ]]
+ ]]
+]
+
+----------------------------------------------------
+
+Example for syntactical annotation.