Skip to content

Commit

Permalink
first version of CONLL-U to FoLiA converter #53
Browse files Browse the repository at this point in the history
  • Loading branch information
proycon committed Aug 17, 2018
1 parent 1fd1f87 commit 7d26f8d
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 19 deletions.
49 changes: 32 additions & 17 deletions foliatools/conllu2folia.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@
from pynlpl.formats import folia

UPOS_SET = "https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl"
UDEP_SET = "https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-dependencies.foliaset.ttl"


def main():
parser = argparse.ArgumentParser(description="CONLL-U to FoLiA converter", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--id',type=str,help="Document ID for the FoLiA document", action='store',default="",required=False)
parser.add_argument('--lemma-set',dest="lemmaset", type=str,help="URL of the set definition for lemmas", action='store',default="undefined",required=False)
parser.add_argument('--pos-set',dest="posset",type=str,help="URL of the set definition for language-specific part-of-speech and features (xpos)", action='store',default="undefined",required=False)
parser.add_argument('--dependency-set',dest="depset", type=str,help="Dependency set", action='store',default="undefined", required=False)
parser.add_argument('--pos-set',dest="posset",type=str,help="URL of the set definition for *language-specific* part-of-speech and features (xpos and not the universal pos!)", action='store',default="undefined",required=False)
parser.add_argument('--dependency-set',dest="depset", type=str,help="Dependency set", action='store',default=UDEP_SET, required=False)
parser.add_argument('-o', '--outputdir',type=str,help="Output directory", action='store',default=".", required=False)
parser.add_argument('files', nargs='+', help='CONLL-U input files')
args = parser.parse_args()
Expand All @@ -48,11 +49,16 @@ def main():
for i, tokenlist in enumerate(sentences):
if 'newdoc id' in tokenlist.metadata or i == 0:
if doc is not None and hascontent:
doc.save(os.path.join(args.outputdir), doc_id + ".folia.xml")
doc.save(os.path.join(args.outputdir, doc_id + ".folia.xml"))
print("Wrote " + doc_id + ".folia.xml",file=sys.stderr)
if 'newdoc id' in tokenlist.metadata:
doc_id = tokenlist.metadata['newdoc id']
hascontent = False
doc = folia.Document(id=doc_id, autodeclare=True)
doc = folia.Document(id=doc_id)
doc.declare(folia.PosAnnotation, set=UPOS_SET, annotator="conll2folia")
doc.declare(folia.PosAnnotation, set=args.posset, annotator="conll2folia")
doc.declare(folia.Dependency, set=args.depset, annotator="conll2folia")
doc.declare(folia.LemmaAnnotation, set=args.lemmaset, annotator="conll2folia")
textbody = folia.Text(doc, id=doc_id+'.text')
doc.append(textbody)
anchor = textbody
Expand All @@ -63,29 +69,38 @@ def main():
else:
sent_id = doc_id + '.s.' + str(i+1)
sentence = folia.Sentence(doc, id=sent_id)
wordindex = {} #quick lookup index for this sentence
for token in tokenlist:
word = sentence.append(folia.Word, token['form'], id=sent_id + ".w." + token['id'])
if token['misc'] and 'SpaceAfter' in token['misc'] and token['misc']['SpaceAfter'].lower() == 'no':
space = False
else:
space = True
word = sentence.add(folia.Word, token['form'], id=sent_id + ".w." + str(token['id']), space=space)
wordindex[token['id']] = word
if token['upostag']:
pos = word.append(folia.PosAnnotation, cls=token['upostag'], set=UPOS_SET)
if token['feats']:
pos = word.add(folia.PosAnnotation, cls=token['upostag'], set=UPOS_SET)
if isinstance(token['feats'], dict):
for subset, cls in token['feats'].items():
pos.append(folia.Feature(subset=subset,cls=cls))
pos.add(folia.Feature,subset=subset,cls=cls)
if token['xpostag']:
word.append(folia.PosAnnotation, cls=token['xpostag'], set=args.posset)
if token['feats'] and not token['upostag']:
if isinstance(token['feats'], dict) and not token['upostag']:
for subset, cls in token['feats'].items():
pos.append(folia.Feature(subset=subset,cls=cls))
pos.add(folia.Feature,subset=subset,cls=cls)
if token['lemma']:
word.append(folia.LemmaAnnotation, cls=token['lemma'], set=args.lemmaset)
word.add(folia.LemmaAnnotation, cls=token['lemma'], set=args.lemmaset)
hascontent = True
anchor.append(sentence)
for token in tokenlist:
if token['head'] and token['deprel']:
sentence.add(folia.Dependency(doc, set=args.depset, cls=token['deprel'], contents=[
folia.DependencyHead(doc, wordindex[token['head']]),
folia.DependencyDependent(doc, wordindex[token['id']])
]))
anchor.add(sentence)

if doc is not None and hascontent:
doc.save(os.path.join(args.outputdir), doc_id + ".folia.xml")




doc.save(os.path.join(args.outputdir, doc_id + ".folia.xml"))
print("Wrote " + doc_id + ".folia.xml",file=sys.stderr)

if __name__ == '__main__':
main()
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def read(fname):

setup(
name = "FoLiA-tools",
version = "1.5.1.60", #point to this from pynlpl/tests/folia.py
version = "1.5.1.61", #point to this from pynlpl/tests/folia.py
author = "Maarten van Gompel",
author_email = "proycon@anaproy.nl",
description = ("FoLiA-tools contains various Python-based command line tools for working with FoLiA XML (Format for Linguistic Annotation)"),
Expand Down Expand Up @@ -56,9 +56,10 @@ def read(fname):
'foliatree = foliatools.foliatree:main',
'foliasetdefinition = foliatools.foliasetdefinition:main',
'foliaeval = foliatools.foliaeval:main',
'conllu2folia = foliatools.conllu2folia:main',
]
},
#include_package_data=True,
package_data = {'foliatools': ['*.xsl', '../schemas/*.yml','../schemas/*.rng']},
install_requires=['pynlpl >= 1.2.5', 'lxml >= 2.2','docutils', 'pyyaml']
install_requires=['pynlpl >= 1.2.5', 'lxml >= 2.2','docutils', 'pyyaml', 'conllu']
)

0 comments on commit 7d26f8d

Please sign in to comment.