diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py
index 76176ea..c9e771c 100644
--- a/mwe_query/canonicalform.py
+++ b/mwe_query/canonicalform.py
@@ -4,17 +4,17 @@
"""
from typing import Dict, List, Optional, Set, Tuple
-from sastatypes import SynTree
+from sastadev.sastatypes import SynTree
import re
import sys
-from treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \
+from sastadev.treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \
getindexednodesmap, getbasicindexednodesmap, clausebodycats
import lxml.etree as ET
import copy
-from adpositions import vzazindex
-from alpinoparsing import parse
-from lcat import expandnonheadwords
+from mwe_query.adpositions import vzazindex
+from sastadev.alpinoparsing import parse
+from mwe_query.lcat import expandnonheadwords
Xpathexpression = str
@@ -688,6 +688,16 @@ def expandsu(vc: SynTree, subject: SynTree) -> SynTree:
return newvc
+def adaptvzlemma(lemma: str) -> str:
+    """Map the lemma 'met' to 'mee' and 'tot' to 'toe'; return any other lemma unchanged."""
+    # The literals carry no padding, matching the bare 'mee'/'toe' that adaptvzlemma_inv handles.
+    if lemma == 'met':
+        return 'mee'
+    if lemma == 'tot':
+        return 'toe'
+    return lemma
+
+
def getpronadv(lemma, rel, rprons={}):
newnode = mknode()
newlemma = adaptvzlemma(lemma)
@@ -1329,7 +1339,7 @@ def mkpp(rel: str, vz: str, obj1node: SynTree, begin, end, index, az=None,) ->
return ppnode
-def adaptvzlemma(inlemma: str) -> str:
+def adaptvzlemma_inv(inlemma: str) -> str:
if inlemma == 'mee':
result = 'met'
elif inlemma == 'toe':
@@ -1370,7 +1380,7 @@ def relpronsubst(stree: SynTree) -> SynTree:
newstree, f'.//node[@pt="vz" and @rel="hd" and ../node[@index="{rhdindex}"]]')
if govprep is not None:
govprep.attrib['vztype'] = 'init'
- govprep.attrib['lemma'] = adaptvzlemma(
+ govprep.attrib['lemma'] = adaptvzlemma_inv(
govprep.attrib['lemma'])
# ET.dump(newstree)
@@ -1454,31 +1464,30 @@ def mksuperquery(mwetrees) -> Xpathexpression:
This uses the content words. If only one content word is in the expression, all the words are used.
This way extensions for alternatives (such as the lemma "mijzelf|jezelf|zichzelf") are included.
"""
- if mwetrees == []:
- result = ''
- else:
- mwetree = mwetrees[0] # we only have to look at the first tree
- wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib]
- contentwordnodes = [
- node for node in mwetree.iter() if iscontentwordnode(node)]
- contentwordnodes = contentwordnodes if len(
- contentwordnodes) > 1 else wordnodes
-
- newmwetree = ET.Element('node', attrib={'cat': 'top'})
- for contentwordnode in contentwordnodes:
- cwlemma = gav(contentwordnode, 'lemma')
- cwpt = gav(contentwordnode, 'pt')
- newcontentwordnode = ET.Element(
- 'node', attrib={'lemma': cwlemma, 'pt': cwpt, 'axis': 'descendant'})
- newmwetree.append(newcontentwordnode)
- result = tree2xpath(newmwetree)
-
- # lemmapts = [(gav(node, 'lemma'), gav(node, 'pt')) for node in contentwordnodes]
- # lemmaptxpaths = [f'.//node[@lemma="{lemma}" and @pt="{pt}"]' for (lemma, pt) in lemmapts]
- # lemmaptcondition = ' and '.join(lemmaptxpaths)
- # result = f'//node[@cat="top" and {lemmaptcondition}]'
- return result
-
+ if len(mwetrees) < 1:
+ raise RuntimeError('Cannot generate superset query for empty tree set')
+
+ mwetree = mwetrees[0] # we only have to look at the first tree
+ wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib]
+ contentwordnodes = [node for node in mwetree.iter()
+ if iscontentwordnode(node)]
+ search_for = contentwordnodes if len(contentwordnodes) > 1 else wordnodes
+
+ target_node = ET.Element('node', attrib={'cat': 'top'})
+ children = []
+ for node in search_for:
+ cwlemma = gav(node, 'lemma')
+ cwpt = gav(node, 'pt')
+ n = ET.Element('node', attrib=dict(lemma=cwlemma, pt=cwpt, axis='descendant'))
+ children.append(n)
+
+ del children[0].attrib['axis']
+ for child in children[1:]:
+ target_node.append(child)
+
+ return '//{}/ancestor::alpino_ds/{}'.format(
+ tree2xpath(children[0]),
+ tree2xpath(target_node))
def generatequeries(mwe: str, lcatexpansion=True) -> Tuple[Xpathexpression, Xpathexpression, Xpathexpression]:
"""
diff --git a/mwe_query/indextransform.py b/mwe_query/indextransform.py
index a858dc6..d890ebd 100644
--- a/mwe_query/indextransform.py
+++ b/mwe_query/indextransform.py
@@ -1,4 +1,5 @@
# flake8: noqa
+# TODO: implement this file
from copy import copy
indexdict = {}
@@ -8,5 +9,5 @@ def makeindexdict(stree):
indexdict[index] = stree
for i , node in indexdict.items():
-
+ pass # TODO
diff --git a/mwe_query/lcat.py b/mwe_query/lcat.py
index f8f5703..32c3877 100644
--- a/mwe_query/lcat.py
+++ b/mwe_query/lcat.py
@@ -185,6 +185,3 @@ def getlcat(node: SynTree, prel=None) -> str: # noqa: C901
ET.dump(node)
return result
-
- result = 'xp'
- return result
diff --git a/requirements.txt b/requirements.txt
index 9c0655f..9730a7c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@
#
# pip-compile
#
-alpino-query==2.1.7
+alpino-query==2.1.9
# via mwe-query (setup.py)
basexclient==8.4.4
# via mwe-query (setup.py)
@@ -20,5 +20,7 @@ requests==2.28.1
# via
# alpino-query
# mwe-query (setup.py)
+sastadev==0.0.3
+ # via mwe-query (setup.py)
urllib3==1.26.11
# via requests
diff --git a/setup.py b/setup.py
index 610de78..0b773ae 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
package_data={"mwe_query": ["py.typed"]},
zip_safe=True,
install_requires=[
- 'alpino-query>=2.1.8', 'requests', 'BaseXClient'
+ 'alpino-query>=2.1.8', 'requests', 'BaseXClient', 'sastadev>=0.0.3'
],
entry_points={
'console_scripts': [
diff --git a/tests/data/transform/0-0.xml b/tests/data/transform/0-0.xml
new file mode 100644
index 0000000..2468b81
--- /dev/null
+++ b/tests/data/transform/0-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/1-0.xml b/tests/data/transform/1-0.xml
new file mode 100644
index 0000000..53fbbde
--- /dev/null
+++ b/tests/data/transform/1-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/2-0.xml b/tests/data/transform/2-0.xml
new file mode 100644
index 0000000..4a3718b
--- /dev/null
+++ b/tests/data/transform/2-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/3-0.xml b/tests/data/transform/3-0.xml
new file mode 100644
index 0000000..8f45b92
--- /dev/null
+++ b/tests/data/transform/3-0.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/4-0.xml b/tests/data/transform/4-0.xml
new file mode 100644
index 0000000..b1cb7be
--- /dev/null
+++ b/tests/data/transform/4-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/5-0.xml b/tests/data/transform/5-0.xml
new file mode 100644
index 0000000..b1cb7be
--- /dev/null
+++ b/tests/data/transform/5-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/6-0.xml b/tests/data/transform/6-0.xml
new file mode 100644
index 0000000..9191b94
--- /dev/null
+++ b/tests/data/transform/6-0.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/data/transform/mwes.txt b/tests/data/transform/mwes.txt
new file mode 100644
index 0000000..f0f0d29
--- /dev/null
+++ b/tests/data/transform/mwes.txt
@@ -0,0 +1,7 @@
+iemand zal de dans ontspringen
+iemand zal de *dans ontspringen
+iemand zal de +dans ontspringen
+iemand zal 0de dans ontspringen
+iemand zal de +*dans ontspringen
+iemand zal de *+dans ontspringen
+iemand zal de =dans ontspringen
diff --git a/tests/data/transform/tree.xml b/tests/data/transform/tree.xml
new file mode 100644
index 0000000..9c01073
--- /dev/null
+++ b/tests/data/transform/tree.xml
@@ -0,0 +1,58 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ iemand zal de dans ontspringen
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/test_expand.py b/tests/test_expand.py
index c92e7f7..f976f56 100644
--- a/tests/test_expand.py
+++ b/tests/test_expand.py
@@ -1,11 +1,15 @@
import unittest
+import os
import xml.etree.ElementTree as ET
from mwe_query import expand_index_nodes
+
class TextIndexExpansion(unittest.TestCase):
+ def data_path(self, filename):
+ return os.path.join(os.path.dirname(__file__), "data", filename)
def test_no_infinite_loop(self):
- with open('tests/data/expand/001.xml') as f:
+ with open(self.data_path('expand/001.xml')) as f:
doc = ET.parse(f)
expand_index_nodes(doc)
diff --git a/tests/preprocess_test.py b/tests/test_preprocess.py
similarity index 58%
rename from tests/preprocess_test.py
rename to tests/test_preprocess.py
index e3cf818..d43dc72 100644
--- a/tests/preprocess_test.py
+++ b/tests/test_preprocess.py
@@ -1,16 +1,18 @@
-from canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \
+import unittest
+from mwe_query.canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \
genvariants, trees2xpath, removesuperfluousindexes, newgenvariants, lowerpredm, relpronsubst, expandfull, \
- generatequeries, applyqueries, selfapplyqueries
+ generatequeries, applyqueries, selfapplyqueries, variable, com, noann
import os
import sys
import lxml.etree as ET
+from difflib import context_diff
from treebankfunctions import getstree, getyield, indextransform, getyieldstr
from alpinoparsing import parse
-from lcat import expandnonheadwords
+from mwe_query.lcat import expandnonheadwords
-##DONE
-##indextransform uitstellen
+# DONE
+# indextransform uitstellen
# index meenemen
# speciale behandeling voor bareindexnodes
# varianten genereren
@@ -24,6 +26,7 @@
comma = ','
tab = '\t'
+
def gettopnode(stree):
for child in stree:
if child.tag == 'node':
@@ -31,68 +34,88 @@ def gettopnode(stree):
return None
-def main():
- inputfilename = r'./testdata/all_mwes_2022-08-22.txt'
- base, ext = os.path.splitext(inputfilename)
- outfilename = base + '_annotated' + ext
- with open(inputfilename, 'r', encoding='utf8') as infile:
- with open(outfilename, 'w', encoding='utf8') as outfile:
- linenr = 0
- for idmwe in infile:
- linenr += 1
- # skip header
- if linenr == 1:
- continue
- idmwelist = idmwe.split(tab)
- id = idmwelist[0]
- mwe = idmwelist[1][:-1]
- annotatedlist = preprocess_MWE(mwe)
- wlist = [el[0] for el in annotatedlist]
- annlist = [el[1] for el in annotatedlist]
- wliststr = space.join(wlist)
- annliststr = comma.join([str(i) for i in annlist])
- print(f'{mwe};{wliststr};{annliststr}', file=outfile)
- b, sym = containsillegalsymbols(wliststr)
- if b:
- print(f'Illegal symbol {sym} in {wliststr}', file=sys.stderr)
-
-def mktreebank(dict, outfilename):
- treebank = ET.Element('treebank')
- for mwe in dict:
- tree = parse(mwe)
- treebank.append(tree)
-
- fulltreebank = ET.ElementTree(treebank)
- fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
-
-
-
-
-def test1():
- mwe = 'iemand zal blikken com:[met iemand] wisselen'
- annotatedlist = preprocess_MWE(mwe)
- print(annotatedlist)
-
-
-def test2():
- mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen']
- mwes += ['iemand zal 0de dans ontspringen']
- mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen']
- mwes = ['iemand zal de =dans ontspringen']
- for mwe in mwes:
+class TextIndexExpansion(unittest.TestCase):
+ def data_path(self, *paths):
+ return os.path.join(os.path.dirname(__file__), "data", *paths)
+
+ def main(self):
+ inputfilename = self.data_path('all_mwes_2022-08-22.txt')
+ base, ext = os.path.splitext(inputfilename)
+ outfilename = base + '_annotated' + ext
+ with open(inputfilename, 'r', encoding='utf8') as infile:
+ with open(outfilename, 'w', encoding='utf8') as outfile:
+ linenr = 0
+ for idmwe in infile:
+ linenr += 1
+ # skip header
+ if linenr == 1:
+ continue
+ idmwelist = idmwe.split(tab)
+ id = idmwelist[0]
+ mwe = idmwelist[1][:-1]
+ annotatedlist = preprocess_MWE(mwe)
+ wlist = [el[0] for el in annotatedlist]
+ annlist = [el[1] for el in annotatedlist]
+ wliststr = space.join(wlist)
+ annliststr = comma.join([str(i) for i in annlist])
+ print(f'{mwe};{wliststr};{annliststr}', file=outfile)
+ b, sym = self.containsillegalsymbols(wliststr)
+ if b:
+ print(
+ f'Illegal symbol {sym} in {wliststr}', file=sys.stderr)
+
+ def mktreebank(self, dict, outfilename):
+ treebank = ET.Element('treebank')
+ for mwe in dict:
+ tree = parse(mwe)
+ treebank.append(tree)
+
+ fulltreebank = ET.ElementTree(treebank)
+ fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
+
+ def test_annotation(self):
+ mwe = 'iemand zal blikken com:[met iemand] wisselen'
annotatedlist = preprocess_MWE(mwe)
- annotations = [el[1] for el in annotatedlist]
- cleanmwe = space.join([el[0] for el in annotatedlist])
- fullmweparse = strees[1]
- mweparse = gettopnode(fullmweparse)
- newtrees = transformtree(mweparse, annotations)
- print(f'{mwe}:')
- for newtree in newtrees:
- ET.dump(newtree)
-
-
-streestrings = {}
-streestrings[1] = """
+ assert annotatedlist == [
+ ('iemand', variable),
+ ('zal', noann),
+ ('blikken', noann),
+ ('met', com),
+ ('iemand', com),
+ ('wisselen', noann)]
+
+    def test_transform(self):
+        """Check transformtree output for every MWE in transform/mwes.txt.
+
+        The j-th tree produced for the i-th non-blank line is compared with
+        the stored file transform/{i}-{j}.xml.
+        """
+        with open(self.data_path("transform", "mwes.txt"), encoding="utf-8", mode="r") as f:
+            # Lines read from a file keep their newline, so strip before the
+            # emptiness test and before handing the text to preprocess_MWE.
+            mwes = [line.strip() for line in f]
+
+        for i, mwe in enumerate(m for m in mwes if m):
+            annotatedlist = preprocess_MWE(mwe)
+            annotations = [el[1] for el in annotatedlist]
+            # self.strees parses the XML strings anew on every access, so each
+            # MWE is transformed starting from a fresh tree.
+            mweparse = gettopnode(self.strees[1])
+            newtrees = transformtree(mweparse, annotations)
+            for j, newtree in enumerate(newtrees):
+                ET.indent(newtree)
+                actual = ET.tostring(newtree, encoding="unicode").splitlines(True)
+                expected_path = self.data_path("transform", f"{i}-{j}.xml")
+                with open(expected_path, encoding="utf-8", mode="r") as f:
+                    expected = f.readlines()
+                diff = ''.join(context_diff(expected, actual))
+                # Unlike a bare assert wrapped in a catch-all except, this is
+                # not stripped under -O and reports the diff on failure.
+                self.assertEqual(
+                    diff, '', msg=f'{mwe} ({expected_path}):\n{diff}')
+
+ streestrings = {}
+ streestrings[1] = """
@@ -154,7 +177,7 @@ def test2():
"""
-streestrings[2] = """
+ streestrings[2] = """
@@ -228,7 +251,7 @@ def test2():
"""
-streestrings[3] = """
+ streestrings[3] = """
@@ -286,7 +309,7 @@ def test2():
"""
-streestrings[4] = """
+ streestrings[4] = """
@@ -321,7 +344,7 @@ def test2():
"""
-streestrings[5] = """
+ streestrings[5] = """
@@ -353,7 +376,7 @@ def test2():
"""
-streestrings[6] = """
+ streestrings[6] = """
@@ -382,400 +405,395 @@ def test2():
"""
-strees = {i: ET.fromstring(streestrings[i]) for i in streestrings}
-
-
-def containsillegalsymbols(mwe):
- for el in annotationstrings:
- if el in mwe:
- return True, el
- return (False, None)
-
-
-def test3():
- lofs = [[1, 2], [3, 4], [5, 6]]
- results = listofsets2setoflists(lofs)
- for result in results:
- print(result)
-
-
-def getmwedict(intbfilename):
- mwedict = {}
- fulltreebank = getstree(intbfilename)
- treebank = fulltreebank.getroot()
- for stree in treebank:
- keylist = getyield(stree)
- key = space.join(keylist)
- richstree = stree
- #richstree = indextransform(stree) # put off because this should happen later
- mwedict[key] = richstree
- return mwedict
-
-def testrel():
- for i in {4,5,6}:
- newstree = relpronsubst(strees[i])
- ET.dump(newstree)
-
-def test4():
- intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml'
- mwedict = getmwedict(intbfilename)
- mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen']
- mwes += ['iemand zal 0de dans ontspringen']
- mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen']
- mwes += ['iemand zal de =dans ontspringen']
- mwes += ['dat mes zal aan twee kanten snijden']
- mwes += ['0nu zal de aap uit de mouw komen']
- mwes += ['iemand zal de schuld op zich nemen']
- mwes += ['iemand zal buiten zichzelf zijn']
- mwes += ['iemand zal veel in zijn mars hebben']
- mwes += ['bij nacht en ontijd']
- mwes += ['iemand zal blikken com:[met] iemand wisselen'] # still something wrong here
- mwes += ['dd:[dat] mes zal aan twee kanten snijden']
- mwes += ['iets zal er inzitten']
- mwes += ['iemand zal in touw zijn']
- mwes += ['iemand zal aan iemand een *hekel hebben']
- mwes += ['iemand zal 0geen gras over iets laten groeien']
- #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ]
- mwes += ['#door dik en dun']
- mwes += ['#ad patres']
- mwes += ['ad patres']
- mwes += ['iemand zal aan de kant #gaan']
- mwes += ['iemand zal aan de kant gaan']
-
- for mwe in mwes:
- annotatedlist = preprocess_MWE(mwe)
- annotations = [el[1] for el in annotatedlist]
- cleanmwe = space.join([el[0] for el in annotatedlist])
- fullmweparse = None
- if cleanmwe in mwedict:
- fullmweparse = mwedict[cleanmwe]
- #ET.dump(fullmweparse)
- elif mwe in mwedict:
+ @property
+ def strees(self):
+ return {i: ET.fromstring(self.streestrings[i]) for i in self.streestrings}
+
+ def containsillegalsymbols(self, mwe):
+ for el in annotationstrings:
+ if el in mwe:
+ return True, el
+ return (False, None)
+
+ def test_lofs(self):
+ lofs = [[1, 2], [3, 4], [5, 6]]
+ results = listofsets2setoflists(lofs)
+ assert results == [
+ [1, 3, 5],
+ [1, 3, 6],
+ [1, 4, 5],
+ [1, 4, 6],
+ [2, 3, 5],
+ [2, 3, 6],
+ [2, 4, 5],
+ [2, 4, 6]]
+
+ def getmwedict(self, intbfilename):
+ mwedict = {}
+ fulltreebank = getstree(intbfilename)
+ treebank = fulltreebank.getroot()
+ for stree in treebank:
+ keylist = getyield(stree)
+ key = space.join(keylist)
+ richstree = stree
+ # richstree = indextransform(stree) # put off because this should happen later
+ mwedict[key] = richstree
+ return mwedict
+
+ def test_rel(self):
+ for i in {4, 5, 6}:
+ newstree = relpronsubst(self.strees[i])
+ ET.dump(newstree)
+
+ def test4(self):
+ intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml')
+ mwedict = self.getmwedict(intbfilename)
+ mwes = ['iemand zal de dans ontspringen',
+ 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen']
+ mwes += ['iemand zal 0de dans ontspringen']
+ mwes += ['iemand zal de +*dans ontspringen',
+ 'iemand zal de *+dans ontspringen']
+ mwes += ['iemand zal de =dans ontspringen']
+ mwes += ['dat mes zal aan twee kanten snijden']
+ mwes += ['0nu zal de aap uit de mouw komen']
+ mwes += ['iemand zal de schuld op zich nemen']
+ mwes += ['iemand zal buiten zichzelf zijn']
+ mwes += ['iemand zal veel in zijn mars hebben']
+ mwes += ['bij nacht en ontijd']
+ # still something wrong here
+ mwes += ['iemand zal blikken com:[met] iemand wisselen']
+ mwes += ['dd:[dat] mes zal aan twee kanten snijden']
+ mwes += ['iets zal er inzitten']
+ mwes += ['iemand zal in touw zijn']
+ mwes += ['iemand zal aan iemand een *hekel hebben']
+ mwes += ['iemand zal 0geen gras over iets laten groeien']
+ #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ]
+ mwes += ['#door dik en dun']
+ mwes += ['#ad patres']
+ mwes += ['ad patres']
+ mwes += ['iemand zal aan de kant #gaan']
+ mwes += ['iemand zal aan de kant gaan']
+
+ for mwe in mwes:
+ annotatedlist = preprocess_MWE(mwe)
+ annotations = [el[1] for el in annotatedlist]
+ cleanmwe = space.join([el[0] for el in annotatedlist])
+ fullmweparse = None
+ if cleanmwe in mwedict:
+ fullmweparse = mwedict[cleanmwe]
+ # ET.dump(fullmweparse)
+ elif mwe in mwedict:
+ fullmweparse = mwedict[mwe]
+ if fullmweparse is not None:
+ mweparse = gettopnode(fullmweparse)
+ newtreesa = transformtree(mweparse, annotations)
+ newtrees = []
+ for newtreea in newtreesa:
+ newtrees += genvariants(newtreea)
+ newtrees.extend(newtreesa)
+ print(f'{mwe}:')
+ for newtree in newtrees:
+ # print(f'{i+1}:')
+ print()
+ ET.dump(newtree)
+ else:
+ print(f'MWE <{cleanmwe}> not found ', file=sys.stderr)
+
+ def mkoutfilename(self, infilename: str, suffix: str, ext=None) -> str:
+ basefilename, inext = os.path.splitext(infilename)
+ if ext is None:
+ ext = inext
+ result = basefilename + suffix + ext
+ return result
+
+ def base_testfind(self, basemwe, xpath, mwedict, all=False):
+ results = []
+ localxpath = "." + xpath
+ for mwe in mwedict:
+ origmwetree = mwedict[mwe]
+ mwetree = lowerpredm(origmwetree)
+ mweyield = getyield(mwetree)
+ mwestr = space.join(mweyield)
+ # ET.dump(mwetree)
+ # print(f'mwe={mwe}')
+ # print(f'xpath:\n{localxpath}\n')
+ mwehits = mwetree.xpath(localxpath)
+ newresult = (basemwe, mwe, len(mwehits))
+ results.append(newresult)
+
+ for basemwe, mwe, count in results:
+ if all:
+ cond = True
+ else:
+ cond = (basemwe == mwe and count != 1) or (
+ basemwe != mwe and count != 0)
+ if cond:
+ print(basemwe, mwe, count, file=sys.stderr)
+
+ @unittest.skip("slooooow")
+ def test5(self):
+ reportevery = 500
+ intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml')
+ mwedict = self.getmwedict(intbfilename)
+ # next one is problematic, so we delete it
+ problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen'
+ if problemmwe in mwedict:
+ del mwedict[problemmwe]
+ #mwedict = {}
+ #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2]
+ #mwedict['iemand zal ’m smeren'] = strees[3]
+ # for ind, tree in expandedmwedict.items():
+ # print(ind)
+ # ET.dump(tree)
+ suffix = '_trees'
+ outfilename = self.mkoutfilename(intbfilename, suffix)
+ # with open(outfilename, 'w', encoding='utf8') as outfile:
+ treebank = ET.Element('treebank')
+ inds = ['iemand zal uit iemands koker komen']
+ inds = ['iemand zal slechte invloed op iemand hebben']
+ inds = ['iemand zal met de pet naar iets gooien']
+ inds = ['de tale Kanaäns']
+ inds += ['heel af en toe']
+ inds += ['na verloop van tijd']
+ inds += ['al doende zal men leren']
+ inds = ['iemand zal de schuld van iets op iemand schuimwedicven']
+ inds += ['iemand zal iets door de vingers zien']
+ inds += ['iemand zal achter iets komen']
+ inds += ['iemand zal uit iemands koker komen']
+ inds += ['al doende zal men leren']
+ # inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions
+ inds += ['te dom om voor de duivel te dansen']
+ inds += ['zo doof als een kwartel']
+ inds = ['iemand zal veel ellende over iemand uitstorten']
+ #mwedict = {ind: mwedict[ind] for ind in inds}
+ expandedmwedict = {mwe: indextransform(
+ tree) for mwe, tree in mwedict.items()}
+ counter = 0
+ for mwe in mwedict:
+ counter += 1
+ mwe_element = ET.Element('mwe', attrib={'mwe': mwe})
+ #print(mwe, file=sys.stderr)
+ if counter % reportevery == 0:
+ print(counter, file=sys.stderr)
+ annotatedlist = preprocess_MWE(mwe)
+ annotations = [el[1] for el in annotatedlist]
+ #cleanmwe = space.join([el[0] for el in annotatedlist])
fullmweparse = mwedict[mwe]
- if fullmweparse is not None:
mweparse = gettopnode(fullmweparse)
+ # if mweparse is None:
+ # #print(f'\n\n{mwe}:', file=outfile)
+ # #print('None')
+ # continue
+ treeyield = getyield(mweparse)
+ treeyieldstr = space.join(treeyield)
+ if treeyieldstr != mwe:
+ print(f'mismatch:\n{treeyieldstr}=/={mwe} ')
+ continue
newtreesa = transformtree(mweparse, annotations)
newtrees = []
for newtreea in newtreesa:
- newtrees += genvariants(newtreea)
- newtrees.extend(newtreesa)
- print(f'{mwe}:')
- for newtree in newtrees:
- #print(f'{i+1}:')
- print()
- ET.dump(newtree)
- else:
- print(f'MWE <{cleanmwe}> not found ', file=sys.stderr)
-
-def mkoutfilename(infilename: str, suffix: str, ext=None) -> str:
- basefilename, inext = os.path.splitext(infilename)
- if ext is None:
- ext = inext
- result = basefilename + suffix + ext
- return result
-
-def testfind(basemwe, xpath, mwedict, all=False):
- results = []
- localxpath = "." + xpath
- for mwe in mwedict:
- origmwetree = mwedict[mwe]
- mwetree = lowerpredm(origmwetree)
- mweyield = getyield(mwetree)
- mwestr = space.join(mweyield)
- #ET.dump(mwetree)
- #print(f'mwe={mwe}')
- #print(f'xpath:\n{localxpath}\n')
- mwehits = mwetree.xpath(localxpath)
- newresult = (basemwe, mwe, len(mwehits))
- results.append(newresult)
-
-
- for basemwe, mwe, count in results:
- if all:
- cond = True
- else:
- cond = (basemwe == mwe and count!=1) or (basemwe != mwe and count!=0)
- if cond:
- print(basemwe, mwe, count, file=sys.stderr )
-
-
-
-def test5():
- reportevery = 500
- intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml'
- mwedict = getmwedict(intbfilename)
- # next one is problematic, so we delete it
- problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen'
- if problemmwe in mwedict:
- del mwedict[problemmwe]
- #mwedict = {}
- #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2]
- #mwedict['iemand zal ’m smeren'] = strees[3]
- #for ind, tree in expandedmwedict.items():
- # print(ind)
- # ET.dump(tree)
- suffix = '_trees'
- outfilename = mkoutfilename(intbfilename, suffix)
-# with open(outfilename, 'w', encoding='utf8') as outfile:
- treebank = ET.Element('treebank')
- inds = ['iemand zal uit iemands koker komen']
- inds = ['iemand zal slechte invloed op iemand hebben']
- inds = ['iemand zal met de pet naar iets gooien']
- inds = ['de tale Kanaäns']
- inds += ['heel af en toe']
- inds += ['na verloop van tijd']
- inds += ['al doende zal men leren']
- inds = ['iemand zal de schuld van iets op iemand schuimwedicven']
- inds += ['iemand zal iets door de vingers zien']
- inds += ['iemand zal achter iets komen']
- inds += ['iemand zal uit iemands koker komen']
- inds += ['al doende zal men leren']
- #inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions
- inds += ['te dom om voor de duivel te dansen']
- inds += ['zo doof als een kwartel']
- inds = ['iemand zal veel ellende over iemand uitstorten']
- #mwedict = {ind: mwedict[ind] for ind in inds}
- expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()}
- counter = 0
- for mwe in mwedict:
- counter += 1
- mwe_element = ET.Element('mwe', attrib={'mwe': mwe})
- #print(mwe, file=sys.stderr)
- if counter % reportevery == 0:
- print(counter, file=sys.stderr)
- annotatedlist = preprocess_MWE(mwe)
- annotations = [el[1] for el in annotatedlist]
- #cleanmwe = space.join([el[0] for el in annotatedlist])
- fullmweparse = mwedict[mwe]
- mweparse = gettopnode(fullmweparse)
- #if mweparse is None:
- # #print(f'\n\n{mwe}:', file=outfile)
- # #print('None')
- # continue
- treeyield = getyield(mweparse)
- treeyieldstr = space.join(treeyield)
- if treeyieldstr != mwe:
- print(f'mismatch:\n{treeyieldstr}=/={mwe} ' )
- continue
- newtreesa = transformtree(mweparse, annotations)
- newtrees = []
- for newtreea in newtreesa:
- newtrees += newgenvariants(newtreea)
- #newtrees.extend(newtreesa)
- #print(f'\n\n{mwe}:', file=outfile)
- cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees]
- #cleantrees = newtrees
- #print('cleantrees:')
- #for cleantree in cleantrees:
- # ET.dump(cleantree)
- mwe_element.extend(cleantrees)
- xpath = trees2xpath(cleantrees, expanded=True)
- #print(xpath)
- xpath_element = ET.Element('xpath')
- xpath_element.text = xpath
- mwe_element.append(xpath_element)
- treebank.append(mwe_element)
- testfind(mwe, xpath, expandedmwedict)
- #ET.dump(treebank)
- # for newtree in newtrees:
- # #print(f'{i+1}:')
- # print()
- # treebank.append(newtree)
- fulltreebank = ET.ElementTree(treebank)
- #ET.indent(newtree, space=" ")
- #print(ET.tostring(newtree), file=outfile)
- fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
-
-def check(treebankdict):
- for utt, stree in treebankdict.items():
- for node in stree.iter():
- if 'pt' in node.attrib:
- for att in {'begin','end'}:
- if 'id' in node.attrib:
- id = node.attrib['id']
- else:
- id = 'None'
- if att not in node.attrib:
- print(f'missing {att} in node with id={id}, pt={node.attrib["pt" ]}.')
- ET.dump(stree)
-
-
-def getutts(infilename):
- #each utterance on a separate line, discard the final \n and skip empty lines
- infile = open(infilename, 'r', encoding='utf8')
- rawutts = infile.readlines()
- utts = [rawutt[:-1] for rawutt in rawutts if len(rawutt) > 1]
- return utts
-
-
-def testvariatie():
- mwetreebank = './testdata/mwesvoorvariatie-noann_treebank.xml'
- mwedict = getmwedict(mwetreebank)
- #expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()}
- testtreebankfilename = './testdata/testzinnen mwevarianten_treebank.xml'
- fullvariationtreebank = getstree(testtreebankfilename)
- variationtreebank = fullvariationtreebank.getroot()
- variationtreebankdict= {getyieldstr(tree): expandfull(tree) for tree in variationtreebank}
- #check(variationtreebankdict)
- annotatedmwefilename = './testdata/mwesvoorvariatie-annotated.txt'
- annotatedmwes = getutts(annotatedmwefilename)
- suffix = '_derivedtrees'
- outfilename = mkoutfilename(mwetreebank, suffix)
- treebank = ET.Element('treebank')
- counter = 0
- reportevery = 500
-
- #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal aan 0de *+dans ontspringen']
- #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal de plaat poetsen']
- for rawmwe in annotatedmwes:
- counter += 1
- mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe})
- #print(mwe, file=sys.stderr)
- if counter % reportevery == 0:
- print(counter, file=sys.stderr)
- annotatedlist = preprocess_MWE(rawmwe)
- annotations = [el[1] for el in annotatedlist]
- mweparts = [el[0] for el in annotatedlist]
- mwe = space.join(mweparts)
- fullmweparse = mwedict[mwe]
- mweparse = gettopnode(fullmweparse)
- #if mweparse is None:
- # #print(f'\n\n{mwe}:', file=outfile)
- # #print('None')
- # continue
- treeyield = getyield(mweparse)
- treeyieldstr = space.join(treeyield)
- if treeyieldstr != mwe:
- print(f'mismatch:\n{treeyieldstr}=/={mwe} ' )
- continue
- newtreesa = transformtree(mweparse, annotations)
- newtrees = []
- for newtreea in newtreesa:
- newtrees += newgenvariants(newtreea)
- #newtrees.extend(newtreesa)
- #print(f'\n\n{mwe}:', file=outfile)
- cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees]
- #cleantrees = newtrees
- #print('cleantrees:')
- #for cleantree in cleantrees:
- # ET.dump(cleantree)
- mwe_element.extend(cleantrees)
- xpath = trees2xpath(cleantrees, expanded=True)
- #print(xpath)
- xpath_element = ET.Element('xpath')
- xpath_element.text = xpath
- mwe_element.append(xpath_element)
- treebank.append(mwe_element)
- testfind(mwe, xpath, variationtreebankdict)
- #ET.dump(treebank)
- # for newtree in newtrees:
- # #print(f'{i+1}:')
- # print()
- # treebank.append(newtree)
- fulltreebank = ET.ElementTree(treebank)
- #ET.indent(newtree, space=" ")
- #print(ET.tostring(newtree), file=outfile)
- fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
-
-def gentreebank():
- #generate a new treebank because a new parser is being used
- intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml'
- suffix = '_parse2022-11-18'
- outfilename = mkoutfilename(intbfilename, suffix)
- mwedict = getmwedict(intbfilename)
- mktreebank(mwedict, outfilename)
-
-
-
-def genqueries():
- """
- Generates queries in a file (with suffix _querytriples) in the
- testdata folder for a mwe treebank in the same folder and applies
- the queries on all the mwe tree in the treebank.
-
- It will generate output on the console for every mwe and reports
- differences from what was expected.
- """
-
- # if True it will only check whether the queries for an mwe find
- # a match in the parsed tree of the canonical form of the mwe
- # (that is a minimal requirement for an xpath query)
- selftest = False
-
- # test these specific MWEs
- mwes = [ 'iemand zal een poging doen', 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen']
- mwes += ['iemand zal *honger hebben']
- #mwes = ['iemand zal 0een *+poging doen']
-
- # Jan Odijk:
- # "Oorspronkelijk werkte ik met een bestand gedownload uit GreTEL 4 (MWE20220429_corpus2alpino),
- # maar de parse hiervan bleken anders te zijn dan in de huidige versie,
- # dus heb ik een nieuwe treebank gegenereerd: MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml."
- #intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml'
- intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml'
- suffix = '_querytriples'
- outfilename = mkoutfilename(intbfilename, suffix)
- mwedict = getmwedict(intbfilename)
- #selectedmwe = 'af en toe'
- mwes = [mwe for mwe, _ in mwedict.items() ]
- # mwes = ['iemand zal 0een *+poging doen']
- # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten']
- # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald']
- # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken']
- # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden']
- # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging']
- # mwes += ['dd:[dat] zelfde liedje']
- # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten']
- # mwes += ['iemand zal veel ellende over iemand uitstorten']
- # mwes += ['iemand zal aanhangen als een klis']
- # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren',
- # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn',
- # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan']
- # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn',
- # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen']
- # mwes += ['iemand zal buiten zichzelf zijn']
- # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn',
- # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen',
- # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter',
- # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat',
- # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets',
- # 'wie een hond wil slaan zal licht een stok vinden',
- # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen']
- #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen']
- #mwes += ['het zal zaliger zijn te geven dan te ontvangen']
- with open(outfilename, 'w', encoding='utf8') as outfile:
- for mwe in mwes:
- print(mwe)
- (mweq, nearmissq, supersetq) = generatequeries(mwe)
- print(f'\n{mwe}:', file=outfile)
- print(f'mweq:\n{mweq}', file=outfile)
-
- print(f'nearmissq:\n{nearmissq}', file=outfile)
+ newtrees += newgenvariants(newtreea)
+ # newtrees.extend(newtreesa)
+ #print(f'\n\n{mwe}:', file=outfile)
+ cleantrees = [removesuperfluousindexes(
+ newtree) for newtree in newtrees]
+ #cleantrees = newtrees
+ # print('cleantrees:')
+ # for cleantree in cleantrees:
+ # ET.dump(cleantree)
+ mwe_element.extend(cleantrees)
+ xpath = trees2xpath(cleantrees, expanded=True)
+ # print(xpath)
+ xpath_element = ET.Element('xpath')
+ xpath_element.text = xpath
+ mwe_element.append(xpath_element)
+ treebank.append(mwe_element)
+ self.base_testfind(mwe, xpath, expandedmwedict)
+ # ET.dump(treebank)
+ # for newtree in newtrees:
+ # #print(f'{i+1}:')
+ # print()
+ # treebank.append(newtree)
+ fulltreebank = ET.ElementTree(treebank)
+ #ET.indent(newtree, space=" ")
+ #print(ET.tostring(newtree), file=outfile)
+ fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
+
+    def check(self, treebankdict):
+        # Report every node that has a 'pt' attribute but lacks 'begin' or 'end'.
+        for stree in treebankdict.values():
+            for node in stree.iter():
+                if 'pt' not in node.attrib:
+                    continue
+                # looked up once per node; 'nodeid' avoids shadowing builtin id()
+                nodeid = node.attrib.get('id', 'None')
+                # a tuple instead of a set, so the reports come in a fixed order
+                for att in ('begin', 'end'):
+                    if att not in node.attrib:
+                        print(f'missing {att} in node with id={nodeid}, pt={node.attrib["pt"]}.')
+                        ET.dump(stree)
+
+    def getutts(self, infilename):
+        # one utterance per line: strip the line terminator and skip empty lines
+        # (rstrip instead of [:-1] keeps the last character of an unterminated final line)
+        with open(infilename, 'r', encoding='utf8') as infile:
+            utts = [line.rstrip('\n') for line in infile if line.rstrip('\n')]
+        return utts
+
+    @unittest.skip("not deterministic")
+    def test_variatie(self):
+        # For every annotated MWE: transform its parse, generate the variant
+        # trees, build one XPath query from them and pass that query to
+        # base_testfind together with the test sentences. The derived trees
+        # and queries are collected in a <treebank> element that is written
+        # to the '_derivedtrees' output file at the end.
+        mwetreebank = self.data_path('mwesvoorvariatie-noann_treebank.xml')
+        mwedict = self.getmwedict(mwetreebank)
+
+        # test sentences: yield string (getyieldstr) -> tree after expandfull
+        testtreebankfilename = self.data_path(
+            'testzinnen mwevarianten_treebank.xml')
+        variationtreebank = getstree(testtreebankfilename).getroot()
+        variationtreebankdict = {
+            getyieldstr(tree): expandfull(tree) for tree in variationtreebank
+        }
+
+        # the annotated MWEs, read with getutts (one per line)
+        annotatedmwefilename = self.data_path('mwesvoorvariatie-annotated.txt')
+        annotatedmwes = self.getutts(annotatedmwefilename)
+        # the output file name comes from mkoutfilename(input name, suffix)
+        outfilename = self.mkoutfilename(mwetreebank, '_derivedtrees')
+        treebank = ET.Element('treebank')
+        reportevery = 500
+
+        for counter, rawmwe in enumerate(annotatedmwes, start=1):
+            # container for this MWE's derived trees and its query
+            mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe})
+            # progress indication on stderr
+            if counter % reportevery == 0:
+                print(counter, file=sys.stderr)
+
+            # preprocess_MWE gives items with the word at [0] and its annotation
+            # at [1]; the words joined by `space` are the key into mwedict
+            annotatedlist = preprocess_MWE(rawmwe)
+            annotations = [el[1] for el in annotatedlist]
+            mwe = space.join([el[0] for el in annotatedlist])
+            mweparse = gettopnode(mwedict[mwe])
+
+            # skip MWEs whose parse does not yield exactly the MWE text
+            treeyieldstr = space.join(getyield(mweparse))
+            if treeyieldstr != mwe:
+                print(f'mismatch:\n{treeyieldstr}=/={mwe} ')
+                continue
+
+            # each transformed tree contributes all of its generated variants
+            newtrees = [
+                variant
+                for newtreea in transformtree(mweparse, annotations)
+                for variant in newgenvariants(newtreea)
+            ]
+            cleantrees = [
+                removesuperfluousindexes(newtree) for newtree in newtrees
+            ]
+            mwe_element.extend(cleantrees)
+
+            # store the query next to the trees it was derived from
+            xpath = trees2xpath(cleantrees, expanded=True)
+            xpath_element = ET.Element('xpath')
+            xpath_element.text = xpath
+            mwe_element.append(xpath_element)
+            treebank.append(mwe_element)
+
+            # NOTE(review): base_testfind is defined elsewhere; presumably it
+            # reports where the query's matches differ from what is expected
+            self.base_testfind(mwe, xpath, variationtreebankdict)
+
+        # serialise everything that was collected
+        fulltreebank = ET.ElementTree(treebank)
+        fulltreebank.write(outfilename, encoding='utf8', pretty_print=True)
+
+    def gentreebank(self):
+        # generate a new treebank because a new parser is being used
+        source = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml')
+        # the output file name comes from mkoutfilename(input name, suffix)
+        target = self.mkoutfilename(source, '_parse2022-11-18')
+        mwedict = self.getmwedict(source)
+        self.mktreebank(mwedict, target)
+
+ def genqueries(self):
+ selftest = False
+ mwes = ['iemand zal een poging doen',
+ 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen']
+ mwes += ['iemand zal *honger hebben']
+ #mwes = ['iemand zal 0een *+poging doen']
+ #intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml')
+ intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml')
+ suffix = '_querytriples'
+ outfilename = self.mkoutfilename(intbfilename, suffix)
+ mwedict = self.getmwedict(intbfilename)
+ #selectedmwe = 'af en toe'
+ mwes = [mwe for mwe, _ in mwedict.items()]
+ # mwes = ['iemand zal 0een *+poging doen']
+ # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten']
+ # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald']
+ # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken']
+ # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden']
+ # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging']
+ # mwes += ['dd:[dat] zelfde liedje']
+ # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten']
+ # mwes += ['iemand zal veel ellende over iemand uitstorten']
+ # mwes += ['iemand zal aanhangen als een klis']
+ # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren',
+ # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn',
+ # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan']
+ # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn',
+ # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen']
+ # mwes += ['iemand zal buiten zichzelf zijn']
+ # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn',
+ # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen',
+ # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter',
+ # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat',
+ # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets',
+ # 'wie een hond wil slaan zal licht een stok vinden',
+ # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen']
+ #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen']
+ #mwes += ['het zal zaliger zijn te geven dan te ontvangen']
+ with open(outfilename, 'w', encoding='utf8') as outfile:
+ for mwe in mwes:
+ print(mwe)
+ (mweq, nearmissq, supersetq) = generatequeries(mwe)
+ print(f'\n{mwe}:', file=outfile)
+ print(f'mweq:\n{mweq}', file=outfile)
- print(f'supersetq:\n{supersetq}', file= outfile)
+ print(f'nearmissq:\n{nearmissq}', file=outfile)
- annotatedlist = preprocess_MWE(mwe)
- #annotations = [el[1] for el in annotatedlist]
- mweparts = [el[0] for el in annotatedlist]
- utt = space.join(mweparts)
+ print(f'supersetq:\n{supersetq}', file=outfile)
- if selftest:
- # #self test
- (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries(utt, mweq, nearmissq, supersetq)
- if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1:
- print(f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}')
- else:
- results = applyqueries(mwedict, mwe, mweq, nearmissq, supersetq)
-
-
-if __name__ == '__main__':
- # main()
- # test1()
- #test2()
- #test3()
- #test4()
- #test5()
- #testrel()
- #testvariatie()
- #gentreebank()
- genqueries()
+ annotatedlist = preprocess_MWE(mwe)
+ #annotations = [el[1] for el in annotatedlist]
+ mweparts = [el[0] for el in annotatedlist]
+ utt = space.join(mweparts)
+
+ if selftest:
+ # #self test
+ (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries(
+ utt, mweq, nearmissq, supersetq)
+ if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1:
+ print(
+ f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}')
+ else:
+ results = applyqueries(
+ mwedict, mwe, mweq, nearmissq, supersetq)
+
+
+# if __name__ == '__main__':
+# # main()
+# # test1()
+# #test2()
+# #test3()
+# #test4()
+# #test5()
+# #testrel()
+# #testvariatie()
+# #gentreebank()
+# genqueries()
diff --git a/tests/update_outputs.py b/tests/update_outputs.py
index 48fccd2..d0559fb 100755
--- a/tests/update_outputs.py
+++ b/tests/update_outputs.py
@@ -7,6 +7,7 @@
import sys
from os import path
import glob
+import lxml.etree as ET
testdir = path.dirname(__file__)
datadir = path.join(testdir, "data")
@@ -14,34 +15,35 @@
# import this implementation
sys.path.insert(0, path.join(testdir, ".."))
from mwe_query import Mwe
+from mwe_query.canonicalform import preprocess_MWE, transformtree
-def datapath(filename):
- return path.join(datadir, filename)
+def datapath(dirname, filename):
+ return path.join(datadir, dirname, filename)
-def read(filename):
- with open(datapath(filename)) as f:
+def read(dirname, filename):
+    with open(datapath(dirname, filename), encoding="utf-8") as f:  # not the platform default
         return f.read()
-def write(filename, content):
-    with open(datapath(filename), "w") as f:
+def write(dirname, filename, content):
+    with open(datapath(dirname, filename), "w", encoding="utf-8") as f:  # same encoding as read()
         f.write(content)
-def update(basename):
- lines = read(basename + ".txt").splitlines()
+def update_generate(basename):
+ lines = read("generate", basename + ".txt").splitlines()
can_form = lines[0].strip()
sentence = lines[1].strip()
alpino_xml_filename = basename + ".xml"
- if not path.exists(datapath(alpino_xml_filename)):
+ if not path.exists(datapath("generate", alpino_xml_filename)):
print("parsing")
alpino_xml = parse_sentence(can_form)
- write(alpino_xml_filename, alpino_xml)
+ write("generate", alpino_xml_filename, alpino_xml)
else:
- alpino_xml = read(alpino_xml_filename)
+ alpino_xml = read("generate", alpino_xml_filename)
mwe = Mwe(sentence)
mwe.set_tree(alpino_xml)
@@ -50,10 +52,37 @@ def update(basename):
queries = mwe.generate_queries()
for query in queries:
- write(f"{basename}-{query.rank}.xpath", query.xpath)
+ write("generate", f"{basename}-{query.rank}.xpath", query.xpath)
-input_files = glob.glob(path.join(datadir, '*.txt'))
+def gettopnode(stree):
+    # first direct child tagged 'node', or None when there is no such child
+    return next(
+        (child for child in stree if child.tag == 'node'), None
+    )
+
+
+def update_transform():
+    # Regenerate the expected outputs data/transform/<i>-<j>.xml, where <j>
+    # numbers the trees that transformtree returns for line <i> of mwes.txt.
+    mwes = read("transform", "mwes.txt").splitlines()
+    # The tree file does not change inside the loop, so read it only once.
+    tree_xml = read("transform", "tree.xml")
+
+    for i, mwe in enumerate(mwes):
+        annotations = [el[1] for el in preprocess_MWE(mwe)]
+        # Parse afresh for every MWE so each run starts from its own tree
+        # (NOTE(review): transformtree may modify its input - confirm).
+        mweparse = gettopnode(ET.fromstring(tree_xml))
+        newtrees = transformtree(mweparse, annotations)
+
+        for j, newtree in enumerate(newtrees):
+            ET.indent(newtree)
+            write("transform", f"{i}-{j}.xml",
+                  ET.tostring(newtree, encoding="unicode"))
+
+input_files = glob.glob(path.join(datadir, "generate", '*.txt'))
for input in input_files:
head, ext = path.splitext(path.basename(input))
- update(head)
+ update_generate(head)
+update_transform()