diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index 76176ea..c9e771c 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -4,17 +4,17 @@ """ from typing import Dict, List, Optional, Set, Tuple -from sastatypes import SynTree +from sastadev.sastatypes import SynTree import re import sys -from treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \ +from sastadev.treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \ getindexednodesmap, getbasicindexednodesmap, clausebodycats import lxml.etree as ET import copy -from adpositions import vzazindex -from alpinoparsing import parse -from lcat import expandnonheadwords +from mwe_query.adpositions import vzazindex +from sastadev.alpinoparsing import parse +from mwe_query.lcat import expandnonheadwords Xpathexpression = str @@ -688,6 +688,16 @@ def expandsu(vc: SynTree, subject: SynTree) -> SynTree: return newvc +def adaptvzlemma(lemma: str) -> str: + if lemma == 'met': + result = 'mee' + elif lemma == ' tot': + result = ' toe' + else: + result = lemma + return result + + def getpronadv(lemma, rel, rprons={}): newnode = mknode() newlemma = adaptvzlemma(lemma) @@ -1329,7 +1339,7 @@ def mkpp(rel: str, vz: str, obj1node: SynTree, begin, end, index, az=None,) -> return ppnode -def adaptvzlemma(inlemma: str) -> str: +def adaptvzlemma_inv(inlemma: str) -> str: if inlemma == 'mee': result = 'met' elif inlemma == 'toe': @@ -1370,7 +1380,7 @@ def relpronsubst(stree: SynTree) -> SynTree: newstree, f'.//node[@pt="vz" and @rel="hd" and ../node[@index="{rhdindex}"]]') if govprep is not None: govprep.attrib['vztype'] = 'init' - govprep.attrib['lemma'] = adaptvzlemma( + govprep.attrib['lemma'] = adaptvzlemma_inv( govprep.attrib['lemma']) # ET.dump(newstree) @@ -1454,31 +1464,30 @@ def mksuperquery(mwetrees) -> Xpathexpression: This uses the content words. If only one content word is in the expression, all the words are used. This way extensions for alternatives (such as the lemma "mijzelf|jezelf|zichzelf") are included. """ - if mwetrees == []: - result = '' - else: - mwetree = mwetrees[0] # we only have to look at the first tree - wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib] - contentwordnodes = [ - node for node in mwetree.iter() if iscontentwordnode(node)] - contentwordnodes = contentwordnodes if len( - contentwordnodes) > 1 else wordnodes - - newmwetree = ET.Element('node', attrib={'cat': 'top'}) - for contentwordnode in contentwordnodes: - cwlemma = gav(contentwordnode, 'lemma') - cwpt = gav(contentwordnode, 'pt') - newcontentwordnode = ET.Element( - 'node', attrib={'lemma': cwlemma, 'pt': cwpt, 'axis': 'descendant'}) - newmwetree.append(newcontentwordnode) - result = tree2xpath(newmwetree) - - # lemmapts = [(gav(node, 'lemma'), gav(node, 'pt')) for node in contentwordnodes] - # lemmaptxpaths = [f'.//node[@lemma="{lemma}" and @pt="{pt}"]' for (lemma, pt) in lemmapts] - # lemmaptcondition = ' and '.join(lemmaptxpaths) - # result = f'//node[@cat="top" and {lemmaptcondition}]' - return result - + if len(mwetrees) < 1: + raise RuntimeError('Cannot generate superset query for empty tree set') + + mwetree = mwetrees[0] # we only have to look at the first tree + wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib] + contentwordnodes = [node for node in mwetree.iter() + if iscontentwordnode(node)] + search_for = contentwordnodes if len(contentwordnodes) > 1 else wordnodes + + target_node = ET.Element('node', attrib={'cat': 'top'}) + children = [] + for node in search_for: + cwlemma = gav(node, 'lemma') + cwpt = gav(node, 'pt') + n = ET.Element('node', attrib=dict(lemma=cwlemma, pt=cwpt, axis='descendant')) + children.append(n) + + del children[0].attrib['axis'] + for child in children[1:]: + target_node.append(child) + + return '//{}/ancestor::alpino_ds/{}'.format( + tree2xpath(children[0]), + tree2xpath(target_node)) def generatequeries(mwe: str, lcatexpansion=True) -> Tuple[Xpathexpression, Xpathexpression, Xpathexpression]: """ diff --git a/mwe_query/indextransform.py b/mwe_query/indextransform.py index a858dc6..d890ebd 100644 --- a/mwe_query/indextransform.py +++ b/mwe_query/indextransform.py @@ -1,4 +1,5 @@ # flake8: noqa +# TODO: implement this file from copy import copy indexdict = {} @@ -8,5 +9,5 @@ def makeindexdict(stree): indexdict[index] = stree for i , node in indexdict.items(): - + pass # TODO diff --git a/mwe_query/lcat.py b/mwe_query/lcat.py index f8f5703..32c3877 100644 --- a/mwe_query/lcat.py +++ b/mwe_query/lcat.py @@ -185,6 +185,3 @@ def getlcat(node: SynTree, prel=None) -> str: # noqa: C901 ET.dump(node) return result - - result = 'xp' - return result diff --git a/requirements.txt b/requirements.txt index 9c0655f..9730a7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile # -alpino-query==2.1.7 +alpino-query==2.1.9 # via mwe-query (setup.py) basexclient==8.4.4 # via mwe-query (setup.py) @@ -20,5 +20,7 @@ requests==2.28.1 # via # alpino-query # mwe-query (setup.py) +sastadev==0.0.3 + # via mwe-query (setup.py) urllib3==1.26.11 # via requests diff --git a/setup.py b/setup.py index 610de78..0b773ae 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ package_data={"mwe_query": ["py.typed"]}, zip_safe=True, install_requires=[ - 'alpino-query>=2.1.8', 'requests', 'BaseXClient' + 'alpino-query>=2.1.8', 'requests', 'BaseXClient', 'sastadev>=0.0.3' ], entry_points={ 'console_scripts': [ diff --git a/tests/data/transform/0-0.xml b/tests/data/transform/0-0.xml new file mode 100644 index 0000000..2468b81 --- /dev/null +++ b/tests/data/transform/0-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/1-0.xml b/tests/data/transform/1-0.xml new file mode 100644 index 0000000..53fbbde --- /dev/null +++ b/tests/data/transform/1-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/2-0.xml b/tests/data/transform/2-0.xml new file mode 100644 index 0000000..4a3718b --- /dev/null +++ b/tests/data/transform/2-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/3-0.xml b/tests/data/transform/3-0.xml new file mode 100644 index 0000000..8f45b92 --- /dev/null +++ b/tests/data/transform/3-0.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/4-0.xml b/tests/data/transform/4-0.xml new file mode 100644 index 0000000..b1cb7be --- /dev/null +++ b/tests/data/transform/4-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/5-0.xml b/tests/data/transform/5-0.xml new file mode 100644 index 0000000..b1cb7be --- /dev/null +++ b/tests/data/transform/5-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/6-0.xml b/tests/data/transform/6-0.xml new file mode 100644 index 0000000..9191b94 --- /dev/null +++ b/tests/data/transform/6-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/mwes.txt b/tests/data/transform/mwes.txt new file mode 100644 index 0000000..f0f0d29 --- /dev/null +++ b/tests/data/transform/mwes.txt @@ -0,0 +1,7 @@ +iemand zal de dans ontspringen +iemand zal de *dans ontspringen +iemand zal de +dans ontspringen +iemand zal 0de dans ontspringen +iemand zal de +*dans ontspringen +iemand zal de *+dans ontspringen +iemand zal de =dans ontspringen diff --git a/tests/data/transform/tree.xml b/tests/data/transform/tree.xml new file mode 100644 index 0000000..9c01073 --- /dev/null +++ b/tests/data/transform/tree.xml @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + iemand zal de dans ontspringen + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_expand.py b/tests/test_expand.py index c92e7f7..f976f56 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -1,11 +1,15 @@ import unittest +import os import xml.etree.ElementTree as ET from mwe_query import expand_index_nodes + class TextIndexExpansion(unittest.TestCase): + def data_path(self, filename): + return os.path.join(os.path.dirname(__file__), "data", filename) def test_no_infinite_loop(self): - with open('tests/data/expand/001.xml') as f: + with open(self.data_path('expand/001.xml')) as f: doc = ET.parse(f) expand_index_nodes(doc) diff --git a/tests/preprocess_test.py b/tests/test_preprocess.py similarity index 58% rename from tests/preprocess_test.py rename to tests/test_preprocess.py index e3cf818..d43dc72 100644 --- a/tests/preprocess_test.py +++ b/tests/test_preprocess.py @@ -1,16 +1,18 @@ -from canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \ +import unittest +from mwe_query.canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \ genvariants, trees2xpath, removesuperfluousindexes, newgenvariants, lowerpredm, relpronsubst, expandfull, \ - generatequeries, applyqueries, selfapplyqueries + generatequeries, applyqueries, selfapplyqueries, variable, com, noann import os import sys import lxml.etree as ET +from difflib import context_diff from treebankfunctions import getstree, getyield, indextransform, getyieldstr from alpinoparsing import parse -from lcat import expandnonheadwords +from mwe_query.lcat import expandnonheadwords -##DONE -##indextransform uitstellen +# DONE +# indextransform uitstellen # index meenemen # speciale behandeling voor bareindexnodes # varianten genereren @@ -24,6 +26,7 @@ comma = ',' tab = '\t' + def gettopnode(stree): for child in stree: if child.tag == 'node': @@ -31,68 +34,88 @@ def gettopnode(stree): return None -def main(): - inputfilename = r'./testdata/all_mwes_2022-08-22.txt' - base, ext = os.path.splitext(inputfilename) - outfilename = base + '_annotated' + ext - with open(inputfilename, 'r', encoding='utf8') as infile: - with open(outfilename, 'w', encoding='utf8') as outfile: - linenr = 0 - for idmwe in infile: - linenr += 1 - # skip header - if linenr == 1: - continue - idmwelist = idmwe.split(tab) - id = idmwelist[0] - mwe = idmwelist[1][:-1] - annotatedlist = preprocess_MWE(mwe) - wlist = [el[0] for el in annotatedlist] - annlist = [el[1] for el in annotatedlist] - wliststr = space.join(wlist) - annliststr = comma.join([str(i) for i in annlist]) - print(f'{mwe};{wliststr};{annliststr}', file=outfile) - b, sym = containsillegalsymbols(wliststr) - if b: - print(f'Illegal symbol {sym} in {wliststr}', file=sys.stderr) - -def mktreebank(dict, outfilename): - treebank = ET.Element('treebank') - for mwe in dict: - tree = parse(mwe) - treebank.append(tree) - - fulltreebank = ET.ElementTree(treebank) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - - - - -def test1(): - mwe = 'iemand zal blikken com:[met iemand] wisselen' - annotatedlist = preprocess_MWE(mwe) - print(annotatedlist) - - -def test2(): - mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] - mwes += ['iemand zal 0de dans ontspringen'] - mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen'] - mwes = ['iemand zal de =dans ontspringen'] - for mwe in mwes: +class TextIndexExpansion(unittest.TestCase): + def data_path(self, *paths): + return os.path.join(os.path.dirname(__file__), "data", *paths) + + def main(self): + inputfilename = self.data_path('all_mwes_2022-08-22.txt') + base, ext = os.path.splitext(inputfilename) + outfilename = base + '_annotated' + ext + with open(inputfilename, 'r', encoding='utf8') as infile: + with open(outfilename, 'w', encoding='utf8') as outfile: + linenr = 0 + for idmwe in infile: + linenr += 1 + # skip header + if linenr == 1: + continue + idmwelist = idmwe.split(tab) + id = idmwelist[0] + mwe = idmwelist[1][:-1] + annotatedlist = preprocess_MWE(mwe) + wlist = [el[0] for el in annotatedlist] + annlist = [el[1] for el in annotatedlist] + wliststr = space.join(wlist) + annliststr = comma.join([str(i) for i in annlist]) + print(f'{mwe};{wliststr};{annliststr}', file=outfile) + b, sym = self.containsillegalsymbols(wliststr) + if b: + print( + f'Illegal symbol {sym} in {wliststr}', file=sys.stderr) + + def mktreebank(self, dict, outfilename): + treebank = ET.Element('treebank') + for mwe in dict: + tree = parse(mwe) + treebank.append(tree) + + fulltreebank = ET.ElementTree(treebank) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def test_annotation(self): + mwe = 'iemand zal blikken com:[met iemand] wisselen' annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = strees[1] - mweparse = gettopnode(fullmweparse) - newtrees = transformtree(mweparse, annotations) - print(f'{mwe}:') - for newtree in newtrees: - ET.dump(newtree) - - -streestrings = {} -streestrings[1] = """ + assert annotatedlist == [ + ('iemand', variable), + ('zal', noann), + ('blikken', noann), + ('met', com), + ('iemand', com), + ('wisselen', noann)] + + def test_transform(self): + with open(self.data_path("transform", "mwes.txt"), encoding="utf-8", mode="r") as f: + mwes = f.readlines() + + i = 0 + for mwe in mwes: + if not mwe: + continue + + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + cleanmwe = space.join([el[0] for el in annotatedlist]) + fullmweparse = self.strees[1] + mweparse = gettopnode(fullmweparse) + newtrees = transformtree(mweparse, annotations) + j = 0 + for newtree in newtrees: + ET.indent(newtree) + actual = ET.tostring(newtree, encoding="unicode").splitlines(True) + with open(self.data_path("transform", f"{i}-{j}.xml"), encoding="utf-8", mode="r") as f: + expected = f.readlines() + diff = ''.join(context_diff(expected, actual)) + try: + assert not diff + except: + print(diff) + raise + j += 1 + i += 1 + + streestrings = {} + streestrings[1] = """ @@ -154,7 +177,7 @@ def test2(): """ -streestrings[2] = """ + streestrings[2] = """ @@ -228,7 +251,7 @@ def test2(): """ -streestrings[3] = """ + streestrings[3] = """ @@ -286,7 +309,7 @@ def test2(): """ -streestrings[4] = """ + streestrings[4] = """ @@ -321,7 +344,7 @@ def test2(): """ -streestrings[5] = """ + streestrings[5] = """ @@ -353,7 +376,7 @@ def test2(): """ -streestrings[6] = """ + streestrings[6] = """ @@ -382,400 +405,395 @@ def test2(): """ -strees = {i: ET.fromstring(streestrings[i]) for i in streestrings} - - -def containsillegalsymbols(mwe): - for el in annotationstrings: - if el in mwe: - return True, el - return (False, None) - - -def test3(): - lofs = [[1, 2], [3, 4], [5, 6]] - results = listofsets2setoflists(lofs) - for result in results: - print(result) - - -def getmwedict(intbfilename): - mwedict = {} - fulltreebank = getstree(intbfilename) - treebank = fulltreebank.getroot() - for stree in treebank: - keylist = getyield(stree) - key = space.join(keylist) - richstree = stree - #richstree = indextransform(stree) # put off because this should happen later - mwedict[key] = richstree - return mwedict - -def testrel(): - for i in {4,5,6}: - newstree = relpronsubst(strees[i]) - ET.dump(newstree) - -def test4(): - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - mwedict = getmwedict(intbfilename) - mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] - mwes += ['iemand zal 0de dans ontspringen'] - mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen'] - mwes += ['iemand zal de =dans ontspringen'] - mwes += ['dat mes zal aan twee kanten snijden'] - mwes += ['0nu zal de aap uit de mouw komen'] - mwes += ['iemand zal de schuld op zich nemen'] - mwes += ['iemand zal buiten zichzelf zijn'] - mwes += ['iemand zal veel in zijn mars hebben'] - mwes += ['bij nacht en ontijd'] - mwes += ['iemand zal blikken com:[met] iemand wisselen'] # still something wrong here - mwes += ['dd:[dat] mes zal aan twee kanten snijden'] - mwes += ['iets zal er inzitten'] - mwes += ['iemand zal in touw zijn'] - mwes += ['iemand zal aan iemand een *hekel hebben'] - mwes += ['iemand zal 0geen gras over iets laten groeien'] - #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ] - mwes += ['#door dik en dun'] - mwes += ['#ad patres'] - mwes += ['ad patres'] - mwes += ['iemand zal aan de kant #gaan'] - mwes += ['iemand zal aan de kant gaan'] - - for mwe in mwes: - annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = None - if cleanmwe in mwedict: - fullmweparse = mwedict[cleanmwe] - #ET.dump(fullmweparse) - elif mwe in mwedict: + @property + def strees(self): + return {i: ET.fromstring(self.streestrings[i]) for i in self.streestrings} + + def containsillegalsymbols(self, mwe): + for el in annotationstrings: + if el in mwe: + return True, el + return (False, None) + + def test_lofs(self): + lofs = [[1, 2], [3, 4], [5, 6]] + results = listofsets2setoflists(lofs) + assert results == [ + [1, 3, 5], + [1, 3, 6], + [1, 4, 5], + [1, 4, 6], + [2, 3, 5], + [2, 3, 6], + [2, 4, 5], + [2, 4, 6]] + + def getmwedict(self, intbfilename): + mwedict = {} + fulltreebank = getstree(intbfilename) + treebank = fulltreebank.getroot() + for stree in treebank: + keylist = getyield(stree) + key = space.join(keylist) + richstree = stree + # richstree = indextransform(stree) # put off because this should happen later + mwedict[key] = richstree + return mwedict + + def test_rel(self): + for i in {4, 5, 6}: + newstree = relpronsubst(self.strees[i]) + ET.dump(newstree) + + def test4(self): + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + mwedict = self.getmwedict(intbfilename) + mwes = ['iemand zal de dans ontspringen', + 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] + mwes += ['iemand zal 0de dans ontspringen'] + mwes += ['iemand zal de +*dans ontspringen', + 'iemand zal de *+dans ontspringen'] + mwes += ['iemand zal de =dans ontspringen'] + mwes += ['dat mes zal aan twee kanten snijden'] + mwes += ['0nu zal de aap uit de mouw komen'] + mwes += ['iemand zal de schuld op zich nemen'] + mwes += ['iemand zal buiten zichzelf zijn'] + mwes += ['iemand zal veel in zijn mars hebben'] + mwes += ['bij nacht en ontijd'] + # still something wrong here + mwes += ['iemand zal blikken com:[met] iemand wisselen'] + mwes += ['dd:[dat] mes zal aan twee kanten snijden'] + mwes += ['iets zal er inzitten'] + mwes += ['iemand zal in touw zijn'] + mwes += ['iemand zal aan iemand een *hekel hebben'] + mwes += ['iemand zal 0geen gras over iets laten groeien'] + #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ] + mwes += ['#door dik en dun'] + mwes += ['#ad patres'] + mwes += ['ad patres'] + mwes += ['iemand zal aan de kant #gaan'] + mwes += ['iemand zal aan de kant gaan'] + + for mwe in mwes: + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + cleanmwe = space.join([el[0] for el in annotatedlist]) + fullmweparse = None + if cleanmwe in mwedict: + fullmweparse = mwedict[cleanmwe] + # ET.dump(fullmweparse) + elif mwe in mwedict: + fullmweparse = mwedict[mwe] + if fullmweparse is not None: + mweparse = gettopnode(fullmweparse) + newtreesa = transformtree(mweparse, annotations) + newtrees = [] + for newtreea in newtreesa: + newtrees += genvariants(newtreea) + newtrees.extend(newtreesa) + print(f'{mwe}:') + for newtree in newtrees: + # print(f'{i+1}:') + print() + ET.dump(newtree) + else: + print(f'MWE <{cleanmwe}> not found ', file=sys.stderr) + + def mkoutfilename(self, infilename: str, suffix: str, ext=None) -> str: + basefilename, inext = os.path.splitext(infilename) + if ext is None: + ext = inext + result = basefilename + suffix + ext + return result + + def base_testfind(self, basemwe, xpath, mwedict, all=False): + results = [] + localxpath = "." + xpath + for mwe in mwedict: + origmwetree = mwedict[mwe] + mwetree = lowerpredm(origmwetree) + mweyield = getyield(mwetree) + mwestr = space.join(mweyield) + # ET.dump(mwetree) + # print(f'mwe={mwe}') + # print(f'xpath:\n{localxpath}\n') + mwehits = mwetree.xpath(localxpath) + newresult = (basemwe, mwe, len(mwehits)) + results.append(newresult) + + for basemwe, mwe, count in results: + if all: + cond = True + else: + cond = (basemwe == mwe and count != 1) or ( + basemwe != mwe and count != 0) + if cond: + print(basemwe, mwe, count, file=sys.stderr) + + @unittest.skip("slooooow") + def test5(self): + reportevery = 500 + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + mwedict = self.getmwedict(intbfilename) + # next one is problematic, so we delete it + problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen' + if problemmwe in mwedict: + del mwedict[problemmwe] + #mwedict = {} + #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2] + #mwedict['iemand zal ’m smeren'] = strees[3] + # for ind, tree in expandedmwedict.items(): + # print(ind) + # ET.dump(tree) + suffix = '_trees' + outfilename = self.mkoutfilename(intbfilename, suffix) + # with open(outfilename, 'w', encoding='utf8') as outfile: + treebank = ET.Element('treebank') + inds = ['iemand zal uit iemands koker komen'] + inds = ['iemand zal slechte invloed op iemand hebben'] + inds = ['iemand zal met de pet naar iets gooien'] + inds = ['de tale Kanaäns'] + inds += ['heel af en toe'] + inds += ['na verloop van tijd'] + inds += ['al doende zal men leren'] + inds = ['iemand zal de schuld van iets op iemand schuimwedicven'] + inds += ['iemand zal iets door de vingers zien'] + inds += ['iemand zal achter iets komen'] + inds += ['iemand zal uit iemands koker komen'] + inds += ['al doende zal men leren'] + # inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions + inds += ['te dom om voor de duivel te dansen'] + inds += ['zo doof als een kwartel'] + inds = ['iemand zal veel ellende over iemand uitstorten'] + #mwedict = {ind: mwedict[ind] for ind in inds} + expandedmwedict = {mwe: indextransform( + tree) for mwe, tree in mwedict.items()} + counter = 0 + for mwe in mwedict: + counter += 1 + mwe_element = ET.Element('mwe', attrib={'mwe': mwe}) + #print(mwe, file=sys.stderr) + if counter % reportevery == 0: + print(counter, file=sys.stderr) + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + #cleanmwe = space.join([el[0] for el in annotatedlist]) fullmweparse = mwedict[mwe] - if fullmweparse is not None: mweparse = gettopnode(fullmweparse) + # if mweparse is None: + # #print(f'\n\n{mwe}:', file=outfile) + # #print('None') + # continue + treeyield = getyield(mweparse) + treeyieldstr = space.join(treeyield) + if treeyieldstr != mwe: + print(f'mismatch:\n{treeyieldstr}=/={mwe} ') + continue newtreesa = transformtree(mweparse, annotations) newtrees = [] for newtreea in newtreesa: - newtrees += genvariants(newtreea) - newtrees.extend(newtreesa) - print(f'{mwe}:') - for newtree in newtrees: - #print(f'{i+1}:') - print() - ET.dump(newtree) - else: - print(f'MWE <{cleanmwe}> not found ', file=sys.stderr) - -def mkoutfilename(infilename: str, suffix: str, ext=None) -> str: - basefilename, inext = os.path.splitext(infilename) - if ext is None: - ext = inext - result = basefilename + suffix + ext - return result - -def testfind(basemwe, xpath, mwedict, all=False): - results = [] - localxpath = "." + xpath - for mwe in mwedict: - origmwetree = mwedict[mwe] - mwetree = lowerpredm(origmwetree) - mweyield = getyield(mwetree) - mwestr = space.join(mweyield) - #ET.dump(mwetree) - #print(f'mwe={mwe}') - #print(f'xpath:\n{localxpath}\n') - mwehits = mwetree.xpath(localxpath) - newresult = (basemwe, mwe, len(mwehits)) - results.append(newresult) - - - for basemwe, mwe, count in results: - if all: - cond = True - else: - cond = (basemwe == mwe and count!=1) or (basemwe != mwe and count!=0) - if cond: - print(basemwe, mwe, count, file=sys.stderr ) - - - -def test5(): - reportevery = 500 - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - mwedict = getmwedict(intbfilename) - # next one is problematic, so we delete it - problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen' - if problemmwe in mwedict: - del mwedict[problemmwe] - #mwedict = {} - #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2] - #mwedict['iemand zal ’m smeren'] = strees[3] - #for ind, tree in expandedmwedict.items(): - # print(ind) - # ET.dump(tree) - suffix = '_trees' - outfilename = mkoutfilename(intbfilename, suffix) -# with open(outfilename, 'w', encoding='utf8') as outfile: - treebank = ET.Element('treebank') - inds = ['iemand zal uit iemands koker komen'] - inds = ['iemand zal slechte invloed op iemand hebben'] - inds = ['iemand zal met de pet naar iets gooien'] - inds = ['de tale Kanaäns'] - inds += ['heel af en toe'] - inds += ['na verloop van tijd'] - inds += ['al doende zal men leren'] - inds = ['iemand zal de schuld van iets op iemand schuimwedicven'] - inds += ['iemand zal iets door de vingers zien'] - inds += ['iemand zal achter iets komen'] - inds += ['iemand zal uit iemands koker komen'] - inds += ['al doende zal men leren'] - #inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions - inds += ['te dom om voor de duivel te dansen'] - inds += ['zo doof als een kwartel'] - inds = ['iemand zal veel ellende over iemand uitstorten'] - #mwedict = {ind: mwedict[ind] for ind in inds} - expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} - counter = 0 - for mwe in mwedict: - counter += 1 - mwe_element = ET.Element('mwe', attrib={'mwe': mwe}) - #print(mwe, file=sys.stderr) - if counter % reportevery == 0: - print(counter, file=sys.stderr) - annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - #cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = mwedict[mwe] - mweparse = gettopnode(fullmweparse) - #if mweparse is None: - # #print(f'\n\n{mwe}:', file=outfile) - # #print('None') - # continue - treeyield = getyield(mweparse) - treeyieldstr = space.join(treeyield) - if treeyieldstr != mwe: - print(f'mismatch:\n{treeyieldstr}=/={mwe} ' ) - continue - newtreesa = transformtree(mweparse, annotations) - newtrees = [] - for newtreea in newtreesa: - newtrees += newgenvariants(newtreea) - #newtrees.extend(newtreesa) - #print(f'\n\n{mwe}:', file=outfile) - cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees] - #cleantrees = newtrees - #print('cleantrees:') - #for cleantree in cleantrees: - # ET.dump(cleantree) - mwe_element.extend(cleantrees) - xpath = trees2xpath(cleantrees, expanded=True) - #print(xpath) - xpath_element = ET.Element('xpath') - xpath_element.text = xpath - mwe_element.append(xpath_element) - treebank.append(mwe_element) - testfind(mwe, xpath, expandedmwedict) - #ET.dump(treebank) - # for newtree in newtrees: - # #print(f'{i+1}:') - # print() - # treebank.append(newtree) - fulltreebank = ET.ElementTree(treebank) - #ET.indent(newtree, space=" ") - #print(ET.tostring(newtree), file=outfile) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - -def check(treebankdict): - for utt, stree in treebankdict.items(): - for node in stree.iter(): - if 'pt' in node.attrib: - for att in {'begin','end'}: - if 'id' in node.attrib: - id = node.attrib['id'] - else: - id = 'None' - if att not in node.attrib: - print(f'missing {att} in node with id={id}, pt={node.attrib["pt" ]}.') - ET.dump(stree) - - -def getutts(infilename): - #each utterance on a separate line, discard the final \n and skip empty lines - infile = open(infilename, 'r', encoding='utf8') - rawutts = infile.readlines() - utts = [rawutt[:-1] for rawutt in rawutts if len(rawutt) > 1] - return utts - - -def testvariatie(): - mwetreebank = './testdata/mwesvoorvariatie-noann_treebank.xml' - mwedict = getmwedict(mwetreebank) - #expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} - testtreebankfilename = './testdata/testzinnen mwevarianten_treebank.xml' - fullvariationtreebank = getstree(testtreebankfilename) - variationtreebank = fullvariationtreebank.getroot() - variationtreebankdict= {getyieldstr(tree): expandfull(tree) for tree in variationtreebank} - #check(variationtreebankdict) - annotatedmwefilename = './testdata/mwesvoorvariatie-annotated.txt' - annotatedmwes = getutts(annotatedmwefilename) - suffix = '_derivedtrees' - outfilename = mkoutfilename(mwetreebank, suffix) - treebank = ET.Element('treebank') - counter = 0 - reportevery = 500 - - #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal aan 0de *+dans ontspringen'] - #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal de plaat poetsen'] - for rawmwe in annotatedmwes: - counter += 1 - mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe}) - #print(mwe, file=sys.stderr) - if counter % reportevery == 0: - print(counter, file=sys.stderr) - annotatedlist = preprocess_MWE(rawmwe) - annotations = [el[1] for el in annotatedlist] - mweparts = [el[0] for el in annotatedlist] - mwe = space.join(mweparts) - fullmweparse = mwedict[mwe] - mweparse = gettopnode(fullmweparse) - #if mweparse is None: - # #print(f'\n\n{mwe}:', file=outfile) - # #print('None') - # continue - treeyield = getyield(mweparse) - treeyieldstr = space.join(treeyield) - if treeyieldstr != mwe: - print(f'mismatch:\n{treeyieldstr}=/={mwe} ' ) - continue - newtreesa = transformtree(mweparse, annotations) - newtrees = [] - for newtreea in newtreesa: - newtrees += newgenvariants(newtreea) - #newtrees.extend(newtreesa) - #print(f'\n\n{mwe}:', file=outfile) - cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees] - #cleantrees = newtrees - #print('cleantrees:') - #for cleantree in cleantrees: - # ET.dump(cleantree) - mwe_element.extend(cleantrees) - xpath = trees2xpath(cleantrees, expanded=True) - #print(xpath) - xpath_element = ET.Element('xpath') - xpath_element.text = xpath - mwe_element.append(xpath_element) - treebank.append(mwe_element) - testfind(mwe, xpath, variationtreebankdict) - #ET.dump(treebank) - # for newtree in newtrees: - # #print(f'{i+1}:') - # print() - # treebank.append(newtree) - fulltreebank = ET.ElementTree(treebank) - #ET.indent(newtree, space=" ") - #print(ET.tostring(newtree), file=outfile) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - -def gentreebank(): - #generate a new treebank because a new parser is being used - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - suffix = '_parse2022-11-18' - outfilename = mkoutfilename(intbfilename, suffix) - mwedict = getmwedict(intbfilename) - mktreebank(mwedict, outfilename) - - - -def genqueries(): - """ - Generates queries in a file (with suffix _querytriples) in the - testdata folder for a mwe treebank in the same folder and applies - the queries on all the mwe tree in the treebank. - - It will generate output on the console for every mwe and reports - differences from what was expected. - """ - - # if True it will only check whether the queries for an mwe find - # a match in the parsed tree of the canonical form of the mwe - # (that is a minimal requirement for an xpath query) - selftest = False - - # test these specific MWEs - mwes = [ 'iemand zal een poging doen', 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen'] - mwes += ['iemand zal *honger hebben'] - #mwes = ['iemand zal 0een *+poging doen'] - - # Jan Odijk: - # "Oorspronkelijk werkte ik met een bestand gedownload uit GreTEL 4 (MWE20220429_corpus2alpino), - # maar de parse hiervan bleken anders te zijn dan in de huidige versie, - # dus heb ik een nieuwe treebank gegenereerd: MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml." - #intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml' - suffix = '_querytriples' - outfilename = mkoutfilename(intbfilename, suffix) - mwedict = getmwedict(intbfilename) - #selectedmwe = 'af en toe' - mwes = [mwe for mwe, _ in mwedict.items() ] - # mwes = ['iemand zal 0een *+poging doen'] - # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten'] - # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald'] - # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken'] - # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden'] - # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging'] - # mwes += ['dd:[dat] zelfde liedje'] - # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten'] - # mwes += ['iemand zal veel ellende over iemand uitstorten'] - # mwes += ['iemand zal aanhangen als een klis'] - # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren', - # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn', - # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan'] - # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn', - # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen'] - # mwes += ['iemand zal buiten zichzelf zijn'] - # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn', - # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen', - # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter', - # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat', - # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets', - # 'wie een hond wil slaan zal licht een stok vinden', - # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen'] - #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen'] - #mwes += ['het zal zaliger zijn te geven dan te ontvangen'] - with open(outfilename, 'w', encoding='utf8') as outfile: - for mwe in mwes: - print(mwe) - (mweq, nearmissq, supersetq) = generatequeries(mwe) - print(f'\n{mwe}:', file=outfile) - print(f'mweq:\n{mweq}', file=outfile) - - print(f'nearmissq:\n{nearmissq}', file=outfile) + newtrees += newgenvariants(newtreea) + # newtrees.extend(newtreesa) + #print(f'\n\n{mwe}:', file=outfile) + cleantrees = [removesuperfluousindexes( + newtree) for newtree in newtrees] + #cleantrees = newtrees + # print('cleantrees:') + # for cleantree in cleantrees: + # ET.dump(cleantree) + mwe_element.extend(cleantrees) + xpath = trees2xpath(cleantrees, expanded=True) + # print(xpath) + xpath_element = ET.Element('xpath') + xpath_element.text = xpath + mwe_element.append(xpath_element) + treebank.append(mwe_element) + self.base_testfind(mwe, xpath, expandedmwedict) + # ET.dump(treebank) + # for newtree in newtrees: + # #print(f'{i+1}:') + # print() + # treebank.append(newtree) + fulltreebank = ET.ElementTree(treebank) + #ET.indent(newtree, space=" ") + #print(ET.tostring(newtree), file=outfile) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def check(self, treebankdict): + for utt, stree in treebankdict.items(): + for node in stree.iter(): + if 'pt' in node.attrib: + for att in {'begin', 'end'}: + if 'id' in node.attrib: + id = node.attrib['id'] + else: + id = 'None' + if att not in node.attrib: + print( + f'missing {att} in node with id={id}, pt={node.attrib["pt" ]}.') + ET.dump(stree) + + def getutts(self, infilename): + # each utterance on a separate line, discard the final \n and skip empty lines + with open(infilename, 'r', encoding='utf8') as infile: + rawutts = infile.readlines() + utts = [rawutt[:-1] for rawutt in rawutts if len(rawutt) > 1] + return utts + + @unittest.skip("not deterministic") + def test_variatie(self): + mwetreebank = self.data_path('mwesvoorvariatie-noann_treebank.xml') + mwedict = self.getmwedict(mwetreebank) + #expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} + testtreebankfilename = self.data_path('testzinnen mwevarianten_treebank.xml') + fullvariationtreebank = getstree(testtreebankfilename) + variationtreebank = fullvariationtreebank.getroot() + variationtreebankdict = {getyieldstr(tree): expandfull( + tree) for tree in variationtreebank} + # check(variationtreebankdict) + annotatedmwefilename = self.data_path('mwesvoorvariatie-annotated.txt') + annotatedmwes = self.getutts(annotatedmwefilename) + suffix = '_derivedtrees' + outfilename = self.mkoutfilename(mwetreebank, suffix) + treebank = ET.Element('treebank') + counter = 0 + reportevery = 500 + + #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal aan 0de *+dans ontspringen'] + #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal de plaat poetsen'] + for rawmwe in annotatedmwes: + counter += 1 + mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe}) + #print(mwe, file=sys.stderr) + if counter % reportevery == 0: + print(counter, file=sys.stderr) + annotatedlist = preprocess_MWE(rawmwe) + annotations = [el[1] for el in annotatedlist] + mweparts = [el[0] for el in annotatedlist] + mwe = space.join(mweparts) + fullmweparse = mwedict[mwe] + mweparse = gettopnode(fullmweparse) + # if mweparse is None: + # #print(f'\n\n{mwe}:', file=outfile) + # #print('None') + # continue + treeyield = getyield(mweparse) + treeyieldstr = space.join(treeyield) + if treeyieldstr != mwe: + print(f'mismatch:\n{treeyieldstr}=/={mwe} ') + continue + newtreesa = transformtree(mweparse, annotations) + newtrees = [] + for newtreea in newtreesa: + newtrees += newgenvariants(newtreea) + # newtrees.extend(newtreesa) + #print(f'\n\n{mwe}:', file=outfile) + cleantrees = [removesuperfluousindexes( + newtree) for newtree in newtrees] + #cleantrees = newtrees + # print('cleantrees:') + # for cleantree in cleantrees: + # ET.dump(cleantree) + mwe_element.extend(cleantrees) + xpath = trees2xpath(cleantrees, expanded=True) + # print(xpath) + xpath_element = ET.Element('xpath') + xpath_element.text = xpath + mwe_element.append(xpath_element) + treebank.append(mwe_element) + self.base_testfind(mwe, xpath, variationtreebankdict) + # ET.dump(treebank) + # for newtree in newtrees: + # #print(f'{i+1}:') + # print() + # treebank.append(newtree) + fulltreebank = ET.ElementTree(treebank) + #ET.indent(newtree, space=" ") + #print(ET.tostring(newtree), file=outfile) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def gentreebank(self): + # generate a new treebank because a new parser is being used + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + suffix = '_parse2022-11-18' + outfilename = self.mkoutfilename(intbfilename, suffix) + mwedict = self.getmwedict(intbfilename) + self.mktreebank(mwedict, outfilename) + + def genqueries(self): + selftest = False + mwes = ['iemand zal een poging doen', + 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen'] + mwes += ['iemand zal *honger hebben'] + #mwes = ['iemand zal 0een *+poging doen'] + #intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml') + suffix = '_querytriples' + outfilename = self.mkoutfilename(intbfilename, suffix) + mwedict = self.getmwedict(intbfilename) + #selectedmwe = 'af en toe' + mwes = [mwe for mwe, _ in mwedict.items()] + # mwes = ['iemand zal 0een *+poging doen'] + # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten'] + # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald'] + # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken'] + # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden'] + # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging'] + # mwes += ['dd:[dat] zelfde liedje'] + # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten'] + # mwes += ['iemand zal veel ellende over iemand uitstorten'] + # mwes += ['iemand zal aanhangen als een klis'] + # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren', + # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn', + # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan'] + # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn', + # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen'] + # mwes += ['iemand zal buiten zichzelf zijn'] + # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn', + # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen', + # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter', + # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat', + # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets', + # 'wie een hond wil slaan zal licht een stok vinden', + # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen'] + #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen'] + #mwes += ['het zal zaliger zijn te geven dan te ontvangen'] + with open(outfilename, 'w', encoding='utf8') as outfile: + for mwe in mwes: + print(mwe) + (mweq, nearmissq, supersetq) = generatequeries(mwe) + print(f'\n{mwe}:', file=outfile) + print(f'mweq:\n{mweq}', file=outfile) - print(f'supersetq:\n{supersetq}', file= outfile) + print(f'nearmissq:\n{nearmissq}', file=outfile) - annotatedlist = preprocess_MWE(mwe) - #annotations = [el[1] for el in annotatedlist] - mweparts = [el[0] for el in annotatedlist] - utt = space.join(mweparts) + print(f'supersetq:\n{supersetq}', file=outfile) - if selftest: - # #self test - (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries(utt, mweq, nearmissq, supersetq) - if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1: - print(f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}') - else: - results = applyqueries(mwedict, mwe, mweq, nearmissq, supersetq) - - -if __name__ == '__main__': - # main() - # test1() - #test2() - #test3() - #test4() - #test5() - #testrel() - #testvariatie() - #gentreebank() - genqueries() + annotatedlist = preprocess_MWE(mwe) + #annotations = [el[1] for el in annotatedlist] + mweparts = [el[0] for el in annotatedlist] + utt = space.join(mweparts) + + if selftest: + # #self test + (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries( + utt, mweq, nearmissq, supersetq) + if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1: + print( + f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}') + else: + results = applyqueries( + mwedict, mwe, mweq, nearmissq, supersetq) + + +# if __name__ == '__main__': +# # main() +# # test1() +# #test2() +# #test3() +# #test4() +# #test5() +# #testrel() +# #testvariatie() +# #gentreebank() +# genqueries() diff --git a/tests/update_outputs.py b/tests/update_outputs.py index 48fccd2..d0559fb 100755 --- a/tests/update_outputs.py +++ b/tests/update_outputs.py @@ -7,6 +7,7 @@ import sys from os import path import glob +import lxml.etree as ET testdir = path.dirname(__file__) datadir = path.join(testdir, "data") @@ -14,34 +15,35 @@ # import this implementation sys.path.insert(0, path.join(testdir, "..")) from mwe_query import Mwe +from mwe_query.canonicalform import preprocess_MWE, transformtree -def datapath(filename): - return path.join(datadir, filename) +def datapath(dirname, filename): + return path.join(datadir, dirname, filename) -def read(filename): - with open(datapath(filename)) as f: +def read(dirname, filename): + with open(datapath(dirname, filename)) as f: return f.read() -def write(filename, content): - with open(datapath(filename), "w") as f: +def write(dirname, filename, content): + with open(datapath(dirname, filename), "w") as f: f.write(content) -def update(basename): - lines = read(basename + ".txt").splitlines() +def update_generate(basename): + lines = read("generate", basename + ".txt").splitlines() can_form = lines[0].strip() sentence = lines[1].strip() alpino_xml_filename = basename + ".xml" - if not path.exists(datapath(alpino_xml_filename)): + if not path.exists(datapath("generate", alpino_xml_filename)): print("parsing") alpino_xml = parse_sentence(can_form) - write(alpino_xml_filename, alpino_xml) + write("generate", alpino_xml_filename, alpino_xml) else: - alpino_xml = read(alpino_xml_filename) + alpino_xml = read("generate", alpino_xml_filename) mwe = Mwe(sentence) mwe.set_tree(alpino_xml) @@ -50,10 +52,37 @@ def update(basename): queries = mwe.generate_queries() for query in queries: - write(f"{basename}-{query.rank}.xpath", query.xpath) + write("generate", f"{basename}-{query.rank}.xpath", query.xpath) -input_files = glob.glob(path.join(datadir, '*.txt')) +def gettopnode(stree): + for child in stree: + if child.tag == 'node': + return child + return None + + +def update_transform(): + mwes = read("transform", "mwes.txt").splitlines() + + i = 0 + for mwe in mwes: + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + fullmweparse = ET.fromstring(read("transform", "tree.xml")) + mweparse = gettopnode(fullmweparse) + newtrees = transformtree(mweparse, annotations) + + j = 0 + for newtree in newtrees: + ET.indent(newtree) + write("transform", f"{i}-{j}.xml", ET.tostring(newtree, encoding="unicode")) + j += 1 + + i += 1 + +input_files = glob.glob(path.join(datadir, "generate", '*.txt')) for input in input_files: head, ext = path.splitext(path.basename(input)) - update(head) + update_generate(head) +update_transform()