From d92d1eb72be637f8e58ae12b75760dd277fdd517 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Wed, 30 Nov 2022 16:47:48 +0100 Subject: [PATCH 1/4] Fix linting issues --- mwe_query/adpositions.py | 64 ++--- mwe_query/canonicalform.py | 449 +++++++++++++++++++++--------------- mwe_query/getalpinomwus.py | 14 +- mwe_query/indextransform.py | 4 +- mwe_query/lcat.py | 25 +- 5 files changed, 319 insertions(+), 237 deletions(-) diff --git a/mwe_query/adpositions.py b/mwe_query/adpositions.py index eb60461..fb11418 100644 --- a/mwe_query/adpositions.py +++ b/mwe_query/adpositions.py @@ -1,35 +1,35 @@ circumpositions = [ -('aan', 'toe'), -('achter', 'aan'), -('bij', 'af'), -('bij', 'na'), -('bij', 'thuis'), -('bij', 'vandaan'), -('boven', 'uit'), -('buiten', 'om'), -('door', 'heen'), -('met', 'mee'), -('naar', 'toe'), -('om', 'heen'), -('onder', 'door'), -('onder', ' uit'), -('onder', 'vandaan'), -('op', 'af'), -('op', 'na'), -('over', 'heen'), -('tegen', 'aan'), -('tegen', 'in'), -('tegen', 'op'), -('tot', 'toe'), -('tussen', 'door'), -('tussen', 'in'), -('uit', 'vandaan'), -('van', 'af'), -('van', 'uit'), -('van', 'vandaan'), -('voor', 'aan'), -('voor', 'langs'), -('voor', 'uit') + ('aan', 'toe'), + ('achter', 'aan'), + ('bij', 'af'), + ('bij', 'na'), + ('bij', 'thuis'), + ('bij', 'vandaan'), + ('boven', 'uit'), + ('buiten', 'om'), + ('door', 'heen'), + ('met', 'mee'), + ('naar', 'toe'), + ('om', 'heen'), + ('onder', 'door'), + ('onder', ' uit'), + ('onder', 'vandaan'), + ('op', 'af'), + ('op', 'na'), + ('over', 'heen'), + ('tegen', 'aan'), + ('tegen', 'in'), + ('tegen', 'op'), + ('tot', 'toe'), + ('tussen', 'door'), + ('tussen', 'in'), + ('uit', 'vandaan'), + ('van', 'af'), + ('van', 'uit'), + ('van', 'vandaan'), + ('voor', 'aan'), + ('voor', 'langs'), + ('voor', 'uit') ] -vzazindex = {vz+az: (vz, az) for (vz, az) in circumpositions} \ No newline at end of file +vzazindex = {vz+az: (vz, az) for (vz, az) in circumpositions} diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index 389cfc8..a536428 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -1,9 +1,9 @@ -from typing import List, Optional, Set, Tuple +from typing import List, Optional, Set from sastatypes import SynTree import re import sys from treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \ - getindexednodesmap, getbasicindexednodesmap, clausecats, clausebodycats + getindexednodesmap, getbasicindexednodesmap, clausebodycats import lxml.etree as ET import copy @@ -20,15 +20,15 @@ Condition = str - altsym = '|' -annotationstrings = {'0', '+*', '*+', '+', '*', 'dd:[', ']', '<', '>', '|', '=', '#' } +annotationstrings = {'0', '+*', '*+', '+', + '*', 'dd:[', ']', '<', '>', '|', '=', '#'} start_state, invbl_state, dd_state, com_state, dr_state = 0, 1, 2, 3, 4 noann, modifiable, inflectable, modandinfl, variable, bound, dd, invariable, zero, com, \ -literal, unmodifiable, unmodandinfl, dr = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + literal, unmodifiable, unmodandinfl, dr = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 notop, itop, parenttop = 0, 1, 2 @@ -39,7 +39,8 @@ nomodanns = {unmodifiable, unmodandinfl} zichlemmas = ['me', 'mij', 'je', 'zich', 'ons'] -zichzelflemmas = ['mezelf', 'mijzelf', 'jezelf', 'jouzelf', 'zichzelf', 'onszelf' ] +zichzelflemmas = ['mezelf', 'mijzelf', + 'jezelf', 'jouzelf', 'zichzelf', 'onszelf'] zijnlemmas = ['mijn', 'jouw', 'zijn', 'ons', 'jullie', 'je'] defdets = {'de', 'het', 'deze', 'die', 'dit', 'dat'} defRpronouns = {'er', 'hier', 'daar'} @@ -50,8 +51,8 @@ vblnode = """(not(@word) and not(@pt) and count(node)=0)""" npmodppidxpath = \ - f""".//node[@cat="np" and - node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and + f""".//node[@cat="np" and + node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and ../node[@rel="hd" and @pt="ww"]]/@id""" vobj1nodeidxpath = f'.//node[@rel="obj1" and {vblnode} and ../node[@rel="hd" and @pt="ww"]]/@id' @@ -59,30 +60,35 @@ coreproperties = ['rel', 'pt', 'cat', 'lemma'] -#maybe make this dependent on the pt (nominal (getal inherent), verbal (getal niet inherent) -inherentinflproperties = ['wvorm', 'pvtijd', 'getal-n', 'getal', 'persoon', 'graad'] -contextualinflproperties = [ 'positie', 'pvagr', 'buiging', 'naamval', 'npagr'] +# maybe make this dependent on the pt (nominal (getal inherent), verbal (getal niet inherent) +inherentinflproperties = ['wvorm', 'pvtijd', + 'getal-n', 'getal', 'persoon', 'graad'] +contextualinflproperties = ['positie', + 'pvagr', 'buiging', 'naamval', 'npagr'] inflproperties = inherentinflproperties + contextualinflproperties -subcatproperties = [ 'ntype', 'genus', 'numtype', 'vwtype', 'lwtype', 'vztype', 'conjtype', 'spectype' ] +subcatproperties = ['ntype', 'genus', 'numtype', + 'vwtype', 'lwtype', 'vztype', 'conjtype', 'spectype'] defaultinhinflvalues = {'wvorm': {'inf', 'pv'}, 'pvtijd': {'tgw'}, 'getal-n': {''}, 'getal': {'ev'}, 'persoon': {'3'}, 'graad': {'basis'}} xpathproperties = ['axis'] -pobj1node = ET.Element('node', attrib={'rel': 'pobj1', 'pt': 'vnw' }) +pobj1node = ET.Element('node', attrib={'rel': 'pobj1', 'pt': 'vnw'}) vcnode = ET.Element('node', attrib={'rel': 'vc'}) -de_lw = ET.Element('node', attrib={'lemma': 'de', 'pt':'lw'}) -het_lw = ET.Element('node', attrib={'lemma': 'het', 'pt':'lw'}) -van_vz = ET.Element('node', attrib={'lemma': 'van', 'pt': 'vz', 'vztype': 'init'}) -dummymod = ET.Element('node', attrib={'rel': 'mod', 'pt': 'dummy', 'begin': '0', 'end': '0', 'word':'dummy'}) +de_lw = ET.Element('node', attrib={'lemma': 'de', 'pt': 'lw'}) +het_lw = ET.Element('node', attrib={'lemma': 'het', 'pt': 'lw'}) +van_vz = ET.Element( + 'node', attrib={'lemma': 'van', 'pt': 'vz', 'vztype': 'init'}) +dummymod = ET.Element('node', attrib={ + 'rel': 'mod', 'pt': 'dummy', 'begin': '0', 'end': '0', 'word': 'dummy'}) -def orconds(att:str, vals:List[str]) -> str: +def orconds(att: str, vals: List[str]) -> str: condlist = [f'@{att}="{val}"' for val in vals] if len(condlist) > 1: - result = ' or '.join(condlist) + result = ' or '.join(condlist) elif len(condlist) == 1: result = condlist[0] else: @@ -90,6 +96,7 @@ def orconds(att:str, vals:List[str]) -> str: endresult = '(' + result + ')' return endresult + def alts(ls: List[str]) -> str: result = altsym.join(ls) return result @@ -97,26 +104,28 @@ def alts(ls: List[str]) -> str: clausebodycatalts = orconds('cat', clausebodycats) + def selectinherentproperties(node): result = [] for att in node.attrib: if att in defaultinhinflvalues: nodeval = node.attrib[att] defvals = defaultinhinflvalues[att] - if nodeval not in defvals: + if nodeval not in defvals: result.append(att) return result - def nodecopy(node): newnode = ET.Element('node') for att, val in node.attrib.items(): newnode.attrib[att] = val return newnode + def tokenize(sentence): - sentence = re.sub(r'([\.\,\;\?!\(\)\"\\\/])', r' \1 ', sentence) # ':' removed + sentence = re.sub(r'([\.\,\;\?!\(\)\"\\\/])', + r' \1 ', sentence) # ':' removed sentence = re.sub(r'(\.\s+\.\s+\.)', r' ... ', sentence) sentence = re.sub(r'^\s*(.*?)\s*$', r'\1', sentence) sentence = re.sub(r'\s+', r' ', sentence) @@ -135,7 +144,8 @@ def listofsets2setoflists(listofset): resultset.append(newresult) return resultset -def preprocess_MWE(rawmwe): + +def preprocess_MWE(rawmwe): # noqa: C901 mwe = mwenormalise(rawmwe) can_form = tokenize(mwe) ann_list = [] @@ -219,10 +229,9 @@ def preprocess_MWE(rawmwe): exit(-1) ann_list.append((newword, newann)) - - return ann_list + def mwenormalise(rawmwe): result = rawmwe result = re.sub(r'(?i)iemand\s*\|\s*iets', 'iemand|iets', result) @@ -236,6 +245,7 @@ def mwenormalise(rawmwe): stateprops[com_state] = (']', com) stateprops[invbl_state] = ('>', invariable) + def mwstate(word, instate): if word == stateprops[instate][0]: newstate = start_state @@ -251,10 +261,12 @@ def mwstate(word, instate): newword = word return (newword, newann, newstate) + def mincopynode(node: SynTree) -> SynTree: newnode = attcopy(node, ['rel', 'pt', 'cat']) return newnode + def mkresults(node, childslist): results = [] for childs in childslist: @@ -264,15 +276,18 @@ def mkresults(node, childslist): results.append(newnode) return results + def getchild(stree: SynTree, rel: str) -> Optional[SynTree]: for child in stree: if gav(child, 'rel') == rel: return child return None + def mknode(): return ET.Element('node') + def all_leaves(stree: SynTree, annotations: List[Annotation], allowedannotations: Set[Annotation]) -> bool: leaves = getnodeyield(stree) for leave in leaves: @@ -281,6 +296,7 @@ def all_leaves(stree: SynTree, annotations: List[Annotation], allowedannotations return False return True + def headmodifiable(stree, mwetop, annotations): head = getchild(stree, 'hd') if terminal(head): @@ -294,22 +310,27 @@ def headmodifiable(stree, mwetop, annotations): print(f'Illegal value for mwetop={mwetop}', file=sys.stderr) result = False else: - print(f'Index out of range: {beginint} in {annotations}', file=sys.stderr) + print( + f'Index out of range: {beginint} in {annotations}', file=sys.stderr) result = False else: # can now only be node with cat=mwu mwps = getnodeyield(head) if mwetop == notop: - result = any([annotations[int(gav(mwp, 'begin'))] in modanns for mwp in mwps]) + result = any([annotations[int(gav(mwp, 'begin'))] + in modanns for mwp in mwps]) elif mwetop in {itop, parenttop}: - result = any([annotations[int(gav(mwp, 'begin'))] not in nomodanns for mwp in mwps]) + result = any([annotations[int(gav(mwp, 'begin'))] + not in nomodanns for mwp in mwps]) else: print(f'Illegal value for mwetop={mwetop}', file=sys.stderr) result = False return result -def attcopy(sourcenode:SynTree, atts:List[str]) -> SynTree: + +def attcopy(sourcenode: SynTree, atts: List[str]) -> SynTree: targetnode = mknode() - extatts = atts + ['id', 'index'] # we always copy the 'id' and 'index'attributes, needed for conditions, perhaps not needed anymorre + # we always copy the 'id' and 'index'attributes, needed for conditions, perhaps not needed anymorre + extatts = atts + ['id', 'index'] for att in extatts: if att in sourcenode.attrib: if att == 'word': @@ -328,9 +349,11 @@ def zerochildrencount(stree, annotations): if annotations[intbegin] == zero: result += 1 else: - print(f'Index out of range: {intbegin} in {annotations}', file=sys.stderr) + print( + f'Index out of range: {intbegin} in {annotations}', file=sys.stderr) return result + def mknewnode(stree, mwetop, atts, annotations): newnode = attcopy(stree, atts) if not headmodifiable(stree, mwetop, annotations): @@ -340,13 +363,15 @@ def mknewnode(stree, mwetop, atts, annotations): newnode.attrib['maxnodecount'] = f'{len(stree)}' return newnode + def zullenheadclause(stree: SynTree) -> bool: if stree.tag == 'node': cat = gav(stree, 'cat') head = getchild(stree, 'hd') headlemma = gav(head, 'lemma') headpt = gav(head, 'pt') - result = cat in {'smain', 'sv1'} and headlemma == 'zullen' and headpt == 'ww' + result = cat in { + 'smain', 'sv1'} and headlemma == 'zullen' and headpt == 'ww' else: result = False return result @@ -379,7 +404,8 @@ def zullenheadclause(stree: SynTree) -> bool: # * com: keep lemma, pt, rel # * literal: word=node.@word.lower(), pt, rel -def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, axis=None ) -> List[SynTree]: + +def transformtree(stree: SynTree, annotations: List[Annotation], mwetop=notop, axis=None) -> List[SynTree]: # noqa: C901 # it is presupposed that with zullen + vc the subject index node of the vc has already been expanded # it is presupposed that the function is called with node SynTree at the top if stree.tag != 'node': @@ -389,7 +415,6 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax if not terminal(stree): cat = gav(stree, 'cat') rel = gav(stree, 'rel') - id = gav(stree, 'id') if cat == 'top' and len(stree) > 1: newnode = mincopynode(stree) @@ -404,15 +429,16 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax head = getchild(stree, 'hd') lemma = gav(head, 'lemma') vc = getchild(stree, 'vc') - ## predm, if present, must be moved downwards here + # predm, if present, must be moved downwards here newstree = lowerpredm(stree) - #print('newstree') - #ET.dump(newstree) + # print('newstree') + # ET.dump(newstree) if lemma == 'zullen' and vc is not None: subject = find1(newstree, './node[@rel="su"]') newvc = getchild(newstree, 'vc') newvc = expandsu(newvc, subject) - results = transformtree(newvc, annotations, mwetop= itop, axis=axis) + results = transformtree( + newvc, annotations, mwetop=itop, axis=axis) return results elif mwetop == itop: newnode = ET.Element('node') @@ -451,7 +477,7 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax newnodes.append(newnode) return newnodes elif all_leaves(stree, annotations, {zero}): - newnode = None # remove it + newnode = None # remove it if axis is not None: newnode.attrib['axis'] = axis newnodes.append(newnode) @@ -468,7 +494,6 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax newnode.attrib['rel'] = 'pc|ld|mod|predc|svp|predm' newnodes.append(newnode) - newchilds = [] newchildalternativeslist = [] for child in stree: childaxis = None @@ -479,23 +504,26 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax childaxis = 'descendant' else: newmwetop = notop - newchildalternatives = transformtree(child, annotations, mwetop=newmwetop, axis=childaxis) + newchildalternatives = transformtree( + child, annotations, mwetop=newmwetop, axis=childaxis) newchildalternativeslist.append(newchildalternatives) - #list of alternative childs -> alternatives of childlists - newchildlistalternatives = listofsets2setoflists(newchildalternativeslist) + # list of alternative childs -> alternatives of childlists + newchildlistalternatives = listofsets2setoflists( + newchildalternativeslist) results = [] for newnode in newnodes: if newnode is not None: for newchildlist in newchildlistalternatives: - newnodecopy = nodecopy(newnode) # we must make a new copy to obtain a new tree + # we must make a new copy to obtain a new tree + newnodecopy = nodecopy(newnode) for newchild in newchildlist: if newchild is not None: if DEBUG: print('\nnewchild:') ET.dump(newchild) - #we must make a copy of the child because each Element has only one parent + # we must make a copy of the child because each Element has only one parent newchildcopy = copy.copy(newchild) newnodecopy.append(newchildcopy) if DEBUG: @@ -514,38 +542,49 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax lcword = gav(stree, 'word').lower() pt = gav(stree, 'pt') rel = gav(stree, 'rel') - if not(0 <= beginint < len(annotations)): - print(f'Index out of range: {beginint} in {annotations}', file=sys.stderr) - #we simply skip this node - #newnode = None + if not (0 <= beginint < len(annotations)): + print( + f'Index out of range: {beginint} in {annotations}', file=sys.stderr) + # we simply skip this node + # newnode = None else: - if annotations[beginint] == zero: #maybe something special if it concerns a head + # maybe something special if it concerns a head + if annotations[beginint] == zero: newnode = None results.append(newnode) elif annotations[beginint] == literal: - newnode = attcopy(stree, ['word', 'rel', 'pt'] + subcatproperties + inflproperties) + newnode = attcopy( + stree, ['word', 'rel', 'pt'] + subcatproperties + inflproperties) results.append(newnode) elif annotations[beginint] in {inflectable, modandinfl, unmodandinfl}: - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties) results.append(newnode) - elif annotations[beginint] in {noann} and (mwetop!=parenttop or rel != 'hd'): - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) + elif annotations[beginint] in {noann} and (mwetop != parenttop or rel != 'hd'): + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) results.append(newnode) elif annotations[beginint] in {noann, unmodifiable} and mwetop == parenttop and rel == 'hd': - selectedinherentinflproperties = selectinherentproperties(stree) - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties + selectedinherentinflproperties) + selectedinherentinflproperties = selectinherentproperties( + stree) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties + selectedinherentinflproperties) results.append(newnode) elif annotations[beginint] in {bound} and lcword == 'zijn' and pt == 'ww' and (mwetop != parenttop or rel != 'hd'): - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) results.append(newnode) elif annotations[beginint] in {bound} and lcword == 'zijn' and pt == 'ww' and mwetop == parenttop and rel == 'hd': - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties) results.append(newnode) elif annotations[beginint] in {com}: - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties) results.append(newnode) elif annotations[beginint] in {modifiable, unmodifiable}: - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties + inherentinflproperties) results.append(newnode) elif annotations[beginint] == variable: newnode = attcopy(stree, ['rel']) @@ -559,7 +598,6 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax newnode = attcopy(stree, ['rel', 'pt'] + subcatproperties) lemma = gav(stree, 'lemma') pt = gav(stree, 'pt') - id = gav(stree, 'id') vwtype = gav(stree, 'vwtype') if lemma == 'zich': newnode.attrib['lemma'] = alts(zichlemmas) @@ -567,7 +605,7 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax elif lemma == 'zichzelf': newnode.attrib['lemma'] = alts(zichzelflemmas) newnode.attrib['vwtype'] = 'refl|pr' - elif lemma == 'zijn' and pt == 'vnw' and vwtype == 'bez': # we do not want to include the verb zijn here + elif lemma == 'zijn' and pt == 'vnw' and vwtype == 'bez': # we do not want to include the verb zijn here newnode.attrib['lemma'] = alts(zijnlemmas) results.append(newnode) elif annotations[beginint] == dd: @@ -581,8 +619,10 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax newnode.attrib['pt'] = 'vnw' results.append(newnode) else: - print(f'Unrecognized annotation: {annotations[beginint]}', file=sys.stderr) - newnode = attcopy(stree, ['lemma', 'rel', 'pt'] +subcatproperties + inflproperties) + print( + f'Unrecognized annotation: {annotations[beginint]}', file=sys.stderr) + newnode = attcopy( + stree, ['lemma', 'rel', 'pt'] + subcatproperties + inflproperties) results.append(newnode) if DEBUG: @@ -594,12 +634,14 @@ def transformtree(stree: SynTree, annotations:List[Annotation], mwetop=notop, ax ET.dump(result) return results + def isvblnode(node: SynTree) -> bool: - result = len(node) == 0 and 'word' not in node.attrib and 'pt' not in node.attrib + result = len( + node) == 0 and 'word' not in node.attrib and 'pt' not in node.attrib return result -def expandsu(vc: SynTree, subject:SynTree) -> SynTree: +def expandsu(vc: SynTree, subject: SynTree) -> SynTree: ''' The function *expandsu* creates a copy of *vc* in which the subject (su or sup) of *vc* has been replaced by *subject*, unless this subject is a variable subject @@ -632,6 +674,7 @@ def adaptvzlemma(lemma: str) -> str: result = lemma return result + def getpronadv(lemma, rel, rprons={}): newnode = mknode() newlemma = adaptvzlemma(lemma) @@ -661,7 +704,8 @@ def makepobj1vc(stree, obj1nodeid): results += newresults return results -def makevanPP(stree, gennodeid ): + +def makevanPP(stree, gennodeid): results = [] newstree = copy.deepcopy(stree) gennode = find1(newstree, f'.//node[@id="{str(gennodeid)}"]') @@ -669,8 +713,10 @@ def makevanPP(stree, gennodeid ): parent.remove(gennode) headnodegenus = find1(parent, './node[@rel="hd"]/@genus') headnodegetal = find1(parent, './node[@rel="hd"]/@getal') - lw = copy.copy(het_lw) if headnodegenus == 'onz' and headnodegetal == 'ev' else copy.copy(de_lw) - vanpp = ET.Element('node', attrib={'cat': 'pp', 'rel': 'mod', 'nodecount':'2'}) + lw = copy.copy( + het_lw) if headnodegenus == 'onz' and headnodegetal == 'ev' else copy.copy(de_lw) + vanpp = ET.Element( + 'node', attrib={'cat': 'pp', 'rel': 'mod', 'nodecount': '2'}) van_vzcopy = copy.copy(van_vz) gennodecopy = attcopy(gennode, ['index', 'id']) gennodecopy.attrib['rel'] = 'obj1' @@ -685,15 +731,17 @@ def makevanPP(stree, gennodeid ): results += newresults return results + def makenpzijn(stree, gennodeid): results = [] newstree = copy.deepcopy(stree) gennode = find1(newstree, f'.//node[@id="{str(gennodeid)}"]') parent = gennode.getparent() parent.remove(gennode) - detp = ET.Element('node', attrib={'rel': 'det', 'cat':'detp'}) - vbl = ET.Element('node', attrib={'rel':'mod'}) - bezvnw = ET.Element('node', attrib={'rel': 'hd', 'lemma': 'zijn|haar|hun', 'pt':'vnw', 'vwtype':'bez'}) + detp = ET.Element('node', attrib={'rel': 'det', 'cat': 'detp'}) + vbl = ET.Element('node', attrib={'rel': 'mod'}) + bezvnw = ET.Element('node', attrib={ + 'rel': 'hd', 'lemma': 'zijn|haar|hun', 'pt': 'vnw', 'vwtype': 'bez'}) detp.append(vbl) detp.append(bezvnw) parent.append(detp) @@ -702,16 +750,16 @@ def makenpzijn(stree, gennodeid): results += newresults return results + def mkpronadvvc(stree, ppnodeid): results = [] newstree = copy.deepcopy(stree) ppnode = find1(newstree, f'.//node[@id="{str(ppnodeid)}"]') vzlemma = find1(ppnode, './/node[@rel="hd"]/@lemma') - pprel = gav(ppnode, 'rel') headnode = find1(ppnode, './node[@rel="hd"]') obj1node = find1(ppnode, './node[@rel="obj1"] ') if obj1node is not None and headnode is not None and vzlemma is not None: - pronadvnode = getpronadv(vzlemma, 'hd' , rprons={'er'}) + pronadvnode = getpronadv(vzlemma, 'hd', rprons={'er'}) newvcnode = nodecopy(vcnode) # print('ppnode:') # ET.dump(ppnode) @@ -724,6 +772,7 @@ def mkpronadvvc(stree, ppnodeid): results += newresults return results + def makepronadv(stree, ppnodeid): results = [] newstree = copy.deepcopy(stree) @@ -740,6 +789,7 @@ def makepronadv(stree, ppnodeid): results += newresults return results + def mkextraobcomp(stree, obcompphraseid): results = [] newstree = copy.deepcopy(stree) @@ -749,7 +799,7 @@ def mkextraobcomp(stree, obcompphraseid): streeheadpt = gav(streehead, 'pt') newtopnode = ET.Element('node') obcompphrase.remove(obcomp) - #ET.dump(obcompphrase) + # ET.dump(obcompphrase) obcomphead = find1(obcomp, './node[@rel="cmp"]') if obcomphead is not None and obcomphead.attrib['lemma'] == 'als' and obcomphead.attrib['pt'] == 'vg': obcomphead.attrib['pt'] = 'vz' @@ -761,7 +811,7 @@ def mkextraobcomp(stree, obcompphraseid): thechild = ocpchilds[0] thechild.attrib['rel'] = gav(obcompphrase, 'rel') newobcompphrase = thechild - #ET.dump(newobcompphrase) + # ET.dump(newobcompphrase) else: newobcompphrase = obcompphrase obcomp.attrib['rel'] = 'predm|mod' @@ -794,12 +844,12 @@ def makeppnp(stree, npmodppid): npnode.remove(ppnode) if 'nodecount' in npnode.attrib: npnode.attrib['nodecount'] = str(len(npnode)) - #ET.dump(newstree) + # ET.dump(newstree) npparent = npnode.getparent() npparent.append(newppnode) if 'nodecount' in npparent.attrib: npparent.attrib['nodecount'] = str(len(npparent)) - #ET.dump(newstree) + # ET.dump(newstree) newresults = genvariants(newstree) results.append(newstree) results += newresults @@ -811,7 +861,7 @@ def makesubjectlessimperatives(stree, nodeid): newstree = copy.deepcopy(stree) impnode = newstree if newstree.attrib['id'] == nodeid else None subject = find1(impnode, f'./node[@rel="su" and {vblnode} ]') - head = find1(impnode, './node[@rel="hd" and @pt="ww"]' ) + head = find1(impnode, './node[@rel="hd" and @pt="ww"]') if impnode is not None and subject is not None: subject.attrib['presence'] = 'no' impnode.attrib['cat'] = 'sv1' @@ -823,6 +873,7 @@ def makesubjectlessimperatives(stree, nodeid): results += newresults return results + def mkalternativesnode(altlists: List[List[SynTree]]) -> SynTree: altnodes = [mkalternativenode(altlist) for altlist in altlists] alternativesnode = ET.Element('alternatives') @@ -830,6 +881,7 @@ def mkalternativesnode(altlists: List[List[SynTree]]) -> SynTree: alternativesnode.append(altnode) return alternativesnode + def mkalternativenode(altlist: List[SynTree]) -> SynTree: alternativenode = ET.Element('alternative') for alt in altlist: @@ -838,25 +890,26 @@ def mkalternativenode(altlist: List[SynTree]) -> SynTree: def lowerpredm(stree: SynTree) -> SynTree: - #print('lowerpredm: stree:') - #ET.dump(stree) + # print('lowerpredm: stree:') + # ET.dump(stree) predmnodeids = stree.xpath('.//node[@rel="predm"]/@id') lowestvcnode = find1(stree, './/node[@rel="vc" and not(node[@rel="vc"])]') if predmnodeids != [] and lowestvcnode is not None: newstree = copy.deepcopy(stree) - lowestvcnode = find1(newstree, './/node[@rel="vc" and not(node[@rel="vc"])]') + lowestvcnode = find1( + newstree, './/node[@rel="vc" and not(node[@rel="vc"])]') for predmnodeid in predmnodeids: predmnode = find1(newstree, f'.//node[@id="{predmnodeid}"]') predmparent = predmnode.getparent() predmparent.remove(predmnode) lowestvcnode.append(predmnode) - #print('lowerpredm: newstree') - #ET.dump(newstree) + # print('lowerpredm: newstree') + # ET.dump(newstree) return newstree else: return stree - ## genvariants2, different strategy, less multiplication + # genvariants2, different strategy, less multiplication # 1. basic mwe structure, include predm, include subject # 2. remove open slot subject (covers imperatives, topic drop, passives (in indexexpanded trees) # 3. np[ ..pp] -> np pp @@ -867,23 +920,25 @@ def lowerpredm(stree: SynTree) -> SynTree: # c. gennodes # d. iemands + def newgenvariants(stree: SynTree) -> List[SynTree]: results = [] newstree = copy.deepcopy(stree) - #remove open slot subject - vblsu = find1(newstree, f'.//node[@rel="su" and {vblnode}]') #maybe we should delete not all vbl subjects? //-> / + # remove open slot subject + # maybe we should delete not all vbl subjects? //-> / + vblsu = find1(newstree, f'.//node[@rel="su" and {vblnode}]') if vblsu is not None: parent = vblsu.getparent() parent.remove(vblsu) - #move predm down not needed already done in transformtree - #newstree = lowerpredm(newstree) + # move predm down not needed already done in transformtree + # newstree = lowerpredm(newstree) - #Global changes + # Global changes globalresults = [] - ## np[n mod/pp] -> np pc|mod/pp + # np[n mod/pp] -> np pc|mod/pp npmodppid = find1(stree, npmodppidxpath) - if npmodppid is not None : + if npmodppid is not None: ppnpresults = makeppnp(stree, npmodppid) globalresults += ppnpresults @@ -906,7 +961,8 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: newvcnode2 = nodecopy(vcnode) parent = obj1node.getparent() parent.remove(obj1node) - alternativesnode = mkalternativesnode([[obj1node], [newvcnode1], [newpobj1node, newvcnode2]]) + alternativesnode = mkalternativesnode( + [[obj1node], [newvcnode1], [newpobj1node, newvcnode2]]) parent.append(alternativesnode) vblppnodeids = globalresult.xpath(vblppnodeidxpath) @@ -922,11 +978,11 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: vz = find1(ppnode, './node[@rel="hd" and @pt="vz"]') newvz1 = copy.copy(vz) pppobj1vcnode = newppnode1 - children = [newvz1, newpobj1node1, newvcnode1] + children = [newvz1, newpobj1node1, newvcnode1] for child in children: pppobj1vcnode.append(child) - #pp with R-pronoun object + # pp with R-pronoun object newppnode2 = copy.copy(ppnode) newvz2 = copy.copy(vz) newvz2.attrib['vztype'] = 'fin' @@ -939,7 +995,7 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: newppnode2.append(Rpronounobj1node) newppnode2.append(newvz2) - #pp with R-pronoun object which has been replaced by a full NO with a dummymod + # pp with R-pronoun object which has been replaced by a full NO with a dummymod newppnode3 = copy.copy(ppnode) newvz3 = copy.copy(vz) newvz3.attrib['vztype'] = 'fin' @@ -957,7 +1013,7 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: pppronadvvcnode.remove(child) if vz is not None: vzlemma = gav(vz, 'lemma') - if vz is not None and vzlemma!= '': + if vz is not None and vzlemma != '': pronadvnode1 = getpronadv(vzlemma, 'hd', rprons={'er'}) newvcnode = nodecopy(vcnode) # print('ppnode:') @@ -968,17 +1024,20 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: pprel = gav(ppnode, 'rel') pronadvnode = getpronadv(vzlemma, pprel) - alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3],[pppobj1vcnode], [pppronadvvcnode], [pronadvnode]]) + alternativesnode = mkalternativesnode([[ppnode], [newppnode2], [newppnode3], [ + pppobj1vcnode], [pppronadvvcnode], [pronadvnode]]) parent.append(alternativesnode) - vblgennpnodeids = newstree.xpath(f'//node[@cat="np" and node[@naamval="gen" and @rel="det" and {vblnode}]]/@id') + vblgennpnodeids = newstree.xpath( + f'//node[@cat="np" and node[@naamval="gen" and @rel="det" and {vblnode}]]/@id') for vblgennpnodeid in vblgennpnodeids: npnode = find1(newstree, f'//node[@id="{vblgennpnodeid}"]') detnode = find1(npnode, './node[@rel="det"]') # NP zijn etc detp = ET.Element('node', attrib={'rel': 'det', 'cat': 'detp'}) vbl = ET.Element('node', attrib={'rel': 'mod'}) - bezvnw = ET.Element('node', attrib={'rel': 'hd', 'lemma': 'zijn|haar|hun', 'pt': 'vnw', 'vwtype': 'bez'}) + bezvnw = ET.Element('node', attrib={ + 'rel': 'hd', 'lemma': 'zijn|haar|hun', 'pt': 'vnw', 'vwtype': 'bez'}) detp.append(vbl) detp.append(bezvnw) npnode.remove(detnode) @@ -986,18 +1045,21 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: # de ... van X headnodegenus = find1(npnode, './node[@rel="hd"]/@genus') headnodegetal = find1(npnode, './node[@rel="hd"]/@getal') - lwnode = copy.copy(het_lw) if headnodegenus == 'onz' and headnodegetal == 'ev' else copy.copy(de_lw) - vanpp = ET.Element('node', attrib={'cat': 'pp', 'rel': 'mod', 'nodecount': '2'}) + lwnode = copy.copy( + het_lw) if headnodegenus == 'onz' and headnodegetal == 'ev' else copy.copy(de_lw) + vanpp = ET.Element( + 'node', attrib={'cat': 'pp', 'rel': 'mod', 'nodecount': '2'}) van_vzcopy = copy.copy(van_vz) gennodecopy = attcopy(detnode, ['index', 'id']) gennodecopy.attrib['rel'] = 'obj1' vanpp.append(van_vzcopy) vanpp.append(gennodecopy) - #Jans, tantes + # Jans, tantes gendetnode = attcopy(detnode, ['index', 'id', 'naamval', 'rel']) - alternativesnode = mkalternativesnode([[gendetnode], [detp], [lwnode, vanpp]]) + alternativesnode = mkalternativesnode( + [[gendetnode], [detp], [lwnode, vanpp]]) npnode.append(alternativesnode) localresults.append(newstree) @@ -1006,57 +1068,61 @@ def newgenvariants(stree: SynTree) -> List[SynTree]: return results - def genvariants(stree: SynTree) -> List[SynTree]: results = [] - #print('-->genvariants:') - #ET.dump(stree) + # print('-->genvariants:') + # ET.dump(stree) npmodppidxpath = \ - f""".//node[@cat="np" and - node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and + f""".//node[@cat="np" and + node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and ../node[@rel="hd" and @pt="ww"]]/@id""" npmodppid = find1(stree, npmodppidxpath) obcompphraseid = find1(stree, './/node[node[@rel="obcomp"]]/@id') - ## np[n mod/pp] -> np pc|mod/pp + # np[n mod/pp] -> np pc|mod/pp if npmodppid is not None: ppnpresults = makeppnp(stree, npmodppid) results += ppnpresults - ## [zo .. obcomp/X] -> [[zo ..] mod/X] zo vrij als een vogel -> zo vrij [is] als een vogel + # [zo .. obcomp/X] -> [[zo ..] mod/X] zo vrij als een vogel -> zo vrij [is] als een vogel if obcompphraseid is not None: obcompresults = mkextraobcomp(stree, obcompphraseid) results += obcompresults - #print('<--genvariants') + # print('<--genvariants') return results + def oldgenvariants(stree: SynTree) -> List[SynTree]: results = [] - #print('-->genvariants:') - #ET.dump(stree) - catsv1 = lambda stree: gav(stree, 'cat') == 'sv1' + # print('-->genvariants:') + # ET.dump(stree) + def catsv1(stree): return gav(stree, 'cat') == 'sv1' obj1nodeid = find1(stree, f'.//node[@rel="obj1" and {vblnode} ]/@id') ppnodeidxpath = f'.//node[@cat="pp" and node[@rel="hd"] and node[@rel="obj1" and {vblnode}] and count(node) =2]/@id' ppnodeid = find1(stree, ppnodeidxpath) - gennodeid = find1(stree, f'.//node[@naamval="gen" and count(node)=0 and not(@lemma) and not(@cat)]/@id') + gennodeid = find1( + stree, './/node[@naamval="gen" and count(node)=0 and not(@lemma) and not(@cat)]/@id') npmodppidxpath = \ - f""".//node[@cat="np" and - node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and + f""".//node[@cat="np" and + node[@rel="mod" and @cat="pp" and node[{vblnode}] and not(node[@rel="pobj1"]) and not(node[@rel="vc"])] and ../node[@rel="hd" and @pt="ww"]]/@id""" npmodppid = find1(stree, npmodppidxpath) - hasvblsu = lambda stree: find1(stree, f'./node[@rel="su" and {vblnode}]') is not None - hasverbalhead = lambda stree: find1(stree, './node[@rel="hd" and @pt="ww"]') is not None + + def hasvblsu(stree): return find1( + stree, f'./node[@rel="su" and {vblnode}]') is not None + def hasverbalhead(stree): return find1( + stree, './node[@rel="hd" and @pt="ww"]') is not None if hasverbalhead(stree) and hasvblsu(stree) and gav(stree, 'cat') != 'sv1': potentialimperativenodeid = stree.attrib['id'] else: potentialimperativenodeid = None - #potimpxpath = f'.//node[@cat="{alts(clausebodycats)}" and node[@rel="su" and {vblnode}]]/@id' - #potentialimperativenodeid = find1(stree, potimpxpath) + # potimpxpath = f'.//node[@cat="{alts(clausebodycats)}" and node[@rel="su" and {vblnode}]]/@id' + # potentialimperativenodeid = find1(stree, potimpxpath) # pp[ vz obj1] -> pp[vz pobj1 vc (op iets -> er op dat....) # [ ..ww ... obj1 ] -> [ - if obj1nodeid is not None and not catsv1(stree): + if obj1nodeid is not None and not catsv1(stree): rvcresults = makepobj1vc(stree, obj1nodeid) results += rvcresults # pp[ vz obj1] -> bw (pronominal adverb) naar iets -> ernaar/daarnaar etc @@ -1074,18 +1140,19 @@ def oldgenvariants(stree: SynTree) -> List[SynTree]: zijnnpresults = makenpzijn(stree, gennodeid) results += zijnnpresults - ## np[n mod/pp] -> np pc|mod/pp + # np[n mod/pp] -> np pc|mod/pp if npmodppid is not None and not catsv1(stree): ppnpresults = makeppnp(stree, npmodppid) results += ppnpresults - ## @@TODO: personal passives - ## @@TODO: impersonal passives - ## subjectless imperatives + # @@TODO: personal passives + # @@TODO: impersonal passives + # subjectless imperatives if potentialimperativenodeid is not None: - subjectlessimperatives = makesubjectlessimperatives(stree, potentialimperativenodeid) + subjectlessimperatives = makesubjectlessimperatives( + stree, potentialimperativenodeid) results += subjectlessimperatives - #print('<--genvariants') + # print('<--genvariants') return results @@ -1104,13 +1171,13 @@ def trees2xpath(strees: List[SynTree], expanded=False) -> str: def removesuperfluousindexes(stree: SynTree) -> SynTree: - #ET.dump(stree) + # ET.dump(stree) basicindexednodesmap = getbasicindexednodesmap(stree) # for ind, tree in basicindexednodesmap.items(): # print(ind) # ET.dump(tree) indexnodesmap = getindexednodesmap(basicindexednodesmap) - #for ind, tree in indexnodesmap.items(): + # for ind, tree in indexnodesmap.items(): # print(ind) # ET.dump(tree) newstree = copy.deepcopy(stree) @@ -1120,7 +1187,7 @@ def removesuperfluousindexes(stree: SynTree) -> SynTree: return newstree -def tree2xpath(stree:SynTree, indent=0) -> str: +def tree2xpath(stree: SynTree, indent=0) -> str: indentstr = indent * space childxpaths = [tree2xpath(child, indent+5) for child in stree] attconditions = [] @@ -1173,7 +1240,8 @@ def tree2xpath(stree:SynTree, indent=0) -> str: val = stree.attrib[att] attcondition = f'{attstr}{opstr}"{val}"' else: - orconditionlist = [f'{attstr}{opstr}"{val}"' for val in vals] + orconditionlist = [ + f'{attstr}{opstr}"{val}"' for val in vals] attcondition = f'({" or ".join(orconditionlist)})' attconditions.append(attcondition) @@ -1202,35 +1270,40 @@ def tree2xpath(stree:SynTree, indent=0) -> str: else: polresult = baseresult - result = f'\n{indentstr}{polresult}' elif stree.tag == 'alternatives': - result = f'\n{indentstr}(' + f' or '.join(childxpaths) + f'\n{indentstr})' + result = f'\n{indentstr}(' + \ + ' or '.join(childxpaths) + f'\n{indentstr})' elif stree.tag == 'alternative': - result = f'\n{indentstr}(' + f' and '.join(childxpaths) + f'\n{indentstr})' + result = f'\n{indentstr}(' + \ + ' and '.join(childxpaths) + f'\n{indentstr})' else: result = stree.tag - #message that an illegal structure has been encountered + # message that an illegal structure has been encountered return result + def adaptindexes(stree: SynTree, antecedent: SynTree, rhdnode: SynTree) -> SynTree: antecedentindex = gav(antecedent, 'index') rhdindex = gav(rhdnode, 'index') if antecedentindex != '': for node in stree.iter(): - nodeindex= gav(node, 'index') + nodeindex = gav(node, 'index') if nodeindex == rhdindex: node.attrib['index'] = antecedentindex -def mkpp(rel: str, vz: str, obj1node:SynTree, begin, end, index, az=None,) -> SynTree: - ppnode = ET.Element('node', attrib={'cat':'pp', 'rel':rel, 'index': index}) + +def mkpp(rel: str, vz: str, obj1node: SynTree, begin, end, index, az=None,) -> SynTree: + ppnode = ET.Element( + 'node', attrib={'cat': 'pp', 'rel': rel, 'index': index}) prepnode = ET.Element('node', attrib={'pt': 'vz', 'lemma': vz, 'word': vz, 'rel': 'hd', 'begin': begin, 'end': end, 'vztype': 'init'}) - aznode = ET.Element('node', attrib={'pt': 'vz', 'lemma': az, 'word': az, 'rel': 'hdf'}) if az is not None else None + aznode = ET.Element('node', attrib={ + 'pt': 'vz', 'lemma': az, 'word': az, 'rel': 'hdf'}) if az is not None else None newobj1node = copy.deepcopy(obj1node) newobj1node.attrib['rel'] = 'obj1' ppnode.append(prepnode) @@ -1239,7 +1312,8 @@ def mkpp(rel: str, vz: str, obj1node:SynTree, begin, end, index, az=None,) -> S ppnode.append(aznode) return ppnode -def adaptvzlemma(inlemma: str) -> str: + +def adaptvzlemma_inv(inlemma: str) -> str: if inlemma == 'mee': result = 'met' elif inlemma == 'toe': @@ -1248,14 +1322,17 @@ def adaptvzlemma(inlemma: str) -> str: result = inlemma return result + def relpronsubst(stree: SynTree) -> SynTree: newstree = copy.deepcopy(stree) - npwithrelnodeids = stree.xpath('.//node[@cat="np" and node[@rel="mod" and @cat="rel"]]/@id') + npwithrelnodeids = stree.xpath( + './/node[@cat="np" and node[@rel="mod" and @cat="rel"]]/@id') for npwithrelnodeid in npwithrelnodeids: npnode = find1(newstree, f'.//node[@id="{npwithrelnodeid}"]') if npnode is not None: - relnodeid = find1(npnode, f'./node[@rel="mod" and @cat="rel"]/@id') - rhdnode = find1(npnode, './node[@rel="mod" and @cat="rel"]/node[@rel="rhd"]') + relnodeid = find1(npnode, './node[@rel="mod" and @cat="rel"]/@id') + rhdnode = find1( + npnode, './node[@rel="mod" and @cat="rel"]/node[@rel="rhd"]') rhdpt = gav(rhdnode, 'pt') rhdframe = gav(rhdnode, 'frame') antecedent = copy.deepcopy(npnode) @@ -1263,7 +1340,7 @@ def relpronsubst(stree: SynTree) -> SynTree: antecedent.remove(relinantecedent) antecedent.append(dummymod) antecedent.attrib['rel'] = 'rhd' - #adaptindexes(newstree, antecedent, rhdnode) # the antecedent may have its own index yes, + # adaptindexes(newstree, antecedent, rhdnode) # the antecedent may have its own index yes, # but DO NOT do this, or you will have multiple incompatible antecedents relnode = find1(npnode, f'./node[@id="{relnodeid}"]') @@ -1272,12 +1349,14 @@ def relpronsubst(stree: SynTree) -> SynTree: antecedent.attrib['index'] = rhdindex relnode.remove(rhdnode) relnode.insert(0, antecedent) - #adapt the governing adposition if there is one - govprep = find1(newstree, f'.//node[@pt="vz" and @rel="hd" and ../node[@index="{rhdindex}"]]') + # adapt the governing adposition if there is one + govprep = find1( + newstree, f'.//node[@pt="vz" and @rel="hd" and ../node[@index="{rhdindex}"]]') if govprep is not None: govprep.attrib['vztype'] = 'init' - govprep.attrib['lemma'] = adaptvzlemma(govprep.attrib['lemma']) - #ET.dump(newstree) + govprep.attrib['lemma'] = adaptvzlemma_inv( + govprep.attrib['lemma']) + # ET.dump(newstree) elif rhdframe.startswith('waar_adverb'): index = gav(rhdnode, 'index') @@ -1287,7 +1366,7 @@ def relpronsubst(stree: SynTree) -> SynTree: else: vz = prep az = None - b, e = gav(rhdnode, 'begin'), gav(rhdnode,'end') + b, e = gav(rhdnode, 'begin'), gav(rhdnode, 'end') ppnode = mkpp('rhd', vz, antecedent, b, e, index, az=az) ppnode.attrib['rel'] = 'rhd' relnode.remove(rhdnode) @@ -1297,7 +1376,7 @@ def relpronsubst(stree: SynTree) -> SynTree: def expandfull(stree: SynTree) -> SynTree: - #possibly add getlcat + # possibly add getlcat stree1 = relpronsubst(stree) stree2 = indextransform(stree1) return stree2 @@ -1309,11 +1388,13 @@ def gettopnode(stree): return child return None + def iscontentwordnode(node: SynTree) -> bool: nodept = gav(node, 'pt') result = nodept in contentwordpts return result + def removeemptyalts(stree: SynTree) -> SynTree: newstree = copy.deepcopy(stree) for node in newstree.iter(): @@ -1321,16 +1402,17 @@ def removeemptyalts(stree: SynTree) -> SynTree: node.getparent().remove(node) return newstree + def mknearmiss(mwetrees: List[SynTree]) -> Xpathexpression: reducedmwetrees = [] for mwetree in mwetrees: reducedmwetree = copy.deepcopy(mwetree) - nodelist = list(reducedmwetree.iter()) # turn it into a list to make sure it has been computed - contentwordnodes = [node for node in nodelist if iscontentwordnode(node)] + # turn it into a list to make sure it has been computed + nodelist = list(reducedmwetree.iter()) + contentwordnodes = [ + node for node in nodelist if iscontentwordnode(node)] contentwordcount = len(contentwordnodes) for node in nodelist: - id = gav(node, 'id') - nodept = gav(node, 'pt') if 'pt' in node.attrib and not iscontentwordnode(node) and contentwordcount > 1: parent = node.getparent() parent.remove(node) @@ -1344,25 +1426,29 @@ def mknearmiss(mwetrees: List[SynTree]) -> Xpathexpression: del node.attrib[att] cleanreducedmwetree = removeemptyalts(reducedmwetree) reducedmwetrees.append(cleanreducedmwetree) - #for reducedmwetree in reducedmwetrees: + # for reducedmwetree in reducedmwetrees: # ET.dump(reducedmwetree) result = trees2xpath(reducedmwetrees) return result + def mksuperquery(mwetrees) -> Xpathexpression: if mwetrees == []: result = '' else: mwetree = mwetrees[0] # we only have to look at the first tree wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib] - contentwordnodes = [node for node in mwetree.iter() if iscontentwordnode(node)] - contentwordnodes = contentwordnodes if len(contentwordnodes) > 1 else wordnodes + contentwordnodes = [ + node for node in mwetree.iter() if iscontentwordnode(node)] + contentwordnodes = contentwordnodes if len( + contentwordnodes) > 1 else wordnodes newmwetree = ET.Element('node', attrib={'cat': 'top'}) for contentwordnode in contentwordnodes: cwlemma = gav(contentwordnode, 'lemma') cwpt = gav(contentwordnode, 'pt') - newcontentwordnode = ET.Element('node' , attrib={'lemma':cwlemma, 'pt':cwpt, 'axis': 'descendant'}) + newcontentwordnode = ET.Element( + 'node', attrib={'lemma': cwlemma, 'pt': cwpt, 'axis': 'descendant'}) newmwetree.append(newcontentwordnode) result = tree2xpath(newmwetree) @@ -1373,19 +1459,18 @@ def mksuperquery(mwetrees) -> Xpathexpression: return result - def generatequeries(mwe: str, lcatexpansion=True) -> (Xpathexpression, Xpathexpression, Xpathexpression): annotatedlist = preprocess_MWE(mwe) annotations = [el[1] for el in annotatedlist] cleanmwe = space.join([el[0] for el in annotatedlist]) - #parse the utterance + # parse the utterance unexpandedfullmweparse = parse(cleanmwe) if lcatexpansion: fullmweparse = expandnonheadwords(unexpandedfullmweparse) else: fullmweparse = unexpandedfullmweparse - #ET.dump(fullmweparse) + # ET.dump(fullmweparse) mweparse = gettopnode(fullmweparse) newtreesa = transformtree(mweparse, annotations) newtrees = [] @@ -1394,20 +1479,22 @@ def generatequeries(mwe: str, lcatexpansion=True) -> (Xpathexpression, Xpathexpr cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees] mwequery = trees2xpath(cleantrees, expanded=True) - #nearmissquery + # nearmissquery nearmissquery = mknearmiss(cleantrees) - #supersetquery + # supersetquery supersetquery = mksuperquery(newtreesa) return mwequery, nearmissquery, supersetquery + def selfapplyqueries(utt, mwequery, nearmissquery, supersetquery, lcatexpansion=True): unexpandedfullparse = parse(utt) unexpandedfullparse = lowerpredm(unexpandedfullparse) - #ET.dump(unexpandedfullparse) + # ET.dump(unexpandedfullparse) - supersetnodes = unexpandedfullparse.xpath(supersetquery) # in the real application this should be done on the treebank's index + # in the real application this should be done on the treebank's index + supersetnodes = unexpandedfullparse.xpath(supersetquery) nearmissnodes = [] mwenodes = [] @@ -1416,18 +1503,18 @@ def selfapplyqueries(utt, mwequery, nearmissquery, supersetquery, lcatexpansion= fullparse = expandnonheadwords(supersetnode) else: fullparse = supersetnode - #ET.dump(fullparse) + # ET.dump(fullparse) indexpfullparse = indextransform(fullparse) - #ET.dump(indexpfullparse) + # ET.dump(indexpfullparse) nearmissnodes += indexpfullparse.xpath(nearmissquery) mwenodes += indexpfullparse.xpath(mwequery) return (mwenodes, nearmissnodes, supersetnodes) + def markutt(utt: str, nodes: List[SynTree]) -> str: - results = [] tokens = utt.split() if nodes == []: result = utt @@ -1435,24 +1522,25 @@ def markutt(utt: str, nodes: List[SynTree]) -> str: node = nodes[0] nodeyield = getnodeyield(node) markbegins = [int(gav(node, 'begin')) for node in nodeyield] - markedutttokens = [mark(token) if i in markbegins else token for i, token in enumerate(tokens)] + markedutttokens = [ + mark(token) if i in markbegins else token for i, token in enumerate(tokens)] result = space.join(markedutttokens) return result + def mark(wrd: str) -> str: return f'*{wrd}*' - - def applyqueries(treebank, mwe, mwequery, nearmissquery, supersetquery, lcatexpansion=True): allresults = {} for treeid, tree in treebank.items(): allresults[treeid] = [] unexpandedfullparse = lowerpredm(tree) - #ET.dump(unexpandedfullparse) + # ET.dump(unexpandedfullparse) - supersetnodes = unexpandedfullparse.xpath(supersetquery) # in the real application this should be done on the treebank's index + # in the real application this should be done on the treebank's index + supersetnodes = unexpandedfullparse.xpath(supersetquery) nearmissnodes = [] mwenodes = [] @@ -1461,16 +1549,17 @@ def applyqueries(treebank, mwe, mwequery, nearmissquery, supersetquery, lcatexpa fullparse = expandnonheadwords(supersetnode) else: fullparse = supersetnode - #ET.dump(fullparse) + # ET.dump(fullparse) indexpfullparse = indextransform(fullparse) - #ET.dump(indexpfullparse) + # ET.dump(indexpfullparse) nearmissnodes += indexpfullparse.xpath(nearmissquery) mwenodes += indexpfullparse.xpath(mwequery) if mwenodes != []: - allresults[treeid].append((mwenodes, nearmissnodes, supersetnodes)) + allresults[treeid].append( + (mwenodes, nearmissnodes, supersetnodes)) if treeid != mwe: print(f'<{treeid}> found by query for <{mwe}>') print(markutt(treeid, mwenodes)) @@ -1479,7 +1568,7 @@ def applyqueries(treebank, mwe, mwequery, nearmissquery, supersetquery, lcatexpa else: if treeid == mwe: print(f' <{treeid}> not found by query for <{mwe}>') - print(f' mwenodes:{len(mwenodes)}; nearmiss:{len(nearmissnodes)}; superset:{len(supersetnodes)}') - + print( + f' mwenodes:{len(mwenodes)}; nearmiss:{len(nearmissnodes)}; superset:{len(supersetnodes)}') return allresults diff --git a/mwe_query/getalpinomwus.py b/mwe_query/getalpinomwus.py index f9719cc..c5173a8 100644 --- a/mwe_query/getalpinomwus.py +++ b/mwe_query/getalpinomwus.py @@ -7,15 +7,15 @@ nounmwus = [] with open(inputfilename, 'r', encoding='utf8') as infile: - text = infile.read() - matchiterator = nounmwusre.finditer(text) - for match in matchiterator: - rawmwu = match.group(2) + text = infile.read() + matchiterator = nounmwusre.finditer(text) + for match in matchiterator: + rawmwu = match.group(2) - mwu = rawmwu - nounmwus.append(mwu) + mwu = rawmwu + nounmwus.append(mwu) outfilename = './testdata/nounmwus.txt' with open(outfilename, 'w', encoding='utf8') as outfile: for mwu in nounmwus: - print(mwu, file=outfile) \ No newline at end of file + print(mwu, file=outfile) diff --git a/mwe_query/indextransform.py b/mwe_query/indextransform.py index b6bb14d..d890ebd 100644 --- a/mwe_query/indextransform.py +++ b/mwe_query/indextransform.py @@ -1,3 +1,5 @@ +# flake8: noqa +# TODO: implement this file from copy import copy indexdict = {} @@ -7,5 +9,5 @@ def makeindexdict(stree): indexdict[index] = stree for i , node in indexdict.items(): - + pass # TODO diff --git a/mwe_query/lcat.py b/mwe_query/lcat.py index b296b9f..358d38b 100644 --- a/mwe_query/lcat.py +++ b/mwe_query/lcat.py @@ -3,6 +3,7 @@ import copy import lxml.etree as ET + def expandnonheadwords(stree: SynTree) -> SynTree: # it is presupposed that the input stree is not None newnode = copy.copy(stree) @@ -12,7 +13,7 @@ def expandnonheadwords(stree: SynTree) -> SynTree: for child in stree: if terminal(child): rel = gav(child, 'rel') - if rel not in ['hd', 'mwp', 'svp', 'hdf', 'cmp']: + if rel not in ['hd', 'mwp', 'svp', 'hdf', 'cmp']: newchild = mkphrase(child) else: newchild = copy.copy(child) @@ -25,6 +26,7 @@ def expandnonheadwords(stree: SynTree) -> SynTree: newnode.append(newchild) return newnode + def getlcatatt(node: SynTree) -> str: pt = gav(node, 'pt') cat = gav(node, 'cat') @@ -62,9 +64,8 @@ def mkphrase(child: SynTree) -> SynTree: return newnode -def getlcat(node: SynTree, prel=None) -> str: +def getlcat(node: SynTree, prel=None) -> str: # noqa: C901 pt = gav(node, 'pt') - cat = gav(node, 'cat') rel = gav(node, 'rel') if prel is None else prel positie = gav(node, 'positie') wvorm = gav(node, 'wvorm') @@ -119,7 +120,7 @@ def getlcat(node: SynTree, prel=None) -> str: result = 'ap' elif 'adverb' in frame: result = 'advp' - elif 'post_p' in frame or 'preposition' in frame : + elif 'post_p' in frame or 'preposition' in frame: result = 'pp' else: result = 'pp' @@ -134,10 +135,10 @@ def getlcat(node: SynTree, prel=None) -> str: result = 'xp' elif wvorm == 'inf' and positie == 'nom': result = 'np' - elif wvorm == 'inf'and positie == 'vrij': + elif wvorm == 'inf' and positie == 'vrij': result = 'inf' elif wvorm == 'inf' and positie == 'prenom': - result = 'inf' #checked in Lassy-Small + result = 'inf' # checked in Lassy-Small elif wvorm == 'pv': result = 'sv1' else: @@ -158,7 +159,7 @@ def getlcat(node: SynTree, prel=None) -> str: result = 'np' elif positie == 'prenom' and 'determiner' in frame: result = 'detp' - elif 'positie' not in node.attrib and vwtype== 'aanw': + elif 'positie' not in node.attrib and vwtype == 'aanw': result = 'detp' elif rel == 'det' and vwtype == 'aanw': result = 'detp' @@ -179,13 +180,3 @@ def getlcat(node: SynTree, prel=None) -> str: ET.dump(node) return result - - - - - - - - - result = 'xp' - return result From 401f64c80f1309ad73faf29540ecff1515d7babf Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Tue, 6 Dec 2022 13:58:14 +0100 Subject: [PATCH 2/4] Fix unit tests --- mwe_query/canonicalform.py | 10 +- tests/data/transform/0-0.xml | 8 + tests/data/transform/1-0.xml | 8 + tests/data/transform/2-0.xml | 8 + tests/data/transform/3-0.xml | 7 + tests/data/transform/4-0.xml | 8 + tests/data/transform/5-0.xml | 8 + tests/data/transform/6-0.xml | 8 + tests/data/transform/mwes.txt | 7 + tests/data/transform/tree.xml | 58 ++ tests/test_expand.py | 6 +- ...{preprocess_test.py => test_preprocess.py} | 919 +++++++++--------- tests/update_outputs.py | 57 +- 13 files changed, 651 insertions(+), 461 deletions(-) create mode 100644 tests/data/transform/0-0.xml create mode 100644 tests/data/transform/1-0.xml create mode 100644 tests/data/transform/2-0.xml create mode 100644 tests/data/transform/3-0.xml create mode 100644 tests/data/transform/4-0.xml create mode 100644 tests/data/transform/5-0.xml create mode 100644 tests/data/transform/6-0.xml create mode 100644 tests/data/transform/mwes.txt create mode 100644 tests/data/transform/tree.xml rename tests/{preprocess_test.py => test_preprocess.py} (58%) diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index a536428..d3a7cd3 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -1,15 +1,15 @@ from typing import List, Optional, Set -from sastatypes import SynTree +from sastadev.sastatypes import SynTree import re import sys -from treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \ +from sastadev.treebankfunctions import getattval as gav, terminal, getnodeyield, find1, bareindexnode, indextransform, \ getindexednodesmap, getbasicindexednodesmap, clausebodycats import lxml.etree as ET import copy -from adpositions import vzazindex -from alpinoparsing import parse -from lcat import expandnonheadwords +from mwe_query.adpositions import vzazindex +from sastadev.alpinoparsing import parse +from mwe_query.lcat import expandnonheadwords Xpathexpression = str diff --git a/tests/data/transform/0-0.xml b/tests/data/transform/0-0.xml new file mode 100644 index 0000000..2468b81 --- /dev/null +++ b/tests/data/transform/0-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/1-0.xml b/tests/data/transform/1-0.xml new file mode 100644 index 0000000..53fbbde --- /dev/null +++ b/tests/data/transform/1-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/2-0.xml b/tests/data/transform/2-0.xml new file mode 100644 index 0000000..4a3718b --- /dev/null +++ b/tests/data/transform/2-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/3-0.xml b/tests/data/transform/3-0.xml new file mode 100644 index 0000000..8f45b92 --- /dev/null +++ b/tests/data/transform/3-0.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/4-0.xml b/tests/data/transform/4-0.xml new file mode 100644 index 0000000..b1cb7be --- /dev/null +++ b/tests/data/transform/4-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/5-0.xml b/tests/data/transform/5-0.xml new file mode 100644 index 0000000..b1cb7be --- /dev/null +++ b/tests/data/transform/5-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/6-0.xml b/tests/data/transform/6-0.xml new file mode 100644 index 0000000..9191b94 --- /dev/null +++ b/tests/data/transform/6-0.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/tests/data/transform/mwes.txt b/tests/data/transform/mwes.txt new file mode 100644 index 0000000..f0f0d29 --- /dev/null +++ b/tests/data/transform/mwes.txt @@ -0,0 +1,7 @@ +iemand zal de dans ontspringen +iemand zal de *dans ontspringen +iemand zal de +dans ontspringen +iemand zal 0de dans ontspringen +iemand zal de +*dans ontspringen +iemand zal de *+dans ontspringen +iemand zal de =dans ontspringen diff --git a/tests/data/transform/tree.xml b/tests/data/transform/tree.xml new file mode 100644 index 0000000..9c01073 --- /dev/null +++ b/tests/data/transform/tree.xml @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + iemand zal de dans ontspringen + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_expand.py b/tests/test_expand.py index c92e7f7..f976f56 100644 --- a/tests/test_expand.py +++ b/tests/test_expand.py @@ -1,11 +1,15 @@ import unittest +import os import xml.etree.ElementTree as ET from mwe_query import expand_index_nodes + class TextIndexExpansion(unittest.TestCase): + def data_path(self, filename): + return os.path.join(os.path.dirname(__file__), "data", filename) def test_no_infinite_loop(self): - with open('tests/data/expand/001.xml') as f: + with open(self.data_path('expand/001.xml')) as f: doc = ET.parse(f) expand_index_nodes(doc) diff --git a/tests/preprocess_test.py b/tests/test_preprocess.py similarity index 58% rename from tests/preprocess_test.py rename to tests/test_preprocess.py index 724cac1..d43dc72 100644 --- a/tests/preprocess_test.py +++ b/tests/test_preprocess.py @@ -1,16 +1,18 @@ -from canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \ +import unittest +from mwe_query.canonicalform import preprocess_MWE, annotationstrings, transformtree, listofsets2setoflists, \ genvariants, trees2xpath, removesuperfluousindexes, newgenvariants, lowerpredm, relpronsubst, expandfull, \ - generatequeries, applyqueries, selfapplyqueries + generatequeries, applyqueries, selfapplyqueries, variable, com, noann import os import sys import lxml.etree as ET +from difflib import context_diff from treebankfunctions import getstree, getyield, indextransform, getyieldstr from alpinoparsing import parse -from lcat import expandnonheadwords +from mwe_query.lcat import expandnonheadwords -##DONE -##indextransform uitstellen +# DONE +# indextransform uitstellen # index meenemen # speciale behandeling voor bareindexnodes # varianten genereren @@ -24,6 +26,7 @@ comma = ',' tab = '\t' + def gettopnode(stree): for child in stree: if child.tag == 'node': @@ -31,68 +34,88 @@ def gettopnode(stree): return None -def main(): - inputfilename = r'./testdata/all_mwes_2022-08-22.txt' - base, ext = os.path.splitext(inputfilename) - outfilename = base + '_annotated' + ext - with open(inputfilename, 'r', encoding='utf8') as infile: - with open(outfilename, 'w', encoding='utf8') as outfile: - linenr = 0 - for idmwe in infile: - linenr += 1 - # skip header - if linenr == 1: - continue - idmwelist = idmwe.split(tab) - id = idmwelist[0] - mwe = idmwelist[1][:-1] - annotatedlist = preprocess_MWE(mwe) - wlist = [el[0] for el in annotatedlist] - annlist = [el[1] for el in annotatedlist] - wliststr = space.join(wlist) - annliststr = comma.join([str(i) for i in annlist]) - print(f'{mwe};{wliststr};{annliststr}', file=outfile) - b, sym = containsillegalsymbols(wliststr) - if b: - print(f'Illegal symbol {sym} in {wliststr}', file=sys.stderr) - -def mktreebank(dict, outfilename): - treebank = ET.Element('treebank') - for mwe in dict: - tree = parse(mwe) - treebank.append(tree) - - fulltreebank = ET.ElementTree(treebank) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - - - - -def test1(): - mwe = 'iemand zal blikken com:[met iemand] wisselen' - annotatedlist = preprocess_MWE(mwe) - print(annotatedlist) - - -def test2(): - mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] - mwes += ['iemand zal 0de dans ontspringen'] - mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen'] - mwes = ['iemand zal de =dans ontspringen'] - for mwe in mwes: +class TextIndexExpansion(unittest.TestCase): + def data_path(self, *paths): + return os.path.join(os.path.dirname(__file__), "data", *paths) + + def main(self): + inputfilename = self.data_path('all_mwes_2022-08-22.txt') + base, ext = os.path.splitext(inputfilename) + outfilename = base + '_annotated' + ext + with open(inputfilename, 'r', encoding='utf8') as infile: + with open(outfilename, 'w', encoding='utf8') as outfile: + linenr = 0 + for idmwe in infile: + linenr += 1 + # skip header + if linenr == 1: + continue + idmwelist = idmwe.split(tab) + id = idmwelist[0] + mwe = idmwelist[1][:-1] + annotatedlist = preprocess_MWE(mwe) + wlist = [el[0] for el in annotatedlist] + annlist = [el[1] for el in annotatedlist] + wliststr = space.join(wlist) + annliststr = comma.join([str(i) for i in annlist]) + print(f'{mwe};{wliststr};{annliststr}', file=outfile) + b, sym = self.containsillegalsymbols(wliststr) + if b: + print( + f'Illegal symbol {sym} in {wliststr}', file=sys.stderr) + + def mktreebank(self, dict, outfilename): + treebank = ET.Element('treebank') + for mwe in dict: + tree = parse(mwe) + treebank.append(tree) + + fulltreebank = ET.ElementTree(treebank) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def test_annotation(self): + mwe = 'iemand zal blikken com:[met iemand] wisselen' annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = strees[1] - mweparse = gettopnode(fullmweparse) - newtrees = transformtree(mweparse, annotations) - print(f'{mwe}:') - for newtree in newtrees: - ET.dump(newtree) - - -streestrings = {} -streestrings[1] = """ + assert annotatedlist == [ + ('iemand', variable), + ('zal', noann), + ('blikken', noann), + ('met', com), + ('iemand', com), + ('wisselen', noann)] + + def test_transform(self): + with open(self.data_path("transform", "mwes.txt"), encoding="utf-8", mode="r") as f: + mwes = f.readlines() + + i = 0 + for mwe in mwes: + if not mwe: + continue + + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + cleanmwe = space.join([el[0] for el in annotatedlist]) + fullmweparse = self.strees[1] + mweparse = gettopnode(fullmweparse) + newtrees = transformtree(mweparse, annotations) + j = 0 + for newtree in newtrees: + ET.indent(newtree) + actual = ET.tostring(newtree, encoding="unicode").splitlines(True) + with open(self.data_path("transform", f"{i}-{j}.xml"), encoding="utf-8", mode="r") as f: + expected = f.readlines() + diff = ''.join(context_diff(expected, actual)) + try: + assert not diff + except: + print(diff) + raise + j += 1 + i += 1 + + streestrings = {} + streestrings[1] = """ @@ -154,7 +177,7 @@ def test2(): """ -streestrings[2] = """ + streestrings[2] = """ @@ -228,7 +251,7 @@ def test2(): """ -streestrings[3] = """ + streestrings[3] = """ @@ -286,7 +309,7 @@ def test2(): """ -streestrings[4] = """ + streestrings[4] = """ @@ -321,7 +344,7 @@ def test2(): """ -streestrings[5] = """ + streestrings[5] = """ @@ -353,7 +376,7 @@ def test2(): """ -streestrings[6] = """ + streestrings[6] = """ @@ -382,381 +405,395 @@ def test2(): """ -strees = {i: ET.fromstring(streestrings[i]) for i in streestrings} - - -def containsillegalsymbols(mwe): - for el in annotationstrings: - if el in mwe: - return True, el - return (False, None) - - -def test3(): - lofs = [[1, 2], [3, 4], [5, 6]] - results = listofsets2setoflists(lofs) - for result in results: - print(result) - - -def getmwedict(intbfilename): - mwedict = {} - fulltreebank = getstree(intbfilename) - treebank = fulltreebank.getroot() - for stree in treebank: - keylist = getyield(stree) - key = space.join(keylist) - richstree = stree - #richstree = indextransform(stree) # put off because this should happen later - mwedict[key] = richstree - return mwedict - -def testrel(): - for i in {4,5,6}: - newstree = relpronsubst(strees[i]) - ET.dump(newstree) - -def test4(): - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - mwedict = getmwedict(intbfilename) - mwes = ['iemand zal de dans ontspringen', 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] - mwes += ['iemand zal 0de dans ontspringen'] - mwes += ['iemand zal de +*dans ontspringen', 'iemand zal de *+dans ontspringen'] - mwes += ['iemand zal de =dans ontspringen'] - mwes += ['dat mes zal aan twee kanten snijden'] - mwes += ['0nu zal de aap uit de mouw komen'] - mwes += ['iemand zal de schuld op zich nemen'] - mwes += ['iemand zal buiten zichzelf zijn'] - mwes += ['iemand zal veel in zijn mars hebben'] - mwes += ['bij nacht en ontijd'] - mwes += ['iemand zal blikken com:[met] iemand wisselen'] # still something wrong here - mwes += ['dd:[dat] mes zal aan twee kanten snijden'] - mwes += ['iets zal er inzitten'] - mwes += ['iemand zal in touw zijn'] - mwes += ['iemand zal aan iemand een *hekel hebben'] - mwes += ['iemand zal 0geen gras over iets laten groeien'] - #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ] - mwes += ['#door dik en dun'] - mwes += ['#ad patres'] - mwes += ['ad patres'] - mwes += ['iemand zal aan de kant #gaan'] - mwes += ['iemand zal aan de kant gaan'] - - for mwe in mwes: - annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = None - if cleanmwe in mwedict: - fullmweparse = mwedict[cleanmwe] - #ET.dump(fullmweparse) - elif mwe in mwedict: + @property + def strees(self): + return {i: ET.fromstring(self.streestrings[i]) for i in self.streestrings} + + def containsillegalsymbols(self, mwe): + for el in annotationstrings: + if el in mwe: + return True, el + return (False, None) + + def test_lofs(self): + lofs = [[1, 2], [3, 4], [5, 6]] + results = listofsets2setoflists(lofs) + assert results == [ + [1, 3, 5], + [1, 3, 6], + [1, 4, 5], + [1, 4, 6], + [2, 3, 5], + [2, 3, 6], + [2, 4, 5], + [2, 4, 6]] + + def getmwedict(self, intbfilename): + mwedict = {} + fulltreebank = getstree(intbfilename) + treebank = fulltreebank.getroot() + for stree in treebank: + keylist = getyield(stree) + key = space.join(keylist) + richstree = stree + # richstree = indextransform(stree) # put off because this should happen later + mwedict[key] = richstree + return mwedict + + def test_rel(self): + for i in {4, 5, 6}: + newstree = relpronsubst(self.strees[i]) + ET.dump(newstree) + + def test4(self): + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + mwedict = self.getmwedict(intbfilename) + mwes = ['iemand zal de dans ontspringen', + 'iemand zal de *dans ontspringen', 'iemand zal de +dans ontspringen'] + mwes += ['iemand zal 0de dans ontspringen'] + mwes += ['iemand zal de +*dans ontspringen', + 'iemand zal de *+dans ontspringen'] + mwes += ['iemand zal de =dans ontspringen'] + mwes += ['dat mes zal aan twee kanten snijden'] + mwes += ['0nu zal de aap uit de mouw komen'] + mwes += ['iemand zal de schuld op zich nemen'] + mwes += ['iemand zal buiten zichzelf zijn'] + mwes += ['iemand zal veel in zijn mars hebben'] + mwes += ['bij nacht en ontijd'] + # still something wrong here + mwes += ['iemand zal blikken com:[met] iemand wisselen'] + mwes += ['dd:[dat] mes zal aan twee kanten snijden'] + mwes += ['iets zal er inzitten'] + mwes += ['iemand zal in touw zijn'] + mwes += ['iemand zal aan iemand een *hekel hebben'] + mwes += ['iemand zal 0geen gras over iets laten groeien'] + #mwes = ['iemand zal iets | Iemand op zijn dak krijgen' ] + mwes += ['#door dik en dun'] + mwes += ['#ad patres'] + mwes += ['ad patres'] + mwes += ['iemand zal aan de kant #gaan'] + mwes += ['iemand zal aan de kant gaan'] + + for mwe in mwes: + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + cleanmwe = space.join([el[0] for el in annotatedlist]) + fullmweparse = None + if cleanmwe in mwedict: + fullmweparse = mwedict[cleanmwe] + # ET.dump(fullmweparse) + elif mwe in mwedict: + fullmweparse = mwedict[mwe] + if fullmweparse is not None: + mweparse = gettopnode(fullmweparse) + newtreesa = transformtree(mweparse, annotations) + newtrees = [] + for newtreea in newtreesa: + newtrees += genvariants(newtreea) + newtrees.extend(newtreesa) + print(f'{mwe}:') + for newtree in newtrees: + # print(f'{i+1}:') + print() + ET.dump(newtree) + else: + print(f'MWE <{cleanmwe}> not found ', file=sys.stderr) + + def mkoutfilename(self, infilename: str, suffix: str, ext=None) -> str: + basefilename, inext = os.path.splitext(infilename) + if ext is None: + ext = inext + result = basefilename + suffix + ext + return result + + def base_testfind(self, basemwe, xpath, mwedict, all=False): + results = [] + localxpath = "." + xpath + for mwe in mwedict: + origmwetree = mwedict[mwe] + mwetree = lowerpredm(origmwetree) + mweyield = getyield(mwetree) + mwestr = space.join(mweyield) + # ET.dump(mwetree) + # print(f'mwe={mwe}') + # print(f'xpath:\n{localxpath}\n') + mwehits = mwetree.xpath(localxpath) + newresult = (basemwe, mwe, len(mwehits)) + results.append(newresult) + + for basemwe, mwe, count in results: + if all: + cond = True + else: + cond = (basemwe == mwe and count != 1) or ( + basemwe != mwe and count != 0) + if cond: + print(basemwe, mwe, count, file=sys.stderr) + + @unittest.skip("slooooow") + def test5(self): + reportevery = 500 + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + mwedict = self.getmwedict(intbfilename) + # next one is problematic, so we delete it + problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen' + if problemmwe in mwedict: + del mwedict[problemmwe] + #mwedict = {} + #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2] + #mwedict['iemand zal ’m smeren'] = strees[3] + # for ind, tree in expandedmwedict.items(): + # print(ind) + # ET.dump(tree) + suffix = '_trees' + outfilename = self.mkoutfilename(intbfilename, suffix) + # with open(outfilename, 'w', encoding='utf8') as outfile: + treebank = ET.Element('treebank') + inds = ['iemand zal uit iemands koker komen'] + inds = ['iemand zal slechte invloed op iemand hebben'] + inds = ['iemand zal met de pet naar iets gooien'] + inds = ['de tale Kanaäns'] + inds += ['heel af en toe'] + inds += ['na verloop van tijd'] + inds += ['al doende zal men leren'] + inds = ['iemand zal de schuld van iets op iemand schuimwedicven'] + inds += ['iemand zal iets door de vingers zien'] + inds += ['iemand zal achter iets komen'] + inds += ['iemand zal uit iemands koker komen'] + inds += ['al doende zal men leren'] + # inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions + inds += ['te dom om voor de duivel te dansen'] + inds += ['zo doof als een kwartel'] + inds = ['iemand zal veel ellende over iemand uitstorten'] + #mwedict = {ind: mwedict[ind] for ind in inds} + expandedmwedict = {mwe: indextransform( + tree) for mwe, tree in mwedict.items()} + counter = 0 + for mwe in mwedict: + counter += 1 + mwe_element = ET.Element('mwe', attrib={'mwe': mwe}) + #print(mwe, file=sys.stderr) + if counter % reportevery == 0: + print(counter, file=sys.stderr) + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + #cleanmwe = space.join([el[0] for el in annotatedlist]) fullmweparse = mwedict[mwe] - if fullmweparse is not None: mweparse = gettopnode(fullmweparse) + # if mweparse is None: + # #print(f'\n\n{mwe}:', file=outfile) + # #print('None') + # continue + treeyield = getyield(mweparse) + treeyieldstr = space.join(treeyield) + if treeyieldstr != mwe: + print(f'mismatch:\n{treeyieldstr}=/={mwe} ') + continue newtreesa = transformtree(mweparse, annotations) newtrees = [] for newtreea in newtreesa: - newtrees += genvariants(newtreea) - newtrees.extend(newtreesa) - print(f'{mwe}:') - for newtree in newtrees: - #print(f'{i+1}:') - print() - ET.dump(newtree) - else: - print(f'MWE <{cleanmwe}> not found ', file=sys.stderr) - -def mkoutfilename(infilename: str, suffix: str, ext=None) -> str: - basefilename, inext = os.path.splitext(infilename) - if ext is None: - ext = inext - result = basefilename + suffix + ext - return result - -def testfind(basemwe, xpath, mwedict, all=False): - results = [] - localxpath = "." + xpath - for mwe in mwedict: - origmwetree = mwedict[mwe] - mwetree = lowerpredm(origmwetree) - mweyield = getyield(mwetree) - mwestr = space.join(mweyield) - #ET.dump(mwetree) - #print(f'mwe={mwe}') - #print(f'xpath:\n{localxpath}\n') - mwehits = mwetree.xpath(localxpath) - newresult = (basemwe, mwe, len(mwehits)) - results.append(newresult) - - - for basemwe, mwe, count in results: - if all: - cond = True - else: - cond = (basemwe == mwe and count!=1) or (basemwe != mwe and count!=0) - if cond: - print(basemwe, mwe, count, file=sys.stderr ) - - - -def test5(): - reportevery = 500 - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - mwedict = getmwedict(intbfilename) - # next one is problematic, so we delete it - problemmwe = 'iemand zal iets | Iemand op zijn dak krijgen' - if problemmwe in mwedict: - del mwedict[problemmwe] - #mwedict = {} - #mwedict['wat het oog niet ziet zal het hart niet deren'] = strees[2] - #mwedict['iemand zal ’m smeren'] = strees[3] - #for ind, tree in expandedmwedict.items(): - # print(ind) - # ET.dump(tree) - suffix = '_trees' - outfilename = mkoutfilename(intbfilename, suffix) -# with open(outfilename, 'w', encoding='utf8') as outfile: - treebank = ET.Element('treebank') - inds = ['iemand zal uit iemands koker komen'] - inds = ['iemand zal slechte invloed op iemand hebben'] - inds = ['iemand zal met de pet naar iets gooien'] - inds = ['de tale Kanaäns'] - inds += ['heel af en toe'] - inds += ['na verloop van tijd'] - inds += ['al doende zal men leren'] - inds = ['iemand zal de schuld van iets op iemand schuimwedicven'] - inds += ['iemand zal iets door de vingers zien'] - inds += ['iemand zal achter iets komen'] - inds += ['iemand zal uit iemands koker komen'] - inds += ['al doende zal men leren'] - #inds = ['die wind zaait zal storm zullen oogsten'] we must not have zullen with these expressions - inds += ['te dom om voor de duivel te dansen'] - inds += ['zo doof als een kwartel'] - inds = ['iemand zal veel ellende over iemand uitstorten'] - #mwedict = {ind: mwedict[ind] for ind in inds} - expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} - counter = 0 - for mwe in mwedict: - counter += 1 - mwe_element = ET.Element('mwe', attrib={'mwe': mwe}) - #print(mwe, file=sys.stderr) - if counter % reportevery == 0: - print(counter, file=sys.stderr) - annotatedlist = preprocess_MWE(mwe) - annotations = [el[1] for el in annotatedlist] - #cleanmwe = space.join([el[0] for el in annotatedlist]) - fullmweparse = mwedict[mwe] - mweparse = gettopnode(fullmweparse) - #if mweparse is None: - # #print(f'\n\n{mwe}:', file=outfile) - # #print('None') - # continue - treeyield = getyield(mweparse) - treeyieldstr = space.join(treeyield) - if treeyieldstr != mwe: - print(f'mismatch:\n{treeyieldstr}=/={mwe} ' ) - continue - newtreesa = transformtree(mweparse, annotations) - newtrees = [] - for newtreea in newtreesa: - newtrees += newgenvariants(newtreea) - #newtrees.extend(newtreesa) - #print(f'\n\n{mwe}:', file=outfile) - cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees] - #cleantrees = newtrees - #print('cleantrees:') - #for cleantree in cleantrees: - # ET.dump(cleantree) - mwe_element.extend(cleantrees) - xpath = trees2xpath(cleantrees, expanded=True) - #print(xpath) - xpath_element = ET.Element('xpath') - xpath_element.text = xpath - mwe_element.append(xpath_element) - treebank.append(mwe_element) - testfind(mwe, xpath, expandedmwedict) - #ET.dump(treebank) - # for newtree in newtrees: - # #print(f'{i+1}:') - # print() - # treebank.append(newtree) - fulltreebank = ET.ElementTree(treebank) - #ET.indent(newtree, space=" ") - #print(ET.tostring(newtree), file=outfile) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - -def check(treebankdict): - for utt, stree in treebankdict.items(): - for node in stree.iter(): - if 'pt' in node.attrib: - for att in {'begin','end'}: - if 'id' in node.attrib: - id = node.attrib['id'] - else: - id = 'None' - if att not in node.attrib: - print(f'missing {att} in node with id={id}, pt={node.attrib["pt" ]}.') - ET.dump(stree) - - -def getutts(infilename): - #each utterance on a separate line, discard the final \n and skip empty lines - infile = open(infilename, 'r', encoding='utf8') - rawutts = infile.readlines() - utts = [rawutt[:-1] for rawutt in rawutts if len(rawutt) > 1] - return utts - - -def testvariatie(): - mwetreebank = './testdata/mwesvoorvariatie-noann_treebank.xml' - mwedict = getmwedict(mwetreebank) - #expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} - testtreebankfilename = './testdata/testzinnen mwevarianten_treebank.xml' - fullvariationtreebank = getstree(testtreebankfilename) - variationtreebank = fullvariationtreebank.getroot() - variationtreebankdict= {getyieldstr(tree): expandfull(tree) for tree in variationtreebank} - #check(variationtreebankdict) - annotatedmwefilename = './testdata/mwesvoorvariatie-annotated.txt' - annotatedmwes = getutts(annotatedmwefilename) - suffix = '_derivedtrees' - outfilename = mkoutfilename(mwetreebank, suffix) - treebank = ET.Element('treebank') - counter = 0 - reportevery = 500 - - #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal aan 0de *+dans ontspringen'] - #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal de plaat poetsen'] - for rawmwe in annotatedmwes: - counter += 1 - mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe}) - #print(mwe, file=sys.stderr) - if counter % reportevery == 0: - print(counter, file=sys.stderr) - annotatedlist = preprocess_MWE(rawmwe) - annotations = [el[1] for el in annotatedlist] - mweparts = [el[0] for el in annotatedlist] - mwe = space.join(mweparts) - fullmweparse = mwedict[mwe] - mweparse = gettopnode(fullmweparse) - #if mweparse is None: - # #print(f'\n\n{mwe}:', file=outfile) - # #print('None') - # continue - treeyield = getyield(mweparse) - treeyieldstr = space.join(treeyield) - if treeyieldstr != mwe: - print(f'mismatch:\n{treeyieldstr}=/={mwe} ' ) - continue - newtreesa = transformtree(mweparse, annotations) - newtrees = [] - for newtreea in newtreesa: - newtrees += newgenvariants(newtreea) - #newtrees.extend(newtreesa) - #print(f'\n\n{mwe}:', file=outfile) - cleantrees = [removesuperfluousindexes(newtree) for newtree in newtrees] - #cleantrees = newtrees - #print('cleantrees:') - #for cleantree in cleantrees: - # ET.dump(cleantree) - mwe_element.extend(cleantrees) - xpath = trees2xpath(cleantrees, expanded=True) - #print(xpath) - xpath_element = ET.Element('xpath') - xpath_element.text = xpath - mwe_element.append(xpath_element) - treebank.append(mwe_element) - testfind(mwe, xpath, variationtreebankdict) - #ET.dump(treebank) - # for newtree in newtrees: - # #print(f'{i+1}:') - # print() - # treebank.append(newtree) - fulltreebank = ET.ElementTree(treebank) - #ET.indent(newtree, space=" ") - #print(ET.tostring(newtree), file=outfile) - fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) - -def gentreebank(): - #generate a new treebank because a new parser is being used - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - suffix = '_parse2022-11-18' - outfilename = mkoutfilename(intbfilename, suffix) - mwedict = getmwedict(intbfilename) - mktreebank(mwedict, outfilename) - - - -def genqueries(): - selftest = False - mwes = [ 'iemand zal een poging doen', 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen'] - mwes += ['iemand zal *honger hebben'] - #mwes = ['iemand zal 0een *+poging doen'] - #intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID.xml' - intbfilename = './testdata/MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml' - suffix = '_querytriples' - outfilename = mkoutfilename(intbfilename, suffix) - mwedict = getmwedict(intbfilename) - #selectedmwe = 'af en toe' - mwes = [mwe for mwe, _ in mwedict.items() ] - # mwes = ['iemand zal 0een *+poging doen'] - # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten'] - # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald'] - # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken'] - # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden'] - # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging'] - # mwes += ['dd:[dat] zelfde liedje'] - # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten'] - # mwes += ['iemand zal veel ellende over iemand uitstorten'] - # mwes += ['iemand zal aanhangen als een klis'] - # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren', - # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn', - # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan'] - # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn', - # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen'] - # mwes += ['iemand zal buiten zichzelf zijn'] - # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn', - # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen', - # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter', - # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat', - # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets', - # 'wie een hond wil slaan zal licht een stok vinden', - # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen'] - #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen'] - #mwes += ['het zal zaliger zijn te geven dan te ontvangen'] - with open(outfilename, 'w', encoding='utf8') as outfile: - for mwe in mwes: - print(mwe) - (mweq, nearmissq, supersetq) = generatequeries(mwe) - print(f'\n{mwe}:', file=outfile) - print(f'mweq:\n{mweq}', file=outfile) - - print(f'nearmissq:\n{nearmissq}', file=outfile) + newtrees += newgenvariants(newtreea) + # newtrees.extend(newtreesa) + #print(f'\n\n{mwe}:', file=outfile) + cleantrees = [removesuperfluousindexes( + newtree) for newtree in newtrees] + #cleantrees = newtrees + # print('cleantrees:') + # for cleantree in cleantrees: + # ET.dump(cleantree) + mwe_element.extend(cleantrees) + xpath = trees2xpath(cleantrees, expanded=True) + # print(xpath) + xpath_element = ET.Element('xpath') + xpath_element.text = xpath + mwe_element.append(xpath_element) + treebank.append(mwe_element) + self.base_testfind(mwe, xpath, expandedmwedict) + # ET.dump(treebank) + # for newtree in newtrees: + # #print(f'{i+1}:') + # print() + # treebank.append(newtree) + fulltreebank = ET.ElementTree(treebank) + #ET.indent(newtree, space=" ") + #print(ET.tostring(newtree), file=outfile) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def check(self, treebankdict): + for utt, stree in treebankdict.items(): + for node in stree.iter(): + if 'pt' in node.attrib: + for att in {'begin', 'end'}: + if 'id' in node.attrib: + id = node.attrib['id'] + else: + id = 'None' + if att not in node.attrib: + print( + f'missing {att} in node with id={id}, pt={node.attrib["pt" ]}.') + ET.dump(stree) + + def getutts(self, infilename): + # each utterance on a separate line, discard the final \n and skip empty lines + with open(infilename, 'r', encoding='utf8') as infile: + rawutts = infile.readlines() + utts = [rawutt[:-1] for rawutt in rawutts if len(rawutt) > 1] + return utts + + @unittest.skip("not deterministic") + def test_variatie(self): + mwetreebank = self.data_path('mwesvoorvariatie-noann_treebank.xml') + mwedict = self.getmwedict(mwetreebank) + #expandedmwedict = {mwe:indextransform(tree) for mwe, tree in mwedict.items()} + testtreebankfilename = self.data_path('testzinnen mwevarianten_treebank.xml') + fullvariationtreebank = getstree(testtreebankfilename) + variationtreebank = fullvariationtreebank.getroot() + variationtreebankdict = {getyieldstr(tree): expandfull( + tree) for tree in variationtreebank} + # check(variationtreebankdict) + annotatedmwefilename = self.data_path('mwesvoorvariatie-annotated.txt') + annotatedmwes = self.getutts(annotatedmwefilename) + suffix = '_derivedtrees' + outfilename = self.mkoutfilename(mwetreebank, suffix) + treebank = ET.Element('treebank') + counter = 0 + reportevery = 500 + + #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal aan 0de *+dans ontspringen'] + #annotatedmwes = [amwe for amwe in annotatedmwes if amwe=='iemand zal de plaat poetsen'] + for rawmwe in annotatedmwes: + counter += 1 + mwe_element = ET.Element('mwe', attrib={'mwe': rawmwe}) + #print(mwe, file=sys.stderr) + if counter % reportevery == 0: + print(counter, file=sys.stderr) + annotatedlist = preprocess_MWE(rawmwe) + annotations = [el[1] for el in annotatedlist] + mweparts = [el[0] for el in annotatedlist] + mwe = space.join(mweparts) + fullmweparse = mwedict[mwe] + mweparse = gettopnode(fullmweparse) + # if mweparse is None: + # #print(f'\n\n{mwe}:', file=outfile) + # #print('None') + # continue + treeyield = getyield(mweparse) + treeyieldstr = space.join(treeyield) + if treeyieldstr != mwe: + print(f'mismatch:\n{treeyieldstr}=/={mwe} ') + continue + newtreesa = transformtree(mweparse, annotations) + newtrees = [] + for newtreea in newtreesa: + newtrees += newgenvariants(newtreea) + # newtrees.extend(newtreesa) + #print(f'\n\n{mwe}:', file=outfile) + cleantrees = [removesuperfluousindexes( + newtree) for newtree in newtrees] + #cleantrees = newtrees + # print('cleantrees:') + # for cleantree in cleantrees: + # ET.dump(cleantree) + mwe_element.extend(cleantrees) + xpath = trees2xpath(cleantrees, expanded=True) + # print(xpath) + xpath_element = ET.Element('xpath') + xpath_element.text = xpath + mwe_element.append(xpath_element) + treebank.append(mwe_element) + self.base_testfind(mwe, xpath, variationtreebankdict) + # ET.dump(treebank) + # for newtree in newtrees: + # #print(f'{i+1}:') + # print() + # treebank.append(newtree) + fulltreebank = ET.ElementTree(treebank) + #ET.indent(newtree, space=" ") + #print(ET.tostring(newtree), file=outfile) + fulltreebank.write(outfilename, encoding='utf8', pretty_print=True) + + def gentreebank(self): + # generate a new treebank because a new parser is being used + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + suffix = '_parse2022-11-18' + outfilename = self.mkoutfilename(intbfilename, suffix) + mwedict = self.getmwedict(intbfilename) + self.mktreebank(mwedict, outfilename) + + def genqueries(self): + selftest = False + mwes = ['iemand zal een poging doen', + 'iemand zal 0een *+poging doen', 'iemand zal aan de bak komen'] + mwes += ['iemand zal *honger hebben'] + #mwes = ['iemand zal 0een *+poging doen'] + #intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID.xml') + intbfilename = self.data_path('MWE20220429_CORPUS2ALPINO_ID_parse2022-11-18.xml') + suffix = '_querytriples' + outfilename = self.mkoutfilename(intbfilename, suffix) + mwedict = self.getmwedict(intbfilename) + #selectedmwe = 'af en toe' + mwes = [mwe for mwe, _ in mwedict.items()] + # mwes = ['iemand zal 0een *+poging doen'] + # mwes += ['iemand zal achterna zitten', 'iemand zal iemand achterna zitten'] + # mwes += ['iemand zal beter ten halve gekeerd dan ten hele gedwaald'] + # mwes += ['god betere het', 'harde dobbel', 'holland op zijn smalst', 'laatste der mohikanen', 'malle pietje', 'iemand zal zich op iets beslapen', 'iemand zal zich de tandjes werken'] + # mwes += ['iemand zal zich het vuur uit se sloffen lopen', 'iemand zal zich jakes lopen', 'iemand zal zich katoen houden', 'imand zal zich koes houden'] + # mwes += ['iemand doet 0een *+poging', 'iemand doet een poging'] + # mwes += ['dd:[dat] zelfde liedje'] + # mwes += ['iemand zal het dr:[er] 0niet bij laten zitten'] + # mwes += ['iemand zal veel ellende over iemand uitstorten'] + # mwes += ['iemand zal aanhangen als een klis'] + # mwes += ['aanzien zal doen gedenken', 'al doende zal men leren', + # 'al is de leugen nog zo snel de waarheid zal haar wel achterhalen', 'Iets zal allemaal kool zijn', + # 'iets zal allemaal kool zijn', 'iemand zal als een tang op een varken slaan'] + # mwes += ['iemand zal balen als een stekker', 'iemand zal blauw aanlopen', 'iemand zal buiten zichzelf zijn', + # 'iemand zal branden als een lier', 'daar gehakt wordt zullen spaanders vallen'] + # mwes += ['iemand zal buiten zichzelf zijn'] + # mwes = ['iemand zal steen en been over iets klagen', 'iemand zal heer en meester over iets zijn', + # 'een vette gans zal = zichzelf bedruipen', 'een vette gans zal =zichzelf bedruipen', + # 'het zal zaliger zijn te geven dan te ontvangen', 'iemand zal roken als een ketter vloeken als een ketter', + # 'wat het oog niet ziet zal het hart niet deren', 'iemand zal zeggen waar het op staat', + # 'waar het hart vol van is zal de mond van overvloeien', 'in alle hoeken en gaten van iets', + # 'wie een hond wil slaan zal licht een stok vinden', + # 'Wie het onderste uit de kan wil hebben zal het deksel op de neus krijgen'] + #mwes = ['iemand zal ’m van jetje geven', 'iemand zal voor gek lopen'] + #mwes += ['het zal zaliger zijn te geven dan te ontvangen'] + with open(outfilename, 'w', encoding='utf8') as outfile: + for mwe in mwes: + print(mwe) + (mweq, nearmissq, supersetq) = generatequeries(mwe) + print(f'\n{mwe}:', file=outfile) + print(f'mweq:\n{mweq}', file=outfile) - print(f'supersetq:\n{supersetq}', file= outfile) + print(f'nearmissq:\n{nearmissq}', file=outfile) - annotatedlist = preprocess_MWE(mwe) - #annotations = [el[1] for el in annotatedlist] - mweparts = [el[0] for el in annotatedlist] - utt = space.join(mweparts) + print(f'supersetq:\n{supersetq}', file=outfile) - if selftest: - # #self test - (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries(utt, mweq, nearmissq, supersetq) - if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1: - print(f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}') - else: - results = applyqueries(mwedict, mwe, mweq, nearmissq, supersetq) - - -if __name__ == '__main__': - # main() - # test1() - #test2() - #test3() - #test4() - #test5() - #testrel() - #testvariatie() - #gentreebank() - genqueries() + annotatedlist = preprocess_MWE(mwe) + #annotations = [el[1] for el in annotatedlist] + mweparts = [el[0] for el in annotatedlist] + utt = space.join(mweparts) + + if selftest: + # #self test + (mwenodes, nearmissnodes, supersetnodes) = selfapplyqueries( + utt, mweq, nearmissq, supersetq) + if len(mwenodes) != 1 or len(nearmissnodes) != 1 or len(supersetnodes) != 1: + print( + f'mwe:{len(mwenodes)}; nearmiss: {len(nearmissnodes)}; superset:{len(supersetnodes)}') + else: + results = applyqueries( + mwedict, mwe, mweq, nearmissq, supersetq) + + +# if __name__ == '__main__': +# # main() +# # test1() +# #test2() +# #test3() +# #test4() +# #test5() +# #testrel() +# #testvariatie() +# #gentreebank() +# genqueries() diff --git a/tests/update_outputs.py b/tests/update_outputs.py index 48fccd2..d0559fb 100755 --- a/tests/update_outputs.py +++ b/tests/update_outputs.py @@ -7,6 +7,7 @@ import sys from os import path import glob +import lxml.etree as ET testdir = path.dirname(__file__) datadir = path.join(testdir, "data") @@ -14,34 +15,35 @@ # import this implementation sys.path.insert(0, path.join(testdir, "..")) from mwe_query import Mwe +from mwe_query.canonicalform import preprocess_MWE, transformtree -def datapath(filename): - return path.join(datadir, filename) +def datapath(dirname, filename): + return path.join(datadir, dirname, filename) -def read(filename): - with open(datapath(filename)) as f: +def read(dirname, filename): + with open(datapath(dirname, filename)) as f: return f.read() -def write(filename, content): - with open(datapath(filename), "w") as f: +def write(dirname, filename, content): + with open(datapath(dirname, filename), "w") as f: f.write(content) -def update(basename): - lines = read(basename + ".txt").splitlines() +def update_generate(basename): + lines = read("generate", basename + ".txt").splitlines() can_form = lines[0].strip() sentence = lines[1].strip() alpino_xml_filename = basename + ".xml" - if not path.exists(datapath(alpino_xml_filename)): + if not path.exists(datapath("generate", alpino_xml_filename)): print("parsing") alpino_xml = parse_sentence(can_form) - write(alpino_xml_filename, alpino_xml) + write("generate", alpino_xml_filename, alpino_xml) else: - alpino_xml = read(alpino_xml_filename) + alpino_xml = read("generate", alpino_xml_filename) mwe = Mwe(sentence) mwe.set_tree(alpino_xml) @@ -50,10 +52,37 @@ def update(basename): queries = mwe.generate_queries() for query in queries: - write(f"{basename}-{query.rank}.xpath", query.xpath) + write("generate", f"{basename}-{query.rank}.xpath", query.xpath) -input_files = glob.glob(path.join(datadir, '*.txt')) +def gettopnode(stree): + for child in stree: + if child.tag == 'node': + return child + return None + + +def update_transform(): + mwes = read("transform", "mwes.txt").splitlines() + + i = 0 + for mwe in mwes: + annotatedlist = preprocess_MWE(mwe) + annotations = [el[1] for el in annotatedlist] + fullmweparse = ET.fromstring(read("transform", "tree.xml")) + mweparse = gettopnode(fullmweparse) + newtrees = transformtree(mweparse, annotations) + + j = 0 + for newtree in newtrees: + ET.indent(newtree) + write("transform", f"{i}-{j}.xml", ET.tostring(newtree, encoding="unicode")) + j += 1 + + i += 1 + +input_files = glob.glob(path.join(datadir, "generate", '*.txt')) for input in input_files: head, ext = path.splitext(path.basename(input)) - update(head) + update_generate(head) +update_transform() From 0d5c9283fcfe182853426b47131401a4ccf5e901 Mon Sep 17 00:00:00 2001 From: Sheean Spoel Date: Tue, 6 Dec 2022 14:31:54 +0100 Subject: [PATCH 3/4] Added sastadev requirement --- requirements.txt | 4 +++- setup.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9c0655f..9730a7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ # # pip-compile # -alpino-query==2.1.7 +alpino-query==2.1.9 # via mwe-query (setup.py) basexclient==8.4.4 # via mwe-query (setup.py) @@ -20,5 +20,7 @@ requests==2.28.1 # via # alpino-query # mwe-query (setup.py) +sastadev==0.0.3 + # via mwe-query (setup.py) urllib3==1.26.11 # via requests diff --git a/setup.py b/setup.py index 610de78..0b773ae 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ package_data={"mwe_query": ["py.typed"]}, zip_safe=True, install_requires=[ - 'alpino-query>=2.1.8', 'requests', 'BaseXClient' + 'alpino-query>=2.1.8', 'requests', 'BaseXClient', 'sastadev>=0.0.3' ], entry_points={ 'console_scripts': [ From 7160761eec7d69ada53744813101438929b64a0a Mon Sep 17 00:00:00 2001 From: Ben Bonfil Date: Wed, 18 Jan 2023 10:49:20 +0100 Subject: [PATCH 4/4] trying to adapt mksuperquery to use BaseX attr indices (#6) --- mwe_query/canonicalform.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/mwe_query/canonicalform.py b/mwe_query/canonicalform.py index d3a7cd3..217f36f 100644 --- a/mwe_query/canonicalform.py +++ b/mwe_query/canonicalform.py @@ -1433,30 +1433,30 @@ def mknearmiss(mwetrees: List[SynTree]) -> Xpathexpression: def mksuperquery(mwetrees) -> Xpathexpression: - if mwetrees == []: - result = '' - else: - mwetree = mwetrees[0] # we only have to look at the first tree - wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib] - contentwordnodes = [ - node for node in mwetree.iter() if iscontentwordnode(node)] - contentwordnodes = contentwordnodes if len( - contentwordnodes) > 1 else wordnodes - - newmwetree = ET.Element('node', attrib={'cat': 'top'}) - for contentwordnode in contentwordnodes: - cwlemma = gav(contentwordnode, 'lemma') - cwpt = gav(contentwordnode, 'pt') - newcontentwordnode = ET.Element( - 'node', attrib={'lemma': cwlemma, 'pt': cwpt, 'axis': 'descendant'}) - newmwetree.append(newcontentwordnode) - result = tree2xpath(newmwetree) - - # lemmapts = [(gav(node, 'lemma'), gav(node, 'pt')) for node in contentwordnodes] - # lemmaptxpaths = [f'.//node[@lemma="{lemma}" and @pt="{pt}"]' for (lemma, pt) in lemmapts] - # lemmaptcondition = ' and '.join(lemmaptxpaths) - # result = f'//node[@cat="top" and {lemmaptcondition}]' - return result + if len(mwetrees) < 1: + raise RuntimeError('Cannot generate superset query for empty tree set') + + mwetree = mwetrees[0] # we only have to look at the first tree + wordnodes = [node for node in mwetree.iter() if 'pt' in node.attrib] + contentwordnodes = [node for node in mwetree.iter() + if iscontentwordnode(node)] + search_for = contentwordnodes if len(contentwordnodes) > 1 else wordnodes + + target_node = ET.Element('node', attrib={'cat': 'top'}) + children = [] + for node in search_for: + cwlemma = gav(node, 'lemma') + cwpt = gav(node, 'pt') + n = ET.Element('node', attrib=dict(lemma=cwlemma, pt=cwpt, axis='descendant')) + children.append(n) + + del children[0].attrib['axis'] + for child in children[1:]: + target_node.append(child) + + return '//{}/ancestor::alpino_ds/{}'.format( + tree2xpath(children[0]), + tree2xpath(target_node)) def generatequeries(mwe: str, lcatexpansion=True) -> (Xpathexpression, Xpathexpression, Xpathexpression):