diff --git a/ASTApostfunctions.py b/ASTApostfunctions.py index 9ef1e14..73ea9e1 100644 --- a/ASTApostfunctions.py +++ b/ASTApostfunctions.py @@ -1,12 +1,28 @@ from collections import Counter +from treebankfunctions import getattval, getuttid, getnodeyield +from stringfunctions import realwordstring from copy import deepcopy - -from treebankfunctions import getattval, getnodeyield, getuttid +from lexicon import getwordposinfo, getwordinfo lpad = 3 zero = '0' astamaxwordcount = 300 +excluded_lemmas = ['gevallen', 'gewinnen'] + +nounqid = 'A021' +lexqid = 'A018' +samplesizeqid = 'A045' +mluxqid = 'A029' +pvqid = 'A024' +delpvqid = 'A009' +subpvqid = 'A032' +kqid = 'A013' +mqid = 'A020' +tijdfoutpvqid = 'A041' +nounlemmaqid = 'A046' +verblemmaqid = 'A049' + def sumctr(ctr): result = sum(ctr.values()) @@ -16,8 +32,9 @@ def sumctr(ctr): def wordcountperutt(allresults): lemmas = getalllemmas(allresults) wordcounts = {uttid: sum(ctr.values()) for uttid, ctr in lemmas.items()} - ignorewordcounts = deepcopy(allresults.coreresults['A045']) if 'A045' in allresults.coreresults else Counter() # samplesize - ignorewordcounts += allresults.coreresults['A029'] if 'A029' in allresults.coreresults else Counter() # mlux + ignorewordcounts = deepcopy( + allresults.coreresults[samplesizeqid]) if samplesizeqid in allresults.coreresults else Counter() # samplesize + ignorewordcounts += allresults.coreresults[mluxqid] if mluxqid in allresults.coreresults else Counter() # mlux # ignorewordcounts += allresults.coreresults['A050'] if 'A050' in allresults.coreresults else Counter() # echolalie covered by mlux result = {} for uttid in wordcounts: @@ -29,10 +46,10 @@ def wordcountperutt(allresults): def finietheidsindex(allresults, _): - allpvs = allresults.coreresults['A024'] if 'A024' in allresults.coreresults else Counter() - subpvs = allresults.coreresults['A032'] if 'A032' in allresults.coreresults else Counter() - delpvs = allresults.coreresults['A009'] if 'A009' in allresults.coreresults else Counter() - tijdfoutpvs = allresults.coreresults['A041'] if 'A041' in allresults.coreresults else Counter() + allpvs = allresults.coreresults[pvqid] if pvqid in allresults.coreresults else Counter() + subpvs = allresults.coreresults[subpvqid] if subpvqid in allresults.coreresults else Counter() + delpvs = allresults.coreresults[delpvqid] if delpvqid in allresults.coreresults else Counter() + tijdfoutpvs = allresults.coreresults[tijdfoutpvqid] if tijdfoutpvqid in allresults.coreresults else Counter() foutepvs = subpvs + delpvs + tijdfoutpvs allpvcount = sumctr(allpvs) foutepvcount = sumctr(foutepvs) @@ -45,6 +62,7 @@ def finietheidsindex(allresults, _): def countwordsandcutoff(allresults, _): + # @@to be adapted result = (None, 0) if 'A047' in allresults.postresults: paddedlist = [] @@ -64,8 +82,8 @@ def countwordsandcutoff(allresults, _): def KMcount(allresults, _): - Kcount = sumctr(allresults.coreresults['A013']) if 'A013' in allresults.coreresults else 0 - Mcount = sumctr(allresults.coreresults['A020']) if 'A020' in allresults.coreresults else 0 + Kcount = sumctr(allresults.coreresults[kqid]) if kqid in allresults.coreresults else 0 + Mcount = sumctr(allresults.coreresults[mqid]) if mqid in allresults.coreresults else 0 result = Kcount + Mcount return result @@ -84,7 +102,17 @@ def old_old_getlemmas(allresults, _): def getlemmas(allresults, _): - result = getcondlemmas(allresults, _, lambda qid: qid in ['A021', 'A018']) + result = getcondlemmas(allresults, _, lambda qid: qid in [nounqid, lexqid]) + return result + + +def 
getnounlemmas(allresults, _):
+    result = getposlemmas(allresults, nounqid)
+    return result
+
+
+def getlexlemmas(allresults, _):
+    result = getposlemmas(allresults, lexqid)
     return result
 
 
@@ -95,10 +123,15 @@ def realword(node):
 
 def getalllemmas(allresults):
     result = {}
-    for syntree in allresults.analysedtrees:
-        uttid = getuttid(syntree)
-        lemmas = [getattval(node, 'lemma') for node in getnodeyield(syntree) if realword(node)]
-        result[uttid] = Counter(lemmas)
+    if allresults.annotationinput:
+        for uttid in allresults.allutts:
+            lemmas = [bgetlemma(w) for w in allresults.allutts[uttid] if realwordstring(w)]
+            result[uttid] = Counter(lemmas)
+    else:
+        for syntree in allresults.analysedtrees:
+            uttid = getuttid(syntree)
+            lemmas = [getattval(node, 'lemma') for node in getnodeyield(syntree) if realword(node)]
+            result[uttid] = Counter(lemmas)
     return result
 
 
@@ -115,7 +148,7 @@ def old_getlemmas(allresults, _):
     return result
 
 
-def getcondlemmas(allresults, _, cond):
+def oldgetcondlemmas(allresults, _, cond):
     allmatches = allresults.allmatches
     result = Counter()
     for el in allmatches:
@@ -126,3 +159,85 @@
             theword = getattval(amatch[0], 'lemma')
             result.update([(theword, uttid)])
     return result
+
+#not used anymore, contains an error
+def getcondlemmas(allresults, _, cond):
+    result = Counter()
+    if allresults.annotationinput:
+        for qid in allresults.exactresults:
+            if cond(qid):
+                for (uttid, position) in allresults.exactresults[qid]:
+                    word = allresults.allutts[uttid][position - 1]
+                    if qid == 'A021':
+                        pos = 'n'
+                    elif qid == 'A018':
+                        pos = 'ww'
+                    else:
+                        pos = None
+                    lemma = bgetlemma(word, pos)
+                    result.update([(lemma, qid, uttid)])
+
+    else:
+        allmatches = allresults.allmatches
+        for el in allmatches:
+            (qid, uttid) = el
+            if cond(qid):
+                for amatch in allmatches[el]:
+                    # theword = normalizedword(amatch[0])
+                    theword = getattval(amatch[0], 'lemma')
+                    result.update([(theword, uttid)])
+    return result
+
+
+def getposfromqid(qid):
+    if qid == nounqid:
+        pos = 'n'
+    elif qid == lexqid:
+        pos = 'ww'
+    else:
+        pos = None
+    return pos
+
+
+def getposlemmas(allresults, posqid):
+    result = Counter()
+    if allresults.annotationinput:
+        for (uttid, position) in allresults.exactresults[posqid]:
+            word = allresults.allutts[uttid][position - 1]
+            pos = getposfromqid(posqid)
+            lemma = bgetlemma(word, pos)
+            result.update([(lemma, uttid)])
+    else:
+        allmatches = allresults.allmatches
+        for el in allmatches:
+            (qid, uttid) = el
+            if qid == posqid:
+                for amatch in allmatches[el]:
+                    # theword = normalizedword(amatch[0])
+                    theword = getattval(amatch[0], 'lemma')
+                    result.update([(theword, uttid)])
+    return result
+
+
+def bgetlemma(word, pos=None):
+    if pos is None:
+        wordinfos = getwordinfo(word)
+        if wordinfos == []:
+            lemma = word
+        else:
+            filteredwordinfos = [wi for wi in wordinfos if wi[3] not in excluded_lemmas]
+            if filteredwordinfos == []:
+                lemma = wordinfos[0][3]
+            else:
+                lemma = filteredwordinfos[0][3]
+    else:
+        wordinfos = getwordposinfo(word, pos)
+        if wordinfos == []:
+            lemma = word
+        else:
+            filteredwordinfos = [wi for wi in wordinfos if wi[3] not in excluded_lemmas]
+            if filteredwordinfos == []:
+                lemma = wordinfos[0][3]
+            else:
+                lemma = filteredwordinfos[0][3]
+    return lemma
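The lemma lookup that backs getposlemmas and getalllemmas above (bgetlemma) falls back to the word itself when the lexicon has no entry, and otherwise prefers the first entry whose lemma is not in excluded_lemmas. A minimal, self-contained sketch of that selection rule, with a hypothetical stub standing in for lexicon.getwordinfo (the 4-tuple shape with the lemma at index 3 is an assumption read off the wi[3] accesses above):

    excluded_lemmas = ['gevallen', 'gewinnen']

    def fake_getwordinfo(word):
        # Hypothetical stand-in for lexicon.getwordinfo; assumed to return
        # 4-tuples whose last element is the lemma.
        fakelexicon = {
            'gewonnen': [('ww', '', 'vd', 'gewinnen'), ('ww', '', 'vd', 'winnen')],
        }
        return fakelexicon.get(word, [])

    def bgetlemma_sketch(word):
        wordinfos = fake_getwordinfo(word)
        if not wordinfos:
            return word  # unknown word: keep the surface form as its own lemma
        filtered = [wi for wi in wordinfos if wi[3] not in excluded_lemmas]
        # if every candidate lemma is excluded, fall back to the first candidate
        return filtered[0][3] if filtered else wordinfos[0][3]

    assert bgetlemma_sketch('gewonnen') == 'winnen'  # 'gewinnen' is filtered out
    assert bgetlemma_sketch('tafel') == 'tafel'      # not in the fake lexicon

The same rule is applied per part of speech via getwordposinfo when getposfromqid can map the query id to a pos tag.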
diff --git a/SAFreader.py b/SAFreader.py
index ed8bb25..fdd8c36 100644
--- a/SAFreader.py
+++ b/SAFreader.py
@@ -5,24 +5,25 @@
 and the function read_annotations() to obtain a score dictionary with queryid
 as keys and Counter() as values
 '''
 
-#todo
-#-additional columns unaligned treatment and generalisation
-#-code alternatives and replacemtne extensions
-#=codes written without spaces?
-
-import os
-import re
-from collections import Counter, defaultdict
+# todo
+# -additional columns unaligned treatment and generalisation
+# -code alternatives and replacement extensions
+# =codes written without spaces?
 
 import xlrd
-
+from collections import defaultdict
+from collections import Counter
+import re
+import os
 from config import SDLOGGER
-from readmethod import itemseppattern, read_method
+#import logging
+from readmethod import read_method, itemseppattern
 
 varitem = ''
 txtext = ".txt"
 comma = ","
+space = ' '
 tsvext = '.tsv'
 commaspace = ', '
 tab = '\t'
@@ -69,9 +70,9 @@ def getlabels(labelstr, patterns):
         results = []
         ms = pattern.finditer(labelstr)
         logstr = str([m.group(0) for m in ms if m.group(0) not in ' ;,-'])
-        #print('Cannot interpret {}; found items: {}'.format(labelstr,logstr), file=sys.stderr)
-        logging.warning('Cannot interpret %s; found items: %s', labelstr, logstr)
-        #exit(-1)
+        # print('Cannot interpret {}; found items: {}'.format(labelstr,logstr), file=sys.stderr)
+        SDLOGGER.warning('Cannot interpret %s; found items: %s', labelstr, logstr)
+        # exit(-1)
     return results
@@ -136,8 +137,8 @@ def oldget_annotations(infilename, patterns):
     headers = {}
     lastrow = sheet.nrows
     lastcol = sheet.ncols
-# firstwordcol = 2
-# lastwordcol = lastcol - 4
+    # firstwordcol = 2
+    # lastwordcol = lastcol - 4
     levelcol = 1
     uttidcol = 0
     stagescol = -1
@@ -182,7 +183,7 @@
                 thelabelstr = sheet.cell_value(rowctr, colctr)
                 thelevel = sheet.cell_value(rowctr, levelcol)
                 if lastwordcol + 1 <= colctr < sheet.ncols:
-                    #prefix = headers[colctr] aangepast om het simpeler te houden
+                    # prefix = headers[colctr] aangepast om het simpeler te houden
                     prefix = ""
                 else:
                     prefix = ""
@@ -194,7 +195,7 @@
                 for (cleanlevel, cleanlabel) in cleanlevelsandlabels:
                     thedata[(cleanlevel, cleanlabel)].append(uttid)
                     exactdata[(cleanlevel, cleanlabel)].append((uttid, tokenposition))
-    #wb.close() there is no way to close the workbook
+    # wb.close() there is no way to close the workbook
     for atuple in thedata:
         cdata[atuple] = Counter(thedata[atuple])
     return cdata
@@ -211,6 +212,8 @@ def get_annotations(infilename, patterns):
     thedata = defaultdict(list)
     cdata = {}
 
+    allutts = {}
+
     # To open Workbook
     wb = xlrd.open_workbook(infilename)
     sheet = wb.sheet_by_index(0)
@@ -221,8 +224,8 @@
     headers = {}
     lastrow = sheet.nrows
     lastcol = sheet.ncols
-# firstwordcol = 2
-# lastwordcol = lastcol - 4
+    # firstwordcol = 2
+    # lastwordcol = lastcol - 4
     levelcol = 1
     uttidcol = 0
     stagescol = -1
@@ -230,6 +233,8 @@
 
     uttlevel = 'utt'
 
+    uttcount = 0
+
     for rowctr in range(startrow, lastrow):
         if rowctr == headerrow:
            for colctr in range(startcol, lastcol):
@@ -254,14 +259,21 @@
             thelevel = sheet.cell_value(rowctr, levelcol)
             thelevel = clean(thelevel)
             all_levels.add(thelevel)
+            # if thelevel == uttlevel:
+            #     uttcount += 1
+            curuttwlist = []
             for colctr in range(firstwordcol, sheet.ncols):
-                if thelevel in literallevels and colctr != stagescol and colctr != commentscol:
+                if thelevel == uttlevel:
+                    curcellval = sheet.cell_value(rowctr, colctr)
+                    if curcellval != '':
+                        curuttwlist.append(curcellval)
+                elif thelevel in literallevels and colctr != stagescol and colctr != commentscol:
                     thelabel = sheet.cell_value(rowctr, colctr)
                    if 
colctr > lastwordcol: tokenposition = 0 else: tokenposition = colctr - firstwordcol + 1 - #thedata[(thelevel, thelabel)].append(uttid) + # thedata[(thelevel, thelabel)].append(uttid) cleanlevel = thelevel cleanlabel = thelabel if cleanlabel != '': @@ -270,7 +282,7 @@ def get_annotations(infilename, patterns): thelabelstr = sheet.cell_value(rowctr, colctr) thelevel = sheet.cell_value(rowctr, levelcol) if lastwordcol + 1 <= colctr < sheet.ncols: - #prefix = headers[colctr] aangepast om het simpeler te houden + # prefix = headers[colctr] aangepast om het simpeler te houden prefix = "" else: prefix = "" @@ -281,8 +293,10 @@ def get_annotations(infilename, patterns): tokenposition = colctr - firstwordcol + 1 for (cleanlevel, cleanlabel) in cleanlevelsandlabels: thedata[(cleanlevel, cleanlabel)].append((uttid, tokenposition)) - #wb.close() there is no way to close the workbook - return thedata + if curuttwlist != []: + allutts[uttid] = curuttwlist + # wb.close() there is no way to close the workbook + return allutts, thedata def update(thedict, qid, goldtuple): @@ -322,7 +336,7 @@ def mkpatterns(allcodes): adaptedcodes = [codeadapt(c) for c in sortedallcodes] basepattern = r'' + '|'.join(adaptedcodes) + '|' + itemseppattern fullpattern = r'^(' + basepattern + r')*$' - return(re.compile(basepattern), re.compile(fullpattern)) + return (re.compile(basepattern), re.compile(fullpattern)) def get_golddata(filename, mapping, altcodes, queries, includeimplies=False): @@ -333,7 +347,7 @@ def get_golddata(filename, mapping, altcodes, queries, includeimplies=False): allaltcodesitems = [item for (item, _) in altcodes] allitems = allmappingitems + allaltcodesitems patterns = mkpatterns(allitems) - basicdata = get_annotations(filename, patterns) + allutts, basicdata = get_annotations(filename, patterns) results = {} for thelevel, theitem in basicdata: thecounter = basicdata[(thelevel, theitem)] @@ -352,59 +366,70 @@ def get_golddata(filename, mapping, altcodes, queries, includeimplies=False): impliedqid = mapping[(implieditem, thelevel)] update(results, impliedqid, (altlevel, altitem, thecounter)) else: - logging.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) + SDLOGGER.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) elif (theitem, thelevel) in altcodes: (altitem, altlevel) = altcodes[(theitem, thelevel)] qid = mapping[(altitem, altlevel)] update(results, qid, (altlevel, altitem, thecounter)) - logging.info('{} of level {} invalid code replaced by {} of level {}'.format(theitem, thelevel, altitem, altlevel)) + SDLOGGER.info( + '{} of level {} invalid code replaced by {} of level {}'.format(theitem, thelevel, altitem, altlevel)) if includeimplies: for implieditem in queries[qid].implies: if (implieditem, thecorrectlevel) in mapping: impliedqid = mapping[(implieditem, thelevel)] update(results, impliedqid, (altlevel, altitem, thecounter)) else: - logging.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) + SDLOGGER.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) elif theitem in mappingitem2levelmap: thecorrectlevels = mappingitem2levelmap[theitem] if len(thecorrectlevels) == 1: thecorrectlevel = thecorrectlevels[0] qid = mapping[(theitem, thecorrectlevel)] update(results, qid, (thecorrectlevel, theitem, thecounter)) - logging.info('level {} of item {} replaced by correct level {}'.format(thelevel, theitem, thecorrectlevel)) + SDLOGGER.info( + 'level {} of 
item {} replaced by correct level {}'.format(thelevel, theitem, thecorrectlevel)) elif len(thecorrectlevels) > 1: - logging.error('Item {} of level {} not a valid coding (wrong level, multiple candidate levels: {}'.format(theitem, thelevel, str(thecorrectlevels))) + SDLOGGER.error( + 'Item {} of level {} not a valid coding (wrong level, multiple candidate levels: {}'.format(theitem, + thelevel, + str( + thecorrectlevels))) else: - logging.error('{} of level {} not a valid coding (wrong level'.format(theitem, thelevel)) + SDLOGGER.error('{} of level {} not a valid coding (wrong level'.format(theitem, thelevel)) if includeimplies: for implieditem in queries[qid].implies: if (implieditem, thecorrectlevel) in mapping: impliedqid = mapping[(implieditem, thecorrectlevel)] update(results, impliedqid, (thecorrectlevel, theitem, thecounter)) else: - logging.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) + SDLOGGER.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel)) elif theitem in altcodesitem2levelmap: thecorrectlevels = altcodesitem2levelmap[theitem] if len(thecorrectlevels) == 1: (thecorrectitem, thecorrectlevel) = altcodes[(theitem, thecorrectlevels[0])] qid = mapping[(thecorrectitem, thecorrectlevel)] update(results, qid, (thecorrectlevel, thecorrectitem, thecounter)) - logging.info('level {} of item {} replaced by correct level {} and item {}'.format(thelevel, theitem, thecorrectlevel, thecorrectitem)) + SDLOGGER.info('level {} of item {} replaced by correct level {} and item {}'.format(thelevel, theitem, + thecorrectlevel, + thecorrectitem)) elif len(thecorrectlevels) > 1: - logging.error('Item {} of level {} not a valid coding (item replaced by {}, wrong level, multiple candidate levels: {}'.format(theitem. 
thelevel, thecorrectitem, thecorrectlevels))
+                SDLOGGER.error(
+                    'Item {} of level {} not a valid coding (item replaced by {}, wrong level, multiple candidate levels: {}'.format(
+                        theitem, thelevel, thecorrectitem, thecorrectlevels))
             else:
-                logging.error('{} of level {} not a valid coding (alternative item, wrong level)'.format(theitem, thelevel))
+                SDLOGGER.error(
+                    '{} of level {} not a valid coding (alternative item, wrong level)'.format(theitem, thelevel))
             if includeimplies:
                 for implieditem in queries[qid].implies:
                     if (implieditem, thecorrectlevel) in mapping:
                         impliedqid = mapping[(implieditem, thecorrectlevel)]
                         update(results, impliedqid, (thecorrectlevel, theitem, thecounter))
                     else:
-                        logging.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel))
+                        SDLOGGER.error('Implied Item ({},{}) not found in mapping'.format(implieditem, thecorrectlevel))
     else:
-        logging.error('{} of level {} not a valid coding'.format(theitem, thelevel))
-    return results
+        SDLOGGER.error('{} of level {} not a valid coding'.format(theitem, thelevel))
+    return allutts, results
 
 
 def exact2global(thedata):
@@ -462,12 +487,12 @@ def read_annotations(methodfilename, annotationfilename, includeimplies=False):
 
 if __name__ == "__main__":
     # Give the location of the input file
-    #infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AurisdataAligned Current.xlsx"
-    #infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AurisdataAligned TagsCleaned Current.xlsx"
-    #infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\SchlichtingVoorbeeldGoldCurrent.xlsx"
+    # infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AurisdataAligned Current.xlsx"
+    # infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AurisdataAligned TagsCleaned Current.xlsx"
+    # infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\SchlichtingVoorbeeldGoldCurrent.xlsx"
     infilename = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\aangeleverde data\ASTA\SASTA sample 01.xlsx"
 
-    #Give the location of the method file
+    # Give the location of the method file
     methodfilename = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\ASTA\ASTA Index Current.xlsx'
 
     thedata = {}
diff --git a/allresults.py b/allresults.py
index ce3e0ab..cfe6f55 100644
--- a/allresults.py
+++ b/allresults.py
@@ -1,13 +1,15 @@
 class AllResults:
-    def __init__(self, uttcount, coreresults, postresults, allmatches, filename, analysedtrees):
+    def __init__(self, uttcount, coreresults, exactresults, postresults, allmatches, filename, analysedtrees, allutts, annotationinput=False):
         self.uttcount = uttcount
         self.coreresults = coreresults
+        self.exactresults = exactresults
         self.postresults = postresults
         self.allmatches = allmatches
         self.filename = filename
         self.analysedtrees = analysedtrees
-
+        self.allutts = allutts
+        self.annotationinput = annotationinput
 
 def scores2counts(scores):
     '''
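The widened AllResults constructor is what lets the ASTA post-functions switch on annotationinput: exactresults carries the (uttid, position) hits per query and allutts the raw token lists, so lemmas can be recovered without parse trees. A minimal sketch of building one for annotation-file input (all values here are illustrative, not taken from a real run):

    from collections import Counter

    from allresults import AllResults

    allutts = {1: ['ik', 'wil', 'appel']}    # uttid -> token list
    coreresults = {'A021': Counter({1: 1})}  # qid -> per-utterance counts
    exactresults = {'A021': [(1, 3)]}        # qid -> (uttid, 1-based position)

    allresults = AllResults(uttcount=1,
                            coreresults=coreresults,
                            exactresults=exactresults,
                            postresults={},
                            allmatches={},
                            filename='sample01.xlsx',  # illustrative name
                            analysedtrees=[],
                            allutts=allutts,
                            annotationinput=True)      # consumers skip the trees

With annotationinput=True, getposlemmas reads allutts[1][3 - 1] == 'appel' for the A021 hit instead of querying a syntax tree.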
diff --git a/alpinoparsing.py b/alpinoparsing.py
index 96b5417..34e65de 100644
--- a/alpinoparsing.py
+++ b/alpinoparsing.py
@@ -5,7 +5,8 @@
 from lxml import etree
 
 from memoize import memoize
 
-from config import SDLOGGER
+import logging
+#from config import SDLOGGER
 
 alpino_special_symbols_pattern = r'[\[\]]'
 alpino_special_symbols_re = re.compile(alpino_special_symbols_pattern)
@@ -35,10 +36,10 @@ def parse(origsent, escape=True):
     try:
         r1 = urllib.request.urlopen(fullurl)
     except urllib.request.HTTPError as e:
-        SDLOGGER.error('{}: parsing <{}> failed'.format(e, sent))
+        logging.error('{}: parsing <{}> failed'.format(e, sent))
         return None
     except urllib.error.URLError as e:
-        SDLOGGER.error('{}: parsing <{}> failed'.format(e, sent))
+        logging.error('{}: parsing <{}> failed'.format(e, sent))
         return None
     else:
         if 300 > r1.status >= 200:
@@ -47,7 +48,7 @@ def parse(origsent, escape=True):
             stree = etree.fromstring(streebytes)
             return stree
         else:
-            SDLOGGER.error('parsing failed:', r1.status, r1.reason, sent)
+            logging.error('parsing failed: %s %s %s', r1.status, r1.reason, sent)
             return None
diff --git a/astaforms.py b/astaforms.py
index 51bf4ed..5c9b057 100644
--- a/astaforms.py
+++ b/astaforms.py
@@ -4,13 +4,15 @@
 
 import xlsxwriter
 
-from ASTApostfunctions import wordcountperutt
+from ASTApostfunctions import wordcountperutt, nounlemmaqid, verblemmaqid
 from treebankfunctions import getattval
 
 green = '#00FF00'
 red = '#FF0000'
 orange = '#FFBB9A'
 grey = '#B0B0B0'
+
+
 # green = 'green'
 # green = '#006100'
 # red = '#9C0006'
@@ -80,7 +82,6 @@ def applytemplate3(sheet, colchar):
          ['Finietheidsindex', 0.99, 0.03],
          ['Aantal bijzinnen', 4.8, 2.78]]
 
-
 scores = [['', 'Score', 'SD'],
          ['Aantal zelfstandige naamwoorden', "='ZNW & WW'!B1", "=(B2-Tabel!B3)/Tabel!C3"],
          ['TTR zelfstandige naamwoorden', applytemplate3('ZNW & WW', 'B'), applytemplate1(3)],
@@ -107,9 +108,11 @@
          ['Correct', '=COUNTIF(C6:C105,"J")', '', '', '', '']
          ]
 
-sheet2header = ['Nummer', 'Zelfstandig naamwoord', 'Herhaling', 'Aantal', '', 'Lexicaal werkwoord', 'Herhaling', 'Aantal']
+sheet2header = ['Nummer', 'Zelfstandig naamwoord', 'Herhaling', 'Aantal', '', 'Lexicaal werkwoord', 'Herhaling',
+                'Aantal']
 sheet2colwidths = [10, 25, 20, 10, 5, 25, 20, 10]
-sheet3header = ['Uitingsnummer', 'Aantal woorden', 'Correct', "Goede PV's", "Foute en ontbrekende PV's", "Aantal bijzinnen", "Bijzonderheden"]
+sheet3header = ['Uitingsnummer', 'Aantal woorden', 'Correct', "Goede PV's", "Foute en ontbrekende PV's",
+                "Aantal bijzinnen", "Bijzonderheden"]
 
 
 def writetable(tabel, ws, startrow=0, startcol=0, rhformat=None, chformat=None, cellformat=None):
@@ -294,7 +297,7 @@ def resultdict2table(resultdict):
         okpvs = max(0, allpvs - foutepvs)
         bijzincount = dictget(uttid_dict, 'bijzincount')
         remarks = dictget(uttid_dict, 'remarks')
-        paddeduttid = uttid.rjust(3, '0')
+        paddeduttid = str(uttid).rjust(3, '0')
         newrow = [paddeduttid, wc, correct, okpvs, foutepvs, bijzincount, remarks]
         table.append(newrow)
     sortedtable = sorted(table, key=lambda row: row[0])
@@ -324,18 +327,22 @@ def astaform(allresults, _, in_memory=False):
     noundict = defaultdict(int)
     verbdict = defaultdict(int)
     allmatches = allresults.allmatches
-    for el in allmatches:
-        (qid, uttid) = el
-        if qid == 'A021':
-            for amatch in allmatches[el]:
-                # theword = normalizedword(amatch[0])
-                theword = getattval(amatch[0], 'lemma')
-                noundict[theword] += 1
-        if qid == 'A018':
-            for amatch in allmatches[el]:
-                # theword = normalizedword(amatch[0])
-                theword = getattval(amatch[0], 'lemma')
-                verbdict[theword] += 1
+    # for el in allmatches:
+    #     (qid, uttid) = el
+    #     if qid == 'A021':
+    #         for amatch in allmatches[el]:
+    #             # theword = normalizedword(amatch[0])
+    #             theword = getattval(amatch[0], 'lemma')
+    #             noundict[theword] += 1
+    #     if qid == 'A018':
+    #         for amatch in allmatches[el]:
+    #             # theword = normalizedword(amatch[0])
+    #             theword = getattval(amatch[0], 'lemma')
+    #             verbdict[theword] += 1
+    for (lemma, uttid) in allresults.postresults[nounlemmaqid]:
+        noundict[lemma] += 1
+    for (lemma, uttid) 
in allresults.postresults[verblemmaqid]: + verbdict[lemma] += 1 vardict = getvardict(allresults) uttlist = getuttlist(allresults) astadata = AstaFormData(noundict, verbdict, vardict, uttlist) diff --git a/basicreplacements.py b/basicreplacements.py index 77e504f..f369702 100644 --- a/basicreplacements.py +++ b/basicreplacements.py @@ -1,7 +1,6 @@ from collections import defaultdict - +from metadata import bpl_word, bpl_node from deregularise import correctinflection -from metadata import bpl_word pron = 'Pronunciation' orth = 'Orthography' @@ -21,7 +20,9 @@ zdev = 'Devoicing of /z/' wrongpron = 'Wrong Prunciation' phonrepl = '/{wrong}/ instead of /{correct}/' - +wronginfl = 'Incorrect inflection' +morph = 'Morphology' +overgen = 'Overgeneralisation' Rvzlist = ['aan', 'achter', 'achteraan', 'achterin', 'achterop', 'af', 'beneden', 'benevens', 'bij', 'binnen', 'binnenuit', 'boven', 'bovenaan', 'bovenin', 'bovenop', 'buiten', 'dichtbij', 'door', 'doorheen', 'heen', @@ -33,7 +34,6 @@ ervzvariants = [('der' + vz, 'er' + vz, pron, varpron, d_er) for vz in Rvzlist] + \ [("d'r" + vz, 'er' + vz, pron, varpron, d_er) for vz in Rvzlist] - basicreplacementlist = [('as', 'als', pron, infpron, codared), ('isse', 'is', pron, infpron, addschwa), ('ooke', 'ook', pron, infpron, addschwa), ('t', "'t", orth, spellerr, apomiss), ('effjes', 'eventjes', pron, infpron, varpron), @@ -47,7 +47,13 @@ ('da', 'dat', pron, infpron, codared), ('si', 'zit', pron, infpron, codared), # and zdev ('ieduleen', 'iedereen', pron, wrongpron, phonrepl.format(wrong='l', correct='r')), - ('allemaaw', 'allemaal', pron, wrongpron, phonrepl.format(wrong='w', correct='l')) + ('allemaaw', 'allemaal', pron, wrongpron, phonrepl.format(wrong='w', correct='l')), + ('amaal', 'allemaal', pron, infpron, varpron), + ('wiw', 'wil', pron, wrongpron, phonrepl.format(wrong='w', correct='l')), + ('annug', 'ander', pron, wrongpron, phonrepl.format(wrong='nug', correct='der')), + ('nohug', 'nodig', pron, wrongpron, phonrepl.format(wrong='hu', correct='di')), + ('magge', 'mogen', morph, wronginfl, '{} & {}'.format(overgen, infpron)), + ('maggen', 'mogen', morph, wronginfl, overgen) ] + ervzvariants # ('inne', 'in', pron, infpron, addschwa) # put off because it b;ock inne -> in de @@ -70,7 +76,6 @@ for w1, w2, c, n, v in basicexpansionlist: basicexpansions[w1].append((w2, c, n, v)) - knownreplacements = [ ('ze', "z'n", pron, infpron, fndrop, bpl_word), ('desu', 'deze', pron, infpron, zdev, bpl_word), @@ -78,7 +83,6 @@ ] - knownreplacementsdict = {(repl[0], repl[1]): repl for repl in knownreplacements} diff --git a/corrector.py b/corrector.py index 2e561a4..d5c78ba 100644 --- a/corrector.py +++ b/corrector.py @@ -25,17 +25,10 @@ getfilledpauses, getprefixwords, getrepeatedtokens, getunwantedtokens, nodesfindjaneenou) from deregularise import correctinflection -from find_ngram import findmatches, ngram1 from iedims import getjeforms -from lexicon import de, dets, getwordinfo, het, informlexicon, known_word +from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart from macros import expandmacros -# from alternative import Alternative, Replacement, Metadata, Meta -from metadata import (Meta, bpl_indeze, bpl_node, bpl_none, bpl_word, - bpl_wordlemma, defaultbackplacement, defaultpenalty, - filled_pause, fstoken, intj, janeenou, longrep, - mkSASTAMeta, repeated, repeatedjaneenou, - repeatedseqtoken, shortrep, substringrep, unknownsymbol) -from namepartlexicon import isa_namepart +# from namepartlexicon import 
namepart_isa_namepart
 from sastatok import sasta_tokenize
 from sastatoken import Token, tokenlist2stringlist
 from stringfunctions import (chatxxxcodes, consonants, deduplicate,
@@ -44,6 +37,15 @@
 from sva import getsvacorrections
 from tokenmd import TokenListMD, TokenMD, mdlist2listmd
 from treebankfunctions import find1, getattval, getnodeyield
+from lxml import etree
+import sys
+# from alternative import Alternative, Replacement, Metadata, Meta
+from metadata import Meta, defaultbackplacement, defaultpenalty, bpl_node, bpl_none, bpl_word, bpl_indeze, \
+    bpl_wordlemma, mkSASTAMeta, janeenou, shortrep, longrep, repeatedseqtoken, intj, unknownword, unknownsymbol, \
+    filled_pause, repeatedjaneenou, repeated, substringrep, fstoken, falsestart
+from alpinoparsing import parse, escape_alpino_input
+from expandquery import expandmacros
+from find_ngram import findmatches, ngram1, ngram2, ngram7, ngram10, ngram11, ngram16, ngram17
 
 SASTA = 'SASTA'
 
@@ -138,10 +140,9 @@ def ngramreduction(reducedtokens, token2nodemap, allremovetokens, allremoveposit
 
 
 def reduce(tokens, tree):
-
     if tree is None:
         SDLOGGER.error('No tree for :{}\nNo reduction applied'.format(tokens))
-        return((tokens, []))
+        return ((tokens, []))
 
     tokennodes = tree.xpath('.//node[@pt or @pos]')
     tokennodesdict = {int(getattval(n, 'begin')): n for n in tokennodes}
@@ -197,7 +198,8 @@ def reduce(tokens, tree):
 
     # remove ja nee nou
     janeenounodes = nodesfindjaneenou(reducednodes)
-    janeenoutokens = [tok for tok in reducedtokens if keycheck(tok.pos, token2nodemap) and token2nodemap[tok.pos] in janeenounodes]
+    janeenoutokens = [tok for tok in reducedtokens if
+                      keycheck(tok.pos, token2nodemap) and token2nodemap[tok.pos] in janeenounodes]
     janeenoupositions = [token.pos for token in janeenoutokens]
     allremovetokens += janeenoutokens
     allremovepositions += janeenoupositions
@@ -206,25 +208,36 @@ def reduce(tokens, tree):
     allmetadata += metadata
 
     # short repetitions
-    def oldcond(x, y): return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) < .5 and not informlexicon(cleanwordofnort(x))
-    def cond(x, y): return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) < .5  # check on lexicon put off actually two variants should be tried if the word is an existin gword
+    def oldcond(x, y):
+        return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) < .5 and not informlexicon(cleanwordofnort(x))
+
+    def cond(x, y):
+        return len(cleanwordofnort(x)) / len(cleanwordofnort(
+            y)) < .5  # check on lexicon put off; actually two variants should be tried if the word is an existing word
+
    shortprefixtokens = getprefixwords(reducedtokens, cond)
    shortprefixpositions = [token.pos for token in shortprefixtokens]
    repeatedtokens = getrepeatedtokens(reducedtokens, shortprefixtokens)
    allremovetokens += shortprefixtokens
    allremovepositions += shortprefixpositions
-    metadata = [mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', shortrep, 'Tokenisation', subcat=repetition) for token in reducedtokens if token in repeatedtokens]
+    metadata = [
+        mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', shortrep, 'Tokenisation', subcat=repetition) for
+        token in reducedtokens if token in repeatedtokens]
    allmetadata += metadata
    reducedtokens = [tok for tok in reducedtokens if tok not in shortprefixtokens]

    # long repetitions
-    def cond(x, y): return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) >= .5 and not informlexicon(cleanwordofnort(x))
+    def cond(x, y):
+        return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) >= .5 and not informlexicon(cleanwordofnort(x))
+
longprefixtokens = getprefixwords(reducedtokens, cond) longprefixpositions = [token.pos for token in longprefixtokens] repeatedtokens = getrepeatedtokens(reducedtokens, longprefixtokens) allremovetokens += longprefixtokens allremovepositions += longprefixpositions - metadata = [mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', longrep, 'Tokenisation', subcat=repetition) for token in reducedtokens if token in repeatedtokens] + metadata = [ + mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', longrep, 'Tokenisation', subcat=repetition) for + token in reducedtokens if token in repeatedtokens] allmetadata += metadata reducedtokens = [tok for tok in reducedtokens if tok not in longprefixtokens] @@ -246,7 +259,8 @@ def cond(x, y): return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) >= .5 a allremovetokens += dupnodetokens allremovepositions += dupnodepositions metadata = [mkSASTAMeta(token, repeatedtokens[token], 'ExtraGrammatical', - repeated, 'Tokenisation', subcat=repetition) for token in reducedtokens if token in repeatedtokens] + repeated, 'Tokenisation', subcat=repetition) for token in reducedtokens if + token in repeatedtokens] allmetadata += metadata reducedtokens = [tok for tok in reducedtokens if tok not in dupnodetokens] @@ -284,19 +298,42 @@ def cond(x, y): return len(cleanwordofnort(x)) / len(cleanwordofnort(y)) >= .5 a # vnw pv vnw pv - def metaf(falsestarttokens, falsestartpositions, correcttokens): return \ - [Meta('Retracing', 'Retracing with Correction', annotatedposlist=falsestartpositions, - annotatedwordlist=[c.word for c in falsestarttokens], - annotationposlist=[c.pos for c in correcttokens], - annotationwordlist=[c.word for c in correcttokens], cat='Retracing', subcat=None, source=SASTA, - penalty=defaultpenalty, backplacement=bpl_none)] + \ - [mkSASTAMeta(ftoken, ctoken, 'Retracing with Correction', fstoken, 'Retracing') - for ftoken, ctoken in zip(falsestarttokens, correcttokens)] + def metaf(falsestarttokens, falsestartpositions, correcttokens): + return \ + [Meta('Retracing', 'Retracing with Correction', annotatedposlist=falsestartpositions, + annotatedwordlist=[c.word for c in falsestarttokens], + annotationposlist=[c.pos for c in correcttokens], + annotationwordlist=[c.word for c in correcttokens], cat='Retracing', subcat=None, source=SASTA, + penalty=defaultpenalty, backplacement=bpl_none)] + \ + [mkSASTAMeta(ftoken, ctoken, 'Retracing with Correction', fstoken, 'Retracing') + for ftoken, ctoken in zip(falsestarttokens, correcttokens)] vnwpvvnwpvcor = Ngramcorrection(ngram1, (0, 2), (2, 4), metaf) reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, allremovepositions, allmetadata, vnwpvvnwpvcor) + vzdetvzdetcor = Ngramcorrection(ngram2, (0, 2), (2, 4), metaf) + reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, vzdetvzdetcor) + + vgdetvgdetcor = Ngramcorrection(ngram7, (0, 2), (2, 4), metaf) + reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, vgdetvgdetcor) + vnwipvjxpvjvnwi = Ngramcorrection(ngram10, (0, 2), (3, 5), metaf) + reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, vnwipvjxpvjvnwi) + lemilemjlemilemj = Ngramcorrection(ngram11, (0, 2), (3, 5), metaf) + reducedtokens, allremovetokens, allmetadata = 
ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, lemilemjlemilemj) + + dinjdknj = Ngramcorrection(ngram16, (0, 2), (3, 5), metaf) + reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, dinjdknj) + + tevtev = Ngramcorrection(ngram17, (0, 2), (2, 4), metaf) + reducedtokens, allremovetokens, allmetadata = ngramreduction(reducedtokens, token2nodemap, allremovetokens, + allremovepositions, allmetadata, tevtev) + # reducedleaves = [token2nodemap[tok.pos] for tok in reducedtokens] # # vnwpvvnwpvmatches = findmatches(ngram1, reducedleaves) @@ -377,7 +414,6 @@ def getcorrection(utt, tree=None, interactive=False): def getcorrections(utt, method, tree=None, interactive=False): - origutt = utt allmetadata = [] rawtokens = sasta_tokenize(utt) @@ -418,7 +454,6 @@ def getcorrections(utt, method, tree=None, interactive=False): # def getalternatives(origtokensmd, method, llremovedtokens, tree, uttid): def getalternatives(origtokensmd, method, tree, uttid): - tokensmd = explanationasreplacement(origtokensmd, tree) if tokensmd is None: tokensmd = origtokensmd @@ -597,6 +632,7 @@ def lexcheck(intokensmd, allalternativemds): finalalternativemds.append(alternativemd) return finalalternativemds + # moved to metadata # def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalty, backplacement=defaultbackplacement): # result = Meta(name, value, annotatedposlist=[token.pos], @@ -670,7 +706,8 @@ def explanationasreplacement(tokensmd, tree): if known_word(newword): newtokens = tokenreplace(newtokens, newtoken) bpl = bpl_node if known_word(oldword) else bpl_word - meta = mkSASTAMeta(oldtoken, newtoken, name='ExplanationasReplacement', value='ExplanationasReplacement', + meta = mkSASTAMeta(oldtoken, newtoken, name='ExplanationasReplacement', + value='ExplanationasReplacement', cat='Lexical Error', backplacement=bpl_node) newmetadata.append(meta) result = TokenListMD(newtokens, newmetadata) @@ -682,21 +719,20 @@ def explanationasreplacement(tokensmd, tree): def initdevoicing(token, voiceless, voiced, newtokenmds, beginmetadata): - # initial s -> z, f -> v if not known_word(token.word.lower()) or token.word.lower() in specialdevoicingwords: if token.word[0] == voiceless: newword = voiced + token.word[1:] if known_word(newword): newtokenmds = updatenewtokenmds(newtokenmds, token, [newword], beginmetadata, - name='Pronunciation Variant', value='Initial {} devoicing'.format(voiced), + name='Pronunciation Variant', + value='Initial {} devoicing'.format(voiced), cat='Pronunciation', backplacement=bpl_word) return newtokenmds def getalternativetokenmds(tokenmd, method, tokens, tokenctr, tree, uttid): - token = tokenmd.token beginmetadata = tokenmd.metadata newtokenmds = [] @@ -794,7 +830,8 @@ def getalternativetokenmds(tokenmd, method, tokens, tokenctr, tree, uttid): # zenode = find1(tree, zexpath) tokennodes = getnodeyield(tree) zenode = tokennodes[tokenctr] - nexttoken = tokens[tokenctr + 1] # do not take it from the tree because it may have been replaced by something else, e.g. avoid: ze dee -> ze deed -/-> z'n deed! + nexttoken = tokens[ + tokenctr + 1] # do not take it from the tree because it may have been replaced by something else, e.g. avoid: ze dee -> ze deed -/-> z'n deed! 
zerel = getattval(zenode, 'rel') zeparent = zenode.getparent() zeparentcat = getattval(zeparent, 'cat') @@ -809,7 +846,8 @@ def getalternativetokenmds(tokenmd, method, tokens, tokenctr, tree, uttid): # e-> e(n) enexceptions = {'inne'} - if not known_word(token.word) and token.word.lower() not in basicreplacements and token.word.lower() not in enexceptions: + if not known_word( + token.word) and token.word.lower() not in basicreplacements and token.word.lower() not in enexceptions: if endsinschwa(token.word) and not monosyllabic(token.word): newword = token.word + 'n' if known_word(newword): @@ -844,7 +882,6 @@ def getalternativetokenmds(tokenmd, method, tokens, tokenctr, tree, uttid): def getvalidalternativetokenmds(tokenmd, newtokenmds): - validnewtokenmds = [tokenmd for tokenmd in newtokenmds if known_word(tokenmd.token.word)] if validnewtokenmds == []: validnewtokenmds = [tokenmd] @@ -926,7 +963,8 @@ def correctPdit(tokensmd, tree, uttid): for token in tokens: tokennode = next(filter(lambda x: getattval(x, 'begin') == str(tokenctr), tokennodes), None) tokenlemma = getattval(tokennode, 'lemma') - if not token.skip and prevtoken is not None and not prevtoken.skip and tokenlemma in {'dit', 'dat', 'deze', 'die'}: + if not token.skip and prevtoken is not None and not prevtoken.skip and tokenlemma in {'dit', 'dat', 'deze', + 'die'}: tokenrel = getattval(tokennode, 'rel') tokenpt = getattval(tokennode, 'pt') prevtokennode = tokennodes[tokenctr - 1] if tokenctr > 0 else None @@ -935,7 +973,8 @@ def correctPdit(tokensmd, tree, uttid): prevparent = prevtokennode.getparent() prevparentrel, prevparentcat = getattval(prevparent, 'rel'), getattval(prevparent, 'cat') indezemwp = getindezemwp(prevtokennode, tokennode) - if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'obj1', 'det'} and tokenpt == 'vnw') or \ + if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'obj1', + 'det'} and tokenpt == 'vnw') or \ indezemwp: newtoken = Token('hem', tokenctr) bpl = bpl_indeze if indezemwp else bpl_node diff --git a/external_functions.py b/external_functions.py index dc09efa..23fabea 100644 --- a/external_functions.py +++ b/external_functions.py @@ -1,31 +1,30 @@ import re - -from asta_queries import asta_bijzin, asta_delpv, asta_lex, asta_noun -from astaforms import astaform -from ASTApostfunctions import (KMcount, countwordsandcutoff, finietheidsindex, - getalllemmas, getlemmas, wordcountperutt) from compounds import getcompounds -from dedup import correct, mlux, neologisme, onvolledig, samplesize -from imperatives import wond4, wond5plus, wondx, wx, wxy, wxyz, wxyz5 -from methods import allok -from queryfunctions import VzN, xneg_neg, xneg_x -from STAPpostfunctions import GL5LVU, GLVU, BB_totaal from Sziplus import sziplus6, vr5plus -from tarspform import mktarspform -from TARSPpostfunctions import (gofase, gtotaal, pf, pf2, pf3, pf4, pf5, pf6, - pf7, vutotaal) -from TARSPscreening import tarsp_screening from xenx import xenx +from imperatives import wx, wxy, wxyz, wxyz5, wondx, wond4, wond5plus +from TARSPscreening import tarsp_screening +from TARSPpostfunctions import vutotaal, gofase, gtotaal, pf2, pf3, pf4, pf5, pf6, pf7, pf +from queryfunctions import xneg_x, xneg_neg, VzN +from dedup import mlux, samplesize, neologisme, onvolledig, correct +from STAPpostfunctions import BB_totaal, GLVU, GL5LVU +from ASTApostfunctions import wordcountperutt, countwordsandcutoff, KMcount, finietheidsindex, getnounlemmas,\ + getlexlemmas, getalllemmas +from astaforms import astaform +from 
tarspform import mktarspform +from stapforms import makestapform +from asta_queries import asta_noun, asta_bijzin, asta_lex, asta_delpv +from methods import allok normalfunctionpattern = r'".format(q, invalidqueries[q], queries[q].query)) -#print the header +# print the header print(resultsheaderstring, file=outfile) outworksheet.write_row(outrowctr, outstartcol, resultsheaderrow) outrowctr += 1 -#print the platinumheader +# print the platinumheader print(platinumheaderstring, file=platinumoutfile) -#print the results +# print the results qcount = 0 invalidqcount = 0 undefinedqcount = 0 results = allresults.coreresults -#exactresults = getexactresults(allmatches) +# exactresults = getexactresults(allmatches) exact = True pcheaders = [['User1', 'User2', 'User3', 'MoreorLess', 'qid', 'cat', 'subcat', 'item', 'uttid', 'pos', 'utt']] @@ -786,7 +799,7 @@ def passfilter(rawexactresults, method): theresults = results[queryid] resultstr = counter2liststr(theresults) if queryid in goldscores: - #(goldlevel, golditem, goldcounter) = goldscores[queryid] + # (goldlevel, golditem, goldcounter) = goldscores[queryid] goldcounter = goldscores[queryid][2] goldcount = sumfreq(goldcounter) sortedgolduttstr = counter2liststr(goldcounter) @@ -804,7 +817,7 @@ def passfilter(rawexactresults, method): qex = 'no' undefinedqcount += 1 if query_exists(thequery) and queryid not in invalidqueries: - #print(queryid, file=logfile) + # print(queryid, file=logfile) if queryid in goldscores: goldcounter = goldscores[queryid][2] else: @@ -851,20 +864,25 @@ def passfilter(rawexactresults, method): queryinforow = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item] queryresultsrow = [str(sumfreq(theresults)), resultstr, str(goldcount), sortedgolduttstr, qex] queryRGscorerow = [sf(recall), sf(precision), sf(f1score), liststargoldstr, goldminusliststr, listminusgoldstr] - queryRPscorerow = [sortedplatinumliststr, sf(platinumrecall), sf(platinumprecision), sf(platinumf1score), platinumminusliststr, listminusplatinumliststr] - queryGPscorerow = [sf(gprecall), sf(gpprecision), sf(gpf1score), goldstarplatinumstr, platinumminusgoldstr, goldminusplatinumstr] + queryRPscorerow = [sortedplatinumliststr, sf(platinumrecall), sf(platinumprecision), sf(platinumf1score), + platinumminusliststr, listminusplatinumliststr] + queryGPscorerow = [sf(gprecall), sf(gpprecision), sf(gpf1score), goldstarplatinumstr, platinumminusgoldstr, + goldminusplatinumstr] fullresultrow = queryinforow + queryresultsrow + queryRGscorerow + queryRPscorerow + queryGPscorerow print(tab.join(fullresultrow), file=outfile) outworksheet.write_row(outrowctr, outstartcol, fullresultrow) outrowctr += 1 - platinumrow = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, platinumoutresultsstring, listminusgoldstr, '', ''] + platinumrow = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, + platinumoutresultsstring, listminusgoldstr, '', ''] print(tab.join(platinumrow), file=platinumoutfile) + # @with an annotationfile allmatches is empty so we need to redefine newrows (exactmismatches) markedutt (getmarkedutt)-done if exact: - newrows = exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile, silverannotationsdict) + newrows = exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, + platinumcheckfile, silverannotationsdict, annotationinput) allrows += newrows else: if theresultsminusgold != {}: @@ -873,7 +891,8 @@ 
def passfilter(rawexactresults, method):
                 if (queryid, uttid) in allmatches:
                     for (m, syntree) in allmatches[(queryid, uttid)]:
                         markedutt = getmarkedutt(m, syntree)
-                        platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, uttid, markedutt]
+                        platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item,
+                                             uttid, markedutt]
                         print(tab.join(platinumcheckrow1), file=platinumcheckfile)
 
     if goldminustheresults != {}:
@@ -883,31 +902,31 @@ def passfilter(rawexactresults, method):
             uttstr = space.join(allutts[uttid])
         else:
             SDLOGGER.warning('uttid {} not in allutts'.format(uttid))
-        platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, uttid, uttstr]
+        platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, uttid,
+                             uttstr]
         print(tab.join(platinumcheckrow2), file=platinumcheckfile)
 
 platinumcheckfullname = platinumcheckfile.name
 (base, ext) = os.path.splitext(platinumcheckfullname)
-#platinumcheckxlfullname = base + '.xlsx'
+# platinumcheckxlfullname = base + '.xlsx'
 wb = mkworkbook(platinumcheckxlfullname, pcheaders, allrows, freeze_panes=(1, 9))
 wb.close()
 
-#compute the gold postresults
+# compute the gold postresults
 goldpostresults = {}
 goldcounters = {}
 allgoldmatches = {}
 for qid in goldscores:
     goldcounters[qid] = goldscores[qid][2]
-allgoldresults = AllResults(uttcount, goldcounters, goldpostresults, allgoldmatches, reffilename, [])
+allgoldresults = AllResults(uttcount, goldcounters, exactgoldscores, goldpostresults, allgoldmatches, reffilename, [],
+                            allannutts, annotationinput)
 
 dopostqueries(allgoldresults, postquerylist, queries)
 
-
 # compute the platinum postresults
 platinumpostresults = {}
 
-
-#print the postresults
+# print the postresults
 thepostresults = allresults.postresults
 for queryid in postquerylist:
     resultposval = str(getpostval(queryid, thepostresults))
@@ -927,30 +946,33 @@ def passfilter(rawexactresults, method):
     outworksheet.write_row(outrowctr, outstartcol, postrow)
     outrowctr += 1
 
-#gather overall results, 2 cases: (1)for defined original measure queries only; (2) for all original measure queries
+# gather overall results, 2 cases: (1) for defined original measure queries only; (2) for all original measure queries
 
-overallmethods = [(1, 'Overall (defined pre and core queries in the profile)', lambda x: is_preorcore(x) and query_exists(x) and query_inform(x)),
-                  (2, 'Overall (all pre and core queries in the profile)', lambda x: is_preorcore(x) and query_inform(x)),
-                  (3, 'Overall (original pre and core measures with defined queries only)', lambda x: is_preorcore(x) and query_exists(x)),
+overallmethods = [(1, 'Overall (defined pre and core queries in the profile)',
+                   lambda x: is_preorcore(x) and query_exists(x) and query_inform(x)),
+                  (2, 'Overall (all pre and core queries in the profile)',
+                   lambda x: is_preorcore(x) and query_inform(x)),
+                  (3, 'Overall (original pre and core measures with defined queries only)',
+                   lambda x: is_preorcore(x) and query_exists(x)),
                  (4, 'Overall (all original pre and core measures)', lambda x: is_preorcore(x))]
 
-logheader = ['datetime', 'treebank', 'scorenr,' 'R', 'P', 'F1', 'P-R', 'P-P', 'P-F1', 'GP-R', 'GP-P', 'GP-F1', 'ref', 'method']
+logheader = ['datetime', 'treebank', 'scorenr', 'R', 'P', 'F1', 'P-R', 'P-P', 'P-F1', 'GP-R', 'GP-P', 'GP-F1', 'ref',
+             'method']
 logname = 'sastalog.txt'
 biglogfile = open(logname, 'a', encoding='utf8')
 exactlynow = datetime.datetime.now()
now 
= exactlynow.replace(microsecond=0).isoformat() - for (ctr, message, queryfunction) in overallmethods: - #gather resultscount + # gather resultscount resultscount = 0 for queryid in results: thequery = queries[queryid] if thequery.original and queryfunction(thequery): resultscount += sum(results[queryid].values()) - #gather goldcount + # gather goldcount goldcount = 0 for queryid in goldscores: thequery = queries[queryid] @@ -958,7 +980,7 @@ def passfilter(rawexactresults, method): if thequery.original and queryfunction(thequery): goldcount += sum(goldcounter.values()) - #gather platinumcount + # gather platinumcount platinumcount = 0 for queryid in platinumresults: if queryid in queries: @@ -968,7 +990,7 @@ def passfilter(rawexactresults, method): else: SDLOGGER.warning('Query {} found in platinumresults but not in queries'.format(queryid)) - #resultsgoldintersectiocount + # resultsgoldintersectiocount resultsgoldintersectioncount = 0 for queryid in results: thequery = queries[queryid] @@ -979,9 +1001,9 @@ def passfilter(rawexactresults, method): resultsgoldintersectioncount += sum(intersection.values()) else: pass - #SDLOGGER.warning('Query {} found in results but not in goldscores'.format(queryid)) + # SDLOGGER.warning('Query {} found in results but not in goldscores'.format(queryid)) - #resultsplatinumintersectioncount + # resultsplatinumintersectioncount resultsplatinumintersectioncount = 0 for queryid in results: thequery = queries[queryid] @@ -991,9 +1013,9 @@ def passfilter(rawexactresults, method): resultsplatinumintersectioncount += sum(intersection.values()) else: pass - #SDLOGGER.warning('queryid {} not in platinumresults'.format(queryid)) + # SDLOGGER.warning('queryid {} not in platinumresults'.format(queryid)) - #goldplatinumintersectioncount + # goldplatinumintersectioncount goldplatinumintersectioncount = 0 for queryid in platinumresults: if queryid in queries: @@ -1005,12 +1027,13 @@ def passfilter(rawexactresults, method): goldplatinumintersectioncount += sum(intersection.values()) else: pass - #SDLOGGER.warning('Query {} in platinumresults but not in goldscores'.format(queryid)) + # SDLOGGER.warning('Query {} in platinumresults but not in goldscores'.format(queryid)) else: SDLOGGER.warning('Query {} in platinumresults but not in queries'.format(queryid)) (recall, precision, f1score) = getevalscores(resultscount, goldcount, resultsgoldintersectioncount) - (platinumrecall, platinumprecision, platinumf1score) = getevalscores(resultscount, platinumcount, resultsplatinumintersectioncount) + (platinumrecall, platinumprecision, platinumf1score) = getevalscores(resultscount, platinumcount, + resultsplatinumintersectioncount) (gprecall, gpprecision, gpf1score) = getevalscores(goldcount, platinumcount, goldplatinumintersectioncount) overallrow = ['', '', '', message, '', '', '', '', '', sf(recall), sf(precision), sf(f1score), @@ -1028,17 +1051,14 @@ def passfilter(rawexactresults, method): print(tab.join(logrow), file=biglogfile) - biglogfile.close() outfile.close() outworkbook.close() platinumoutfile.close() platinumcheckfile.close() - resultscounts = scores2counts(results) - countcomparison = get_comparison(resultscounts, goldcounts, queries) if countcomparison != []: countcomparisonfile = open(countcomparisonfilename, 'w', encoding='utf8') @@ -1046,10 +1066,9 @@ def passfilter(rawexactresults, method): ccheaderstr = tab.join(ccheader) print(ccheaderstr, file=countcomparisonfile) for (q, r, g) in countcomparison: - if not(r == 0 and g == 0): + if not (r == 0 and g == 0): 
print(q, r, g, r - g, sep=tab, file=countcomparisonfile) - definedqcount = qcount - undefinedqcount emptycounter = Counter() @@ -1072,7 +1091,6 @@ def passfilter(rawexactresults, method): else: percentagecompletion2str = 'N/A' - finalmessagetemplate1 = '{} measures, {} undefined, {} defined, of which {} invalid.' finalmessagetemplate2 = '{} measures defined for a non empty gold score out of {} ({}).' finalmessagetemplate3 = '{} measures defined for a non empty gold count out of {} ({}).' diff --git a/sentence_parser.py b/sentence_parser.py index 861079a..e49004f 100644 --- a/sentence_parser.py +++ b/sentence_parser.py @@ -1,3 +1,4 @@ +from functools import lru_cache import socket from contextlib import contextmanager @@ -5,6 +6,8 @@ import config +from alpinoparsing import escape_alpino_input + class AlpinoSentenceParser: ''' Assumes a Alpino server is running on provided host:port, @@ -19,6 +22,7 @@ def connection(self): raise def parse_sentence(self, sentence: str, buffer_size=8096) -> str: + sentence = escape_alpino_input(sentence) with self.connection() as s: sentence += '\n\n' # flag end of file s.sendall(sentence.encode('utf-8')) @@ -31,6 +35,7 @@ def parse_sentence(self, sentence: str, buffer_size=8096) -> str: return xml.decode('utf-8') +@lru_cache(maxsize=128) def parse(sentence): ''' Wrapper for use in sastadev''' alp = AlpinoSentenceParser() diff --git a/stapforms.py b/stapforms.py new file mode 100644 index 0000000..ba2fdf5 --- /dev/null +++ b/stapforms.py @@ -0,0 +1,120 @@ +from io import BytesIO +import os +from shutil import copyfile, copyfileobj +from collections import defaultdict + +from openpyxl import load_workbook +from allresults import AllResults +from config import SD_DIR, SDLOGGER + +scoresheetname = 'STAP 1 - 5' +maxutt = 50 +zerocount = 0 +basexl = os.path.join(SD_DIR, 'form_templates', 'STAP Excel VUmc 2018.xlsx') + +NS = 'S001' +OS = 'S002' +PV = 'S003' +SGG = 'S004' +VT = 'S005' +VD = 'S006' +N = 'S007' +BvBep = 'S008' +zelfvnw3 = 'S009' +BBp = 'S010' +BBt = 'S011' +BBo = 'S012' + +AG = 33 +Ucol = 21 +AF = 32 + +# order in the Excel sheet: NS OS PV SGG VT VD N BvBep zelf. vnw. 3 BB p BB t BB o +# i.e. 
+sorteditemlist = [NS, OS, PV, SGG, VT, VD, N, BvBep, zelfvnw3, BBp, BBt, BBo]
+
+
+def data2rowtuples(data):
+    # data is a dictionary with key item and as value a counter with (uttid, count) items
+    newdata = defaultdict(lambda: defaultdict(int))
+    for item in data:
+        for (uttid, count) in data[item].items():
+            newdata[uttid][item] += count
+
+    rowlist = []
+    uttidlist = [uttid for uttid in newdata]
+    sorteduttidlist = sorted(uttidlist)
+
+    for uttid in sorteduttidlist:
+        row = []
+        for item in sorteditemlist:
+            if item in newdata[uttid]:
+                row.append(newdata[uttid][item])
+            else:
+                row.append(zerocount)
+        rowlist.append((uttid, row))
+
+    return rowlist
+
+
+def makestapform(allresults, _, basexl=basexl, in_memory=False):
+    if not in_memory:
+        # copy the basexl to a new one with the appropriate name
+        (base, ext) = os.path.splitext(allresults.filename)
+        target = base + '_STAP-Form' + '.xlsx'
+
+        copyfile(basexl, target)
+
+        # open the workbook
+        wb = load_workbook(filename=target)
+    else:
+        target = BytesIO()
+        with open(basexl, 'rb') as source:
+            copyfileobj(fsrc=source, fdst=target)
+        wb = load_workbook(target)
+
+    # gather the results
+
+    # put the results in the right order
+    rowlist = data2rowtuples(allresults.coreresults)
+
+    ws = wb[scoresheetname]
+
+    cols = ['U', 'V', 'W', 'X', 'Y', 'Z', 'AA', 'AB', 'AC', 'AD', 'AE', 'AF']
+    # adapt the relevant sheet
+    for (uttid, row) in rowlist:
+        uttidrow = int(uttid) + 3
+        xluttctr = ws.cell(column=AG, row=uttidrow).value
+        uttidrowstr = str(uttidrow)
+        if int(uttid) == xluttctr:
+            for col, el in zip(cols, row):
+                # special proviso for PV in column W
+                if col == 'W':
+                    el = el - 1
+                cellkey = col + uttidrowstr
+                ws[cellkey] = el
+        else:
+            SDLOGGER.error('Unexpected utterance id encountered: {}'.format(uttid))
+
+    # save the workbook
+    wb.save(target)
+    wb.close()
+
+    # return the workbook - not needed
+    return target
+
+
+def test():
+    coreresults = {NS: {'1': 3}, OS: {'1': 2, '2': 6}}
+    postresults = {}
+    allmatches = {}
+    fn = 'STAP42.xml'
+    analysedtrees = {}
+    allresults = AllResults(0, coreresults, {}, postresults, allmatches, fn, analysedtrees, {})
+    fnbase, _ = os.path.splitext(fn)
+    formxl = fnbase + '_form' + '.xlsx'
+    makestapform(allresults, None)
+
+
+if __name__ == '__main__':
+    test()
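makestapform returns its target in both modes: a file path when the workbook is written next to allresults.filename, and a BytesIO buffer when in_memory=True. A short usage sketch for the in-memory route, under the assumption that an allresults for a STAP analysis is already in hand (the output filename is illustrative):

    from stapforms import makestapform

    # The STAP template is copied into a BytesIO buffer, filled in via
    # openpyxl, saved back into the buffer, and returned without touching disk.
    xlbuffer = makestapform(allresults, None, in_memory=True)
    xlbuffer.seek(0)  # rewind before streaming or writing the bytes out

    with open('STAP42_STAP-Form.xlsx', 'wb') as outfile:  # illustrative name
        outfile.write(xlbuffer.read())

This matches the in_memory flag that astaform also takes, so callers can handle both generated forms the same way.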
-    if prefix in hyphenprefixes and mwinlex:  # the word starts wit ha known prefix that uses hyphen such as ex (ex-vrouw)
+    if prefix in hyphenprefixes and mwinlex:  # the word starts with a known prefix that uses a hyphen, such as ex (ex-vrouw)
         result = []
     elif mainword.startswith(prefix) and mwinlex:  # this is the core case e.g. ver-verkoop
         result = [mainword]
-    elif pfinlex and mwinlex:  # for compounds with a hyphen: kat-oorbellen, generaal-majoor and for tennis-baan(?)
+    elif pfinlex and mwinlex:  # for compounds with a hyphen: kat-oorbellen, generaal-majoor and for tennis-baan(?)
         result = []
-    elif mainword.startswith(deduppf) and mwinlex:  # vver-verkoop
+    elif mainword.startswith(deduppf) and mwinlex:  # vver-verkoop
         result = [mainword]
     else:
         result = []
@@ -150,8 +151,8 @@ def dehyphenate(word):
     head = word[0:1]
     tail = word[1:]
     if head == hyphen:
-        #newresult = head + tail
-        #results.append(newresult)
+        # newresult = head + tail
+        # results.append(newresult)
         rightresults = dehyphenate(tail)
         for rightresult in rightresults:
             newresult = head + rightresult
@@ -232,13 +233,42 @@ def aigu(c):
     result = aiguvowels[theindex]
 
 
+def testcondition(condition, word):
+    if condition(word):
+        print('OK:{}'.format(word))
+    else:
+        print('NO:{}'.format(word))
+
+
+def test():
+    monosyllabicwords = ['baai', 'eeuw', 'mooi', 'aap', 'deed', 'Piet', 'noot', 'duut', 'rijd', 'meid', 'rauw', 'koud',
+                         'buit', 'reuk', 'boer', 'la', 'de', 'hik', 'dop', 'dut',
+                         'yell', 'ry', 'Händl', 'Pëtr', 'bït', 'Köln', 'Kür', 'Tÿd']
+    disyllabicwords = ['baaien', 'eeuwen', 'mooie', 'aapje', 'deden', 'Pietje', 'noten', 'dut', 'rijden', 'meiden',
+                       'rauwe', 'koude', 'buitje', 'reuken', 'boeren', 'laden', 'dender',
+                       'hikken', 'doppen', 'dutten', 'yellen', 'ryen', 'Händler', 'Pëtri', 'bïty', 'Kölner', 'Kürer',
+                       'Tÿding', 'naäap', 'meeëten', 'ciën', 'coöp']
+
+    for word in monosyllabicwords:
+        testcondition(monosyllabic, word.lower())
+    for word in disyllabicwords:
+        testcondition(monosyllabic, word.lower())
+
+    for word in monosyllabicwords + disyllabicwords:
+        ms = syllableheadsre.finditer(word)
+        print(word, end=' -- ')
+        for m in ms:
+            print(m.group(0), end=', ')
+        print('')
+
+
 def nono(inval):
     result = (inval is None) or (inval == 0) or (inval == []) or (inval == '')
     return result
 
 
 def nonnull(inval):
-    result = not(nono(inval))
+    result = not (nono(inval))
     return result
@@ -254,3 +284,15 @@
     core = liststr[1:-1]
     parts = core.split(comma)
     return parts
+
+
+def realwordstring(w):
+    if len(w) != 1:
+        result = True
+    else:
+        result = not unicodedata.category(w).startswith('P')
+    return result
+
+
+if __name__ == '__main__':
+    test()
diff --git a/targets.py b/targets.py
index d0116a6..50f1c7d 100644
--- a/targets.py
+++ b/targets.py
@@ -1,4 +1,3 @@
-
 target_intarget, target_xsid, target_all, target_byrole, target_bysyn = 0, 1, 2, 3, 4
 intargetxpath = '//meta[@name="intarget"]'
 xsidxpath = '//meta[@name="xsid"]'
@@ -17,9 +16,9 @@ def get_targets(treebank):
     roles = treebank.xpath(rolevalxpath)
     targetrolesfound = any(map(lambda x: x.lower() in targetroles, roles))
     synannotations = treebank.xpath(synxpath)
-    if synannotations != []:
-        result = target_bysyn
-    elif xsids != []:
+    # if synannotations != []:
+    #     result = target_bysyn
+    if xsids != []:
         result = target_xsid
     elif intargets != []:
         result = target_intarget
diff --git a/treebankfunctions.py b/treebankfunctions.py
index 6917de3..ca5e4b1 100644
--- a/treebankfunctions.py
+++ b/treebankfunctions.py
@@ -2,15 +2,16 @@
 various treebank functions
 '''
+
+import sys
 import re
+import logging
 from copy import copy, deepcopy
-
 from lxml import etree
-
-#from lexicon import informlexiconpos, isa_namepart_uc, informlexicon, isa_namepart
-import lexicon as lex
 from config import SDLOGGER
 from stringfunctions import allconsonants
+# from lexicon import informlexiconpos, isa_namepart_uc, informlexicon, isa_namepart
+import lexicon as lex
 
 
 class Metadata:
@@ -42,8 +43,7 @@ def md2XMLElement(self):
 numberpattern = r'^[\d\.,]+$'
 numberre = re.compile(numberpattern)
 
-
-#next 3 derived from the alpino dtd
+# next 3 derived from the alpino dtd
 allrels = ['hdf', 'hd', 'cmp', 'sup', 'su', 'obj1', 'pobj1', 'obj2', 'se', 'pc', 'vc', 'svp', 'predc', 'ld', 'me',
            'predm', 'obcomp', 'mod', 'body', 'det', 'app', 'whd', 'rhd', 'cnj', 'crd', 'nucl', 'sat', 'tag', 'dp',
            'top', 'mwp', 'dlink', '--']
@@ -78,10 +78,10 @@
                'evenveel', 'geen', 'ieder', 'meer', 'meerdere', 'menig', 'minder', 'minst', 'sommig',
                'teveel', 'tevéél', 'veel', 'weinig', 'één', 'keiveel'}
 
-#uttidquery = "//meta[@name='uttid']/@value"
+# uttidquery = "//meta[@name='uttid']/@value"
 sentidxpath = './/sentence/@sentid'
 
-#altquery = "//meta[@name='alt']/@value"
+# altquery = "//meta[@name='alt']/@value"
 metaquerytemplate = "//meta[@name='{}']/@value"
 
 sentencexpathquery = "//sentence/text()"
@@ -193,6 +193,21 @@ def getuttid(syntree):
     return result
 
 
+def getuttno(syntree):
+    result = getmeta(syntree, 'uttno')
+    if result is None:
+        result = '0'
+    return result
+
+def getuttidorno(syntree):
+    result = getmeta(syntree, 'xsid')
+    if result is None:
+        result = getmeta(syntree, 'uttno')
+    if result is None:
+        result = '0'
+    return result
+
+
 def getxsid(syntree):
     result = getmeta(syntree, 'xsid')
     if result is None:
@@ -550,7 +565,7 @@ def mark(str):
 
 
 def getwordpositions(matchtree, syntree):
-    #nothing special needs to be done for index nodes since they also have begin and end
+    # nothing special needs to be done for index nodes since they also have begin and end
     positions = []
     for node in matchtree.iter():
         if 'end' in node.attrib:
@@ -588,7 +603,8 @@ def addmetadata(stree, meta):
         metadatanode = etree.Element('metadata')
         stree.append(metadatanode)
     else:
-        metadatanode = metadatanodes[0]  # we append to the first metadata node if there would be multiple (which should not be the case)
+        metadatanode = metadatanodes[
+            0]  # we append to the first metadata node if there would be multiple (which should not be the case)
     metadatanode.append(meta)
     result = stree
     return result
@@ -714,8 +730,8 @@ def asta_recognised_nounnode(node):
     result = result or sasta_long(node)
     result = result or recognised_wordnodepos(node, pos)
     result = result or recognised_lemmanodepos(node, pos)
-    result = result and not(all_lower_consonantsnode(node))
-    result = result and not(short_nucl_n(node))
+    result = result and not (all_lower_consonantsnode(node))
+    result = result and not (short_nucl_n(node))
     return result
 
 
@@ -727,8 +743,8 @@ def asta_recognised_wordnode(node):
     result = result or recognised_wordnode(node)
     result = result or recognised_lemmanode(node)
     result = result or isnumber(node)
-    result = result and not(all_lower_consonantsnode(node))
-    result = result and not(short_nucl_n(node))
+    result = result and not (all_lower_consonantsnode(node))
+    result = result and not (short_nucl_n(node))
     return result
 
 
@@ -751,7 +767,8 @@ def short_nucl_n(node):
     return result
 
 
-sasta_pseudonyms = ['NAAM', 'VOORNAAM', 'ACHTERNAAM', 'ZIEKENHUIS', 'STRAAT', 'PLAATS', 'PLAATSNAAM', 'KIND', 'BEROEP', 'OPLEIDING']
+sasta_pseudonyms = ['NAAM', 'VOORNAAM', 'ACHTERNAAM', 'ZIEKENHUIS', 'STRAAT', 'PLAATS', 'PLAATSNAAM', 'KIND', 'BEROEP',
+                    'OPLEIDING']
 pseudonym_patternlist = [r'^{}\d?$'.format(el) for el in sasta_pseudonyms]
 pseudonym_pattern = vertbar.join(pseudonym_patternlist)
 pseudonymre = re.compile(pseudonym_pattern)
@@ -768,14 +785,15 @@ def recognised_wordnodepos(node, pos):
     word = getattval(node, 'word')
     lcword = word.lower()
     result = lex.informlexiconpos(word, pos) or lex.informlexiconpos(lcword, pos) or \
-        iscompound(node) or isdiminutive(node) or lex.isa_namepart_uc(word)
+             iscompound(node) or isdiminutive(node) or lex.isa_namepart_uc(word)
     return result
 
 
 def recognised_wordnode(node):
     word = getattval(node, 'word')
     lcword = word.lower()
-    result = lex.informlexicon(word) or lex.informlexicon(lcword) or iscompound(node) or isdiminutive(node) or lex.isa_namepart(word)
+    result = lex.informlexicon(word) or lex.informlexicon(lcword) or iscompound(node) or isdiminutive(
+        node) or lex.isa_namepart(word)
     return result
 
 
@@ -826,10 +844,10 @@ def simpleshow2(stree, showchildren=True):
         if index != '':
             print(nodeformat.format(rel, '', indexstr), end=' ')
     else:
-        #print('top', end=' ')
+        # print('top', end=' ')
         for child in stree:
             simpleshow2(child)
-        #print(']', end=' ')
+        # print(']', end=' ')
 
 
 def showflatxml(elem):
@@ -890,14 +908,15 @@ def nodecopy(node):
 
 
 def bareindexnode(node):
-    result = terminal(node) and 'index' in node.attrib and 'postag' not in node.attrib and 'cat' not in node.attrib and 'pt' not in node.attrib and 'pos' not in node.attrib
-    #print(props2str(get_node_props(node)), result, file=sys.stderr)
-    return(result)
+    result = terminal(
+        node) and 'index' in node.attrib and 'postag' not in node.attrib and 'cat' not in node.attrib and 'pt' not in node.attrib and 'pos' not in node.attrib
+    # print(props2str(get_node_props(node)), result, file=sys.stderr)
+    return (result)
 
 
 def terminal(node):
     result = node is not None and len(node) == 0
-    return(result)
+    return (result)
 
 
 def indextransform(stree):
@@ -922,16 +941,16 @@ def indextransform2(stree, indexednodesmap):
         therel = getattval(stree, 'rel')
         newstree = deepcopy(indexednodesmap[theindex])
         newstree.attrib['rel'] = therel
-        #simpleshow(newstree)
-        #print()
+        # simpleshow(newstree)
+        # print()
     else:
         newstree = nodecopy(stree)
-        #simpleshow(newstree)
-        #print(id(stree))
-        #print(id(newstree))
-        #print(len(newstree))
-        #print(id(newstree.getparent()))
-        #print(id(None))
+        # simpleshow(newstree)
+        # print(id(stree))
+        # print(id(newstree))
+        # print(len(newstree))
+        # print(id(newstree.getparent()))
+        # print(id(None))
     for child in stree:
         newchild = indextransform2(child, indexednodesmap)
         newstree.append(newchild)
@@ -978,6 +997,137 @@ def getstree(fullname):
     return tree
 
 
+streestrings = {}
+streestrings[1] = '''
+[the alpino_ds XML markup of this test tree was lost in extraction; only the sentence survives]
+ en uhm en uhm hij hij is nogal
+'''
+
+streestrings[2] = '''
+[the alpino_ds XML markup of this test tree was lost in extraction; only the sentence survives]
+ ik heb een ik heb een ik heb een man met wie ik wil gaan trouwen uhm
+'''
+
+strees = {}
+for el in streestrings:
+    strees[el] = etree.fromstring(streestrings[el])
+
+
+def test():
+    for el in strees:
+        stree = strees[el]
+        lmc = lastmainclauseof(stree)
+        print(getmarkedutt(lmc, stree))
+
+
 def getsentid(stree):
     sentidlist = stree.xpath(sentidxpath)
     if sentidlist == []:
@@ -988,6 +1138,15 @@
     return uttid
 
 
+def testindextransform():
+    for el in strees:
+        stree = strees[el]
+        print(el)
+        simpleshow(stree)
+        newstree = indextransform(stree)
+        simpleshow(newstree)
+
+
 def adaptsentence(stree):
     # adapt the sentence
     # find the sentence element's parent and its index
@@ -999,7 +1158,7 @@ def adaptsentence(stree):
     sentencenodeparent = sentencenode.getparent()
     sentencenodeindex = sentencenodeparent.index(sentencenode)
     sentencenodeparent.remove(sentencenode)
-    #del sentencenodeparent[sentencenodeindex]
+    # del sentencenodeparent[sentencenodeindex]
     theyield = getyield(stree)
     theyieldstr = space.join(theyield)
     newsentence = etree.Element('sentence')
@@ -1018,24 +1177,24 @@ def transplant_node(node1, node2, stree):
     :param stree: tree in which the replacement takes place
     :return: None, the stree input parameter is modified
     '''
-    #find the parent of node1
-    #determine the index of node1
+    # find the parent of node1
+    # determine the index of node1
    sentid = getsentid(stree)
     parentindex = get_parentandindex(node1, stree)
     if parentindex is None:
         result = stree
     else:
         parent, index = parentindex
-        #SDLOGGER.debug(simpleshow(parent))
+        # SDLOGGER.debug(simpleshow(parent))
         del parent[index]
-        #SDLOGGER.debug(simpleshow(parent))
+        # SDLOGGER.debug(simpleshow(parent))
         parent.insert(index, node2)
-        #SDLOGGER.debug(simpleshow(parent))
+        # SDLOGGER.debug(simpleshow(parent))
         result = stree
-        #SDLOGGER.debug(simpleshow(stree))
+        # SDLOGGER.debug(simpleshow(stree))
 
-    #adapt the sentence
-    #find the sentence element's parent and its index
+    # adapt the sentence
+    # find the sentence element's parent and its index
     sentencenode = stree.find('.//sentence')
     sentencenodeparent = sentencenode.getparent()
     sentencenodeindex = sentencenodeparent.index(sentencenode)
@@ -1081,14 +1240,14 @@ def getspan(node):
 
 def lbrother(node, tree):
     nodebegin = getattval(node, 'begin')
-    def condition(n): return getattval(n, 'end') == nodebegin
+    condition = lambda n: getattval(n, 'end') == nodebegin
     result = findfirstnode(tree, condition)
     return result
 
 
 def rbrother(node, tree):
     nodeend = getattval(node, 'end')
-    def condition(n): return getattval(n, 'begin') == nodeend
+    condition = lambda n: getattval(n, 'begin') == nodeend
     result = findfirstnode(tree, condition)
     return result
@@ -1178,7 +1337,7 @@ def getxmetatreepositions(tree, xmetaname, poslistname='annotationposlist'):
     return result
 
 
-#topendxpath = './/node[@cat="top"]/@end'
+# topendxpath = './/node[@cat="top"]/@end'
 
 wordnodemodel = './/node[(@pt or (not(@pt) and not(@cat) and @index)) and @begin="{}"]'
@@ -1211,13 +1370,13 @@ def deletewordnode(tree, begin):
     if thenode is not None:
         thenode.getparent().remove(thenode)
 
     # renumber begins and ends must be done outside this functions when all deletions have been done;
-    #updatebeginend(newtree, begin)
+    # updatebeginend(newtree, begin)
 
     # adapt the cleantokenisation
     # done outside this function
 
-    #adapt the sentence: do this after all deletions
-    #newtree = adaptsentence(newtree)
+    # adapt the sentence: do this after all deletions
+    # newtree = adaptsentence(newtree)
 
     return newtree
@@ -1234,7 +1393,7 @@ def deletewordnodes(tree, begins):
     if newtree is None:
         return newtree
     else:
-        #wordnodexpath = wordnodemodel.format(str(begin))
+        # wordnodexpath = wordnodemodel.format(str(begin))
         thenodes = []
         for begin in begins:
             thenodes += newtree.xpath(wordnodemodel.format(str(begin)))
@@ -1251,7 +1410,7 @@ def deletewordnodes(tree, begins):
     # adapt the cleantokenisation
     # done outside this function
 
-    #adapt the sentence
+    # adapt the sentence
     newtree = adaptsentence(newtree)
     return newtree
@@ -1270,7 +1429,7 @@ def update_cleantokenisation(stree, begin):
     oldcleanedtokposmeta = find1(stree, '//xmeta[@name="cleanedtokenpositions"]')
     cleanedtokposmeta = copy(oldcleanedtokposmeta)
     parent = oldcleanedtokmeta.getparent()
-    if not(cleanedtokmeta is None and cleanedtokposmeta is None):
+    if not (cleanedtokmeta is None and cleanedtokposmeta is None):
         cleanedtokstr = cleanedtokmeta.attrib['annotationwordlist']
         cleanedtok = strliststr2list(cleanedtokstr)
         newcleanedtok = cleanedtok[:intbegin] + cleanedtok[intbegin + 1:]
@@ -1381,3 +1540,8 @@ def add_metadata(intree, metalist):
     for meta in metalist:
         metadata.append(meta.toElement())
     return tree
+
+
+if __name__ == '__main__':
+    # test()
+    testindextransform()
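
Note on the sentence_parser.py change: because the memoised parse() wrapper takes a plain string and returns the Alpino parse as a string, repeated calls with an identical sentence are now answered from the lru_cache instead of opening a new socket connection. A minimal usage sketch (assuming an Alpino server is reachable at the host:port configured in config.py):

    from sentence_parser import parse

    xml1 = parse('dit is een zin')  # first call: full round-trip to the Alpino server
    xml2 = parse('dit is een zin')  # repeated call: served from the cache
    assert xml1 is xml2             # a cache hit returns the very same string object
    print(parse.cache_info())       # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)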
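
The pivot done by data2rowtuples in the new stapforms.py turns the per-measure counters of allresults.coreresults ({item: {uttid: count}}) into one row per utterance, with the columns in the fixed sorteditemlist order and zerocount padding for absent items. A small sketch of the expected behaviour (assuming stapforms.py is importable):

    from stapforms import data2rowtuples, NS, OS

    data = {NS: {'1': 3}, OS: {'1': 2, '2': 6}}
    for uttid, row in data2rowtuples(data):
        print(uttid, row)
    # 1 [3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    # 2 [0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Every row has exactly twelve entries, matching the U..AF column range that makestapform writes into the score sheet.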
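
makestapform itself supports two output modes: by default it copies the STAP template next to the input file and returns the path of the new workbook; with in_memory=True it returns a BytesIO holding the filled-in workbook, which avoids touching the file system. A hedged sketch (the AllResults call mirrors the test() in stapforms.py, and the template workbook must exist under SD_DIR/form_templates):

    from allresults import AllResults
    from stapforms import makestapform, NS, OS

    allresults = AllResults(0, {NS: {'1': 3}, OS: {'1': 2}}, {}, {}, 'sample.xml', {})
    buffer = makestapform(allresults, None, in_memory=True)  # BytesIO; nothing written next to sample.xml
    with open('sample_STAP-Form.xlsx', 'wb') as f:
        f.write(buffer.getvalue())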
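
Finally, the new realwordstring helper in stringfunctions.py counts every token of two or more characters as a word and rejects single characters whose Unicode category is punctuation (category P*), so commas, hyphens and question marks are kept out of word counts:

    from stringfunctions import realwordstring

    print([w for w in ['huis', ',', 'ja', '-', '?'] if realwordstring(w)])
    # ['huis', 'ja']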