diff --git a/.gitignore b/.gitignore index 97c0b03..cceb264 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,8 @@ env.bak/ venv.bak/ # configuration -config.py \ No newline at end of file +config.py + +# additional files +.idea/ +sastalog.txt \ No newline at end of file diff --git a/CHAT_Annotation.py b/CHAT_Annotation.py index 1cfd985..edc006f 100644 --- a/CHAT_Annotation.py +++ b/CHAT_Annotation.py @@ -22,6 +22,9 @@ emptyreplacement = eps anybutrb = r'[^\]]*' +errormarking = 'Error Marking' +omittedword = 'Omitted Word' +specialform = 'Special Form' def fullre(pat): result = r'^' + pat + r'$' @@ -41,6 +44,7 @@ def refunction(x): result = fullre(x) return result + # u2013 = en-dash, u2014 = em-dash, u2015 = horizontal bar @@ -135,7 +139,8 @@ def apply(self, tokens, annotation, repkeep): annotatedposlist = [token.pos] annotatedwordlist = [token.word] annotationposlist = [p for p in range(m.start(), m.end())] - newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist) + newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist, + annotatedwordlist, annotationposlist) metadata.append(newmeta) newword = self.compiledre.sub(self.replacement, token.word) newtoken = Token(newword, token.pos) @@ -226,7 +231,10 @@ def apply(self, tokens, annotation, repkeep): else: (b, e) = scope if ltodotokens == e + 1: - SDLOGGER.error('Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(b, e, show(todotokens))) + SDLOGGER.error( + 'Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(b, e, + show( + todotokens))) newtokens += todotokens[:b] + todotokens[b + 1:e] tokenctr = e + 1 elif self.compiledre.search(todotokens[e + 1].word): @@ -234,7 +242,9 @@ def apply(self, tokens, annotation, repkeep): annotationpositions = [token.pos for token in todotokens[b + 1:e]] if self.arity == dyadic: if ltodotokens <= e + 2: - SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name, show(todotokens))) + SDLOGGER.error( + 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name, + show(todotokens))) newtokens += todotokens[b + 1:e] break else: @@ -247,7 +257,8 @@ def apply(self, tokens, annotation, repkeep): SDLOGGER.error('Illegal arity specification ({}) on {}'.format(self.arity, annotation.name)) annotatedwords = [] annotatedpositions = [] - newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions, annotatedwords, annotationpositions) + newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions, + annotatedwords, annotationpositions) metadata.append(newmeta) newtokens += todotokens[tokenctr:b] replacement = getreplacement(repkeep, annotation) @@ -270,7 +281,8 @@ def apply(self, tokens, annotation, repkeep): while i < ltodotokens: if self.compiledre.search(todotokens[i].word): if scopewords == []: - SDLOGGER.error('First argument of annotation {} missing. Annotation ignored'.format(annotation.name)) + SDLOGGER.error( + 'First argument of annotation {} missing. 
Annotation ignored'.format(annotation.name)) else: if self.arity == monadic: annotatedpositions = [] @@ -283,15 +295,17 @@ def apply(self, tokens, annotation, repkeep): metadata.append(newmeta) elif self.arity == dyadic: if i + 1 >= ltodotokens: - SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name, - show(todotokens))) + SDLOGGER.error( + 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name, + show(todotokens))) else: annotatedpositions = [todotokens[i + 1].pos] annotatedwords = [todotokens[i + 1].word] replacement = getreplacement(repkeep, annotation) newtokens = doreplacement([prevtoken], replacement, newtokens) prevtoken = None - newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions, annotatedwords, scopepositions) + newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions, + annotatedwords, scopepositions) metadata.append(newmeta) else: if prevtoken is not None: @@ -308,11 +322,11 @@ def apply(self, tokens, annotation, repkeep): class CHAT_ComplexRegex(CHAT_Regex): def __init__(self, regextuple, replacementtuple, scoped, containswords=False): - self.regexbegin = regextuple[0] # 3 elements: begin mid end - self.regexmid = regextuple[1] # 3 elements: begin mid end - self.regexend = regextuple[2] # 3 elements: begin mid end - self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ] - self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ] + self.regexbegin = regextuple[0] # 3 elements: begin mid end + self.regexmid = regextuple[1] # 3 elements: begin mid end + self.regexend = regextuple[2] # 3 elements: begin mid end + self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ] + self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ] self.scoped = scoped self.containswords = containswords self.compiledrebegin = re.compile(refunction(self.regexbegin)) @@ -360,7 +374,8 @@ def apply(self, tokens, annotation, repkeep): elif state == scopestate: scope = findscope(tokens[tokenctr - 1:], offset=tokenctr - 1) if scope is None: - SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos, show(tokens))) + SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos, + show(tokens))) state = wstate else: (b, e) = scope @@ -372,13 +387,16 @@ def apply(self, tokens, annotation, repkeep): if bbbe is not None: (bracketbegin, bracketend) = bbbe annotationtokens = todotokens[bracketbegin + 1: bracketend] - (cleanannotationtokens, innermetadata) = cleanCHILDEStokens.cleantokens(annotationtokens, repkeep) if self.containswords else (annotationtokens, []) + (cleanannotationtokens, innermetadata) = cleanCHILDEStokens.cleantokens(annotationtokens, + repkeep) if self.containswords else ( + annotationtokens, []) metadata += innermetadata annotatedwords = [t.word for t in tobereplacedtokens if t.word not in ['<', '>']] annotatedpositions = [t.pos for t in tobereplacedtokens if t.word not in ['<', '>']] thevalue = [token.word for token in cleanannotationtokens] annotationpositions = [token.pos for token in cleanannotationtokens] - newmeta = annotation.metadatafunction(annotation, thevalue, annotatedpositions, annotatedwords, annotationpositions) + newmeta = annotation.metadatafunction(annotation, thevalue, 
annotatedpositions, annotatedwords, + annotationpositions) metadata.append(newmeta) replacement = self.scopereplacement repltokens = [t for t in tobereplacedtokens if t.word not in ['<', '>']] @@ -395,10 +413,10 @@ def apply(self, tokens, annotation, repkeep): tokenctr += inc newtokens += tobereplacedtokens if state in estates: - return(newtokens, metadata) + return (newtokens, metadata) else: SDLOGGER.error('Not in an end state, state={} in {}'.format(state, show(tokens))) - return(tokens, []) + return (tokens, []) def findbrackets(tokens, regexes, offset=0): @@ -430,14 +448,29 @@ def dropbrackets(w): return result -def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT) -def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT, backplacement=bpl_delete) +def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], + annotatedwordlist=[w], source=CHAT) + + +def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], + annotatedwordlist=[w], source=CHAT, + backplacement=bpl_delete) def simplescopedmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \ - Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist, annotatedwordlist=annotatedwordlist, source=CHAT) + Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist, + annotatedwordlist=annotatedwordlist, source=CHAT) + + def complexmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \ - Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist, annotatedposlist=annotatedposlist, source=CHAT) + Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist, + annotatedposlist=annotatedposlist, source=CHAT) + + +def charmetafunction(ann, annotationcharlist, annotatedcharlist, annotationcharposlist, annotatedcharposlist): + return Meta(ann.name, annotationcharlist, annotationcharlist=annotationcharlist, + annotatedcharlist=annotatedcharlist, + annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist) def epsf(w): return '' @@ -492,6 +525,7 @@ def dropchars2(w, c): def CHAT_message(msg): def result(x, y): return SDLOGGER.warning(msg.format(x, y)) + return result @@ -502,12 +536,15 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y)) # here additional things could be done CHAT_Annotation('Overlap Precedes', '8.4:71-72', '10.3:75', CHAT_SimpleScopedRegex(r'\[\<[0-9]?\]', keep, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Special Form', '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False), simplemetafunction(getsfvalue)), - CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False), simplemetafunction(epsf)), - CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False), simplemetafunction(epsf)), + CHAT_Annotation(specialform, '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False), + simplemetafunction(getsfvalue)), + CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False), + 
simplemetafunction(epsf)), + CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False), + simplemetafunction(epsf)), CHAT_Annotation('Noncompletion of a Word', '6.5:43', '8.5:48', - CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), complexmetafunction), - CHAT_Annotation('Omitted Word', '6.5:43', '8.5:48-49', + CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), charmetafunction), + CHAT_Annotation(omittedword, '6.5:43', '8.5:48-49', CHAT_SimpleRegex(r'0[\w:]+', dropzero, False), simple_bpldel_metafunction(dropzero)), CHAT_Annotation('Satellite at End', '7.4:58', '9.2:59-60', CHAT_SimpleRegex(r'\s„\s', eps, False), simplemetafunction(identity)), @@ -524,8 +561,9 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y)) simplemetafunction(dropinitial)), # this one must crucially precede Pause Between Syllables CHAT_Annotation('Pause Between Syllables', '7.7:60', '9.9:63-64', CHAT_InWordRegex(r'\^', ''), complexmetafunction), CHAT_Annotation('Simple Event', '7.8.1:60', '9.10.1:64-65', CHAT_SimpleRegex(r'&=[\w:]+', eps, False), - simplemetafunction(identity)), - CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65', CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False), + simplemetafunction(identity)), + CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65', + CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False), complexmetafunction), CHAT_Annotation('Pause', '7.8.3:62', '9.10.4:66', CHAT_SimpleRegex(r'\(\.\.?\.?\)', eps, False), simplemetafunction(identity)), @@ -577,54 +615,74 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y)) simplemetafunction(identity)), # erroR marking crucially before [/] [//] [///] etc - CHAT_Annotation('Error Marking', '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic), + CHAT_Annotation(errormarking, '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Error Marking', '8.5:75', '10.5:78', + CHAT_Annotation(errormarking, '8.5:75', '10.5:78', CHAT_ComplexRegex((r'\[\*', r'[\w:\-\+=]+', r'\]'), (keep, eps), False), complexmetafunction), - CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True), + CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71', + CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True), complexmetafunction), # pic bullet and text bullet must essentially before time alignment - CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True), + CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71', + CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True), complexmetafunction), - CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True), + CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', + CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True), complexmetafunction), - CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True), + CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', + CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True), complexmetafunction), # not an official code but it occurs as such in CLPF - CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72', 
CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True), + CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72', + CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), CHAT_Annotation('Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!\]', keep, False, monadic), simplescopedmetafunction), - CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic), + CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72', + CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic), simplescopedmetafunction), # Duration to be added here @@ - CHAT_Annotation('Explanation', '8.3:69', '10.3:73', CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False), + CHAT_Annotation('Explanation', '8.3:69', '10.3:73', + CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False), complexmetafunction), CHAT_Annotation('Replacement', '8.3:69', '10.3:73', - CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True), complexmetafunction), + CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True), + complexmetafunction), CHAT_Annotation('Replacement of Real Word', '8.3:70', '10.3:73', CHAT_ComplexRegex((r'\[::', r'([^\]]+)', r'\]'), (eps, keep), True), complexmetafunction), CHAT_Annotation('Alternative Transcription', '8.3:70', '10.3:74', CHAT_ComplexRegex((r'\[=\?', r'([^\]]+)', r'\]'), (keep, eps), True), complexmetafunction), CHAT_Annotation('Dependent Tier on Main Line', '8.3:70', 'none', - CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), # @@must do something with the speaker + CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), + # @@must do something with the speaker CHAT_Annotation('Comment on Main Line', '8.3:70', '10.3:74', CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), - CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic), simplescopedmetafunction), + CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic), + simplescopedmetafunction), + CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic), + simplescopedmetafunction), CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76', CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'), (keep, eps), True), complexmetafunction), - CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic), simplescopedmetafunction), - CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction), - CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic), simplescopedmetafunction), - CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False), 
simplemetafunction(identity)), # needs extension - CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False), # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm + CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic), + simplescopedmetafunction), + CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic), + simplescopedmetafunction), + CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77', + CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction), + CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77', + CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction), + CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic), + simplescopedmetafunction), + CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False), + simplemetafunction(identity)), # needs extension + CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False), + # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm simplemetafunction(interposedword)), - CHAT_Annotation('Postcode', '8.6:75', '10.5:78', CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False), + CHAT_Annotation('Postcode', '8.6:75', '10.5:78', + CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False), complexmetafunction), - CHAT_Annotation('Language Precode', '8.6:75', '10.5:79', CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False), + CHAT_Annotation('Language Precode', '8.6:75', '10.5:79', + CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False), complexmetafunction), CHAT_Annotation('Excluded Utterance', '8.6:75-76', '10.5:79', CHAT_SimpleRegex(r'\[\+\s+bch\]', eps, False), simplemetafunction(interposedword)), @@ -632,9 +690,12 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y)) simplemetafunction(interposedword)), CHAT_Annotation('Zero Utterance', '', '10.5:79, 11.1:81', CHAT_SimpleRegex(r'\b0\b', eps, False), simplemetafunction(identity)), - CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''), complexmetafunction), - CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction), # take care extra token!@@ - CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction), # take care extra token@@ + CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''), + complexmetafunction), + CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction), + # take care extra token!@@ + CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction), + # take care extra token@@ CHAT_Annotation('Blocked Segments', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u2260.*?\u2260', ''), complexmetafunction), # these must be applied after [/], [//], [///] etc diff --git a/TARSPpostfunctions.py b/TARSPpostfunctions.py index 4d9d424..4a1456e 100644 --- a/TARSPpostfunctions.py +++ b/TARSPpostfunctions.py @@ -6,6 +6,7 @@ from query import core_process from treebankfunctions import getmeta 
+from config import SDLOGGER OndVC = 'T071' OndWVC = 'T076' @@ -74,8 +75,11 @@ def getstage(uttcounts, allresults): cands = [] gtotaal = allresults.postresults['T152'] for el in uttcounts: - if uttcounts[el] / gtotaal >= gofase_minthreshold: - cands.append(el) + if gtotaal != 0: + if uttcounts[el] / gtotaal >= gofase_minthreshold: + cands.append(el) + else: + SDLOGGER.error('gtotaal has value 0') if cands == []: result = 1 else: diff --git a/adjtest.py b/adjtest.py new file mode 100644 index 0000000..84d738b --- /dev/null +++ b/adjtest.py @@ -0,0 +1,121 @@ +from lxml import etree +from treebankfunctions import showtree +from asta_queries import asta_bijzin + +streestrings = {} + +streestrings[0] = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + uh dus sinds ik hier ben heb ik logo omdat ik + Q#ng1647271273|dus sinds ik hier ben heb ik logo omdat ik|1|3|-0.6490448165400009 + + +""" + + +strees = {} +for x in streestrings: + strees[x] = etree.fromstring(streestrings[x]) + +thequery = """ +.//node[ + ( (@word="geboren") or + + (@pt="adj" and + (@rel="mod" and + parent::node[@cat="np"] and + ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")] and + (not(@begin < ../node[@rel="det" and (@pt="lid" or @pt="vnw")]/@begin) or @lemma='heel' or @lemma='geheel') + ) + ) + or + + (@pt="adj" and + (@rel="hd" and + parent::node[@cat="ap" and parent::node[@cat="np"] and + ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")]] + ) + ) + or + + (@pt="tw" and @numtype="rang") + or + + (@pt="adj" and @rel="hd" and parent::node[@cat="np"]) + or + + ( + (@pt="tw" and @numtype="rang") + and @positie = "nom" ) + or + + (@pt="ww" and @wvorm="vd" and @rel="mod" and parent::node[@cat="np"]) + or + + (@pt="ww" and @wvorm="od" and @rel="mod" and parent::node[@cat="np"]) + or + + (@pt="adj" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )]) +) + or + + (@pt="adj" and @rel="hd" and parent::node[@cat="ap" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )]) +]) + or + (@rel="det" and @pt="vnw" and @vwtype="onbep") + + ) +] +""" + +#matches = strees[0].xpath(thequery) +matches = asta_bijzin(strees[0]) +for m in matches: + showtree(m) \ No newline at end of file diff --git a/alpino.py b/alpino.py index 4ddfe11..24dda43 100644 --- a/alpino.py +++ b/alpino.py @@ -30,9 +30,12 @@ def getdehetwordinfo(wrd): # we only want to consider nouns or words of unknown word class (such as kopje in CELEX) wordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[0] in ['n', 'None']] - # if any of the alternatives is a de-word, we empty the whole list - if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]): - wordinfos = [] + # if any of the alternatives is a de-word, we keep only these + dewordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[1] == lexicon.de] + if dewordinfos != []: + wordinfos = dewordinfos + #if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]): + # wordinfos = [] # if not found yet we check with Alpino if wordinfos != []: diff --git a/asta_neo.py b/asta_neo.py new 
file mode 100644 index 0000000..e5d6d0d --- /dev/null +++ b/asta_neo.py @@ -0,0 +1,147 @@ +from lxml import etree +#from CHAT_Annotation import specialform, errormarking + +specialform = 'Special Form' +errormarking = 'Error Marking' + +mdnamemdxpathtemplate = """.//xmeta[@name="{mdname}" and @value="{mdvalue}"]""" +ptposxpathtemplate = './/node[@pt and @begin="{position}"]' + +def mdbasedquery(stree, mdname, mdvalue): + mdnamemdxpath = mdnamemdxpathtemplate.format(mdname=mdname, mdvalue=mdvalue) + mdnamemds = stree.xpath(mdnamemdxpath) + results = [] + for mdnamemd in mdnamemds: + annotatedposstr = mdnamemd.attrib['annotatedposlist'] + if annotatedposstr != '': + mdbeginval = annotatedposstr[1:-1] + ptposxpath = ptposxpathtemplate.format(position=mdbeginval) + newresults = stree.xpath(ptposxpath) + results += newresults + + return results + +def neologisme(stree): + results1 = mdbasedquery(stree, errormarking,"['n']") + results2 = mdbasedquery(stree, specialform, '@n') + results = results1 + results2 + return results + +def sempar(stree): + results = mdbasedquery(stree, errormarking, "['s']") + return results + +def phonpar(stree): + results = mdbasedquery(stree, errormarking, "['p']") + return results + + +def test(stree): + neoresults = neologisme(stree) + semparresults = sempar(stree) + phonparresults = phonpar(stree) + results = [('neo', neoresult) for neoresult in neoresults] +\ + [('sempar', semparresult) for semparresult in semparresults] +\ + [('phonpar', phonparresult) for phonparresult in phonparresults] + return results + +def main(): + for i in strees: + results = test(strees[i]) + for result in results: + print('{}: {}:{}'.format(result[0], result[1].attrib['word'], result[1].attrib['begin'])) + + + +streestrings = {} + +streestrings[1] = """ + + + + + + + + + + + + ik heb geduusterd + + Q#ng1646152422|ik heb geduusterd|1|1|-5.158487943820001 + + +""" + + +streestrings[2] = """ + + + + + + + + + ik heb ngeduusterd + + Q#ng1646219407|ik heb ngeduusterd|1|1|-1.6311900273499995 + + +""" + +streestrings[3] = """ + + + + + + + + + ik heb nngeduusterd + + Q#ng1646219408|ik heb nngeduusterd|1|1|-1.6311900273499995 + + +""" +streestrings[4] = """ + + + + + + + + + ik heb pgeduusterd + + Q#ng1646219409|ik heb pgeduusterd|1|1|-1.6311900273499995 + + +""" + +streestrings[5] = """ + + + + + + + + + ik heb sgeduusterd + + Q#ng1646219410|ik heb sgeduusterd|1|1|-1.6311900273499995 + + + +""" + +strees = {} +for i in streestrings: + strees[i] = etree.fromstring(streestrings[i]) + +if __name__ == '__main__': + main() diff --git a/asta_queries.py b/asta_queries.py index 5097681..575a80d 100644 --- a/asta_queries.py +++ b/asta_queries.py @@ -213,13 +213,18 @@ def asta_bijzin(stree): if getattval(cn1, 'begin') == getattval(cn0, 'begin'): cn0end = getattval(cn0, 'end') newbegin = cn0end - newokptnode = find1(cn1, '//node[@pt and @begin={newbegin}]'.format(newbegin=newbegin)) - result = sortedclausenodes[2:] + okptnodes + [newokptnode] + newokptnodexpath = '//node[@pt and @begin="{newbegin}"]'.format(newbegin=newbegin) + newokptnode = find1(cn1, newokptnodexpath) + result = sortedclausenodes[2:] + okptnodes + if newokptnode is not None: + result += [newokptnode] else: result = sortedclausenodes[1:] + okptnodes else: result = sortedclausenodes[1:] + okptnodes + #ad hoc statement to ensure that there are no None matches should not happen anymore + result = [el for el in result if el is not None] return result diff --git a/basicreplacements.py b/basicreplacements.py index f369702..339b327 100644 --- 
a/basicreplacements.py +++ b/basicreplacements.py @@ -41,6 +41,9 @@ ('effe', 'even', pron, infpron, varpron), ('set', 'zet', pron, infpron, initdev), ('hie', 'hier', pron, pronerr, codared), ('eers', 'eerst', pron, pronerr, codared), + ('era', 'eraf', pron, pronerr, codared), + ('il', 'wil', pron, pronerr, onsetred), + ('tee', 'twee', pron, pronerr, onsetred), ('nie', 'niet', pron, infpron, codared), ('s', 'is', orth, spellerr, apomiss), ('ooke', 'ook', pron, infpron, addschwa), ('it', 'dit', pron, pronerr, onsetred), @@ -67,6 +70,7 @@ ('dis', ['dit', 'is'], pron, infpron, contract), ('das', ['dat', 'is'], pron, infpron, contract), ('tis', ['dit', 'is'], pron, infpron, contract), + ('waas', ['waar', 'is'], pron, infpron, contract), ('is-t-ie', ['is', 'ie'], pron, infpron, t_ie), ('als-t-ie', ['als', 'ie'], pron, infpron, t_ie), ('of-t-ie', ['of', 'ie'], pron, infpron, t_ie), diff --git a/checkcorrection.py b/checkcorrection.py new file mode 100644 index 0000000..021cbea --- /dev/null +++ b/checkcorrection.py @@ -0,0 +1,66 @@ +''' +Compares the errorlogging file with the error reference file +''' + +import os +from xlsx import getxlsxdata + +dataset = 'vkltarsp' +dataset = 'vklstap' +dataset = 'vklasta' + +if dataset == 'vkltarsp': + resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\tarsp' + dataprefix = 'tarsp' + +elif dataset == 'vklstap': + resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata' + dataprefix = 'stap' + +elif dataset == 'vklasta': + resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\asta' + dataprefix = 'asta' + + +errorloggingfilename = dataprefix + '_errorlogging.xlsx' +errorloggingfullname = os.path.join(resultspath, errorloggingfilename) + +referencepath = r'D:\jodijk\Dropbox\Surfdrive\Shared\SASTAPLUS\November' +errorreffilename = dataprefix + '_error_ref.xlsx' +errorreffullname = os.path.join(referencepath, errorreffilename) + +logheader, logdata = getxlsxdata(errorloggingfullname) +refheader, refdata = getxlsxdata(errorreffullname) + +refdict = {(row[0], row[1]): row[3] for row in refdata} + +correctcorrections = 0 +missedcorrections = 0 +wrongcorrections = 0 +for row in logdata: + key = (row[0], row[5]) + if 'BEST' in row[10]: + logsent = row[9] + if key not in refdict: + print('Missing example in refdict: {}'.format(key)) + print(row[9]) + missedcorrections += 1 + else: + refsent = refdict[key] + if refsent != logsent: + print('Mismatch: {}'.format(key)) + print('refsent=<{}>'.format(refsent)) + print('logsent=<{}>'.format(logsent)) + wrongcorrections += 1 + else: + correctcorrections += 1 + +allcorrections = correctcorrections + wrongcorrections + missedcorrections + +correctioncounts = [correctcorrections, wrongcorrections, missedcorrections] +labels = ['correct corrections', 'wrong corrections', 'missed corrections'] +labeled_corrections = zip(labels, correctioncounts) + +print('\nSummary:\n') +for label, corr in labeled_corrections: + print('{} = {} ({:.2f}%)'.format(label, corr, corr / allcorrections * 100)) \ No newline at end of file diff --git a/cleanCHILDEStokens.py b/cleanCHILDEStokens.py index f7e074c..9d4a922 100644 --- a/cleanCHILDEStokens.py +++ b/cleanCHILDEStokens.py @@ -20,6 +20,16 @@ bstate, ostate, oostate, costate, ccstate = 0, 1, 2, 3, 4 +#this should be identical to the checkpattern of cleanCHILDESMD +# #checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ123456789·\u22A5\u00B7\u0001\u2260\u21AB]') +# 
checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ·\u22A5\u00B7\u0001\u2260\u21AB]') +# # + should not occur except as compound marker black+board +# # next one split up in order to do substitutions +# pluspattern = re.compile(r'(\W)\+|\+(\W)') +# pluspattern1 = re.compile(r'(\W)\+') +# pluspattern2 = re.compile(r'\+(\W)') +illegalcleanedchatsymbols = '<>' + def findscopeclose(tokens, offset=0): tokenctr = 0 @@ -83,22 +93,31 @@ def checkline(line, newline, outfilename, lineno, logfile): print('charcodes=<{}>'.format(thecodes), file=logfile) -def cleantext(utt, repkeep): +def purifytokens(tokens): + result = [token for token in tokens if token.word not in illegalcleanedchatsymbols] + return result + +def cleantext(utt, repkeep, tokenoutput=False): newutt = robustness(utt) tokens = sastatok.sasta_tokenize(newutt) inwordlist = [t.word for t in tokens] intokenstrings = [str(token) for token in tokens] # print(space.join(intokenstrings)) (newtokens, metadata) = cleantokens(tokens, repkeep) + #remove symbol tokens that should not be there anymore + newtokens = purifytokens(newtokens) resultwordlist = [t.word for t in newtokens] resultstring = smartjoin(resultwordlist) resultposlist = [t.pos for t in newtokens] newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none) newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none) - newmeta3 = Meta('cleanedtokenpositions', resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none) + newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none) metadata += [newmeta1, newmeta2, newmeta3] resultmetadata = metadata - return (resultstring, resultmetadata) + if tokenoutput: + return(newtokens, resultmetadata) + else: + return (resultstring, resultmetadata) def cleantokens(tokens, repkeep): @@ -133,7 +152,10 @@ def removesuspects(str): return result -robustnessrules = [(re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'), +robustnessrules = [(re.compile(r'\u2026'), '\u2026', '...', 'Horizontal Ellipsis (\u2026, Unicode U+2026) replaced by a sequence of three Full Stops (..., Unicode U+002E) '), + (re.compile('#'), '#', '', 'Number Sign (#, Unicode U+0023) removed'), + #(re.compile('#'), '#', '(.)', 'Number Sign (#, Unicode U+0023) replaced by CHAT (short) pause code: (.)'), + (re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'), (re.compile(r'\[\+trn\]'), '[+trn]', '[+ trn]', 'Missing space'), (re.compile(r'\[:(?![:\s])'), '[:', '[: ', 'Missing space'), (re.compile(r'(?<=\w)\+\.\.\.'), '+...', ' +...', 'Missing space'), diff --git a/corrector.py b/corrector.py index d5c78ba..a0947dd 100644 --- a/corrector.py +++ b/corrector.py @@ -1,15 +1,5 @@ '''' -Jij moet er dan voor zorgen dat je in de CHAT file die je produceert iedere uiting afgaat en een call doet naar een functie - -getcorrection -met als argument de string van de uiting. - -Deze functie geeft dan terug een tuple (correction, metadata) - -waarbij -• correction een string is die je op moet nemen in de chat file als de verbeterde uiting -• metadata metadata zijn a la PaQu (type, name, value) o.a. 
origutt van type text met als waarde de inputstring - +to be added ''' import copy @@ -26,7 +16,7 @@ getunwantedtokens, nodesfindjaneenou) from deregularise import correctinflection from iedims import getjeforms -from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart +from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart, tswnouns from macros import expandmacros # from namepartlexicon import namepart_isa_namepart from sastatok import sasta_tokenize @@ -36,7 +26,7 @@ vowels) from sva import getsvacorrections from tokenmd import TokenListMD, TokenMD, mdlist2listmd -from treebankfunctions import find1, getattval, getnodeyield +from treebankfunctions import find1, getattval, getnodeyield, showtree, treeinflate, fatparse from lxml import etree import sys # from alternative import Alternative, Replacement, Metadata, Meta @@ -46,6 +36,7 @@ from alpinoparsing import parse, escape_alpino_input from expandquery import expandmacros from find_ngram import findmatches, ngram1, ngram2, ngram7, ngram10, ngram11, ngram16, ngram17 +from smallclauses import smallclauses SASTA = 'SASTA' @@ -177,7 +168,8 @@ def reduce(tokens, tree): # remove tsw incl goh och hé oke but not ja, nee, nou tswtokens = [n for n in reducedtokens if n.pos in token2nodemap and getattval(token2nodemap[n.pos], 'pt') == 'tsw' - and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'}] + and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'} + and getattval(token2nodemap[n.pos], 'lemma') not in tswnouns] tswpositions = [n.pos for n in tswtokens] allremovetokens += tswtokens allremovepositions == tswpositions @@ -413,11 +405,12 @@ def getcorrection(utt, tree=None, interactive=False): return result -def getcorrections(utt, method, tree=None, interactive=False): - origutt = utt +def getcorrections(rawtokens, method, tree=None, interactive=False): allmetadata = [] - rawtokens = sasta_tokenize(utt) + # rawtokens = sasta_tokenize(utt) wordlist = tokenlist2stringlist(rawtokens) + utt = space.join(wordlist) + origutt = utt # check whether the tree has the same yield origtree = tree @@ -426,7 +419,7 @@ def getcorrections(utt, method, tree=None, interactive=False): if treewordlist != wordlist: revisedutt = space.join(wordlist) - tree = PARSE_FUNC(revisedutt) + tree = fatparse(revisedutt, rawtokens) tokens, metadata = cleantokens(rawtokens, repkeep=False) allmetadata += metadata @@ -489,8 +482,9 @@ def getalternatives(origtokensmd, method, tree, uttid): # now turn each sequence of (token, md) pairs into a pair (tokenlist, mergedmetadata) newaltuttmds = [] for altuttmd in altutts: - newaltuttmd = mdlist2listmd(altuttmd) - newaltuttmds.append(newaltuttmd) + if altuttmd != []: + newaltuttmd = mdlist2listmd(altuttmd) + newaltuttmds.append(newaltuttmd) # basic expansions @@ -508,8 +502,8 @@ def getalternatives(origtokensmd, method, tree, uttid): for uttmd in allalternativemds: # utterance = space.join([token.word for token in uttmd.tokens]) utterance, _ = mkuttwithskips(uttmd.tokens) - ntree = PARSE_FUNC(utterance) - newresults += getwrongdetalternatives(uttmd, ntree, uttid) + fatntree = fatparse(utterance, uttmd.tokens) + newresults += getwrongdetalternatives(uttmd, fatntree, uttid) allalternativemds += newresults newresults = [] @@ -518,9 +512,11 @@ def getalternatives(origtokensmd, method, tree, uttid): utterance, _ = mkuttwithskips(uttmd.tokens) # reducedtokens = [t for t in uttmd.tokens if not t.skip] # reduceduttmd = TokenListMD(reducedtokens, 
uttmd.metadata) - ntree = PARSE_FUNC(utterance) - # simpleshow(ntree) - uttalternativemds = getsvacorrections(uttmd, ntree, uttid) + fatntree = fatparse(utterance, uttmd.tokens) + debug = False + if debug: + showtree(fatntree) + uttalternativemds = getsvacorrections(uttmd, fatntree, uttid) newresults += uttalternativemds allalternativemds += newresults @@ -528,8 +524,16 @@ def getalternatives(origtokensmd, method, tree, uttid): for uttmd in allalternativemds: # utterance = space.join([token.word for token in uttmd.tokens]) utterance, _ = mkuttwithskips(uttmd.tokens) - ntree = PARSE_FUNC(utterance) - newresults += correctPdit(uttmd, ntree, uttid) + fatntree = fatparse(utterance, uttmd.tokens) + newresults += correctPdit(uttmd, fatntree, uttid) + allalternativemds += newresults + + newresults = [] + for uttmd in allalternativemds: + utterance, _ = mkuttwithskips(uttmd.tokens) + fatntree = fatparse(utterance, uttmd.tokens) + newresults += smallclauses(uttmd, fatntree) + # showtree(fatntree, text='fatntree') allalternativemds += newresults # final check whether the alternatives are improvements. It is not assumed that the original tokens is included in the alternatives @@ -570,7 +574,7 @@ def mkuttwithskips(tokens, delete=True): return result, tokenposlist -def getexpansions(uttmd): +def oldgetexpansions(uttmd): expansionfound = False newtokens = [] tokenctr = 0 @@ -612,6 +616,50 @@ def getexpansions(uttmd): return result + +def getexpansions(uttmd): + expansionfound = False + newtokens = [] + tokenctr = 0 + #newtokenctr = 0 + tokenposlist = [] + newmd = uttmd.metadata + for tokenctr, token in enumerate(uttmd.tokens): + if token.word.lower() in basicexpansions: + expansionfound = True + for (rlist, c, n, v) in basicexpansions[token.word.lower()]: + rlisttokenctr = 0 + for rlisttokenctr, rw in enumerate(rlist): + if rlisttokenctr == 0: + newtoken = Token(rw, token.pos) + else: + newtoken = Token(rw, token.pos, subpos=rlisttokenctr) + newtokens.append(newtoken) + tokenposlist.append(token.pos) + nwt = Token(space.join(rlist), token.pos) + meta1 = mkSASTAMeta(token, nwt, n, v, c, subcat=None, penalty=defaultpenalty, + backplacement=bpl_none) + newmd.append(meta1) + + else: + newtoken = Token(token.word, token.pos) + newtokens.append(newtoken) + tokenposlist.append(token.pos) + + # adapt the metadata + if expansionfound: + meta2 = Meta('OrigCleanTokenPosList', tokenposlist, annotatedposlist=[], + annotatedwordlist=[], annotationposlist=tokenposlist, + annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=defaultpenalty, + backplacement=bpl_none) + newmd.append(meta2) + result = [TokenListMD(newtokens, newmd)] + else: + result = [] + + return result + + def lexcheck(intokensmd, allalternativemds): finalalternativemds = [intokensmd] for alternativemd in allalternativemds: @@ -708,7 +756,7 @@ def explanationasreplacement(tokensmd, tree): bpl = bpl_node if known_word(oldword) else bpl_word meta = mkSASTAMeta(oldtoken, newtoken, name='ExplanationasReplacement', value='ExplanationasReplacement', - cat='Lexical Error', backplacement=bpl_node) + cat='Lexical Error', backplacement=bpl) newmetadata.append(meta) result = TokenListMD(newtokens, newmetadata) return result @@ -925,10 +973,10 @@ def getwrongdetalternatives(tokensmd, tree, uttid): meta = mkSASTAMeta(token, newcurtoken, name='GrammarError', value='deheterror', cat='Error', backplacement=bpl_node) metadata.append(meta) + correctiondone = True else: newcurtokenword = token.word newtokens.append(Token(newcurtokenword, 
token.pos)) - correctiondone = True else: newcurtokenword = token.word newtokens.append(token) @@ -959,24 +1007,24 @@ def correctPdit(tokensmd, tree, uttid): metadata = tokensmd.metadata newtokens = [] tokenctr = 0 + nonskiptokenctr = 0 prevtoken = None for token in tokens: - tokennode = next(filter(lambda x: getattval(x, 'begin') == str(tokenctr), tokennodes), None) + tokennode = next(filter(lambda x: getattval(x, 'begin') == str(token.pos + token.subpos), tokennodes), None) tokenlemma = getattval(tokennode, 'lemma') if not token.skip and prevtoken is not None and not prevtoken.skip and tokenlemma in {'dit', 'dat', 'deze', 'die'}: tokenrel = getattval(tokennode, 'rel') tokenpt = getattval(tokennode, 'pt') - prevtokennode = tokennodes[tokenctr - 1] if tokenctr > 0 else None + prevtokennode = tokennodes[nonskiptokenctr - 1] if tokenctr > 0 else None if prevtokennode is not None: prevpt = getattval(prevtokennode, 'pt') prevparent = prevtokennode.getparent() prevparentrel, prevparentcat = getattval(prevparent, 'rel'), getattval(prevparent, 'cat') indezemwp = getindezemwp(prevtokennode, tokennode) - if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'obj1', - 'det'} and tokenpt == 'vnw') or \ + if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'det'} and tokenpt == 'vnw') or \ indezemwp: - newtoken = Token('hem', tokenctr) + newtoken = Token('hem', token.pos, subpos=token.subpos) bpl = bpl_indeze if indezemwp else bpl_node meta = mkSASTAMeta(token, newtoken, name='parsed as', value='hem', cat='AlpinoImprovement', backplacement=bpl) @@ -990,6 +1038,8 @@ def correctPdit(tokensmd, tree, uttid): else: newtokens.append(token) tokenctr += 1 + if not token.skip: + nonskiptokenctr += 1 prevtoken = token result = TokenListMD(newtokens, metadata) if correctiondone: diff --git a/correcttreebank.py b/correcttreebank.py index 9354add..25912d7 100644 --- a/correcttreebank.py +++ b/correcttreebank.py @@ -4,8 +4,7 @@ from lxml import etree from basicreplacements import basicreplacements -from cleanCHILDEStokens import cleantext -from corrector import getcorrections, mkuttwithskips +from corrector import getcorrections, mkuttwithskips, disambiguationdict from lexicon import de, dets, known_word from metadata import (Meta, bpl_delete, bpl_indeze, bpl_node, bpl_none, bpl_word, bpl_wordlemma) @@ -17,8 +16,13 @@ deletewordnodes, find1, getattval, getbeginend, getcompoundcount, getnodeyield, getsentid, gettokposlist, getyield, myfind, showflatxml, - simpleshow, transplant_node) + simpleshow, transplant_node, showtree, treeinflate, fatparse, treewithtokenpos, + updatetokenpos, getuttid) from config import PARSE_FUNC, SDLOGGER +from metadata import insertion +from sastatoken import inflate, deflate, tokeninflate, insertinflate +from CHAT_Annotation import omittedword +from cleanCHILDEStokens import cleantext ampersand = '&' @@ -123,61 +127,52 @@ def contextualise(node1, node2): newnode.attrib[prop] = node2.attrib[prop] return newnode +def updatemetadata(metadata, tokenposdict): + begintokenposdict = {k-1: v-1 for (k, v) in tokenposdict.items()} + newmetadata = [] + for meta in metadata: + newmeta = deepcopy(meta) + newmeta.annotationposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos) for pos in meta.annotationposlist] + newmeta.annotatedposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos) for pos in meta.annotatedposlist] + newmetadata.append(newmeta) + return newmetadata -def updatetokenpos(resulttree, tokenposdict): - 
# resulttree = deepcopy(stree) - for child in resulttree: - newchild = updatetokenpos(child, tokenposdict) - if ('pt' in resulttree.attrib or 'pos' in resulttree.attrib) and 'end' in resulttree.attrib and 'begin' in resulttree.attrib: - intend = int(resulttree.attrib['end']) - if intend in tokenposdict: - newendint = tokenposdict[intend] - resulttree.attrib['end'] = str(newendint) - resulttree.attrib['begin'] = str(newendint - 1) - else: - SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend)) - etree.dump(resulttree) - SDLOGGER.error('tokenposdict={}'.format(tokenposdict)) - elif 'cat' in resulttree.attrib: - children = [ch for ch in resulttree] - (b, e) = getbeginend(children) - resulttree.attrib['begin'] = b - resulttree.attrib['end'] = e +def updatetokenposmd(intree, metadata, tokenposdict): + resulttree = updatetokenpos(intree, tokenposdict) + newmetadata = updatemetadata(metadata, tokenposdict) + return resulttree, newmetadata - return resulttree def findskippednodes(stree, tokenlist): + debug = False + if debug: + showtree(stree, text='findskippednodes:stree:') topnode = find1(stree, './/node[@cat="top"]') - # tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))} - tokenposdict = {} - elctr = 0 - i = 0 - for tok in tokenlist: - elctr += 1 - if not tok.skip: - tokenposdict[elctr] = i + 1 - i += 1 - resultlist = findskippednodes2(topnode, tokenposdict) + #tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))} + tokenposset = {t.pos + 1 for t in tokenlist if not t.skip} + resultlist = findskippednodes2(topnode, tokenposset) return resultlist -def findskippednodes2(stree, tokenposdict): +def findskippednodes2(stree, tokenposset): resultlist = [] if stree is None: return resultlist if 'pt' in stree.attrib or 'pos' in stree.attrib: - if int(stree.attrib['end']) not in tokenposdict: + if int(stree.attrib['end']) not in tokenposset: resultlist.append(stree) elif 'cat' in stree.attrib: for child in stree: - resultlist += findskippednodes2(child, tokenposdict) + resultlist += findskippednodes2(child, tokenposset) else: pass return resultlist -def insertskips(newstree, tokenlist, stree): + + +def insertskips(newstree, tokenlist, stree): ''' :param newstree: the corrected tree, with skipped elements absent @@ -185,58 +180,81 @@ def insertskips(newstree, tokenlist, stree): :param stree: original stree with parses of the skipped elements :return: adapted tree, with the skipped elements inserted (node from the original stree as -- under top, begin/ends updates ''' - # debug = True debug = False if debug: - print('\nnewstree:') - etree.dump(newstree) - resulttree = deepcopy(newstree) + showtree(newstree, 'newstree:') + showtree(stree, 'stree') + reducedtokenlist = [t for t in tokenlist if not t.skip] + resulttree = treewithtokenpos(newstree, reducedtokenlist) + + if debug: + showtree(resulttree, text='resulttree:') + streetokenlist = [ t for t in tokenlist if t.subpos == 0] + stree = treewithtokenpos(stree, streetokenlist) + if debug: + showtree(stree, text='stree with tokenpos:') + debug = False # tokenpostree = deepcopy(stree) # update begin/ends - reducedtokenlist = [t for t in tokenlist if not t.skip] - tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))} - resulttree = updatetokenpos(resulttree, tokenposdict) + #next not needed anymore + #tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))} + #showtree(resulttree, text='in: ') + 
#resulttree, newmetadata = updatetokenposmd(resulttree, metadata, tokenposdict) + #showtree(resulttree, text='out:') # tokenpostree = updatetokenpos(tokenpostree, tokenposdict) - if debug: - print('\nstree:') - etree.dump(stree) - # print('\ntokenpostree:') - # etree.dump(tokenpostree) - print('\nresulttree:') - etree.dump(resulttree) + #if debug: + # print('\nstree:') + # etree.dump(stree) + # # print('\ntokenpostree:') + # # etree.dump(tokenpostree) + # print('\nresulttree:') + # etree.dump(resulttree) # insert skipped elements nodestoinsert = findskippednodes(stree, tokenlist) nodestoinsertcopies = [deepcopy(n) for n in nodestoinsert] - # simpleshow(stree) + if debug: + showtree(stree, text='insertskips: stree:') + if debug: + showtree(resulttree, text='insertskips: resulttree:') topnode = find1(resulttree, './/node[@cat="top"] ') topchildren = [ch for ch in topnode] allchildren = nodestoinsertcopies + topchildren sortedchildren = sorted(allchildren, key=lambda x: x.attrib['end'], reverse=True) - # simpleshow(stree) + if debug: + showtree(resulttree, text='insertskips: resulttree:') for ch in topnode: topnode.remove(ch) - # simpleshow(stree) + if debug: + showtree(resulttree, text='insertskips: resulttree:') for node in sortedchildren: node.attrib['rel'] = '--' # these are now extragrammatical with relation -- topnode.insert(0, node) - # simpleshow(stree) + if debug: + showtree(resulttree, text='insertskips: resulttree:') (b, e) = getbeginend(sortedchildren) topnode.attrib['begin'] = b topnode.attrib['end'] = e - # simpleshow(stree) + if debug: + showtree(resulttree, text='insertskips: resulttree:') sentlist = getyield(resulttree) sent = space.join(sentlist) sentnode = find1(resulttree, 'sentence') sentnode.text = sent if debug: - print('result of insertskips') - etree.dump(resulttree) + showtree(resulttree, 'result of insertskips') return resulttree +def getomittedwordbegins(metalist): + results = [] + for meta in metalist: + if meta.name == omittedword: + results += meta.annotatedposlist + return results + def correct_stree(stree, method, corr): ''' @@ -255,7 +273,7 @@ def correct_stree(stree, method, corr): print(showflatxml(stree)) allmetadata = [] - allorandalts = [] + orandalts = [] # uttid: uttid = getuttid(stree) @@ -266,7 +284,7 @@ def correct_stree(stree, method, corr): origutt = getorigutt(stree) if origutt is None: SDLOGGER.error('Missing origutt in utterance {}'.format(uttid)) - return stree + return stree, orandalts # list of token positions # get the original metadata; these will be added later to the tree of each correction @@ -282,19 +300,33 @@ def correct_stree(stree, method, corr): # allmetadata += origmetadata # clean in the tokenized manner - cleanutt, chatmetadata = cleantext(origutt, False) + cleanutttokens, chatmetadata = cleantext(origutt, False, tokenoutput=True) allmetadata += chatmetadata - cleanutttokens = sasta_tokenize(cleanutt) + #cleanutttokens = sasta_tokenize(cleanutt) cleanuttwordlist = [t.word for t in cleanutttokens] + cleanutt = space.join(cleanuttwordlist) - # get corrections, given the stree + # get corrections, given the inflated stree + #inflate the tree + fatstree = deepcopy(stree) + treeinflate(fatstree) + # adapt the begins and ends in the tree based on the token positions + debug = False + if debug: + showtree(fatstree, text='fatstree voor:') + tokenlist = [t for t in cleanutttokens] + fatstree = treewithtokenpos(fatstree, tokenlist) + if debug: + showtree(fatstree, text='fatstree na:') + debug = False + #(fatstree, text='fattened tree:') - 
ctmds = getcorrections(cleanutt, method, stree) + ctmds = getcorrections(cleanutttokens, method, fatstree) + debug = False if debug: - print('2:', end=': ') - simpleshow(stree) - print(showflatxml(stree)) + showtree(fatstree, text='2:') + debug = False ptmds = [] for correctiontokenlist, cwmdmetadata in ctmds: @@ -302,70 +334,89 @@ def correct_stree(stree, method, corr): correctionwordlist = tokenlist2stringlist(correctiontokenlist, skip=True) # parse the corrections - if correctionwordlist != cleanuttwordlist: - # @@@adapt this, skip the tokens to be skipped@@@ - # correction = space.join(correctionwordlist) + if correctionwordlist != cleanuttwordlist and correctionwordlist != []: correction, tokenposlist = mkuttwithskips(correctiontokenlist) cwmdmetadata += [Meta('parsed_as', correction, cat='Correction', source='SASTA')] - newstree = PARSE_FUNC(correction) - if newstree is None: - newstree = stree # is this what we want?@@ + reducedcorrectiontokenlist = [token for token in correctiontokenlist if not token.skip] + fatnewstree = fatparse(correction, reducedcorrectiontokenlist) + debugb = False + if debugb: + showtree(fatnewstree, text='fatnewstree') + + if fatnewstree is None: + fatnewstree = fatstree # is this what we want?@@ else: # insert the leftout words and adapt the begin/ends of the nodes # simpleshow(stree) - newstree = insertskips(newstree, correctiontokenlist, stree) + fatnewstree = insertskips(fatnewstree, correctiontokenlist, fatstree) + #newstree = insertskips(newstree, correctiontokenlist, stree) # simpleshow(stree) mdcopy = deepcopy(origmetadata) - newstree.insert(0, mdcopy) + fatnewstree.insert(0, mdcopy) # copy the sentid attribute - sentencenode = getsentencenode(newstree) + sentencenode = getsentencenode(fatnewstree) if sentencenode is not None: sentencenode.attrib['sentid'] = sentid - if debug: - print(etree.tostring(newstree, pretty_print=True)) - # etree.dump(newstree) + if debugb: + showtree(fatnewstree) + # etree.dump(fatnewstree) else: # make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata - newstree = add_metadata(stree, chatmetadata) + fatnewstree = add_metadata(fatstree, chatmetadata) - ptmds.append((correctionwordlist, newstree, cwmdmetadata)) + ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata)) # select the stree for the most promising correction + debug = False if debug: print('3:', end=': ') - simpleshow(stree) - print(showflatxml(stree)) + showtree(fatnewstree) + debug = False if ptmds == []: - thecorrection, orandalts = (cleanutt, stree, origmetadata), None + thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None elif corr in [corr1, corrn]: - thecorrection, orandalts = selectcorrection(stree, ptmds, corr) + thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr) else: SDLOGGER.error('Illegal correction value: {}. 
No corrections applied'.format(corr)) - thecorrection, orandalts = (cleanutt, stree, origmetadata), None + thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None thetree = deepcopy(thecorrection[1]) - if debug: - print('4:', end=': ') - simpleshow(stree) - print(showflatxml(stree)) + #debuga = True + debuga = False + if debuga: + print('4: (fatstree)') + etree.dump(fatstree, pretty_print=True) # do replacements in the tree - # etree.dump(thetree) + if debuga: + print('4b: (thetree)') + etree.dump(thetree, pretty_print=True) reverseposindex = gettokposlist(thetree) + if debuga: + print('4b: (thetree)') + etree.dump(thetree, pretty_print=True) + # resultposmeta = selectmeta('cleanedtokenpositions', allmetadata) # resultposlist = resultposmeta.value newcorrection2 = thecorrection[2] nodes2deletebegins = [] + # next adapted, the tree is fat already + debug = False + if debug: + showtree(thetree, text='thetree before treewithtokenpos') + thetree = treewithtokenpos(thetree, correctiontokenlist) + if debug: + showtree(thetree, text='thetree after treewithtokenpos') for meta in thecorrection[2]: if meta.backplacement == bpl_node: nodeend = meta.annotationposlist[-1] + 1 newnode = myfind(thetree, './/node[@pt and @end="{}"]'.format(nodeend)) - oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend)) + oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend)) if newnode is not None and oldnode is not None: # adapt oldnode1 for contextual features contextoldnode = contextualise(oldnode, newnode) @@ -374,7 +425,7 @@ def correct_stree(stree, method, corr): nodeend = meta.annotationposlist[-1] + 1 nodexpath = './/node[@pt and @begin="{}" and @end="{}"]'.format(nodeend - 1, nodeend) newnode = myfind(thetree, nodexpath) - oldnode = myfind(stree, nodexpath) + oldnode = myfind(fatstree, nodexpath) if newnode is not None and oldnode is not None: if 'word' in newnode.attrib and 'word' in oldnode.attrib: newnode.attrib['word'] = oldnode.attrib['word'] @@ -403,28 +454,39 @@ def correct_stree(stree, method, corr): elif meta.backplacement == bpl_indeze: nodebegin = meta.annotatedposlist[-1] nodeend = nodebegin + 1 - oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend)) + oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend)) if oldnode is not None: nodeid = oldnode.attrib['id'] dezeAVnode = etree.fromstring(dezeAVntemplate.format(begin=nodebegin, end=nodeend, id=nodeid)) thetree = transplant_node(oldnode, dezeAVnode, thetree) - # etree.dump(thetree, pretty_print=True) + #etree.dump(thetree, pretty_print=True) + + # now do all the deletions at once, incl adaptation of begins and ends, and new sentence node + debug = False + if debug: + showtree(thetree, text='thetree before deletion:') - # now do all the deletions at once, incl normalisation of begins and ends, and new sentence node + nodes2deletebegins = [int(b) for b in nodes2deletebegins] thetree = deletewordnodes(thetree, nodes2deletebegins) + if debug: + showtree(thetree, text='thetree after deletion:') + + debug = False + # adapt the metadata cleantokposlist = [meta.annotationwordlist for meta in newcorrection2 if meta.name == 'cleanedtokenpositions'] cleantokpos = cleantokposlist[0] if cleantokposlist != [] else [] - newcorrection2 = [updatecleantokmeta(meta, nodes2deletebegins, cleantokpos) for meta in newcorrection2] + insertbegins = [meta.annotatedposlist for meta in newcorrection2 if meta.name == insertion ] + flatinsertbegins = [str(v) for el in insertbegins for v in el] + 
purenodes2deletebegins = [str(v) for v in nodes2deletebegins if str(v) not in flatinsertbegins] + newcorrection2 = [updatecleantokmeta(meta, purenodes2deletebegins, cleantokpos) for meta in newcorrection2] - # etree.dump(thetree, pretty_print=True) + #etree.dump(thetree, pretty_print=True) if debug: - print('5:', end=': ') - simpleshow(stree) - print(showflatxml(stree)) + showtree(fatstree, text='5:') restoredtree = thetree @@ -451,12 +513,19 @@ def correct_stree(stree, method, corr): metadata.append(meta.toElement()) if debug: - streesentlist = getyield(stree) + streesentlist = getyield(fatstree) fulltreesentlist = getyield(fulltree) if streesentlist != fulltreesentlist: SDLOGGER.warning('Yield mismatch\nOriginal={original}\nAfter correction={newone}'.format(original=streesentlist, newone=fulltreesentlist)) - + rawoldleavenodes = getnodeyield(fatstree) + omittedwordbegins = getomittedwordbegins(newcorrection2) + oldleavenodes = [n for n in rawoldleavenodes if int(getattval(n, 'begin')) not in omittedwordbegins] + oldleaves = [ getattval(n, 'word') for n in oldleavenodes] + newleaves = getyield(fulltree) + uttid = getuttid(stree) + if debug and oldleaves != newleaves: + SDLOGGER.error('Yield mismatch:{uttid}\n:OLD={oldleaves}\nNEW={newleaves}'.format(uttid=uttid, oldleaves=oldleaves, newleaves=newleaves)) # return this stree # print('dump 2:') # etree.dump(fulltree, pretty_print=True) @@ -487,7 +556,7 @@ def updatecleantokmeta(meta, begins, cleantokpos): return meta -def getuttid(stree): +def oldgetuttid(stree): uttidlist = stree.xpath(uttidxpath) if uttidlist == []: SDLOGGER.error('Missing uttid') @@ -507,14 +576,14 @@ def getorigutt(stree): def scorefunction(obj): return (-obj.unknownwordcount, -obj.dpcount, -obj.dhyphencount, obj.goodcatcount, - -obj.basicreplaceecount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount, + -obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount, obj.compoundcount, obj.sucount, obj.svaok, -obj.deplusneutcount, -obj.penalty) class Alternative(): def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcount, compcount, supcount, compoundcount, unknownwordcount, sucount, svaok, deplusneutcount, goodcatcount, - hyphencount, basicreplaceecount): + hyphencount, basicreplaceecount, ambigcount): self.stree = stree self.altid = altid self.altsent = altsent @@ -532,6 +601,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcou self.goodcatcount = int(goodcatcount) self.hyphencount = int(hyphencount) self.basicreplaceecount = int(basicreplaceecount) + self.ambigcount = int(ambigcount) def alt2row(self, uttid, base, user1='', user2='', user3='', bestaltids=[], selected=None, origsent=None): scores = ['BEST'] if self.altid in bestaltids else [] @@ -651,6 +721,12 @@ def isvalidword(w): return True +def countambigwords(stree): + leaves = getnodeyield(stree) + ambignodes = [leave for leave in leaves if getattval(leave, 'word').lower() in disambiguationdict] + result = len(ambignodes) + return result + def selectcorrection(stree, ptmds, corr): # to be implemented@@ # it is presupposed that ptmds is not [] @@ -677,9 +753,10 @@ def selectcorrection(stree, ptmds, corr): hyphencount = len([node for node in nt.xpath('.//node[contains(@word, "-")]')]) basicreplaceecount = len([node for node in nt.xpath('.//node[@word]') if getattval(node, 'word').lower() in basicreplacements]) + ambigwordcount = countambigwords(nt) alt = Alternative(stree, altid, altsent, penalty, dpcount, 
dhyphencount, dimcount, compcount, supcount, compoundcount, unknownwordcount, sucount, svaokcount, deplusneutcount, goodcatcount, - hyphencount, basicreplaceecount) + hyphencount, basicreplaceecount, ambigwordcount) alts[altid] = alt altid += 1 orandalts = OrigandAlts(orig, alts) diff --git a/find_ngram.py b/find_ngram.py index e4a661c..1f1e41f 100644 --- a/find_ngram.py +++ b/find_ngram.py @@ -192,6 +192,8 @@ def cond17(ns, lvs, i): return lemma(ns[0]) == 'te' and getattval(ns[1], 'his') def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen' and lemma(ns[2]) == 'te' +def cond18(ns, lvs, i): return pt(ns[0]) == 'vz' and lemma(ns[1]) in {'dit', 'dat', 'deze', 'die'} + ngram1 = Ngram(4, cond1) ngram2 = Ngram(4, cond2) ngram3 = Ngram(2, cond3) @@ -211,7 +213,7 @@ def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen' ngram16a = Ngram(4, cond16a) # geen beroerte een beroerte test ngram17 = Ngram(4, cond17) # te kregen te krijgen ngram17a = Ngram(4, cond17a) # te kregen te krijgen test - +ngram18 = Ngram(2, cond18) # met dit def main(): @@ -231,7 +233,7 @@ def main(): leaves = getnodeyield(tree) cleanleaves = [leave for leave in leaves if getattval(leave, 'word') not in filledpauseslexicon] cleanwordlist = [getattval(leave, 'word') for leave in cleanleaves] - matches = findmatches(ngram1, cleanleaves) + matches = findmatches(ngram18, cleanleaves) # matches = sipvjpvjsi(cleanleaves, tree) for match in matches: uttid = getuttid(tree) diff --git a/lexicon.py b/lexicon.py index 9692346..380e1dc 100644 --- a/lexicon.py +++ b/lexicon.py @@ -12,6 +12,9 @@ lexicon = celex +#Alpino often analyses certain words as tsw though they should be analysed as nouns +tswnouns = ['baby', 'jongen', 'juf', 'jufforouw', 'mam', 'mama', 'mamma', 'meisje', 'mens', 'meneer', 'mevrouw', + 'pap', 'papa', 'pappa', 'stouterd', 'opa', 'oma'] de = '1' het = '2' diff --git a/macros/newimperatives.txt b/macros/newimperatives.txt index 2ab72e1..490a40f 100644 --- a/macros/newimperatives.txt +++ b/macros/newimperatives.txt @@ -54,8 +54,11 @@ nonfinvc = """(@rel="vc" and %nonfincat%) """ realcomplormodnode = """node[%realcomplormod%]""" realcomplormod = """(not(%particlesvp%) and not(%indexnode%) and not(%nonfinvc%) and not(@rel="hd"))""" indexnode = """(@index and not (@cat or @pt or @pos))""" +suindexnode = """(%indexnode% and @rel="su") """ nonfinindexnode = """(%indexnode% and parent::node[%nonfinvc%])""" +fillednode = """node[not(%indexnode%)]""" + particlesvp = """(@rel="svp" and @pt="vz")""" realcomplormodnodecount = """count(%realcomplormodnode% | node[%nonfinvc%]/%realcomplormodnode%)""" @@ -94,9 +97,23 @@ wond5plus = """(%ynquery% and %realcomplormodnodecount% >= 4)""" partofwhquestion = """((@cat="sv1" or @cat="ssub") and @rel="body" and parent::node[@cat="whq" or @cat="whsub" ]) """ declarative = """(@cat="smain" or (@cat="ssub" and not(%partofwhquestion%)) or (@cat="sv1" and not(%basicimperative%) and not(%ynquery%) and not(%partofwhquestion%)) )""" -Tarsp_OndWB = """ -(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 ) -""" +Tarsp_OndB = """(%Ond% and node[%Tarsp_Basic_B%] and count(node) = 2)""" + +Tarsp_OndVC = """(%Ond% and node[%Tarsp_Basic_VC%] and count(node) = 2) """ + +Tarsp_OndBVC = """(%Ond% and node[%Tarsp_Basic_B%] and node[%Tarsp_Basic_VC%] and count(node) = 3) """ + +Tarsp_OndW = """(%declarative% and %Ond% and (%Tarsp_W% or node[%Tarsp_onlyWinVC%]) and %realcomplormodnodecount% = 0 )""" + +Tarsp_onlyWinVC = 
"""(@rel="vc" and node[@rel="hd" and @pt="ww" and %realcomplormodnodecount% = 0])""" + + +Tarsp_OndWB = """(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 )""" + +Tarsp_BasicVCW = """(node[@pt="ww" and @rel="hd"] and node[%Tarsp_Basic_VC%] and count(%fillednode%)=2)""" + +Tarsp_VCW_X = """(%Tarsp_BasicVCW% or (node[%nonfinvc% and %Tarsp_BasicVCW%] and count(node)=1) )""" + Tarsp_OndWBVC = """ (%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %Tarsp_VC_X% and %realcomplormodnodecount% = 3 ) @@ -180,6 +197,8 @@ Tarsp_Ov3 = """(%declarative% and not(%Tarsp_OndWB%) and not(%Tarsp_BBX%)and not(%Tarsp_WBVC%) and + not(%Tarsp_OndB%) and + not(%Tarsp_OndVC%) and %realcomplormodnodecount% = 2) """ @@ -190,11 +209,12 @@ Tarsp_kijkVU = """(@pt="ww" and @lemma="kijken" and @wvorm="pv" and @pvagr="ev" Tarsp_pporvc = """ (((@rel="pc" or @rel="mod" or @rel="ld") and @cat="pp") or @rel="vc")""" -Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"]) and +Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"] or %Tarsp_BarenonfinW%) and not(%Tarsp_kijkVU%) and not((@lemma="zijn" or @lemma="worden") and parent::node[node[@rel="vc"]]) )""" - + +Tarsp_BarenonfinW = """parent::node[@rel="vc" and parent::node[@cat="smain" and count(node)=1]]""" Tarsp_Hwwi = """(( @pt="ww" and @rel="hd" and @wvorm="pv" and %Tarsp_hww% and diff --git a/macros/sastamacros1.txt b/macros/sastamacros1.txt index e976d0c..5034941 100644 --- a/macros/sastamacros1.txt +++ b/macros/sastamacros1.txt @@ -37,9 +37,9 @@ JO_kijken_naar = """ parent::node[@cat="pp" and robusttopicdrop = """(@cat="sv1" and ../node[@lemma="."])""" Tarsp_hww = """ - (@lemma="kunnen" or + (@lemma = "kunnen" or @lemma = "moeten" or - @lemma= "hoeven" or + @lemma = "hoeven" or @lemma = "blijven" or @lemma = "willen" or @lemma = "zullen" or @@ -59,6 +59,7 @@ Tarsp_vc_sibling = """parent::node[ node[@rel="vc"]]""" Tarsp_predc_sibling = """parent::node[ node[@rel="predc"]]""" Tarsp_obj1_sibling = """parent::node[ node[@rel="obj1"]]""" Tarsp_ld_sibling = """parent::node[ node[@rel="ld"]]""" +Tarsp_onlymodR_sibling = """(parent::node[node[@rel="mod" and %Rpronoun%] and not(node[@rel="predc"])])""" Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and (( @@ -66,7 +67,7 @@ Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and @lemma = "hebben" ) and not(%Tarsp_vc_sibling%)) or - (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling%) + (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling% ) ) """ @@ -78,7 +79,7 @@ Tarsp_Kop = """ ((%Tarsp_predc_sibling% and not(%Tarsp_obj1_sibling%)) or - (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%)) + (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%) and not(%Tarsp_onlymodR_sibling%)) ) ) """ @@ -128,6 +129,13 @@ pv = """(@pt="ww" and @wvorm="pv" )""" bxnp1 = """(@cat="np" and count(node)=2 and node[@rel="hd" and @pt="ww"] and node[@rel="mod" and @pt])""" bxnp2 = """(@cat="np" and count(node)=2 and node[@rel="hd"] and node[@rel="mod" and %singlewordbw%])""" +Tarsp_Basic_VC = """((@rel="obj1" or @rel="pc" or @rel="predc" or @rel="ld" or @rel="obj2" or %Tarsp_finvc% or %Tarsp_vcvnw% or (@rel="svp" and @pt!="vz")) and not(%Tarsp_Basic_B%) )""" + + +Tarsp_Basic_B = """(@rel="mod" or @rel="ld" or @rel="predm" or %Tarsp_B_predc%) """ + +Tarsp_B_predc = """(@rel=predc and (@pt="vz" or @pt="bw" or @cat="pp" or @cat="advp" or %Rpronoun%))""" + Tarsp_B = """( ((((@rel="mod" or 
@rel="ld" or @rel="predm") and (not(@cat) or @cat!="conj") and @@ -169,9 +177,13 @@ pobj1B = """(@rel="pc" and ../node[@rel="hd" and %locverb%])""" singlewordbw = """ (@pt="bw" or %Rpronoun% or %adjadv%) """ + + corephrase = """(@cat="np" or @cat="pp" or @cat="advp" or @cat="ap")""" -coreBX = """(node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!=niet]/@begin ]])""" +coreBX = """((node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!=niet]/@begin ]]) )""" + +Tarsp_bnonfin = """((@cat="inf" or @cat="ppart") and @rel="vc" and parent::node[@cat="smain" and count(node)=1] and node[%Tarsp_B%] and node[@pt="ww" and @rel="hd"] and count(node[%realcomplormod%])=1 )""" ASTA_pred = """(@rel="predc" or @rel="predm" or (@rel="hd" and parent::node[@rel="predc" or @rel="predm"]))""" @@ -301,7 +313,12 @@ spec_noun = """ (@pt="spec" and (@pos="name" or starts-with(@frame,"proper_name" """ - asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or (@pt="ww" and @positie="nom") or (%monthname%) or @pos="name") + asta_numvrij = """(@pt="tw" and @positie="vrij" and @rel!="mwp" and @rel!="det" and @rel!="mod" )""" + + asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or + (@pt="ww" and @positie="nom") or + (%monthname%) or + @pos="name" ) """ @@ -479,3 +496,11 @@ robustdelpv = """(not(@rel="dp" and @begin > ancestor::node[@cat="top"]/descenda delpv = """(%coredelpv% and %robustdelpv%)""" +Vobij = """(@pt="bw" and (contains(@frame,"er_adverb" ) or contains(@frame, "tmp_adverb") or @lemma="daarom") and +@lemma!="er" and @lemma!="daar" and @lemma!="hier" and (starts-with(@lemma, 'er') or starts-with(@lemma, 'daar') or starts-with(@lemma, 'hier')))""" + +Tarsp_VzN = """(%vzn1xpath% or %vzn2xpath% ) """ + +vzn1xpath = """(@cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"])))""" +vzn2xpath = """(node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"])""" +vzn3xpath = """(@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] )""" diff --git a/metadata.py b/metadata.py index 2760f14..0205aaa 100644 --- a/metadata.py +++ b/metadata.py @@ -17,7 +17,8 @@ class Meta: def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], annotatedposlist=[], - annotatedwordlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty, + annotatedwordlist=[], annotationcharlist=[], annotationcharposlist=[], annotatedcharlist=[], + annotatedcharposlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty, backplacement=defaultbackplacement): self.atype = atype self.name = name @@ -25,6 +26,10 @@ def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], ann self.annotationposlist = annotationposlist self.annotatedwordlist = annotatedwordlist self.annotatedposlist = annotatedposlist + self.annotationcharlist = annotationcharlist + self.annotationcharposlist = annotationcharposlist + self.annotatedcharlist = annotatedcharlist + self.annotatedcharposlist = annotatedcharposlist self.value = value self.cat = cat self.subcat = subcat @@ -93,3 +98,7 @@ def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalt repetition = 
'Repetition' fstoken = 'Retraced token' falsestart = 'Retracing with Correction' +insertion = 'Insertion' +smallclause = 'Small Clause Treatment' +tokenmapping = 'Token Mapping' +insertiontokenmapping = 'Insertion Token Mapping' \ No newline at end of file diff --git a/methods/ASTA Index Current.xlsx b/methods/ASTA Index Current.xlsx index b575d44..ad598d8 100644 Binary files a/methods/ASTA Index Current.xlsx and b/methods/ASTA Index Current.xlsx differ diff --git a/methods/TARSP Index 2022-01-07.xlsx b/methods/TARSP Index 2022-01-07.xlsx new file mode 100644 index 0000000..1547465 Binary files /dev/null and b/methods/TARSP Index 2022-01-07.xlsx differ diff --git a/methods/TARSP Index Current.xlsx b/methods/TARSP Index Current.xlsx index 75e9075..6b00d71 100644 Binary files a/methods/TARSP Index Current.xlsx and b/methods/TARSP Index Current.xlsx differ diff --git a/methods/~$TARSP Index Current.xlsx b/methods/~$TARSP Index Current.xlsx new file mode 100644 index 0000000..8a7c89f Binary files /dev/null and b/methods/~$TARSP Index Current.xlsx differ diff --git a/mismatches.py b/mismatches.py index 214b938..9686ae5 100644 --- a/mismatches.py +++ b/mismatches.py @@ -1,10 +1,11 @@ - import os from collections import Counter from copy import copy from lxml import etree from config import SDLOGGER from treebankfunctions import getyield, getmarkedyield, getattval +from sastatoken import deflate + tab = '\t' space = ' ' eps = '' @@ -13,6 +14,7 @@ usercommentuntil = 3 usercommentdefaultvalue = eps + def getmarkedutt(m, syntree): thewordlist = getyield(syntree) thepositions = getwordpositions(m, syntree) @@ -20,10 +22,12 @@ def getmarkedutt(m, syntree): yieldstr = space.join(themarkedyield) return yieldstr + def mark(str): - result = '*'+ str + '*' + result = '*' + str + '*' return result + def getwordpositionsold(matchtree, syntree): positions1 = [] for node in matchtree.iter(): @@ -35,7 +39,7 @@ def getwordpositionsold(matchtree, syntree): for node in syntree.iter(): if 'index' in node.attrib and ('pt' in node.attrib or 'cat' in node.attrib or 'pos' in node.attrib): theindex = node.attrib['index'] - indexednodes[theindex]=node + indexednodes[theindex] = node thequery2 = ".//node[@index and not(@pt) and not(@cat)]" try: @@ -49,8 +53,9 @@ def getwordpositionsold(matchtree, syntree): result = [int(p) for p in positions] return result + def getwordpositions(matchtree, syntree): - #nothing special needs to be done for index nodes since they also have begin and end + # nothing special needs to be done for index nodes since they also have begin and end positions = [] for node in matchtree.iter(): if 'end' in node.attrib: @@ -58,6 +63,7 @@ def getwordpositions(matchtree, syntree): result = [int(p) for p in positions] return result + def getfirstwordposition(matchtree): if 'begin' in matchtree.attrib: positionstr = getattval(matchtree, 'begin') @@ -67,7 +73,6 @@ def getfirstwordposition(matchtree): return position - def getmarkedyield(wordlist, positions): pos = 1 resultlist = [] @@ -102,8 +107,23 @@ def mismatches(queryid, queries, theresultsminusgold, goldminustheresults, allma uttstr] print(tab.join(platinumcheckrow2), file=platinumcheckfile) -def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile, permsilverdatadict={}, annotationinput=False): +def getmarkposition(position, nodeendmap, uttid): + if position == 0: + result = 1 + elif uttid in nodeendmap: + if str(position) in nodeendmap[uttid]: + result = nodeendmap[uttid][str(position)] + else: + 
SDLOGGER.error('getmarkposition: No mapping found for position {} in utterance {}'.format(position, uttid)) + result = 1 + else: + SDLOGGER.error('getmarkposition: No mappings found for uttid {}'.format(uttid)) + result = 1 + return result + +def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile, + permsilverdatadict={}, annotationinput=False): theexactresults = exactresults[queryid] if queryid in exactresults else Counter() theexactgoldscores = exactgoldscores[queryid] if queryid in exactgoldscores else Counter() (theresultsminusgold, goldminustheresults, intersection) = exactcompare(theexactresults, theexactgoldscores) @@ -117,13 +137,13 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, markedwordlist = getmarkedyield(allutts[uttid], [markposition]) uttstr = space.join(markedwordlist) platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, - str(uttid), str(position), uttstr] + str(uttid), str(markposition), uttstr] print(tab.join(platinumcheckrow1), file=platinumcheckfile) key = (queryid, uttid, position) usercomments = getusercomments(permsilverdatadict, key, report=True) - xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1 + xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1 newrows.append(xlplatinumcheckrow1) - #for (m, syntree) in allmatches[(queryid, uttid)]: + # for (m, syntree) in allmatches[(queryid, uttid)]: # if getfirstwordposition(m) == position: # markedutt = getmarkedutt(m, syntree) # platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, @@ -139,9 +159,11 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, markedwordlist = getmarkedyield(allutts[uttid], [markposition]) uttstr = space.join(markedwordlist) else: - SDLOGGER.warning('uttid {} not in alluts'.format(uttid)) + SDLOGGER.warning('uttid {} not in allutts'.format(uttid)) uttstr = "" - platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid), str(position), + markposition = 0 + platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid), + str(markposition), uttstr] print(tab.join(platinumcheckrow2), file=platinumcheckfile) key = (queryid, uttid, position) @@ -150,6 +172,7 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, newrows.append(xlplatinumcheckrow2) return newrows + def compareunaligned(resultctr, goldctr): ''' @@ -168,20 +191,21 @@ def compareunaligned(resultctr, goldctr): takefromresultlist.append((utt1, pos1)) takefromgoldlist.append((utt1, 0)) newintersection.append((utt1, pos1)) - curgoldlist.remove((utt1,0)) + curgoldlist.remove((utt1, 0)) elif pos1 == 0: for (utt2, pos2) in curgoldlist: if utt1 == utt2: takefromresultlist.append((utt1, pos1)) takefromgoldlist.append((utt1, pos2)) newintersection.append((utt1, pos2)) - curgoldlist.remove((utt2,pos2)) + curgoldlist.remove((utt2, pos2)) break takefromresultctr = Counter(takefromresultlist) takefromgoldctr = Counter(takefromgoldlist) newintersectionctr = Counter(newintersection) return (takefromresultctr, takefromgoldctr, newintersectionctr) + def exactcompare(exactresults, exactgoldscores): ''' compares two lists of exact results, i.e. 
dlists of pairs (uttid, position) @@ -227,18 +251,21 @@ def getusercomments(permsilverdict, key, report=False): SDLOGGER.warning('No silver remark for key: {}'.format(key)) return result + def testcompare(): - testresults = [(1,2),(1,2), (1,2), (1,5), (1,6),(2,0), (2, 4)] - goldresults = [(1,2), (2,4), (2,6), (1,0), (3,5)] - reftestminusgold = [(1,2), (1,5), (1,6)] - refgoldminustest = [(3,5)] - refintersection = [(1,2), (1,2), (2,4), (2,6)] + testresults = [(1, 2), (1, 2), (1, 2), (1, 5), (1, 6), (2, 0), (2, 4)] + goldresults = [(1, 2), (2, 4), (2, 6), (1, 0), (3, 5)] + reftestminusgold = [(1, 2), (1, 5), (1, 6)] + refgoldminustest = [(3, 5)] + refintersection = [(1, 2), (1, 2), (2, 4), (2, 6)] (testminusgold, goldminustest, intersection) = exactcompare(testresults, goldresults) - for (l, r,g ) in zip(['R-G', 'G-R', 'R*G'],[testminusgold, goldminustest, intersection],[reftestminusgold, refgoldminustest, refintersection]): + for (l, r, g) in zip(['R-G', 'G-R', 'R*G'], [testminusgold, goldminustest, intersection], + [reftestminusgold, refgoldminustest, refintersection]): if r == g: - print('{}: OK {} == {}'.format(l, r,g)) + print('{}: OK {} == {}'.format(l, r, g)) else: - print('{}: NO: {} != {}'.format(l, r,g)) + print('{}: NO: {} != {}'.format(l, r, g)) + if __name__ == '__main__': - testcompare() \ No newline at end of file + testcompare() diff --git a/queryfunctions.py b/queryfunctions.py index 954a618..c0412e6 100644 --- a/queryfunctions.py +++ b/queryfunctions.py @@ -7,7 +7,7 @@ vzn1basexpath = './/node[ @cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"]))]' vzn1xpath = expandmacros(vzn1basexpath) vzn2xpath = './/node[node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"]]' -vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin=../node[@pt="vz"]/@end and count(node)<=3] ]' +vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] ]' #vzn4basexpath = './/node[node[@pt="vz" and @rel="hd" and ../node[%Rpronoun% and @rel="obj1" and @end <= ../node[@rel="hd"]/@begin]]]' #vzn4xpath = expandmacros(vzn4basexpath) diff --git a/readcsv.py b/readcsv.py index 1cc5694..a6f9ecb 100644 --- a/readcsv.py +++ b/readcsv.py @@ -6,10 +6,10 @@ mysep = tab -def readcsv(filename, sep=mysep, header=True, quotechar='"'): +def readcsv(filename, sep=mysep, header=True, quotechar='"', encoding='utf8'): result = [] try: - infile = open(filename, 'r', encoding='utf8', newline='') + infile = open(filename, 'r', encoding=encoding, newline='') except FileNotFoundError as e: SDLOGGER.error(e) return result @@ -25,11 +25,11 @@ def readcsv(filename, sep=mysep, header=True, quotechar='"'): return result -def readheadedcsv(filename, sep=mysep, quotechar='"'): +def readheadedcsv(filename, sep=mysep, quotechar='"', encoding='utf8'): result = [] header = [] try: - infile = open(filename, 'r', encoding='utf8', newline='') + infile = open(filename, 'r', encoding=encoding, newline='') except FileNotFoundError as e: SDLOGGER.error(e) return header, result diff --git a/sastadev.py b/sastadev.py index cc2b549..653a943 100644 --- a/sastadev.py +++ b/sastadev.py @@ -43,7 +43,8 @@ from SAFreader import get_annotations, get_golddata, richscores2scores, exact2global, richexact2global from SAFreader import all_levels from external_functions import str2functionmap -from treebankfunctions import getuttid, getyield, getmeta, getattval, 
getxmetatreepositions, getuttno, getuttidorno
+from treebankfunctions import getuttid, getyield, getmeta, getattval, getxmetatreepositions, getuttno, getuttidorno, \
+    showtree, getnodeendmap, getxselseuttid
 from SRFreader import read_referencefile
 from goldcountreader import get_goldcounts
 from TARSPscreening import screening4stage
@@ -53,7 +54,7 @@ from query import pre_process, core_process, post_process, form_process, is_preorcore, query_inform, query_exists, \
     is_pre, is_core
 from macros import expandmacros
-from mismatches import mismatches, exactmismatches
+from mismatches import mismatches, exactmismatches, getmarkposition
 from xlsx import mkworkbook
 import xlsxwriter
 from counterfunctions import counter2liststr
@@ -285,7 +286,7 @@ def isxpathquery(query):
 def doqueries(syntree, queries, exactresults, allmatches, criterion):
     uttid = getuttid(syntree)
-    #uttid = getuttidorno(syntree)
+    # uttid = getuttidorno(syntree)
     omittedwordpositions = getxmetatreepositions(syntree, 'Omitted Word', poslistname='annotatedposlist')
     # print(uttid)
     # core queries
@@ -313,6 +314,9 @@ def doqueries(syntree, queries, exactresults, allmatches, criterion):
             exactresults[queryid] = []
         # matchingids = [uttid for x in matches]
         for m in matches:
+            # showtree(m)
+            if m is None:
+                showtree(syntree)
             if (queryid, uttid) in allmatches:
                 allmatches[(queryid, uttid)].append((m, syntree))
             else:
@@ -485,6 +489,18 @@ def exact2results(exactresults):
     return results
 
+def adaptpositions(rawexactresults, nodeendmap):
+    newexactresults = {}
+    for qid in rawexactresults:
+        newlist = []
+        for (uttid, position) in rawexactresults[qid]:
+            newposition = getmarkposition(position, nodeendmap, uttid)
+            newtuple = (uttid, newposition)
+            newlist.append(newtuple)
+        newexactresults[qid] = newlist
+    return newexactresults
+
+
 def passfilter(rawexactresults, method):
     '''
     let's only those through that satisfy the
@@ -669,6 +685,7 @@ def passfilter(rawexactresults, method):
                                  platinumoutfilename, options.platinuminfilename, goldscores)
 
 analysedtrees = []
+nodeendmap = {}
 # @vanaf nu gaat het om een treebank, dus hier een if statement toevoegen-done
 if annotationinput:
@@ -715,18 +732,31 @@ def passfilter(rawexactresults, method):
         analysedtrees.append(syntree)
         doprequeries(syntree, queries, rawexactresults, allmatches)
         docorequeries(syntree, queries, rawexactresults, allmatches)
-        uttid = getuttid(syntree)
-        uttno = getuttno(syntree)
-        allutts[uttno] = getyield(syntree)
-        # allutts[uttid] = getyield(syntree)
+
+        # uttid = getuttid(syntree)
+        uttid = getxselseuttid(syntree)
+        # showtree(syntree)
+        if uttid in nodeendmap:
+            SDLOGGER.error('Duplicate uttid in sample: {}'.format(uttid))
+        nodeendmap[uttid] = getnodeendmap(syntree)
+
+        # uttno = getuttno(syntree)
+        # allutts[uttno] = getyield(syntree)
+        allutts[uttid] = getyield(syntree)
 # determine exactresults and apply the filter to catch interdependencies between prequeries and corequeries
 # rawexactresults = getexactresults(allmatches)
-exactresults = passfilter(rawexactresults, themethod)
+rawexactresults2 = passfilter(rawexactresults, themethod)
+exactresults = adaptpositions(rawexactresults2, nodeendmap)
+
+#adapt allutts and rawexactresults2 here to undo the expansions, based on the nodeendmap
+#@@to be implemented@@ or perhaps already in the loop above?
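# Illustration (not part of the patch; utterance ids and positions are hypothetical)
# of what the adaptation above achieves: getnodeendmap maps each leaf's inflated
# 'end' value to its 1-based surface position, and adaptpositions sends every
# (uttid, position) pair through getmarkposition so that reported positions refer
# to surface word order again.
#
#     example_nodeendmap = {'utt1': {'11': 1, '21': 2, '31': 3}}
#     example_results = {'Q1': [('utt1', 21)]}
#     adaptpositions(example_results, example_nodeendmap)  # -> {'Q1': [('utt1', 2)]}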
# @ en vanaf hier kan het weer gemeenschappelijk worden; er met dus ook voor de annotatiefile een exactresults opgeleverd worden # @d epostfunctions for lemma's etc moeten mogelijk wel aangepast worden +# adapt the exactresults positions to the reference + coreresults = exact2results(exactresults) @@ -959,7 +989,9 @@ def passfilter(rawexactresults, method): logheader = ['datetime', 'treebank', 'scorenr,' 'R', 'P', 'F1', 'P-R', 'P-P', 'P-F1', 'GP-R', 'GP-P', 'GP-F1', 'ref', 'method'] logname = 'sastalog.txt' -biglogfile = open(logname, 'a', encoding='utf8') +logpath = r'D:\jodijk\Dropbox\jodijk\myprograms\python\sastacode\sastadev' +logfullname = os.path.join(logpath, logname) +biglogfile = open(logfullname, 'a', encoding='utf8') exactlynow = datetime.datetime.now() now = exactlynow.replace(microsecond=0).isoformat() diff --git a/sastatok.py b/sastatok.py index adb6c04..6072d60 100644 --- a/sastatok.py +++ b/sastatok.py @@ -61,5 +61,5 @@ def sasta_tokenize(instring): if instring is None: return [] tokenstring = fullsastare.findall(instring) - result = stringlist2tokenlist(tokenstring) + result = stringlist2tokenlist(tokenstring, start=10, inc=10) return result diff --git a/sastatoken.py b/sastatoken.py index cdb987c..50004f6 100644 --- a/sastatoken.py +++ b/sastatoken.py @@ -15,12 +15,12 @@ def __repr__(self): def __str__(self): skipstr = ' (skip=True)' if self.skip else '' - subposstr = '.{}' if self.subpos != 0 else '' + subposstr = '/{}'.format(self.subpos) if self.subpos != 0 else '' result = '{}{}:{}{}'.format(self.pos, subposstr, self.word, skipstr) return result -def stringlist2tokenlist(list): +def oldstringlist2tokenlist(list): result = [] llist = len(list) for el in range(llist): @@ -29,6 +29,17 @@ def stringlist2tokenlist(list): return result +def stringlist2tokenlist(list, start=0, inc=1): + result = [] + llist = len(list) + pos = start + for el in range(llist): + thetoken = Token(list[el], pos) + result.append(thetoken) + pos += inc + return result + + def tokenlist2stringlist(tlist, skip=False): if skip: result = [t.word for t in tlist if not t.skip] @@ -49,3 +60,24 @@ def show(tokenlist): resultlist.append(str(token)) result = ', '.join(resultlist) return result + + +def tokeninflate(token): + result = inflate(token.pos) + token.subpos + return result + + +def deflate(n: int): + result = (n // 10) - 1 + return result + + +def inflate(n: int): + result = (n + 1) * 10 + return result + + +def insertinflate(n: int): + dm = n % 10 + result = ((n - dm) + 1) * 10 + dm + return result diff --git a/smallclauses.py b/smallclauses.py new file mode 100644 index 0000000..d44cbaa --- /dev/null +++ b/smallclauses.py @@ -0,0 +1,292 @@ +from config import SDLOGGER +from treebankfunctions import getstree, getnodeyield, getattval +from dedup import filledpauseslexicon +from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon +from lexicon import known_word, tswnouns +from namepartlexicon import namepart_isa_namepart +from sastatoken import Token, show +from tokenmd import TokenListMD +from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\ + insertiontokenmapping + +space = ' ' +biglocvzs = ['achter', 'beneden', 'binnen', 'boven', 'bovenop', 'buiten', 'dichtbij'] +#surenouns = ['mama', 'papa'] replaced by tswnouns from lexicon +longvowels = ['a', 'é', 'i', 'o', 'u', 'y'] +vowels = ['a', 'e', 'i', 'o', 'u'] + +uniquelynominativeperspros = ['ik', 'jij', 'hij', 'zij', 'wij', 'ikke', "'k", "k", "ie", "we"] + + 
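# Illustration (not part of the patch; the tokens are hypothetical) of the inflated
# token numbering introduced in sastatok.py/sastatoken.py above, which smallclauses
# relies on: surface tokens get positions 10, 20, 30, ..., leaving subpositions
# 1-9 free for material inserted between them.
#
#     from sastatoken import stringlist2tokenlist, inflate, deflate
#     demo = stringlist2tokenlist(['dat', 'boven'], start=10, inc=10)
#     [t.pos for t in demo]   # [10, 20]
#     inflate(1)              # 20: surface index 1 -> inflated position
#     deflate(20)             # 1: inflated position -> surface index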
+def makegen(lemma): + if lemma is None or len(lemma) < 2: + result = None + elif lemma[-1] in ['s', 'z', 'x']: + result = lemma + "'" + elif lemma[-2:] in [ 'ij']: + result = lemma + 's' + elif lemma[-2] in vowels and lemma[-1] in vowels: + result = lemma + 's' + elif lemma[-1] in longvowels: + result = lemma + "'s" + else: + result = lemma + 's' + return result + +def realword(node): + result = True + result = result and getattval(node, 'pt') not in ['tsw', 'let'] + result = result and getattval(node, 'lemma') not in ['xx', 'xxx', 'yyy', 'www', 'hè'] + result = result and getattval(node, 'lemma') not in filledpauseslexicon + result = result or lemma(node) in tswnouns + + + return result + + +def hasgenitive(node): + lemma = getattval(node, 'lemma') + nodept = pt(node) + if nodept not in ['n', 'vnw']: + nodept = 'n' + result = (lemma, nodept) in genlexicon and 'yes' in genlexicon[(lemma, nodept)] + result = result or namepart_isa_namepart(lemma) + return result + +def aanwvnw(node): + result = getattval(node, 'pt') == 'vnw' and getattval(node, 'vwtype') == 'aanw' and not rpronoun(node) + return result + + +def n(node): + result = getattval(node, 'pt') == 'n' + return result + + +def getal(node): + result = getattval(node, 'getal') + return result + +def pt(node): + result = getattval(node, 'pt') + return result + +def bg(node): + result = int(getattval(node, 'begin')) + return result + +def tw(node): + result = getattval(node, 'pt') == 'tw' + return result + +def word(node): + result = getattval(node, 'word') + return result + + +def adj(node): + result = getattval(node, 'pt') == 'adj' + return result + +def perspro(node): + pt = getattval(node, 'pt') + vwtype = getattval(node, 'vwtype') + result = pt == 'vnw' and vwtype == 'pers' + return result + +def nomperspro(node): + lemma = getattval(node, 'lemma') + result = perspro(node) and lemma in uniquelynominativeperspros + return result + +def inf(node): + result = getattval(node, 'pt') == 'ww' and getattval(node, 'wvorm') == 'inf' + return result + + +def rpronoun(node): + result = getattval(node, 'pt') == 'vnw' and \ + getattval(node, 'lemma') in ['er', 'hier', 'daar', 'ergens', 'overal', 'nergens', 'waar'] + return result + +def bw(node): + result = getattval(node, 'pt') == 'bw' + return result + +def ww(node): + result = getattval(node, 'pt') == 'ww' + return result + + +def lemma(node): + result = getattval(node, 'lemma') + return result + +def predadv(node): + result = locadv(node) + result = result or (bw(node) and lemma(node) in ['niet', 'mee', 'weg']) + return result + +def vz(node): + result = getattval(node, 'pt') == 'vz' + return result + +def locadv(node): + result = getattval(node, 'pt') in ['bw', 'vz'] + frame = getattval(node, 'frame') + result = result and ('loc' in frame or 'er_adverb' in frame) + result = result or rpronoun(node) + return result + +def biglocvz(node): + result = getattval(node, 'lemma') in biglocvzs + return result + +def istswnoun(node): + result = getattval(node, 'lemma') in tswnouns + return result + +def getleavestr(leaves): + leaveseq = ['{}:{}:{}:{}'.format(getattval(leave, 'end'), getattval(leave, 'word'), getattval(leave, 'lemma'), + getattval(leave, 'pt')) for leave + in leaves] + leavestr = space.join(leaveseq) + return leavestr + +def knownnoun(node): + word = getattval(node, 'word') + lemma = getattval(node, 'lemma') + postag = pt(node) + result = postag == 'n' and (known_word(word) or known_word(lemma)) + result = result or lemma in tswnouns + return result + +def nominal(node): + result = 
pt(node) == 'n' or aanwvnw(node) + return result + +def mktoken(node, map): + nodebegin = bg(node) + nodeword = word(node) + if nodebegin in map: + nodepos = map[nodebegin] + else: + SDLOGGER.error('missing begin in map {}'.format(nodebegin)) + nodepos = int(nodebegin) + result = Token(nodeword, nodepos) + return result + + +def mktokenlist(tokens, fpos, inserttokens): + resultlist = [token for token in tokens if token.pos <= fpos] + \ + inserttokens + \ + [token for token in tokens if token.pos > fpos] + return resultlist + + +def oldmktokenlist(leaves, themap, fpos, inserttokens): + resultlist = [mktoken(lv, themap) for lv in leaves if bg(lv) <= fpos] + \ + inserttokens + \ + [mktoken(lv, themap) for lv in leaves if bg(lv) > fpos] + return resultlist + + +def mkinsertmeta(inserttokens, resultlist): + insertposs = [token.pos + token.subpos for token in inserttokens] + insertwordlist = [token.word for token in inserttokens] + tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist] + metadata1 = [Meta(insertion, [insertword], annotatedposlist=[insertpos], + annotatedwordlist=[], annotationposlist=[insertpos], + annotationwordlist=[insertword], cat=smallclause, source=SASTA, penalty=defaultpenalty, + backplacement=bpl_delete) for insertword, insertpos in zip(insertwordlist, insertposs)] + meta2 = Meta(insertiontokenmapping, tokenmappinglist, cat=tokenmapping, source=SASTA, penalty=0, + backplacement=bpl_none) + metadata = metadata1 + [meta2] + return metadata + + +def smallclauses(tokensmd, tree): + resultlist = [] + leaves = getnodeyield(tree) + reducedleaves = [leave for leave in leaves if realword(leave)] + if not(len(reducedleaves) > 1 and len(reducedleaves) <= 3): + return resultlist + tokens = tokensmd.tokens + treewords = [word(tokennode) for tokennode in leaves] + tokenwords = [token.word for token in tokens if not token.skip] + if treewords != tokenwords: + SDLOGGER.error('Token mismatch: {} v. 
{}'.format(treewords, tokenwords))
+        return []
+    themap = {bg(tokennode): token.pos for (tokennode, token) in zip(leaves, tokens)}
+    metadata = tokensmd.metadata
+
+    if len(reducedleaves) <= 3:
+        first = leaves[0]
+        second = leaves[1]
+        if len(reducedleaves) == 3:
+            third = leaves[2]
+
+        if len(reducedleaves) == 2:
+            if (aanwvnw(first) or knownnoun(first) or perspro(first)) and (predadv(second) or vz(second) or bw(second)):
+                fpos = int(getattval(first, 'begin'))
+                inserttokens = [Token('moet' if getal(first) != 'mv' else 'moeten', fpos, subpos=5)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+            #elif (aanwvnw(second) or knownnoun(second) or perspro(second) or tw(second)) and predadv(first):
+            elif nomperspro(second) and predadv(first):
+                fpos = int(getattval(first, 'begin'))
+                inserttokens = [Token('moet' if getal(second) != 'mv' else 'moeten', fpos, subpos=5)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+            elif (aanwvnw(first) or knownnoun(first)) and adj(second):
+                fpos = int(getattval(first, 'begin'))
+                inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+            elif (aanwvnw(second) or knownnoun(second) or tw(second)) and biglocvz(first):
+                fpos = int(getattval(first, 'begin'))
+                inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+            elif knownnoun(first) and knownnoun(second) and not(lemma(first) == lemma(second)):
+                if hasgenitive(first):
+                    genform = makegen(lemma(first))
+                    fpos = int(getattval(first, 'begin'))
+                    inserttokens = [Token('[: ' + genform + ']', fpos, subpos=5)]
+                    resultlist = mktokenlist(tokens, fpos, inserttokens)
+                    metadata += mkinsertmeta(inserttokens, resultlist)
+                else:
+                    fpos = int(getattval(first, 'begin'))
+                    inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+                    resultlist = mktokenlist(tokens, fpos, inserttokens)
+                    metadata += mkinsertmeta(inserttokens, resultlist)
+            elif (aanwvnw(first) or knownnoun(first) or istswnoun(first)) and inf(second):
+                if intransitive(second):
+                    firstsubject = True
+                elif transitive(second) and ishuman(first):
+                    firstsubject = True
+                elif pseudotr(second) and (ishuman(first) or isanimate(first)):
+                    firstsubject = True
+                else:
+                    firstsubject = False
+                if firstsubject:
+                    fpos = int(getattval(first, 'begin'))
+                    inserttokens = [Token('wil' if getal(first) != 'mv' else 'willen', fpos, subpos=5)]
+                else:
+                    fpos = -1
+                    inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+            elif not nominal(first) and not ww(first) and inf(second):
+                fpos = -1
+                inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+                resultlist = mktokenlist(tokens, fpos, inserttokens)
+                metadata += mkinsertmeta(inserttokens, resultlist)
+    if resultlist == []:
+        result = []
+    else:
+        result = [TokenListMD(resultlist, metadata)]
+    return result
+
+
+
+
diff --git a/sva.py b/sva.py
index 7b5defd..0b2f8e3 100644
--- a/sva.py
+++ b/sva.py
@@ -7,7 +7,7 @@ from tokenmd import TokenListMD
 from treebankfunctions import (copymodifynode, find1, getattval, getdetof,
                                getheadof, getlemma, indextransform, inverted,
-                               lbrother, nominal, 
rbrother, simpleshow, showtree) debug = False @@ -356,12 +356,11 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata): pvbegin = getattval(thepv, 'begin') inversion = inverted(snode, thepv) reducedtokens = [t for t in tokens if not t.skip] - tokenposmap = {i: reducedtokens[i].pos for i in range(len(reducedtokens))} newpv = getpvform(snode, thepv, inversion) if newpv is None: results = [] else: - newpos = tokenposmap[int(pvbegin)] + newpos = int(pvbegin) newtoken = Token(newpv, newpos) for token in tokens: if token.pos != newpos: @@ -378,6 +377,9 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata): def getsvacorrections(tokensmd, rawtree, uttid): + debug = False + if debug: + showtree(rawtree, text='rawtree') if rawtree is None: return [] else: @@ -540,7 +542,7 @@ def phicompatible(snode, vnode): elif '2i' in vnodepersons: subjbegin = getattval(subjnode, 'begin') vnodeend = getattval(vnode, 'end') - result = subjperson == '2' and '2i' in vnodepersons and subjbegin == vnodeend and \ + result = subjperson == '2' and '2i' in vnodepersons and subjbegin >= vnodeend and \ subjnodelemma in ['jij', 'je'] elif 'u' in vnodepersons: subjnodelemma = getattval(subjnode, 'lemma') diff --git a/test_smallclauses.py b/test_smallclauses.py new file mode 100644 index 0000000..c16ea95 --- /dev/null +++ b/test_smallclauses.py @@ -0,0 +1,50 @@ +from config import SDLOGGER +from treebankfunctions import getstree, getnodeyield, getattval +from dedup import filledpauseslexicon +from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon +from lexicon import known_word +from namepartlexicon import namepart_isa_namepart +from sastatoken import Token, show +from tokenmd import TokenListMD +from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\ + insertiontokenmapping +from smallclauses import smallclauses, word, getleavestr, bg + + +testbank = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\TARSP\smallclausetest.xml" +schlichtingtreebank = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\schlichtingtreebank\TREEBANK_ID.xml' +mieke06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE06_ID.xml" +mieke08 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE08_ID.xml" +aurisraw = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AURIS_ELISKA_ORIGINAL_ID.xml" +tarsp02 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_02.xml" +tarsp06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_06.xml" +#schlichtingall = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\treebank_schlichting_all_examples\TREEBANK_SCHLICHTING_CHAT_ID.xml" + + + + + +def main(): + smalltest = True + if smalltest: + fullnames = [testbank] + else: + fullnames = [ schlichtingtreebank, mieke06, mieke08, aurisraw, tarsp02, tarsp06] + for infullname in fullnames: + print(infullname) + fulltreebank = getstree(infullname) + if fulltreebank is not None: + treebank = fulltreebank.getroot() + for tree in treebank: + leaves = getnodeyield(tree) + tokens = [Token(word(leave), bg(leave)) for leave in leaves] + tokensmd = TokenListMD(tokens, []) + resultlist = smallclauses(tokensmd, tree) + if resultlist != []: + print('input: ', getleavestr(leaves) ) + print('output: ', show(resultlist[0].tokens)) + print('result: ', resultlist[0].metadata) + + +if __name__ == 
'__main__':
+    main()
diff --git a/top3000.py b/top3000.py
new file mode 100644
index 0000000..7bf181d
--- /dev/null
+++ b/top3000.py
@@ -0,0 +1,66 @@
+from xlsx import getxlsxdata
+from treebankfunctions import getattval
+from namepartlexicon import namepart_isa_namepart
+from config import SD_DIR
+import os
+
+def ishuman(node):
+    lemma = getattval(node, 'lemma')
+    pt = getattval(node, 'pt')
+    vwtype = getattval(node, 'vwtype')
+    result = (lemma, pt) in semlexicon and 'human' in semlexicon[(lemma, pt)]
+    result = result or vwtype == 'pers'
+    result = result or namepart_isa_namepart(lemma)
+    return result
+
+def isanimate(node):
+    lemma = getattval(node, 'lemma')
+    pt = getattval(node, 'pt')
+    result = (lemma, pt) in semlexicon and 'animate' in semlexicon[(lemma, pt)]
+    return result
+
+
+def transitivity(node, tr):
+    lemma = getattval(node, 'lemma')
+    pt = getattval(node, 'pt')
+    result = (lemma, pt) in trlexicon and tr in trlexicon[(lemma, pt)]
+    return result
+
+def transitive(node):
+    return transitivity(node, 'tr')
+
+def pseudotr(node):
+    return transitivity(node, 'tr/intr')
+
+
+def intransitive(node):
+    return transitivity(node, 'intr')
+
+semicolon = ';'
+
+filename = os.path.join(SD_DIR, r'top3000\Woordenlijsten Current.xlsx')
+
+
+lexiconheader, lexicondata = getxlsxdata(filename)
+
+semlexicon = {}
+trlexicon = {}
+genlexicon = {}
+
+for row in lexicondata:
+    lemma = row[1].strip()
+    pt = row[5]
+    rawsems = row[6].split(semicolon)
+    sems = [el.strip() for el in rawsems]
+    semlexicon[(lemma, pt)] = sems
+
+    rawtrs = row[8].split(semicolon)
+    trs = [el.strip() for el in rawtrs]
+    trlexicon[(lemma, pt)] = trs
+
+    rawgens = row[9].split(semicolon)
+    gens = [el.strip() for el in rawgens]
+    genlexicon[(lemma, pt)] = gens
+
+#next statement for debugging purposes
+junk = 0
\ No newline at end of file
diff --git a/top3000/Woordenlijsten Current.xlsx b/top3000/Woordenlijsten Current.xlsx
new file mode 100644
index 0000000..7b7ac56
Binary files /dev/null and b/top3000/Woordenlijsten Current.xlsx differ
diff --git a/treebankfunctions.py b/treebankfunctions.py
index ca5e4b1..c0b120b 100644
--- a/treebankfunctions.py
+++ b/treebankfunctions.py
@@ -12,6 +12,7 @@
 from stringfunctions import allconsonants
 # from lexicon import informlexiconpos, isa_namepart_uc, informlexicon, isa_namepart
 import lexicon as lex
+from config import PARSE_FUNC
 
 class Metadata:
@@ -184,6 +185,23 @@ def ismainclausenode(node):
     return result
 
+def getnodeendmap(stree):
+    leaves = getnodeyield(stree)
+    result = {getattval(leave, 'end'): i + 1 for i, leave in enumerate(leaves)}
+    return result
+
+
+def getxselseuttid(syntree):
+    result = getmeta(syntree, 'xsid')
+    if result is None:
+        result = getmeta(syntree, 'uttid')
+    if result is None:
+        result = getsentid(syntree)
+    if result is None:
+        result = '0'
+    return result
+
+
 def getuttid(syntree):
     result = getmeta(syntree, 'uttid')
     if result is None:
@@ -199,6 +217,7 @@ def getuttno(syntree):
         result = '0'
     return result
 
+
 def getuttidorno(syntree):
     result = getmeta(syntree, 'xsid')
     if result is None:
@@ -441,8 +460,9 @@ def inverted(thesubj, thepv):
     subjbegin = getattval(thesubj, 'begin')
     subjlemma = getattval(thesubj, 'lemma')
     pvend = getattval(thepv, 'end')
+    # maybe define immediately-follows for inflated trees
    inversion = '2' == subjperson[0] and tense == 'tgw' and subjnumber in ['ev', 'getal'] and \
-        pvend == subjbegin and subjlemma in ['jij', 'je'] # getal added for je
+        int(pvend) <= int(subjbegin) and subjlemma in ['jij', 'je'] # getal added for je
     return inversion
 
@@ 
-1131,11 +1151,11 @@ def test(): def getsentid(stree): sentidlist = stree.xpath(sentidxpath) if sentidlist == []: - SDLOGGER.error('Missing uttid') - uttid = 'None' + SDLOGGER.error('Missing sentid') + result = 'None' else: - uttid = str(sentidlist[0]) - return uttid + result = str(sentidlist[0]) + return result def testindextransform(): @@ -1381,6 +1401,15 @@ def deletewordnode(tree, begin): return newtree +def showtree(tree, text=None): + if text is not None: + print(text) + if tree is not None: + etree.dump(tree, pretty_print=True) + else: + print('None') + + def deletechildlessparent(thenode): if list(thenode) == []: theparent = thenode.getparent() @@ -1388,8 +1417,12 @@ def deletechildlessparent(thenode): deletechildlessparent(theparent) -def deletewordnodes(tree, begins): +def olddeletewordnodes(tree, begins): + # print('tree:') + # etree.dump(tree, pretty_print=True) newtree = deepcopy(tree) + # print('newtree:') + # etree.dump(newtree, pretty_print=True) if newtree is None: return newtree else: @@ -1403,9 +1436,14 @@ def deletewordnodes(tree, begins): theparent.remove(thenode) # if the parent has no sons left, it should be deleted as well deletechildlessparent(theparent) + children = [n for n in theparent] + (minbegin, maxend) = getbeginend(children) + theparent.attrib['begin'] = minbegin + theparent.attrib['end'] = maxend + # # renumber begins and ends ; - normalisebeginend(newtree) + # normalisebeginend(newtree) temporarily put off # adapt the cleantokenisation # done outside this function @@ -1415,6 +1453,184 @@ def deletewordnodes(tree, begins): return newtree +def childless(node): + children = [ch for ch in node] + result = children == [] + return result + +def deletewordnodes(tree, begins): + newtree = deepcopy(tree) + newtree = deletewordnodes2(newtree, begins) + newtree = adaptsentence(newtree) + return newtree + +def deletewordnodes2(tree, begins): + if tree is None: + return tree + for child in tree: + if child.tag == 'node': + newchild = deletewordnodes2(child, begins) + else: + newchild = child + for child in tree: + if child.tag == 'node': + childbegin = getattval(child, 'begin') + childbeginint = int(childbegin) + if childbeginint in begins and childless(child): + tree.remove(child) + if 'cat' in child.attrib and childless(child): # if its children have been deleted earlier + tree.remove(child) + # tree begin en end bijwerken + if tree. 
tag == 'node': + newchildren = [n for n in tree] + if newchildren != []: + (minbegin, maxend) = getbeginend(newchildren) + tree.attrib['begin'] = minbegin + tree.attrib['end'] = maxend + return tree + + +def olddeletewordnodes2(tree, begins): + if tree is None: + return tree + else: + for child in tree: + newchild = deletewordnodes2(child, begins) + if tree.tag == 'node': + nodebegin = getattval(tree, 'begin') + children = [child for child in tree] + if int(nodebegin) in begins: # only words and indexnodes can be deleted + theparent = tree.getparent() + if theparent is not None: + if children == []: + theparent.remove(tree) + # if the parent has no sons left, it should be deleted as well + deletechildlessparent(theparent) + if theparent.tag == 'node': + newchildren = [n for n in theparent] + (minbegin, maxend) = getbeginend(newchildren) + theparent.attrib['begin'] = minbegin + theparent.attrib['end'] = maxend + return tree + + +def treeinflate(stree, start=10, inc=10): + # fatstree = deepcopy(stree) + if stree is None: + pass + else: + for child in stree: + treeinflate(child, start, inc) + children = [ch for ch in stree] + if stree.tag == 'node': + ib = int(getattval(stree, 'begin')) + ie = int(getattval(stree, 'end')) + newib = (ib + 1) * 10 + stree.attrib['begin'] = str(newib) + if iswordnode(stree): + stree.attrib['end'] = str(newib + 1) + elif 'cat' in stree.attrib: + (b, e) = getbeginend(children) + stree.attrib['begin'] = b + stree.attrib['end'] = e + else: + stree.attrib['begin'] = str((ib + 1) * 10) + stree.attrib['end'] = str((ie * 10) + 1) + + +def isidentitymap(dct): + result = all([key == value for key, value in dct.items()]) + return result + + +def updatetokenpos(stree, tokenposdict): + if stree is None: + return stree + if isidentitymap(tokenposdict): + return stree + resulttree = deepcopy(stree) + resulttree = updatetokenpos2(resulttree, tokenposdict) + finaltree = updateindexnodes(resulttree) + + return finaltree + +def updatetokenpos2(node, tokenposdict): + if node is None: + return node + for child in node: + newchild = updatetokenpos2(child, tokenposdict) + if node.tag == 'node': + if ('pt' in node.attrib or 'pos' in node.attrib) and \ + 'end' in node.attrib and 'begin' in node.attrib: + intend = int(node.attrib['end']) + if intend in tokenposdict: + newendint = tokenposdict[intend] + node.attrib['end'] = str(newendint) + node.attrib['begin'] = str(newendint - 1) + else: + SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend)) + fulltrees = node.xpath('ancestor::node[@cat="top"]') + if fulltrees != []: + fulltree = fulltrees[0] + else: + fulltree = node + sent = getyield(fulltree) + SDLOGGER.error('utterance={}'.format(sent)) + # etree.dump(resulttree) + SDLOGGER.error('tokenposdict={}'.format(tokenposdict)) + elif 'cat' in node.attrib: + children = [ch for ch in node] + (b, e) = getbeginend(children) + node.attrib['begin'] = b + node.attrib['end'] = e + return node + + + +def updateindexnodes(stree): + #presupposes that the non bareindex nodes have been adapted already + indexednodesmap = getindexednodesmap(stree) + newstree = deepcopy(stree) + for node in newstree.iter(): + if node.tag == 'node': + if bareindexnode(node): + idx = getattval(node, 'index') + newbegin = getattval(indexednodesmap[idx], 'begin') + newend = getattval(indexednodesmap[idx], 'end') + node.attrib['begin'] = newbegin + node.attrib['end'] = newend + return newstree + +def treewithtokenpos(thetree, tokenlist): + resulttree = deepcopy(thetree) + 
thetreeleaves = getnodeyield(thetree) + intbegins = [int(getattval(n, 'begin')) for n in thetreeleaves] + tokenlistbegins = [t.pos + t.subpos for t in tokenlist] + if len(intbegins) != len(tokenlistbegins): + SDLOGGER.error('token mismatch') + SDLOGGER.error('tree yield={}'.format(getyield(thetree))) + SDLOGGER.error('tokenlist={}'.format(tokenlist)) + SDLOGGER.error('intbegins={}'.format(intbegins)) + SDLOGGER.error('tokenlistbegins ={}'.format(tokenlistbegins)) + pospairs = zip(intbegins, tokenlistbegins) + thetreetokenposdict = {treepos + 1: tokenpos + 1 for treepos, tokenpos in pospairs} + resulttree = updatetokenpos(resulttree, thetreetokenposdict) + return resulttree + + +def fatparse(utterance, tokenlist): + stree = PARSE_FUNC(utterance) + fatstree = deepcopy(stree) + treeinflate(fatstree, start=10, inc=10) + debug = False + if debug: + showtree(fatstree, text='fatparse: fatstree') + reducedtokenlist = [token for token in tokenlist if not token.skip] + fatstree = treewithtokenpos(fatstree, reducedtokenlist) + if debug: + showtree(fatstree, text='fatparse: fatstree') + return fatstree + def update_cleantokenisation(stree, begin): ''' @@ -1473,8 +1689,10 @@ def normalisebeginend(stree): :param stree: syntactic structure :return: stree with the values of begin and end attributes normalised ''' - begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')] - sortedbegins = sorted(begins, key=lambda x: int(x)) + # etree.dump(stree, pretty_print=True) + # begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')] # we must include indexed nodes but not have duplicates + begins = {getattval(node, 'begin') for node in stree.xpath('.//node[count(node)=0]')} + sortedbegins = sorted(list(begins), key=lambda x: int(x)) normalisebeginend2(stree, sortedbegins)
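To make the renumbering convention that runs through these changes concrete, here is a minimal sketch (hypothetical input; it assumes lxml and the treeinflate definition above): every leaf with original begin b ends up with begin (b+1)*10 and end (b+1)*10 + 1, so corrected material can be spliced in between surface tokens without renumbering existing nodes.

from lxml import etree
from treebankfunctions import treeinflate

tree = etree.fromstring(
    '<node cat="top" begin="0" end="2">'
    '<node pt="n" word="papa" begin="0" end="1"/>'
    '<node pt="ww" word="slaapt" begin="1" end="2"/>'
    '</node>')
treeinflate(tree)   # mutates the tree in place; returns None
# the leaves now span 10-11 and 20-21, and the top node spans 10-21,
# leaving 12-19 free for tokens inserted by the correction machinery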
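A matching sketch of the token side (hypothetical utterance; it uses the Token and mktokenlist definitions from sastatoken.py and smallclauses.py above): an inserted finite verb gets the first token's position plus a subposition, so it sorts between the two surface tokens.

from sastatoken import Token, show
from smallclauses import mktokenlist

tokens = [Token('papa', 10), Token('boven', 20)]   # fat positions, as in a fat tree
inserttokens = [Token('moet', 10, subpos=5)]       # hypothesised finite verb
newtokens = mktokenlist(tokens, 10, inserttokens)
print(show(newtokens))                             # 10:papa, 10/5:moet, 20:boven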