diff --git a/.gitignore b/.gitignore
index 97c0b03..cceb264 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,8 @@ env.bak/
venv.bak/
# configuration
-config.py
\ No newline at end of file
+config.py
+
+# additional files
+.idea/
+sastalog.txt
\ No newline at end of file
diff --git a/CHAT_Annotation.py b/CHAT_Annotation.py
index 1cfd985..edc006f 100644
--- a/CHAT_Annotation.py
+++ b/CHAT_Annotation.py
@@ -22,6 +22,9 @@
emptyreplacement = eps
anybutrb = r'[^\]]*'
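+# annotation-name constants, so that other modules (e.g. correcttreebank,
+# which imports omittedword) can refer to these annotations without
+# repeating the string literals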
+errormarking = 'Error Marking'
+omittedword = 'Omitted Word'
+specialform = 'Special Form'
def fullre(pat):
result = r'^' + pat + r'$'
@@ -41,6 +44,7 @@ def refunction(x):
result = fullre(x)
return result
+
# u2013 = en-dash, u2014 = em-dash, u2015 = horizontal bar
@@ -135,7 +139,8 @@ def apply(self, tokens, annotation, repkeep):
annotatedposlist = [token.pos]
annotatedwordlist = [token.word]
annotationposlist = [p for p in range(m.start(), m.end())]
- newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist)
+ newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist,
+ annotatedwordlist, annotationposlist)
metadata.append(newmeta)
newword = self.compiledre.sub(self.replacement, token.word)
newtoken = Token(newword, token.pos)
@@ -226,7 +231,10 @@ def apply(self, tokens, annotation, repkeep):
else:
(b, e) = scope
if ltodotokens == e + 1:
- SDLOGGER.error('Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(b, e, show(todotokens)))
+                SDLOGGER.error(
+                    'Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(
+                        b, e, show(todotokens)))
newtokens += todotokens[:b] + todotokens[b + 1:e]
tokenctr = e + 1
elif self.compiledre.search(todotokens[e + 1].word):
@@ -234,7 +242,9 @@ def apply(self, tokens, annotation, repkeep):
annotationpositions = [token.pos for token in todotokens[b + 1:e]]
if self.arity == dyadic:
if ltodotokens <= e + 2:
- SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name, show(todotokens)))
+ SDLOGGER.error(
+ 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
+ show(todotokens)))
newtokens += todotokens[b + 1:e]
break
else:
@@ -247,7 +257,8 @@ def apply(self, tokens, annotation, repkeep):
SDLOGGER.error('Illegal arity specification ({}) on {}'.format(self.arity, annotation.name))
annotatedwords = []
annotatedpositions = []
- newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions, annotatedwords, annotationpositions)
+ newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions,
+ annotatedwords, annotationpositions)
metadata.append(newmeta)
newtokens += todotokens[tokenctr:b]
replacement = getreplacement(repkeep, annotation)
@@ -270,7 +281,8 @@ def apply(self, tokens, annotation, repkeep):
while i < ltodotokens:
if self.compiledre.search(todotokens[i].word):
if scopewords == []:
- SDLOGGER.error('First argument of annotation {} missing. Annotation ignored'.format(annotation.name))
+ SDLOGGER.error(
+ 'First argument of annotation {} missing. Annotation ignored'.format(annotation.name))
else:
if self.arity == monadic:
annotatedpositions = []
@@ -283,15 +295,17 @@ def apply(self, tokens, annotation, repkeep):
metadata.append(newmeta)
elif self.arity == dyadic:
if i + 1 >= ltodotokens:
- SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
- show(todotokens)))
+ SDLOGGER.error(
+ 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
+ show(todotokens)))
else:
annotatedpositions = [todotokens[i + 1].pos]
annotatedwords = [todotokens[i + 1].word]
replacement = getreplacement(repkeep, annotation)
newtokens = doreplacement([prevtoken], replacement, newtokens)
prevtoken = None
- newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions, annotatedwords, scopepositions)
+ newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions,
+ annotatedwords, scopepositions)
metadata.append(newmeta)
else:
if prevtoken is not None:
@@ -308,11 +322,11 @@ def apply(self, tokens, annotation, repkeep):
class CHAT_ComplexRegex(CHAT_Regex):
def __init__(self, regextuple, replacementtuple, scoped, containswords=False):
- self.regexbegin = regextuple[0] # 3 elements: begin mid end
- self.regexmid = regextuple[1] # 3 elements: begin mid end
- self.regexend = regextuple[2] # 3 elements: begin mid end
- self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ]
- self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ]
+ self.regexbegin = regextuple[0] # 3 elements: begin mid end
+ self.regexmid = regextuple[1] # 3 elements: begin mid end
+ self.regexend = regextuple[2] # 3 elements: begin mid end
+ self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ]
+ self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ]
self.scoped = scoped
self.containswords = containswords
self.compiledrebegin = re.compile(refunction(self.regexbegin))
@@ -360,7 +374,8 @@ def apply(self, tokens, annotation, repkeep):
elif state == scopestate:
scope = findscope(tokens[tokenctr - 1:], offset=tokenctr - 1)
if scope is None:
- SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos, show(tokens)))
+ SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos,
+ show(tokens)))
state = wstate
else:
(b, e) = scope
@@ -372,13 +387,16 @@ def apply(self, tokens, annotation, repkeep):
if bbbe is not None:
(bracketbegin, bracketend) = bbbe
annotationtokens = todotokens[bracketbegin + 1: bracketend]
- (cleanannotationtokens, innermetadata) = cleanCHILDEStokens.cleantokens(annotationtokens, repkeep) if self.containswords else (annotationtokens, [])
+                (cleanannotationtokens, innermetadata) = \
+                    cleanCHILDEStokens.cleantokens(annotationtokens, repkeep) \
+                    if self.containswords else (annotationtokens, [])
metadata += innermetadata
annotatedwords = [t.word for t in tobereplacedtokens if t.word not in ['<', '>']]
annotatedpositions = [t.pos for t in tobereplacedtokens if t.word not in ['<', '>']]
thevalue = [token.word for token in cleanannotationtokens]
annotationpositions = [token.pos for token in cleanannotationtokens]
- newmeta = annotation.metadatafunction(annotation, thevalue, annotatedpositions, annotatedwords, annotationpositions)
+ newmeta = annotation.metadatafunction(annotation, thevalue, annotatedpositions, annotatedwords,
+ annotationpositions)
metadata.append(newmeta)
replacement = self.scopereplacement
repltokens = [t for t in tobereplacedtokens if t.word not in ['<', '>']]
@@ -395,10 +413,10 @@ def apply(self, tokens, annotation, repkeep):
tokenctr += inc
newtokens += tobereplacedtokens
if state in estates:
- return(newtokens, metadata)
+ return (newtokens, metadata)
else:
SDLOGGER.error('Not in an end state, state={} in {}'.format(state, show(tokens)))
- return(tokens, [])
+ return (tokens, [])
def findbrackets(tokens, regexes, offset=0):
@@ -430,14 +448,29 @@ def dropbrackets(w):
return result
-def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT)
-def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT, backplacement=bpl_delete)
+def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
+ annotatedwordlist=[w], source=CHAT)
+
+
+def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
+ annotatedwordlist=[w], source=CHAT,
+ backplacement=bpl_delete)
def simplescopedmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
- Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist, annotatedwordlist=annotatedwordlist, source=CHAT)
+ Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist,
+ annotatedwordlist=annotatedwordlist, source=CHAT)
+
+
def complexmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
- Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist, annotatedposlist=annotatedposlist, source=CHAT)
+ Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
+ annotatedposlist=annotatedposlist, source=CHAT)
+
+
+def charmetafunction(ann, annotationcharlist, annotatedcharlist, annotationcharposlist, annotatedcharposlist):
+ return Meta(ann.name, annotationcharlist, annotationcharlist=annotationcharlist,
+ annotatedcharlist=annotatedcharlist,
+ annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist)
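+# unlike the word-level metafunctions above, charmetafunction records
+# character-level lists and positions, used for in-word codes such as
+# Noncompletion of a Word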
def epsf(w): return ''
@@ -492,6 +525,7 @@ def dropchars2(w, c):
def CHAT_message(msg):
def result(x, y): return SDLOGGER.warning(msg.format(x, y))
+
return result
@@ -502,12 +536,15 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
# here additional things could be done
CHAT_Annotation('Overlap Precedes', '8.4:71-72', '10.3:75',
CHAT_SimpleScopedRegex(r'\[\<[0-9]?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Special Form', '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False), simplemetafunction(getsfvalue)),
- CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False), simplemetafunction(epsf)),
- CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False), simplemetafunction(epsf)),
+ CHAT_Annotation(specialform, '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False),
+ simplemetafunction(getsfvalue)),
+ CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False),
+ simplemetafunction(epsf)),
+ CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False),
+ simplemetafunction(epsf)),
CHAT_Annotation('Noncompletion of a Word', '6.5:43', '8.5:48',
- CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), complexmetafunction),
- CHAT_Annotation('Omitted Word', '6.5:43', '8.5:48-49',
+ CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), charmetafunction),
+ CHAT_Annotation(omittedword, '6.5:43', '8.5:48-49',
CHAT_SimpleRegex(r'0[\w:]+', dropzero, False), simple_bpldel_metafunction(dropzero)),
CHAT_Annotation('Satellite at End', '7.4:58', '9.2:59-60',
CHAT_SimpleRegex(r'\s„\s', eps, False), simplemetafunction(identity)),
@@ -524,8 +561,9 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(dropinitial)), # this one must crucially precede Pause Between Syllables
CHAT_Annotation('Pause Between Syllables', '7.7:60', '9.9:63-64', CHAT_InWordRegex(r'\^', ''), complexmetafunction),
CHAT_Annotation('Simple Event', '7.8.1:60', '9.10.1:64-65', CHAT_SimpleRegex(r'&=[\w:]+', eps, False),
- simplemetafunction(identity)),
- CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65', CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False),
+ simplemetafunction(identity)),
+ CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65',
+ CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Pause', '7.8.3:62', '9.10.4:66', CHAT_SimpleRegex(r'\(\.\.?\.?\)', eps, False),
simplemetafunction(identity)),
@@ -577,54 +615,74 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(identity)),
    # Error Marking crucially before [/] [//] [///] etc
- CHAT_Annotation('Error Marking', '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic),
+ CHAT_Annotation(errormarking, '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic),
simplescopedmetafunction),
- CHAT_Annotation('Error Marking', '8.5:75', '10.5:78',
+ CHAT_Annotation(errormarking, '8.5:75', '10.5:78',
CHAT_ComplexRegex((r'\[\*', r'[\w:\-\+=]+', r'\]'), (keep, eps), False),
complexmetafunction),
- CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True),
                    complexmetafunction),  # pic bullet and text bullet must come before time alignment
- CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True),
complexmetafunction),
- CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True),
complexmetafunction),
- CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True),
+ CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True),
complexmetafunction), # not an official code but it occurs as such in CLPF
- CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72', CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True),
+ CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72',
+ CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True),
complexmetafunction),
CHAT_Annotation('Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!\]', keep, False, monadic),
simplescopedmetafunction),
- CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic),
+ CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72',
+ CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic),
simplescopedmetafunction),
# Duration to be added here @@
- CHAT_Annotation('Explanation', '8.3:69', '10.3:73', CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Explanation', '8.3:69', '10.3:73',
+ CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Replacement', '8.3:69', '10.3:73',
- CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True), complexmetafunction),
+ CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True),
+ complexmetafunction),
CHAT_Annotation('Replacement of Real Word', '8.3:70', '10.3:73',
CHAT_ComplexRegex((r'\[::', r'([^\]]+)', r'\]'), (eps, keep), True), complexmetafunction),
CHAT_Annotation('Alternative Transcription', '8.3:70', '10.3:74',
CHAT_ComplexRegex((r'\[=\?', r'([^\]]+)', r'\]'), (keep, eps), True), complexmetafunction),
CHAT_Annotation('Dependent Tier on Main Line', '8.3:70', 'none',
- CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), # @@must do something with the speaker
+ CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
+ # @@must do something with the speaker
CHAT_Annotation('Comment on Main Line', '8.3:70', '10.3:74',
CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
- CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic), simplescopedmetafunction),
+ CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
+ simplescopedmetafunction),
CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76',
CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'), (keep, eps), True), complexmetafunction),
- CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
- CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False), simplemetafunction(identity)), # needs extension
- CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False), # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm
+ CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77',
+ CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
+ CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77',
+ CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction),
+ CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False),
+ simplemetafunction(identity)), # needs extension
+ CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False),
+ # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm
simplemetafunction(interposedword)),
- CHAT_Annotation('Postcode', '8.6:75', '10.5:78', CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Postcode', '8.6:75', '10.5:78',
+ CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False),
complexmetafunction),
- CHAT_Annotation('Language Precode', '8.6:75', '10.5:79', CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Language Precode', '8.6:75', '10.5:79',
+ CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Excluded Utterance', '8.6:75-76', '10.5:79', CHAT_SimpleRegex(r'\[\+\s+bch\]', eps, False),
simplemetafunction(interposedword)),
@@ -632,9 +690,12 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(interposedword)),
CHAT_Annotation('Zero Utterance', '', '10.5:79, 11.1:81', CHAT_SimpleRegex(r'\b0\b', eps, False),
simplemetafunction(identity)),
- CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''), complexmetafunction),
- CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction), # take care extra token!@@
- CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction), # take care extra token@@
+ CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''),
+ complexmetafunction),
+ CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction),
+ # take care extra token!@@
+ CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction),
+ # take care extra token@@
CHAT_Annotation('Blocked Segments', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u2260.*?\u2260', ''),
complexmetafunction),
# these must be applied after [/], [//], [///] etc
diff --git a/TARSPpostfunctions.py b/TARSPpostfunctions.py
index 4d9d424..4a1456e 100644
--- a/TARSPpostfunctions.py
+++ b/TARSPpostfunctions.py
@@ -6,6 +6,7 @@
from query import core_process
from treebankfunctions import getmeta
+from config import SDLOGGER
OndVC = 'T071'
OndWVC = 'T076'
@@ -74,8 +75,11 @@ def getstage(uttcounts, allresults):
cands = []
gtotaal = allresults.postresults['T152']
for el in uttcounts:
- if uttcounts[el] / gtotaal >= gofase_minthreshold:
- cands.append(el)
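+        # guard: gtotaal (postresult T152) can be 0 for an empty sample,
+        # which would make the division below crash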
+ if gtotaal != 0:
+ if uttcounts[el] / gtotaal >= gofase_minthreshold:
+ cands.append(el)
+ else:
+ SDLOGGER.error('gtotaal has value 0')
if cands == []:
result = 1
else:
diff --git a/adjtest.py b/adjtest.py
new file mode 100644
index 0000000..84d738b
--- /dev/null
+++ b/adjtest.py
@@ -0,0 +1,121 @@
+from lxml import etree
+from treebankfunctions import showtree
+from asta_queries import asta_bijzin
+
+streestrings = {}
+
+streestrings[0] = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ uh dus sinds ik hier ben heb ik logo omdat ik
+ Q#ng1647271273|dus sinds ik hier ben heb ik logo omdat ik|1|3|-0.6490448165400009
+
+
+"""
+
+
+strees = {}
+for x in streestrings:
+ strees[x] = etree.fromstring(streestrings[x])
+
+thequery = """
+.//node[
+ ( (@word="geboren") or
+
+ (@pt="adj" and
+ (@rel="mod" and
+ parent::node[@cat="np"] and
+ ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")] and
+ (not(@begin < ../node[@rel="det" and (@pt="lid" or @pt="vnw")]/@begin) or @lemma='heel' or @lemma='geheel')
+ )
+ )
+ or
+
+ (@pt="adj" and
+ (@rel="hd" and
+ parent::node[@cat="ap" and parent::node[@cat="np"] and
+ ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")]]
+ )
+ )
+ or
+
+ (@pt="tw" and @numtype="rang")
+ or
+
+ (@pt="adj" and @rel="hd" and parent::node[@cat="np"])
+ or
+
+ (
+ (@pt="tw" and @numtype="rang")
+ and @positie = "nom" )
+ or
+
+ (@pt="ww" and @wvorm="vd" and @rel="mod" and parent::node[@cat="np"])
+ or
+
+ (@pt="ww" and @wvorm="od" and @rel="mod" and parent::node[@cat="np"])
+ or
+
+ (@pt="adj" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )])
+)
+ or
+
+ (@pt="adj" and @rel="hd" and parent::node[@cat="ap" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )])
+])
+ or
+ (@rel="det" and @pt="vnw" and @vwtype="onbep")
+
+ )
+]
+"""
+
+#matches = strees[0].xpath(thequery)
+matches = asta_bijzin(strees[0])
+for m in matches:
+ showtree(m)
\ No newline at end of file
diff --git a/alpino.py b/alpino.py
index 4ddfe11..24dda43 100644
--- a/alpino.py
+++ b/alpino.py
@@ -30,9 +30,12 @@ def getdehetwordinfo(wrd):
# we only want to consider nouns or words of unknown word class (such as kopje in CELEX)
wordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[0] in ['n', 'None']]
- # if any of the alternatives is a de-word, we empty the whole list
- if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]):
- wordinfos = []
+ # if any of the alternatives is a de-word, we keep only these
+ dewordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[1] == lexicon.de]
+ if dewordinfos != []:
+ wordinfos = dewordinfos
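+    # e.g. a word listed with both a de-reading and a het-reading keeps only
+    # its de-reading(s); if there is no de-reading the list stays unchanged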
+ #if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]):
+ # wordinfos = []
# if not found yet we check with Alpino
if wordinfos != []:
diff --git a/asta_neo.py b/asta_neo.py
new file mode 100644
index 0000000..e5d6d0d
--- /dev/null
+++ b/asta_neo.py
@@ -0,0 +1,147 @@
+from lxml import etree
+#from CHAT_Annotation import specialform, errormarking
+
+specialform = 'Special Form'
+errormarking = 'Error Marking'
+
+mdnamemdxpathtemplate = """.//xmeta[@name="{mdname}" and @value="{mdvalue}"]"""
+ptposxpathtemplate = './/node[@pt and @begin="{position}"]'
+
+def mdbasedquery(stree, mdname, mdvalue):
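+    # look up xmeta elements with the given name/value; their annotatedposlist
+    # attribute (a string such as "[3]") holds the begin position of the
+    # annotated token, which is then used to fetch the corresponding pt node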
+ mdnamemdxpath = mdnamemdxpathtemplate.format(mdname=mdname, mdvalue=mdvalue)
+ mdnamemds = stree.xpath(mdnamemdxpath)
+ results = []
+ for mdnamemd in mdnamemds:
+ annotatedposstr = mdnamemd.attrib['annotatedposlist']
+ if annotatedposstr != '':
+ mdbeginval = annotatedposstr[1:-1]
+ ptposxpath = ptposxpathtemplate.format(position=mdbeginval)
+ newresults = stree.xpath(ptposxpath)
+ results += newresults
+
+ return results
+
+def neologisme(stree):
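+    # a neologism is marked either as an Error Marking with value ['n'] or
+    # as a Special Form with the @n marker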
+    results1 = mdbasedquery(stree, errormarking, "['n']")
+ results2 = mdbasedquery(stree, specialform, '@n')
+ results = results1 + results2
+ return results
+
+def sempar(stree):
+ results = mdbasedquery(stree, errormarking, "['s']")
+ return results
+
+def phonpar(stree):
+ results = mdbasedquery(stree, errormarking, "['p']")
+ return results
+
+
+def test(stree):
+ neoresults = neologisme(stree)
+ semparresults = sempar(stree)
+ phonparresults = phonpar(stree)
+ results = [('neo', neoresult) for neoresult in neoresults] +\
+ [('sempar', semparresult) for semparresult in semparresults] +\
+ [('phonpar', phonparresult) for phonparresult in phonparresults]
+ return results
+
+def main():
+ for i in strees:
+ results = test(strees[i])
+ for result in results:
+ print('{}: {}:{}'.format(result[0], result[1].attrib['word'], result[1].attrib['begin']))
+
+
+
+streestrings = {}
+
+streestrings[1] = """
+
+
+
+
+
+
+
+
+
+
+
+ ik heb geduusterd
+
+ Q#ng1646152422|ik heb geduusterd|1|1|-5.158487943820001
+
+
+"""
+
+
+streestrings[2] = """
+
+
+
+
+
+
+
+
+ ik heb ngeduusterd
+
+ Q#ng1646219407|ik heb ngeduusterd|1|1|-1.6311900273499995
+
+
+"""
+
+streestrings[3] = """
+
+
+
+
+
+
+
+
+ ik heb nngeduusterd
+
+ Q#ng1646219408|ik heb nngeduusterd|1|1|-1.6311900273499995
+
+
+"""
+streestrings[4] = """
+
+
+
+
+
+
+
+
+ ik heb pgeduusterd
+
+ Q#ng1646219409|ik heb pgeduusterd|1|1|-1.6311900273499995
+
+
+"""
+
+streestrings[5] = """
+
+
+
+
+
+
+
+
+ ik heb sgeduusterd
+
+ Q#ng1646219410|ik heb sgeduusterd|1|1|-1.6311900273499995
+
+
+
+"""
+
+strees = {}
+for i in streestrings:
+ strees[i] = etree.fromstring(streestrings[i])
+
+if __name__ == '__main__':
+ main()
diff --git a/asta_queries.py b/asta_queries.py
index 5097681..575a80d 100644
--- a/asta_queries.py
+++ b/asta_queries.py
@@ -213,13 +213,18 @@ def asta_bijzin(stree):
if getattval(cn1, 'begin') == getattval(cn0, 'begin'):
cn0end = getattval(cn0, 'end')
newbegin = cn0end
- newokptnode = find1(cn1, '//node[@pt and @begin={newbegin}]'.format(newbegin=newbegin))
- result = sortedclausenodes[2:] + okptnodes + [newokptnode]
+ newokptnodexpath = '//node[@pt and @begin="{newbegin}"]'.format(newbegin=newbegin)
+ newokptnode = find1(cn1, newokptnodexpath)
+ result = sortedclausenodes[2:] + okptnodes
+ if newokptnode is not None:
+ result += [newokptnode]
else:
result = sortedclausenodes[1:] + okptnodes
else:
result = sortedclausenodes[1:] + okptnodes
+    # ad-hoc filter to guarantee that there are no None matches; this should not happen anymore
+ result = [el for el in result if el is not None]
return result
diff --git a/basicreplacements.py b/basicreplacements.py
index f369702..339b327 100644
--- a/basicreplacements.py
+++ b/basicreplacements.py
@@ -41,6 +41,9 @@
('effe', 'even', pron, infpron, varpron),
('set', 'zet', pron, infpron, initdev), ('hie', 'hier', pron, pronerr, codared),
('eers', 'eerst', pron, pronerr, codared),
+ ('era', 'eraf', pron, pronerr, codared),
+ ('il', 'wil', pron, pronerr, onsetred),
+ ('tee', 'twee', pron, pronerr, onsetred),
('nie', 'niet', pron, infpron, codared),
('s', 'is', orth, spellerr, apomiss), ('ooke', 'ook', pron, infpron, addschwa),
('it', 'dit', pron, pronerr, onsetred),
@@ -67,6 +70,7 @@
('dis', ['dit', 'is'], pron, infpron, contract),
('das', ['dat', 'is'], pron, infpron, contract),
('tis', ['dit', 'is'], pron, infpron, contract),
+ ('waas', ['waar', 'is'], pron, infpron, contract),
('is-t-ie', ['is', 'ie'], pron, infpron, t_ie),
('als-t-ie', ['als', 'ie'], pron, infpron, t_ie),
('of-t-ie', ['of', 'ie'], pron, infpron, t_ie),
diff --git a/checkcorrection.py b/checkcorrection.py
new file mode 100644
index 0000000..021cbea
--- /dev/null
+++ b/checkcorrection.py
@@ -0,0 +1,66 @@
+'''
+Compares the errorlogging file with the error reference file
+'''
+
+import os
+from xlsx import getxlsxdata
+
+dataset = 'vkltarsp'
+dataset = 'vklstap'
+dataset = 'vklasta'
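+# quick dataset switch: the last of the assignments above wins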
+
+if dataset == 'vkltarsp':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\tarsp'
+ dataprefix = 'tarsp'
+
+elif dataset == 'vklstap':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata'
+ dataprefix = 'stap'
+
+elif dataset == 'vklasta':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\asta'
+ dataprefix = 'asta'
+
+
+errorloggingfilename = dataprefix + '_errorlogging.xlsx'
+errorloggingfullname = os.path.join(resultspath, errorloggingfilename)
+
+referencepath = r'D:\jodijk\Dropbox\Surfdrive\Shared\SASTAPLUS\November'
+errorreffilename = dataprefix + '_error_ref.xlsx'
+errorreffullname = os.path.join(referencepath, errorreffilename)
+
+logheader, logdata = getxlsxdata(errorloggingfullname)
+refheader, refdata = getxlsxdata(errorreffullname)
+
+refdict = {(row[0], row[1]): row[3] for row in refdata}
+
+correctcorrections = 0
+missedcorrections = 0
+wrongcorrections = 0
+for row in logdata:
+ key = (row[0], row[5])
+ if 'BEST' in row[10]:
+ logsent = row[9]
+ if key not in refdict:
+ print('Missing example in refdict: {}'.format(key))
+ print(row[9])
+ missedcorrections += 1
+ else:
+ refsent = refdict[key]
+ if refsent != logsent:
+ print('Mismatch: {}'.format(key))
+ print('refsent=<{}>'.format(refsent))
+ print('logsent=<{}>'.format(logsent))
+ wrongcorrections += 1
+ else:
+ correctcorrections += 1
+
+allcorrections = correctcorrections + wrongcorrections + missedcorrections
+
+correctioncounts = [correctcorrections, wrongcorrections, missedcorrections]
+labels = ['correct corrections', 'wrong corrections', 'missed corrections']
+labeled_corrections = zip(labels, correctioncounts)
+
+print('\nSummary:\n')
+for label, corr in labeled_corrections:
+ print('{} = {} ({:.2f}%)'.format(label, corr, corr / allcorrections * 100))
\ No newline at end of file
diff --git a/cleanCHILDEStokens.py b/cleanCHILDEStokens.py
index f7e074c..9d4a922 100644
--- a/cleanCHILDEStokens.py
+++ b/cleanCHILDEStokens.py
@@ -20,6 +20,16 @@
bstate, ostate, oostate, costate, ccstate = 0, 1, 2, 3, 4
+# this should be identical to the checkpattern of cleanCHILDESMD
+# #checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ123456789·\u22A5\u00B7\u0001\u2260\u21AB]')
+# checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ·\u22A5\u00B7\u0001\u2260\u21AB]')
+# # + should not occur except as compound marker black+board
+# # next one split up in order to do substitutions
+# pluspattern = re.compile(r'(\W)\+|\+(\W)')
+# pluspattern1 = re.compile(r'(\W)\+')
+# pluspattern2 = re.compile(r'\+(\W)')
+illegalcleanedchatsymbols = '<>'
+
def findscopeclose(tokens, offset=0):
tokenctr = 0
@@ -83,22 +93,31 @@ def checkline(line, newline, outfilename, lineno, logfile):
print('charcodes=<{}>'.format(thecodes), file=logfile)
-def cleantext(utt, repkeep):
+def purifytokens(tokens):
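+    # drop stray '<' and '>' tokens that scoped annotations may leave behind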
+ result = [token for token in tokens if token.word not in illegalcleanedchatsymbols]
+ return result
+
+def cleantext(utt, repkeep, tokenoutput=False):
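+    # with tokenoutput=True the cleaned Token objects themselves are returned
+    # instead of a rejoined string, so callers (e.g. correct_stree) can keep
+    # the exact token positions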
newutt = robustness(utt)
tokens = sastatok.sasta_tokenize(newutt)
inwordlist = [t.word for t in tokens]
intokenstrings = [str(token) for token in tokens]
# print(space.join(intokenstrings))
(newtokens, metadata) = cleantokens(tokens, repkeep)
+    # remove symbol tokens that should not be there anymore
+ newtokens = purifytokens(newtokens)
resultwordlist = [t.word for t in newtokens]
resultstring = smartjoin(resultwordlist)
resultposlist = [t.pos for t in newtokens]
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
- newmeta3 = Meta('cleanedtokenpositions', resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
+    newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list',
+                    source='CHAT/Tokenisation', backplacement=bpl_none)
metadata += [newmeta1, newmeta2, newmeta3]
resultmetadata = metadata
- return (resultstring, resultmetadata)
+ if tokenoutput:
+        return (newtokens, resultmetadata)
+ else:
+ return (resultstring, resultmetadata)
def cleantokens(tokens, repkeep):
@@ -133,7 +152,10 @@ def removesuspects(str):
return result
-robustnessrules = [(re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'),
+robustnessrules = [(re.compile(r'\u2026'), '\u2026', '...', 'Horizontal Ellipsis (\u2026, Unicode U+2026) replaced by a sequence of three Full Stops (..., Unicode U+002E) '),
+ (re.compile('#'), '#', '', 'Number Sign (#, Unicode U+0023) removed'),
+ #(re.compile('#'), '#', '(.)', 'Number Sign (#, Unicode U+0023) replaced by CHAT (short) pause code: (.)'),
+ (re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'),
(re.compile(r'\[\+trn\]'), '[+trn]', '[+ trn]', 'Missing space'),
(re.compile(r'\[:(?![:\s])'), '[:', '[: ', 'Missing space'),
(re.compile(r'(?<=\w)\+\.\.\.'), '+...', ' +...', 'Missing space'),
diff --git a/corrector.py b/corrector.py
index d5c78ba..a0947dd 100644
--- a/corrector.py
+++ b/corrector.py
@@ -1,15 +1,5 @@
''''
-You must then make sure that, in the CHAT file you produce, you go through every utterance and call a function
-
-getcorrection
-with the string of the utterance as its argument.
-
-This function then returns a tuple (correction, metadata)
-
-where
-• correction is a string that you must include in the CHAT file as the corrected utterance
-• metadata are metadata in the style of PaQu (type, name, value), e.g. origutt of type text with the input string as its value
-
+to be added
'''
import copy
@@ -26,7 +16,7 @@
getunwantedtokens, nodesfindjaneenou)
from deregularise import correctinflection
from iedims import getjeforms
-from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart
+from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart, tswnouns
from macros import expandmacros
# from namepartlexicon import namepart_isa_namepart
from sastatok import sasta_tokenize
@@ -36,7 +26,7 @@
vowels)
from sva import getsvacorrections
from tokenmd import TokenListMD, TokenMD, mdlist2listmd
-from treebankfunctions import find1, getattval, getnodeyield
+from treebankfunctions import find1, getattval, getnodeyield, showtree, treeinflate, fatparse
from lxml import etree
import sys
# from alternative import Alternative, Replacement, Metadata, Meta
@@ -46,6 +36,7 @@
from alpinoparsing import parse, escape_alpino_input
from expandquery import expandmacros
from find_ngram import findmatches, ngram1, ngram2, ngram7, ngram10, ngram11, ngram16, ngram17
+from smallclauses import smallclauses
SASTA = 'SASTA'
@@ -177,7 +168,8 @@ def reduce(tokens, tree):
# remove tsw incl goh och hé oke but not ja, nee, nou
tswtokens = [n for n in reducedtokens if n.pos in token2nodemap
and getattval(token2nodemap[n.pos], 'pt') == 'tsw'
- and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'}]
+ and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'}
+ and getattval(token2nodemap[n.pos], 'lemma') not in tswnouns]
tswpositions = [n.pos for n in tswtokens]
allremovetokens += tswtokens
    allremovepositions += tswpositions
@@ -413,11 +405,12 @@ def getcorrection(utt, tree=None, interactive=False):
return result
-def getcorrections(utt, method, tree=None, interactive=False):
- origutt = utt
+def getcorrections(rawtokens, method, tree=None, interactive=False):
allmetadata = []
- rawtokens = sasta_tokenize(utt)
+ # rawtokens = sasta_tokenize(utt)
wordlist = tokenlist2stringlist(rawtokens)
+ utt = space.join(wordlist)
+ origutt = utt
# check whether the tree has the same yield
origtree = tree
@@ -426,7 +419,7 @@ def getcorrections(utt, method, tree=None, interactive=False):
if treewordlist != wordlist:
revisedutt = space.join(wordlist)
- tree = PARSE_FUNC(revisedutt)
+ tree = fatparse(revisedutt, rawtokens)
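+        # fatparse parses the utterance and aligns the node begin/end values
+        # with the token positions of rawtokens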
tokens, metadata = cleantokens(rawtokens, repkeep=False)
allmetadata += metadata
@@ -489,8 +482,9 @@ def getalternatives(origtokensmd, method, tree, uttid):
# now turn each sequence of (token, md) pairs into a pair (tokenlist, mergedmetadata)
newaltuttmds = []
for altuttmd in altutts:
- newaltuttmd = mdlist2listmd(altuttmd)
- newaltuttmds.append(newaltuttmd)
+ if altuttmd != []:
+ newaltuttmd = mdlist2listmd(altuttmd)
+ newaltuttmds.append(newaltuttmd)
# basic expansions
@@ -508,8 +502,8 @@ def getalternatives(origtokensmd, method, tree, uttid):
for uttmd in allalternativemds:
# utterance = space.join([token.word for token in uttmd.tokens])
utterance, _ = mkuttwithskips(uttmd.tokens)
- ntree = PARSE_FUNC(utterance)
- newresults += getwrongdetalternatives(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += getwrongdetalternatives(uttmd, fatntree, uttid)
allalternativemds += newresults
newresults = []
@@ -518,9 +512,11 @@ def getalternatives(origtokensmd, method, tree, uttid):
utterance, _ = mkuttwithskips(uttmd.tokens)
# reducedtokens = [t for t in uttmd.tokens if not t.skip]
# reduceduttmd = TokenListMD(reducedtokens, uttmd.metadata)
- ntree = PARSE_FUNC(utterance)
- # simpleshow(ntree)
- uttalternativemds = getsvacorrections(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ debug = False
+ if debug:
+ showtree(fatntree)
+ uttalternativemds = getsvacorrections(uttmd, fatntree, uttid)
newresults += uttalternativemds
allalternativemds += newresults
@@ -528,8 +524,16 @@ def getalternatives(origtokensmd, method, tree, uttid):
for uttmd in allalternativemds:
# utterance = space.join([token.word for token in uttmd.tokens])
utterance, _ = mkuttwithskips(uttmd.tokens)
- ntree = PARSE_FUNC(utterance)
- newresults += correctPdit(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += correctPdit(uttmd, fatntree, uttid)
+ allalternativemds += newresults
+
+ newresults = []
+ for uttmd in allalternativemds:
+ utterance, _ = mkuttwithskips(uttmd.tokens)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += smallclauses(uttmd, fatntree)
+ # showtree(fatntree, text='fatntree')
allalternativemds += newresults
    # final check whether the alternatives are improvements; it is not assumed that the original token list is included in the alternatives
@@ -570,7 +574,7 @@ def mkuttwithskips(tokens, delete=True):
return result, tokenposlist
-def getexpansions(uttmd):
+def oldgetexpansions(uttmd):
expansionfound = False
newtokens = []
tokenctr = 0
@@ -612,6 +616,50 @@ def getexpansions(uttmd):
return result
+
+def getexpansions(uttmd):
+ expansionfound = False
+ newtokens = []
+ tokenctr = 0
+ #newtokenctr = 0
+ tokenposlist = []
+ newmd = uttmd.metadata
+ for tokenctr, token in enumerate(uttmd.tokens):
+ if token.word.lower() in basicexpansions:
+ expansionfound = True
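+            # an expansion replaces one input token by several output tokens;
+            # all parts keep the original token.pos and are distinguished by
+            # subpos, so position-based metadata elsewhere stays valid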
+ for (rlist, c, n, v) in basicexpansions[token.word.lower()]:
+ rlisttokenctr = 0
+ for rlisttokenctr, rw in enumerate(rlist):
+ if rlisttokenctr == 0:
+ newtoken = Token(rw, token.pos)
+ else:
+ newtoken = Token(rw, token.pos, subpos=rlisttokenctr)
+ newtokens.append(newtoken)
+ tokenposlist.append(token.pos)
+ nwt = Token(space.join(rlist), token.pos)
+ meta1 = mkSASTAMeta(token, nwt, n, v, c, subcat=None, penalty=defaultpenalty,
+ backplacement=bpl_none)
+ newmd.append(meta1)
+
+ else:
+ newtoken = Token(token.word, token.pos)
+ newtokens.append(newtoken)
+ tokenposlist.append(token.pos)
+
+ # adapt the metadata
+ if expansionfound:
+ meta2 = Meta('OrigCleanTokenPosList', tokenposlist, annotatedposlist=[],
+ annotatedwordlist=[], annotationposlist=tokenposlist,
+ annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=defaultpenalty,
+ backplacement=bpl_none)
+ newmd.append(meta2)
+ result = [TokenListMD(newtokens, newmd)]
+ else:
+ result = []
+
+ return result
+
+
def lexcheck(intokensmd, allalternativemds):
finalalternativemds = [intokensmd]
for alternativemd in allalternativemds:
@@ -708,7 +756,7 @@ def explanationasreplacement(tokensmd, tree):
bpl = bpl_node if known_word(oldword) else bpl_word
meta = mkSASTAMeta(oldtoken, newtoken, name='ExplanationasReplacement',
value='ExplanationasReplacement',
- cat='Lexical Error', backplacement=bpl_node)
+ cat='Lexical Error', backplacement=bpl)
newmetadata.append(meta)
result = TokenListMD(newtokens, newmetadata)
return result
@@ -925,10 +973,10 @@ def getwrongdetalternatives(tokensmd, tree, uttid):
meta = mkSASTAMeta(token, newcurtoken, name='GrammarError', value='deheterror', cat='Error',
backplacement=bpl_node)
metadata.append(meta)
+ correctiondone = True
else:
newcurtokenword = token.word
newtokens.append(Token(newcurtokenword, token.pos))
- correctiondone = True
else:
newcurtokenword = token.word
newtokens.append(token)
@@ -959,24 +1007,24 @@ def correctPdit(tokensmd, tree, uttid):
metadata = tokensmd.metadata
newtokens = []
tokenctr = 0
+ nonskiptokenctr = 0
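+    # separate counter over non-skipped tokens: the parse only contains nodes
+    # for tokens that were not skipped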
prevtoken = None
for token in tokens:
- tokennode = next(filter(lambda x: getattval(x, 'begin') == str(tokenctr), tokennodes), None)
+ tokennode = next(filter(lambda x: getattval(x, 'begin') == str(token.pos + token.subpos), tokennodes), None)
tokenlemma = getattval(tokennode, 'lemma')
if not token.skip and prevtoken is not None and not prevtoken.skip and tokenlemma in {'dit', 'dat', 'deze',
'die'}:
tokenrel = getattval(tokennode, 'rel')
tokenpt = getattval(tokennode, 'pt')
- prevtokennode = tokennodes[tokenctr - 1] if tokenctr > 0 else None
+ prevtokennode = tokennodes[nonskiptokenctr - 1] if tokenctr > 0 else None
if prevtokennode is not None:
prevpt = getattval(prevtokennode, 'pt')
prevparent = prevtokennode.getparent()
prevparentrel, prevparentcat = getattval(prevparent, 'rel'), getattval(prevparent, 'cat')
indezemwp = getindezemwp(prevtokennode, tokennode)
- if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'obj1',
- 'det'} and tokenpt == 'vnw') or \
+ if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'det'} and tokenpt == 'vnw') or \
indezemwp:
- newtoken = Token('hem', tokenctr)
+ newtoken = Token('hem', token.pos, subpos=token.subpos)
bpl = bpl_indeze if indezemwp else bpl_node
meta = mkSASTAMeta(token, newtoken, name='parsed as', value='hem', cat='AlpinoImprovement',
backplacement=bpl)
@@ -990,6 +1038,8 @@ def correctPdit(tokensmd, tree, uttid):
else:
newtokens.append(token)
tokenctr += 1
+ if not token.skip:
+ nonskiptokenctr += 1
prevtoken = token
result = TokenListMD(newtokens, metadata)
if correctiondone:
diff --git a/correcttreebank.py b/correcttreebank.py
index 9354add..25912d7 100644
--- a/correcttreebank.py
+++ b/correcttreebank.py
@@ -4,8 +4,7 @@
from lxml import etree
from basicreplacements import basicreplacements
-from cleanCHILDEStokens import cleantext
-from corrector import getcorrections, mkuttwithskips
+from corrector import getcorrections, mkuttwithskips, disambiguationdict
from lexicon import de, dets, known_word
from metadata import (Meta, bpl_delete, bpl_indeze, bpl_node, bpl_none,
bpl_word, bpl_wordlemma)
@@ -17,8 +16,13 @@
deletewordnodes, find1, getattval, getbeginend,
getcompoundcount, getnodeyield, getsentid,
gettokposlist, getyield, myfind, showflatxml,
- simpleshow, transplant_node)
+ simpleshow, transplant_node, showtree, treeinflate, fatparse, treewithtokenpos,
+ updatetokenpos, getuttid)
from config import PARSE_FUNC, SDLOGGER
+from metadata import insertion
+from sastatoken import inflate, deflate, tokeninflate, insertinflate
+from CHAT_Annotation import omittedword
+from cleanCHILDEStokens import cleantext
ampersand = '&'
@@ -123,61 +127,52 @@ def contextualise(node1, node2):
newnode.attrib[prop] = node2.attrib[prop]
return newnode
+def updatemetadata(metadata, tokenposdict):
+ begintokenposdict = {k-1: v-1 for (k, v) in tokenposdict.items()}
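+    # tokenposdict maps 1-based end positions; the metadata position lists
+    # hold 0-based begin positions, hence the shift by one on both sides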
+ newmetadata = []
+ for meta in metadata:
+ newmeta = deepcopy(meta)
+        newmeta.annotationposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos)
+                                     for pos in meta.annotationposlist]
+        newmeta.annotatedposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos)
+                                    for pos in meta.annotatedposlist]
+ newmetadata.append(newmeta)
+ return newmetadata
-def updatetokenpos(resulttree, tokenposdict):
- # resulttree = deepcopy(stree)
- for child in resulttree:
- newchild = updatetokenpos(child, tokenposdict)
- if ('pt' in resulttree.attrib or 'pos' in resulttree.attrib) and 'end' in resulttree.attrib and 'begin' in resulttree.attrib:
- intend = int(resulttree.attrib['end'])
- if intend in tokenposdict:
- newendint = tokenposdict[intend]
- resulttree.attrib['end'] = str(newendint)
- resulttree.attrib['begin'] = str(newendint - 1)
- else:
- SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend))
- etree.dump(resulttree)
- SDLOGGER.error('tokenposdict={}'.format(tokenposdict))
- elif 'cat' in resulttree.attrib:
- children = [ch for ch in resulttree]
- (b, e) = getbeginend(children)
- resulttree.attrib['begin'] = b
- resulttree.attrib['end'] = e
+def updatetokenposmd(intree, metadata, tokenposdict):
+ resulttree = updatetokenpos(intree, tokenposdict)
+ newmetadata = updatemetadata(metadata, tokenposdict)
+ return resulttree, newmetadata
- return resulttree
def findskippednodes(stree, tokenlist):
+ debug = False
+ if debug:
+ showtree(stree, text='findskippednodes:stree:')
topnode = find1(stree, './/node[@cat="top"]')
- # tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))}
- tokenposdict = {}
- elctr = 0
- i = 0
- for tok in tokenlist:
- elctr += 1
- if not tok.skip:
- tokenposdict[elctr] = i + 1
- i += 1
- resultlist = findskippednodes2(topnode, tokenposdict)
+ #tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))}
+ tokenposset = {t.pos + 1 for t in tokenlist if not t.skip}
+ resultlist = findskippednodes2(topnode, tokenposset)
return resultlist
-def findskippednodes2(stree, tokenposdict):
+def findskippednodes2(stree, tokenposset):
resultlist = []
if stree is None:
return resultlist
if 'pt' in stree.attrib or 'pos' in stree.attrib:
- if int(stree.attrib['end']) not in tokenposdict:
+ if int(stree.attrib['end']) not in tokenposset:
resultlist.append(stree)
elif 'cat' in stree.attrib:
for child in stree:
- resultlist += findskippednodes2(child, tokenposdict)
+ resultlist += findskippednodes2(child, tokenposset)
else:
pass
return resultlist
-def insertskips(newstree, tokenlist, stree):
+
+
+def insertskips(newstree, tokenlist, stree):
'''
:param newstree: the corrected tree, with skipped elements absent
@@ -185,58 +180,81 @@ def insertskips(newstree, tokenlist, stree):
:param stree: original stree with parses of the skipped elements
:return: adapted tree, with the skipped elements inserted (node from the original stree as -- under top, begin/ends updates
'''
- # debug = True
debug = False
if debug:
- print('\nnewstree:')
- etree.dump(newstree)
- resulttree = deepcopy(newstree)
+ showtree(newstree, 'newstree:')
+ showtree(stree, 'stree')
+ reducedtokenlist = [t for t in tokenlist if not t.skip]
+ resulttree = treewithtokenpos(newstree, reducedtokenlist)
+
+ if debug:
+ showtree(resulttree, text='resulttree:')
+    streetokenlist = [t for t in tokenlist if t.subpos == 0]
+ stree = treewithtokenpos(stree, streetokenlist)
+ if debug:
+ showtree(stree, text='stree with tokenpos:')
+ debug = False
# tokenpostree = deepcopy(stree)
# update begin/ends
- reducedtokenlist = [t for t in tokenlist if not t.skip]
- tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))}
- resulttree = updatetokenpos(resulttree, tokenposdict)
+ #next not needed anymore
+ #tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))}
+ #showtree(resulttree, text='in: ')
+ #resulttree, newmetadata = updatetokenposmd(resulttree, metadata, tokenposdict)
+ #showtree(resulttree, text='out:')
# tokenpostree = updatetokenpos(tokenpostree, tokenposdict)
- if debug:
- print('\nstree:')
- etree.dump(stree)
- # print('\ntokenpostree:')
- # etree.dump(tokenpostree)
- print('\nresulttree:')
- etree.dump(resulttree)
+ #if debug:
+ # print('\nstree:')
+ # etree.dump(stree)
+ # # print('\ntokenpostree:')
+ # # etree.dump(tokenpostree)
+ # print('\nresulttree:')
+ # etree.dump(resulttree)
# insert skipped elements
nodestoinsert = findskippednodes(stree, tokenlist)
nodestoinsertcopies = [deepcopy(n) for n in nodestoinsert]
- # simpleshow(stree)
+ if debug:
+ showtree(stree, text='insertskips: stree:')
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
topnode = find1(resulttree, './/node[@cat="top"] ')
topchildren = [ch for ch in topnode]
allchildren = nodestoinsertcopies + topchildren
sortedchildren = sorted(allchildren, key=lambda x: x.attrib['end'], reverse=True)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
for ch in topnode:
topnode.remove(ch)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
for node in sortedchildren:
node.attrib['rel'] = '--' # these are now extragrammatical with relation --
topnode.insert(0, node)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
(b, e) = getbeginend(sortedchildren)
topnode.attrib['begin'] = b
topnode.attrib['end'] = e
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
sentlist = getyield(resulttree)
sent = space.join(sentlist)
sentnode = find1(resulttree, 'sentence')
sentnode.text = sent
if debug:
- print('result of insertskips')
- etree.dump(resulttree)
+ showtree(resulttree, 'result of insertskips')
return resulttree
+def getomittedwordbegins(metalist):
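+    # begin positions of words annotated as omitted (CHAT 0-words); these are
+    # deleted from the corrected tree again, so the yield check must ignore them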
+ results = []
+ for meta in metalist:
+ if meta.name == omittedword:
+ results += meta.annotatedposlist
+ return results
+
def correct_stree(stree, method, corr):
'''
@@ -255,7 +273,7 @@ def correct_stree(stree, method, corr):
print(showflatxml(stree))
allmetadata = []
- allorandalts = []
+ orandalts = []
# uttid:
uttid = getuttid(stree)
@@ -266,7 +284,7 @@ def correct_stree(stree, method, corr):
origutt = getorigutt(stree)
if origutt is None:
SDLOGGER.error('Missing origutt in utterance {}'.format(uttid))
- return stree
+ return stree, orandalts
# list of token positions
# get the original metadata; these will be added later to the tree of each correction
@@ -282,19 +300,33 @@ def correct_stree(stree, method, corr):
# allmetadata += origmetadata
# clean in the tokenized manner
- cleanutt, chatmetadata = cleantext(origutt, False)
+ cleanutttokens, chatmetadata = cleantext(origutt, False, tokenoutput=True)
allmetadata += chatmetadata
- cleanutttokens = sasta_tokenize(cleanutt)
+ #cleanutttokens = sasta_tokenize(cleanutt)
cleanuttwordlist = [t.word for t in cleanutttokens]
+ cleanutt = space.join(cleanuttwordlist)
- # get corrections, given the stree
+ # get corrections, given the inflated stree
+ #inflate the tree
+ fatstree = deepcopy(stree)
+ treeinflate(fatstree)
+ # adapt the begins and ends in the tree based on the token positions
+ debug = False
+ if debug:
+ showtree(fatstree, text='fatstree voor:')
+ tokenlist = [t for t in cleanutttokens]
+ fatstree = treewithtokenpos(fatstree, tokenlist)
+ if debug:
+ showtree(fatstree, text='fatstree na:')
+ debug = False
+    # showtree(fatstree, text='fattened tree:')
- ctmds = getcorrections(cleanutt, method, stree)
+ ctmds = getcorrections(cleanutttokens, method, fatstree)
+ debug = False
if debug:
- print('2:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatstree, text='2:')
+ debug = False
ptmds = []
for correctiontokenlist, cwmdmetadata in ctmds:
@@ -302,70 +334,89 @@ def correct_stree(stree, method, corr):
correctionwordlist = tokenlist2stringlist(correctiontokenlist, skip=True)
# parse the corrections
- if correctionwordlist != cleanuttwordlist:
- # @@@adapt this, skip the tokens to be skipped@@@
- # correction = space.join(correctionwordlist)
+ if correctionwordlist != cleanuttwordlist and correctionwordlist != []:
correction, tokenposlist = mkuttwithskips(correctiontokenlist)
cwmdmetadata += [Meta('parsed_as', correction, cat='Correction', source='SASTA')]
- newstree = PARSE_FUNC(correction)
- if newstree is None:
- newstree = stree # is this what we want?@@
+ reducedcorrectiontokenlist = [token for token in correctiontokenlist if not token.skip]
+ fatnewstree = fatparse(correction, reducedcorrectiontokenlist)
+ debugb = False
+ if debugb:
+ showtree(fatnewstree, text='fatnewstree')
+
+ if fatnewstree is None:
+ fatnewstree = fatstree # is this what we want?@@
else:
# insert the leftout words and adapt the begin/ends of the nodes
# simpleshow(stree)
- newstree = insertskips(newstree, correctiontokenlist, stree)
+ fatnewstree = insertskips(fatnewstree, correctiontokenlist, fatstree)
+ #newstree = insertskips(newstree, correctiontokenlist, stree)
# simpleshow(stree)
mdcopy = deepcopy(origmetadata)
- newstree.insert(0, mdcopy)
+ fatnewstree.insert(0, mdcopy)
# copy the sentid attribute
- sentencenode = getsentencenode(newstree)
+ sentencenode = getsentencenode(fatnewstree)
if sentencenode is not None:
sentencenode.attrib['sentid'] = sentid
- if debug:
- print(etree.tostring(newstree, pretty_print=True))
- # etree.dump(newstree)
+ if debugb:
+ showtree(fatnewstree)
+ # etree.dump(fatnewstree)
else:
# make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata
- newstree = add_metadata(stree, chatmetadata)
+ fatnewstree = add_metadata(fatstree, chatmetadata)
- ptmds.append((correctionwordlist, newstree, cwmdmetadata))
+ ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata))
# select the stree for the most promising correction
+ debug = False
if debug:
print('3:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatnewstree)
+ debug = False
if ptmds == []:
- thecorrection, orandalts = (cleanutt, stree, origmetadata), None
+ thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
elif corr in [corr1, corrn]:
- thecorrection, orandalts = selectcorrection(stree, ptmds, corr)
+ thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr)
else:
SDLOGGER.error('Illegal correction value: {}. No corrections applied'.format(corr))
- thecorrection, orandalts = (cleanutt, stree, origmetadata), None
+ thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thetree = deepcopy(thecorrection[1])
- if debug:
- print('4:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ #debuga = True
+ debuga = False
+ if debuga:
+ print('4: (fatstree)')
+ etree.dump(fatstree, pretty_print=True)
# do replacements in the tree
- # etree.dump(thetree)
+ if debuga:
+ print('4b: (thetree)')
+ etree.dump(thetree, pretty_print=True)
reverseposindex = gettokposlist(thetree)
+ if debuga:
+ print('4b: (thetree)')
+ etree.dump(thetree, pretty_print=True)
+
# resultposmeta = selectmeta('cleanedtokenpositions', allmetadata)
# resultposlist = resultposmeta.value
newcorrection2 = thecorrection[2]
nodes2deletebegins = []
+    # next steps adapted: the tree already has inflated ('fat') token positions
+ debug = False
+ if debug:
+ showtree(thetree, text='thetree before treewithtokenpos')
+ thetree = treewithtokenpos(thetree, correctiontokenlist)
+ if debug:
+ showtree(thetree, text='thetree after treewithtokenpos')
for meta in thecorrection[2]:
if meta.backplacement == bpl_node:
nodeend = meta.annotationposlist[-1] + 1
newnode = myfind(thetree, './/node[@pt and @end="{}"]'.format(nodeend))
- oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend))
+ oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend))
if newnode is not None and oldnode is not None:
# adapt oldnode1 for contextual features
contextoldnode = contextualise(oldnode, newnode)
@@ -374,7 +425,7 @@ def correct_stree(stree, method, corr):
nodeend = meta.annotationposlist[-1] + 1
nodexpath = './/node[@pt and @begin="{}" and @end="{}"]'.format(nodeend - 1, nodeend)
newnode = myfind(thetree, nodexpath)
- oldnode = myfind(stree, nodexpath)
+ oldnode = myfind(fatstree, nodexpath)
if newnode is not None and oldnode is not None:
if 'word' in newnode.attrib and 'word' in oldnode.attrib:
newnode.attrib['word'] = oldnode.attrib['word']
@@ -403,28 +454,39 @@ def correct_stree(stree, method, corr):
elif meta.backplacement == bpl_indeze:
nodebegin = meta.annotatedposlist[-1]
nodeend = nodebegin + 1
- oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend))
+ oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend))
if oldnode is not None:
nodeid = oldnode.attrib['id']
dezeAVnode = etree.fromstring(dezeAVntemplate.format(begin=nodebegin, end=nodeend, id=nodeid))
thetree = transplant_node(oldnode, dezeAVnode, thetree)
- # etree.dump(thetree, pretty_print=True)
+ #etree.dump(thetree, pretty_print=True)
+
+ # now do all the deletions at once, incl adaptation of begins and ends, and new sentence node
+ debug = False
+ if debug:
+ showtree(thetree, text='thetree before deletion:')
- # now do all the deletions at once, incl normalisation of begins and ends, and new sentence node
+ nodes2deletebegins = [int(b) for b in nodes2deletebegins]
thetree = deletewordnodes(thetree, nodes2deletebegins)
+ if debug:
+ showtree(thetree, text='thetree after deletion:')
+
+ debug = False
+
# adapt the metadata
cleantokposlist = [meta.annotationwordlist for meta in newcorrection2 if meta.name == 'cleanedtokenpositions']
cleantokpos = cleantokposlist[0] if cleantokposlist != [] else []
- newcorrection2 = [updatecleantokmeta(meta, nodes2deletebegins, cleantokpos) for meta in newcorrection2]
+    insertbegins = [meta.annotatedposlist for meta in newcorrection2 if meta.name == insertion]
+ flatinsertbegins = [str(v) for el in insertbegins for v in el]
+ purenodes2deletebegins = [str(v) for v in nodes2deletebegins if str(v) not in flatinsertbegins]
+ newcorrection2 = [updatecleantokmeta(meta, purenodes2deletebegins, cleantokpos) for meta in newcorrection2]
- # etree.dump(thetree, pretty_print=True)
+ #etree.dump(thetree, pretty_print=True)
if debug:
- print('5:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatstree, text='5:')
restoredtree = thetree
@@ -451,12 +513,19 @@ def correct_stree(stree, method, corr):
metadata.append(meta.toElement())
if debug:
- streesentlist = getyield(stree)
+ streesentlist = getyield(fatstree)
fulltreesentlist = getyield(fulltree)
if streesentlist != fulltreesentlist:
SDLOGGER.warning('Yield mismatch\nOriginal={original}\nAfter correction={newone}'.format(original=streesentlist,
newone=fulltreesentlist))
-
+ rawoldleavenodes = getnodeyield(fatstree)
+ omittedwordbegins = getomittedwordbegins(newcorrection2)
+ oldleavenodes = [n for n in rawoldleavenodes if int(getattval(n, 'begin')) not in omittedwordbegins]
+    oldleaves = [getattval(n, 'word') for n in oldleavenodes]
+ newleaves = getyield(fulltree)
+ uttid = getuttid(stree)
+ if debug and oldleaves != newleaves:
+ SDLOGGER.error('Yield mismatch:{uttid}\n:OLD={oldleaves}\nNEW={newleaves}'.format(uttid=uttid, oldleaves=oldleaves, newleaves=newleaves))
# return this stree
# print('dump 2:')
# etree.dump(fulltree, pretty_print=True)
@@ -487,7 +556,7 @@ def updatecleantokmeta(meta, begins, cleantokpos):
return meta
-def getuttid(stree):
+def oldgetuttid(stree):
uttidlist = stree.xpath(uttidxpath)
if uttidlist == []:
SDLOGGER.error('Missing uttid')
@@ -507,14 +576,14 @@ def getorigutt(stree):
def scorefunction(obj): return (-obj.unknownwordcount, -obj.dpcount, -obj.dhyphencount, obj.goodcatcount,
- -obj.basicreplaceecount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount,
+ -obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount,
obj.compoundcount, obj.sucount, obj.svaok, -obj.deplusneutcount, -obj.penalty)
class Alternative():
def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcount,
compcount, supcount, compoundcount, unknownwordcount, sucount, svaok, deplusneutcount, goodcatcount,
- hyphencount, basicreplaceecount):
+ hyphencount, basicreplaceecount, ambigcount):
self.stree = stree
self.altid = altid
self.altsent = altsent
@@ -532,6 +601,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcou
self.goodcatcount = int(goodcatcount)
self.hyphencount = int(hyphencount)
self.basicreplaceecount = int(basicreplaceecount)
+ self.ambigcount = int(ambigcount)
def alt2row(self, uttid, base, user1='', user2='', user3='', bestaltids=[], selected=None, origsent=None):
scores = ['BEST'] if self.altid in bestaltids else []
@@ -651,6 +721,12 @@ def isvalidword(w):
return True
+def countambigwords(stree):
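+    # count leaf nodes whose (lowercased) word form occurs in disambiguationdict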
+ leaves = getnodeyield(stree)
+ ambignodes = [leave for leave in leaves if getattval(leave, 'word').lower() in disambiguationdict]
+ result = len(ambignodes)
+ return result
+
def selectcorrection(stree, ptmds, corr):
# to be implemented@@
# it is presupposed that ptmds is not []
@@ -677,9 +753,10 @@ def selectcorrection(stree, ptmds, corr):
hyphencount = len([node for node in nt.xpath('.//node[contains(@word, "-")]')])
basicreplaceecount = len([node for node in nt.xpath('.//node[@word]')
if getattval(node, 'word').lower() in basicreplacements])
+ ambigwordcount = countambigwords(nt)
alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, dimcount, compcount, supcount,
compoundcount, unknownwordcount, sucount, svaokcount, deplusneutcount, goodcatcount,
- hyphencount, basicreplaceecount)
+ hyphencount, basicreplaceecount, ambigwordcount)
alts[altid] = alt
altid += 1
orandalts = OrigandAlts(orig, alts)
diff --git a/find_ngram.py b/find_ngram.py
index e4a661c..1f1e41f 100644
--- a/find_ngram.py
+++ b/find_ngram.py
@@ -192,6 +192,8 @@ def cond17(ns, lvs, i): return lemma(ns[0]) == 'te' and getattval(ns[1], 'his')
def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen' and lemma(ns[2]) == 'te'
+def cond18(ns, lvs, i): return pt(ns[0]) == 'vz' and lemma(ns[1]) in {'dit', 'dat', 'deze', 'die'}
+
ngram1 = Ngram(4, cond1)
ngram2 = Ngram(4, cond2)
ngram3 = Ngram(2, cond3)
@@ -211,7 +213,7 @@ def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen'
ngram16a = Ngram(4, cond16a) # geen beroerte een beroerte test
ngram17 = Ngram(4, cond17) # te kregen te krijgen
ngram17a = Ngram(4, cond17a) # te kregen te krijgen test
-
+ngram18 = Ngram(2, cond18) # met dit
def main():
@@ -231,7 +233,7 @@ def main():
leaves = getnodeyield(tree)
cleanleaves = [leave for leave in leaves if getattval(leave, 'word') not in filledpauseslexicon]
cleanwordlist = [getattval(leave, 'word') for leave in cleanleaves]
- matches = findmatches(ngram1, cleanleaves)
+ matches = findmatches(ngram18, cleanleaves)
# matches = sipvjpvjsi(cleanleaves, tree)
for match in matches:
uttid = getuttid(tree)
diff --git a/lexicon.py b/lexicon.py
index 9692346..380e1dc 100644
--- a/lexicon.py
+++ b/lexicon.py
@@ -12,6 +12,9 @@
lexicon = celex
+# Alpino often analyses certain words as tsw though they should be analysed as nouns
+tswnouns = ['baby', 'jongen', 'juf', 'juffrouw', 'mam', 'mama', 'mamma', 'meisje', 'mens', 'meneer', 'mevrouw',
+ 'pap', 'papa', 'pappa', 'stouterd', 'opa', 'oma']
de = '1'
het = '2'
diff --git a/macros/newimperatives.txt b/macros/newimperatives.txt
index 2ab72e1..490a40f 100644
--- a/macros/newimperatives.txt
+++ b/macros/newimperatives.txt
@@ -54,8 +54,11 @@ nonfinvc = """(@rel="vc" and %nonfincat%) """
realcomplormodnode = """node[%realcomplormod%]"""
realcomplormod = """(not(%particlesvp%) and not(%indexnode%) and not(%nonfinvc%) and not(@rel="hd"))"""
indexnode = """(@index and not (@cat or @pt or @pos))"""
+suindexnode = """(%indexnode% and @rel="su") """
nonfinindexnode = """(%indexnode% and parent::node[%nonfinvc%])"""
+fillednode = """node[not(%indexnode%)]"""
+
particlesvp = """(@rel="svp" and @pt="vz")"""
realcomplormodnodecount = """count(%realcomplormodnode% | node[%nonfinvc%]/%realcomplormodnode%)"""
@@ -94,9 +97,23 @@ wond5plus = """(%ynquery% and %realcomplormodnodecount% >= 4)"""
partofwhquestion = """((@cat="sv1" or @cat="ssub") and @rel="body" and parent::node[@cat="whq" or @cat="whsub" ]) """
declarative = """(@cat="smain" or (@cat="ssub" and not(%partofwhquestion%)) or (@cat="sv1" and not(%basicimperative%) and not(%ynquery%) and not(%partofwhquestion%)) )"""
-Tarsp_OndWB = """
-(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 )
-"""
+Tarsp_OndB = """(%Ond% and node[%Tarsp_Basic_B%] and count(node) = 2)"""
+
+Tarsp_OndVC = """(%Ond% and node[%Tarsp_Basic_VC%] and count(node) = 2) """
+
+Tarsp_OndBVC = """(%Ond% and node[%Tarsp_Basic_B%] and node[%Tarsp_Basic_VC%] and count(node) = 3) """
+
+Tarsp_OndW = """(%declarative% and %Ond% and (%Tarsp_W% or node[%Tarsp_onlyWinVC%]) and %realcomplormodnodecount% = 0 )"""
+
+Tarsp_onlyWinVC = """(@rel="vc" and node[@rel="hd" and @pt="ww" and %realcomplormodnodecount% = 0])"""
+
+
+Tarsp_OndWB = """(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 )"""
+
+Tarsp_BasicVCW = """(node[@pt="ww" and @rel="hd"] and node[%Tarsp_Basic_VC%] and count(%fillednode%)=2)"""
+
+Tarsp_VCW_X = """(%Tarsp_BasicVCW% or (node[%nonfinvc% and %Tarsp_BasicVCW%] and count(node)=1) )"""
+
Tarsp_OndWBVC = """
(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %Tarsp_VC_X% and %realcomplormodnodecount% = 3 )
@@ -180,6 +197,8 @@ Tarsp_Ov3 = """(%declarative% and
not(%Tarsp_OndWB%) and
not(%Tarsp_BBX%)and
not(%Tarsp_WBVC%) and
+ not(%Tarsp_OndB%) and
+ not(%Tarsp_OndVC%) and
%realcomplormodnodecount% = 2) """
@@ -190,11 +209,12 @@ Tarsp_kijkVU = """(@pt="ww" and @lemma="kijken" and @wvorm="pv" and @pvagr="ev"
Tarsp_pporvc = """ (((@rel="pc" or @rel="mod" or @rel="ld") and @cat="pp") or @rel="vc")"""
-Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"]) and
+Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"] or %Tarsp_BarenonfinW%) and
not(%Tarsp_kijkVU%) and
not((@lemma="zijn" or @lemma="worden") and
parent::node[node[@rel="vc"]]) )"""
-
+
+Tarsp_BarenonfinW = """parent::node[@rel="vc" and parent::node[@cat="smain" and count(node)=1]]"""
Tarsp_Hwwi = """(( @pt="ww" and @rel="hd" and @wvorm="pv" and
%Tarsp_hww% and
diff --git a/macros/sastamacros1.txt b/macros/sastamacros1.txt
index e976d0c..5034941 100644
--- a/macros/sastamacros1.txt
+++ b/macros/sastamacros1.txt
@@ -37,9 +37,9 @@ JO_kijken_naar = """ parent::node[@cat="pp" and
robusttopicdrop = """(@cat="sv1" and ../node[@lemma="."])"""
Tarsp_hww = """
- (@lemma="kunnen" or
+ (@lemma = "kunnen" or
@lemma = "moeten" or
- @lemma= "hoeven" or
+ @lemma = "hoeven" or
@lemma = "blijven" or
@lemma = "willen" or
@lemma = "zullen" or
@@ -59,6 +59,7 @@ Tarsp_vc_sibling = """parent::node[ node[@rel="vc"]]"""
Tarsp_predc_sibling = """parent::node[ node[@rel="predc"]]"""
Tarsp_obj1_sibling = """parent::node[ node[@rel="obj1"]]"""
Tarsp_ld_sibling = """parent::node[ node[@rel="ld"]]"""
+Tarsp_onlymodR_sibling = """(parent::node[node[@rel="mod" and %Rpronoun%] and not(node[@rel="predc"])])"""
Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and
((
@@ -66,7 +67,7 @@ Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and
@lemma = "hebben"
) and
not(%Tarsp_vc_sibling%)) or
- (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling%)
+ (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling% )
)
"""
@@ -78,7 +79,7 @@ Tarsp_Kop = """
((%Tarsp_predc_sibling% and not(%Tarsp_obj1_sibling%)) or
- (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%))
+ (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%) and not(%Tarsp_onlymodR_sibling%))
)
)
"""
@@ -128,6 +129,13 @@ pv = """(@pt="ww" and @wvorm="pv" )"""
bxnp1 = """(@cat="np" and count(node)=2 and node[@rel="hd" and @pt="ww"] and node[@rel="mod" and @pt])"""
bxnp2 = """(@cat="np" and count(node)=2 and node[@rel="hd"] and node[@rel="mod" and %singlewordbw%])"""
+Tarsp_Basic_VC = """((@rel="obj1" or @rel="pc" or @rel="predc" or @rel="ld" or @rel="obj2" or %Tarsp_finvc% or %Tarsp_vcvnw% or (@rel="svp" and @pt!="vz")) and not(%Tarsp_Basic_B%) )"""
+
+
+Tarsp_Basic_B = """(@rel="mod" or @rel="ld" or @rel="predm" or %Tarsp_B_predc%) """
+
+Tarsp_B_predc = """(@rel="predc" and (@pt="vz" or @pt="bw" or @cat="pp" or @cat="advp" or %Rpronoun%))"""
+
Tarsp_B = """(
((((@rel="mod" or @rel="ld" or @rel="predm") and
(not(@cat) or @cat!="conj") and
@@ -169,9 +177,13 @@ pobj1B = """(@rel="pc" and ../node[@rel="hd" and %locverb%])"""
singlewordbw = """ (@pt="bw" or %Rpronoun% or %adjadv%)
"""
+
+
corephrase = """(@cat="np" or @cat="pp" or @cat="advp" or @cat="ap")"""
-coreBX = """(node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!=niet]/@begin ]])"""
+coreBX = """((node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!="niet"]/@begin ]]) )"""
+
+Tarsp_bnonfin = """((@cat="inf" or @cat="ppart") and @rel="vc" and parent::node[@cat="smain" and count(node)=1] and node[%Tarsp_B%] and node[@pt="ww" and @rel="hd"] and count(node[%realcomplormod%])=1 )"""
ASTA_pred = """(@rel="predc" or @rel="predm" or (@rel="hd" and parent::node[@rel="predc" or @rel="predm"]))"""
@@ -301,7 +313,12 @@ spec_noun = """ (@pt="spec" and (@pos="name" or starts-with(@frame,"proper_name"
"""
- asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or (@pt="ww" and @positie="nom") or (%monthname%) or @pos="name")
+ asta_numvrij = """(@pt="tw" and @positie="vrij" and @rel!="mwp" and @rel!="det" and @rel!="mod" )"""
+
+ asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or
+ (@pt="ww" and @positie="nom") or
+ (%monthname%) or
+ @pos="name" )
"""
@@ -479,3 +496,11 @@ robustdelpv = """(not(@rel="dp" and @begin > ancestor::node[@cat="top"]/descenda
delpv = """(%coredelpv% and %robustdelpv%)"""
+Vobij = """(@pt="bw" and (contains(@frame,"er_adverb" ) or contains(@frame, "tmp_adverb") or @lemma="daarom") and
+@lemma!="er" and @lemma!="daar" and @lemma!="hier" and (starts-with(@lemma, 'er') or starts-with(@lemma, 'daar') or starts-with(@lemma, 'hier')))"""
+
+Tarsp_VzN = """(%vzn1xpath% or %vzn2xpath% ) """
+
+vzn1xpath = """(@cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"])))"""
+vzn2xpath = """(node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"])"""
+vzn3xpath = """(@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] )"""
diff --git a/metadata.py b/metadata.py
index 2760f14..0205aaa 100644
--- a/metadata.py
+++ b/metadata.py
@@ -17,7 +17,8 @@
class Meta:
def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], annotatedposlist=[],
- annotatedwordlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty,
+ annotatedwordlist=[], annotationcharlist=[], annotationcharposlist=[], annotatedcharlist=[],
+ annotatedcharposlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty,
backplacement=defaultbackplacement):
self.atype = atype
self.name = name
@@ -25,6 +26,10 @@ def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], ann
self.annotationposlist = annotationposlist
self.annotatedwordlist = annotatedwordlist
self.annotatedposlist = annotatedposlist
+ self.annotationcharlist = annotationcharlist
+ self.annotationcharposlist = annotationcharposlist
+ self.annotatedcharlist = annotatedcharlist
+ self.annotatedcharposlist = annotatedcharposlist
self.value = value
self.cat = cat
self.subcat = subcat
@@ -93,3 +98,7 @@ def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalt
repetition = 'Repetition'
fstoken = 'Retraced token'
falsestart = 'Retracing with Correction'
+insertion = 'Insertion'
+smallclause = 'Small Clause Treatment'
+tokenmapping = 'Token Mapping'
+insertiontokenmapping = 'Insertion Token Mapping'
\ No newline at end of file
diff --git a/methods/ASTA Index Current.xlsx b/methods/ASTA Index Current.xlsx
index b575d44..ad598d8 100644
Binary files a/methods/ASTA Index Current.xlsx and b/methods/ASTA Index Current.xlsx differ
diff --git a/methods/TARSP Index 2022-01-07.xlsx b/methods/TARSP Index 2022-01-07.xlsx
new file mode 100644
index 0000000..1547465
Binary files /dev/null and b/methods/TARSP Index 2022-01-07.xlsx differ
diff --git a/methods/TARSP Index Current.xlsx b/methods/TARSP Index Current.xlsx
index 75e9075..6b00d71 100644
Binary files a/methods/TARSP Index Current.xlsx and b/methods/TARSP Index Current.xlsx differ
diff --git a/methods/~$TARSP Index Current.xlsx b/methods/~$TARSP Index Current.xlsx
new file mode 100644
index 0000000..8a7c89f
Binary files /dev/null and b/methods/~$TARSP Index Current.xlsx differ
diff --git a/mismatches.py b/mismatches.py
index 214b938..9686ae5 100644
--- a/mismatches.py
+++ b/mismatches.py
@@ -1,10 +1,11 @@
-
import os
from collections import Counter
from copy import copy
from lxml import etree
from config import SDLOGGER
from treebankfunctions import getyield, getmarkedyield, getattval
+from sastatoken import deflate
+
tab = '\t'
space = ' '
eps = ''
@@ -13,6 +14,7 @@
usercommentuntil = 3
usercommentdefaultvalue = eps
+
def getmarkedutt(m, syntree):
thewordlist = getyield(syntree)
thepositions = getwordpositions(m, syntree)
@@ -20,10 +22,12 @@ def getmarkedutt(m, syntree):
yieldstr = space.join(themarkedyield)
return yieldstr
+
def mark(str):
- result = '*'+ str + '*'
+ result = '*' + str + '*'
return result
+
def getwordpositionsold(matchtree, syntree):
positions1 = []
for node in matchtree.iter():
@@ -35,7 +39,7 @@ def getwordpositionsold(matchtree, syntree):
for node in syntree.iter():
if 'index' in node.attrib and ('pt' in node.attrib or 'cat' in node.attrib or 'pos' in node.attrib):
theindex = node.attrib['index']
- indexednodes[theindex]=node
+ indexednodes[theindex] = node
thequery2 = ".//node[@index and not(@pt) and not(@cat)]"
try:
@@ -49,8 +53,9 @@ def getwordpositionsold(matchtree, syntree):
result = [int(p) for p in positions]
return result
+
def getwordpositions(matchtree, syntree):
- #nothing special needs to be done for index nodes since they also have begin and end
+ # nothing special needs to be done for index nodes since they also have begin and end
positions = []
for node in matchtree.iter():
if 'end' in node.attrib:
@@ -58,6 +63,7 @@ def getwordpositions(matchtree, syntree):
result = [int(p) for p in positions]
return result
+
def getfirstwordposition(matchtree):
if 'begin' in matchtree.attrib:
positionstr = getattval(matchtree, 'begin')
@@ -67,7 +73,6 @@ def getfirstwordposition(matchtree):
return position
-
def getmarkedyield(wordlist, positions):
pos = 1
resultlist = []
@@ -102,8 +107,23 @@ def mismatches(queryid, queries, theresultsminusgold, goldminustheresults, allma
uttstr]
print(tab.join(platinumcheckrow2), file=platinumcheckfile)
-def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile, permsilverdatadict={}, annotationinput=False):
+def getmarkposition(position, nodeendmap, uttid):
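+    # map a node 'end' value back to the 1-based surface word position via
+    # nodeendmap; position 0 (no specific word marked) and unmappable positions
+    # fall back to 1, with an error logged for the latter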
+ if position == 0:
+ result = 1
+ elif uttid in nodeendmap:
+ if str(position) in nodeendmap[uttid]:
+ result = nodeendmap[uttid][str(position)]
+ else:
+ SDLOGGER.error('getmarkposition: No mapping found for position {} in utterance {}'.format(position, uttid))
+ result = 1
+ else:
+ SDLOGGER.error('getmarkposition: No mappings found for uttid {}'.format(uttid))
+ result = 1
+ return result
+
+def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile,
+ permsilverdatadict={}, annotationinput=False):
theexactresults = exactresults[queryid] if queryid in exactresults else Counter()
theexactgoldscores = exactgoldscores[queryid] if queryid in exactgoldscores else Counter()
(theresultsminusgold, goldminustheresults, intersection) = exactcompare(theexactresults, theexactgoldscores)
@@ -117,13 +137,13 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
markedwordlist = getmarkedyield(allutts[uttid], [markposition])
uttstr = space.join(markedwordlist)
platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item,
- str(uttid), str(position), uttstr]
+ str(uttid), str(markposition), uttstr]
print(tab.join(platinumcheckrow1), file=platinumcheckfile)
key = (queryid, uttid, position)
usercomments = getusercomments(permsilverdatadict, key, report=True)
- xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1
+ xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1
newrows.append(xlplatinumcheckrow1)
- #for (m, syntree) in allmatches[(queryid, uttid)]:
+ # for (m, syntree) in allmatches[(queryid, uttid)]:
# if getfirstwordposition(m) == position:
# markedutt = getmarkedutt(m, syntree)
# platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item,
@@ -139,9 +159,11 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
markedwordlist = getmarkedyield(allutts[uttid], [markposition])
uttstr = space.join(markedwordlist)
else:
- SDLOGGER.warning('uttid {} not in alluts'.format(uttid))
+ SDLOGGER.warning('uttid {} not in allutts'.format(uttid))
uttstr = ""
- platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid), str(position),
+ markposition = 0
+ platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid),
+ str(markposition),
uttstr]
print(tab.join(platinumcheckrow2), file=platinumcheckfile)
key = (queryid, uttid, position)
@@ -150,6 +172,7 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
newrows.append(xlplatinumcheckrow2)
return newrows
+
def compareunaligned(resultctr, goldctr):
'''
@@ -168,20 +191,21 @@ def compareunaligned(resultctr, goldctr):
takefromresultlist.append((utt1, pos1))
takefromgoldlist.append((utt1, 0))
newintersection.append((utt1, pos1))
- curgoldlist.remove((utt1,0))
+ curgoldlist.remove((utt1, 0))
elif pos1 == 0:
for (utt2, pos2) in curgoldlist:
if utt1 == utt2:
takefromresultlist.append((utt1, pos1))
takefromgoldlist.append((utt1, pos2))
newintersection.append((utt1, pos2))
- curgoldlist.remove((utt2,pos2))
+ curgoldlist.remove((utt2, pos2))
break
takefromresultctr = Counter(takefromresultlist)
takefromgoldctr = Counter(takefromgoldlist)
newintersectionctr = Counter(newintersection)
return (takefromresultctr, takefromgoldctr, newintersectionctr)
+
def exactcompare(exactresults, exactgoldscores):
'''
compares two lists of exact results, i.e. dlists of pairs (uttid, position)
@@ -227,18 +251,21 @@ def getusercomments(permsilverdict, key, report=False):
SDLOGGER.warning('No silver remark for key: {}'.format(key))
return result
+
def testcompare():
- testresults = [(1,2),(1,2), (1,2), (1,5), (1,6),(2,0), (2, 4)]
- goldresults = [(1,2), (2,4), (2,6), (1,0), (3,5)]
- reftestminusgold = [(1,2), (1,5), (1,6)]
- refgoldminustest = [(3,5)]
- refintersection = [(1,2), (1,2), (2,4), (2,6)]
+ testresults = [(1, 2), (1, 2), (1, 2), (1, 5), (1, 6), (2, 0), (2, 4)]
+ goldresults = [(1, 2), (2, 4), (2, 6), (1, 0), (3, 5)]
+ reftestminusgold = [(1, 2), (1, 5), (1, 6)]
+ refgoldminustest = [(3, 5)]
+ refintersection = [(1, 2), (1, 2), (2, 4), (2, 6)]
(testminusgold, goldminustest, intersection) = exactcompare(testresults, goldresults)
- for (l, r,g ) in zip(['R-G', 'G-R', 'R*G'],[testminusgold, goldminustest, intersection],[reftestminusgold, refgoldminustest, refintersection]):
+ for (l, r, g) in zip(['R-G', 'G-R', 'R*G'], [testminusgold, goldminustest, intersection],
+ [reftestminusgold, refgoldminustest, refintersection]):
if r == g:
- print('{}: OK {} == {}'.format(l, r,g))
+ print('{}: OK {} == {}'.format(l, r, g))
else:
- print('{}: NO: {} != {}'.format(l, r,g))
+ print('{}: NO: {} != {}'.format(l, r, g))
+
if __name__ == '__main__':
- testcompare()
\ No newline at end of file
+ testcompare()
diff --git a/queryfunctions.py b/queryfunctions.py
index 954a618..c0412e6 100644
--- a/queryfunctions.py
+++ b/queryfunctions.py
@@ -7,7 +7,7 @@
vzn1basexpath = './/node[ @cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"]))]'
vzn1xpath = expandmacros(vzn1basexpath)
vzn2xpath = './/node[node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"]]'
-vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin=../node[@pt="vz"]/@end and count(node)<=3] ]'
+vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] ]'
#vzn4basexpath = './/node[node[@pt="vz" and @rel="hd" and ../node[%Rpronoun% and @rel="obj1" and @end <= ../node[@rel="hd"]/@begin]]]'
#vzn4xpath = expandmacros(vzn4basexpath)
diff --git a/readcsv.py b/readcsv.py
index 1cc5694..a6f9ecb 100644
--- a/readcsv.py
+++ b/readcsv.py
@@ -6,10 +6,10 @@
mysep = tab
-def readcsv(filename, sep=mysep, header=True, quotechar='"'):
+def readcsv(filename, sep=mysep, header=True, quotechar='"', encoding='utf8'):
result = []
try:
- infile = open(filename, 'r', encoding='utf8', newline='')
+ infile = open(filename, 'r', encoding=encoding, newline='')
except FileNotFoundError as e:
SDLOGGER.error(e)
return result
@@ -25,11 +25,11 @@ def readcsv(filename, sep=mysep, header=True, quotechar='"'):
return result
-def readheadedcsv(filename, sep=mysep, quotechar='"'):
+def readheadedcsv(filename, sep=mysep, quotechar='"', encoding='utf8'):
result = []
header = []
try:
- infile = open(filename, 'r', encoding='utf8', newline='')
+ infile = open(filename, 'r', encoding=encoding, newline='')
except FileNotFoundError as e:
SDLOGGER.error(e)
return header, result
diff --git a/sastadev.py b/sastadev.py
index cc2b549..653a943 100644
--- a/sastadev.py
+++ b/sastadev.py
@@ -43,7 +43,8 @@
from SAFreader import get_annotations, get_golddata, richscores2scores, exact2global, richexact2global
from SAFreader import all_levels
from external_functions import str2functionmap
-from treebankfunctions import getuttid, getyield, getmeta, getattval, getxmetatreepositions, getuttno, getuttidorno
+from treebankfunctions import getuttid, getyield, getmeta, getattval, getxmetatreepositions, getuttno, getuttidorno, \
+ showtree, getnodeendmap, getxselseuttid
from SRFreader import read_referencefile
from goldcountreader import get_goldcounts
from TARSPscreening import screening4stage
@@ -53,7 +54,7 @@
from query import pre_process, core_process, post_process, form_process, is_preorcore, query_inform, query_exists, \
is_pre, is_core
from macros import expandmacros
-from mismatches import mismatches, exactmismatches
+from mismatches import mismatches, exactmismatches, getmarkposition
from xlsx import mkworkbook
import xlsxwriter
from counterfunctions import counter2liststr
@@ -285,7 +286,7 @@ def isxpathquery(query):
def doqueries(syntree, queries, exactresults, allmatches, criterion):
uttid = getuttid(syntree)
- #uttid = getuttidorno(syntree)
+ # uttid = getuttidorno(syntree)
omittedwordpositions = getxmetatreepositions(syntree, 'Omitted Word', poslistname='annotatedposlist')
# print(uttid)
# core queries
@@ -313,6 +314,9 @@ def doqueries(syntree, queries, exactresults, allmatches, criterion):
exactresults[queryid] = []
# matchingids = [uttid for x in matches]
for m in matches:
+ # showtree(m)
+ if m is None:
+ showtree(syntree)
if (queryid, uttid) in allmatches:
allmatches[(queryid, uttid)].append((m, syntree))
else:
@@ -485,6 +489,18 @@ def exact2results(exactresults):
return results
+def adaptpositions(rawexactresults, nodeendmap):
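+    # rewrite the (uttid, position) pairs of the exact results so that node-based
+    # positions become surface word positions, using the per-utterance nodeendmap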
+ newexactresults = {}
+ for qid in rawexactresults:
+ newlist = []
+ for (uttid, position) in rawexactresults[qid]:
+ newposition = getmarkposition(position, nodeendmap, uttid)
+ newtuple = (uttid, newposition)
+ newlist.append(newtuple)
+ newexactresults[qid] = newlist
+ return newexactresults
+
+
def passfilter(rawexactresults, method):
'''
    lets only those through that satisfy the
@@ -669,6 +685,7 @@ def passfilter(rawexactresults, method):
platinumoutfilename, options.platinuminfilename, goldscores)
analysedtrees = []
+nodeendmap = {}
# @from now on we are dealing with a treebank, so add an if statement here - done
if annotationinput:
@@ -715,18 +732,31 @@ def passfilter(rawexactresults, method):
analysedtrees.append(syntree)
doprequeries(syntree, queries, rawexactresults, allmatches)
docorequeries(syntree, queries, rawexactresults, allmatches)
- uttid = getuttid(syntree)
- uttno = getuttno(syntree)
- allutts[uttno] = getyield(syntree)
- # allutts[uttid] = getyield(syntree)
+
+ # uttid = getuttid(syntree)
+ uttid = getxselseuttid(syntree)
+ # showtree(syntree)
+ if uttid in nodeendmap:
+ SDLOGGER.error('Duplicate uttid in sample: {}'.format(uttid))
+ nodeendmap[uttid] = getnodeendmap(syntree)
+
+ # uttno = getuttno(syntree)
+ # allutts[uttno] = getyield(syntree)
+ allutts[uttid] = getyield(syntree)
# determine exactresults and apply the filter to catch interdependencies between prequeries and corequeries
# rawexactresults = getexactresults(allmatches)
- exactresults = passfilter(rawexactresults, themethod)
+ rawexactresults2 = passfilter(rawexactresults, themethod)
+ exactresults = adaptpositions(rawexactresults2, nodeendmap)
+
+    # adapt the allutts and the rawexactresults2 here to undo expansions, based on the nodeendmap
+    # @@to be implemented@@ or perhaps already in the loop above?
# @ and from here on it can become common again; so an exactresults must also be produced for the annotation file
# @ the postfunctions for lemmas etc. may well have to be adapted
+# adapt the exactresults positions to the reference
+
coreresults = exact2results(exactresults)
@@ -959,7 +989,9 @@ def passfilter(rawexactresults, method):
logheader = ['datetime', 'treebank', 'scorenr', 'R', 'P', 'F1', 'P-R', 'P-P', 'P-F1', 'GP-R', 'GP-P', 'GP-F1', 'ref',
'method']
logname = 'sastalog.txt'
-biglogfile = open(logname, 'a', encoding='utf8')
+logpath = r'D:\jodijk\Dropbox\jodijk\myprograms\python\sastacode\sastadev'
+logfullname = os.path.join(logpath, logname)
+biglogfile = open(logfullname, 'a', encoding='utf8')
exactlynow = datetime.datetime.now()
now = exactlynow.replace(microsecond=0).isoformat()
diff --git a/sastatok.py b/sastatok.py
index adb6c04..6072d60 100644
--- a/sastatok.py
+++ b/sastatok.py
@@ -61,5 +61,5 @@ def sasta_tokenize(instring):
if instring is None:
return []
tokenstring = fullsastare.findall(instring)
- result = stringlist2tokenlist(tokenstring)
+ result = stringlist2tokenlist(tokenstring, start=10, inc=10)
return result
diff --git a/sastatoken.py b/sastatoken.py
index cdb987c..50004f6 100644
--- a/sastatoken.py
+++ b/sastatoken.py
@@ -15,12 +15,12 @@ def __repr__(self):
def __str__(self):
skipstr = ' (skip=True)' if self.skip else ''
- subposstr = '.{}' if self.subpos != 0 else ''
+ subposstr = '/{}'.format(self.subpos) if self.subpos != 0 else ''
result = '{}{}:{}{}'.format(self.pos, subposstr, self.word, skipstr)
return result
-def stringlist2tokenlist(list):
+def oldstringlist2tokenlist(list):
result = []
llist = len(list)
for el in range(llist):
@@ -29,6 +29,17 @@ def stringlist2tokenlist(list):
return result
+def stringlist2tokenlist(list, start=0, inc=1):
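+    # assign token positions start, start+inc, ...; with start=10, inc=10 (as used
+    # in sasta_tokenize) positions are inflated, leaving room for inserted tokens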
+ result = []
+ llist = len(list)
+ pos = start
+ for el in range(llist):
+ thetoken = Token(list[el], pos)
+ result.append(thetoken)
+ pos += inc
+ return result
+
+
def tokenlist2stringlist(tlist, skip=False):
if skip:
result = [t.word for t in tlist if not t.skip]
@@ -49,3 +60,24 @@ def show(tokenlist):
resultlist.append(str(token))
result = ', '.join(resultlist)
return result
+
+
+def tokeninflate(token):
+ result = inflate(token.pos) + token.subpos
+ return result
+
+
+def deflate(n: int):
+ result = (n // 10) - 1
+ return result
+
+
+def inflate(n: int):
+ result = (n + 1) * 10
+ return result
+
+
+def insertinflate(n: int):
+ dm = n % 10
+ result = ((n - dm) + 1) * 10 + dm
+ return result
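+
+
+# Illustrative sanity check of the inflation arithmetic (not part of the module):
+# inflate(0) == 10 and deflate(10) == 0, so deflate(inflate(n)) == n for any n;
+# insertinflate(12) == ((12 - 2) + 1) * 10 + 2 == 112, i.e. the subposition
+# digit 2 is preserved while the base position is inflated.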
diff --git a/smallclauses.py b/smallclauses.py
new file mode 100644
index 0000000..d44cbaa
--- /dev/null
+++ b/smallclauses.py
@@ -0,0 +1,292 @@
+from config import SDLOGGER
+from treebankfunctions import getstree, getnodeyield, getattval
+from dedup import filledpauseslexicon
+from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon
+from lexicon import known_word, tswnouns
+from namepartlexicon import namepart_isa_namepart
+from sastatoken import Token, show
+from tokenmd import TokenListMD
+from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\
+ insertiontokenmapping
+
+space = ' '
+biglocvzs = ['achter', 'beneden', 'binnen', 'boven', 'bovenop', 'buiten', 'dichtbij']
+#surenouns = ['mama', 'papa'] replaced by tswnouns from lexicon
+longvowels = ['a', 'é', 'i', 'o', 'u', 'y']
+vowels = ['a', 'e', 'i', 'o', 'u']
+
+uniquelynominativeperspros = ['ik', 'jij', 'hij', 'zij', 'wij', 'ikke', "'k", "k", "ie", "we"]
+
+
+def makegen(lemma):
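+    # heuristic Dutch genitive formation, e.g. Hans -> Hans', opa -> opa's,
+    # jongen -> jongens; returns None for missing or very short lemmas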
+ if lemma is None or len(lemma) < 2:
+ result = None
+ elif lemma[-1] in ['s', 'z', 'x']:
+ result = lemma + "'"
+    elif lemma[-2:] in ['ij']:
+ result = lemma + 's'
+ elif lemma[-2] in vowels and lemma[-1] in vowels:
+ result = lemma + 's'
+ elif lemma[-1] in longvowels:
+ result = lemma + "'s"
+ else:
+ result = lemma + 's'
+ return result
+
+def realword(node):
+ result = True
+ result = result and getattval(node, 'pt') not in ['tsw', 'let']
+ result = result and getattval(node, 'lemma') not in ['xx', 'xxx', 'yyy', 'www', 'hè']
+ result = result and getattval(node, 'lemma') not in filledpauseslexicon
+ result = result or lemma(node) in tswnouns
+ return result
+
+
+def hasgenitive(node):
+ lemma = getattval(node, 'lemma')
+ nodept = pt(node)
+ if nodept not in ['n', 'vnw']:
+ nodept = 'n'
+ result = (lemma, nodept) in genlexicon and 'yes' in genlexicon[(lemma, nodept)]
+ result = result or namepart_isa_namepart(lemma)
+ return result
+
+def aanwvnw(node):
+ result = getattval(node, 'pt') == 'vnw' and getattval(node, 'vwtype') == 'aanw' and not rpronoun(node)
+ return result
+
+
+def n(node):
+ result = getattval(node, 'pt') == 'n'
+ return result
+
+
+def getal(node):
+ result = getattval(node, 'getal')
+ return result
+
+def pt(node):
+ result = getattval(node, 'pt')
+ return result
+
+def bg(node):
+ result = int(getattval(node, 'begin'))
+ return result
+
+def tw(node):
+ result = getattval(node, 'pt') == 'tw'
+ return result
+
+def word(node):
+ result = getattval(node, 'word')
+ return result
+
+
+def adj(node):
+ result = getattval(node, 'pt') == 'adj'
+ return result
+
+def perspro(node):
+ pt = getattval(node, 'pt')
+ vwtype = getattval(node, 'vwtype')
+ result = pt == 'vnw' and vwtype == 'pers'
+ return result
+
+def nomperspro(node):
+ lemma = getattval(node, 'lemma')
+ result = perspro(node) and lemma in uniquelynominativeperspros
+ return result
+
+def inf(node):
+ result = getattval(node, 'pt') == 'ww' and getattval(node, 'wvorm') == 'inf'
+ return result
+
+
+def rpronoun(node):
+ result = getattval(node, 'pt') == 'vnw' and \
+ getattval(node, 'lemma') in ['er', 'hier', 'daar', 'ergens', 'overal', 'nergens', 'waar']
+ return result
+
+def bw(node):
+ result = getattval(node, 'pt') == 'bw'
+ return result
+
+def ww(node):
+ result = getattval(node, 'pt') == 'ww'
+ return result
+
+
+def lemma(node):
+ result = getattval(node, 'lemma')
+ return result
+
+def predadv(node):
+ result = locadv(node)
+ result = result or (bw(node) and lemma(node) in ['niet', 'mee', 'weg'])
+ return result
+
+def vz(node):
+ result = getattval(node, 'pt') == 'vz'
+ return result
+
+def locadv(node):
+ result = getattval(node, 'pt') in ['bw', 'vz']
+ frame = getattval(node, 'frame')
+ result = result and ('loc' in frame or 'er_adverb' in frame)
+ result = result or rpronoun(node)
+ return result
+
+def biglocvz(node):
+ result = getattval(node, 'lemma') in biglocvzs
+ return result
+
+def istswnoun(node):
+ result = getattval(node, 'lemma') in tswnouns
+ return result
+
+def getleavestr(leaves):
+ leaveseq = ['{}:{}:{}:{}'.format(getattval(leave, 'end'), getattval(leave, 'word'), getattval(leave, 'lemma'),
+ getattval(leave, 'pt')) for leave
+ in leaves]
+ leavestr = space.join(leaveseq)
+ return leavestr
+
+def knownnoun(node):
+ word = getattval(node, 'word')
+ lemma = getattval(node, 'lemma')
+ postag = pt(node)
+ result = postag == 'n' and (known_word(word) or known_word(lemma))
+ result = result or lemma in tswnouns
+ return result
+
+def nominal(node):
+ result = pt(node) == 'n' or aanwvnw(node)
+ return result
+
+def mktoken(node, map):
+ nodebegin = bg(node)
+ nodeword = word(node)
+ if nodebegin in map:
+ nodepos = map[nodebegin]
+ else:
+ SDLOGGER.error('missing begin in map {}'.format(nodebegin))
+ nodepos = int(nodebegin)
+ result = Token(nodeword, nodepos)
+ return result
+
+
+def mktokenlist(tokens, fpos, inserttokens):
+ resultlist = [token for token in tokens if token.pos <= fpos] + \
+ inserttokens + \
+ [token for token in tokens if token.pos > fpos]
+ return resultlist
+
+
+def oldmktokenlist(leaves, themap, fpos, inserttokens):
+ resultlist = [mktoken(lv, themap) for lv in leaves if bg(lv) <= fpos] + \
+ inserttokens + \
+ [mktoken(lv, themap) for lv in leaves if bg(lv) > fpos]
+ return resultlist
+
+
+def mkinsertmeta(inserttokens, resultlist):
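+    # create one Insertion meta per inserted token (to be deleted at backplacement)
+    # plus a single token-mapping meta that records, for each token in resultlist,
+    # its original position (None for the inserted tokens themselves)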
+ insertposs = [token.pos + token.subpos for token in inserttokens]
+ insertwordlist = [token.word for token in inserttokens]
+ tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist]
+ metadata1 = [Meta(insertion, [insertword], annotatedposlist=[insertpos],
+ annotatedwordlist=[], annotationposlist=[insertpos],
+ annotationwordlist=[insertword], cat=smallclause, source=SASTA, penalty=defaultpenalty,
+ backplacement=bpl_delete) for insertword, insertpos in zip(insertwordlist, insertposs)]
+ meta2 = Meta(insertiontokenmapping, tokenmappinglist, cat=tokenmapping, source=SASTA, penalty=0,
+ backplacement=bpl_none)
+ metadata = metadata1 + [meta2]
+ return metadata
+
+
+def smallclauses(tokensmd, tree):
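+    # for utterances of two or three real words, hypothesise the material that a
+    # small clause leaves out (e.g. 'is' between a noun and an adjective, or
+    # 'ik wil' before a bare infinitive) and return the extended token list
+    # together with insertion metadata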
+ resultlist = []
+ leaves = getnodeyield(tree)
+ reducedleaves = [leave for leave in leaves if realword(leave)]
+ if not(len(reducedleaves) > 1 and len(reducedleaves) <= 3):
+ return resultlist
+ tokens = tokensmd.tokens
+ treewords = [word(tokennode) for tokennode in leaves]
+ tokenwords = [token.word for token in tokens if not token.skip]
+ if treewords != tokenwords:
+ SDLOGGER.error('Token mismatch: {} v. {}'.format(treewords, tokenwords))
+ return []
+ themap = {bg(tokennode): token.pos for (tokennode, token) in zip(leaves, tokens)}
+ metadata = tokensmd.metadata
+
+ if len(reducedleaves) <= 3:
+ first = leaves[0]
+ second = leaves[1]
+ if len(reducedleaves) == 3:
+            third = leaves[2]
+
+ if len(reducedleaves) == 2:
+        if (aanwvnw(first) or knownnoun(first) or perspro(first)) and (predadv(second) or vz(second) or bw(second)):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('moet' if getal(first) != 'mv' else 'moeten', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ #elif (aanwvnw(second) or knownnoun(second) or perspro(second) or tw(second)) and predadv(first):
+ elif nomperspro(second) and predadv(first):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('moet' if getal(second) != 'mv' else 'moeten', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(first) or knownnoun(first)) and adj(second):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(second) or knownnoun(second) or tw(second)) and biglocvz(first):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+            resultlist = mktokenlist(tokens, fpos, inserttokens)
+            metadata += mkinsertmeta(inserttokens, resultlist)
+ elif knownnoun(first) and knownnoun(second) and not(lemma(first) == lemma(second)):
+ if hasgenitive(first):
+ genform = makegen(lemma(first))
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('[: ' + genform + ']', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ else:
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(first) or knownnoun(first) or istswnoun(first)) and inf(second):
+ if intransitive(second):
+ firstsubject = True
+ elif transitive(second) and ishuman(first):
+ firstsubject = True
+ elif pseudotr(second) and (ishuman(first) or isanimate(first)):
+ firstsubject = True
+ else:
+ firstsubject = False
+ if firstsubject:
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('wil' if getal(first) != 'mv' else 'willen', fpos, subpos=5)]
+ else:
+ fpos = -1
+ inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif not nominal(first) and not ww(first) and inf(second):
+ fpos = -1
+ inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ if resultlist == []:
+ result = []
+ else:
+ result = [TokenListMD(resultlist, metadata)]
+ return result
+
diff --git a/sva.py b/sva.py
index 7b5defd..0b2f8e3 100644
--- a/sva.py
+++ b/sva.py
@@ -7,7 +7,7 @@
from tokenmd import TokenListMD
from treebankfunctions import (copymodifynode, find1, getattval, getdetof,
getheadof, getlemma, indextransform, inverted,
- lbrother, nominal, rbrother, simpleshow)
+ lbrother, nominal, rbrother, simpleshow, showtree)
debug = False
@@ -356,12 +356,11 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata):
pvbegin = getattval(thepv, 'begin')
inversion = inverted(snode, thepv)
reducedtokens = [t for t in tokens if not t.skip]
- tokenposmap = {i: reducedtokens[i].pos for i in range(len(reducedtokens))}
newpv = getpvform(snode, thepv, inversion)
if newpv is None:
results = []
else:
- newpos = tokenposmap[int(pvbegin)]
+ newpos = int(pvbegin)
newtoken = Token(newpv, newpos)
for token in tokens:
if token.pos != newpos:
@@ -378,6 +377,9 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata):
def getsvacorrections(tokensmd, rawtree, uttid):
+ debug = False
+ if debug:
+ showtree(rawtree, text='rawtree')
if rawtree is None:
return []
else:
@@ -540,7 +542,7 @@ def phicompatible(snode, vnode):
elif '2i' in vnodepersons:
subjbegin = getattval(subjnode, 'begin')
vnodeend = getattval(vnode, 'end')
- result = subjperson == '2' and '2i' in vnodepersons and subjbegin == vnodeend and \
+ result = subjperson == '2' and '2i' in vnodepersons and subjbegin >= vnodeend and \
subjnodelemma in ['jij', 'je']
elif 'u' in vnodepersons:
subjnodelemma = getattval(subjnode, 'lemma')
diff --git a/test_smallclauses.py b/test_smallclauses.py
new file mode 100644
index 0000000..c16ea95
--- /dev/null
+++ b/test_smallclauses.py
@@ -0,0 +1,50 @@
+from config import SDLOGGER
+from treebankfunctions import getstree, getnodeyield, getattval
+from dedup import filledpauseslexicon
+from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon
+from lexicon import known_word
+from namepartlexicon import namepart_isa_namepart
+from sastatoken import Token, show
+from tokenmd import TokenListMD
+from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\
+ insertiontokenmapping
+from smallclauses import smallclauses, word, getleavestr, bg
+
+
+testbank = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\TARSP\smallclausetest.xml"
+schlichtingtreebank = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\schlichtingtreebank\TREEBANK_ID.xml'
+mieke06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE06_ID.xml"
+mieke08 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE08_ID.xml"
+aurisraw = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AURIS_ELISKA_ORIGINAL_ID.xml"
+tarsp02 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_02.xml"
+tarsp06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_06.xml"
+#schlichtingall = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\treebank_schlichting_all_examples\TREEBANK_SCHLICHTING_CHAT_ID.xml"
+
+
+def main():
+ smalltest = True
+ if smalltest:
+ fullnames = [testbank]
+ else:
+ fullnames = [ schlichtingtreebank, mieke06, mieke08, aurisraw, tarsp02, tarsp06]
+ for infullname in fullnames:
+ print(infullname)
+ fulltreebank = getstree(infullname)
+ if fulltreebank is not None:
+ treebank = fulltreebank.getroot()
+ for tree in treebank:
+ leaves = getnodeyield(tree)
+ tokens = [Token(word(leave), bg(leave)) for leave in leaves]
+ tokensmd = TokenListMD(tokens, [])
+ resultlist = smallclauses(tokensmd, tree)
+ if resultlist != []:
+ print('input: ', getleavestr(leaves) )
+ print('output: ', show(resultlist[0].tokens))
+ print('result: ', resultlist[0].metadata)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/top3000.py b/top3000.py
new file mode 100644
index 0000000..7bf181d
--- /dev/null
+++ b/top3000.py
@@ -0,0 +1,66 @@
+from xlsx import getxlsxdata
+from treebankfunctions import getattval
+from namepartlexicon import namepart_isa_namepart
+from config import SD_DIR
+import os
+
+def ishuman(node):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+ vwtype = getattval(node, 'vwtype')
+    result = (lemma, pt) in semlexicon and 'human' in semlexicon[(lemma, pt)]
+ result = result or vwtype == 'pers'
+ result = result or namepart_isa_namepart(lemma)
+ return result
+
+def isanimate(node):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+    result = (lemma, pt) in semlexicon and 'animate' in semlexicon[(lemma, pt)]
+ return result
+
+
+def transitivity(node, tr):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+    result = (lemma, pt) in trlexicon and tr in trlexicon[(lemma, pt)]
+ return result
+
+def transitive(node):
+ return transitivity(node, 'tr')
+
+def pseudotr(node):
+ return transitivity(node, 'tr/intr')
+
+
+def intransitive(node):
+ return transitivity(node, 'intr')
+
+semicolon = ';'
+
+filename = os.path.join(SD_DIR, r'top3000\Woordenlijsten Current.xlsx')
+
+
+lexiconheader, lexicondata = getxlsxdata(filename)
+
+semlexicon = {}
+trlexicon = {}
+genlexicon = {}
+
+for row in lexicondata:
+ lemma = row[1].strip()
+ pt = row[5]
+ rawsems = row[6].split(semicolon)
+ sems = [el.strip() for el in rawsems]
+ semlexicon[(lemma, pt)] = sems
+
+ rawtrs = row[8].split(semicolon)
+ trs = [el.strip() for el in rawtrs]
+ trlexicon[(lemma, pt)] = trs
+
+ rawgens = row[9].split(semicolon)
+ gens = [el.strip() for el in rawgens]
+ genlexicon[(lemma, pt)] = gens
+
+#next statement for debugging purposes
+junk = 0
\ No newline at end of file
diff --git a/top3000/Woordenlijsten Current.xlsx b/top3000/Woordenlijsten Current.xlsx
new file mode 100644
index 0000000..7b7ac56
Binary files /dev/null and b/top3000/Woordenlijsten Current.xlsx differ
diff --git a/treebankfunctions.py b/treebankfunctions.py
index ca5e4b1..c0b120b 100644
--- a/treebankfunctions.py
+++ b/treebankfunctions.py
@@ -12,6 +12,7 @@
from stringfunctions import allconsonants
# from lexicon import informlexiconpos, isa_namepart_uc, informlexicon, isa_namepart
import lexicon as lex
+from config import PARSE_FUNC
class Metadata:
@@ -184,6 +185,23 @@ def ismainclausenode(node):
return result
+def getnodeendmap(stree):
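+    # map each leaf's 'end' value (a string) to its 1-based position in the yield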
+ leaves = getnodeyield(stree)
+ result = {getattval(leave, 'end'): i + 1 for i, leave in enumerate(leaves)}
+ return result
+
+
+def getxselseuttid(syntree):
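+    # prefer the xsid metadata, then uttid, then the sentence id, else '0'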
+ result = getmeta(syntree, 'xsid')
+ if result is None:
+ result = getmeta(syntree, 'uttid')
+ if result is None:
+ result = getsentid(syntree)
+ if result is None:
+ result = '0'
+ return result
+
+
def getuttid(syntree):
result = getmeta(syntree, 'uttid')
if result is None:
@@ -199,6 +217,7 @@ def getuttno(syntree):
result = '0'
return result
+
def getuttidorno(syntree):
result = getmeta(syntree, 'xsid')
if result is None:
@@ -441,8 +460,9 @@ def inverted(thesubj, thepv):
subjbegin = getattval(thesubj, 'begin')
subjlemma = getattval(thesubj, 'lemma')
pvend = getattval(thepv, 'end')
+    # maybe define an immediately-follows relation for inflated trees
inversion = '2' == subjperson[0] and tense == 'tgw' and subjnumber in ['ev', 'getal'] and \
- pvend == subjbegin and subjlemma in ['jij', 'je'] # getal added for je
+ pvend <= subjbegin and subjlemma in ['jij', 'je'] # getal added for je
return inversion
@@ -1131,11 +1151,11 @@ def test():
def getsentid(stree):
sentidlist = stree.xpath(sentidxpath)
if sentidlist == []:
- SDLOGGER.error('Missing uttid')
- uttid = 'None'
+ SDLOGGER.error('Missing sentid')
+ result = 'None'
else:
- uttid = str(sentidlist[0])
- return uttid
+ result = str(sentidlist[0])
+ return result
def testindextransform():
@@ -1381,6 +1401,15 @@ def deletewordnode(tree, begin):
return newtree
+def showtree(tree, text=None):
+ if text is not None:
+ print(text)
+ if tree is not None:
+ etree.dump(tree, pretty_print=True)
+ else:
+ print('None')
+
+
def deletechildlessparent(thenode):
if list(thenode) == []:
theparent = thenode.getparent()
@@ -1388,8 +1417,12 @@ def deletechildlessparent(thenode):
deletechildlessparent(theparent)
-def deletewordnodes(tree, begins):
+def olddeletewordnodes(tree, begins):
+ # print('tree:')
+ # etree.dump(tree, pretty_print=True)
newtree = deepcopy(tree)
+ # print('newtree:')
+ # etree.dump(newtree, pretty_print=True)
if newtree is None:
return newtree
else:
@@ -1403,9 +1436,14 @@ def deletewordnodes(tree, begins):
theparent.remove(thenode)
# if the parent has no sons left, it should be deleted as well
deletechildlessparent(theparent)
+ children = [n for n in theparent]
+ (minbegin, maxend) = getbeginend(children)
+ theparent.attrib['begin'] = minbegin
+ theparent.attrib['end'] = maxend
+
#
# renumber begins and ends ;
- normalisebeginend(newtree)
+    # normalisebeginend(newtree)  # temporarily switched off
# adapt the cleantokenisation
# done outside this function
@@ -1415,6 +1453,184 @@ def deletewordnodes(tree, begins):
return newtree
+def childless(node):
+ children = [ch for ch in node]
+ result = children == []
+ return result
+
+def deletewordnodes(tree, begins):
+ newtree = deepcopy(tree)
+ newtree = deletewordnodes2(newtree, begins)
+ newtree = adaptsentence(newtree)
+ return newtree
+
+def deletewordnodes2(tree, begins):
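+    # recursively remove leaf nodes whose begin occurs in begins, prune category
+    # nodes that have become childless, and recompute begin/end from the children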
+ if tree is None:
+ return tree
+ for child in tree:
+ if child.tag == 'node':
+ newchild = deletewordnodes2(child, begins)
+ else:
+ newchild = child
+ for child in tree:
+ if child.tag == 'node':
+ childbegin = getattval(child, 'begin')
+ childbeginint = int(childbegin)
+ if childbeginint in begins and childless(child):
+ tree.remove(child)
+ if 'cat' in child.attrib and childless(child): # if its children have been deleted earlier
+ tree.remove(child)
+ # tree begin en end bijwerken
+    if tree.tag == 'node':
+ newchildren = [n for n in tree]
+ if newchildren != []:
+ (minbegin, maxend) = getbeginend(newchildren)
+ tree.attrib['begin'] = minbegin
+ tree.attrib['end'] = maxend
+ return tree
+
+
+def olddeletewordnodes2(tree, begins):
+ if tree is None:
+ return tree
+ else:
+ for child in tree:
+ newchild = deletewordnodes2(child, begins)
+ if tree.tag == 'node':
+ nodebegin = getattval(tree, 'begin')
+ children = [child for child in tree]
+ if int(nodebegin) in begins: # only words and indexnodes can be deleted
+ theparent = tree.getparent()
+ if theparent is not None:
+ if children == []:
+ theparent.remove(tree)
+ # if the parent has no sons left, it should be deleted as well
+ deletechildlessparent(theparent)
+ if theparent.tag == 'node':
+ newchildren = [n for n in theparent]
+ (minbegin, maxend) = getbeginend(newchildren)
+ theparent.attrib['begin'] = minbegin
+ theparent.attrib['end'] = maxend
+ return tree
+
+
+def treeinflate(stree, start=10, inc=10):
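+    # in-place: a word at position n gets begin (n+1)*10 and end (n+1)*10 + 1;
+    # phrasal nodes take their begin/end from their children. The gaps between
+    # the inflated positions leave room for tokens inserted by corrections.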
+ # fatstree = deepcopy(stree)
+ if stree is None:
+ pass
+ else:
+ for child in stree:
+ treeinflate(child, start, inc)
+ children = [ch for ch in stree]
+ if stree.tag == 'node':
+ ib = int(getattval(stree, 'begin'))
+ ie = int(getattval(stree, 'end'))
+ newib = (ib + 1) * 10
+ stree.attrib['begin'] = str(newib)
+ if iswordnode(stree):
+ stree.attrib['end'] = str(newib + 1)
+ elif 'cat' in stree.attrib:
+ (b, e) = getbeginend(children)
+ stree.attrib['begin'] = b
+ stree.attrib['end'] = e
+ else:
+ stree.attrib['begin'] = str((ib + 1) * 10)
+ stree.attrib['end'] = str((ie * 10) + 1)
+
+
+def isidentitymap(dct):
+ result = all([key == value for key, value in dct.items()])
+ return result
+
+
+def updatetokenpos(stree, tokenposdict):
+ if stree is None:
+ return stree
+ if isidentitymap(tokenposdict):
+ return stree
+ resulttree = deepcopy(stree)
+ resulttree = updatetokenpos2(resulttree, tokenposdict)
+ finaltree = updateindexnodes(resulttree)
+
+ return finaltree
+
+def updatetokenpos2(node, tokenposdict):
+ if node is None:
+ return node
+ for child in node:
+ newchild = updatetokenpos2(child, tokenposdict)
+ if node.tag == 'node':
+ if ('pt' in node.attrib or 'pos' in node.attrib) and \
+ 'end' in node.attrib and 'begin' in node.attrib:
+ intend = int(node.attrib['end'])
+ if intend in tokenposdict:
+ newendint = tokenposdict[intend]
+ node.attrib['end'] = str(newendint)
+ node.attrib['begin'] = str(newendint - 1)
+ else:
+ SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend))
+ fulltrees = node.xpath('ancestor::node[@cat="top"]')
+ if fulltrees != []:
+ fulltree = fulltrees[0]
+ else:
+ fulltree = node
+ sent = getyield(fulltree)
+ SDLOGGER.error('utterance={}'.format(sent))
+ # etree.dump(resulttree)
+ SDLOGGER.error('tokenposdict={}'.format(tokenposdict))
+ elif 'cat' in node.attrib:
+ children = [ch for ch in node]
+ (b, e) = getbeginend(children)
+ node.attrib['begin'] = b
+ node.attrib['end'] = e
+ return node
+
+
+
+def updateindexnodes(stree):
+ #presupposes that the non bareindex nodes have been adapted already
+ indexednodesmap = getindexednodesmap(stree)
+ newstree = deepcopy(stree)
+ for node in newstree.iter():
+ if node.tag == 'node':
+ if bareindexnode(node):
+ idx = getattval(node, 'index')
+ newbegin = getattval(indexednodesmap[idx], 'begin')
+ newend = getattval(indexednodesmap[idx], 'end')
+ node.attrib['begin'] = newbegin
+ node.attrib['end'] = newend
+ return newstree
+
+def treewithtokenpos(thetree, tokenlist):
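+    # re-align the tree's leaf begin/end values with the (inflated) positions of
+    # tokenlist: leaf i is mapped to token i; a length mismatch is only logged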
+ resulttree = deepcopy(thetree)
+ thetreeleaves = getnodeyield(thetree)
+ intbegins = [int(getattval(n, 'begin')) for n in thetreeleaves]
+ tokenlistbegins = [t.pos + t.subpos for t in tokenlist]
+ if len(intbegins) != len(tokenlistbegins):
+ SDLOGGER.error('token mismatch')
+ SDLOGGER.error('tree yield={}'.format(getyield(thetree)))
+ SDLOGGER.error('tokenlist={}'.format(tokenlist))
+ SDLOGGER.error('intbegins={}'.format(intbegins))
+ SDLOGGER.error('tokenlistbegins ={}'.format(tokenlistbegins))
+ pospairs = zip(intbegins, tokenlistbegins)
+ thetreetokenposdict = {treepos + 1: tokenpos + 1 for treepos, tokenpos in pospairs}
+ resulttree = updatetokenpos(resulttree, thetreetokenposdict)
+ return resulttree
+
+
+def fatparse(utterance, tokenlist):
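+    # parse the utterance with PARSE_FUNC, inflate the positions of the resulting
+    # tree, and align its leaves with the non-skipped tokens of tokenlist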
+ stree = PARSE_FUNC(utterance)
+ fatstree = deepcopy(stree)
+ treeinflate(fatstree, start=10, inc=10)
+ debug = False
+ if debug:
+ showtree(fatstree, text='fatparse: fatstree')
+ reducedtokenlist = [token for token in tokenlist if not token.skip]
+ fatstree = treewithtokenpos(fatstree, reducedtokenlist)
+ if debug:
+ showtree(fatstree, text='fatparse: fatstree')
+ return fatstree
+
def update_cleantokenisation(stree, begin):
'''
@@ -1473,8 +1689,10 @@ def normalisebeginend(stree):
:param stree: syntactic structure
:return: stree with the values of begin and end attributes normalised
'''
- begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')]
- sortedbegins = sorted(begins, key=lambda x: int(x))
+ # etree.dump(stree, pretty_print=True)
+ # begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')] # we must include indexed nodes but not have duplicates
+ begins = {getattval(node, 'begin') for node in stree.xpath('.//node[count(node)=0]')}
+ sortedbegins = sorted(list(begins), key=lambda x: int(x))
normalisebeginend2(stree, sortedbegins)