diff --git a/.gitignore b/.gitignore
index 97c0b03..cceb264 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,4 +25,8 @@ env.bak/
venv.bak/
# configuration
-config.py
\ No newline at end of file
+config.py
+
+# additional files
+.idea/
+sastalog.txt
\ No newline at end of file
diff --git a/CHAT_Annotation.py b/CHAT_Annotation.py
index 1cfd985..edc006f 100644
--- a/CHAT_Annotation.py
+++ b/CHAT_Annotation.py
@@ -22,6 +22,9 @@
emptyreplacement = eps
anybutrb = r'[^\]]*'
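+# annotation-name constants, so that other modules (e.g. correcttreebank,
+# which imports omittedword) can refer to these annotations without
+# repeating the string literals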
+errormarking = 'Error Marking'
+omittedword = 'Omitted Word'
+specialform = 'Special Form'
def fullre(pat):
result = r'^' + pat + r'$'
@@ -41,6 +44,7 @@ def refunction(x):
result = fullre(x)
return result
+
# u2013 = en-dash, u2014 = em-dash, u2015 = horizontal bar
@@ -135,7 +139,8 @@ def apply(self, tokens, annotation, repkeep):
annotatedposlist = [token.pos]
annotatedwordlist = [token.word]
annotationposlist = [p for p in range(m.start(), m.end())]
- newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist)
+ newmeta = annotation.metadatafunction(annotation, annotationwordlist, annotatedposlist,
+ annotatedwordlist, annotationposlist)
metadata.append(newmeta)
newword = self.compiledre.sub(self.replacement, token.word)
newtoken = Token(newword, token.pos)
@@ -226,7 +231,10 @@ def apply(self, tokens, annotation, repkeep):
else:
(b, e) = scope
if ltodotokens == e + 1:
- SDLOGGER.error('Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(b, e, show(todotokens)))
+                SDLOGGER.error(
+                    'Scope markings in positions {} and {} not followed by annotation ignored in {}'.format(
+                        b, e, show(todotokens)))
newtokens += todotokens[:b] + todotokens[b + 1:e]
tokenctr = e + 1
elif self.compiledre.search(todotokens[e + 1].word):
@@ -234,7 +242,9 @@ def apply(self, tokens, annotation, repkeep):
annotationpositions = [token.pos for token in todotokens[b + 1:e]]
if self.arity == dyadic:
if ltodotokens <= e + 2:
- SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name, show(todotokens)))
+ SDLOGGER.error(
+ 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
+ show(todotokens)))
newtokens += todotokens[b + 1:e]
break
else:
@@ -247,7 +257,8 @@ def apply(self, tokens, annotation, repkeep):
SDLOGGER.error('Illegal arity specification ({}) on {}'.format(self.arity, annotation.name))
annotatedwords = []
annotatedpositions = []
- newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions, annotatedwords, annotationpositions)
+ newmeta = annotation.metadatafunction(annotation, annotationwords, annotatedpositions,
+ annotatedwords, annotationpositions)
metadata.append(newmeta)
newtokens += todotokens[tokenctr:b]
replacement = getreplacement(repkeep, annotation)
@@ -270,7 +281,8 @@ def apply(self, tokens, annotation, repkeep):
while i < ltodotokens:
if self.compiledre.search(todotokens[i].word):
if scopewords == []:
- SDLOGGER.error('First argument of annotation {} missing. Annotation ignored'.format(annotation.name))
+ SDLOGGER.error(
+ 'First argument of annotation {} missing. Annotation ignored'.format(annotation.name))
else:
if self.arity == monadic:
annotatedpositions = []
@@ -283,15 +295,17 @@ def apply(self, tokens, annotation, repkeep):
metadata.append(newmeta)
elif self.arity == dyadic:
if i + 1 >= ltodotokens:
- SDLOGGER.error('Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
- show(todotokens)))
+ SDLOGGER.error(
+ 'Missing second argument for dyadic annotation {} in {}'.format(annotation.name,
+ show(todotokens)))
else:
annotatedpositions = [todotokens[i + 1].pos]
annotatedwords = [todotokens[i + 1].word]
replacement = getreplacement(repkeep, annotation)
newtokens = doreplacement([prevtoken], replacement, newtokens)
prevtoken = None
- newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions, annotatedwords, scopepositions)
+ newmeta = annotation.metadatafunction(annotation, scopewords, annotatedpositions,
+ annotatedwords, scopepositions)
metadata.append(newmeta)
else:
if prevtoken is not None:
@@ -308,11 +322,11 @@ def apply(self, tokens, annotation, repkeep):
class CHAT_ComplexRegex(CHAT_Regex):
def __init__(self, regextuple, replacementtuple, scoped, containswords=False):
- self.regexbegin = regextuple[0] # 3 elements: begin mid end
- self.regexmid = regextuple[1] # 3 elements: begin mid end
- self.regexend = regextuple[2] # 3 elements: begin mid end
- self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ]
- self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ]
+ self.regexbegin = regextuple[0] # 3 elements: begin mid end
+ self.regexmid = regextuple[1] # 3 elements: begin mid end
+ self.regexend = regextuple[2] # 3 elements: begin mid end
+ self.scopereplacement = replacementtuple[0] # 2 elements: one for the scope and one for the text between [ ]
+ self.bracketreplacement = replacementtuple[1] # 2 elements: one for the scope and one for the text between [ ]
self.scoped = scoped
self.containswords = containswords
self.compiledrebegin = re.compile(refunction(self.regexbegin))
@@ -360,7 +374,8 @@ def apply(self, tokens, annotation, repkeep):
elif state == scopestate:
scope = findscope(tokens[tokenctr - 1:], offset=tokenctr - 1)
if scope is None:
- SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos, show(tokens)))
+ SDLOGGER.error('No closing bracket found for < with pos={} in {}'.format(tokens[tokenctr - 1].pos,
+ show(tokens)))
state = wstate
else:
(b, e) = scope
@@ -372,13 +387,16 @@ def apply(self, tokens, annotation, repkeep):
if bbbe is not None:
(bracketbegin, bracketend) = bbbe
annotationtokens = todotokens[bracketbegin + 1: bracketend]
- (cleanannotationtokens, innermetadata) = cleanCHILDEStokens.cleantokens(annotationtokens, repkeep) if self.containswords else (annotationtokens, [])
+                (cleanannotationtokens, innermetadata) = \
+                    cleanCHILDEStokens.cleantokens(annotationtokens, repkeep) \
+                    if self.containswords else (annotationtokens, [])
metadata += innermetadata
annotatedwords = [t.word for t in tobereplacedtokens if t.word not in ['<', '>']]
annotatedpositions = [t.pos for t in tobereplacedtokens if t.word not in ['<', '>']]
thevalue = [token.word for token in cleanannotationtokens]
annotationpositions = [token.pos for token in cleanannotationtokens]
- newmeta = annotation.metadatafunction(annotation, thevalue, annotatedpositions, annotatedwords, annotationpositions)
+ newmeta = annotation.metadatafunction(annotation, thevalue, annotatedpositions, annotatedwords,
+ annotationpositions)
metadata.append(newmeta)
replacement = self.scopereplacement
repltokens = [t for t in tobereplacedtokens if t.word not in ['<', '>']]
@@ -395,10 +413,10 @@ def apply(self, tokens, annotation, repkeep):
tokenctr += inc
newtokens += tobereplacedtokens
if state in estates:
- return(newtokens, metadata)
+ return (newtokens, metadata)
else:
SDLOGGER.error('Not in an end state, state={} in {}'.format(state, show(tokens)))
- return(tokens, [])
+ return (tokens, [])
def findbrackets(tokens, regexes, offset=0):
@@ -430,14 +448,29 @@ def dropbrackets(w):
return result
-def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT)
-def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos], annotatedwordlist=[w], source=CHAT, backplacement=bpl_delete)
+def simplemetafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
+ annotatedwordlist=[w], source=CHAT)
+
+
+def simple_bpldel_metafunction(f): return lambda ann, pos, w: Meta(ann.name, [f(w)], annotatedposlist=[pos],
+ annotatedwordlist=[w], source=CHAT,
+ backplacement=bpl_delete)
def simplescopedmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
- Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist, annotatedwordlist=annotatedwordlist, source=CHAT)
+ Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedposlist=annotatedposlist,
+ annotatedwordlist=annotatedwordlist, source=CHAT)
+
+
def complexmetafunction(ann, annotationwordlist, annotatedposlist, annotatedwordlist, annotationposlist): return \
- Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist, annotatedposlist=annotatedposlist, source=CHAT)
+ Meta(ann.name, annotationwordlist, annotationposlist=annotationposlist, annotatedwordlist=annotatedwordlist,
+ annotatedposlist=annotatedposlist, source=CHAT)
+
+
+def charmetafunction(ann, annotationcharlist, annotatedcharlist, annotationcharposlist, annotatedcharposlist):
+ return Meta(ann.name, annotationcharlist, annotationcharlist=annotationcharlist,
+ annotatedcharlist=annotatedcharlist,
+ annotationcharposlist=annotationcharposlist, annotatedcharposlist=annotatedcharposlist)
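+# unlike the word-level metafunctions above, charmetafunction records
+# character-level lists and positions, used for in-word codes such as
+# Noncompletion of a Word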
def epsf(w): return ''
@@ -492,6 +525,7 @@ def dropchars2(w, c):
def CHAT_message(msg):
def result(x, y): return SDLOGGER.warning(msg.format(x, y))
+
return result
@@ -502,12 +536,15 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
# here additional things could be done
CHAT_Annotation('Overlap Precedes', '8.4:71-72', '10.3:75',
CHAT_SimpleScopedRegex(r'\[\<[0-9]?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Special Form', '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False), simplemetafunction(getsfvalue)),
- CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False), simplemetafunction(epsf)),
- CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False), simplemetafunction(epsf)),
+ CHAT_Annotation(specialform, '6.3:37', '8.3:43-44', CHAT_SimpleRegex(specialformpat, getsfword, False),
+ simplemetafunction(getsfvalue)),
+ CHAT_Annotation('Unintelligible Speech', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'xxx', keep, False),
+ simplemetafunction(epsf)),
+ CHAT_Annotation('Phonological Coding', '6.4:41', '8.4:47', CHAT_SimpleRegex(r'yyy', keep, False),
+ simplemetafunction(epsf)),
CHAT_Annotation('Noncompletion of a Word', '6.5:43', '8.5:48',
- CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), complexmetafunction),
- CHAT_Annotation('Omitted Word', '6.5:43', '8.5:48-49',
+ CHAT_InWordRegex(r'\(([-\w\']*)\)', r'\1'), charmetafunction),
+ CHAT_Annotation(omittedword, '6.5:43', '8.5:48-49',
CHAT_SimpleRegex(r'0[\w:]+', dropzero, False), simple_bpldel_metafunction(dropzero)),
CHAT_Annotation('Satellite at End', '7.4:58', '9.2:59-60',
CHAT_SimpleRegex(r'\s„\s', eps, False), simplemetafunction(identity)),
@@ -524,8 +561,9 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(dropinitial)), # this one must crucially precede Pause Between Syllables
CHAT_Annotation('Pause Between Syllables', '7.7:60', '9.9:63-64', CHAT_InWordRegex(r'\^', ''), complexmetafunction),
CHAT_Annotation('Simple Event', '7.8.1:60', '9.10.1:64-65', CHAT_SimpleRegex(r'&=[\w:]+', eps, False),
- simplemetafunction(identity)),
- CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65', CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False),
+ simplemetafunction(identity)),
+ CHAT_Annotation('Complex Local Event', '7.8.2:61', '9.10.3:65',
+ CHAT_ComplexRegex((r'\[\^\s', wordorpuncpat, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Pause', '7.8.3:62', '9.10.4:66', CHAT_SimpleRegex(r'\(\.\.?\.?\)', eps, False),
simplemetafunction(identity)),
@@ -577,54 +615,74 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(identity)),
    # Error Marking crucially before [/] [//] [///] etc
- CHAT_Annotation('Error Marking', '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic),
+ CHAT_Annotation(errormarking, '8.5:75', '10.5:78', CHAT_SimpleScopedRegex(r'\[\*\]', keep, True, monadic),
simplescopedmetafunction),
- CHAT_Annotation('Error Marking', '8.5:75', '10.5:78',
+ CHAT_Annotation(errormarking, '8.5:75', '10.5:78',
CHAT_ComplexRegex((r'\[\*', r'[\w:\-\+=]+', r'\]'), (keep, eps), False),
complexmetafunction),
- CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Pic Bullet', '8.1:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7' + r'%pic:', filenamepat, u'\u00b7'), (keep, eps), True),
                    complexmetafunction),  # pic bullet and text bullet must come before time alignment
- CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Text Bullet', '8.1:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7' + r'%txt:', filenamepat, u'\u00b7'), (keep, eps), True),
complexmetafunction),
- CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True),
+ CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u00b7', r'[0-9_]+', u'\u00b7'), (keep, eps), True),
complexmetafunction),
- CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71', CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True),
+ CHAT_Annotation('Time Alignment', '7.10:67', '10.1:71',
+ CHAT_ComplexRegex((u'\u0015', r'[0-9_]+', u'\u0015'), (keep, eps), True),
complexmetafunction), # not an official code but it occurs as such in CLPF
- CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72', CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True),
+ CHAT_Annotation('Paralinguistic Material', '8.2:68', '10.1:72',
+ CHAT_ComplexRegex((r'\[=!', anybutrb, r'\]'), (keep, eps), True),
complexmetafunction),
CHAT_Annotation('Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!\]', keep, False, monadic),
simplescopedmetafunction),
- CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72', CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic),
+ CHAT_Annotation('Contrastive Stressing', '8.2:68', '10.1:72',
+ CHAT_SimpleScopedRegex(r'\[!!\]', keep, False, monadic),
simplescopedmetafunction),
# Duration to be added here @@
- CHAT_Annotation('Explanation', '8.3:69', '10.3:73', CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Explanation', '8.3:69', '10.3:73',
+ CHAT_ComplexRegex((r'\[=', anybutrb, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Replacement', '8.3:69', '10.3:73',
- CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True), complexmetafunction),
+ CHAT_ComplexRegex((r'\[:\s', r'([^\]]+)', r'\]'), (eps, keep), True, containswords=True),
+ complexmetafunction),
CHAT_Annotation('Replacement of Real Word', '8.3:70', '10.3:73',
CHAT_ComplexRegex((r'\[::', r'([^\]]+)', r'\]'), (eps, keep), True), complexmetafunction),
CHAT_Annotation('Alternative Transcription', '8.3:70', '10.3:74',
CHAT_ComplexRegex((r'\[=\?', r'([^\]]+)', r'\]'), (keep, eps), True), complexmetafunction),
CHAT_Annotation('Dependent Tier on Main Line', '8.3:70', 'none',
- CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction), # @@must do something with the speaker
+ CHAT_ComplexRegex((r'\[%\w\w\w:', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
+ # @@must do something with the speaker
CHAT_Annotation('Comment on Main Line', '8.3:70', '10.3:74',
CHAT_ComplexRegex((r'\[%\s+', anybutrb, r'\]'), (keep, eps), True), complexmetafunction),
- CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic), simplescopedmetafunction),
+ CHAT_Annotation('Best Guess', '8.3:70-71', '10.3:74', CHAT_SimpleScopedRegex(r'\[\?\]', keep, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Repetition', '8.4:72', '10.4:75-76', CHAT_SimpleScopedRegex(r'\[/\]', eps, True, monadic),
+ simplescopedmetafunction),
CHAT_Annotation('Multiple Repetition', '8.4:72-73', '10.4:76',
CHAT_ComplexRegex((r'\[x', r'[0-9]+', r'\]'), (keep, eps), True), complexmetafunction),
- CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
- CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77', CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic), simplescopedmetafunction),
- CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False), simplemetafunction(identity)), # needs extension
- CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False), # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm
+ CHAT_Annotation('Retracing', '8.4:73', '10.4:76-77', CHAT_SimpleScopedRegex(r'\[//\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Reformulation', '8.4:73-74', '10.4:77', CHAT_SimpleScopedRegex(r'\[///\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('False Start Without Retracing', '8.4:74', '10.4:77',
+ CHAT_SimpleScopedRegex(r'\[/\-\]', eps, True, dyadic), simplescopedmetafunction),
+ CHAT_Annotation('Unclear Retracing Type', '8.4:74', '10.4:77',
+ CHAT_SimpleScopedRegex(r'\[/\?\]', keep, True, monadic), simplescopedmetafunction),
+ CHAT_Annotation('Excluded Material', '', '10.4:77-78', CHAT_SimpleScopedRegex(r'\[e\]', eps, True, monadic),
+ simplescopedmetafunction),
+ CHAT_Annotation('Clause Delimiter', '8.4:74', '78', CHAT_SimpleRegex(r'\[\^c\]', eps, False),
+ simplemetafunction(identity)), # needs extension
+ CHAT_Annotation('Interposed Word', '8.4:74', '9.10.2:65', CHAT_SimpleRegex(r'&\*\w\w\w:[\w:]+', eps, False),
+ # grouped metadata would come in handy here ID100 text speaker = XXX, ID100 text interposedword = hmm
simplemetafunction(interposedword)),
- CHAT_Annotation('Postcode', '8.6:75', '10.5:78', CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Postcode', '8.6:75', '10.5:78',
+ CHAT_ComplexRegex((r'\[\+\s+', wordpat, r'\]'), (keep, eps), False),
complexmetafunction),
- CHAT_Annotation('Language Precode', '8.6:75', '10.5:79', CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False),
+ CHAT_Annotation('Language Precode', '8.6:75', '10.5:79',
+ CHAT_ComplexRegex((r'\[\-\s+', wordpat, r'\]'), (keep, eps), False),
complexmetafunction),
CHAT_Annotation('Excluded Utterance', '8.6:75-76', '10.5:79', CHAT_SimpleRegex(r'\[\+\s+bch\]', eps, False),
simplemetafunction(interposedword)),
@@ -632,9 +690,12 @@ def result(x, y): return SDLOGGER.warning(msg.format(x, y))
simplemetafunction(interposedword)),
CHAT_Annotation('Zero Utterance', '', '10.5:79, 11.1:81', CHAT_SimpleRegex(r'\b0\b', eps, False),
simplemetafunction(identity)),
- CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''), complexmetafunction),
- CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction), # take care extra token!@@
- CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction), # take care extra token@@
+ CHAT_Annotation('Segment Repetition', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u21AB.*?\u21AB', ''),
+ complexmetafunction),
+ CHAT_Annotation('Joined Words', '6.6.4:46', '8.6.3:51', CHAT_InWordRegex(r'_', space), complexmetafunction),
+ # take care extra token!@@
+ CHAT_Annotation('Clitic Boundary', '6.6.15:52', 'not found', CHAT_InWordRegex(r'~', space), complexmetafunction),
+ # take care extra token@@
CHAT_Annotation('Blocked Segments', '10:85,11:89', '13:91', CHAT_InWordRegex(u'\u2260.*?\u2260', ''),
complexmetafunction),
# these must be applied after [/], [//], [///] etc
diff --git a/TARSPpostfunctions.py b/TARSPpostfunctions.py
index 4d9d424..4a1456e 100644
--- a/TARSPpostfunctions.py
+++ b/TARSPpostfunctions.py
@@ -6,6 +6,7 @@
from query import core_process
from treebankfunctions import getmeta
+from config import SDLOGGER
OndVC = 'T071'
OndWVC = 'T076'
@@ -74,8 +75,11 @@ def getstage(uttcounts, allresults):
cands = []
gtotaal = allresults.postresults['T152']
for el in uttcounts:
- if uttcounts[el] / gtotaal >= gofase_minthreshold:
- cands.append(el)
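+        # guard: gtotaal (postresult T152) can be 0 for an empty sample,
+        # which would make the division below crash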
+ if gtotaal != 0:
+ if uttcounts[el] / gtotaal >= gofase_minthreshold:
+ cands.append(el)
+ else:
+ SDLOGGER.error('gtotaal has value 0')
if cands == []:
result = 1
else:
diff --git a/adjtest.py b/adjtest.py
new file mode 100644
index 0000000..84d738b
--- /dev/null
+++ b/adjtest.py
@@ -0,0 +1,121 @@
+from lxml import etree
+from treebankfunctions import showtree
+from asta_queries import asta_bijzin
+
+streestrings = {}
+
+streestrings[0] = """
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ uh dus sinds ik hier ben heb ik logo omdat ik
+ Q#ng1647271273|dus sinds ik hier ben heb ik logo omdat ik|1|3|-0.6490448165400009
+
+
+"""
+
+
+strees = {}
+for x in streestrings:
+ strees[x] = etree.fromstring(streestrings[x])
+
+thequery = """
+.//node[
+ ( (@word="geboren") or
+
+ (@pt="adj" and
+ (@rel="mod" and
+ parent::node[@cat="np"] and
+ ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")] and
+ (not(@begin < ../node[@rel="det" and (@pt="lid" or @pt="vnw")]/@begin) or @lemma='heel' or @lemma='geheel')
+ )
+ )
+ or
+
+ (@pt="adj" and
+ (@rel="hd" and
+ parent::node[@cat="ap" and parent::node[@cat="np"] and
+ ../node[@rel="hd" and (@pt="n" or @pt="vnw" or @cat="mwu")]]
+ )
+ )
+ or
+
+ (@pt="tw" and @numtype="rang")
+ or
+
+ (@pt="adj" and @rel="hd" and parent::node[@cat="np"])
+ or
+
+ (
+ (@pt="tw" and @numtype="rang")
+ and @positie = "nom" )
+ or
+
+ (@pt="ww" and @wvorm="vd" and @rel="mod" and parent::node[@cat="np"])
+ or
+
+ (@pt="ww" and @wvorm="od" and @rel="mod" and parent::node[@cat="np"])
+ or
+
+ (@pt="adj" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )])
+)
+ or
+
+ (@pt="adj" and @rel="hd" and parent::node[@cat="ap" and ( (@rel="predc" or @rel="predm" ) and ../node[ (@pt="ww" and @rel="hd" and @lemma!="uit_zien" and @lemma!="heten" and @lemma!="gaan" and @lemma!="zitten" and (contains(@frame, "copula") or not(@stype="topic_drop")) and parent::node[node[@rel="predc"] and not(node[@rel="obj1"]) ] )])
+])
+ or
+ (@rel="det" and @pt="vnw" and @vwtype="onbep")
+
+ )
+]
+"""
+
+#matches = strees[0].xpath(thequery)
+matches = asta_bijzin(strees[0])
+for m in matches:
+ showtree(m)
\ No newline at end of file
diff --git a/alpino.py b/alpino.py
index 4ddfe11..24dda43 100644
--- a/alpino.py
+++ b/alpino.py
@@ -30,9 +30,12 @@ def getdehetwordinfo(wrd):
# we only want to consider nouns or words of unknown word class (such as kopje in CELEX)
wordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[0] in ['n', 'None']]
- # if any of the alternatives is a de-word, we empty the whole list
- if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]):
- wordinfos = []
+ # if any of the alternatives is a de-word, we keep only these
+ dewordinfos = [wordinfo for wordinfo in wordinfos if wordinfo[1] == lexicon.de]
+ if dewordinfos != []:
+ wordinfos = dewordinfos
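+    # e.g. a word listed with both a de-reading and a het-reading keeps only
+    # its de-reading(s); if there is no de-reading the list stays unchanged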
+ #if any([wordinfo[1] == lexicon.de for wordinfo in wordinfos]):
+ # wordinfos = []
# if not found yet we check with Alpino
if wordinfos != []:
diff --git a/asta_neo.py b/asta_neo.py
new file mode 100644
index 0000000..e5d6d0d
--- /dev/null
+++ b/asta_neo.py
@@ -0,0 +1,147 @@
+from lxml import etree
+#from CHAT_Annotation import specialform, errormarking
+
+specialform = 'Special Form'
+errormarking = 'Error Marking'
+
+mdnamemdxpathtemplate = """.//xmeta[@name="{mdname}" and @value="{mdvalue}"]"""
+ptposxpathtemplate = './/node[@pt and @begin="{position}"]'
+
+def mdbasedquery(stree, mdname, mdvalue):
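+    # look up xmeta elements with the given name/value; their annotatedposlist
+    # attribute (a string such as "[3]") holds the begin position of the
+    # annotated token, which is then used to fetch the corresponding pt node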
+ mdnamemdxpath = mdnamemdxpathtemplate.format(mdname=mdname, mdvalue=mdvalue)
+ mdnamemds = stree.xpath(mdnamemdxpath)
+ results = []
+ for mdnamemd in mdnamemds:
+ annotatedposstr = mdnamemd.attrib['annotatedposlist']
+ if annotatedposstr != '':
+ mdbeginval = annotatedposstr[1:-1]
+ ptposxpath = ptposxpathtemplate.format(position=mdbeginval)
+ newresults = stree.xpath(ptposxpath)
+ results += newresults
+
+ return results
+
+def neologisme(stree):
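+    # a neologism is marked either as an Error Marking with value ['n'] or
+    # as a Special Form with the @n marker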
+    results1 = mdbasedquery(stree, errormarking, "['n']")
+ results2 = mdbasedquery(stree, specialform, '@n')
+ results = results1 + results2
+ return results
+
+def sempar(stree):
+ results = mdbasedquery(stree, errormarking, "['s']")
+ return results
+
+def phonpar(stree):
+ results = mdbasedquery(stree, errormarking, "['p']")
+ return results
+
+
+def test(stree):
+ neoresults = neologisme(stree)
+ semparresults = sempar(stree)
+ phonparresults = phonpar(stree)
+ results = [('neo', neoresult) for neoresult in neoresults] +\
+ [('sempar', semparresult) for semparresult in semparresults] +\
+ [('phonpar', phonparresult) for phonparresult in phonparresults]
+ return results
+
+def main():
+ for i in strees:
+ results = test(strees[i])
+ for result in results:
+ print('{}: {}:{}'.format(result[0], result[1].attrib['word'], result[1].attrib['begin']))
+
+
+
+streestrings = {}
+
+streestrings[1] = """
+
+
+
+
+
+
+
+
+
+
+
+ ik heb geduusterd
+
+ Q#ng1646152422|ik heb geduusterd|1|1|-5.158487943820001
+
+
+"""
+
+
+streestrings[2] = """
+
+
+
+
+
+
+
+
+ ik heb ngeduusterd
+
+ Q#ng1646219407|ik heb ngeduusterd|1|1|-1.6311900273499995
+
+
+"""
+
+streestrings[3] = """
+
+
+
+
+
+
+
+
+ ik heb nngeduusterd
+
+ Q#ng1646219408|ik heb nngeduusterd|1|1|-1.6311900273499995
+
+
+"""
+streestrings[4] = """
+
+
+
+
+
+
+
+
+ ik heb pgeduusterd
+
+ Q#ng1646219409|ik heb pgeduusterd|1|1|-1.6311900273499995
+
+
+"""
+
+streestrings[5] = """
+
+
+
+
+
+
+
+
+ ik heb sgeduusterd
+
+ Q#ng1646219410|ik heb sgeduusterd|1|1|-1.6311900273499995
+
+
+
+"""
+
+strees = {}
+for i in streestrings:
+ strees[i] = etree.fromstring(streestrings[i])
+
+if __name__ == '__main__':
+ main()
diff --git a/asta_queries.py b/asta_queries.py
index 5097681..575a80d 100644
--- a/asta_queries.py
+++ b/asta_queries.py
@@ -213,13 +213,18 @@ def asta_bijzin(stree):
if getattval(cn1, 'begin') == getattval(cn0, 'begin'):
cn0end = getattval(cn0, 'end')
newbegin = cn0end
- newokptnode = find1(cn1, '//node[@pt and @begin={newbegin}]'.format(newbegin=newbegin))
- result = sortedclausenodes[2:] + okptnodes + [newokptnode]
+ newokptnodexpath = '//node[@pt and @begin="{newbegin}"]'.format(newbegin=newbegin)
+ newokptnode = find1(cn1, newokptnodexpath)
+ result = sortedclausenodes[2:] + okptnodes
+ if newokptnode is not None:
+ result += [newokptnode]
else:
result = sortedclausenodes[1:] + okptnodes
else:
result = sortedclausenodes[1:] + okptnodes
+    # ad-hoc filter to guarantee that there are no None matches; this should not happen anymore
+ result = [el for el in result if el is not None]
return result
diff --git a/basicreplacements.py b/basicreplacements.py
index f369702..339b327 100644
--- a/basicreplacements.py
+++ b/basicreplacements.py
@@ -41,6 +41,9 @@
('effe', 'even', pron, infpron, varpron),
('set', 'zet', pron, infpron, initdev), ('hie', 'hier', pron, pronerr, codared),
('eers', 'eerst', pron, pronerr, codared),
+ ('era', 'eraf', pron, pronerr, codared),
+ ('il', 'wil', pron, pronerr, onsetred),
+ ('tee', 'twee', pron, pronerr, onsetred),
('nie', 'niet', pron, infpron, codared),
('s', 'is', orth, spellerr, apomiss), ('ooke', 'ook', pron, infpron, addschwa),
('it', 'dit', pron, pronerr, onsetred),
@@ -67,6 +70,7 @@
('dis', ['dit', 'is'], pron, infpron, contract),
('das', ['dat', 'is'], pron, infpron, contract),
('tis', ['dit', 'is'], pron, infpron, contract),
+ ('waas', ['waar', 'is'], pron, infpron, contract),
('is-t-ie', ['is', 'ie'], pron, infpron, t_ie),
('als-t-ie', ['als', 'ie'], pron, infpron, t_ie),
('of-t-ie', ['of', 'ie'], pron, infpron, t_ie),
diff --git a/checkcorrection.py b/checkcorrection.py
new file mode 100644
index 0000000..021cbea
--- /dev/null
+++ b/checkcorrection.py
@@ -0,0 +1,66 @@
+'''
+Compares the errorlogging file with the error reference file
+'''
+
+import os
+from xlsx import getxlsxdata
+
+dataset = 'vkltarsp'
+dataset = 'vklstap'
+dataset = 'vklasta'
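+# quick dataset switch: the last of the assignments above wins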
+
+if dataset == 'vkltarsp':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\tarsp'
+ dataprefix = 'tarsp'
+
+elif dataset == 'vklstap':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata'
+ dataprefix = 'stap'
+
+elif dataset == 'vklasta':
+ resultspath = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\asta'
+ dataprefix = 'asta'
+
+
+errorloggingfilename = dataprefix + '_errorlogging.xlsx'
+errorloggingfullname = os.path.join(resultspath, errorloggingfilename)
+
+referencepath = r'D:\jodijk\Dropbox\Surfdrive\Shared\SASTAPLUS\November'
+errorreffilename = dataprefix + '_error_ref.xlsx'
+errorreffullname = os.path.join(referencepath, errorreffilename)
+
+logheader, logdata = getxlsxdata(errorloggingfullname)
+refheader, refdata = getxlsxdata(errorreffullname)
+
+refdict = {(row[0], row[1]): row[3] for row in refdata}
+
+correctcorrections = 0
+missedcorrections = 0
+wrongcorrections = 0
+for row in logdata:
+ key = (row[0], row[5])
+ if 'BEST' in row[10]:
+ logsent = row[9]
+ if key not in refdict:
+ print('Missing example in refdict: {}'.format(key))
+ print(row[9])
+ missedcorrections += 1
+ else:
+ refsent = refdict[key]
+ if refsent != logsent:
+ print('Mismatch: {}'.format(key))
+ print('refsent=<{}>'.format(refsent))
+ print('logsent=<{}>'.format(logsent))
+ wrongcorrections += 1
+ else:
+ correctcorrections += 1
+
+allcorrections = correctcorrections + wrongcorrections + missedcorrections
+
+correctioncounts = [correctcorrections, wrongcorrections, missedcorrections]
+labels = ['correct corrections', 'wrong corrections', 'missed corrections']
+labeled_corrections = zip(labels, correctioncounts)
+
+print('\nSummary:\n')
+for label, corr in labeled_corrections:
+ print('{} = {} ({:.2f}%)'.format(label, corr, corr / allcorrections * 100))
\ No newline at end of file
diff --git a/cleanCHILDEStokens.py b/cleanCHILDEStokens.py
index f7e074c..9d4a922 100644
--- a/cleanCHILDEStokens.py
+++ b/cleanCHILDEStokens.py
@@ -20,6 +20,16 @@
bstate, ostate, oostate, costate, ccstate = 0, 1, 2, 3, 4
+# this should be identical to the checkpattern of cleanCHILDESMD
+# #checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ123456789·\u22A5\u00B7\u0001\u2260\u21AB]')
+# checkpattern = re.compile(r'[][\(\)&%@/=><_0^~↓↑↑↓⇗↗→↘⇘∞≈≋≡∙⌈⌉⌊⌋∆∇⁎⁇°◉▁▔☺∬Ϋ·\u22A5\u00B7\u0001\u2260\u21AB]')
+# # + should not occur except as compound marker black+board
+# # next one split up in order to do substitutions
+# pluspattern = re.compile(r'(\W)\+|\+(\W)')
+# pluspattern1 = re.compile(r'(\W)\+')
+# pluspattern2 = re.compile(r'\+(\W)')
+illegalcleanedchatsymbols = '<>'
+
def findscopeclose(tokens, offset=0):
tokenctr = 0
@@ -83,22 +93,31 @@ def checkline(line, newline, outfilename, lineno, logfile):
print('charcodes=<{}>'.format(thecodes), file=logfile)
-def cleantext(utt, repkeep):
+def purifytokens(tokens):
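+    # drop stray '<' and '>' tokens that scoped annotations may leave behind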
+ result = [token for token in tokens if token.word not in illegalcleanedchatsymbols]
+ return result
+
+def cleantext(utt, repkeep, tokenoutput=False):
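+    # with tokenoutput=True the cleaned Token objects themselves are returned
+    # instead of a rejoined string, so callers (e.g. correct_stree) can keep
+    # the exact token positions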
newutt = robustness(utt)
tokens = sastatok.sasta_tokenize(newutt)
inwordlist = [t.word for t in tokens]
intokenstrings = [str(token) for token in tokens]
# print(space.join(intokenstrings))
(newtokens, metadata) = cleantokens(tokens, repkeep)
+    # remove symbol tokens that should not be there anymore
+ newtokens = purifytokens(newtokens)
resultwordlist = [t.word for t in newtokens]
resultstring = smartjoin(resultwordlist)
resultposlist = [t.pos for t in newtokens]
newmeta1 = Meta('tokenisation', inwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
newmeta2 = Meta('cleanedtokenisation', resultwordlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
- newmeta3 = Meta('cleanedtokenpositions', resultposlist, atype='list', source='CHAT/Tokenisation', backplacement=bpl_none)
+    newmeta3 = Meta('cleanedtokenpositions', resultposlist, annotationposlist=resultposlist, atype='list',
+                    source='CHAT/Tokenisation', backplacement=bpl_none)
metadata += [newmeta1, newmeta2, newmeta3]
resultmetadata = metadata
- return (resultstring, resultmetadata)
+ if tokenoutput:
+        return (newtokens, resultmetadata)
+ else:
+ return (resultstring, resultmetadata)
def cleantokens(tokens, repkeep):
@@ -133,7 +152,10 @@ def removesuspects(str):
return result
-robustnessrules = [(re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'),
+robustnessrules = [(re.compile(r'\u2026'), '\u2026', '...', 'Horizontal Ellipsis (\u2026, Unicode U+2026) replaced by a sequence of three Full Stops (..., Unicode U+002E) '),
+ (re.compile('#'), '#', '', 'Number Sign (#, Unicode U+0023) removed'),
+ #(re.compile('#'), '#', '(.)', 'Number Sign (#, Unicode U+0023) replaced by CHAT (short) pause code: (.)'),
+ (re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'),
(re.compile(r'\[\+trn\]'), '[+trn]', '[+ trn]', 'Missing space'),
(re.compile(r'\[:(?![:\s])'), '[:', '[: ', 'Missing space'),
(re.compile(r'(?<=\w)\+\.\.\.'), '+...', ' +...', 'Missing space'),
diff --git a/corrector.py b/corrector.py
index d5c78ba..a0947dd 100644
--- a/corrector.py
+++ b/corrector.py
@@ -1,15 +1,5 @@
''''
-You must then make sure that, in the CHAT file you produce, you go through every utterance and call a function
-
-getcorrection
-with the string of the utterance as its argument.
-
-This function then returns a tuple (correction, metadata)
-
-where
-• correction is a string that you must include in the CHAT file as the corrected utterance
-• metadata are metadata in the style of PaQu (type, name, value), e.g. origutt of type text with the input string as its value
-
+to be added
'''
import copy
@@ -26,7 +16,7 @@
getunwantedtokens, nodesfindjaneenou)
from deregularise import correctinflection
from iedims import getjeforms
-from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart
+from lexicon import de, dets, getwordinfo, het, informlexicon, known_word, isa_namepart, tswnouns
from macros import expandmacros
# from namepartlexicon import namepart_isa_namepart
from sastatok import sasta_tokenize
@@ -36,7 +26,7 @@
vowels)
from sva import getsvacorrections
from tokenmd import TokenListMD, TokenMD, mdlist2listmd
-from treebankfunctions import find1, getattval, getnodeyield
+from treebankfunctions import find1, getattval, getnodeyield, showtree, treeinflate, fatparse
from lxml import etree
import sys
# from alternative import Alternative, Replacement, Metadata, Meta
@@ -46,6 +36,7 @@
from alpinoparsing import parse, escape_alpino_input
from expandquery import expandmacros
from find_ngram import findmatches, ngram1, ngram2, ngram7, ngram10, ngram11, ngram16, ngram17
+from smallclauses import smallclauses
SASTA = 'SASTA'
@@ -177,7 +168,8 @@ def reduce(tokens, tree):
# remove tsw incl goh och hé oke but not ja, nee, nou
tswtokens = [n for n in reducedtokens if n.pos in token2nodemap
and getattval(token2nodemap[n.pos], 'pt') == 'tsw'
- and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'}]
+ and getattval(token2nodemap[n.pos], 'lemma') not in {'ja', 'nee', 'nou'}
+ and getattval(token2nodemap[n.pos], 'lemma') not in tswnouns]
tswpositions = [n.pos for n in tswtokens]
allremovetokens += tswtokens
    allremovepositions += tswpositions
@@ -413,11 +405,12 @@ def getcorrection(utt, tree=None, interactive=False):
return result
-def getcorrections(utt, method, tree=None, interactive=False):
- origutt = utt
+def getcorrections(rawtokens, method, tree=None, interactive=False):
allmetadata = []
- rawtokens = sasta_tokenize(utt)
+ # rawtokens = sasta_tokenize(utt)
wordlist = tokenlist2stringlist(rawtokens)
+ utt = space.join(wordlist)
+ origutt = utt
# check whether the tree has the same yield
origtree = tree
@@ -426,7 +419,7 @@ def getcorrections(utt, method, tree=None, interactive=False):
if treewordlist != wordlist:
revisedutt = space.join(wordlist)
- tree = PARSE_FUNC(revisedutt)
+ tree = fatparse(revisedutt, rawtokens)
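+        # fatparse parses the utterance and aligns the node begin/end values
+        # with the token positions of rawtokens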
tokens, metadata = cleantokens(rawtokens, repkeep=False)
allmetadata += metadata
@@ -489,8 +482,9 @@ def getalternatives(origtokensmd, method, tree, uttid):
# now turn each sequence of (token, md) pairs into a pair (tokenlist, mergedmetadata)
newaltuttmds = []
for altuttmd in altutts:
- newaltuttmd = mdlist2listmd(altuttmd)
- newaltuttmds.append(newaltuttmd)
+ if altuttmd != []:
+ newaltuttmd = mdlist2listmd(altuttmd)
+ newaltuttmds.append(newaltuttmd)
# basic expansions
@@ -508,8 +502,8 @@ def getalternatives(origtokensmd, method, tree, uttid):
for uttmd in allalternativemds:
# utterance = space.join([token.word for token in uttmd.tokens])
utterance, _ = mkuttwithskips(uttmd.tokens)
- ntree = PARSE_FUNC(utterance)
- newresults += getwrongdetalternatives(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += getwrongdetalternatives(uttmd, fatntree, uttid)
allalternativemds += newresults
newresults = []
@@ -518,9 +512,11 @@ def getalternatives(origtokensmd, method, tree, uttid):
utterance, _ = mkuttwithskips(uttmd.tokens)
# reducedtokens = [t for t in uttmd.tokens if not t.skip]
# reduceduttmd = TokenListMD(reducedtokens, uttmd.metadata)
- ntree = PARSE_FUNC(utterance)
- # simpleshow(ntree)
- uttalternativemds = getsvacorrections(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ debug = False
+ if debug:
+ showtree(fatntree)
+ uttalternativemds = getsvacorrections(uttmd, fatntree, uttid)
newresults += uttalternativemds
allalternativemds += newresults
@@ -528,8 +524,16 @@ def getalternatives(origtokensmd, method, tree, uttid):
for uttmd in allalternativemds:
# utterance = space.join([token.word for token in uttmd.tokens])
utterance, _ = mkuttwithskips(uttmd.tokens)
- ntree = PARSE_FUNC(utterance)
- newresults += correctPdit(uttmd, ntree, uttid)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += correctPdit(uttmd, fatntree, uttid)
+ allalternativemds += newresults
+
+ newresults = []
+ for uttmd in allalternativemds:
+ utterance, _ = mkuttwithskips(uttmd.tokens)
+ fatntree = fatparse(utterance, uttmd.tokens)
+ newresults += smallclauses(uttmd, fatntree)
+ # showtree(fatntree, text='fatntree')
allalternativemds += newresults
    # final check whether the alternatives are improvements; it is not assumed that the original token list is included in the alternatives
@@ -570,7 +574,7 @@ def mkuttwithskips(tokens, delete=True):
return result, tokenposlist
-def getexpansions(uttmd):
+def oldgetexpansions(uttmd):
expansionfound = False
newtokens = []
tokenctr = 0
@@ -612,6 +616,50 @@ def getexpansions(uttmd):
return result
+
+def getexpansions(uttmd):
+ expansionfound = False
+ newtokens = []
+ tokenctr = 0
+ #newtokenctr = 0
+ tokenposlist = []
+ newmd = uttmd.metadata
+ for tokenctr, token in enumerate(uttmd.tokens):
+ if token.word.lower() in basicexpansions:
+ expansionfound = True
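+            # an expansion replaces one input token by several output tokens;
+            # all parts keep the original token.pos and are distinguished by
+            # subpos, so position-based metadata elsewhere stays valid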
+ for (rlist, c, n, v) in basicexpansions[token.word.lower()]:
+ rlisttokenctr = 0
+ for rlisttokenctr, rw in enumerate(rlist):
+ if rlisttokenctr == 0:
+ newtoken = Token(rw, token.pos)
+ else:
+ newtoken = Token(rw, token.pos, subpos=rlisttokenctr)
+ newtokens.append(newtoken)
+ tokenposlist.append(token.pos)
+ nwt = Token(space.join(rlist), token.pos)
+ meta1 = mkSASTAMeta(token, nwt, n, v, c, subcat=None, penalty=defaultpenalty,
+ backplacement=bpl_none)
+ newmd.append(meta1)
+
+ else:
+ newtoken = Token(token.word, token.pos)
+ newtokens.append(newtoken)
+ tokenposlist.append(token.pos)
+
+ # adapt the metadata
+ if expansionfound:
+ meta2 = Meta('OrigCleanTokenPosList', tokenposlist, annotatedposlist=[],
+ annotatedwordlist=[], annotationposlist=tokenposlist,
+ annotationwordlist=[], cat='Tokenisation', subcat=None, source=SASTA, penalty=defaultpenalty,
+ backplacement=bpl_none)
+ newmd.append(meta2)
+ result = [TokenListMD(newtokens, newmd)]
+ else:
+ result = []
+
+ return result
+
+
def lexcheck(intokensmd, allalternativemds):
finalalternativemds = [intokensmd]
for alternativemd in allalternativemds:
@@ -708,7 +756,7 @@ def explanationasreplacement(tokensmd, tree):
bpl = bpl_node if known_word(oldword) else bpl_word
meta = mkSASTAMeta(oldtoken, newtoken, name='ExplanationasReplacement',
value='ExplanationasReplacement',
- cat='Lexical Error', backplacement=bpl_node)
+ cat='Lexical Error', backplacement=bpl)
newmetadata.append(meta)
result = TokenListMD(newtokens, newmetadata)
return result
@@ -925,10 +973,10 @@ def getwrongdetalternatives(tokensmd, tree, uttid):
meta = mkSASTAMeta(token, newcurtoken, name='GrammarError', value='deheterror', cat='Error',
backplacement=bpl_node)
metadata.append(meta)
+ correctiondone = True
else:
newcurtokenword = token.word
newtokens.append(Token(newcurtokenword, token.pos))
- correctiondone = True
else:
newcurtokenword = token.word
newtokens.append(token)
@@ -959,24 +1007,24 @@ def correctPdit(tokensmd, tree, uttid):
metadata = tokensmd.metadata
newtokens = []
tokenctr = 0
+ nonskiptokenctr = 0
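+    # separate counter over non-skipped tokens: the parse only contains nodes
+    # for tokens that were not skipped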
prevtoken = None
for token in tokens:
- tokennode = next(filter(lambda x: getattval(x, 'begin') == str(tokenctr), tokennodes), None)
+ tokennode = next(filter(lambda x: getattval(x, 'begin') == str(token.pos + token.subpos), tokennodes), None)
tokenlemma = getattval(tokennode, 'lemma')
if not token.skip and prevtoken is not None and not prevtoken.skip and tokenlemma in {'dit', 'dat', 'deze',
'die'}:
tokenrel = getattval(tokennode, 'rel')
tokenpt = getattval(tokennode, 'pt')
- prevtokennode = tokennodes[tokenctr - 1] if tokenctr > 0 else None
+ prevtokennode = tokennodes[nonskiptokenctr - 1] if tokenctr > 0 else None
if prevtokennode is not None:
prevpt = getattval(prevtokennode, 'pt')
prevparent = prevtokennode.getparent()
prevparentrel, prevparentcat = getattval(prevparent, 'rel'), getattval(prevparent, 'cat')
indezemwp = getindezemwp(prevtokennode, tokennode)
- if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'obj1',
- 'det'} and tokenpt == 'vnw') or \
+ if (prevpt == 'vz' and prevparentcat != 'pp' and tokenrel not in {'det'} and tokenpt == 'vnw') or \
indezemwp:
- newtoken = Token('hem', tokenctr)
+ newtoken = Token('hem', token.pos, subpos=token.subpos)
bpl = bpl_indeze if indezemwp else bpl_node
meta = mkSASTAMeta(token, newtoken, name='parsed as', value='hem', cat='AlpinoImprovement',
backplacement=bpl)
@@ -990,6 +1038,8 @@ def correctPdit(tokensmd, tree, uttid):
else:
newtokens.append(token)
tokenctr += 1
+ if not token.skip:
+ nonskiptokenctr += 1
prevtoken = token
result = TokenListMD(newtokens, metadata)
if correctiondone:
diff --git a/correcttreebank.py b/correcttreebank.py
index 9354add..25912d7 100644
--- a/correcttreebank.py
+++ b/correcttreebank.py
@@ -4,8 +4,7 @@
from lxml import etree
from basicreplacements import basicreplacements
-from cleanCHILDEStokens import cleantext
-from corrector import getcorrections, mkuttwithskips
+from corrector import getcorrections, mkuttwithskips, disambiguationdict
from lexicon import de, dets, known_word
from metadata import (Meta, bpl_delete, bpl_indeze, bpl_node, bpl_none,
bpl_word, bpl_wordlemma)
@@ -17,8 +16,13 @@
deletewordnodes, find1, getattval, getbeginend,
getcompoundcount, getnodeyield, getsentid,
gettokposlist, getyield, myfind, showflatxml,
- simpleshow, transplant_node)
+ simpleshow, transplant_node, showtree, treeinflate, fatparse, treewithtokenpos,
+ updatetokenpos, getuttid)
from config import PARSE_FUNC, SDLOGGER
+from metadata import insertion
+from sastatoken import inflate, deflate, tokeninflate, insertinflate
+from CHAT_Annotation import omittedword
+from cleanCHILDEStokens import cleantext
ampersand = '&'
@@ -123,61 +127,52 @@ def contextualise(node1, node2):
newnode.attrib[prop] = node2.attrib[prop]
return newnode
+def updatemetadata(metadata, tokenposdict):
+ begintokenposdict = {k-1: v-1 for (k, v) in tokenposdict.items()}
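+    # tokenposdict maps 1-based end positions; the metadata position lists
+    # hold 0-based begin positions, hence the shift by one on both sides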
+ newmetadata = []
+ for meta in metadata:
+ newmeta = deepcopy(meta)
+        newmeta.annotationposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos)
+                                     for pos in meta.annotationposlist]
+        newmeta.annotatedposlist = [begintokenposdict[pos] if pos in begintokenposdict else insertinflate(pos)
+                                    for pos in meta.annotatedposlist]
+ newmetadata.append(newmeta)
+ return newmetadata
-def updatetokenpos(resulttree, tokenposdict):
- # resulttree = deepcopy(stree)
- for child in resulttree:
- newchild = updatetokenpos(child, tokenposdict)
- if ('pt' in resulttree.attrib or 'pos' in resulttree.attrib) and 'end' in resulttree.attrib and 'begin' in resulttree.attrib:
- intend = int(resulttree.attrib['end'])
- if intend in tokenposdict:
- newendint = tokenposdict[intend]
- resulttree.attrib['end'] = str(newendint)
- resulttree.attrib['begin'] = str(newendint - 1)
- else:
- SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend))
- etree.dump(resulttree)
- SDLOGGER.error('tokenposdict={}'.format(tokenposdict))
- elif 'cat' in resulttree.attrib:
- children = [ch for ch in resulttree]
- (b, e) = getbeginend(children)
- resulttree.attrib['begin'] = b
- resulttree.attrib['end'] = e
+def updatetokenposmd(intree, metadata, tokenposdict):
+ resulttree = updatetokenpos(intree, tokenposdict)
+ newmetadata = updatemetadata(metadata, tokenposdict)
+ return resulttree, newmetadata
- return resulttree
def findskippednodes(stree, tokenlist):
+ debug = False
+ if debug:
+ showtree(stree, text='findskippednodes:stree:')
topnode = find1(stree, './/node[@cat="top"]')
- # tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))}
- tokenposdict = {}
- elctr = 0
- i = 0
- for tok in tokenlist:
- elctr += 1
- if not tok.skip:
- tokenposdict[elctr] = i + 1
- i += 1
- resultlist = findskippednodes2(topnode, tokenposdict)
+ #tokenposdict = {i+1:tokenlist[i].pos+1 for i in range(len(tokenlist))}
+ tokenposset = {t.pos + 1 for t in tokenlist if not t.skip}
+ resultlist = findskippednodes2(topnode, tokenposset)
return resultlist
-def findskippednodes2(stree, tokenposdict):
+def findskippednodes2(stree, tokenposset):
resultlist = []
if stree is None:
return resultlist
if 'pt' in stree.attrib or 'pos' in stree.attrib:
- if int(stree.attrib['end']) not in tokenposdict:
+ if int(stree.attrib['end']) not in tokenposset:
resultlist.append(stree)
elif 'cat' in stree.attrib:
for child in stree:
- resultlist += findskippednodes2(child, tokenposdict)
+ resultlist += findskippednodes2(child, tokenposset)
else:
pass
return resultlist
-def insertskips(newstree, tokenlist, stree):
+
+
+def insertskips(newstree, tokenlist, stree):
'''
:param newstree: the corrected tree, with skipped elements absent
@@ -185,58 +180,81 @@ def insertskips(newstree, tokenlist, stree):
:param stree: original stree with parses of the skipped elements
:return: adapted tree, with the skipped elements inserted (node from the original stree as -- under top, begin/ends updates
'''
- # debug = True
debug = False
if debug:
- print('\nnewstree:')
- etree.dump(newstree)
- resulttree = deepcopy(newstree)
+ showtree(newstree, 'newstree:')
+ showtree(stree, 'stree')
+ reducedtokenlist = [t for t in tokenlist if not t.skip]
+ resulttree = treewithtokenpos(newstree, reducedtokenlist)
+
+ if debug:
+ showtree(resulttree, text='resulttree:')
+    streetokenlist = [t for t in tokenlist if t.subpos == 0]
+ stree = treewithtokenpos(stree, streetokenlist)
+ if debug:
+ showtree(stree, text='stree with tokenpos:')
+ debug = False
# tokenpostree = deepcopy(stree)
# update begin/ends
- reducedtokenlist = [t for t in tokenlist if not t.skip]
- tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))}
- resulttree = updatetokenpos(resulttree, tokenposdict)
+ #next not needed anymore
+ #tokenposdict = {i + 1: reducedtokenlist[i].pos + 1 for i in range(len(reducedtokenlist))}
+ #showtree(resulttree, text='in: ')
+ #resulttree, newmetadata = updatetokenposmd(resulttree, metadata, tokenposdict)
+ #showtree(resulttree, text='out:')
# tokenpostree = updatetokenpos(tokenpostree, tokenposdict)
- if debug:
- print('\nstree:')
- etree.dump(stree)
- # print('\ntokenpostree:')
- # etree.dump(tokenpostree)
- print('\nresulttree:')
- etree.dump(resulttree)
+ #if debug:
+ # print('\nstree:')
+ # etree.dump(stree)
+ # # print('\ntokenpostree:')
+ # # etree.dump(tokenpostree)
+ # print('\nresulttree:')
+ # etree.dump(resulttree)
# insert skipped elements
nodestoinsert = findskippednodes(stree, tokenlist)
nodestoinsertcopies = [deepcopy(n) for n in nodestoinsert]
- # simpleshow(stree)
+ if debug:
+ showtree(stree, text='insertskips: stree:')
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
topnode = find1(resulttree, './/node[@cat="top"] ')
topchildren = [ch for ch in topnode]
allchildren = nodestoinsertcopies + topchildren
sortedchildren = sorted(allchildren, key=lambda x: x.attrib['end'], reverse=True)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
for ch in topnode:
topnode.remove(ch)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
for node in sortedchildren:
node.attrib['rel'] = '--' # these are now extragrammatical with relation --
topnode.insert(0, node)
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
(b, e) = getbeginend(sortedchildren)
topnode.attrib['begin'] = b
topnode.attrib['end'] = e
- # simpleshow(stree)
+ if debug:
+ showtree(resulttree, text='insertskips: resulttree:')
sentlist = getyield(resulttree)
sent = space.join(sentlist)
sentnode = find1(resulttree, 'sentence')
sentnode.text = sent
if debug:
- print('result of insertskips')
- etree.dump(resulttree)
+ showtree(resulttree, 'result of insertskips')
return resulttree
+def getomittedwordbegins(metalist):
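+    # begin positions of words annotated as omitted (CHAT 0-words); these are
+    # deleted from the corrected tree again, so the yield check must ignore them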
+ results = []
+ for meta in metalist:
+ if meta.name == omittedword:
+ results += meta.annotatedposlist
+ return results
+
def correct_stree(stree, method, corr):
'''
@@ -255,7 +273,7 @@ def correct_stree(stree, method, corr):
print(showflatxml(stree))
allmetadata = []
- allorandalts = []
+ orandalts = []
# uttid:
uttid = getuttid(stree)
@@ -266,7 +284,7 @@ def correct_stree(stree, method, corr):
origutt = getorigutt(stree)
if origutt is None:
SDLOGGER.error('Missing origutt in utterance {}'.format(uttid))
- return stree
+ return stree, orandalts
# list of token positions
# get the original metadata; these will be added later to the tree of each correction
@@ -282,19 +300,33 @@ def correct_stree(stree, method, corr):
# allmetadata += origmetadata
# clean in the tokenized manner
- cleanutt, chatmetadata = cleantext(origutt, False)
+ cleanutttokens, chatmetadata = cleantext(origutt, False, tokenoutput=True)
allmetadata += chatmetadata
- cleanutttokens = sasta_tokenize(cleanutt)
+ #cleanutttokens = sasta_tokenize(cleanutt)
cleanuttwordlist = [t.word for t in cleanutttokens]
+ cleanutt = space.join(cleanuttwordlist)
- # get corrections, given the stree
+ # get corrections, given the inflated stree
+ #inflate the tree
+ fatstree = deepcopy(stree)
+ treeinflate(fatstree)
+ # adapt the begins and ends in the tree based on the token positions
+ debug = False
+ if debug:
+ showtree(fatstree, text='fatstree voor:')
+ tokenlist = [t for t in cleanutttokens]
+ fatstree = treewithtokenpos(fatstree, tokenlist)
+ if debug:
+ showtree(fatstree, text='fatstree na:')
+ debug = False
+    # showtree(fatstree, text='fattened tree:')
- ctmds = getcorrections(cleanutt, method, stree)
+ ctmds = getcorrections(cleanutttokens, method, fatstree)
+ debug = False
if debug:
- print('2:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatstree, text='2:')
+ debug = False
ptmds = []
for correctiontokenlist, cwmdmetadata in ctmds:
@@ -302,70 +334,89 @@ def correct_stree(stree, method, corr):
correctionwordlist = tokenlist2stringlist(correctiontokenlist, skip=True)
# parse the corrections
- if correctionwordlist != cleanuttwordlist:
- # @@@adapt this, skip the tokens to be skipped@@@
- # correction = space.join(correctionwordlist)
+ if correctionwordlist != cleanuttwordlist and correctionwordlist != []:
correction, tokenposlist = mkuttwithskips(correctiontokenlist)
cwmdmetadata += [Meta('parsed_as', correction, cat='Correction', source='SASTA')]
- newstree = PARSE_FUNC(correction)
- if newstree is None:
- newstree = stree # is this what we want?@@
+ reducedcorrectiontokenlist = [token for token in correctiontokenlist if not token.skip]
+ fatnewstree = fatparse(correction, reducedcorrectiontokenlist)
+ debugb = False
+ if debugb:
+ showtree(fatnewstree, text='fatnewstree')
+
+ if fatnewstree is None:
+ fatnewstree = fatstree # is this what we want?@@
else:
# insert the leftout words and adapt the begin/ends of the nodes
# simpleshow(stree)
- newstree = insertskips(newstree, correctiontokenlist, stree)
+ fatnewstree = insertskips(fatnewstree, correctiontokenlist, fatstree)
+ #newstree = insertskips(newstree, correctiontokenlist, stree)
# simpleshow(stree)
mdcopy = deepcopy(origmetadata)
- newstree.insert(0, mdcopy)
+ fatnewstree.insert(0, mdcopy)
# copy the sentid attribute
- sentencenode = getsentencenode(newstree)
+ sentencenode = getsentencenode(fatnewstree)
if sentencenode is not None:
sentencenode.attrib['sentid'] = sentid
- if debug:
- print(etree.tostring(newstree, pretty_print=True))
- # etree.dump(newstree)
+ if debugb:
+ showtree(fatnewstree)
+ # etree.dump(fatnewstree)
else:
# make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata
- newstree = add_metadata(stree, chatmetadata)
+ fatnewstree = add_metadata(fatstree, chatmetadata)
- ptmds.append((correctionwordlist, newstree, cwmdmetadata))
+ ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata))
# select the stree for the most promising correction
+ debug = False
if debug:
print('3:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatnewstree)
+ debug = False
if ptmds == []:
- thecorrection, orandalts = (cleanutt, stree, origmetadata), None
+ thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
elif corr in [corr1, corrn]:
- thecorrection, orandalts = selectcorrection(stree, ptmds, corr)
+ thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr)
else:
SDLOGGER.error('Illegal correction value: {}. No corrections applied'.format(corr))
- thecorrection, orandalts = (cleanutt, stree, origmetadata), None
+ thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thetree = deepcopy(thecorrection[1])
- if debug:
- print('4:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ #debuga = True
+ debuga = False
+ if debuga:
+ print('4: (fatstree)')
+ etree.dump(fatstree, pretty_print=True)
# do replacements in the tree
- # etree.dump(thetree)
+ if debuga:
+ print('4b: (thetree)')
+ etree.dump(thetree, pretty_print=True)
reverseposindex = gettokposlist(thetree)
+ if debuga:
+ print('4b: (thetree)')
+ etree.dump(thetree, pretty_print=True)
+
# resultposmeta = selectmeta('cleanedtokenpositions', allmetadata)
# resultposlist = resultposmeta.value
newcorrection2 = thecorrection[2]
nodes2deletebegins = []
+    # next steps adapted: the tree already has inflated ('fat') token positions
+ debug = False
+ if debug:
+ showtree(thetree, text='thetree before treewithtokenpos')
+ thetree = treewithtokenpos(thetree, correctiontokenlist)
+ if debug:
+ showtree(thetree, text='thetree after treewithtokenpos')
for meta in thecorrection[2]:
if meta.backplacement == bpl_node:
nodeend = meta.annotationposlist[-1] + 1
newnode = myfind(thetree, './/node[@pt and @end="{}"]'.format(nodeend))
- oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend))
+ oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend))
if newnode is not None and oldnode is not None:
# adapt oldnode1 for contextual features
contextoldnode = contextualise(oldnode, newnode)
@@ -374,7 +425,7 @@ def correct_stree(stree, method, corr):
nodeend = meta.annotationposlist[-1] + 1
nodexpath = './/node[@pt and @begin="{}" and @end="{}"]'.format(nodeend - 1, nodeend)
newnode = myfind(thetree, nodexpath)
- oldnode = myfind(stree, nodexpath)
+ oldnode = myfind(fatstree, nodexpath)
if newnode is not None and oldnode is not None:
if 'word' in newnode.attrib and 'word' in oldnode.attrib:
newnode.attrib['word'] = oldnode.attrib['word']
@@ -403,28 +454,39 @@ def correct_stree(stree, method, corr):
elif meta.backplacement == bpl_indeze:
nodebegin = meta.annotatedposlist[-1]
nodeend = nodebegin + 1
- oldnode = myfind(stree, './/node[@pt and @end="{}"]'.format(nodeend))
+ oldnode = myfind(fatstree, './/node[@pt and @end="{}"]'.format(nodeend))
if oldnode is not None:
nodeid = oldnode.attrib['id']
dezeAVnode = etree.fromstring(dezeAVntemplate.format(begin=nodebegin, end=nodeend, id=nodeid))
thetree = transplant_node(oldnode, dezeAVnode, thetree)
- # etree.dump(thetree, pretty_print=True)
+ #etree.dump(thetree, pretty_print=True)
+
+ # now do all the deletions at once, incl adaptation of begins and ends, and new sentence node
+ debug = False
+ if debug:
+ showtree(thetree, text='thetree before deletion:')
- # now do all the deletions at once, incl normalisation of begins and ends, and new sentence node
+ nodes2deletebegins = [int(b) for b in nodes2deletebegins]
thetree = deletewordnodes(thetree, nodes2deletebegins)
+ if debug:
+ showtree(thetree, text='thetree after deletion:')
+
+ debug = False
+
# adapt the metadata
cleantokposlist = [meta.annotationwordlist for meta in newcorrection2 if meta.name == 'cleanedtokenpositions']
cleantokpos = cleantokposlist[0] if cleantokposlist != [] else []
- newcorrection2 = [updatecleantokmeta(meta, nodes2deletebegins, cleantokpos) for meta in newcorrection2]
+    insertbegins = [meta.annotatedposlist for meta in newcorrection2 if meta.name == insertion]
+ flatinsertbegins = [str(v) for el in insertbegins for v in el]
+ purenodes2deletebegins = [str(v) for v in nodes2deletebegins if str(v) not in flatinsertbegins]
+ newcorrection2 = [updatecleantokmeta(meta, purenodes2deletebegins, cleantokpos) for meta in newcorrection2]
- # etree.dump(thetree, pretty_print=True)
+ #etree.dump(thetree, pretty_print=True)
if debug:
- print('5:', end=': ')
- simpleshow(stree)
- print(showflatxml(stree))
+ showtree(fatstree, text='5:')
restoredtree = thetree
@@ -451,12 +513,19 @@ def correct_stree(stree, method, corr):
metadata.append(meta.toElement())
if debug:
- streesentlist = getyield(stree)
+ streesentlist = getyield(fatstree)
fulltreesentlist = getyield(fulltree)
if streesentlist != fulltreesentlist:
SDLOGGER.warning('Yield mismatch\nOriginal={original}\nAfter correction={newone}'.format(original=streesentlist,
newone=fulltreesentlist))
-
+ rawoldleavenodes = getnodeyield(fatstree)
+ omittedwordbegins = getomittedwordbegins(newcorrection2)
+ oldleavenodes = [n for n in rawoldleavenodes if int(getattval(n, 'begin')) not in omittedwordbegins]
+    oldleaves = [getattval(n, 'word') for n in oldleavenodes]
+ newleaves = getyield(fulltree)
+ uttid = getuttid(stree)
+ if debug and oldleaves != newleaves:
+ SDLOGGER.error('Yield mismatch:{uttid}\n:OLD={oldleaves}\nNEW={newleaves}'.format(uttid=uttid, oldleaves=oldleaves, newleaves=newleaves))
# return this stree
# print('dump 2:')
# etree.dump(fulltree, pretty_print=True)
@@ -487,7 +556,7 @@ def updatecleantokmeta(meta, begins, cleantokpos):
return meta
-def getuttid(stree):
+def oldgetuttid(stree):
uttidlist = stree.xpath(uttidxpath)
if uttidlist == []:
SDLOGGER.error('Missing uttid')
@@ -507,14 +576,14 @@ def getorigutt(stree):
def scorefunction(obj): return (-obj.unknownwordcount, -obj.dpcount, -obj.dhyphencount, obj.goodcatcount,
- -obj.basicreplaceecount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount,
+ -obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, obj.dimcount, obj.compcount, obj.supcount,
obj.compoundcount, obj.sucount, obj.svaok, -obj.deplusneutcount, -obj.penalty)
class Alternative():
def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcount,
compcount, supcount, compoundcount, unknownwordcount, sucount, svaok, deplusneutcount, goodcatcount,
- hyphencount, basicreplaceecount):
+ hyphencount, basicreplaceecount, ambigcount):
self.stree = stree
self.altid = altid
self.altsent = altsent
@@ -532,6 +601,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, dimcou
self.goodcatcount = int(goodcatcount)
self.hyphencount = int(hyphencount)
self.basicreplaceecount = int(basicreplaceecount)
+ self.ambigcount = int(ambigcount)
def alt2row(self, uttid, base, user1='', user2='', user3='', bestaltids=[], selected=None, origsent=None):
scores = ['BEST'] if self.altid in bestaltids else []
@@ -651,6 +721,12 @@ def isvalidword(w):
return True
+def countambigwords(stree):
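+    # count leaf nodes whose (lowercased) word form occurs in disambiguationdict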
+ leaves = getnodeyield(stree)
+ ambignodes = [leave for leave in leaves if getattval(leave, 'word').lower() in disambiguationdict]
+ result = len(ambignodes)
+ return result
+
def selectcorrection(stree, ptmds, corr):
# to be implemented@@
# it is presupposed that ptmds is not []
@@ -677,9 +753,10 @@ def selectcorrection(stree, ptmds, corr):
hyphencount = len([node for node in nt.xpath('.//node[contains(@word, "-")]')])
basicreplaceecount = len([node for node in nt.xpath('.//node[@word]')
if getattval(node, 'word').lower() in basicreplacements])
+ ambigwordcount = countambigwords(nt)
alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, dimcount, compcount, supcount,
compoundcount, unknownwordcount, sucount, svaokcount, deplusneutcount, goodcatcount,
- hyphencount, basicreplaceecount)
+ hyphencount, basicreplaceecount, ambigwordcount)
alts[altid] = alt
altid += 1
orandalts = OrigandAlts(orig, alts)
diff --git a/find_ngram.py b/find_ngram.py
index e4a661c..1f1e41f 100644
--- a/find_ngram.py
+++ b/find_ngram.py
@@ -192,6 +192,8 @@ def cond17(ns, lvs, i): return lemma(ns[0]) == 'te' and getattval(ns[1], 'his')
def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen' and lemma(ns[2]) == 'te'
+def cond18(ns, lvs, i): return pt(ns[0]) == 'vz' and lemma(ns[1]) in {'dit', 'dat', 'deze', 'die'}
+
ngram1 = Ngram(4, cond1)
ngram2 = Ngram(4, cond2)
ngram3 = Ngram(2, cond3)
@@ -211,7 +213,7 @@ def cond17a(ns, lvs, i): return lemma(ns[0]) == 'te' and word(ns[1]) == 'kregen'
ngram16a = Ngram(4, cond16a) # geen beroerte een beroerte test
ngram17 = Ngram(4, cond17) # te kregen te krijgen
ngram17a = Ngram(4, cond17a) # te kregen te krijgen test
-
+ngram18 = Ngram(2, cond18) # met dit
def main():
@@ -231,7 +233,7 @@ def main():
leaves = getnodeyield(tree)
cleanleaves = [leave for leave in leaves if getattval(leave, 'word') not in filledpauseslexicon]
cleanwordlist = [getattval(leave, 'word') for leave in cleanleaves]
- matches = findmatches(ngram1, cleanleaves)
+ matches = findmatches(ngram18, cleanleaves)
# matches = sipvjpvjsi(cleanleaves, tree)
for match in matches:
uttid = getuttid(tree)
diff --git a/lexicon.py b/lexicon.py
index 9692346..380e1dc 100644
--- a/lexicon.py
+++ b/lexicon.py
@@ -12,6 +12,9 @@
lexicon = celex
+# Alpino often analyses certain words as tsw though they should be analysed as nouns
+tswnouns = ['baby', 'jongen', 'juf', 'juffrouw', 'mam', 'mama', 'mamma', 'meisje', 'mens', 'meneer', 'mevrouw',
+ 'pap', 'papa', 'pappa', 'stouterd', 'opa', 'oma']
de = '1'
het = '2'
diff --git a/macros/newimperatives.txt b/macros/newimperatives.txt
index 2ab72e1..490a40f 100644
--- a/macros/newimperatives.txt
+++ b/macros/newimperatives.txt
@@ -54,8 +54,11 @@ nonfinvc = """(@rel="vc" and %nonfincat%) """
realcomplormodnode = """node[%realcomplormod%]"""
realcomplormod = """(not(%particlesvp%) and not(%indexnode%) and not(%nonfinvc%) and not(@rel="hd"))"""
indexnode = """(@index and not (@cat or @pt or @pos))"""
+suindexnode = """(%indexnode% and @rel="su") """
nonfinindexnode = """(%indexnode% and parent::node[%nonfinvc%])"""
+fillednode = """node[not(%indexnode%)]"""
+
particlesvp = """(@rel="svp" and @pt="vz")"""
realcomplormodnodecount = """count(%realcomplormodnode% | node[%nonfinvc%]/%realcomplormodnode%)"""
@@ -94,9 +97,23 @@ wond5plus = """(%ynquery% and %realcomplormodnodecount% >= 4)"""
partofwhquestion = """((@cat="sv1" or @cat="ssub") and @rel="body" and parent::node[@cat="whq" or @cat="whsub" ]) """
declarative = """(@cat="smain" or (@cat="ssub" and not(%partofwhquestion%)) or (@cat="sv1" and not(%basicimperative%) and not(%ynquery%) and not(%partofwhquestion%)) )"""
-Tarsp_OndWB = """
-(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 )
-"""
+Tarsp_OndB = """(%Ond% and node[%Tarsp_Basic_B%] and count(node) = 2)"""
+
+Tarsp_OndVC = """(%Ond% and node[%Tarsp_Basic_VC%] and count(node) = 2) """
+
+Tarsp_OndBVC = """(%Ond% and node[%Tarsp_Basic_B%] and node[%Tarsp_Basic_VC%] and count(node) = 3) """
+
+Tarsp_OndW = """(%declarative% and %Ond% and (%Tarsp_W% or node[%Tarsp_onlyWinVC%]) and %realcomplormodnodecount% = 0 )"""
+
+Tarsp_onlyWinVC = """(@rel="vc" and node[@rel="hd" and @pt="ww" and %realcomplormodnodecount% = 0])"""
+
+
+Tarsp_OndWB = """(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %realcomplormodnodecount% = 2 )"""
+
+Tarsp_BasicVCW = """(node[@pt="ww" and @rel="hd"] and node[%Tarsp_Basic_VC%] and count(%fillednode%)=2)"""
+
+Tarsp_VCW_X = """(%Tarsp_BasicVCW% or (node[%nonfinvc% and %Tarsp_BasicVCW%] and count(node)=1) )"""
+
Tarsp_OndWBVC = """
(%declarative% and %Ond% and %Tarsp_W% and %Tarsp_B_X% and %Tarsp_VC_X% and %realcomplormodnodecount% = 3 )
@@ -180,6 +197,8 @@ Tarsp_Ov3 = """(%declarative% and
not(%Tarsp_OndWB%) and
not(%Tarsp_BBX%)and
not(%Tarsp_WBVC%) and
+ not(%Tarsp_OndB%) and
+ not(%Tarsp_OndVC%) and
%realcomplormodnodecount% = 2) """
@@ -190,11 +209,12 @@ Tarsp_kijkVU = """(@pt="ww" and @lemma="kijken" and @wvorm="pv" and @pvagr="ev"
Tarsp_pporvc = """ (((@rel="pc" or @rel="mod" or @rel="ld") and @cat="pp") or @rel="vc")"""
-Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"]) and
+Tarsp_coreW = """ ( @pt="ww" and (@wvorm="pv" or parent::node[@rel!="vc"] or %Tarsp_BarenonfinW%) and
not(%Tarsp_kijkVU%) and
not((@lemma="zijn" or @lemma="worden") and
parent::node[node[@rel="vc"]]) )"""
-
+
+Tarsp_BarenonfinW = """parent::node[@rel="vc" and parent::node[@cat="smain" and count(node)=1]]"""
Tarsp_Hwwi = """(( @pt="ww" and @rel="hd" and @wvorm="pv" and
%Tarsp_hww% and
diff --git a/macros/sastamacros1.txt b/macros/sastamacros1.txt
index e976d0c..5034941 100644
--- a/macros/sastamacros1.txt
+++ b/macros/sastamacros1.txt
@@ -37,9 +37,9 @@ JO_kijken_naar = """ parent::node[@cat="pp" and
robusttopicdrop = """(@cat="sv1" and ../node[@lemma="."])"""
Tarsp_hww = """
- (@lemma="kunnen" or
+ (@lemma = "kunnen" or
@lemma = "moeten" or
- @lemma= "hoeven" or
+ @lemma = "hoeven" or
@lemma = "blijven" or
@lemma = "willen" or
@lemma = "zullen" or
@@ -59,6 +59,7 @@ Tarsp_vc_sibling = """parent::node[ node[@rel="vc"]]"""
Tarsp_predc_sibling = """parent::node[ node[@rel="predc"]]"""
Tarsp_obj1_sibling = """parent::node[ node[@rel="obj1"]]"""
Tarsp_ld_sibling = """parent::node[ node[@rel="ld"]]"""
+Tarsp_onlymodR_sibling = """(parent::node[node[@rel="mod" and %Rpronoun%] and not(node[@rel="predc"])])"""
Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and
((
@@ -66,7 +67,7 @@ Tarsp_HwwZ = """(@pt="ww" and @rel="hd" and @wvorm="pv" and
@lemma = "hebben"
) and
not(%Tarsp_vc_sibling%)) or
- (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling%)
+ (@lemma="zijn" and not(%Tarsp_vc_sibling%) and %Tarsp_ld_sibling% )
)
"""
@@ -78,7 +79,7 @@ Tarsp_Kop = """
((%Tarsp_predc_sibling% and not(%Tarsp_obj1_sibling%)) or
- (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%))
+ (@lemma="zijn" and not(%Tarsp_vc_sibling%) and not(%Tarsp_ld_sibling%) and not(%Tarsp_onlymodR_sibling%))
)
)
"""
@@ -128,6 +129,13 @@ pv = """(@pt="ww" and @wvorm="pv" )"""
bxnp1 = """(@cat="np" and count(node)=2 and node[@rel="hd" and @pt="ww"] and node[@rel="mod" and @pt])"""
bxnp2 = """(@cat="np" and count(node)=2 and node[@rel="hd"] and node[@rel="mod" and %singlewordbw%])"""
+Tarsp_Basic_VC = """((@rel="obj1" or @rel="pc" or @rel="predc" or @rel="ld" or @rel="obj2" or %Tarsp_finvc% or %Tarsp_vcvnw% or (@rel="svp" and @pt!="vz")) and not(%Tarsp_Basic_B%) )"""
+
+
+Tarsp_Basic_B = """(@rel="mod" or @rel="ld" or @rel="predm" or %Tarsp_B_predc%) """
+
+Tarsp_B_predc = """(@rel="predc" and (@pt="vz" or @pt="bw" or @cat="pp" or @cat="advp" or %Rpronoun%))"""
+
Tarsp_B = """(
((((@rel="mod" or @rel="ld" or @rel="predm") and
(not(@cat) or @cat!="conj") and
@@ -169,9 +177,13 @@ pobj1B = """(@rel="pc" and ../node[@rel="hd" and %locverb%])"""
singlewordbw = """ (@pt="bw" or %Rpronoun% or %adjadv%)
"""
+
+
corephrase = """(@cat="np" or @cat="pp" or @cat="advp" or @cat="ap")"""
-coreBX = """(node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!=niet]/@begin ]])"""
+coreBX = """((node[@cat="du" and node[%singlewordbw% and @lemma!="niet" ] and node[(%corephrase% or (@pt and not(%pv%))) and @begin!=../node[%singlewordbw% and @lemma!="niet"]/@begin ]]) )"""
+
+Tarsp_bnonfin = """((@cat="inf" or @cat="ppart") and @rel="vc" and parent::node[@cat="smain" and count(node)=1] and node[%Tarsp_B%] and node[@pt="ww" and @rel="hd"] and count(node[%realcomplormod%])=1 )"""
ASTA_pred = """(@rel="predc" or @rel="predm" or (@rel="hd" and parent::node[@rel="predc" or @rel="predm"]))"""
@@ -301,7 +313,12 @@ spec_noun = """ (@pt="spec" and (@pos="name" or starts-with(@frame,"proper_name"
"""
- asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or (@pt="ww" and @positie="nom") or (%monthname%) or @pos="name")
+ asta_numvrij = """(@pt="tw" and @positie="vrij" and @rel!="mwp" and @rel!="det" and @rel!="mod" )"""
+
+ asta_noun = """ ((@pt="n" and not(%ASTA_filled_pause%) and not(%ASTA_numeral%)) or
+ (@pt="ww" and @positie="nom") or
+ (%monthname%) or
+ @pos="name" )
"""
@@ -479,3 +496,11 @@ robustdelpv = """(not(@rel="dp" and @begin > ancestor::node[@cat="top"]/descenda
delpv = """(%coredelpv% and %robustdelpv%)"""
+Vobij = """(@pt="bw" and (contains(@frame,"er_adverb" ) or contains(@frame, "tmp_adverb") or @lemma="daarom") and
+@lemma!="er" and @lemma!="daar" and @lemma!="hier" and (starts-with(@lemma, 'er') or starts-with(@lemma, 'daar') or starts-with(@lemma, 'hier')))"""
+
+Tarsp_VzN = """(%vzn1xpath% or %vzn2xpath% ) """
+
+vzn1xpath = """(@cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"])))"""
+vzn2xpath = """(node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"])"""
+vzn3xpath = """(@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] )"""
diff --git a/metadata.py b/metadata.py
index 2760f14..0205aaa 100644
--- a/metadata.py
+++ b/metadata.py
@@ -17,7 +17,8 @@
class Meta:
def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], annotatedposlist=[],
- annotatedwordlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty,
+ annotatedwordlist=[], annotationcharlist=[], annotationcharposlist=[], annotatedcharlist=[],
+ annotatedcharposlist=[], atype='text', cat=None, subcat=None, source=None, penalty=defaultpenalty,
backplacement=defaultbackplacement):
self.atype = atype
self.name = name
@@ -25,6 +26,10 @@ def __init__(self, name, value, annotationwordlist=[], annotationposlist=[], ann
self.annotationposlist = annotationposlist
self.annotatedwordlist = annotatedwordlist
self.annotatedposlist = annotatedposlist
+ self.annotationcharlist = annotationcharlist
+ self.annotationcharposlist = annotationcharposlist
+ self.annotatedcharlist = annotatedcharlist
+ self.annotatedcharposlist = annotatedcharposlist
self.value = value
self.cat = cat
self.subcat = subcat
@@ -93,3 +98,7 @@ def mkSASTAMeta(token, nwt, name, value, cat, subcat=None, penalty=defaultpenalt
repetition = 'Repetition'
fstoken = 'Retraced token'
falsestart = 'Retracing with Correction'
+insertion = 'Insertion'
+smallclause = 'Small Clause Treatment'
+tokenmapping = 'Token Mapping'
+insertiontokenmapping = 'Insertion Token Mapping'
\ No newline at end of file
diff --git a/methods/ASTA Index Current.xlsx b/methods/ASTA Index Current.xlsx
index b575d44..ad598d8 100644
Binary files a/methods/ASTA Index Current.xlsx and b/methods/ASTA Index Current.xlsx differ
diff --git a/methods/TARSP Index 2022-01-07.xlsx b/methods/TARSP Index 2022-01-07.xlsx
new file mode 100644
index 0000000..1547465
Binary files /dev/null and b/methods/TARSP Index 2022-01-07.xlsx differ
diff --git a/methods/TARSP Index Current.xlsx b/methods/TARSP Index Current.xlsx
index 75e9075..6b00d71 100644
Binary files a/methods/TARSP Index Current.xlsx and b/methods/TARSP Index Current.xlsx differ
diff --git a/methods/~$TARSP Index Current.xlsx b/methods/~$TARSP Index Current.xlsx
new file mode 100644
index 0000000..8a7c89f
Binary files /dev/null and b/methods/~$TARSP Index Current.xlsx differ
diff --git a/mismatches.py b/mismatches.py
index 214b938..9686ae5 100644
--- a/mismatches.py
+++ b/mismatches.py
@@ -1,10 +1,11 @@
-
import os
from collections import Counter
from copy import copy
from lxml import etree
from config import SDLOGGER
from treebankfunctions import getyield, getmarkedyield, getattval
+from sastatoken import deflate
+
tab = '\t'
space = ' '
eps = ''
@@ -13,6 +14,7 @@
usercommentuntil = 3
usercommentdefaultvalue = eps
+
def getmarkedutt(m, syntree):
thewordlist = getyield(syntree)
thepositions = getwordpositions(m, syntree)
@@ -20,10 +22,12 @@ def getmarkedutt(m, syntree):
yieldstr = space.join(themarkedyield)
return yieldstr
+
def mark(str):
- result = '*'+ str + '*'
+ result = '*' + str + '*'
return result
+
def getwordpositionsold(matchtree, syntree):
positions1 = []
for node in matchtree.iter():
@@ -35,7 +39,7 @@ def getwordpositionsold(matchtree, syntree):
for node in syntree.iter():
if 'index' in node.attrib and ('pt' in node.attrib or 'cat' in node.attrib or 'pos' in node.attrib):
theindex = node.attrib['index']
- indexednodes[theindex]=node
+ indexednodes[theindex] = node
thequery2 = ".//node[@index and not(@pt) and not(@cat)]"
try:
@@ -49,8 +53,9 @@ def getwordpositionsold(matchtree, syntree):
result = [int(p) for p in positions]
return result
+
def getwordpositions(matchtree, syntree):
- #nothing special needs to be done for index nodes since they also have begin and end
+ # nothing special needs to be done for index nodes since they also have begin and end
positions = []
for node in matchtree.iter():
if 'end' in node.attrib:
@@ -58,6 +63,7 @@ def getwordpositions(matchtree, syntree):
result = [int(p) for p in positions]
return result
+
def getfirstwordposition(matchtree):
if 'begin' in matchtree.attrib:
positionstr = getattval(matchtree, 'begin')
@@ -67,7 +73,6 @@ def getfirstwordposition(matchtree):
return position
-
def getmarkedyield(wordlist, positions):
pos = 1
resultlist = []
@@ -102,8 +107,23 @@ def mismatches(queryid, queries, theresultsminusgold, goldminustheresults, allma
uttstr]
print(tab.join(platinumcheckrow2), file=platinumcheckfile)
-def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile, permsilverdatadict={}, annotationinput=False):
+def getmarkposition(position, nodeendmap, uttid):
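+    # map a node 'end' value back to the 1-based surface word position via
+    # nodeendmap; position 0 (no specific word marked) and unmappable positions
+    # fall back to 1, with an error logged for the latter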
+ if position == 0:
+ result = 1
+ elif uttid in nodeendmap:
+ if str(position) in nodeendmap[uttid]:
+ result = nodeendmap[uttid][str(position)]
+ else:
+ SDLOGGER.error('getmarkposition: No mapping found for position {} in utterance {}'.format(position, uttid))
+ result = 1
+ else:
+ SDLOGGER.error('getmarkposition: No mappings found for uttid {}'.format(uttid))
+ result = 1
+ return result
+
+def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches, allutts, platinumcheckfile,
+ permsilverdatadict={}, annotationinput=False):
theexactresults = exactresults[queryid] if queryid in exactresults else Counter()
theexactgoldscores = exactgoldscores[queryid] if queryid in exactgoldscores else Counter()
(theresultsminusgold, goldminustheresults, intersection) = exactcompare(theexactresults, theexactgoldscores)
@@ -117,13 +137,13 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
markedwordlist = getmarkedyield(allutts[uttid], [markposition])
uttstr = space.join(markedwordlist)
platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item,
- str(uttid), str(position), uttstr]
+ str(uttid), str(markposition), uttstr]
print(tab.join(platinumcheckrow1), file=platinumcheckfile)
key = (queryid, uttid, position)
usercomments = getusercomments(permsilverdatadict, key, report=True)
- xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1
+ xlplatinumcheckrow1 = usercomments + ['More examples'] + platinumcheckrow1
newrows.append(xlplatinumcheckrow1)
- #for (m, syntree) in allmatches[(queryid, uttid)]:
+ # for (m, syntree) in allmatches[(queryid, uttid)]:
# if getfirstwordposition(m) == position:
# markedutt = getmarkedutt(m, syntree)
# platinumcheckrow1 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item,
@@ -139,9 +159,11 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
markedwordlist = getmarkedyield(allutts[uttid], [markposition])
uttstr = space.join(markedwordlist)
else:
- SDLOGGER.warning('uttid {} not in alluts'.format(uttid))
+ SDLOGGER.warning('uttid {} not in allutts'.format(uttid))
uttstr = ""
- platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid), str(position),
+ markposition = 0
+ platinumcheckrow2 = [queryid, queries[queryid].cat, queries[queryid].subcat, queries[queryid].item, str(uttid),
+ str(markposition),
uttstr]
print(tab.join(platinumcheckrow2), file=platinumcheckfile)
key = (queryid, uttid, position)
@@ -150,6 +172,7 @@ def exactmismatches(queryid, queries, exactresults, exactgoldscores, allmatches,
newrows.append(xlplatinumcheckrow2)
return newrows
+
def compareunaligned(resultctr, goldctr):
'''
@@ -168,20 +191,21 @@ def compareunaligned(resultctr, goldctr):
takefromresultlist.append((utt1, pos1))
takefromgoldlist.append((utt1, 0))
newintersection.append((utt1, pos1))
- curgoldlist.remove((utt1,0))
+ curgoldlist.remove((utt1, 0))
elif pos1 == 0:
for (utt2, pos2) in curgoldlist:
if utt1 == utt2:
takefromresultlist.append((utt1, pos1))
takefromgoldlist.append((utt1, pos2))
newintersection.append((utt1, pos2))
- curgoldlist.remove((utt2,pos2))
+ curgoldlist.remove((utt2, pos2))
break
takefromresultctr = Counter(takefromresultlist)
takefromgoldctr = Counter(takefromgoldlist)
newintersectionctr = Counter(newintersection)
return (takefromresultctr, takefromgoldctr, newintersectionctr)
+
def exactcompare(exactresults, exactgoldscores):
'''
compares two lists of exact results, i.e. dlists of pairs (uttid, position)
@@ -227,18 +251,21 @@ def getusercomments(permsilverdict, key, report=False):
SDLOGGER.warning('No silver remark for key: {}'.format(key))
return result
+
def testcompare():
- testresults = [(1,2),(1,2), (1,2), (1,5), (1,6),(2,0), (2, 4)]
- goldresults = [(1,2), (2,4), (2,6), (1,0), (3,5)]
- reftestminusgold = [(1,2), (1,5), (1,6)]
- refgoldminustest = [(3,5)]
- refintersection = [(1,2), (1,2), (2,4), (2,6)]
+ testresults = [(1, 2), (1, 2), (1, 2), (1, 5), (1, 6), (2, 0), (2, 4)]
+ goldresults = [(1, 2), (2, 4), (2, 6), (1, 0), (3, 5)]
+ reftestminusgold = [(1, 2), (1, 5), (1, 6)]
+ refgoldminustest = [(3, 5)]
+ refintersection = [(1, 2), (1, 2), (2, 4), (2, 6)]
(testminusgold, goldminustest, intersection) = exactcompare(testresults, goldresults)
- for (l, r,g ) in zip(['R-G', 'G-R', 'R*G'],[testminusgold, goldminustest, intersection],[reftestminusgold, refgoldminustest, refintersection]):
+ for (l, r, g) in zip(['R-G', 'G-R', 'R*G'], [testminusgold, goldminustest, intersection],
+ [reftestminusgold, refgoldminustest, refintersection]):
if r == g:
- print('{}: OK {} == {}'.format(l, r,g))
+ print('{}: OK {} == {}'.format(l, r, g))
else:
- print('{}: NO: {} != {}'.format(l, r,g))
+ print('{}: NO: {} != {}'.format(l, r, g))
+
if __name__ == '__main__':
- testcompare()
\ No newline at end of file
+ testcompare()
diff --git a/queryfunctions.py b/queryfunctions.py
index 954a618..c0412e6 100644
--- a/queryfunctions.py
+++ b/queryfunctions.py
@@ -7,7 +7,7 @@
vzn1basexpath = './/node[ @cat="pp" and (node[@pt="vz"] and node[(@pt="n" or @pt="vnw") and not (%Rpronoun%) and @rel="obj1"] and not(node[@pt="vz" and @vztype="fin"]))]'
vzn1xpath = expandmacros(vzn1basexpath)
vzn2xpath = './/node[node[@lemma="in" and @rel="mwp"] and node[@lemma="deze" and @rel="mwp"]]'
-vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin=../node[@pt="vz"]/@end and count(node)<=3] ]'
+vzn3xpath = './/node[@pt="vz" and ../node[(@lemma="dit" or @lemma="dat") and @begin>=../node[@pt="vz"]/@end and count(node)<=3] ]'
#vzn4basexpath = './/node[node[@pt="vz" and @rel="hd" and ../node[%Rpronoun% and @rel="obj1" and @end <= ../node[@rel="hd"]/@begin]]]'
#vzn4xpath = expandmacros(vzn4basexpath)
diff --git a/readcsv.py b/readcsv.py
index 1cc5694..a6f9ecb 100644
--- a/readcsv.py
+++ b/readcsv.py
@@ -6,10 +6,10 @@
mysep = tab
-def readcsv(filename, sep=mysep, header=True, quotechar='"'):
+def readcsv(filename, sep=mysep, header=True, quotechar='"', encoding='utf8'):
result = []
try:
- infile = open(filename, 'r', encoding='utf8', newline='')
+ infile = open(filename, 'r', encoding=encoding, newline='')
except FileNotFoundError as e:
SDLOGGER.error(e)
return result
@@ -25,11 +25,11 @@ def readcsv(filename, sep=mysep, header=True, quotechar='"'):
return result
-def readheadedcsv(filename, sep=mysep, quotechar='"'):
+def readheadedcsv(filename, sep=mysep, quotechar='"', encoding='utf8'):
result = []
header = []
try:
- infile = open(filename, 'r', encoding='utf8', newline='')
+ infile = open(filename, 'r', encoding=encoding, newline='')
except FileNotFoundError as e:
SDLOGGER.error(e)
return header, result
diff --git a/sastadev.py b/sastadev.py
index cc2b549..653a943 100644
--- a/sastadev.py
+++ b/sastadev.py
@@ -43,7 +43,8 @@
from SAFreader import get_annotations, get_golddata, richscores2scores, exact2global, richexact2global
from SAFreader import all_levels
from external_functions import str2functionmap
-from treebankfunctions import getuttid, getyield, getmeta, getattval, getxmetatreepositions, getuttno, getuttidorno
+from treebankfunctions import getuttid, getyield, getmeta, getattval, getxmetatreepositions, getuttno, getuttidorno, \
+ showtree, getnodeendmap, getxselseuttid
from SRFreader import read_referencefile
from goldcountreader import get_goldcounts
from TARSPscreening import screening4stage
@@ -53,7 +54,7 @@
from query import pre_process, core_process, post_process, form_process, is_preorcore, query_inform, query_exists, \
is_pre, is_core
from macros import expandmacros
-from mismatches import mismatches, exactmismatches
+from mismatches import mismatches, exactmismatches, getmarkposition
from xlsx import mkworkbook
import xlsxwriter
from counterfunctions import counter2liststr
@@ -285,7 +286,7 @@ def isxpathquery(query):
def doqueries(syntree, queries, exactresults, allmatches, criterion):
uttid = getuttid(syntree)
- #uttid = getuttidorno(syntree)
+ # uttid = getuttidorno(syntree)
omittedwordpositions = getxmetatreepositions(syntree, 'Omitted Word', poslistname='annotatedposlist')
# print(uttid)
# core queries
@@ -313,6 +314,9 @@ def doqueries(syntree, queries, exactresults, allmatches, criterion):
exactresults[queryid] = []
# matchingids = [uttid for x in matches]
for m in matches:
+ # showtree(m)
+ if m is None:
+ showtree(syntree)
if (queryid, uttid) in allmatches:
allmatches[(queryid, uttid)].append((m, syntree))
else:
@@ -485,6 +489,18 @@ def exact2results(exactresults):
return results
+def adaptpositions(rawexactresults, nodeendmap):
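+    # rewrite the (uttid, position) pairs of the exact results so that node-based
+    # positions become surface word positions, using the per-utterance nodeendmap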
+ newexactresults = {}
+ for qid in rawexactresults:
+ newlist = []
+ for (uttid, position) in rawexactresults[qid]:
+ newposition = getmarkposition(position, nodeendmap, uttid)
+ newtuple = (uttid, newposition)
+ newlist.append(newtuple)
+ newexactresults[qid] = newlist
+ return newexactresults
+
+
def passfilter(rawexactresults, method):
'''
    lets only those through that satisfy the
@@ -669,6 +685,7 @@ def passfilter(rawexactresults, method):
platinumoutfilename, options.platinuminfilename, goldscores)
analysedtrees = []
+nodeendmap = {}
# @from now on we are dealing with a treebank, so add an if statement here - done
if annotationinput:
@@ -715,18 +732,31 @@ def passfilter(rawexactresults, method):
analysedtrees.append(syntree)
doprequeries(syntree, queries, rawexactresults, allmatches)
docorequeries(syntree, queries, rawexactresults, allmatches)
- uttid = getuttid(syntree)
- uttno = getuttno(syntree)
- allutts[uttno] = getyield(syntree)
- # allutts[uttid] = getyield(syntree)
+
+ # uttid = getuttid(syntree)
+ uttid = getxselseuttid(syntree)
+ # showtree(syntree)
+ if uttid in nodeendmap:
+ SDLOGGER.error('Duplicate uttid in sample: {}'.format(uttid))
+ nodeendmap[uttid] = getnodeendmap(syntree)
+
+ # uttno = getuttno(syntree)
+ # allutts[uttno] = getyield(syntree)
+ allutts[uttid] = getyield(syntree)
# determine exactresults and apply the filter to catch interdependencies between prequeries and corequeries
# rawexactresults = getexactresults(allmatches)
- exactresults = passfilter(rawexactresults, themethod)
+ rawexactresults2 = passfilter(rawexactresults, themethod)
+ exactresults = adaptpositions(rawexactresults2, nodeendmap)
+
+    # adapt the allutts and the rawexactresults2 here to undo expansions, based on the nodeendmap
+    # @@to be implemented@@ or perhaps already in the loop above?
# @ and from here on it can become common again; so an exactresults must also be produced for the annotation file
# @ the postfunctions for lemmas etc. may well have to be adapted
+# adapt the exactresults positions to the reference
+
coreresults = exact2results(exactresults)
@@ -959,7 +989,9 @@ def passfilter(rawexactresults, method):
logheader = ['datetime', 'treebank', 'scorenr', 'R', 'P', 'F1', 'P-R', 'P-P', 'P-F1', 'GP-R', 'GP-P', 'GP-F1', 'ref',
'method']
logname = 'sastalog.txt'
-biglogfile = open(logname, 'a', encoding='utf8')
+logpath = r'D:\jodijk\Dropbox\jodijk\myprograms\python\sastacode\sastadev'
+logfullname = os.path.join(logpath, logname)
+biglogfile = open(logfullname, 'a', encoding='utf8')
exactlynow = datetime.datetime.now()
now = exactlynow.replace(microsecond=0).isoformat()
diff --git a/sastatok.py b/sastatok.py
index adb6c04..6072d60 100644
--- a/sastatok.py
+++ b/sastatok.py
@@ -61,5 +61,5 @@ def sasta_tokenize(instring):
if instring is None:
return []
tokenstring = fullsastare.findall(instring)
- result = stringlist2tokenlist(tokenstring)
+ result = stringlist2tokenlist(tokenstring, start=10, inc=10)
return result
diff --git a/sastatoken.py b/sastatoken.py
index cdb987c..50004f6 100644
--- a/sastatoken.py
+++ b/sastatoken.py
@@ -15,12 +15,12 @@ def __repr__(self):
def __str__(self):
skipstr = ' (skip=True)' if self.skip else ''
- subposstr = '.{}' if self.subpos != 0 else ''
+ subposstr = '/{}'.format(self.subpos) if self.subpos != 0 else ''
result = '{}{}:{}{}'.format(self.pos, subposstr, self.word, skipstr)
return result
-def stringlist2tokenlist(list):
+def oldstringlist2tokenlist(list):
result = []
llist = len(list)
for el in range(llist):
@@ -29,6 +29,17 @@ def stringlist2tokenlist(list):
return result
+def stringlist2tokenlist(list, start=0, inc=1):
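+    # assign token positions start, start+inc, ...; with start=10, inc=10 (as used
+    # in sasta_tokenize) positions are inflated, leaving room for inserted tokens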
+ result = []
+ llist = len(list)
+ pos = start
+ for el in range(llist):
+ thetoken = Token(list[el], pos)
+ result.append(thetoken)
+ pos += inc
+ return result
+
+
def tokenlist2stringlist(tlist, skip=False):
if skip:
result = [t.word for t in tlist if not t.skip]
@@ -49,3 +60,24 @@ def show(tokenlist):
resultlist.append(str(token))
result = ', '.join(resultlist)
return result
+
+
+def tokeninflate(token):
+ result = inflate(token.pos) + token.subpos
+ return result
+
+
+def deflate(n: int):
+ result = (n // 10) - 1
+ return result
+
+
+def inflate(n: int):
+ result = (n + 1) * 10
+ return result
+
+
+def insertinflate(n: int):
+ dm = n % 10
+ result = ((n - dm) + 1) * 10 + dm
+ return result
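+
+
+# Illustrative sanity check of the inflation arithmetic (not part of the module):
+# inflate(0) == 10 and deflate(10) == 0, so deflate(inflate(n)) == n for any n;
+# insertinflate(12) == ((12 - 2) + 1) * 10 + 2 == 112, i.e. the subposition
+# digit 2 is preserved while the base position is inflated.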
diff --git a/smallclauses.py b/smallclauses.py
new file mode 100644
index 0000000..d44cbaa
--- /dev/null
+++ b/smallclauses.py
@@ -0,0 +1,292 @@
+from config import SDLOGGER
+from treebankfunctions import getstree, getnodeyield, getattval
+from dedup import filledpauseslexicon
+from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon
+from lexicon import known_word, tswnouns
+from namepartlexicon import namepart_isa_namepart
+from sastatoken import Token, show
+from tokenmd import TokenListMD
+from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\
+ insertiontokenmapping
+
+space = ' '
+biglocvzs = ['achter', 'beneden', 'binnen', 'boven', 'bovenop', 'buiten', 'dichtbij']
+#surenouns = ['mama', 'papa'] replaced by tswnouns from lexicon
+longvowels = ['a', 'é', 'i', 'o', 'u', 'y']
+vowels = ['a', 'e', 'i', 'o', 'u']
+
+uniquelynominativeperspros = ['ik', 'jij', 'hij', 'zij', 'wij', 'ikke', "'k", "k", "ie", "we"]
+
+
+def makegen(lemma):
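+    # heuristic Dutch genitive formation, e.g. Hans -> Hans', opa -> opa's,
+    # jongen -> jongens; returns None for missing or very short lemmas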
+ if lemma is None or len(lemma) < 2:
+ result = None
+ elif lemma[-1] in ['s', 'z', 'x']:
+ result = lemma + "'"
+    elif lemma[-2:] in ['ij']:
+ result = lemma + 's'
+ elif lemma[-2] in vowels and lemma[-1] in vowels:
+ result = lemma + 's'
+ elif lemma[-1] in longvowels:
+ result = lemma + "'s"
+ else:
+ result = lemma + 's'
+ return result
+
+def realword(node):
+ result = True
+ result = result and getattval(node, 'pt') not in ['tsw', 'let']
+ result = result and getattval(node, 'lemma') not in ['xx', 'xxx', 'yyy', 'www', 'hè']
+ result = result and getattval(node, 'lemma') not in filledpauseslexicon
+ result = result or lemma(node) in tswnouns
+ return result
+
+
+def hasgenitive(node):
+ lemma = getattval(node, 'lemma')
+ nodept = pt(node)
+ if nodept not in ['n', 'vnw']:
+ nodept = 'n'
+ result = (lemma, nodept) in genlexicon and 'yes' in genlexicon[(lemma, nodept)]
+ result = result or namepart_isa_namepart(lemma)
+ return result
+
+def aanwvnw(node):
+ result = getattval(node, 'pt') == 'vnw' and getattval(node, 'vwtype') == 'aanw' and not rpronoun(node)
+ return result
+
+
+def n(node):
+ result = getattval(node, 'pt') == 'n'
+ return result
+
+
+def getal(node):
+ result = getattval(node, 'getal')
+ return result
+
+def pt(node):
+ result = getattval(node, 'pt')
+ return result
+
+def bg(node):
+ result = int(getattval(node, 'begin'))
+ return result
+
+def tw(node):
+ result = getattval(node, 'pt') == 'tw'
+ return result
+
+def word(node):
+ result = getattval(node, 'word')
+ return result
+
+
+def adj(node):
+ result = getattval(node, 'pt') == 'adj'
+ return result
+
+def perspro(node):
+ pt = getattval(node, 'pt')
+ vwtype = getattval(node, 'vwtype')
+ result = pt == 'vnw' and vwtype == 'pers'
+ return result
+
+def nomperspro(node):
+ lemma = getattval(node, 'lemma')
+ result = perspro(node) and lemma in uniquelynominativeperspros
+ return result
+
+def inf(node):
+ result = getattval(node, 'pt') == 'ww' and getattval(node, 'wvorm') == 'inf'
+ return result
+
+
+def rpronoun(node):
+ result = getattval(node, 'pt') == 'vnw' and \
+ getattval(node, 'lemma') in ['er', 'hier', 'daar', 'ergens', 'overal', 'nergens', 'waar']
+ return result
+
+def bw(node):
+ result = getattval(node, 'pt') == 'bw'
+ return result
+
+def ww(node):
+ result = getattval(node, 'pt') == 'ww'
+ return result
+
+
+def lemma(node):
+ result = getattval(node, 'lemma')
+ return result
+
+def predadv(node):
+ result = locadv(node)
+ result = result or (bw(node) and lemma(node) in ['niet', 'mee', 'weg'])
+ return result
+
+def vz(node):
+ result = getattval(node, 'pt') == 'vz'
+ return result
+
+def locadv(node):
+ result = getattval(node, 'pt') in ['bw', 'vz']
+ frame = getattval(node, 'frame')
+ result = result and ('loc' in frame or 'er_adverb' in frame)
+ result = result or rpronoun(node)
+ return result
+
+def biglocvz(node):
+ result = getattval(node, 'lemma') in biglocvzs
+ return result
+
+def istswnoun(node):
+ result = getattval(node, 'lemma') in tswnouns
+ return result
+
+def getleavestr(leaves):
+ leaveseq = ['{}:{}:{}:{}'.format(getattval(leave, 'end'), getattval(leave, 'word'), getattval(leave, 'lemma'),
+ getattval(leave, 'pt')) for leave
+ in leaves]
+ leavestr = space.join(leaveseq)
+ return leavestr
+
+def knownnoun(node):
+ word = getattval(node, 'word')
+ lemma = getattval(node, 'lemma')
+ postag = pt(node)
+ result = postag == 'n' and (known_word(word) or known_word(lemma))
+ result = result or lemma in tswnouns
+ return result
+
+def nominal(node):
+ result = pt(node) == 'n' or aanwvnw(node)
+ return result
+
+def mktoken(node, map):
+ nodebegin = bg(node)
+ nodeword = word(node)
+ if nodebegin in map:
+ nodepos = map[nodebegin]
+ else:
+ SDLOGGER.error('missing begin in map {}'.format(nodebegin))
+ nodepos = int(nodebegin)
+ result = Token(nodeword, nodepos)
+ return result
+
+
+def mktokenlist(tokens, fpos, inserttokens):
+ resultlist = [token for token in tokens if token.pos <= fpos] + \
+ inserttokens + \
+ [token for token in tokens if token.pos > fpos]
+ return resultlist
+
+
+def oldmktokenlist(leaves, themap, fpos, inserttokens):
+ resultlist = [mktoken(lv, themap) for lv in leaves if bg(lv) <= fpos] + \
+ inserttokens + \
+ [mktoken(lv, themap) for lv in leaves if bg(lv) > fpos]
+ return resultlist
+
+
+def mkinsertmeta(inserttokens, resultlist):
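+    # create one Insertion meta per inserted token (to be deleted at backplacement)
+    # plus a single token-mapping meta that records, for each token in resultlist,
+    # its original position (None for the inserted tokens themselves)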
+ insertposs = [token.pos + token.subpos for token in inserttokens]
+ insertwordlist = [token.word for token in inserttokens]
+ tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist]
+ metadata1 = [Meta(insertion, [insertword], annotatedposlist=[insertpos],
+ annotatedwordlist=[], annotationposlist=[insertpos],
+ annotationwordlist=[insertword], cat=smallclause, source=SASTA, penalty=defaultpenalty,
+ backplacement=bpl_delete) for insertword, insertpos in zip(insertwordlist, insertposs)]
+ meta2 = Meta(insertiontokenmapping, tokenmappinglist, cat=tokenmapping, source=SASTA, penalty=0,
+ backplacement=bpl_none)
+ metadata = metadata1 + [meta2]
+ return metadata
+
+
+def smallclauses(tokensmd, tree):
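+    # for utterances of two or three real words, hypothesise the material that a
+    # small clause leaves out (e.g. 'is' between a noun and an adjective, or
+    # 'ik wil' before a bare infinitive) and return the extended token list
+    # together with insertion metadata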
+ resultlist = []
+ leaves = getnodeyield(tree)
+ reducedleaves = [leave for leave in leaves if realword(leave)]
+ if not(len(reducedleaves) > 1 and len(reducedleaves) <= 3):
+ return resultlist
+ tokens = tokensmd.tokens
+ treewords = [word(tokennode) for tokennode in leaves]
+ tokenwords = [token.word for token in tokens if not token.skip]
+ if treewords != tokenwords:
+ SDLOGGER.error('Token mismatch: {} v. {}'.format(treewords, tokenwords))
+ return []
+ themap = {bg(tokennode): token.pos for (tokennode, token) in zip(leaves, tokens)}
+ metadata = tokensmd.metadata
+
+ if len(reducedleaves) <= 3:
+ first = leaves[0]
+ second = leaves[1]
+ if len(reducedleaves) == 3:
+            third = leaves[2]
+
+ if len(reducedleaves) == 2:
+        if (aanwvnw(first) or knownnoun(first) or perspro(first)) and (predadv(second) or vz(second) or bw(second)):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('moet' if getal(first) != 'mv' else 'moeten', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ #elif (aanwvnw(second) or knownnoun(second) or perspro(second) or tw(second)) and predadv(first):
+ elif nomperspro(second) and predadv(first):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('moet' if getal(second) != 'mv' else 'moeten', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(first) or knownnoun(first)) and adj(second):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(second) or knownnoun(second) or tw(second)) and biglocvz(first):
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+            resultlist = mktokenlist(tokens, fpos, inserttokens)
+            metadata += mkinsertmeta(inserttokens, resultlist)
+ elif knownnoun(first) and knownnoun(second) and not(lemma(first) == lemma(second)):
+ if hasgenitive(first):
+ genform = makegen(lemma(first))
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('[: ' + genform + ']', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ else:
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('is' if getal(first) != 'mv' else 'zijn', fpos, subpos=5)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif (aanwvnw(first) or knownnoun(first) or istswnoun(first)) and inf(second):
+ if intransitive(second):
+ firstsubject = True
+ elif transitive(second) and ishuman(first):
+ firstsubject = True
+ elif pseudotr(second) and (ishuman(first) or isanimate(first)):
+ firstsubject = True
+ else:
+ firstsubject = False
+ if firstsubject:
+ fpos = int(getattval(first, 'begin'))
+ inserttokens = [Token('wil' if getal(first) != 'mv' else 'willen', fpos, subpos=5)]
+ else:
+ fpos = -1
+ inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ elif not nominal(first) and not ww(first) and inf(second):
+ fpos = -1
+ inserttokens = [Token('ik', fpos, subpos=5), Token('wil', fpos, subpos=8)]
+ resultlist = mktokenlist(tokens, fpos, inserttokens)
+ metadata += mkinsertmeta(inserttokens, resultlist)
+ if resultlist == []:
+ result = []
+ else:
+ result = [TokenListMD(resultlist, metadata)]
+ return result
+
diff --git a/sva.py b/sva.py
index 7b5defd..0b2f8e3 100644
--- a/sva.py
+++ b/sva.py
@@ -7,7 +7,7 @@
from tokenmd import TokenListMD
from treebankfunctions import (copymodifynode, find1, getattval, getdetof,
getheadof, getlemma, indextransform, inverted,
- lbrother, nominal, rbrother, simpleshow)
+ lbrother, nominal, rbrother, simpleshow, showtree)
debug = False
@@ -356,12 +356,11 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata):
pvbegin = getattval(thepv, 'begin')
inversion = inverted(snode, thepv)
reducedtokens = [t for t in tokens if not t.skip]
- tokenposmap = {i: reducedtokens[i].pos for i in range(len(reducedtokens))}
newpv = getpvform(snode, thepv, inversion)
if newpv is None:
results = []
else:
- newpos = tokenposmap[int(pvbegin)]
+ newpos = int(pvbegin)
newtoken = Token(newpv, newpos)
for token in tokens:
if token.pos != newpos:
@@ -378,6 +377,9 @@ def getsvacorrectedutt(snode, thepv, tokens, metadata):
def getsvacorrections(tokensmd, rawtree, uttid):
+ debug = False
+ if debug:
+ showtree(rawtree, text='rawtree')
if rawtree is None:
return []
else:
@@ -540,7 +542,7 @@ def phicompatible(snode, vnode):
elif '2i' in vnodepersons:
subjbegin = getattval(subjnode, 'begin')
vnodeend = getattval(vnode, 'end')
- result = subjperson == '2' and '2i' in vnodepersons and subjbegin == vnodeend and \
+ result = subjperson == '2' and '2i' in vnodepersons and subjbegin >= vnodeend and \
subjnodelemma in ['jij', 'je']
elif 'u' in vnodepersons:
subjnodelemma = getattval(subjnode, 'lemma')
diff --git a/test_smallclauses.py b/test_smallclauses.py
new file mode 100644
index 0000000..c16ea95
--- /dev/null
+++ b/test_smallclauses.py
@@ -0,0 +1,50 @@
+from config import SDLOGGER
+from treebankfunctions import getstree, getnodeyield, getattval
+from dedup import filledpauseslexicon
+from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate, genlexicon
+from lexicon import known_word
+from namepartlexicon import namepart_isa_namepart
+from sastatoken import Token, show
+from tokenmd import TokenListMD
+from metadata import Meta, bpl_delete, defaultpenalty, insertion, smallclause, SASTA, bpl_none, tokenmapping,\
+ insertiontokenmapping
+from smallclauses import smallclauses, word, getleavestr, bg
+
+
+testbank = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\TARSP\smallclausetest.xml"
+schlichtingtreebank = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\schlichtingtreebank\TREEBANK_ID.xml'
+mieke06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE06_ID.xml"
+mieke08 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE08_ID.xml"
+aurisraw = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AURIS_ELISKA_ORIGINAL_ID.xml"
+tarsp02 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_02.xml"
+tarsp06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_06.xml"
+#schlichtingall = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\treebank_schlichting_all_examples\TREEBANK_SCHLICHTING_CHAT_ID.xml"
+
+
+def main():
+ smalltest = True
+ if smalltest:
+ fullnames = [testbank]
+ else:
+ fullnames = [ schlichtingtreebank, mieke06, mieke08, aurisraw, tarsp02, tarsp06]
+ for infullname in fullnames:
+ print(infullname)
+ fulltreebank = getstree(infullname)
+ if fulltreebank is not None:
+ treebank = fulltreebank.getroot()
+ for tree in treebank:
+ leaves = getnodeyield(tree)
+ tokens = [Token(word(leave), bg(leave)) for leave in leaves]
+ tokensmd = TokenListMD(tokens, [])
+ resultlist = smallclauses(tokensmd, tree)
+ if resultlist != []:
+ print('input: ', getleavestr(leaves) )
+ print('output: ', show(resultlist[0].tokens))
+ print('result: ', resultlist[0].metadata)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/top3000.py b/top3000.py
new file mode 100644
index 0000000..7bf181d
--- /dev/null
+++ b/top3000.py
@@ -0,0 +1,66 @@
+from xlsx import getxlsxdata
+from treebankfunctions import getattval
+from namepartlexicon import namepart_isa_namepart
+from config import SD_DIR
+import os
+
+def ishuman(node):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+ vwtype = getattval(node, 'vwtype')
+    result = (lemma, pt) in semlexicon and 'human' in semlexicon[(lemma, pt)]
+ result = result or vwtype == 'pers'
+ result = result or namepart_isa_namepart(lemma)
+ return result
+
+def isanimate(node):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+    result = (lemma, pt) in semlexicon and 'animate' in semlexicon[(lemma, pt)]
+ return result
+
+
+def transitivity(node, tr):
+ lemma = getattval(node, 'lemma')
+ pt = getattval(node, 'pt')
+    result = (lemma, pt) in trlexicon and tr in trlexicon[(lemma, pt)]
+ return result
+
+def transitive(node):
+ return transitivity(node, 'tr')
+
+def pseudotr(node):
+ return transitivity(node, 'tr/intr')
+
+
+def intransitive(node):
+ return transitivity(node, 'intr')
+
+semicolon = ';'
+
+filename = os.path.join(SD_DIR, r'top3000\Woordenlijsten Current.xlsx')
+
+
+lexiconheader, lexicondata = getxlsxdata(filename)
+
+semlexicon = {}
+trlexicon = {}
+genlexicon = {}
+
+for row in lexicondata:
+ lemma = row[1].strip()
+ pt = row[5]
+ rawsems = row[6].split(semicolon)
+ sems = [el.strip() for el in rawsems]
+ semlexicon[(lemma, pt)] = sems
+
+ rawtrs = row[8].split(semicolon)
+ trs = [el.strip() for el in rawtrs]
+ trlexicon[(lemma, pt)] = trs
+
+ rawgens = row[9].split(semicolon)
+ gens = [el.strip() for el in rawgens]
+ genlexicon[(lemma, pt)] = gens
+
+#next statement for debugging purposes
+junk = 0
\ No newline at end of file
diff --git a/top3000/Woordenlijsten Current.xlsx b/top3000/Woordenlijsten Current.xlsx
new file mode 100644
index 0000000..7b7ac56
Binary files /dev/null and b/top3000/Woordenlijsten Current.xlsx differ
diff --git a/treebankfunctions.py b/treebankfunctions.py
index ca5e4b1..c0b120b 100644
--- a/treebankfunctions.py
+++ b/treebankfunctions.py
@@ -12,6 +12,7 @@
from stringfunctions import allconsonants
# from lexicon import informlexiconpos, isa_namepart_uc, informlexicon, isa_namepart
import lexicon as lex
+from config import PARSE_FUNC
class Metadata:
@@ -184,6 +185,23 @@ def ismainclausenode(node):
return result
+def getnodeendmap(stree):
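+    # map each leaf's 'end' value (a string) to its 1-based position in the yield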
+ leaves = getnodeyield(stree)
+ result = {getattval(leave, 'end'): i + 1 for i, leave in enumerate(leaves)}
+ return result
+
+
+def getxselseuttid(syntree):
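+    # prefer the xsid metadata, then uttid, then the sentence id, else '0'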
+ result = getmeta(syntree, 'xsid')
+ if result is None:
+ result = getmeta(syntree, 'uttid')
+ if result is None:
+ result = getsentid(syntree)
+ if result is None:
+ result = '0'
+ return result
+
+
def getuttid(syntree):
result = getmeta(syntree, 'uttid')
if result is None:
@@ -199,6 +217,7 @@ def getuttno(syntree):
result = '0'
return result
+
def getuttidorno(syntree):
result = getmeta(syntree, 'xsid')
if result is None:
@@ -441,8 +460,9 @@ def inverted(thesubj, thepv):
subjbegin = getattval(thesubj, 'begin')
subjlemma = getattval(thesubj, 'lemma')
pvend = getattval(thepv, 'end')
+    # maybe define an immediately-follows relation for inflated trees
inversion = '2' == subjperson[0] and tense == 'tgw' and subjnumber in ['ev', 'getal'] and \
- pvend == subjbegin and subjlemma in ['jij', 'je'] # getal added for je
+ pvend <= subjbegin and subjlemma in ['jij', 'je'] # getal added for je
return inversion
@@ -1131,11 +1151,11 @@ def test():
def getsentid(stree):
sentidlist = stree.xpath(sentidxpath)
if sentidlist == []:
- SDLOGGER.error('Missing uttid')
- uttid = 'None'
+ SDLOGGER.error('Missing sentid')
+ result = 'None'
else:
- uttid = str(sentidlist[0])
- return uttid
+ result = str(sentidlist[0])
+ return result
def testindextransform():
@@ -1381,6 +1401,15 @@ def deletewordnode(tree, begin):
return newtree
+def showtree(tree, text=None):
+ if text is not None:
+ print(text)
+ if tree is not None:
+ etree.dump(tree, pretty_print=True)
+ else:
+ print('None')
+
+
def deletechildlessparent(thenode):
if list(thenode) == []:
theparent = thenode.getparent()
@@ -1388,8 +1417,12 @@ def deletechildlessparent(thenode):
deletechildlessparent(theparent)
-def deletewordnodes(tree, begins):
+def olddeletewordnodes(tree, begins):
+ # print('tree:')
+ # etree.dump(tree, pretty_print=True)
newtree = deepcopy(tree)
+ # print('newtree:')
+ # etree.dump(newtree, pretty_print=True)
if newtree is None:
return newtree
else:
@@ -1403,9 +1436,14 @@ def deletewordnodes(tree, begins):
theparent.remove(thenode)
# if the parent has no sons left, it should be deleted as well
deletechildlessparent(theparent)
+ children = [n for n in theparent]
+ (minbegin, maxend) = getbeginend(children)
+ theparent.attrib['begin'] = minbegin
+ theparent.attrib['end'] = maxend
+
#
# renumber begins and ends ;
- normalisebeginend(newtree)
+    # normalisebeginend(newtree)  # temporarily switched off
# adapt the cleantokenisation
# done outside this function
@@ -1415,6 +1453,184 @@ def deletewordnodes(tree, begins):
return newtree
+def childless(node):
+ children = [ch for ch in node]
+ result = children == []
+ return result
+
+def deletewordnodes(tree, begins):
+ newtree = deepcopy(tree)
+ newtree = deletewordnodes2(newtree, begins)
+ newtree = adaptsentence(newtree)
+ return newtree
+
+def deletewordnodes2(tree, begins):
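+    # recursively remove leaf nodes whose begin occurs in begins, prune category
+    # nodes that have become childless, and recompute begin/end from the children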
+ if tree is None:
+ return tree
+ for child in tree:
+ if child.tag == 'node':
+ newchild = deletewordnodes2(child, begins)
+ else:
+ newchild = child
+ for child in tree:
+ if child.tag == 'node':
+ childbegin = getattval(child, 'begin')
+ childbeginint = int(childbegin)
+ if childbeginint in begins and childless(child):
+ tree.remove(child)
+ if 'cat' in child.attrib and childless(child): # if its children have been deleted earlier
+ tree.remove(child)
+ # tree begin en end bijwerken
+    if tree.tag == 'node':
+ newchildren = [n for n in tree]
+ if newchildren != []:
+ (minbegin, maxend) = getbeginend(newchildren)
+ tree.attrib['begin'] = minbegin
+ tree.attrib['end'] = maxend
+ return tree
+
+
+def olddeletewordnodes2(tree, begins):
+ if tree is None:
+ return tree
+ else:
+ for child in tree:
+ newchild = deletewordnodes2(child, begins)
+ if tree.tag == 'node':
+ nodebegin = getattval(tree, 'begin')
+ children = [child for child in tree]
+ if int(nodebegin) in begins: # only words and indexnodes can be deleted
+ theparent = tree.getparent()
+ if theparent is not None:
+ if children == []:
+ theparent.remove(tree)
+ # if the parent has no sons left, it should be deleted as well
+ deletechildlessparent(theparent)
+ if theparent.tag == 'node':
+ newchildren = [n for n in theparent]
+ (minbegin, maxend) = getbeginend(newchildren)
+ theparent.attrib['begin'] = minbegin
+ theparent.attrib['end'] = maxend
+ return tree
+
+
+def treeinflate(stree, start=10, inc=10):
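+    # in-place: a word at position n gets begin (n+1)*10 and end (n+1)*10 + 1;
+    # phrasal nodes take their begin/end from their children. The gaps between
+    # the inflated positions leave room for tokens inserted by corrections.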
+ # fatstree = deepcopy(stree)
+ if stree is None:
+ pass
+ else:
+ for child in stree:
+ treeinflate(child, start, inc)
+ children = [ch for ch in stree]
+ if stree.tag == 'node':
+ ib = int(getattval(stree, 'begin'))
+ ie = int(getattval(stree, 'end'))
+ newib = (ib + 1) * 10
+ stree.attrib['begin'] = str(newib)
+ if iswordnode(stree):
+ stree.attrib['end'] = str(newib + 1)
+ elif 'cat' in stree.attrib:
+ (b, e) = getbeginend(children)
+ stree.attrib['begin'] = b
+ stree.attrib['end'] = e
+ else:
+ stree.attrib['begin'] = str((ib + 1) * 10)
+ stree.attrib['end'] = str((ie * 10) + 1)
+
+
+def isidentitymap(dct):
+ result = all([key == value for key, value in dct.items()])
+ return result
+
+
+def updatetokenpos(stree, tokenposdict):
+ if stree is None:
+ return stree
+ if isidentitymap(tokenposdict):
+ return stree
+ resulttree = deepcopy(stree)
+ resulttree = updatetokenpos2(resulttree, tokenposdict)
+ finaltree = updateindexnodes(resulttree)
+
+ return finaltree
+
+def updatetokenpos2(node, tokenposdict):
+ if node is None:
+ return node
+ for child in node:
+ newchild = updatetokenpos2(child, tokenposdict)
+ if node.tag == 'node':
+ if ('pt' in node.attrib or 'pos' in node.attrib) and \
+ 'end' in node.attrib and 'begin' in node.attrib:
+ intend = int(node.attrib['end'])
+ if intend in tokenposdict:
+ newendint = tokenposdict[intend]
+ node.attrib['end'] = str(newendint)
+ node.attrib['begin'] = str(newendint - 1)
+ else:
+ SDLOGGER.error('Correcttreebank:updatetokenpos: Missing key in tokenposdict: key={key}'.format(key=intend))
+ fulltrees = node.xpath('ancestor::node[@cat="top"]')
+ if fulltrees != []:
+ fulltree = fulltrees[0]
+ else:
+ fulltree = node
+ sent = getyield(fulltree)
+ SDLOGGER.error('utterance={}'.format(sent))
+ # etree.dump(resulttree)
+ SDLOGGER.error('tokenposdict={}'.format(tokenposdict))
+ elif 'cat' in node.attrib:
+ children = [ch for ch in node]
+ (b, e) = getbeginend(children)
+ node.attrib['begin'] = b
+ node.attrib['end'] = e
+ return node
+
+
+
+def updateindexnodes(stree):
+ #presupposes that the non bareindex nodes have been adapted already
+ indexednodesmap = getindexednodesmap(stree)
+ newstree = deepcopy(stree)
+ for node in newstree.iter():
+ if node.tag == 'node':
+ if bareindexnode(node):
+ idx = getattval(node, 'index')
+ newbegin = getattval(indexednodesmap[idx], 'begin')
+ newend = getattval(indexednodesmap[idx], 'end')
+ node.attrib['begin'] = newbegin
+ node.attrib['end'] = newend
+ return newstree
+
+def treewithtokenpos(thetree, tokenlist):
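+    # re-align the tree's leaf begin/end values with the (inflated) positions of
+    # tokenlist: leaf i is mapped to token i; a length mismatch is only logged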
+ resulttree = deepcopy(thetree)
+ thetreeleaves = getnodeyield(thetree)
+ intbegins = [int(getattval(n, 'begin')) for n in thetreeleaves]
+ tokenlistbegins = [t.pos + t.subpos for t in tokenlist]
+ if len(intbegins) != len(tokenlistbegins):
+ SDLOGGER.error('token mismatch')
+ SDLOGGER.error('tree yield={}'.format(getyield(thetree)))
+ SDLOGGER.error('tokenlist={}'.format(tokenlist))
+ SDLOGGER.error('intbegins={}'.format(intbegins))
+ SDLOGGER.error('tokenlistbegins ={}'.format(tokenlistbegins))
+ pospairs = zip(intbegins, tokenlistbegins)
+ thetreetokenposdict = {treepos + 1: tokenpos + 1 for treepos, tokenpos in pospairs}
+ resulttree = updatetokenpos(resulttree, thetreetokenposdict)
+ return resulttree
+
+
+def fatparse(utterance, tokenlist):
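+    # parse the utterance with PARSE_FUNC, inflate the positions of the resulting
+    # tree, and align its leaves with the non-skipped tokens of tokenlist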
+ stree = PARSE_FUNC(utterance)
+ fatstree = deepcopy(stree)
+ treeinflate(fatstree, start=10, inc=10)
+ debug = False
+ if debug:
+ showtree(fatstree, text='fatparse: fatstree')
+ reducedtokenlist = [token for token in tokenlist if not token.skip]
+ fatstree = treewithtokenpos(fatstree, reducedtokenlist)
+ if debug:
+ showtree(fatstree, text='fatparse: fatstree')
+ return fatstree
+
def update_cleantokenisation(stree, begin):
'''
@@ -1473,8 +1689,10 @@ def normalisebeginend(stree):
:param stree: syntactic structure
:return: stree with the values of begin and end attributes normalised
'''
- begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')]
- sortedbegins = sorted(begins, key=lambda x: int(x))
+ # etree.dump(stree, pretty_print=True)
+ # begins = [getattval(node, 'begin') for node in stree.xpath('.//node[@pt or @pos]')] # we must include indexed nodes but not have duplicates
+ begins = {getattval(node, 'begin') for node in stree.xpath('.//node[count(node)=0]')}
+ sortedbegins = sorted(list(begins), key=lambda x: int(x))
normalisebeginend2(stree, sortedbegins)