Skip to content

Commit

Permalink
Postnominal modifiers update
Browse files Browse the repository at this point in the history
  • Loading branch information
JanOdijk committed Dec 6, 2024
1 parent 8b954c7 commit bd491dd
Show file tree
Hide file tree
Showing 14 changed files with 477 additions and 120 deletions.
6 changes: 3 additions & 3 deletions src/sastadev/Sziplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ def isvcinforppart(node: SynTree) -> bool:

def isrealnode(node: SynTree) -> bool:
'''
The function *isrealnode* determines whether a nide is a real node, which it is if:
The function *isrealnode* determines whether a node is a real node, which it is if:
* it is not a node for an interpunction sign
* it is not a nonfinite complement
* it is not a separable particle word of a verb
* if it is not an index node 9as detemined by the function *isindexnode*)
* it is not an index node (as determined by the function *isindexnode*)
The function *isindexnode* is defined as follows:
Expand All @@ -102,7 +102,7 @@ def isrealnode(node: SynTree) -> bool:
result = False
elif isvcinforppart(node):
result = False
elif rel == 'svp' and pt in node.attrib:
elif rel == 'svp' and 'word' in node.attrib:
result = False
elif isindexnode(node):
result = False
Expand Down
2 changes: 2 additions & 0 deletions src/sastadev/basicreplacements.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@ def combine(strlist: List[str]) -> str:
('blon', 'ballon', pron, infpron, voweldel, dp),
('ooien', 'gooien', pron, wrongpron, onsetred, dp),
('poppe', 'pop', pron, wrongpron, emphasis, dp),
('lus', 'lust', pron, infpron, codared, dp),
('jou', 'jouw', pron, infpron, codared, -dp) # Td 22, 30: "ik wil ook keer naar jou huis"; TODO: find criterion
# ('leggen', 'liggen', lexical, dial, '', dp), # moved to corrector : only if parse is illformed
# ('legt', 'ligt', lexical, dial, '', dp), # moved to corrector : only if parse is illformed
# ('leg', 'lig', lexical, dial, '', dp) # moved to corrector : only if parse is illformed
Expand Down
8 changes: 5 additions & 3 deletions src/sastadev/cleanCHILDEStokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,13 +230,15 @@ def removesuspecttokens(tokens: List[Token]) -> List[Token]:



RobustnessTuple = Tuple[Pattern, str, str, str]
RobustnessTuple = Tuple[Pattern, str, str, str] # regex, instring, outstring, message

robustnessrules: List[RobustnessTuple] = [(re.compile(r'\u2026'), '\u2026', '...', 'Horizontal Ellipsis (\u2026, Unicode U+2026) replaced by a sequence of three Full Stops (..., Unicode U+002E) '),
(re.compile('#'), '#', '', 'Number Sign (#, Unicode U+0023) removed'),
#(re.compile('#'), '#', '(.)', 'Number Sign (#, Unicode U+0023) replaced by CHAT (short) pause code: (.)'),
(re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'),
(re.compile(r'\[\+trn\]'), '[+trn]', '[+ trn]', 'Missing space'),
(re.compile(r'\[\+bch\]', re.I), '[+bch]', '[+ bch]', 'Missing space'),
(re.compile(r'\[\+trn\]', re.I), '[+trn]', '[+ trn]', 'Missing space'),
(re.compile(r'\[\+ea\]', re.I), '[+ea]', '[+ ea]', 'Missing space'),
(re.compile(r'\[%(?![\s])'), '[%', '[% ', 'Missing space'),
(re.compile(r'\[:(?![:\s])'), '[:', '[: ', 'Missing space'),
(re.compile(r'(?<=\w)\+\.\.\.'), '+...', ' +...', 'Missing space'),
(re.compile(r'\u2018'), '\u2018', "'", "Left Single Quotation Mark (\u2018. Unicode U+2018) replaced by Apostrophe ' (Unicode U+0027)"),
Expand Down
9 changes: 8 additions & 1 deletion src/sastadev/correcttreebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SASTA, ADULTSPELLINGCORRECTION, ALLSAMPLECORRECTIONS, BASICREPLACEMENTS, CONTEXT,
HISTORY, CHILDRENSPELLINGCORRECTION, THISSAMPLECORRECTIONS, replacementsubsources
)
from sastadev.postnominalmodifiers import transformbwinnp, transformppinnp
from sastadev.sastatok import sasta_tokenize
from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string
from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict, MetaElement,
Expand All @@ -41,7 +42,7 @@
showtree, simpleshow, subclasscompatible, transplant_node,
treeinflate, treewithtokenpos,
updatetokenpos)
from sastadev.treetransform import transformtreeld, transformtreenogeen, transformtreenogde
from sastadev.treetransform import transformtagcomma, transformtreeld, transformtreenogeen, transformtreenogde

ampersand = '&'

Expand Down Expand Up @@ -564,7 +565,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C

# tree transformations
if correctionparameters.method in ['tarsp', ' stap']:
stree = transformtagcomma(stree)
stree = transformtreeld(stree)
stree = transformppinnp(stree)
stree = transformbwinnp(stree)
stree = transformtreenogeen(stree)
stree = transformtreenogde(stree)

Expand Down Expand Up @@ -908,7 +912,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C

# tree transformations
if correctionparameters.method in ['tarsp', ' stap']:
fulltree = transformtagcomma(fulltree)
fulltree = transformtreeld(fulltree)
fulltree = transformppinnp(fulltree)
fulltree = transformbwinnp(fulltree)
fulltree = transformtreenogeen(fulltree)
fulltree = transformtreenogde(fulltree)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ siko cirkel replacement 1
bjokje blokje replacement 1
kikke kikker noncompletion 1
springe springen noncompletion 1
in met replacement 1
in erin noncompletion 3
in deze replacement 2
in op replacement 2
Expand Down Expand Up @@ -211,7 +212,8 @@ ke die replacement 1
dese deze replacement 4
hijs hij is replacement 1
eigk eigenlijk replacement 1
de het replacement 21
de het replacement 23
de zitten replacement 1
de der noncompletion 11
de dan replacement 2
de deze replacement 2
Expand Down Expand Up @@ -249,7 +251,7 @@ worst borst replacement 1
ennu en replacement 1
koeie koeien noncompletion 1
o op noncompletion 3
mij mijn noncompletion 9
mij mijn noncompletion 10
bet bent noncompletion 1
bent ben replacement 1
tinnen tillen replacement 1
Expand Down Expand Up @@ -378,6 +380,8 @@ deze dit replacement 4
deze waar hoort deze explanation 1
evallen gevallen noncompletion 1
nog ook explanation 1
voor bij replacement 1
voor om replacement 2
voor van explanation 1
wat waar replacement 2
wordt hoort replacement 2
Expand Down Expand Up @@ -443,6 +447,7 @@ pantoet pannekoeken replacement 2
eet eten replacement 2
bejo hallo explanation 1
teje gaan replacement 2
maar normaal replacement 1
maar en replacement 7
maar want replacement 1
maar ga daar maar explanation 1
Expand Down Expand Up @@ -472,7 +477,7 @@ bochje bochtje explanation 1
anders andere explanation 1
saar daar explanation 1
saal haal explanation 1
saan gaan explanation 1
saan gaan replacement 2
bruiken gebruiken explanation 1
keerd verkeerd explanation 1
vrastauto vrachtauto replacement 1
Expand Down Expand Up @@ -520,6 +525,7 @@ gun ging replacement 2
gin ging noncompletion 4
som soms noncompletion 6
witte wit replacement 2
dee deed noncompletion 1
dee was replacement 2
dan toen replacement 6
ging gingen replacement 2
Expand Down Expand Up @@ -549,12 +555,12 @@ lew wel replacement 2
slin ging replacement 2
stond stonden replacement 4
stond was replacement 2
van bij replacement 2
van om replacement 2
van door replacement 1
van met replacement 1
van uit replacement 1
van veel replacement 1
van bij replacement 1
arreen alleen replacement 2
het er replacement 2
het hij replacement 2
Expand Down Expand Up @@ -614,6 +620,7 @@ zaten hadden replacement 1
zo zo'n replacement 1
ook blauwe ramen zitten er ook explanation 1
ginnen gingen replacement 2
dat daar replacement 1
dat het replacement 1
allegrooste allegrootste noncompletion 2
en maar replacement 3
Expand Down Expand Up @@ -641,9 +648,11 @@ ta tat noncompletion 1
peltje pijltje replacement 1
pijwtje pijltje replacement 1
sef zeg replacement 1
doe toen replacement 1
doe ga replacement 1
zwaarden vechten replacement 1
lekkes lekkers noncompletion 1
was ging replacement 1
was waren replacement 1
was wat replacement 1
suimpjes sguimpjes noncompletion 1
Expand All @@ -666,6 +675,8 @@ itten eten replacement 2
ziekje muziekje replacement 1
Bobdebouwerkattet Bobdebouwerkwartet replacement 1
blazen geblazen replacement 1
ik je replacement 1
ik jij replacement 1
ik mij replacement 1
wou wilde replacement 3
vashouden vasthouden noncompletion 2
Expand Down Expand Up @@ -802,6 +813,9 @@ rekele rekenen replacement 1
daarzo daar replacement 1
voorbeeld bijvoorbeeld replacement 1
freene free-runnen replacement 1
x racen-x replacement 1
x los ? replacement 1
x gelaten ? replacement 1
x gewoon replacement 1
feerunnen freerunnen replacement 1
wees geweest replacement 1
Expand All @@ -812,3 +826,22 @@ zorgen verzorgen replacement 1
kantie vakantie replacement 1
zomerkantie vakantie replacement 1
boekje 'n boekje voor David explanation 1
xxx ? replacement 1
koppie kopbal replacement 1
weer meer replacement 1
kubbelbum bubblegum ? replacement 1
probreren proberen replacement 2
doem doen replacement 1
durf durft noncompletion 1
tik tikkertje replacement 1
teek betekent replacement 1
palaplu paraplu replacement 1
valaag vandaag replacement 1
, dat replacement 1
tegen aan replacement 1
zeggen vragen replacement 1
vliegtuig vliegveld replacement 1
anneen alleen replacement 1
na dan replacement 1
noen doen replacement 1
teeëndertig tweeëndertig replacement 1
Loading

0 comments on commit bd491dd

Please sign in to comment.