From bd491dd3ec1be282b02757535ccef8103dcc999f Mon Sep 17 00:00:00 2001 From: Jan Odijk Date: Fri, 6 Dec 2024 16:04:43 +0100 Subject: [PATCH] Postnominal modifiers update --- src/sastadev/Sziplus.py | 6 +- src/sastadev/basicreplacements.py | 2 + src/sastadev/cleanCHILDEStokens.py | 8 +- src/sastadev/correcttreebank.py | 9 +- .../children_samplecorrections.txt | 41 +++- .../data/childescorrections/donefiles.txt | 188 +++++++++--------- src/sastadev/data/macros/newimperatives.txt | 13 +- src/sastadev/lexicon.py | 3 + src/sastadev/normalise_lemma.py | 2 +- src/sastadev/postnominalmodifiers.py | 123 ++++++++++++ src/sastadev/treebankfunctions.py | 14 +- src/sastadev/treetransform.py | 120 +++++++++-- src/sastadev/trycleantext.py | 14 ++ src/sastadev/trypostnominalmodifiers.py | 54 +++++ 14 files changed, 477 insertions(+), 120 deletions(-) create mode 100644 src/sastadev/postnominalmodifiers.py create mode 100644 src/sastadev/trycleantext.py create mode 100644 src/sastadev/trypostnominalmodifiers.py diff --git a/src/sastadev/Sziplus.py b/src/sastadev/Sziplus.py index 4a1dc8c..5ab7091 100644 --- a/src/sastadev/Sziplus.py +++ b/src/sastadev/Sziplus.py @@ -85,12 +85,12 @@ def isvcinforppart(node: SynTree) -> bool: def isrealnode(node: SynTree) -> bool: ''' - The function *isrealnode* determines whether a nide is a real node, which it is if: + The function *isrealnode* determines whether a node is a real node, which it is if: * it is not a node for an interpunction sign * it is not a nonfinite complement * if it is not a separable particle word of a verb - * if it is not an index node 9as detemined by the function *isindexnode*) + * if it is not an index node (as determined by the function *isindexnode*) The function *isindexnode* is defined as follows: @@ -102,7 +102,7 @@ def isrealnode(node: SynTree) -> bool: result = False elif isvcinforppart(node): result = False - elif rel == 'svp' and pt in node.attrib: + elif rel == 'svp' and 'word' in node.attrib: result = False 
elif isindexnode(node): result = False diff --git a/src/sastadev/basicreplacements.py b/src/sastadev/basicreplacements.py index c4057c0..8c7ca74 100644 --- a/src/sastadev/basicreplacements.py +++ b/src/sastadev/basicreplacements.py @@ -245,6 +245,8 @@ def combine(strlist: List[str]) -> str: ('blon', 'ballon', pron, infpron, voweldel, dp), ('ooien', 'gooien', pron, wrongpron, onsetred, dp), ('poppe', 'pop', pron, wrongpron, emphasis, dp), + ('lus', 'lust', pron, infpron, codared, dp), + ('jou', 'jouw', pron, infpron, codared, -dp) # Td 22, 30 ik wil ook keer naar jou huis find criterion # ('leggen', 'liggen', lexical, dial, '', dp), # moved to corrector : only if parse is illformed # ('legt', 'ligt', lexical, dial, '', dp), # moved to corrector : only if parse is illformed # ('leg', 'lig', lexical, dial, '', dp) # moved to corrector : only if parse is illformed diff --git a/src/sastadev/cleanCHILDEStokens.py b/src/sastadev/cleanCHILDEStokens.py index 45857d3..8b9d24d 100644 --- a/src/sastadev/cleanCHILDEStokens.py +++ b/src/sastadev/cleanCHILDEStokens.py @@ -230,13 +230,15 @@ def removesuspecttokens(tokens: List[Token]) -> List[Token]: -RobustnessTuple = Tuple[Pattern, str, str, str] +RobustnessTuple = Tuple[Pattern, str, str, str] # regex, instring, outstring, message robustnessrules: List[RobustnessTuple] = [(re.compile(r'\u2026'), '\u2026', '...', 'Horizontal Ellipsis (\u2026, Unicode U+2026) replaced by a sequence of three Full Stops (..., Unicode U+002E) '), (re.compile('#'), '#', '', 'Number Sign (#, Unicode U+0023) removed'), #(re.compile('#'), '#', '(.)', 'Number Sign (#, Unicode U+0023) replaced by CHAT (short) pause code: (.)'), - (re.compile(r'\[\+bch\]'), '[+bch]', '[+ bch]', 'Missing space'), - (re.compile(r'\[\+trn\]'), '[+trn]', '[+ trn]', 'Missing space'), + (re.compile(r'\[\+bch\]', re.I), '[+bch]', '[+ bch]', 'Missing space'), + (re.compile(r'\[\+trn\]', re.I), '[+trn]', '[+ trn]', 'Missing space'), + (re.compile(r'\[\+ea\]', re.I), '[+ea]', '[+ 
ea]', 'Missing space'), + (re.compile(r'\[%(?![\s])'), '[%', '[% ', 'Missing space'), (re.compile(r'\[:(?![:\s])'), '[:', '[: ', 'Missing space'), (re.compile(r'(?<=\w)\+\.\.\.'), '+...', ' +...', 'Missing space'), (re.compile(r'\u2018'), '\u2018', "'", "Left Single Quotation Mark (\u2018. Unicode U+2018) replaced by Apostrophe ' (Unicode U+0027)"), diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py index 4c65a68..e6a8119 100644 --- a/src/sastadev/correcttreebank.py +++ b/src/sastadev/correcttreebank.py @@ -21,6 +21,7 @@ SASTA, ADULTSPELLINGCORRECTION, ALLSAMPLECORRECTIONS, BASICREPLACEMENTS, CONTEXT, HISTORY, CHILDRENSPELLINGCORRECTION, THISSAMPLECORRECTIONS, replacementsubsources ) +from sastadev.postnominalmodifiers import transformbwinnp, transformppinnp from sastadev.sastatok import sasta_tokenize from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict, MetaElement, @@ -41,7 +42,7 @@ showtree, simpleshow, subclasscompatible, transplant_node, treeinflate, treewithtokenpos, updatetokenpos) -from sastadev.treetransform import transformtreeld, transformtreenogeen, transformtreenogde +from sastadev.treetransform import transformtagcomma, transformtreeld, transformtreenogeen, transformtreenogde ampersand = '&' @@ -564,7 +565,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C # tree transformations if correctionparameters.method in ['tarsp', ' stap']: + stree = transformtagcomma(stree) stree = transformtreeld(stree) + stree = transformppinnp(stree) + stree = transformbwinnp(stree) stree = transformtreenogeen(stree) stree = transformtreenogde(stree) @@ -908,7 +912,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C # tree transformations if correctionparameters.method in ['tarsp', ' stap']: + fulltree = transformtagcomma(fulltree) fulltree = transformtreeld(fulltree) + 
fulltree = transformppinnp(fulltree) + fulltree = transformbwinnp(fulltree) fulltree = transformtreenogeen(fulltree) fulltree = transformtreenogde(fulltree) diff --git a/src/sastadev/data/childescorrections/children_samplecorrections.txt b/src/sastadev/data/childescorrections/children_samplecorrections.txt index 72bf2ec..4e1c207 100644 --- a/src/sastadev/data/childescorrections/children_samplecorrections.txt +++ b/src/sastadev/data/childescorrections/children_samplecorrections.txt @@ -121,6 +121,7 @@ siko cirkel replacement 1 bjokje blokje replacement 1 kikke kikker noncompletion 1 springe springen noncompletion 1 +in met replacement 1 in erin noncompletion 3 in deze replacement 2 in op replacement 2 @@ -211,7 +212,8 @@ ke die replacement 1 dese deze replacement 4 hijs hij is replacement 1 eigk eigenlijk replacement 1 -de het replacement 21 +de het replacement 23 +de zitten replacement 1 de der noncompletion 11 de dan replacement 2 de deze replacement 2 @@ -249,7 +251,7 @@ worst borst replacement 1 ennu en replacement 1 koeie koeien noncompletion 1 o op noncompletion 3 -mij mijn noncompletion 9 +mij mijn noncompletion 10 bet bent noncompletion 1 bent ben replacement 1 tinnen tillen replacement 1 @@ -378,6 +380,8 @@ deze dit replacement 4 deze waar hoort deze explanation 1 evallen gevallen noncompletion 1 nog ook explanation 1 +voor bij replacement 1 +voor om replacement 2 voor van explanation 1 wat waar replacement 2 wordt hoort replacement 2 @@ -443,6 +447,7 @@ pantoet pannekoeken replacement 2 eet eten replacement 2 bejo hallo explanation 1 teje gaan replacement 2 +maar normaal replacement 1 maar en replacement 7 maar want replacement 1 maar ga daar maar explanation 1 @@ -472,7 +477,7 @@ bochje bochtje explanation 1 anders andere explanation 1 saar daar explanation 1 saal haal explanation 1 -saan gaan explanation 1 +saan gaan replacement 2 bruiken gebruiken explanation 1 keerd verkeerd explanation 1 vrastauto vrachtauto replacement 1 @@ -520,6 +525,7 @@ gun ging 
replacement 2 gin ging noncompletion 4 som soms noncompletion 6 witte wit replacement 2 +dee deed noncompletion 1 dee was replacement 2 dan toen replacement 6 ging gingen replacement 2 @@ -549,12 +555,12 @@ lew wel replacement 2 slin ging replacement 2 stond stonden replacement 4 stond was replacement 2 +van bij replacement 2 van om replacement 2 van door replacement 1 van met replacement 1 van uit replacement 1 van veel replacement 1 -van bij replacement 1 arreen alleen replacement 2 het er replacement 2 het hij replacement 2 @@ -614,6 +620,7 @@ zaten hadden replacement 1 zo zo'n replacement 1 ook blauwe ramen zitten er ook explanation 1 ginnen gingen replacement 2 +dat daar replacement 1 dat het replacement 1 allegrooste allegrootste noncompletion 2 en maar replacement 3 @@ -641,9 +648,11 @@ ta tat noncompletion 1 peltje pijltje replacement 1 pijwtje pijltje replacement 1 sef zeg replacement 1 +doe toen replacement 1 doe ga replacement 1 zwaarden vechten replacement 1 lekkes lekkers noncompletion 1 +was ging replacement 1 was waren replacement 1 was wat replacement 1 suimpjes sguimpjes noncompletion 1 @@ -666,6 +675,8 @@ itten eten replacement 2 ziekje muziekje replacement 1 Bobdebouwerkattet Bobdebouwerkwartet replacement 1 blazen geblazen replacement 1 +ik je replacement 1 +ik jij replacement 1 ik mij replacement 1 wou wilde replacement 3 vashouden vasthouden noncompletion 2 @@ -802,6 +813,9 @@ rekele rekenen replacement 1 daarzo daar replacement 1 voorbeeld bijvoorbeeld replacement 1 freene free-runnen replacement 1 +x racen-x replacement 1 +x los ? replacement 1 +x gelaten ? replacement 1 x gewoon replacement 1 feerunnen freerunnen replacement 1 wees geweest replacement 1 @@ -812,3 +826,22 @@ zorgen verzorgen replacement 1 kantie vakantie replacement 1 zomerkantie vakantie replacement 1 boekje 'n boekje voor David explanation 1 +xxx ? replacement 1 +koppie kopbal replacement 1 +weer meer replacement 1 +kubbelbum bubblegum ? 
replacement 1 +probreren proberen replacement 2 +doem doen replacement 1 +durf durft noncompletion 1 +tik tikkertje replacement 1 +teek betekent replacement 1 +palaplu paraplu replacement 1 +valaag vandaag replacement 1 +, dat replacement 1 +tegen aan replacement 1 +zeggen vragen replacement 1 +vliegtuig vliegveld replacement 1 +anneen alleen replacement 1 +na dan replacement 1 +noen doen replacement 1 +teeëndertig tweeëndertig replacement 1 diff --git a/src/sastadev/data/childescorrections/donefiles.txt b/src/sastadev/data/childescorrections/donefiles.txt index 05e3c76..67fd284 100644 --- a/src/sastadev/data/childescorrections/donefiles.txt +++ b/src/sastadev/data/childescorrections/donefiles.txt @@ -1,108 +1,114 @@ -vklstap\intreebanks\STAP_10.xml -vklstapfase2\intreebanks\kind1.xml -auristrain\intreebanks\TD05.xml -auristest\intreebanks\TD15.xml +auristrain\intreebanks\DLD16.xml +auristrain\intreebanks\TD02.xml +AurisTrain\intreebanks\TD03.xml +Auris\intreebanks\TD03.xml +vklstap\intreebanks\STAP_05.xml +auristrain\intreebanks\TD24.xml auristrain\intreebanks\TD09.xml -vkltarsp\intreebanks\Tarsp_02.xml -vkltarsp\intreebanks\TARSP_09.xml -vklstapfase2\intreebanks\STAP_DP.xml -Auris\intreebanks\TD16.xml +vklstap\intreebanks\STAP_08.xml +vkltarsp\intreebanks\TARSP_10.xml +auristrain\intreebanks\TD16.xml +vkltarsp\intreebanks\Tarsp_05.xml +vklstapfase2\intreebanks\K2.xml vklastafase2\intreebanks\ASTA_16.xml +vklstap\intreebanks\STAP_10.xml +auristrain\intreebanks\TD13.xml +vklasta\intreebanks\ASTA_02.xml vklasta\intreebanks\ASTA_07.xml -vklstapfase2\intreebanks\STP_Du.xml -vkltarsp\intreebanks\TARSP_07.xml -Auris\intreebanks\DLD03.xml -vklasta\intreebanks\ASTA_10.xml -vklastafase2\intreebanks\ASTA_11.xml -vklastafase2\intreebanks\ASTA_15.xml +auristest\intreebanks\TD10.xml +auristrain\intreebanks\TD22.xml +AurisTrain\intreebanks\TD02.xml vkltarsp\intreebanks\tarsp_01.xml -vklstap\intreebanks\STAP_03.xml -vklasta\intreebanks\asta_10.xml 
-vklstap\intreebanks\STAP_04.xml -vklstap\intreebanks\STAP_05.xml +auristrain\intreebanks\DLD14.xml +test_tarsp\intreebanks\test_tarsp.xml +auristest\intreebanks\TD30.xml +vklasta\intreebanks\ASTA_01.xml +vklastafase2\intreebanks\ASTA_14.xml +vklstapfase2\intreebanks\kind1.xml vklstapfase2\intreebanks\SASTA_STAP_023.xml -Auristrain\intreebanks\TD16.xml -vklasta\intreebanks\asta_04.xml -auristrain\intreebanks\TD14.xml -auristest\intreebanks\DLD20.xml -auristest\intreebanks\TD25.xml -auristrain\intreebanks\TD08.xml -vklstapfase2\intreebanks\STAP025.xml -vklasta\intreebanks\ASTA_09.xml -vklstapfase2\intreebanks\SASTA_STAP_022.xml -auristrain\intreebanks\TD24.xml -AurisTrain\intreebanks\TD03.xml -Auris\intreebanks\TD03.xml -vklstap\intreebanks\STAP_06.xml -vklstapfase2\intreebanks\STP_Ko.xml -vklstapfase2\intreebanks\STP_MP_MZ.xml -vklstapfase2\intreebanks\STP_Da.xml -vkltarsp\intreebanks\Tarsp_04.xml -vkltarsp\intreebanks\Tarsp_05.xml -auristrain\intreebanks\TD23.xml -auristest\intreebanks\TD10.xml -auristrain\intreebanks\TD16.xml -auristrain\intreebanks\DLD11.xml -Auris\intreebanks\TD01.xml -auristrain\intreebanks\TD06.xml -auristrain\intreebanks\TD19.xml -vklstapfase2\intreebanks\STP_3.xml -auristrain\intreebanks\TD18.xml -vkltarsp\intreebanks\Tarsp_03.xml -vklasta\intreebanks\ASTA_04.xml -Auris\intreebanks\TD13.xml -AurisTest\intreebanks\TD01.xml -Auristrain\intreebanks\TD07.xml -auristrain\intreebanks\TD21.xml +elsdejong\intreebanks\STAP_01.xml test_stap\intreebanks\test_stap.xml -auristrain\intreebanks\TD02.xml -vkltarsp\intreebanks\TARSP_08.xml -auristrain\intreebanks\TD26.xml auristrain\intreebanks\DLD03.xml +vkltarsp\intreebanks\Tarsp_04.xml +vklastafase2\intreebanks\ASTA_11.xml +vklstapfase2\intreebanks\STP_Ko.xml +vklstapfase2\intreebanks\STAP025.xml +auristrain\intreebanks\TD08.xml vklstap\intreebanks\STAP_09.xml -VKLStapFase2\intreebanks\K2.xml -vkltarsp\intreebanks\Tarsp_01.xml -vklasta\intreebanks\ASTA_03.xml +auristrain\intreebanks\TD19.xml 
+elsdejong\intreebanks\STAP_05.xml +elsdejong\intreebanks\STAP_03.xml +vklasta\intreebanks\ASTA_06.xml +Auris\intreebanks\TD16.xml +vklstapfase2\intreebanks\STP_Du.xml +vkltarsp\intreebanks\TARSP_07.xml auristest\intreebanks\TD01.xml -Auris\intreebanks\TD18.xml -auristrain\intreebanks\TD29.xml -vklstapfase2\intreebanks\STAP_024.xml -auristrain\intreebanks\TD03.xml -vklasta\intreebanks\asta_01.xml -vklasta\intreebanks\ASTA_05.xml -vkltarsp\intreebanks\TARSP_13.xml -vklastafase2\intreebanks\ASTA_13.xml -vklastafase2\intreebanks\ASTA_14.xml -Auris\intreebanks\DLD16.xml +auristest\intreebanks\DLD20.xml +vklstapfase2\intreebanks\SASTA_STAP_022.xml +auristrain\intreebanks\TD21.xml +vklstapfase2\intreebanks\STP_Da.xml auristest\intreebanks\TD20.xml -auristrain\intreebanks\DLD14.xml -auristrain\intreebanks\TD07.xml -vklstapfase2\intreebanks\K2.xml -AurisTrain\intreebanks\TD02.xml -auristrain\intreebanks\TD12.xml -vkltarsp\intreebanks\TARSP_10.xml -auristrain\intreebanks\DLD16.xml +vklstap\intreebanks\stap_02.xml +aurisbugs\intreebanks\Kind_3_JM.xml +auristrain\intreebanks\TD14.xml +auristrain\intreebanks\TD18.xml +auristrain\intreebanks\TD06.xml +vklasta\intreebanks\ASTA_09.xml vklasta\intreebanks\ASTA_08.xml -vklasta\intreebanks\ASTA_02.xml -Auris\intreebanks\TD02.xml -auristest\intreebanks\TD30.xml +auristrain\intreebanks\TD07.xml +vkltarsp\intreebanks\TARSP_09.xml +vklastafase2\intreebanks\ASTA_15.xml +vklstap\intreebanks\STAP_03.xml +auristrain\intreebanks\TD28.xml +auristrain\intreebanks\TD03.xml test_asta\intreebanks\test_asta.xml -auristrain\intreebanks\TD22.xml -vklstap\intreebanks\STAP_08.xml -auristrain\intreebanks\TD04.xml -Auris\intreebanks\TD11.xml +auristrain\intreebanks\TD05.xml +vkltarsp\intreebanks\TARSP_08.xml +vklasta\intreebanks\asta_01.xml Auristrain\intreebanks\DLD03.xml -auchanntest\intreebanks\auchanNtest01.xml +Auris\intreebanks\TD13.xml +Auris\intreebanks\TD11.xml +vklastafase2\intreebanks\ASTA_13.xml +vklstapfase2\intreebanks\STP_MP_MZ.xml 
+vklstap\intreebanks\STAP_06.xml +auristrain\intreebanks\TD29.xml +vklstap\intreebanks\STAP_02.xml +auristrain\intreebanks\TD04.xml +Auris\intreebanks\TD01.xml +Auristrain\intreebanks\TD16.xml +vklstapfase2\intreebanks\STAP_DP.xml +auristrain\intreebanks\DLD11.xml vkltarsp\intreebanks\TARSP_06.xml -auristrain\intreebanks\TD13.xml +auristest\intreebanks\TD15.xml +auristrain\intreebanks\TD23.xml +vklasta\intreebanks\asta_10.xml +auristest\intreebanks\DLD07.xml +vklstapfase2\intreebanks\STP_3.xml +auristest\intreebanks\TD25.xml +auchanntest\intreebanks\auchanNtest01.xml +Auris\intreebanks\TD02.xml +Auris\intreebanks\TD18.xml +vklasta\intreebanks\ASTA_05.xml +vkltarsp\intreebanks\TARSP_13.xml +vkltarsp\intreebanks\Tarsp_02.xml +Auris\intreebanks\DLD16.xml +vkltarsp\intreebanks\Tarsp_03.xml +AurisTest\intreebanks\TD01.xml +Auristrain\intreebanks\TD07.xml vklstap\intreebanks\STAP_07.xml +vklasta\intreebanks\asta_04.xml +Auris\intreebanks\DLD03.xml +auristrain\intreebanks\TD12.xml auristrain\intreebanks\TD11.xml -vklasta\intreebanks\ASTA_01.xml -vklasta\intreebanks\ASTA_06.xml -auristest\intreebanks\DLD07.xml -vklstapfase2\intreebanks\STP_KC.xml vklstap\intreebanks\stap_03.xml -test_tarsp\intreebanks\test_tarsp.xml -vklstap\intreebanks\stap_02.xml -vklstap\intreebanks\STAP_02.xml -auristrain\intreebanks\TD28.xml +vklstapfase2\intreebanks\STAP_024.xml +vklasta\intreebanks\ASTA_10.xml +vklstap\intreebanks\STAP_04.xml +vklstapfase2\intreebanks\STP_KC.xml +vklasta\intreebanks\ASTA_03.xml +vklasta\intreebanks\ASTA_04.xml +VKLStapFase2\intreebanks\K2.xml +elsdejong\intreebanks\STAP_04.xml +elsdejong\intreebanks\STAP_06.xml +auristrain\intreebanks\TD26.xml +vkltarsp\intreebanks\Tarsp_01.xml diff --git a/src/sastadev/data/macros/newimperatives.txt b/src/sastadev/data/macros/newimperatives.txt index 524a46c..dbf97af 100644 --- a/src/sastadev/data/macros/newimperatives.txt +++ b/src/sastadev/data/macros/newimperatives.txt @@ -95,14 +95,16 @@ wxyz5 = """(%basicimperative% ynquery = 
"""@cat="sv1" and - (@rel="--" or @rel="dp") and + (@rel="--" or @rel="dp" or %nofollowinghenucl%) and not(%topcontainsperiodmark%) and not(%topcontainsexclamationmark%) and node[@rel="hd" and @pt="ww" and @pvtijd !="conj" and (@stype="ynquestion" or %topcontainsquestionmark% ) ] and (node[@rel="su"] or %topcontainsquestionmark%) and (not(%impmodfound%) or %topcontainsquestionmark% ) """ - + +nofollowinghenucl = """(@rel="nucl" and not(../node[@rel="tag" and (@lemma="hè" or @lemma="he") and @begin >= ../node[@rel="nucl"]/@end])) """ + wondx = """(%ynquery% and %realcomplormodnodecount% <= 2 )""" wond4 = """(%ynquery% and %realcomplormodnodecount% = 3)""" @@ -198,7 +200,7 @@ Tarsp_OndWVC = """ Tarsp_WBVC = """(%declarative% and %Tarsp_W% and %Tarsp_B_X% and %Tarsp_VC_X% and %realcomplormodnodecount% = 2 )""" -Tarsp_Ov4 = """(%declarative% and %realcomplormodnodecount% = 3 and not(%Tarsp_OndWBVC%) and not(%Tarsp_OndWBB%) and not(%Tarsp_OndWVCVCX%))""" +Tarsp_Ov4 = """(%declarative% and %realcomplormodnodecount% = 3 and %Tarsp_W% and not(%Tarsp_OndWBVC%) and not(%Tarsp_OndWBB%) and not(%Tarsp_OndWVCVCX%))""" Tarsp_Ov5 = """(%declarative% and %realcomplormodnodecount% = 4 and not(%Tarsp_VCWOndBB%) and not(%Tarsp_OndWVCVCX%) and not(%Tarsp_BWOndBB%) )""" @@ -245,7 +247,7 @@ old_Tarsp_BBX = """((@cat="top" and )""" -Tarsp_BBX = """(%coreBBX% or %clausalBBX% or %wvcBBX% or %abcBBX% or %vabcBBX% or %nadvpbbBBX% or %bplusxpbBBX%)""" +Tarsp_BBX = """(%coreBBX% or %clausalBBX% or %wvcBBX% or %abcBBX% or %vabcBBX% or %nadvpbbBBX% or %bplusxpbBBX% or %smallclausebbx%)""" old_corebb = """ ((@special="er_loc" or @pt="bw" or @pt="vz" or (@pt="adj" and @rel="dp" )) and @rel!="hd" and @rel!="hdf")""" @@ -262,6 +264,8 @@ vabcBBX = """((@cat!="smain" and @cat!="sv1") and nadvpbbBBX = """(node[not(%Tarsp_B%) and (not(@pt) or (@pt!="ww" and @pt!="let")) and @cat!="advp"] and node[@cat="advp" and node[@rel="mod" ] and node[@rel="hd" ]] and count(node) = 2 and %onlythesearerealwords%) """ 
bplusxpbBBX = """(node[%Tarsp_B% and @rel!="tag"] and not(node[@pt="ww"]) and node[not(%Tarsp_B%) and count(.//node[%Tarsp_B%]) = 1]and %onlythesearerealwords%)""" + +smallclausebbx = """(@cat="smain" and node[%complement%] and count(node[%coreB%])=2 and count(node)=3 and %onlythesearerealwords% )""" Tarsp_Ov3 = """(%declarative% and not(%Tarsp_OndWVC%) and @@ -270,6 +274,7 @@ Tarsp_Ov3 = """(%declarative% and not(%Tarsp_WBVC%) and not(%Tarsp_OndB%) and not(%Tarsp_OndVC%) and + %Tarsp_W% and %realcomplormodnodecount% = 2) """ diff --git a/src/sastadev/lexicon.py b/src/sastadev/lexicon.py index 006ed17..f559bd8 100644 --- a/src/sastadev/lexicon.py +++ b/src/sastadev/lexicon.py @@ -19,6 +19,7 @@ namepart_isa_namepart_uc) from sastadev.readcsv import readcsv from sastadev.sastatypes import CELEX_INFL, DCOITuple, Lemma, SynTree, WordInfo +from sastadev.stringfunctions import ispunctuation alpinoparse = settings.PARSE_FUNC space = ' ' @@ -236,6 +237,8 @@ def nochildword(wrd: str) -> bool: return result def isalpinonouncompound(wrd: str) -> bool: + if ispunctuation(wrd): + return False fullstr = f'geen {wrd}' # geen makes it a noun and can combine with uter and neuter, count and mass, sg and plural tree = alpinoparse(fullstr) # find the noun diff --git a/src/sastadev/normalise_lemma.py b/src/sastadev/normalise_lemma.py index 77b6e2b..a49f81b 100644 --- a/src/sastadev/normalise_lemma.py +++ b/src/sastadev/normalise_lemma.py @@ -4,7 +4,7 @@ Example: word = verkeerslichtjes. Alpino lemma = verkeer_licht. But for many applications we need the 'normal' lemma for such as compound, i.e. -for *verkeerslichtje* that is *verkeersicht*. This is achieved by the function *normalizelemma* +for *verkeerslichtje* that is *verkeerslicht*. 
This is achieved by the function *normalizelemma* The function has been tested against all noun compounds in Lassy-Klein diff --git a/src/sastadev/postnominalmodifiers.py b/src/sastadev/postnominalmodifiers.py new file mode 100644 index 0000000..a63c030 --- /dev/null +++ b/src/sastadev/postnominalmodifiers.py @@ -0,0 +1,123 @@ +import copy +from lxml import etree +from sastadev.metadata import Meta, defaultpenalty, insertion, bpl_delete, SASTA +from sastadev.sastatypes import SynTree, XpathExpression +from sastadev.treebankfunctions import getattval, hasnominativehead +from typing import List, Tuple + +postnominalmodifier = "Postnominal Modifier Adaptation" + +ppinnpxpath = """.//node[@cat="pp" and node[@rel="hd" and @lemma!="van" and @lemma!="met" and @lemma!="mee"] and + parent::node[@cat="np" and node[@rel="hd" and @pt!="ww"]]]""" + +modbwinnpxpath = """.//node[(@lemma="ook" or @lemma="alleen" or @lemma="eerst") and + parent::node[@cat="np" and node[@rel="hd" and @pt!="ww"]]]""" + +def transformppinnp(instree: SynTree) -> SynTree: + result = transformmodinnp(instree, ppinnpxpath) + return result + +def transformbwinnp(instree: SynTree) -> SynTree: + result = transformmodinnp(instree, modbwinnpxpath) + return result + + + +def transformmodinnp(instree: SynTree, modxpath: XpathExpression) -> SynTree: + stree = copy.deepcopy(instree) + ppsinnp = stree.xpath(modxpath) + for ppinnp in ppsinnp: + theparent = ppinnp.getparent() + grandparent = theparent.getparent() + grandparentcat = getattval(grandparent, 'cat') + if grandparentcat == 'top': + # create a new clause node under + clausebegin = getattval(theparent, 'begin') + clauseend = getattval(theparent, 'end') + clausenode = etree.Element('node', {'cat': 'smain', 'begin': clausebegin, 'end': clauseend, 'id': "1000"}) + grandparent.append(clausenode) + # put the np under this clausenode + clausenode.append(theparent) + # detach the pp to under the clausenode + detach(ppinnp) + # make and insert a verb + # determine 
getal of the np + # next is not needed and not desirable + # getal = getgetal(theparent) + # moetattrib = getmoetattrib(getal, "1001", theparent.attrib['end']) + # smallclauseverb = etree.Element('node', moetattrib) + # clausenode.append(smallclauseverb) + # insertword = getattval(smallclauseverb, 'word') + # insertpos = getattval(smallclauseverb, 'begin') + # meta1 = Meta(insertion, [insertword], annotatedposlist=[insertpos], + # annotatedwordlist=[], annotationposlist=[insertpos], + # annotationwordlist=[insertword], cat=postnominalmodifier, source=SASTA, penalty=defaultpenalty, + # backplacement=bpl_delete) + # metadata = [meta1] + + else: + detach(ppinnp) + # metadata = [] + + return stree + +def detach(node: SynTree): + """ + Remove the node from its parent and attach it to its grandparent, but only if the node sits at an edge of its parent (node.end == parent.end or node.begin == parent.begin) + :param node: the modifier node to move up; its parent must itself have a parent node + :return: None, the tree that contains the node is modified in place + """ + + # remove node from its parent and adapt the parent's rel, begin and end; only if node and parent share an edge (same end or same begin) + parent = node.getparent() + targetnode = parent.getparent() + node_end = getattval(node, 'end') + parent_end = getattval(parent, 'end') + node_begin = getattval(node, 'begin') + parent_begin = getattval(parent, 'begin') + if node_end == parent_end or node_begin == parent_begin: + parent.remove(node) + therel = 'su' if hasnominativehead(parent) else 'obj1' + parent.attrib['rel'] = therel # @@@ or obj1 if not clearly nominative + + # adapt the begin and end of the parent to the children that remain after the removal + newbegin, newend = getbeginandend(parent) + parent.attrib['begin'] = newbegin + parent.attrib['end'] = newend + + # append node to the targetnode + targetnode.append(node) + + +def getmoetattrib(getal: str, id: int, prevend: str) -> dict: + word = 'moet' if getal == 'ev' else 'moeten' + moetbegin = str(int(prevend) - 1 + 5) + moetend = str(int(moetbegin) + 1) + result = {'lemma': 'moeten', 'word': word, 'pt': 'ww', 'wvorm': 'pv', 'pvagr': getal, + 'pvtijd': 'tgw', 'root': 'moet', 'sense': 'moet', 'postag': f'WW(pv, tgw,{getal})', + 
'id': id, 'begin': moetbegin, 'end': moetend} + return result + + +def getgetal(node: SynTree) -> str: + # @@to be extended + for child in node: + childrel = getattval(child, 'rel') + if childrel == 'hd': + if 'getal' in child.attrib: + return child.attrib['getal'] + else: + return 'ev' + elif childrel == 'cnj': + return 'mv' # @@this is a adhoc and will be sufficient for most cases but must be extended' + +def getbeginandend(node: SynTree) -> Tuple[str, str]: + curbegin = 10000 + curend = 0 + for child in node: + if int(child.attrib['begin']) < curbegin: + curbegin = int(child.attrib['begin']) + if int(child.attrib['end']) > curend: + curend = int(child.attrib['end']) + return str(curbegin), str(curend) + diff --git a/src/sastadev/treebankfunctions.py b/src/sastadev/treebankfunctions.py index 9c86243..db168a9 100644 --- a/src/sastadev/treebankfunctions.py +++ b/src/sastadev/treebankfunctions.py @@ -788,7 +788,8 @@ def addmetadata(stree: SynTree, meta: Metadata) -> SynTree: def iswordnode(thenode: SynTree) -> bool: - result = 'pt' in thenode.attrib or 'pos' in thenode.attrib + # result = 'pt' in thenode.attrib or 'pos' in thenode.attrib + result = 'word' in thenode.attrib return result @@ -1730,6 +1731,17 @@ def findfirstnode(tree: SynTree, condition: Callable[[SynTree], bool]) -> Option return None +def hasnominativehead(node: SynTree) -> bool: + hd = find1(node, './node[@rel="hd"]') + cnjs = node.xpath('./node[@rel="cnj"]') # coordinations + if cnjs != []: + result = any([hasnominativehead(cnj) for cnj in cnjs]) + elif hd is not None: + result = getattval(hd, 'naamval') == 'nomin' + else: + result = False + return result + def nominal(node: SynTree) -> bool: pt = getattval(node, 'pt') cat = getattval(node, 'cat') diff --git a/src/sastadev/treetransform.py b/src/sastadev/treetransform.py index e19be4d..d9d4207 100644 --- a/src/sastadev/treetransform.py +++ b/src/sastadev/treetransform.py @@ -1,8 +1,29 @@ import copy -from sastadev.treebankfunctions import 
immediately_precedes, showtree +from sastadev.conf import settings +from sastadev.treebankfunctions import find1, getattval, getbeginend, getnodeyield, getyield, \ + immediately_precedes, iswordnode, showtree from sastadev.sastatypes import SynTree from lxml import etree +space = ' ' + +tagcommaclausexpath = """.//node[@cat="smain" and + node[@pt="n" and @end = ancestor::alpino_ds/descendant::node[@lemma="," ]/@begin and + @begin = ancestor::node[@cat="top"]/@begin]]""" + +sv1xpath = """.//node[@cat="sv1" and parent::node[@cat="top"]]""" +tagxpath = """.//node[@pt="n" and @end = ancestor::alpino_ds/descendant::node[@lemma="," ]/@begin and + @begin = ancestor::node[@cat="top"]/@begin]""" +tagcommaxpath = """.//node[@lemma=","]""" +notsv1xpath = """.//node[(not(@cat) or @cat!="sv1") and parent::node[@cat="top"]]""" + +nognonpxpath = """.//node[@lemma="nog" and parent::node[not(@cat="np")]]""" +nogxpath = """.//node[@lemma="nog" and parent::node[@cat="np" and not(node[@rel="hd" and @pt="ww"])]]""" +eenxpath = """.//node[(@lemma="een" or @lemma="één" or @lemma="eentje" or @lemma="meer" or @lemma="minder" or + @lemma="zo'n" or @pt="tw") and parent::node[@cat="np"]]""" +dexpath = """.//node[(@lemma="de" or @lemma="het" or @lemma="deze" or @lemma="die") and parent::node[@cat="np"]]""" + + def transformtreeld(stree:SynTree) -> SynTree: debug = False if debug: @@ -27,9 +48,7 @@ def transformtreenogeen(stree:SynTree) -> SynTree: if debug: showtree(stree, 'intree') newstree = copy.deepcopy(stree) - nogxpath = """.//node[@lemma="nog" and parent::node[not(@cat="np")]]""" - eenxpath = """.//node[(@lemma="een" or @lemma="één" or @lemma="eentje") and parent::node[@cat="np"]]""" - nogs = newstree.xpath(nogxpath) + nogs = newstree.xpath(nognonpxpath) eens = newstree.xpath(eenxpath) for nog in nogs: for een in eens: @@ -45,15 +64,92 @@ def transformtreenogde(stree:SynTree) -> SynTree: if debug: showtree(stree, 'intree') newstree = copy.deepcopy(stree) - nogxpath = 
""".//node[@lemma="nog" and parent::node[@cat="np"]]""" - dexpath = """.//node[(@lemma="de" or @lemma="het" or @lemma="deze" or @lemma="die") and parent::node[@cat="np"]]""" nogs = newstree.xpath(nogxpath) des = newstree.xpath(dexpath) - for nog in nogs: - for de in des: - if immediately_precedes(nog, de, newstree): + eens = newstree.xpath(eenxpath) + if eens == []: # otherwise we have transformtreenogeen + for nog in nogs: + for de in des: + if immediately_precedes(nog, de, newstree): + nog.getparent().remove(nog) + de.getparent().getparent().append(nog) + if des == [] and eens == []: + nog_grandparent = nog.getparent().getparent() nog.getparent().remove(nog) - de.getparent().getparent().append(nog) + nog_grandparent.append(nog) + if debug: + showtree(newstree, 'outtree') + return newstree + +def transformtagcomma(stree: SynTree) -> SynTree: + debug = False + newtree = copy.deepcopy(stree) + match = find1(newtree, tagcommaclausexpath) + + if match is not None: + topnode = match.getparent() + thetag = find1(newtree, tagxpath) + thetagcomma = find1(newtree, tagcommaxpath) + thenodeyield = getnodeyield(newtree) + if isfiniteverbnode(thenodeyield[2]): + theyield = getyield(newtree) + sv1str = space.join(theyield[2:]) + sv1parse = settings.PARSE_FUNC(sv1str) + if debug: + showtree(sv1parse, 'sv1parse') + if sv1parse is not None: + sv1top = find1(sv1parse, './/node[@cat="top"]') + incr = 2 if thenodeyield[0].attrib['begin'] == '0' else 20 + sv1top = increasebeginends(sv1top, incr) + sv1node = find1(sv1top, sv1xpath) + otherpuncs = sv1top.xpath(notsv1xpath) + topattrib = {'cat': 'top', 'id': getattval(topnode, 'id'), 'begin': getattval(topnode, 'begin'), + 'end': getattval(topnode, 'end')} + newtop = etree.Element('node', topattrib) + duattrib = {'cat': 'du', 'rel': '--', 'id': f'{getattval(topnode, "id")}a', + 'begin': getattval(thetag, 'begin'), 'end': f'{getattval(sv1node, "end")}'} + thedu = etree.Element('node', duattrib) + thetag.attrib['rel'] = 'tag' + 
sv1node.attrib['rel'] = 'nucl' + thedu.append(thetag) + thedu.append(sv1node) + newtop.append(thetagcomma) + newtop.append(thedu) + newtop.extend(otherpuncs) + newtree.remove(topnode) + newtreechildren = [child for child in newtree] + newtreechildren = [newtop] + newtreechildren + newtree.extend(newtreechildren) + result = newtree + else: + result = stree + else: + result = stree + else: + result = stree + if debug: - showtree(newstree, 'outtree') - return newstree \ No newline at end of file + showtree(result, 'result') + return result + + +def isfiniteverbnode(node: SynTree) -> bool: + pt = getattval(node, 'pt') + wvorm = getattval(node, 'wvorm') + result = pt == 'ww' and wvorm == 'pv' + return result + +def increasebeginends(stree: SynTree, incr: int) -> SynTree: + newtree = copy.copy(stree) + newchildren = [increasebeginends(child, incr) for child in stree] + for child in newtree: + newtree.remove(child) + if iswordnode(newtree): + newtree.attrib['begin'] = str(int(newtree.attrib['begin']) + incr) + newtree.attrib['end'] = str(int(newtree.attrib['begin']) + 1) + else: + (b, e) = getbeginend(newchildren) + newtree.attrib['begin'] = b + newtree.attrib['end'] = e + newtree.extend(newchildren) + return newtree \ No newline at end of file diff --git a/src/sastadev/trycleantext.py b/src/sastadev/trycleantext.py new file mode 100644 index 0000000..63bd50c --- /dev/null +++ b/src/sastadev/trycleantext.py @@ -0,0 +1,14 @@ +from sastadev.cleanCHILDEStokens import cleantext, robustness +from sastadev import sastatok + + +utts = [(1, 'ja. [+EA]', 'ja.'), + (2, + 'dat [//] ligt meer verder ofzo, naast STAD. 
[+ VU] [%zwaait met haar handen om de richting aan te geven]', + 'dat ligt meer verder ofzo, naast STAD.')] + +for i, utt, correctcleanutt in utts: + robutt = robustness(utt) + cleanutt, meta = cleantext(robutt, repkeep=False) + if cleanutt != correctcleanutt: + print(f'NO:{utt}:{cleanutt}!={correctcleanutt}') \ No newline at end of file diff --git a/src/sastadev/trypostnominalmodifiers.py b/src/sastadev/trypostnominalmodifiers.py new file mode 100644 index 0000000..5315242 --- /dev/null +++ b/src/sastadev/trypostnominalmodifiers.py @@ -0,0 +1,54 @@ +from sastadev.postnominalmodifiers import transformppinnp +from lxml import etree +from sastadev.treebankfunctions import showtree, treeinflate + +examples = [(1, """ + + + + + + + + + + + ik naar omie . + + + + + + + + + + + + + + + + + + + + + + + + + +""")] + +exampletrees = [(i, etree.fromstring(example)) for i, example in examples] + +def main(): + for i, exampletree in exampletrees: + newtree = transformppinnp(exampletree) + showtree(newtree, 'newtree') + + + +if __name__ == '__main__': + main() \ No newline at end of file