From 1cb9a900ba820f0a5c89da23bbd490cfd9c1c8af Mon Sep 17 00:00:00 2001 From: Odijk Date: Tue, 4 Jan 2022 15:20:48 +0100 Subject: [PATCH 01/11] smallclauses V1 --- .idea/inspectionProfiles/Project_Default.xml | 28 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/sastadev.iml | 12 + .idea/vcs.xml | 6 + .idea/workspace.xml | 734 +++++++++++++++++++ basicreplacements.py | 4 + sastalog.txt | 60 ++ test_smallclauses.py | 199 +++++ top3000.py | 58 ++ 10 files changed, 1113 insertions(+) create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/sastadev.iml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml create mode 100644 sastalog.txt create mode 100644 test_smallclauses.py create mode 100644 top3000.py diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..2024689 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,28 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..65531ca --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..8443573 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sastadev.iml b/.idea/sastadev.iml new file mode 100644 index 0000000..7c9d48f --- /dev/null +++ b/.idea/sastadev.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..d3538cf --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,734 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + logging + logfilename + getcorr + uttno + silver + getuu + getu + getlemmas + 'A + allresults + cpa + cpana + cpana2xlsx + allre + ngram1 + realword + ngramreducti + filled + lepel + + + SDLOGGER + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1639990015961 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + file://$PROJECT_DIR$/astaforms.py + 326 + + + file://$PROJECT_DIR$/external_functions.py + 64 + + + file://$PROJECT_DIR$/astaforms.py + 345 + + + file://$PROJECT_DIR$/../../sastadevaux/sd_test.py + 5 + + + file://$PROJECT_DIR$/../../sastadevaux/runsasta.py + 180 + + + file://$PROJECT_DIR$/test_smallclauses.py + 196 + + + file://$PROJECT_DIR$/test_smallclauses.py + 157 + + + file://$PROJECT_DIR$/test_smallclauses.py + 187 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/basicreplacements.py b/basicreplacements.py index f369702..339b327 100644 --- a/basicreplacements.py +++ b/basicreplacements.py @@ -41,6 +41,9 @@ ('effe', 'even', pron, infpron, varpron), ('set', 'zet', pron, infpron, initdev), ('hie', 'hier', pron, pronerr, codared), ('eers', 'eerst', pron, pronerr, codared), + ('era', 'eraf', pron, pronerr, codared), + ('il', 'wil', pron, pronerr, onsetred), + ('tee', 'twee', pron, pronerr, onsetred), ('nie', 'niet', pron, infpron, codared), ('s', 'is', orth, spellerr, apomiss), ('ooke', 'ook', pron, infpron, addschwa), ('it', 'dit', pron, pronerr, onsetred), @@ -67,6 +70,7 @@ ('dis', ['dit', 'is'], pron, infpron, contract), ('das', ['dat', 'is'], pron, infpron, contract), ('tis', ['dit', 'is'], pron, infpron, contract), + ('waas', ['waar', 'is'], pron, infpron, contract), ('is-t-ie', ['is', 'ie'], pron, infpron, t_ie), ('als-t-ie', ['als', 'ie'], pron, infpron, t_ie), ('of-t-ie', ['of', 'ie'], pron, infpron, t_ie), diff --git a/sastalog.txt b/sastalog.txt new file mode 100644 index 0000000..8dde43b --- /dev/null +++ b/sastalog.txt @@ -0,0 +1,60 @@ +2021-12-20T12:04:08 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 1 4.8 4.7 4.7 4.7 5.3 5.0 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:04:08 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 2 4.8 4.7 4.7 4.7 5.3 5.0 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:04:08 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 3 4.8 4.7 4.7 4.7 5.3 5.0 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:04:08 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 4 4.8 4.7 4.7 4.7 5.3 5.0 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:46:15 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 1 4.8 4.7 4.7 51.7 100.0 68.1 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:46:15 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 2 4.8 4.7 4.7 51.7 100.0 68.1 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:46:15 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 3 4.8 4.7 4.7 51.7 100.0 68.1 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T12:46:15 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 4 4.8 4.7 4.7 51.7 100.0 68.1 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T14:35:40 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 1 85.1 83.6 84.4 43.5 84.2 57.4 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T14:35:40 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 2 85.1 83.6 84.4 43.5 84.2 57.4 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T14:35:40 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 3 85.1 83.6 84.4 43.5 84.2 57.4 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T14:35:40 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 4 85.1 83.6 84.4 43.5 84.2 57.4 50.8 100.0 67.3 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T15:12:51 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 1 85.1 83.6 84.4 86.9 97.1 91.7 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T15:12:51 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 2 85.1 83.6 84.4 86.9 97.1 91.7 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T15:12:51 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 3 85.1 83.6 84.4 86.9 97.1 91.7 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-20T15:12:51 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xml 4 85.1 83.6 84.4 86.9 97.1 91.7 88.0 100.0 93.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\stapdata\STAP_02.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\STAP_Index_Current.xlsx +2021-12-21T09:18:24 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\ASTA_sample_01.xml 1 78.7 82.5 80.5 46.8 86.0 60.6 57.0 100.0 72.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\SASTA sample 01.xlsx methods\ASTA Index Current.xlsx +2021-12-21T09:18:24 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\ASTA_sample_01.xml 2 62.0 82.5 70.8 46.8 86.0 60.6 57.0 78.9 66.2 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\SASTA sample 01.xlsx methods\ASTA Index Current.xlsx +2021-12-21T09:18:24 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\ASTA_sample_01.xml 3 78.7 82.5 80.5 46.8 86.0 60.6 57.0 100.0 72.6 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\SASTA sample 01.xlsx methods\ASTA Index Current.xlsx +2021-12-21T09:18:24 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\ASTA_sample_01.xml 4 62.0 82.5 70.8 46.8 86.0 60.6 57.0 78.9 66.2 D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\astadata\ASTA\SASTA sample 01.xlsx methods\ASTA Index Current.xlsx +2021-12-21T09:44:25 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 1 89.6 86.6 88.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:44:25 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 2 85.6 86.6 86.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:44:25 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 3 89.6 86.6 88.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:44:25 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 4 85.6 86.6 86.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:02 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 1 82.1 82.1 82.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:02 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 2 79.0 82.1 80.5 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:02 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 3 82.1 82.1 82.1 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:02 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 4 79.0 82.1 80.5 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:13 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 1 88.3 87.4 87.8 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:13 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 2 85.5 87.4 86.4 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:13 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 3 88.3 87.4 87.8 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:13 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 4 85.5 87.4 86.4 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:22 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 1 90.0 92.1 91.0 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:22 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 2 84.3 92.1 88.0 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:22 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 3 90.0 92.1 91.0 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:22 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 4 83.9 92.1 87.8 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:30 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 1 89.4 87.3 88.3 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:30 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 2 86.2 87.3 86.7 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:30 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 3 89.4 87.3 88.3 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:30 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 4 86.2 87.3 86.7 100.0 0.0 0.0 100.0 0.0 0.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:41 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 1 89.6 86.6 88.1 90.9 100.0 95.2 87.8 100.0 93.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:41 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 2 85.6 86.6 86.1 90.9 100.0 95.2 87.8 95.6 91.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:41 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 3 89.6 86.6 88.1 90.9 100.0 95.2 87.8 100.0 93.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:41 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF.xlsx 4 85.6 86.6 86.1 90.9 100.0 95.2 87.8 95.6 91.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_11_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:49 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 1 82.1 82.1 82.1 84.8 100.0 91.8 84.8 100.0 91.8 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:49 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 2 79.0 82.1 80.5 84.8 100.0 91.8 84.8 96.3 90.2 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:49 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 3 82.1 82.1 82.1 84.8 100.0 91.8 84.8 100.0 91.8 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:49 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF.xlsx 4 79.0 82.1 80.5 84.8 100.0 91.8 84.8 96.3 90.2 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_12_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:58 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 1 88.3 87.4 87.8 89.7 100.0 94.5 88.7 100.0 94.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:58 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 2 85.5 87.4 86.4 89.7 100.0 94.5 88.7 96.8 92.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:58 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 3 88.3 87.4 87.8 89.7 100.0 94.5 88.7 100.0 94.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:48:58 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF.xlsx 4 85.5 87.4 86.4 89.7 100.0 94.5 88.7 96.8 92.5 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_13_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:06 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 1 90.0 92.1 91.0 90.7 100.0 95.1 92.8 100.0 96.3 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:06 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 2 84.3 92.1 88.0 90.7 100.0 95.1 92.8 93.6 93.2 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:06 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 3 90.0 92.1 91.0 90.7 100.0 95.1 92.8 100.0 96.3 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:06 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF.xlsx 4 83.9 92.1 87.8 90.7 100.0 95.1 92.8 93.2 93.0 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_14_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:14 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 1 89.4 87.3 88.3 90.6 100.0 95.1 88.5 100.0 93.9 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:14 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 2 86.2 87.3 86.7 90.6 100.0 95.1 88.5 96.4 92.3 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:14 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 3 89.4 87.3 88.3 90.6 100.0 95.1 88.5 100.0 93.9 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx +2021-12-21T09:49:14 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF.xlsx 4 86.2 87.3 86.7 90.6 100.0 95.1 88.5 96.4 92.3 D:\jodijk\Dropbox\jodijk\myprograms\python\lassy\annoinputtest\ASTA\sample_15_SAF_Corr.xlsx D:\jodijk\Dropbox\jodijk\myprograms\python\sastadev\sastadev\methods\ASTA Index Current.xlsx diff --git a/test_smallclauses.py b/test_smallclauses.py new file mode 100644 index 0000000..a2d59fa --- /dev/null +++ b/test_smallclauses.py @@ -0,0 +1,199 @@ +from treebankfunctions import getstree, getnodeyield, getattval +from dedup import filledpauseslexicon +from top3000 import ishuman, transitive, intransitive, pseudotr, isanimate +from lexicon import known_word + +space = ' ' +biglocvzs = ['achter', 'beneden', 'binnen', 'boven', 'bovenop', 'buiten', 'dichtbij'] + +testbank = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\TARSP\smallclausetest.xml" +schlichtingtreebank = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\schlichtingtreebank\TREEBANK_ID.xml' +mieke06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE06_ID.xml" +mieke08 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\miekeplat_tests\TARSP_MIEKE08_ID.xml" +aurisraw = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\Auris\AURIS_ELISKA_ORIGINAL_ID.xml" +tarsp02 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_02.xml" +tarsp06 = r"D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\tarspdata\Tarsp_06.xml" + + +def realword(node): + result = True + result = result and getattval(node, 'pt') not in ['tsw', 'let'] + result = result and getattval(node, 'lemma') not in ['xx', 'xxx', 'yyy', 'www'] + result = result and getattval(node, 'lemma') not in filledpauseslexicon + + return result + + +def hasgenitive(node): + result = False + return result + +def aanwvnw(node): + result = getattval(node, 'pt') == 'vnw' and getattval(node, 'vwtype') == 'aanw' + return result + + +def n(node): + result = getattval(node, 'pt') == 'n' + return result + + +def getal(node): + result = getattval(node, 'getal') + return result + +def pt(node): + result = getattval(node, 'pt') + return result + +def bg(node): + result = int(getattval(node, 'begin')) + return result + +def tw(node): + result = getattval(node, 'pt') == 'tw' + return result + +def word(node): + result = getattval(node, 'word') + return result + + +def adj(node): + result = getattval(node, 'pt') == 'adj' + return result + + +def inf(node): + result = getattval(node, 'pt') == 'ww' and getattval(node, 'wvorm') == 'inf' + return result + + +def rpronoun(node): + result = getattval(node, 'pt') == 'vnw' and \ + getattval(node, 'lemma') in ['er', 'hier', 'daar', 'ergens', 'overal', 'nergens', 'waar'] + return result + +def bw(node): + result = getattval(node, 'pt') == 'bw' + return result + +def lemma(node): + result = getattval(node, 'lemma') + return result + +def predadv(node): + result = locadv(node) + result = result or (bw(node) and lemma(node) in ['niet', 'mee', 'weg']) + return result + +def locadv(node): + result = getattval(node, 'pt') in ['bw', 'vz'] + frame = getattval(node, 'frame') + result = result and ('loc' in frame or 'er_adverb' in frame) + result = result or rpronoun(node) + return result + +def biglocvz(node): + result = getattval(node, 'lemma') in biglocvzs + return result + +def getleavestr(leaves): + leaveseq = ['{}:{}:{}:{}'.format(getattval(leave, 'end'), getattval(leave, 'word'), getattval(leave, 'lemma'), + getattval(leave, 'pt')) for leave + in leaves] + leavestr = space.join(leaveseq) + return leavestr + +def knownnoun(node): + word = getattval(node, 'word') + lemma = getattval(node, 'lemma') + postag = pt(node) + result = postag == 'n' and (known_word(word) or known_word(lemma)) + return result + +def smallclauses(leaves, reducedleaves): + resultlist = [] + # aanwvnw or n + locbw + if len(reducedleaves) <= 3: + first = leaves[0] + second = leaves[1] + if len(reducedleaves) == 3: + third = leaves[0] + + if len(reducedleaves) == 2: + #fword = word(first) + #sword = word(second) + if (aanwvnw(first) or knownnoun(first)) and predadv(second): + fpos = int(getattval(first, 'begin')) + insertword = 'moet' if getal(first) != 'mv' else 'moeten' + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + [insertword] + [word(lv) for lv in leaves if bg(lv) > fpos] + elif (aanwvnw(second) or knownnoun(second) or tw(second)) and predadv(first): + fpos = int(getattval(first, 'begin')) + insertword = 'moet' if getal(second) != 'mv' else 'moeten' + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + [insertword] + [word(lv) for lv in leaves if + bg(lv) > fpos] + elif (aanwvnw(first) or knownnoun(first)) and adj(second): + fpos = int(getattval(first, 'begin')) + insertword = 'is' if getal(first) != 'mv' else 'zijn' + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + [insertword] + [word(lv) for lv in leaves if + bg(lv) > fpos] + elif (aanwvnw(second) or knownnoun(second) or tw(second)) and biglocvz(first): + fpos = int(getattval(first, 'begin')) + insertword = 'is' if getal(first) != 'mv' else 'zijn' + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + [insertword] + [word(lv) for lv in leaves if + bg(lv) > fpos] + elif knownnoun(first) and knownnoun(second) and not(lemma(first) == lemma(second)): + if hasgenitive(first): + pass + else: + fpos = int(getattval(first, 'begin')) + insertword = 'is' if getal(first) != 'mv' else 'zijn' + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + [insertword] + \ + [word(lv) for lv in leaves if bg(lv) > fpos] + + elif (aanwvnw(first) or knownnoun(first)) and inf(second): + if intransitive(second): + firstsubject = True + elif transitive(second) and ishuman(first): + firstsubject = True + elif pseudotr(second) and (ishuman(first) or isanimate(first)): + firstsubject = True + else: + firstsubject = False + if firstsubject: + insertwords = ['wil' if getal(first) != 'mv' else 'willen'] + fpos = int(getattval(first, 'begin')) + else: + insertwords = ['ik', 'wil'] + fpos = -1 + resultlist = [word(lv) for lv in leaves if bg(lv) <= fpos] + insertwords + [word(lv) for lv in leaves if + bg(lv) > fpos] + + return resultlist + + +def main(): + smalltest = False + if smalltest: + fullnames = [testbank] + else: + fullnames = [schlichtingtreebank, mieke06, mieke08, aurisraw, tarsp02, tarsp06] + for infullname in fullnames: + print(infullname) + fulltreebank = getstree(infullname) + if fulltreebank is not None: + treebank = fulltreebank.getroot() + for tree in treebank: + leaves = getnodeyield(tree) + reducedleaves = [leave for leave in leaves if realword(leave)] + + if len(reducedleaves) > 1 and len(reducedleaves) <= 3: + resultlist = smallclauses(leaves, reducedleaves) + if resultlist != []: + print('input: ', getleavestr(leaves), '/', getleavestr(reducedleaves)) + print('result: ', space.join(resultlist)) + + +if __name__ == '__main__': + main() diff --git a/top3000.py b/top3000.py new file mode 100644 index 0000000..ce7105b --- /dev/null +++ b/top3000.py @@ -0,0 +1,58 @@ +from xlsx import getxlsxdata +from treebankfunctions import getattval +from namepartlexicon import namepart_isa_namepart + +def ishuman(node): + lemma = getattval(node, 'lemma') + pt = getattval(node, 'pt') + vwtype = getattval(node, 'vwtype') + result = (lemma, pt ) in semlexicon and 'human' in semlexicon[(lemma, pt)] + result = result or vwtype == 'pers' + result = result or namepart_isa_namepart(lemma) + return result + +def isanimate(node): + lemma = getattval(node, 'lemma') + pt = getattval(node, 'pt') + result = (lemma, pt ) in semlexicon and 'animate' in semlexicon[(lemma, pt)] + return result + + +def transitivity(node, tr): + lemma = getattval(node, 'lemma') + pt = getattval(node, 'pt') + result = (lemma, pt ) in semlexicon and tr in trlexicon[(lemma, pt)] + return result + +def transitive(node): + return transitivity(node, 'tr') + +def pseudotr(node): + return transitivity(node, 'tr/intr') + + +def intransitive(node): + return transitivity(node, 'intr') + +semicolon = ';' + +filename = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\woordenlijsten\Woordenlijsten Current.xlsx' + + +lexiconheader, lexicondata = getxlsxdata(filename) + +semlexicon = {} +for row in lexicondata: + lemma = row[1] + pt = row[5] + rawsems = row[6].split(semicolon) + sems = [el.strip() for el in rawsems] + semlexicon[(lemma, pt)] = sems + +trlexicon = {} +for row in lexicondata: + lemma = row[1] + pt = row[5] + rawtrs = row[8].split(semicolon) + trs = [el.strip() for el in rawtrs] + trlexicon[(lemma, pt)] = trs From 770cecd6581c708916c90cebb443aec8dcc0e336 Mon Sep 17 00:00:00 2001 From: Odijk Date: Thu, 6 Jan 2022 10:31:30 +0100 Subject: [PATCH 02/11] smallclauses V2 --- .idea/workspace.xml | 171 +++++++++++++++++++++---------------------- test_smallclauses.py | 61 +++++++++++++-- top3000.py | 16 ++-- 3 files changed, 148 insertions(+), 100 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index d3538cf..498d7ae 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,9 +2,8 @@ - - - + + - +