diff --git a/macros/sastamacros1.txt b/macros/sastamacros1.txt index a5b665a..5034941 100644 --- a/macros/sastamacros1.txt +++ b/macros/sastamacros1.txt @@ -37,9 +37,9 @@ JO_kijken_naar = """ parent::node[@cat="pp" and robusttopicdrop = """(@cat="sv1" and ../node[@lemma="."])""" Tarsp_hww = """ - (@lemma="kunnen" or + (@lemma = "kunnen" or @lemma = "moeten" or - @lemma= "hoeven" or + @lemma = "hoeven" or @lemma = "blijven" or @lemma = "willen" or @lemma = "zullen" or diff --git a/smallclauses.py b/smallclauses.py index c6655d8..d44cbaa 100644 --- a/smallclauses.py +++ b/smallclauses.py @@ -15,6 +15,8 @@ longvowels = ['a', 'é', 'i', 'o', 'u', 'y'] vowels = ['a', 'e', 'i', 'o', 'u'] +uniquelynominativeperspros = ['ik', 'jij', 'hij', 'zij', 'wij', 'ikke', "'k", "k", "ie", "we"] + def makegen(lemma): if lemma is None or len(lemma) < 2: @@ -92,6 +94,11 @@ def perspro(node): result = pt == 'vnw' and vwtype == 'pers' return result +def nomperspro(node): + lemma = getattval(node, 'lemma') + result = perspro(node) and lemma in uniquelynominativeperspros + return result + def inf(node): result = getattval(node, 'pt') == 'ww' and getattval(node, 'wvorm') == 'inf' return result @@ -225,7 +232,8 @@ def smallclauses(tokensmd, tree): inserttokens = [Token('moet' if getal(first) != 'mv' else 'moeten', fpos, subpos=5)] resultlist = mktokenlist(tokens, fpos, inserttokens) metadata += mkinsertmeta(inserttokens, resultlist) - elif (aanwvnw(second) or knownnoun(second) or perspro(second) or tw(second)) and predadv(first): + #elif (aanwvnw(second) or knownnoun(second) or perspro(second) or tw(second)) and predadv(first): + elif nomperspro(second) and predadv(first): fpos = int(getattval(first, 'begin')) inserttokens = [Token('moet' if getal(second) != 'mv' else 'moeten', fpos, subpos=5)] resultlist = mktokenlist(tokens, fpos, inserttokens) diff --git a/top3000.py b/top3000.py index 6e13966..7bf181d 100644 --- a/top3000.py +++ b/top3000.py @@ -1,6 +1,8 @@ from xlsx import getxlsxdata 
from treebankfunctions import getattval from namepartlexicon import namepart_isa_namepart +from config import SD_DIR +import os def ishuman(node): lemma = getattval(node, 'lemma') @@ -36,7 +38,7 @@ def intransitive(node): semicolon = ';' -filename = r'D:\jodijk\Dropbox\jodijk\Utrecht\Projects\CLARIAH CORE\WP3\VKL\woordenlijsten\Woordenlijsten Current.xlsx' +filename = os.path.join(SD_DIR, 'top3000', 'Woordenlijsten Current.xlsx') lexiconheader, lexicondata = getxlsxdata(filename) diff --git a/top3000/Woordenlijsten Current.xlsx b/top3000/Woordenlijsten Current.xlsx new file mode 100644 index 0000000..7b7ac56 Binary files /dev/null and b/top3000/Woordenlijsten Current.xlsx differ