From 461cbb99d8c522f601da9663e3c9fa65be6f317f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 3 Jan 2017 18:21:29 +0100 Subject: [PATCH] Revert "Reorganise English tokenizer exceptions (as discussed in #718)" This reverts commit b19cfcc14482ccabc63da01811927b0e96650c24. --- spacy/en/tokenizer_exceptions.py | 2197 ++++++++++++++++++++++++------ 1 file changed, 1781 insertions(+), 416 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 4df3fe53540..398ae486bdd 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -5,279 +5,1791 @@ from ..language_data import PRON_LEMMA -EXC = {} - -EXCLUDE_EXC = ["Ill", "ill", "Hell", "hell", "Well", "well", "Whore", "whore"] - - -# Pronouns - -for pron in ["i"]: - for orth in [pron, pron.title()]: - EXC[orth + "'m"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1} - ] - - EXC[pron + "m"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", TAG: "VBP", "tenspect": 1, "number": 1 } - ] - - EXC[pron + "'ma"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"} - ] - - EXC[pron + "ma"] = [ - {ORTH: pron, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "m", LEMMA: "be", NORM: "am"}, - {ORTH: "a", LEMMA: "going to", NORM: "gonna"} - ] - - -for pron in ["i", "you", "he", "she", "it", "we", "they"]: - for orth in [pron, pron.title()]: - EXC[orth + "'ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ] - - EXC[orth + "ll"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ] - - EXC[orth + "'ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "ve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "'ll've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "llve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "'d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"} - ] - - EXC[orth + "d"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"} - ] - - EXC[orth + "'d've"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "dve"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - -for pron in ["you", "we", "they"]: - for orth in [pron, pron.title()]: - EXC[orth + "'re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ] - - EXC[orth + "re"] = [ - {ORTH: orth, LEMMA: PRON_LEMMA, TAG: "PRP"}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ] - - -# W-words, relative pronouns, prepositions etc. - -for word in ["who", "what", "when", "where", "why", "how", "there", "that"]: - for orth in [word, word.title()]: - EXC[orth + "'s"] = [ - {ORTH: orth}, - {ORTH: "'s"} - ] - - EXC[orth + "s"] = [ - {ORTH: orth}, - {ORTH: "s"} - ] - - EXC[orth + "'ll"] = [ - {ORTH: orth}, - {ORTH: "'ll", LEMMA: "will", TAG: "MD"} - ] - - EXC[orth + "ll"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"} - ] - - EXC[orth + "'ll've"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "llve"] = [ - {ORTH: orth}, - {ORTH: "ll", LEMMA: "will", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "'re"] = [ - {ORTH: orth}, - {ORTH: "'re", LEMMA: "be", NORM: "are"} - ] - - EXC[orth + "re"] = [ - {ORTH: orth}, - {ORTH: "re", LEMMA: "be", NORM: "are"} - ] - - EXC[orth + "'ve"] = [ - {ORTH: orth}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "ve"] = [ - {ORTH: orth}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "'d"] = [ - {ORTH: orth}, - {ORTH: "'d"} - ] - - EXC[orth + "d"] = [ - {ORTH: orth}, - {ORTH: "d"} - ] - - EXC[orth + "'d've"] = [ - {ORTH: orth}, - {ORTH: "'d", LEMMA: "would", TAG: "MD"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[orth + "dve"] = [ - {ORTH: orth}, - {ORTH: "d", LEMMA: "would", TAG: "MD"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - -# Verbs - -for verb_data in [ - {ORTH: "ca", LEMMA: "can", TAG: "MD"}, - {ORTH: "could", TAG: "MD"}, - {ORTH: "do", LEMMA: "do"}, - {ORTH: "does", LEMMA: "do"}, - {ORTH: "did", LEMMA: "do", TAG: "VBD"}, - {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "might"}, - {ORTH: "must"}, - {ORTH: "need"}, - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "should"}, - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "would"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() - - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "n't"] = [ - dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ] - - EXC[data[ORTH] + "nt"] = [ - dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ] - - EXC[data[ORTH] + "n't've"] = [ - {ORTH: "n't", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[data[ORTH] + "ntve"] = [ - {ORTH: "nt", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - -for verb_data in [ - {ORTH: "could", TAG: "MD"}, - {ORTH: "might"}, - {ORTH: "must"}, - {ORTH: "should"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() - - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "'ve"] = [ - dict(data), - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ] - - EXC[data[ORTH] + "ve"] = [ - dict(data), - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ] - - -for verb_data in [ - {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, - {ORTH: "are", LEMMA: "be", TAG: "VBP", "number": 2}, - {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, - {ORTH: "was", LEMMA: "be"}, - {ORTH: "were", LEMMA: "be"} -]: - verb_data_tc = dict(verb_data) - verb_data_tc[ORTH] = verb_data_tc[ORTH].title() - - for data in [verb_data, verb_data_tc]: - EXC[data[ORTH] + "n't"] = [ - dict(data), - {ORTH: "n't", LEMMA: "not", TAG: "RB"} - ] - - EXC[data[ORTH] + "nt"] = [ - dict(data), - {ORTH: "nt", LEMMA: "not", TAG: "RB"} - ] - - -# Abbreviations - -ABBREVIATIONS = { +TOKENIZER_EXCEPTIONS = { + "and/or": [ + {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} + ], + + "Theydve": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "shouldn't've": [ + {ORTH: "should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "There'll": [ + {ORTH: "There"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "howll": [ + {ORTH: "how"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "Hadn't've": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "who'll": [ + {ORTH: "who"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "aint": [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + " ": [ + {TAG: "SP", ORTH: " "} + ], + + "Shouldnt": [ + {ORTH: "Should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "when's": [ + {ORTH: "when"}, + {ORTH: "'s", LEMMA: "be"} + ], + + "Didnt": [ + {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "itll": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "Who're": [ + {ORTH: "Who"}, + {ORTH: "'re", LEMMA: "be"} + ], + + "Ain't": [ + {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Can't": [ + {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Whyre": [ + {ORTH: "Why"}, + {ORTH: "re"} + ], + + "Aren't": [ + {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Neednt": [ + {ORTH: "Need"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "should've": [ + {ORTH: "should"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "shouldn't": [ + {ORTH: "should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Idve": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "weve": [ + {ORTH: "we"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Ive": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "they'd": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Youdve": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "theyve": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Weren't": [ + {ORTH: "Were"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "werent": [ + {ORTH: "were"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whyre": [ + {ORTH: "why"}, + {ORTH: "re"} + ], + + "I'm": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], + + "She'd've": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "not've": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "we'll": [ + {ORTH: "we"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Don't": [ + {ORTH: "Do", LEMMA: "do"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Whyll": [ + {ORTH: "Why"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "they've": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "wasn't": [ + {ORTH: "was"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "could've": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "what've": [ + {ORTH: "what"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "havent": [ + {ORTH: "have", TAG: "VB"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Who've": [ + {ORTH: "Who"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Shan't": [ + {ORTH: "Sha", LEMMA: "shall"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "i'll": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "you'd": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "whens": [ + {ORTH: "when"}, + {ORTH: "s", LEMMA: "be"} + ], + + "whys": [ + {ORTH: "why"}, + {ORTH: "s"} + ], + + "Whereve": [ + {ORTH: "Where"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "\u00a0": [ + {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} + ], + + "there'd": [ + {ORTH: "there"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "hadn't've": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "whatll": [ + {ORTH: "what"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "wouldn't've": [ + {ORTH: "would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "there's": [ + {ORTH: "there"}, + {ORTH: "'s"} + ], + + "Who'll": [ + {ORTH: "Who"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "youll": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "wouldve": [ + {ORTH: "would"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldnt": [ + {ORTH: "Would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Thered": [ + {ORTH: "There"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Youre": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be"} + ], + + "Couldn't've": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "who're": [ + {ORTH: "who"}, + {ORTH: "'re", LEMMA: "be"} + ], + + "Whys": [ + {ORTH: "Why"}, + {ORTH: "s"} + ], + + "mightn't've": [ + {ORTH: "might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wholl": [ + {ORTH: "Who"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "hadn't": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Havent": [ + {ORTH: "Have", TAG: "VB"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Whatve": [ + {ORTH: "What"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Thats": [ + {ORTH: "That"}, + {ORTH: "s"} + ], + + "Howll": [ + {ORTH: "How"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "wouldn't": [ + {ORTH: "would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "You'll": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Cant": [ + {ORTH: "Ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "i'd": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "weren't": [ + {ORTH: "were"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "would've": [ + {ORTH: "would"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "i'm": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], + + "why'll": [ + {ORTH: "why"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "we'd've": [ + {ORTH: "we"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Shouldve": [ + {ORTH: "Should"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "can't": [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "thats": [ + {ORTH: "that"}, + {ORTH: "s"} + ], + + "Hes": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "Needn't": [ + {ORTH: "Need"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "It's": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "Why're": [ + {ORTH: "Why"}, + {ORTH: "'re", LEMMA: "be"} + ], + + "Hed": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + "Mt.": [ {ORTH: "Mt.", LEMMA: "Mount"} ], + "couldn't": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "What've": [ + {ORTH: "What"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "It'd": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "theydve": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "aren't": [ + {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Mightn't": [ + {ORTH: "Might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "'S": [ + {ORTH: "'S", LEMMA: "'s"} + ], + + "I've": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whered": [ + {ORTH: "Where"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Itdve": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "I'ma": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ma"} + ], + + "whos": [ + {ORTH: "who"}, + {ORTH: "s"} + ], + + "They'd": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "What'll": [ + {ORTH: "What"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "You've": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mustve": [ + {ORTH: "Must"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "whod": [ + {ORTH: "who"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "mightntve": [ + {ORTH: "might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "I'd've": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Must've": [ + {ORTH: "Must"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "it'd": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "what're": [ + {ORTH: "what"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "Wasn't": [ + {ORTH: "Was"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "what's": [ + {ORTH: "what"}, + {ORTH: "'s"} + ], + + "he'd've": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "She'd": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "shedve": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "ain't": [ + {ORTH: "ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "She's": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "i'd've": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "We'd've": [ + {ORTH: "We"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "must've": [ + {ORTH: "must"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "That's": [ + {ORTH: "That"}, + {ORTH: "'s"} + ], + + "whatre": [ + {ORTH: "what"}, + {ORTH: "re"} + ], + + "you'd've": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Dont": [ + {ORTH: "Do", LEMMA: "do"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "thered": [ + {ORTH: "there"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Youd": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "couldn't've": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whens": [ + {ORTH: "When"}, + {ORTH: "s"} + ], + + "Isnt": [ + {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "mightve": [ + {ORTH: "might"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "didnt": [ + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "ive": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "It'd've": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "\t": [ + {ORTH: "\t", TAG: "SP"} + ], + + "Itll": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "didn't": [ + {ORTH: "did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "cant": [ + {ORTH: "ca", LEMMA: "can", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "im": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be"} + ], + + "they'd've": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Hadntve": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Weve": [ + {ORTH: "We"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Mightnt": [ + {ORTH: "Might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "youdve": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Shedve": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "theyd": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Cannot": [ + {ORTH: "Can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Hadn't": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "What're": [ + {ORTH: "What"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "He'll": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "wholl": [ + {ORTH: "who"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "They're": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "shouldnt": [ + {ORTH: "should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "\n": [ + {ORTH: "\n", TAG: "SP"} + ], + + "whered": [ + {ORTH: "where"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "youve": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "notve": [ + {ORTH: "not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldve": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "mustve": [ + {ORTH: "must"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Youve": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "therell": [ + {ORTH: "there"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "might've": [ + {ORTH: "might"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mustn't": [ + {ORTH: "Must"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "wheres": [ + {ORTH: "where"}, + {ORTH: "s"} + ], + + "they're": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "idve": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "hows": [ + {ORTH: "how"}, + {ORTH: "s"} + ], + + "youre": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "Didn't": [ + {ORTH: "Did", LEMMA: "do", TAG: "VBD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Couldve": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "cannot": [ + {ORTH: "can", LEMMA: "can", TAG: "MD"}, + {ORTH: "not", LEMMA: "not", TAG: "RB"} + ], + + "Im": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "m", TAG: "VBP", "tenspect": 1, "number": 1, LEMMA: "be", NORM: "am"} + ], + + "howd": [ + {ORTH: "how"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "you've": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "You're": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "she'll": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Theyll": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "don't": [ + {ORTH: "do", LEMMA: "do"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "itd": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Hedve": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "isnt": [ + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "won't": [ + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "We're": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "\u2018S": [ + {ORTH: "\u2018S", LEMMA: "'s"} + ], + + "\u2018s": [ + {ORTH: "\u2018s", LEMMA: "'s"} + ], + + "dont": [ + {ORTH: "do", LEMMA: "do"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "ima": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ma"} + ], + + "Let's": [ + {ORTH: "Let"}, + {ORTH: "'s", LEMMA: "us"} + ], + + "he's": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "we've": [ + {ORTH: "we"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "What's": [ + {ORTH: "What"}, + {ORTH: "'s"} + ], + + "Who's": [ + {ORTH: "Who"}, + {ORTH: "'s"} + ], + + "hedve": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "he'd": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "When's": [ + {ORTH: "When"}, + {ORTH: "'s"} + ], + + "Mightn't've": [ + {ORTH: "Might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "We've": [ + {ORTH: "We"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Couldntve": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Who'd": [ + {ORTH: "Who"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "haven't": [ + {ORTH: "have", TAG: "VB"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "arent": [ + {ORTH: "are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "You'd've": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldn't": [ + {ORTH: "Would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "who's": [ + {ORTH: "who"}, + {ORTH: "'s"} + ], + + "Mightve": [ + {ORTH: "Might"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Theredve": [ + {ORTH: "There"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "theredve": [ + {ORTH: "there"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "who'd": [ + {ORTH: "who"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Where's": [ + {ORTH: "Where"}, + {ORTH: "'s"} + ], + + "wont": [ + {ORTH: "wo", LEMMA: "will"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "she'd've": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Should've": [ + {ORTH: "Should"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "theyre": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re"} + ], + + "Wouldntve": [ + {ORTH: "Would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Where've": [ + {ORTH: "Where"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mustn't": [ + {ORTH: "must"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "isn't": [ + {ORTH: "is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Aint": [ + {ORTH: "Ai", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "why's": [ + {ORTH: "why"}, + {ORTH: "'s"} + ], + + "There'd": [ + {ORTH: "There"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "They'll": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "how'll": [ + {ORTH: "how"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Wedve": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldntve": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "There's": [ + {ORTH: "There"}, + {ORTH: "'s"} + ], + + "we'd": [ + {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Whod": [ + {ORTH: "Who"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "whatve": [ + {ORTH: "what"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wouldve": [ + {ORTH: "Would"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "there'll": [ + {ORTH: "there"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "needn't": [ + {ORTH: "need"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "shouldntve": [ + {ORTH: "should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "why're": [ + {ORTH: "why"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "Doesnt": [ + {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whereve": [ + {ORTH: "where"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "they'll": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "I'd": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Might've": [ + {ORTH: "Might"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mightnt": [ + {ORTH: "might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Not've": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "mightn't": [ + {ORTH: "might"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "you're": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM: "are"} + ], + + "They've": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "what'll": [ + {ORTH: "what"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Could've": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Would've": [ + {ORTH: "Would"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Isn't": [ + {ORTH: "Is", LEMMA: "be", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "let's": [ + {ORTH: "let"}, + {ORTH: "'s", LEMMA: "us"} + ], + + "She'll": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "You'd": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "wouldnt": [ + {ORTH: "would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Why'll": [ + {ORTH: "Why"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Where'd": [ + {ORTH: "Where"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Theyre": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "Won't": [ + {ORTH: "Wo", LEMMA: "will"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Couldn't": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "it's": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "it'll": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "They'd've": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Ima": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ma"} + ], + + "gonna": [ + {ORTH: "gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "Gonna": [ + {ORTH: "Gon", LEMMA: "go", NORM: "going"}, + {ORTH: "na", LEMMA: "to"} + ], + + "whats": [ + {ORTH: "what"}, + {ORTH: "s"} + ], + + "How's": [ + {ORTH: "How"}, + {ORTH: "'s"} + ], + + "Shouldntve": [ + {ORTH: "Should"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "youd": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "Whatll": [ + {ORTH: "What"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "Wouldn't've": [ + {ORTH: "Would"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "How'd": [ + {ORTH: "How"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "doesnt": [ + {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Shouldn't": [ + {ORTH: "Should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "He'd've": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Mightntve": [ + {ORTH: "Might"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "couldnt": [ + {ORTH: "could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Haven't": [ + {ORTH: "Have", TAG: "VB"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "doesn't": [ + {ORTH: "does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Hasn't": [ + {ORTH: "Has"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "how's": [ + {ORTH: "how"}, + {ORTH: "'s"} + ], + + "hes": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "he'll": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "hed": [ + {ORTH: "he", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "how'd": [ + {ORTH: "how"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "we're": [ + {ORTH: "we", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'re", LEMMA: "be", NORM :"are"} + ], + + "Hadnt": [ + {ORTH: "Had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Shant": [ + {ORTH: "Sha", LEMMA: "shall"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Theyve": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Hows": [ + {ORTH: "How"}, + {ORTH: "s"} + ], + + "We'll": [ + {ORTH: "We"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "i've": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whove": [ + {ORTH: "Who"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "i'ma": [ + {ORTH: "i", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ma"} + ], + + "Howd": [ + {ORTH: "How"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "hadnt": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "shant": [ + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "There'd've": [ + {ORTH: "There"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "I'll": [ + {ORTH: "I", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Why's": [ + {ORTH: "Why"}, + {ORTH: "'s"} + ], + + "Shouldn't've": [ + {ORTH: "Should"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Wasnt": [ + {ORTH: "Was"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whove": [ + {ORTH: "who"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "hasn't": [ + {ORTH: "has"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "wouldntve": [ + {ORTH: "would"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Wheres": [ + {ORTH: "Where"}, + {ORTH: "s"} + ], + + "How'll": [ + {ORTH: "How"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "there'd've": [ + {ORTH: "there"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whos": [ + {ORTH: "Who"}, + {ORTH: "s"} + ], + + "shes": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "Doesn't": [ + {ORTH: "Does", LEMMA: "do", TAG: "VBZ"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Arent": [ + {ORTH: "Are", TAG: "VBP", "number": 2, LEMMA: "be"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Hasnt": [ + {ORTH: "Has", LEMMA: "have"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "He's": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "wasnt": [ + {ORTH: "was"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "whyll": [ + {ORTH: "why"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "mustnt": [ + {ORTH: "must"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "He'd": [ + {ORTH: "He", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Shes": [ + {ORTH: "She", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "s"} + ], + + "where've": [ + {ORTH: "where"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Youll": [ + {ORTH: "You", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "hasnt": [ + {ORTH: "has"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "theyll": [ + {ORTH: "they", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "it'd've": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "itdve": [ + {ORTH: "it", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "wedve": [ + {ORTH: "we"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "Werent": [ + {ORTH: "Were"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Therell": [ + {ORTH: "There"}, + {ORTH: "ll", LEMMA: "will", TAG: "MD"} + ], + + "shan't": [ + {ORTH: "sha", LEMMA: "shall"}, + {ORTH: "n't", LEMMA: "not", TAG: "RB"} + ], + + "Wont": [ + {ORTH: "Wo", LEMMA: "will"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "hadntve": [ + {ORTH: "had", LEMMA: "have", TAG: "VBD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "who've": [ + {ORTH: "who"}, + {ORTH: "'ve", LEMMA: "have", TAG: "VB"} + ], + + "Whatre": [ + {ORTH: "What"}, + {ORTH: "re", LEMMA: "be", NORM: "are"} + ], + + "'s": [ + {ORTH: "'s", LEMMA: "'s"} + ], + + "where'd": [ + {ORTH: "where"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "shouldve": [ + {ORTH: "should"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "where's": [ + {ORTH: "where"}, + {ORTH: "'s"} + ], + + "neednt": [ + {ORTH: "need"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "It'll": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "We'd": [ + {ORTH: "We", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Whats": [ + {ORTH: "What"}, + {ORTH: "s"} + ], + + "\u2014": [ + {ORTH: "\u2014", TAG: ":", LEMMA: "--"} + ], + + "Itd": [ + {ORTH: "It", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "she'd": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'d", LEMMA: "would", TAG: "MD"} + ], + + "Mustnt": [ + {ORTH: "Must"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "Notve": [ + {ORTH: "Not", LEMMA: "not", TAG: "RB"}, + {ORTH: "ve", LEMMA: "have", TAG: "VB"} + ], + + "you'll": [ + {ORTH: "you", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'ll", LEMMA: "will", TAG: "MD"} + ], + + "Theyd": [ + {ORTH: "They", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "d", LEMMA: "would", TAG: "MD"} + ], + + "she's": [ + {ORTH: "she", LEMMA: PRON_LEMMA, TAG: "PRP"}, + {ORTH: "'s"} + ], + + "Couldnt": [ + {ORTH: "Could", TAG: "MD"}, + {ORTH: "nt", LEMMA: "not", TAG: "RB"} + ], + + "that's": [ + {ORTH: "that"}, + {ORTH: "'s"} + ], + + "'em": [ + {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} + ], + + "ol'": [ + {ORTH: "ol'", LEMMA: "old"} + ], + "Ak.": [ {ORTH: "Ak.", LEMMA: "Alaska"} ], @@ -488,153 +2000,6 @@ } -# Other exceptions - -OTHER = { - " ": [ - {ORTH: " ", TAG: "SP"} - ], - - "\u00a0": [ - {ORTH: "\u00a0", TAG: "SP", LEMMA: " "} - ], - - "and/or": [ - {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"} - ], - - "'cause": [ - {ORTH: "'cause", LEMMA: "because"} - ], - - "y'all": [ - {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "yall": [ - {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"}, - {ORTH: "all"} - ], - - "'em": [ - {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "em": [ - {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"} - ], - - "nothin'": [ - {ORTH: "nothin'", LEMMA: "nothing"} - ], - - "nuthin'": [ - {ORTH: "nuthin'", LEMMA: "nothing"} - ], - - "'nuff": [ - {ORTH: "'nuff", LEMMA: "enough"} - ], - - "ol'": [ - {ORTH: "ol'", LEMMA: "old"} - ], - - "not've": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "notve": [ - {ORTH: "not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "Not've": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "'ve", LEMMA: "have", TAG: "VB"} - ], - - "Notve": [ - {ORTH: "Not", LEMMA: "not", TAG: "RB"}, - {ORTH: "ve", LEMMA: "have", TAG: "VB"} - ], - - "cannot": [ - {ORTH: "can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "Cannot": [ - {ORTH: "Can", LEMMA: "can", TAG: "MD"}, - {ORTH: "not", LEMMA: "not", TAG: "RB"} - ], - - "gonna": [ - {ORTH: "gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "Gonna": [ - {ORTH: "Gon", LEMMA: "go", NORM: "going"}, - {ORTH: "na", LEMMA: "to"} - ], - - "let's": [ - {ORTH: "let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "Let's": [ - {ORTH: "Let"}, - {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"} - ], - - "'S": [ - {ORTH: "'S", LEMMA: "'s"} - ], - - "'s": [ - {ORTH: "'s", LEMMA: "'s"} - ], - - "\u2018S": [ - {ORTH: "\u2018S", LEMMA: "'s"} - ], - - "\u2018s": [ - {ORTH: "\u2018s", LEMMA: "'s"} - ], - - "\u2014": [ - {ORTH: "\u2014", TAG: ":", LEMMA: "--"} - ], - - "\n": [ - {ORTH: "\n", TAG: "SP"} - ], - - "\t": [ - {ORTH: "\t", TAG: "SP"} - ] -} - - -TOKENIZER_EXCEPTIONS = dict(EXC) -TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) -TOKENIZER_EXCEPTIONS.update(OTHER) - - -# Remove EXCLUDE_EXC if in exceptions - -for string in EXCLUDE_EXC: - if string in TOKENIZER_EXCEPTIONS: - TOKENIZER_EXCEPTIONS.pop(string) - - -# Abbreviations with only one ORTH token - ORTH_ONLY = [ "''", "\")",