diff --git a/README.md b/README.md index f6071a3..fdfd107 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Author: Richard Paul Hudson, msg systems a - [6.3 Version 1.1.0](#version-110) - [6.4 Version 1.1.1](#version-111) - [6.5 Version 1.1.2](#version-112) + - [6.6 Version 1.1.3](#version-113) - [7. Open issues/requests for assistance](#open-issues) @@ -564,6 +565,12 @@ The initial open-source version. - Added support for French, which was kindly supplied by [Pantalaymon](https://github.com/Pantalaymon) + +##### 6.6 Version 1.1.3 + +- Updated French rules to new version, again supplied by [Pantalaymon](https://github.com/Pantalaymon) +- Fixed an endless-loop problem in `language_independent_is_anaphoric_pair()` + ### 7. Open issues / requests for assistance diff --git a/coreferee/lang/fr/data/blacklisted_nouns.dat b/coreferee/lang/fr/data/blacklisted_nouns.dat new file mode 100644 index 0000000..ea3d268 --- /dev/null +++ b/coreferee/lang/fr/data/blacklisted_nouns.dat @@ -0,0 +1,263 @@ +# Fractions +# Informatique +# metrics units +# non International System metrics +A +A +A +A/m +C +C/kg +Eb +F +Flux +H +Hz +J +K +K +L +M +MHz +MJ +MPa +MV +MW +N +Nm +Pa +S +T +T +V +VA +W +Wb +a +acre +ampère +ampère +ampère +ampère par mètre +année +are +arpent +baril +baud +bel +bit +brasse +byte +cL +candela +carat +cd +centigramme +centilitre +centimorgan +centimètre +centimètre carré +centimètre cube +cg +cm +cm2 +cm3 +coudée +coulomb +coulomb par kilogramme +curie +d +dB +dL +degré +degré celsius +dg +dm +dm2 +dm3 +décibel +décigramme +décilitre +décimètre +décimètre carré +décimètre cube +encablure +exabit +farad +g +galopin +gigabit +gigabyte +gr +grade +grain +gramme +h +ha +hectare +hectare +hectolitre +hectomètre carré +henry +hertz +heure +hl +hm2 +joule +jour +kA +kHz +kJ +kV +kW +kapok +kelvin +kelvin +kg +kg +kg/m3 +kiloampère +kilobit +kilogramme +kilogramme +kilogramme par mètre cube +kilohertz +kilojoule +kilomole +kilomètre carré +kilomètre cube 
+kilomètre par heure +kilovolt +kilowatt +km/h +km2 +km3 +kmol +lieue +litre +m +m +m/s +m/s2 +m2 +m3 +mA +mH +mL +mV +magnétique +mg +microampère +microfarad +microgramme +microhenry +microhm +micromole +micromètre +microseconde +microvolt +microwatt +mille +milliampère +milligramme +millihenry +millilitre +millimole +millimètre +millimètre carré +milliseconde +millivolt +min +minute +mm +mm2 +mmol +mol +mol +mol par mètre cube +mol/m3 +mole +mole +morgan +ms +mètre +mètre +mètre carré +mètre cube +mètre par seconde +mètre par seconde carré +mégabit +mégabyte +mégahertz +mégajoule +mégapascal +mégavolt +mégawatt +mégohm +nF +nanofarad +newton +newton-mètre +noeud +octet +ohm +pF +pas +pascal +paume +pce +perche +picofarad +picotin +pied +pixel +pouce +ppb +ppcm +ppm +psu +quintal +r +r/min +r/s +rad +rad/s +radian +radian +radian par seconde +s +s +seconde +seconde +siemens +t +tesla +tonne +tonne +tonneau +tour +tour par minute +tour par seconde +térabit +téraohm +verge +verste +volt +voltampère +watt +weber +yard +yottabit +zettabitzolotnik +µ +˚ +˚C +μA +μF +μH +μV +μW +μg +μm +μm +μs diff --git a/coreferee/lang/fr/data/blacklisted_phrases.dat b/coreferee/lang/fr/data/blacklisted_phrases.dat index f06293d..14b6632 100644 --- a/coreferee/lang/fr/data/blacklisted_phrases.dat +++ b/coreferee/lang/fr/data/blacklisted_phrases.dat @@ -6,5 +6,12 @@ par exemple sans surprise à droite à gauche -à l'instantà mon avis +à jour +à l'instant +à mon avis à mon sens +à part +à pic +à propos +à suivre +à vif diff --git a/coreferee/lang/fr/data/mixed_gender_person_roles.dat b/coreferee/lang/fr/data/mixed_gender_person_roles.dat new file mode 100644 index 0000000..e06bdf2 --- /dev/null +++ b/coreferee/lang/fr/data/mixed_gender_person_roles.dat @@ -0,0 +1,129 @@ +alto +arbitre +architecte +artiste +assesseur +athlète +auteur +auxiliaire +barista +bourgmestre +bourreau +boutefeu +cadre +censeur +chef +chef-coq +clerc +clown +comique +commandeur +critique +dentiste +denturologiste 
+dermatologue +diabétologue +dialectologue +diamantaire +diplomate +disc-jockey +disquaire +diététiste +docteur +défenseur +enseigne +expéditionnaire +factotum +flic +gangster +garde +gendarme +gouverneur +guide +guérilléro +hôte +idole +imposteur +imprésario +individu +ingénieur +intercesseur +intermédiaire +interne +jockey +judoka +juge +junior +jurisconsulte +karatéka +kiné +libraire +légiste +maire +maitre +major +mannequin +manucure +manœuvre +marin +membre +mentor +mime +modèle +mousse +mécène +médecin +métallo +notaire +orl +p.d.g +pape +parlementaire +pasteur +pdg +peintre +personne +philosophe +photographe +pilote +pirate +pizzaiolo +porte-parole +possesseur +poète +procureur +professeur +proviseur +prudhomme +précurseur +prédécesseur +psychothérapeute +pédicure +quartier-maitre +questeur +radiologue +responsable +réceptionnaire +scripte +signataire +sous-chef +souverain +sponsor +star +successeur +sénior +titulaire +traiteur +tribun +troll +trompette +tuba +témoin +vacataire +vainqueur +vedette +vétérinaire +webmestre +yogi +écrivain +évêque diff --git a/coreferee/lang/fr/data/person_roles.dat b/coreferee/lang/fr/data/person_roles.dat index ac5487a..7256de5 100644 --- a/coreferee/lang/fr/data/person_roles.dat +++ b/coreferee/lang/fr/data/person_roles.dat @@ -941,6 +941,7 @@ câbleur câbleuse câblier câblière +célébrité céramiste céramologue céréaliculteur @@ -1372,7 +1373,6 @@ foudrier foudrière fouleur fouleuse -fournisseur fournisseuse fourreur fourreuse @@ -1602,6 +1602,7 @@ ichtyologiste iconographe identificateur identificatrice +idole idéologue illuminateur illuminatrice @@ -2068,6 +2069,7 @@ neurobiologiste neurochirurgien neurochirurgienne neurologue +neuropsychologue nomenclaturiste notaire notateur @@ -2878,6 +2880,7 @@ souscripteur souscriptrice soussigné soussignée +souverain soviétologue speaker speakerine @@ -2900,6 +2903,7 @@ staffeur staffeuse stagiaire standardiste +star statisticien statisticienne steward @@ -3155,6 +3159,7 @@ vannier 
vannière varappeur varappeuse +vedette veilleur veilleuse vendangeur diff --git a/coreferee/lang/fr/data/plural_toponyms.dat b/coreferee/lang/fr/data/plural_toponyms.dat new file mode 100644 index 0000000..dd85a46 --- /dev/null +++ b/coreferee/lang/fr/data/plural_toponyms.dat @@ -0,0 +1,394 @@ +ABLEUVENETTES +ABRETS +ABYMES +ACHARDS +ADJOTS +ADRETS +ADRETS-DE-L'ESTÉREL +AGEUX +AIRES +AIX-D'ANGILLON +ALBRES +ALLIÉS +ALLUES +ALLUETS-LE-ROI +ALPES +ALPES-DE-HAUTE-PROVENCE +ALPILLES +ALÉOUTIENNES +ANCIZES-COMPS +ANDELYS +ANDES +ANGLES +ANGLES +ANGLES +ANGLES-SUR-CORRÈZE +ANSES-D'ARLET +ANTILLES +APENNINS +APPALACHES +ARCS +ARDENNES +ARDILLATS +ARQUES +ARSURES +ARTIGUES-DE-LUSSAC +ASPRES +ASSIONS +ATTAQUES +AULNEAUX +AUTELS +AUTELS-VILLEVILLON +AUTHIEUX +AUTHIEUX-DU-PUITS +AUTHIEUX-SUR-CALONNE +AUTHIEUX-SUR-LE-PORT-SAINT-OUEN +AUXONS +AVANCHERS-VALMOREL +AVENIÈRES VEYRINS-THUELLIN +AVIRONS +AYNANS +AYVELLES +AÇORES +BAHAMAS +BARILS +BAROCHES +BARTHES +BASQUES +BASSES-ALPES +BASSES-PYRÉNÉES +BAUX-DE-BRETEUIL +BAUX-DE-PROVENCE +BAUX-SAINTE-CROIX +BELLEVILLE +BESSONS +BILLANGES +BILLAUX +BIZOTS +BOIS D'ANJOU +BONDONS +BORDES +BORDES +BORDES +BORDES +BORDES-AUMONT +BORDES-SUR-ARIZE +BOTTEREAUX +BOUCHOUX +BROUZILS +BRULAIS +BRUNELS +BRÉSEUX +BRÉVIAIRES +BÂTIES +CABANNES +CABANNES +CAMMAZES +CANARIES +CARPATES +CARS +CASSÉS +CENT-ACRES +CERQUEUX +CHALESMES +CHAMPEAUX +CHAMPS-GÉRAUX +CHAPELLES +CHAPELLES-BOURBON +CHARMONTOIS +CHAVANNES-EN-MAURIENNE +CHENAUX +CHOUX +CHUTES-DE-LA-CHAUDIÈRE +CHÂTELETS +CHÂTELIERS +CHÂTELLIERS-NOTRE-DAME +CHÈRES +CLAYES-SOUS-BOIS +CLEFS +CLUSES +CLÉRIMOIS +COLLINES +COMBES +COMTÉS +CONTAMINES-MONTJOIE +CORVÉES-LES-YYS +COSTES-GOZON +COTEAUX +CRESNAYS +CROZETS +CROÛTES +CYCLADES +CÔTES-D'AREY +CÔTES-DE-CORPS +DAMPS +DEUX ALPES +DEUX-FAYS +DEUX-MONTAGNES +DEUX-SÈVRES +DEUX-VILLES +DÉSERTS +EMIRATS +EPESSES +ESSARDS +ESSARDS +ESSARDS-TAIGNEVAUX +ESSARTS +ESSARTS-LE-ROI +ESSARTS-LE-VICOMTE +ESSARTS-LÈS-SÉZANNE +ESSEINTES +ESTABLES +ETATS-UNIS 
+ETCHEMINS +EYZIES +FARGES +FERRES +FESSEY +FINS +FONTENELLES +FORGES +FORGES +FOSSES +FOUGERÊTS +FOURGS +GARENNES SUR LOIRE +GENETTES +GETS +GHATS +GONDS +GOULLES +GOURS +GRANDES-ARMOISES +GRANDES-CHAPELLES +GRANDES-LOGES +GRANDES-VENTES +GRANDS-CHÉZEAUX +GRANGES +GRANGES-GONTARDES +GRANGES-LE-ROI +GRANGETTES +GRAS +GROSEILLERS +GUERREAUX +HAIES +HALLES +HAUTES-ALPES +HAUTES-PYRÉNÉES +HAUTES-RIVIÈRES +HAUTS DE FORTERRE +HAUTS-D'ANJOU +HAUTS-DE-CAUX +HAUTS-DE-CHÉE +HAUTS-DE-SEINE +HAUTS-TALICAN +HAYES +HAYS +HERBIERS +HERMAUX +HERMITES +HOGUES +HOUCHES +HÉBRIDES +HÔPITAUX-NEUFS +HÔPITAUX-VIEUX +IFFS +IFS +ILHES +ISLES-BARDEL +ISLETTES +ISSARDS +ISTRES-ET-BURY +JARDINS-DE-NAPIERVILLE +JUNIES +LANDES +LANDES-GENUSSON +LAUBIES +LAURENTIDES +LILAS +LOGES +LOGES +LOGES +LOGES-EN-JOSAS +LOGES-MARCHIS +LOGES-MARGUERON +LOGES-SAULCES +LOGES-SUR-BRÉCEY +LUCS-SUR-BOULOGNE +LÈCHES +LÈVES-ET-THOUMEYRAGUES +MADONIES +MAGES +MAGNILS-REIGNIERS +MAGNY +MAILLYS +MALDIVES +MALDIVES +MARITIMES +MARS +MARTRES-D'ARTIÈRE +MARTRES-DE-VEYRE +MARTYS +MARÊTS +MASCAREIGNES +MASKOUTAINS +MATELLES +MATHES +MAYONS +MAZURES +MENUS +MESNEUX +MESNULS +MOITIERS-D'ALLONNE +MOLIÈRES +MOLLETTES +MOLUQUES +MONCEAUX +MONTHAIRONS +MONTILS +MONTS +MONTS D'ANDAINE +MONTS D'AUNAY +MONTS DU ROUMOIS +MONTS-VERTS +MOULINS +MOUSSIÈRES +MOUTIERS-EN-AUGE +MOUTIERS-EN-CINGLAIS +MOUTIERS-EN-RETZ +MUJOULS +MUREAUX +MÉES +MÉES +MÉTAIRIES +NANS +NEYROLLES +NOUILLERS +NOËS +NOËS-PRÈS-TROYES +NÉBRODES +OLLIÈRES-SUR-EYRIEUX +OMERGUES +ORCADES +ORMES +ORMES +ORMES-SUR-VOULZIE +ORRES +PAROCHES +PAVILLONS-SOUS-BOIS +PAYS-BAS +PAYS-D'EN-HAUT +PECHS DU VERS +PEINTURES +PENNES-MIRABEAU +PENNINES +PETITES-ARMOISES +PETITES-LOGES +PHILIPPINES +PIEUX +PILLES +PINEAUX +PINS +PINTHIÈRES +PLACES +PLAINS-ET-GRANDS-ESSARTS +PLANCHES-EN-MONTAGNE +PLANCHES-PRÈS-ARBOIS +PLANS +PLANS +PLANTIERS +PONTETS +PONTS-DE-CÉ +PORTES DU COGLAIS +PORTES-EN-RÉ +POULIÈRES +PRADEAUX +PRAIRIES +PREMIERS SAPINS +PRÉAUX +PRÉS +PUJOLS +PYRÉNÉES 
+PYRÉNÉES-ATLANTIQUES +PYRÉNÉES-ORIENTALES +RAIRIES +REPÔTS +RESSUINTES +RHODOPES +RICEYS +RIVES +RIVIÈRES-HENRUEL +ROCHES-DE-CONDRIEU +ROCHES-L'ÉVÊQUE +ROCHEUSES +ROISES +ROUGES-EAUX +ROUSSES +RUES-DES-VIGNES +SABLES-D'OLONNE +SALCES +SALELLES +SALELLES +SALLES +SALLES-DE-CASTILLON +SALLES-DU-GARDON +SALLES-LAVAUGUYON +SALLES-SUR-VERDON +SAUVAGES +SEPTVALLONS +SEYCHELLES +SIÈGES +SORINIÈRES +SORLINGUES +SOUHESMES-RAMPONT +SOURCES +SPORADES +TERNES +TERRES-DE-CHAUX +TERRITOIRES +THILLIERS-EN-VEXIN +THONS +THUILES +TONILS +TOUCHES +TOUCHES-DE-PÉRIGNY +TOURREILLES +TOURRETTES +TROIS-BASSINS +TROIS-CHÂTEAUX +TROIS-DOMAINES +TROIS-MOUTIERS +TROIS-PIERRES +TROIS-ÎLETS +ULIS +ULMES +VALLOIS +VALLÉES DE LA VANNE +VANS +VASTRES +VELLUIRE-SUR-VENDÉE +VENTES +VENTES-DE-BOURSE +VIGNEAUX +VILLAGES VOVÉENS +VILLARDS-SUR-THÔNES +VILLEDIEU +VILLETTES +VOIVRES +VOSGES +YVELINES +YVETEAUX +ÉCHELLES +ÉCORCES +ÉCRENNES +ÉDUTS +ÉGLISES-D'ARGENTEUIL +ÉGLISOTTES-ET-CHALAURES +ÉMIRATS +ÉPARGES +ÉPARRES +ÉTANGS +ÉTATS-UNIS +ÉTILLEUX +ÎLES +ÎLES-DE-LA-MADELEINE diff --git a/coreferee/lang/fr/data/verbs_with_personal_subject.dat b/coreferee/lang/fr/data/verbs_with_personal_subject.dat index 2b38297..2de341f 100644 --- a/coreferee/lang/fr/data/verbs_with_personal_subject.dat +++ b/coreferee/lang/fr/data/verbs_with_personal_subject.dat @@ -1,49 +1,147 @@ +acquiescer +admettre +adorer affirmer +agréer +aimer +ajouter aviser +avouer +balbutier +baragouiner +batifoler +blaguer +blasphémer +boire bredouiller bégayer +cambrioler +chahuter chanter +chantonner +chipoter chuchoter +chérir cogiter +commenter +compatir concevoir conjecturer connaitre connaître conseiller +courir crier croire +devine +dicter dire +discourir douter +déblatérer +déduire délibérer désirer +détester espérer estimer exclamer fredonner +fredonner +geindre +grommeler gueuler +gémir +haleter +haïr +honnir hurler +hésiter imaginer +immiter +implorer implorer +injurier insulter +ironiser jacasser +jeter juger +lamenter +lire 
# Lower-cased honorifics and role titles that can head or accompany a person's
# proper name. Lookups against this set are done on lower-cased lemmas.
#
# Keep one entry per line with a trailing comma: Python silently concatenates
# adjacent string literals, which previously fused "prs" and "maitre" into the
# single bogus entry "prsmaitre" and left both real titles out of the set.
person_titles = {
    "m.",
    "mm.",
    "monsieur",
    "messieurs",
    "mgr",
    "monseigneur",
    "président",
    "mme",
    "mmes",
    "madame",
    "mesdames",
    "mlle",
    "mlles",
    "mademoiselle",
    "mesdemoiselles",
    "vve",
    "veuve",
    "présidente",
    "docteur",
    "dr",
    "docteurs",
    "drs",
    "professeur",
    "pr",
    "professeurs",
    "prs",
    "maitre",
    "maître",
    "me",
    "ministre",
}
self.is_token_in_one_of_phrases(token, self.blacklisted_phrases) def is_potential_anaphor(self, token: Token) -> bool: - + if not self.french_word.match(token.text) : return False # Ce dernier, cette dernière... if ( token.lemma_ == "dernier" @@ -193,7 +226,15 @@ def is_potential_anaphor(self, token: Token) -> bool: return True if self.is_emphatic_reflexive_anaphor(token): return True - if token.lemma_ == "celui": + if token.lemma_ in {"celui", "celle"}: + return True + if token.lower_ in {"-elle"}: + return True + if (token.lower_ == "-il" and + token.i > 0 and + token.doc[token.i-1].lemma_ != "avoir" and + token.dep_ != "expl:subj" + ): return True if not ( ( @@ -214,7 +255,7 @@ def is_potential_anaphor(self, token: Token) -> bool: ): return False # When anaphoric , the demonstrative refers almost always to a whole proposition and not a noun phrase - if token.lemma_ in {"ce", "ça", "cela"}: + if token.lemma_ in {"ce", "ça", "cela", "-ce"}: return False if token.lemma_ == "on": @@ -242,24 +283,24 @@ def is_potential_anaphor(self, token: Token) -> bool: if token.lemma_ in ("-", "ci", "-ci", "-là"): return False # Avalent Il. 
In case some are not marked as expletive + inclusive_head_children = [token.head] + list(token.head.children) if ( token.dep_ != self.root_dep and token.head.pos_ in ("AUX", "VERB") - and len( + and any( [ - child - for child in token.head.subtree + 1 + for child in inclusive_head_children if child.lemma_ in self.avalent_verbs ] ) - > 0 - ): + ): return False # impersonal constructions if ( token.dep_ in {"expl:comp", "expl:pass", "expl:subj"} - and token.lemma_ != "en" + and token.lemma_ not in {"en"} and not self.has_morph(token, "Reflex", "Yes") ): return False @@ -314,101 +355,141 @@ def is_quelqun_head(self, token: Token) -> bool: ): return True return False + + def has_det(self, token: Token) ->bool: + return any(det for det in token.children if det.dep_ == "det") + + def get_gender_number_info(self, token : Token, directly = False, det_infos = False) -> bool: + masc = fem = sing = plur = False + if self.is_quelqun_head(token): + sing = masc = fem = True + elif self.has_morph(token, "Poss", "Yes") and not det_infos: + if self.is_potential_anaphor(token): + # the plural morphs of poss determiner don't mark the owner but the owned + if token.lemma_ == "leur": + plur = True + if token.lemma_ == "son": + sing = True + masc = fem = True + else: + if self.has_morph(token, "Number", "Sing"): + sing = True + if self.has_morph(token, "Number", "Plur"): + plur = True + if self.has_morph(token, "Gender", "Masc"): + masc = True + if self.has_morph(token, "Gender", "Fem"): + fem = True + + if token.lemma_ in {"ici", "là", "y", "en"}: + masc = fem = sing = plur = True + + elif self.is_potential_anaphor(token): + # object pronouns are not well recognized by the models + if token.lower_.startswith("lui"): + masc = True + sing = True + elif token.lower_.startswith("eux"): + masc = True + plur = True + elif token.lower_.startswith("elles"): + fem = True + plur = True + elif token.lower_.startswith("elle"): + fem = True + sing = True + elif token.lower_.startswith("soi"): + 
masc = fem = sing = plur = True - def is_potential_anaphoric_pair( - self, referred: Mention, referring: Token, directly: bool - ) -> bool: - def refers_to_person(token): - if ( - token.ent_type_ == "PER" - or self.is_quelqun_head(token) - or token.lemma_ - in self.entity_noun_dictionary["PER"] + self.person_roles - or ( - token.pos_ == self.propn_pos - and token.lemma_ in self.male_names + self.female_names - ) - ): - return True - if ( - token.dep_ in ("nsubj", "nsubj:pass") - and token.head.lemma_ in self.verbs_with_personal_subject - ): - return True - - return False + if self.has_morph(token, "Reflex", "Yes"): + #se + if token.head.pos_ in self.clause_root_pos: + sing = self.has_morph(token.head, "Number", "Sing") + plur = self.has_morph(token.head, "Number", "Plur") + masc = fem = True - def get_gender_number_info(token): + elif token.pos_ == "PROPN": - masc = fem = sing = plur = False - if self.is_quelqun_head(token): - sing = masc = fem = True - elif self.has_morph(token, "Poss", "Yes"): - if self.is_potential_anaphor(token): - # the plural morphs of poss determiner don't mark the owner but the owned - if token.lemma_ == "leur": - plur = True - if token.lemma_ == "son": - sing = True - masc = fem = True - else: - if self.has_morph(token, "Number", "Sing"): - sing = True - if self.has_morph(token, "Number", "Plur"): - plur = True - if self.has_morph(token, "Gender", "Masc"): + if token.lemma_ in self.male_names: masc = True - if self.has_morph(token, "Gender", "Fem"): + if token.lemma_ in self.female_names: fem = True + if token.lemma_ not in self.male_names + self.female_names: + masc = fem = True + if not plur: + # proper nouns without plur mark are typically singular + sing = True + if not directly and not self.has_det(token): + masc = fem = True + + if token.pos_ == "PRON" and token.lower_ == "le" and plur: + # Je les vois + masc = fem = True + # get grammatical info from det + if token.pos_ in self.noun_pos + ('ADJ',) and not det_infos: + for det in 
def refers_to_person(self, token) -> bool:
    """Return ``True`` if *token* is judged to denote a person.

    Three independent criteria are tried in order:

    1. the token is tagged as a ``PER`` entity, is a "quelqu'un"-type head, or
       its lower-cased lemma is a known person noun or person role;
    2. the token is a proper noun whose lemma is a known first name;
    3. the token is the (passive) subject of a verb listed in
       ``verbs_with_personal_subject``.
    """
    lemma = token.lemma_
    lowered_lemma = lemma.lower()
    if (
        token.ent_type_ == "PER"
        or self.is_quelqun_head(token)
        or lowered_lemma in self.entity_noun_dictionary["PER"]
        or lowered_lemma in self.person_roles
    ):
        return True

    # A known first name counts as a person unless the entity recogniser
    # labelled it LOC or ORG; the lemmas listed below are accepted regardless.
    # NOTE(review): presumably first names that double as place names - confirm.
    always_personal_names = (
        "Caroline",
        "Virginie",
        "Salvador",
        "Maurice",
        "Washington",
    )
    if (
        token.pos_ == self.propn_pos
        and (lemma in self.male_names or lemma in self.female_names)
        and (
            token.ent_type_ not in ("LOC", "ORG")
            or lemma in always_personal_names
        )
    ):
        return True

    if token.dep_ in ("nsubj", "nsubj:pass"):
        verb_lemma = token.head.lemma_
        # First-group verbs are sometimes lemmatised without the final "r"
        # ("chante" instead of "chanter"). ``endswith`` is used instead of
        # indexing so that an empty or one-character lemma cannot raise
        # IndexError.
        if verb_lemma.endswith("e") and not verb_lemma.endswith("re"):
            verb_lemma = verb_lemma + "r"
        if verb_lemma in self.verbs_with_personal_subject:
            return True
    return False
'ils': 'ils' cannot refer only to # 'les hommes' or 'les femmes' if ( @@ -465,7 +546,7 @@ def get_gender_number_info(token): working_fem, working_sing, working_plur, - ) = get_gender_number_info(working_token) + ) = self.get_gender_number_info(working_token, directly=directly) referred_masc = referred_masc or working_masc referred_fem = referred_fem or working_fem referred_sing = referred_sing or working_sing @@ -498,17 +579,21 @@ def get_gender_number_info(token): ): if ( not self.is_independent_noun(referred_root) - and referred_root.lemma_ != referring.lemma_ + and referred_root.lemma_ not in ["ici","là","y"] ): return 0 - if refers_to_person( - referred_root - ): # or working_token.lemma_ in self.animal_names: + if self.refers_to_person(referred_root): return 0 if referred_root.ent_type_ == "ORG" and referring.lemma_ != "y": uncertain = True + referred_ent_type = self.reverse_entity_noun_dictionary.get(referred_root) + if referred_ent_type in ("PER","ORG"): + uncertain = True + if directly: + # possessive det can't be referred to directly + if self.has_morph(referred_root, "Poss") and referred_root.pos_ == "DET": return False if self.is_potential_anaphor(referring) > 0: try: if ( @@ -525,17 +610,20 @@ def get_gender_number_info(token): ) ) ): - #'celui-ci' and 'ce dernier' can only refer to last noun phrase + #'celui-ci' and 'ce dernier' can only refer to last grammatically compatible noun phrase if referring.i == 0: return 0 for previous_token_index in range(referring.i - 1, 0, -1): - if previous_token_index == referring.i: - continue - if self.is_independent_noun(doc[previous_token_index]): - if previous_token_index not in (referred.token_indexes): + previous_token = doc[previous_token_index] + if self.is_independent_noun(previous_token) and \ + self.is_potential_anaphoric_pair(Mention(previous_token), referring, directly=False): + if previous_token_index != referred.root_index : + if previous_token.dep_ in ("nmod", "appos"): + continue + # Except if noun 
phrase is modifier of other noun phrase + # eg: "Le président du pays... ce dernier" can refer to both nouns return 0 - else: - break + break if ( referring.lemma_ == "celui" @@ -563,15 +651,36 @@ def get_gender_number_info(token): pass if referring.lemma_ == "en": # requires list of mass/countable nouns to be implemented - if not referred_plur and (refers_to_person(referred_root)): + if not referred_plur and (self.refers_to_person(referred_root)): uncertain = True + if ( + referring.pos_ == "PRON" and self.has_morph(referring, "Person", "3") and + self.has_morph(referring,"Number") and not self.refers_to_person(referred_root) + ): + #Some semantic restrictions on named entities / pronoun pair + if referred_root.ent_type_ == "ORG" and referred_root.pos_ in self.propn_pos\ + and not self.has_det(referred_root) and not \ + any(prep for prep in referred_root.children if prep.dep_ == 'case'): + # "Twitter ... Il " is not possible + return False + if ( + referred_root.ent_type_ in {"LOC","MISC"} and referred_root.pos_ in self.propn_pos + and not self.has_det(referred_root) and not + any(prep for prep in referred_root.children if prep.dep_ == 'case') + ): + # "Paris... elle" is possible but unlikely + # Except for cases when the toponym has a determiner, such as most country name + # "La France...elle" is ok. Same for cities with det : "Le Havre... il" + uncertain = True + if ( self.is_potential_reflexive_pair(referred, referring) and self.is_reflexive_anaphor(referring) == 0 and not self.has_morph(referred_root, "Poss", "Yes") ): # * Les hommes le voyaient. "le" can't refer to "hommes" + #print("SUSUSUSU", referred, referring) return 0 if self.is_potential_reflexive_pair(referred, referring) == 0 and ( @@ -580,6 +689,14 @@ def get_gender_number_info(token): # * Les hommes étaient sûrs qu'ils se trompaient. "se" can't directly refer to "hommes" return 0 + if self.refers_to_person(referring) and not self.refers_to_person(referred_root): + # Le Luxembourg... Il mange ... 
-> impossible + if referred_root.ent_type_ in {"ORG", "LOC", "MISC"} : + return False + # Le Balcon... il mange... -> impossible but some other nouns are dubious + if referred_root.pos_ == "NOUN" : + uncertain = True + referring_governing_sibling = referring if referring._.coref_chains.temp_governing_sibling is not None: referring_governing_sibling = ( @@ -591,15 +708,17 @@ def get_gender_number_info(token): in self.verbs_with_personal_subject ): for working_token in (doc[index] for index in referred.token_indexes): - if refers_to_person(working_token): + if self.refers_to_person(working_token): return 2 - uncertain = True + if referred_root.pos == "NOUN": + uncertain = True return 1 if uncertain else 2 def has_operator_child_with_any_morph(self, token: Token, morphs: dict): for child in ( - child for child in token.children if child.pos_ in self.term_operator_pos + child for child in token.children if + child.pos_ in self.term_operator_pos + ("ADP",) ): for morph in morphs: if self.has_morph(child, morph, morphs.get(morph)): @@ -612,6 +731,7 @@ def is_potentially_indefinite(self, token: Token) -> bool: ) or self.is_quelqun_head(token) def is_potentially_definite(self, token: Token) -> bool: + return self.has_operator_child_with_any_morph( token, {"Definite": "Def", "PronType": "Dem"} ) @@ -655,6 +775,9 @@ def is_potential_reflexive_pair(self, referred: Mention, referring: Token) -> bo ): return False + if referring.dep_ in self.disjointed_dep: + return False + referred_root = referring.doc[referred.root_index] if referred_root._.coref_chains.temp_governing_sibling is not None: @@ -662,11 +785,15 @@ def is_potential_reflexive_pair(self, referred: Mention, referring: Token) -> bo if referring._.coref_chains.temp_governing_sibling is not None: referring = referring._.coref_chains.temp_governing_sibling - if referred_root.dep_ in ("nsubj", "nsubj:pass"): + + if referred_root.dep_ in ("nsubj", "nsubj:pass") and \ + not any(selon for selon in referring.children + if 
selon.lemma_ == "selon" and selon.dep_ == "case"): for referring_ancestor in referring.ancestors: # Loop up through the verb ancestors of the pronoun - + if referring_ancestor.dep_ in self.disjointed_dep: + return False if referred_root in referring_ancestor.children: return True # Relative clauses @@ -681,16 +808,12 @@ def is_potential_reflexive_pair(self, referred: Mention, referring: Token) -> bo return True # The ancestor has its own subject, so stop here - if ( - len( - [ + subjects = [ t for t in referring_ancestor.children - if t.dep_ in ("nsubj", "nsubj:pass") and t != referred_root + if t.dep_ in ("nsubj", "nsubj:pass") ] - ) - > 0 - ): + if any(subjects) and referred_root not in subjects: return False return False @@ -769,6 +892,40 @@ def is_potential_cataphoric_pair(self, referred: Mention, referring: Token) -> b return True return False + def get_propn_subtree(self, token:Token) -> list: + """ Returns a list containing each member M of the subtree of *token* that are proper nouns + and where all the tokens between M and *token* are themselves proper nouns. If *token* + is itself not a proper noun or if the head of *token* is a proper noun, an empty list + is returned. 
def get_noun_core_lemma(self, token):
    """Return the lower-cased lemma of *token* without a role prefix.

    A single leading ``vice-``, ``ex-`` or ``co-`` is removed, so that e.g.
    "vice-président" and "président" share the same core lemma.
    """
    # Lower-case before stripping so that capitalised lemmas ("Vice-président")
    # lose their prefix as well; the pattern is anchored once at the start.
    return re.sub(r"^(?:vice|ex|co)-", "", token.lemma_.lower())
nouns with different numbers can't corefer. This is true for substantives and propn alike + return False + + + if (referred.ent_type_ == 'PER' or self.get_noun_core_lemma(referred) in self.person_titles) and \ + not (referring.pos_ == 'NOUN' and self.get_noun_core_lemma(referring) in self.mixed_gender_person_roles): + # Gender compatibility is only ensured for person and their roles + # And only when the role does not allow mixed gender + # "Sophie... l'auteur du livre'" is possible + # "Sophie... l'instituteur'" is impossible + if not ( + (referred_masc and referring_masc) or (referred_fem and referring_fem) + ): + return False + if ( + self.has_morph(referring, "Gender","Masc") and + referring_fem and not referred_fem + ): + # when fem gender is enforced by det + # eg : la juge + return False + return True + + def is_potential_coreferring_pair_with_substantive(self, + referred: Token, referring: Token) -> bool: + ''' + Returns True if pragmatical rules of the language + allow the two nouns to corefer + ''' + #Nouns can't corefer in same predication + verb_referred_ancestors = [t for t in referred.ancestors \ + if t.dep_ == 'ROOT' or t.pos_ in self.clause_root_pos] + verb_referring_ancestors = [t for t in referring.ancestors \ + if t.dep_ == 'ROOT' or t.pos_ in self.clause_root_pos] + referred_verb_parent = verb_referred_ancestors[0] if verb_referred_ancestors else referred + referring_verb_parent = verb_referring_ancestors[0] if verb_referring_ancestors else referring + # Covers cases of unrecognised appos + if referred_verb_parent == referring_verb_parent and\ + referring.dep_ != "xcomp" : + return False + + for appos_token in [referred, referring]: + #Prevents any non Propn from appos chain from connecting to other nouns + # That way we ensure that only the propn will be linked to the bigger chains + # E.g : "Justin Trudeau.... Le Président, Donald Trump". 
+ # We don't want "president" to be able to be linked to "Justin" + appos_children = [c for c in appos_token.children if c.dep_ == "appos"] + if ( + any(1 for propn in appos_children + if propn.pos_ == "PROPN" or propn.ent_type_) + and + appos_token.pos_ != "PROPN" and not appos_token.ent_type_ + ): + return False + + if appos_token.pos_ != "PROPN" and appos_token.dep_ == "appos" and \ + not appos_token.ent_type_ and \ + (appos_token.head.pos_ == "PROPN" or appos_token.ent_type_): + return False + return True + + def language_dependent_is_coreferring_noun_pair(self, + referred: Token, referring: Token) -> bool: + ''' + Return True if language rules make it necessary + for the two noun phrases to corever + ''' + # Apposition chains + if referred == referring.head and referring.dep_ == "appos": + return True + if referred == referring.head.head and \ + referring.dep_ == "appos" and referring.head.dep_ == "appos": + return True + + # Cases of apposition wrongly tagged as conj + if ( + referred == referring.head and referring.dep_ == "conj" + and self.is_involved_in_non_or_conjunction(referring) + and referred.dep_ in ("nsubj", "nsubj:pass") + and referred.head.pos_ in ("VERB","AUX") + ): + + *_, referred_sing , referred_plur = self.get_gender_number_info(referred) + if referred_sing and not referred_plur and\ + self.has_morph(referred.head, "Number","Sing"): + return True + # Copular structures + if referring == referred.head and \ + any( + cop for cop in referring.children if cop.dep_ == "cop" and + cop.lemma_ == "être" + ): + return True + + # state verbs + stative_verbs = ["devenir","rester","demeurer"] + if referred.dep_ in ("nsubj","nsubj:pass") and referring.dep_ == "obj" and \ + referring.head == referred.head and referring.head.lemma_ in stative_verbs: + return True + return False def is_potential_coreferring_noun_pair( self, referred: Token, referring: Token @@ -811,12 +1089,17 @@ def is_potential_coreferring_noun_pair( already returned *True* for both 
*referred* and *referring* and that *referred* precedes *referring* within the document. """ - if len(referred.text) == 1 and len(referring.text) == 1: return False # get rid of copyright signs etc. - if referred.pos_ not in self.noun_pos or referring.pos_ not in self.noun_pos: + if (referred.pos_ not in self.noun_pos and not self.has_det(referred))\ + or (referring.pos_ not in self.noun_pos and not self.has_det(referring)): return False + grammatically_compatible= self.is_grammatically_compatible_noun_pair(referred,referring) + # Needs to be here as it covers cases of incorrect parsing + if self.language_dependent_is_coreferring_noun_pair(referred, referring) and\ + grammatically_compatible: + return True if referring in referred._.coref_chains.temp_dependent_siblings: return False @@ -845,27 +1128,41 @@ def is_potential_coreferring_noun_pair( ).endswith(" ".join(t.lemma_.lower() for t in referring_propn_subtree)): return True + if not self.is_potential_coreferring_pair_with_substantive(referred, referring): + return False # e.g. 
'Peugeot' -> 'l'entreprise' new_reverse_entity_noun_dictionary = { noun: "PER" for noun in self.person_roles } | self.reverse_entity_noun_dictionary + if ( - referring.lemma_.lower() in new_reverse_entity_noun_dictionary - and referred.pos_ in self.propn_pos - and referred.ent_type_ - == new_reverse_entity_noun_dictionary[referring.lemma_.lower()] - and self.is_potentially_definite(referring) - ): + self.get_noun_core_lemma(referring) in new_reverse_entity_noun_dictionary + and self.is_potentially_definite(referring) and + ( + ( + referred.ent_type_ == + new_reverse_entity_noun_dictionary[self.get_noun_core_lemma(referring)] + ) + or + ( + new_reverse_entity_noun_dictionary[self.get_noun_core_lemma(referring)] + == "PER" + and referred.ent_type_ and self.refers_to_person(referred) + ) + ) + and grammatically_compatible + and not (referring.ent_type_ != "" and referring.pos_ != "PROPN") + ): return True - + if not self.is_potentially_referring_back_noun(referring): return False if not self.is_potentially_introducing_noun( referred ) and not self.is_potentially_referring_back_noun(referred): return False - if referred.lemma_ == referring.lemma_ and referred.morph.get( - self.number_morph_key - ) == referring.morph.get(self.number_morph_key): + if self.get_noun_core_lemma(referred) == self.get_noun_core_lemma(referring)\ + and referred.morph.get(self.number_morph_key) == \ + referring.morph.get(self.number_morph_key): return True return False diff --git a/coreferee/rules.py b/coreferee/rules.py index a96ca73..6388f14 100644 --- a/coreferee/rules.py +++ b/coreferee/rules.py @@ -405,7 +405,7 @@ def language_independent_is_potential_anaphoric_pair( break if result == 1: break - if referring_or_governor.dep_ == 'ROOT': + if referring_or_governor == referring_or_governor.head: break referring_or_governor = referring_or_governor.head diff --git a/setup.cfg b/setup.cfg index 99861b3..c7689ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = coreferee 
-version = 1.1.2 +version = 1.1.3 description = Coreference resolution for English, French, German and Polish, optimised for limited training data and easily extensible for further languages long_description = file: SHORTREADME.md long_description_content_type = text/markdown @@ -18,6 +18,7 @@ classifiers = Intended Audience :: Science/Research License :: OSI Approved :: Apache Software License Natural Language :: English + Natural Language :: French Natural Language :: German Natural Language :: Polish Programming Language :: Python :: 3.9 diff --git a/tests/fr/test_rules_fr.py b/tests/fr/test_rules_fr.py index b85b072..ffbf07b 100644 --- a/tests/fr/test_rules_fr.py +++ b/tests/fr/test_rules_fr.py @@ -556,13 +556,13 @@ def test_potential_pair_apposition_2(self): def test_potential_pair_male_name(self): - self.compare_potential_pair('Je voyais Pierre. Il dormait', 2, False, 4, 2) + self.compare_potential_pair('Je voyais Gérard. Il dormait', 2, False, 4, 2) def test_potential_pair_male_name_control_1(self): - self.compare_potential_pair('Je voyais Pierre. Elle dormait', 2, False, 4, 0) + self.compare_potential_pair('Je voyais Gérard. Elle dormait', 2, False, 4, 0) def test_potential_pair_male_name_control_2(self): - self.compare_potential_pair('Je voyais Pierre. Ils dormaient', 2, False, 4, 0) + self.compare_potential_pair('Je voyais Gérard. Ils dormaient', 2, False, 4, 0) def test_potential_pair_female_name(self): self.compare_potential_pair('Je voyais Julie. Elle dormait', 2, False, 4, 2) @@ -610,13 +610,15 @@ def test_potential_pair_fem_acc_anaphor_control_3(self): self.compare_potential_pair('Je voyais des maisons. Je l\'ai vu', 3, False, 6, 0) def test_potential_pair_fem_acc_anaphor_4(self): - self.compare_potential_pair('Je prends la valise. Je l\'ai', 3, False, 6, 2) + self.compare_potential_pair('Je prends la valise. 
Je l\'ai', 3, False, 6, 2, + excluded_nlps="core_news_sm") def test_potential_pair_fem_acc_anaphor_control_4(self): self.compare_potential_pair('Je prends les valises. Je l\'ai', 3, False, 6, 0) def test_potential_pair_dislocation_left_cataphor(self): - self.compare_potential_pair("Elle est bleue, la valise", 5, False, 0, 2) + self.compare_potential_pair("Elle est bleue, la valise", 5, False, 0, 2, + excluded_nlps="core_news_sm") def test_potential_pair_dislocation_right_anaphor(self): self.compare_potential_pair("La valise, elle est bleue", 1, False, 3, 2, @@ -633,15 +635,15 @@ def test_potential_pair_location_anaphor_2(self): excluded_nlps=['core_news_sm']) def test_potential_pair_location_anaphor_ici(self): - self.compare_potential_pair('Voici ma maison. Je vis ici', 2 , False, 6, 2, + self.compare_potential_pair('Voici ma maison. Je vis ici', 2, False, 6, 2, excluded_nlps=['core_news_sm']) def test_potential_pair_location_anaphor_en(self): - self.compare_potential_pair('Je viens de France. J\'en viens.',3 , False, 6, 2, + self.compare_potential_pair('Je viens de France. J\'en viens.', 3, False, 6, 2, excluded_nlps=['core_news_sm']) def test_potential_pair_location_anaphor_y(self): - self.compare_potential_pair('J\'habite en France. J\'y habite.',3 , False, 6, 2, + self.compare_potential_pair('J\'habite en France. J\'y habite.', 3, False, 6, 2, excluded_nlps=['core_news_sm']) def test_potential_pair_location_anaphor_y_control(self): @@ -762,6 +764,75 @@ def test_potential_pair_subordinate_clause(self): 10, False, 2, 2) + def test_potential_pair_org_pronoun(self): + self.compare_potential_pair( + "Depuis des années, Sony travaille sur son image de marque. Il change de nom", + 4, False, 12, 0, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_potential_pair_org_pronoun_with_det(self): + self.compare_potential_pair( + "Depuis des années, la Société Sony travaille sur son image de marque. 
Elle change de nom", + 5, False, 14, 2, + excluded_nlps=[] + ) + def test_potential_pair_org_pronoun_control_1(self): + self.compare_potential_pair( + "Depuis des années, la Société Sony travaille sur son image de marque. Il change de nom", + 5, False, 14, 0, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_potential_pair_org_pronoun_control_2(self): + self.compare_potential_pair( + "Depuis des années, Sony travaille sur son image de marque. Il change de nom", + 4, False, 12, 0, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_potential_pair_loc_pronoun_without_det(self): + self.compare_potential_pair( + "Paris change de maire. Elle entre dans un nouveau tournant", + 0, False, 5, 1, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_potential_pair_loc_pronoun_with_det(self): + self.compare_potential_pair( + "La France change de président. Elle entre dans un nouveau tournant", + 1, False, 6, 2, + excluded_nlps=[] + ) + + def test_potential_pair_loc_pronoun_without_det_2(self): + self.compare_potential_pair( + "Paris change de maire. Il entre dans un nouveau tournant", + 0, False, 5, 1, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_potential_pair_loc_pronoun_control(self): + self.compare_potential_pair( + "La France change de président. Il entre dans un nouveau tournant", + 1, False, 6, 0, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + def test_potential_pair_dernier(self): + + minitext = "Ce sera un cas unique au monde, avance le chercheur de l'Institut économique de Montréal (IEDM). Photo courtoisie. Selon ce dernier, le gouvernement Legault a encore le temps de faire marche arrière et de « sortir » de la vente au détail." + self.compare_potential_pair(minitext, + 10, False, 26, 2, + excluded_nlps=["core_news_sm", "core_news_md"] + ) + ''' + def test_potential_pair_dernier(self): + minitext = "Pascal Bérubé dit qu'il «assume toutes les décisions jusqu'au bout». 
Il ajoute toutefois que «le contexte a changé» et qu'il «va falloir se poser des questions importantes sur beaucoup de choses»." + self.compare_potential_pair(minitext, + 0, False, 4, 2, + excluded_nlps=["core_news_sm"] + ) + ''' def compare_potential_reflexive_pair(self, doc_text, referred_index, include_dependent_siblings, referring_index, expected_truth, expected_reflexive_truth, is_reflexive_anaphor_truth, *, excluded_nlps=[]): @@ -857,7 +928,8 @@ def test_reflexive_with_passive(self): def test_reflexive_with_passive_and_conjunction(self): self.compare_potential_reflexive_pair( 'La maison et la voiture furent achetées par elles-mêmes', - 1, True, 8, 2, True, 2) + 1, True, 8, 2, True, 2, + excluded_nlps=["core_news_sm"]) def test_reflexive_with_object_antecedent(self): self.compare_potential_reflexive_pair('Elle mélangea le produit avec lui-même', @@ -1138,11 +1210,112 @@ def test_potential_noun_pair_proadverb_location(self): 5, 15, False) def test_potential_noun_pair_apposition(self): - self.compare_potential_noun_pair('Alexandre, le roi de Macédoine devient empereur. Le roi de Macédoine meurt à 33 ans.', - 0, 3, True) + self.compare_potential_noun_pair('Alexandre, le souverain de Macédoine devient empereur. Le roi de Macédoine meurt à 33 ans.', + 0, 3, True, + excluded_nlps=['core_news_sm', 'core_news_md']) def test_potential_noun_pair_apposition_2(self): - self.compare_potential_noun_pair('Alexandre, le roi de Macédoine devient empereur. Le roi de Macédoine meurt à 33 ans.', - 0, 10, True) + self.compare_potential_noun_pair('Gerbert d\'Auriac, le pape de l\'an Mil est élu en 999. Le pape meurt en 1003.', + 0, 16, True) + + def test_potential_noun_pair_same_number(self): + self.compare_potential_noun_pair("Nicolas Sarkozy venait d'arriver. Le président portait un costume.", + 0, 7, True) + + def test_potential_noun_pair_different_number(self): + self.compare_potential_noun_pair("Nicolas Sarkozy venait d'arriver. 
Les présidents portaient des costumes.", + 0, 7, False) + + def test_potential_noun_pair_person_noun_different_gender(self): + self.compare_potential_noun_pair("Nicolas Sarkozy venait d'arriver. La présidente portait un costume.", + 0, 7, False) + def test_potential_noun_pair_person_noun_mixed_gender_male_propn(self): + self.compare_potential_noun_pair("Nicolas Dupond venait d'arriver. Le juge portait un costume.", + 0, 7, True) + + def test_potential_noun_pair_person_noun_mixed_gender_female_propn(self): + self.compare_potential_noun_pair("Aurélie Dupond venait d'arriver. Le juge portait un costume.", + 0, 7, True) + ''' + # Needs different list of mixed nouns for fem and masc + def test_potential_noun_pair_person_noun_mixed_gender_male_propn_control(self): + self.compare_potential_noun_pair("Nicolas Dupond venait d'arriver. La juge portait un costume.", + 0, 7, False) + ''' + + def test_potential_noun_pair_same_proposition(self): + self.compare_potential_noun_pair("Nicolas Dupond voyait l'homme.", + 0, 4, False) + + def test_potential_noun_pair_same_proposition_be_clause(self): + self.compare_potential_noun_pair("Nicolas Dupond est l'homme dont il parlait.", + 0, 4, True) + + def test_potential_noun_pair_different_propositions_same_sentence_coord(self): + self.compare_potential_noun_pair("Nicolas Dupond est arrivé et le ministre sentait la rose.", + 0, 6, True) + + def test_potential_noun_pair_different_propositions_same_sentence_comma(self): + self.compare_potential_noun_pair("Nicolas Dupond est arrivé , le ministre sentait la rose.", + 0, 6, True) + + def test_potential_noun_pair_different_propositions_same_sentence_semicolon(self): + self.compare_potential_noun_pair("Nicolas Dupond est arrivé ; le ministre sentait la rose.", + 0, 6, True) + + def test_potential_noun_pair_title_complete(self): + self.compare_potential_noun_pair("Madame Angela Merkel est arrivée. 
La chancelière est bien habillée", + 0, 7, True, + excluded_nlps = ['core_news_sm']) + + def test_potential_noun_pair_title_abbr(self): + self.compare_potential_noun_pair("Mme Angela Merkel est arrivée. La chancelière est bien habillée", + 0, 7, True, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_title_complete_control(self): + self.compare_potential_noun_pair("Madame Angela Merkel est arrivée. Le chancelier est bien habillé", + 0, 7, False, + excluded_nlps = ['core_news_sm']) + + def test_potential_noun_pair_title_abbr_control(self): + self.compare_potential_noun_pair("Mme Angela Merkel est arrivée. Le chancelier est bien habillé", + 0, 7, False, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_mixed_title_mixed__noun(self): + self.compare_potential_noun_pair("Docteur Jonas est là. Le médecin est habillé en blanc", + 0, 6, True, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_masc_title_mixed__noun(self): + self.compare_potential_noun_pair("Monsieur Jonas est là. Le médecin est habillé en blanc", + 0, 6, True, + excluded_nlps= ['core_news_sm']) + def test_potential_noun_pair_mixed_title_fem_noun(self): + self.compare_potential_noun_pair("Docteur Jonas est là. La doctoresse est habillée en blanc", + 0, 6, True, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_plur_loc_exception_single_noun(self): + self.compare_potential_noun_pair("La semaine prochaine, je vais aux Etats-Unis. J'adore ce pays.", + 7, 12, True, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_plur_loc_single_noun(self): + self.compare_potential_noun_pair("Christophe Colomb a découvert les Amériques. J'adore ce pays.", + 5, 10, False, + excluded_nlps= ['core_news_sm']) + + def test_potential_noun_pair_no_gender(self): + self.compare_potential_noun_pair("M. Belzile est là. 
L'économiste est d'avis que le gouvernement devrait instaurer une taxe", + 0, 6, True, + excluded_nlps = ['core_news_sm', 'core_news_md']) + + def test_potential_noun_pair_propn_appos_head(self): + test_text = "Vendredi dernier, 106 patients attendaient sur des civières, alors que la capacité d'accueil est de 32, selon Caroline , infirmière depuis quelques années à l'hôpital de Saint-Eustache, dans les Laurentides. La jeune femme souhaite elle aussi témoigner sous le couvert de l'anonymat, par peur de représailles de son employeur." + self.compare_potential_noun_pair(test_text, + 21, 39, True, + ) diff --git a/tests/fr/test_smoke_tests_fr.py b/tests/fr/test_smoke_tests_fr.py index 01e14ba..b687c4d 100644 --- a/tests/fr/test_smoke_tests_fr.py +++ b/tests/fr/test_smoke_tests_fr.py @@ -185,7 +185,19 @@ def test_masc_over_fem_coordination(self): "Les australiennes admirent la giraffe et l'hippopotame. Elles boient beaucoup.", '[0: [1], [9]]', excluded_nlps='core_news_sm', ) - + + def test_titles_noun_pair_titles(self): + self.compare_annotations( + "Monsieur Lauret et Madame Ferrière sont allés voir une pièce de théâtre. Le pompier a passé une excellente soirée mais la dame n'était pas ravie.", + '[0: [0], [14], 1: [3], [22]]', excluded_nlps=['core_news_sm', 'core_news_md'], + ) + + def test_titles_noun_pair_titles_abbrev(self): + self.compare_annotations( + "M. Lauret et Mme Ferrière sont allés voir une pièce de théâtre. Le pompier a passé une excellente soirée mais la dame n'était pas ravie.", + '[0: [0], [14], 1: [3], [22]]', excluded_nlps=['core_news_sm', 'core_news_md'], + ) + def test_documentation_example_1(self): self.compare_annotations( 'Même si elle était très occupée par son travail, Julie en avait marre. Alors, elle et son mari décidèrent qu\'ils avaient besoin de vacances. Ils allèrent en Espagne car ils adoraient le pays',