diff --git a/coreferee/lang/fr/config.cfg b/coreferee/lang/fr/config.cfg index c5da919..38d4a7d 100644 --- a/coreferee/lang/fr/config.cfg +++ b/coreferee/lang/fr/config.cfg @@ -1,18 +1,17 @@ -[sm_3_2_0] +[sm_3_3_0] model: core_news_sm from_version: 3.1.0 -to_version: 3.2.0 -train_version: 3.2.0 +to_version: 3.3.0 +train_version: 3.3.0 - -[md_3_2_0] +[md_3_3_0] model: core_news_md from_version: 3.1.0 -to_version: 3.2.0 -train_version: 3.2.0 +to_version: 3.3.0 +train_version: 3.3.0 -[lg_3_2_0] +[lg_3_3_0] model: core_news_lg from_version: 3.1.0 -to_version: 3.2.0 -train_version: 3.2.0 +to_version: 3.3.0 +train_version: 3.3.0 diff --git a/coreferee/lang/fr/language_specific_rules.py b/coreferee/lang/fr/language_specific_rules.py index 363d865..e8970f9 100644 --- a/coreferee/lang/fr/language_specific_rules.py +++ b/coreferee/lang/fr/language_specific_rules.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021 Valentin-Gabriel Soumah, 2021 msg systems ag, +# Copyright (C) 2021 Valentin-Gabriel Soumah, 2021 msg systems ag, # 2021-2022 ExplosionAI GmbH from typing import List, Set, Tuple, Optional, cast @@ -115,6 +115,7 @@ class LanguageSpecificRulesAnalyzer(RulesAnalyzer): "madame", "mesdames", "mlle", + "melle", "mlles", "mademoiselle", "mesdemoiselles", @@ -128,7 +129,8 @@ class LanguageSpecificRulesAnalyzer(RulesAnalyzer): "professeur", "pr", "professeurs", - "prs" "maitre", + "prs", + "maitre", "maître", "me", "ministre", @@ -186,7 +188,14 @@ def add_siblings_recursively( def is_independent_noun(self, token: Token) -> bool: if not self.french_word.match(token.text): return False + if ( + token.lemma_.lower() in self.person_titles and + token.pos_ in self.noun_pos + ): + # dr Jugnot ... + return True if token.pos_ == "PROPN" and re.match("[^A-ZÂÊÎÔÛÄËÏÖÜÀÆÇÉÈŒÙ]", token.lemma_): + # mistagged propns that are not capitalized return False if ( token.lemma_ in {"un", "certains", "certain"} @@ -205,16 +214,42 @@ def is_independent_noun(self, token: Token) -> bool: ) ): # Une des filles, certains des garçons... - pass - elif self.is_quelqun_head(token): - pass - elif ( + return True + if self.is_quelqun_head(token): + return True + if ( + token.head.lemma_.lower() in self.person_titles and + token.dep_ == "nmod" and + token.pos_ == "PROPN" and + token.head.i == token.i - 1 + ): + # Docteur Jugnot ... + return False + + # now that we have dealt with all exceptions/mistagging we specify regular cases + if ( token.pos_ not in self.noun_pos + ("ADJ", "PRON") or token.dep_ in ("fixed", "flat:name", "flat:foreign", "amod") or (token.pos_ in ("ADJ", "PRON") and not self.has_det(token)) ): + # Only nouns without det or adjective nouns return False - elif ( + if ( + token.pos_ != "PROPN" and not self.has_det(token) + and token.dep_ not in ("ROOT", "appos") + and not( + any( + child.dep_ == "amod" and self.has_det(child) + for child in token.children + ) + ) + ): + return False + if self.is_token_in_one_of_phrases( + token, self.blacklisted_phrases # type:ignore[attr-defined] + ): + return False + if ( token.lemma_ == "dernier" and any( self.has_morph(child, "PronType", "Dem") for child in token.children @@ -235,14 +270,11 @@ def is_independent_noun(self, token: Token) -> bool: and token.lemma_ in self.blacklisted_nouns # type:ignore[attr-defined] ): return False - return not self.is_token_in_one_of_phrases( - token, self.blacklisted_phrases # type:ignore[attr-defined] - ) + return True def is_potential_anaphor(self, token: Token) -> bool: if not self.french_word.match(token.text): return False - # Ce dernier, cette dernière... if ( token.lemma_ == "dernier" and any( @@ -250,6 +282,7 @@ def is_potential_anaphor(self, token: Token) -> bool: ) and token.dep_ not in ("amod", "appos") ): + # Ce dernier, cette dernière.. return True if self.is_emphatic_reflexive_anaphor(token): return True @@ -265,12 +298,17 @@ def is_potential_anaphor(self, token: Token) -> bool: ): return True if ( - token.pos_ == "DET" + token.pos_ == "DET" and token.dep_ == "obj" and token.i < len(token.doc) - 1 and token.head.i == token.i + 1 ): - # Covers cases of clitic pronouns wrongly tagged as DET + # Covers cases of clitic pronouns wrongly tagged as DET + return True + if ( + token.dep_ == "case" and token.lemma_ in ["en", "y"] + and (token.head.pos_ == "VERB" or token.head.head.pos_ == 'PRON') + ): return True if not ( ( @@ -283,6 +321,7 @@ def is_potential_anaphor(self, token: Token) -> bool: or (token.pos_ == "ADV" and token.lemma_ in {"ici", "là"}) or (token.pos_ == "DET" and self.has_morph(token, "Poss", "Yes")) ): + # anaphors are either third person pronouns or pro adv or possessive return False if ( token.pos_ == "DET" @@ -333,7 +372,17 @@ def is_potential_anaphor(self, token: Token) -> bool: ): return False + if ( + token.dep_ in ("nsubj", "nsubj:pass") + and token.head.lemma_ in ("falloir", "valoir") + ): + return False # impersonal constructions + if token.dep_ == "expl:subj" and any( + c for c in token.head.children + if c.dep_ in ("cop", 'aux:tense') + ): + return True if ( token.dep_ in {"expl:comp", "expl:pass", "expl:subj"} and token.lemma_ not in {"en"} @@ -393,7 +442,10 @@ def is_quelqun_head(self, token: Token) -> bool: return False def has_det(self, token: Token) -> bool: - return any(det for det in token.children if det.dep_ == "det") + for child in token.children: + if child.dep_ =="det" or self.has_morph(child, "PronType", "Art"): + return True + return False def get_gender_number_info( self, token: Token, directly=False, det_infos=False @@ -607,11 +659,13 @@ def is_potential_anaphoric_pair( return 0 if not ((referred_masc and referring_masc) or (referred_fem and referring_fem)): + # gender compatibility return 0 if not ( (referred_plur and referring_plur) or (referred_sing and referring_sing) ): + # number compatibility return 0 #'ici , là... cannot refer to person. only loc and possibly orgs @@ -745,6 +799,17 @@ def is_potential_anaphoric_pair( # * Les hommes étaient sûrs qu'ils se trompaient. "se" can't directly refer to "hommes" return 0 + if (referred_root == referring.head.head + and referring.head.pos_ == "VERB" + and self.has_morph(referring.head, "VerbForm", "Fin") + and self.is_reflexive_anaphor(referring) == 0 + and referring.head.dep_ in ["acl:relcl"] + and referring.dep_ in ["obj", "nsubj", "nsubj:pass"] + ): + # L'homme qu'il voyait . "il" can't refer to "hommes" + # Covers other cases of pairs inside same predication + return 0 + if self.refers_to_person(referring) and not self.refers_to_person( referred_root ): @@ -765,10 +830,11 @@ def is_potential_anaphoric_pair( and referring_governing_sibling.head.lemma_ in self.verbs_with_personal_subject # type:ignore[attr-defined] ): + # if referring is a person, referred should be as well for working_token in (doc[index] for index in referred.token_indexes): if self.refers_to_person(working_token): return 2 - if referred_root.pos == "NOUN": + if referred_root.pos_ == "NOUN": uncertain = True return 1 if uncertain else 2 @@ -850,7 +916,7 @@ def is_potential_reflexive_pair(self, referred: Mention, referring: Token) -> bo if referring._.coref_chains.temp_governing_sibling is not None: referring = referring._.coref_chains.temp_governing_sibling - if referred_root.dep_ in ("nsubj", "nsubj:pass") and not any( + if referred_root.dep_ in ("nsubj", "nsubj:pass", "expl:subj") and not any( selon for selon in referring.children if selon.lemma_ == "selon" and selon.dep_ == "case" @@ -877,7 +943,7 @@ def is_potential_reflexive_pair(self, referred: Mention, referring: Token) -> bo subjects = [ t for t in referring_ancestor.children - if t.dep_ in ("nsubj", "nsubj:pass") + if t.dep_ in ("nsubj", "nsubj:pass", "expl:subj") ] if any(subjects) and referred_root not in subjects: return False @@ -918,7 +984,7 @@ def is_potential_cataphoric_pair(self, referred: Mention, referring: Token) -> b # is conjunction between verbs for ancestor in referred_root.ancestors: if ancestor.pos_ in self.clause_root_pos or any( - child for child in ancestor.children if child.dep_ == "cop" + child for child in ancestor.children if child.dep_ in ["cop","aux:tense"] ): referred_verb_ancestors.append(ancestor) if ancestor.dep_ in self.dependent_sibling_deps: @@ -926,8 +992,7 @@ def is_potential_cataphoric_pair(self, referred: Mention, referring: Token) -> b # Loop through the ancestors of the referring pronoun that are verbs, that are not # within the first list and that have an adverbial clause dependency label - referring_inclusive_ancestors = [referring] - referring_inclusive_ancestors.extend(referring.ancestors) + referring_inclusive_ancestors = [referring] + list(referring.ancestors) if ( len( [ @@ -939,13 +1004,13 @@ def is_potential_cataphoric_pair(self, referred: Mention, referring: Token) -> b == 0 ): return False - for referring_verb_ancestor in ( - t - for t in referring_inclusive_ancestors - if t not in referred_verb_ancestors - and t.dep_ in self.adverbial_clause_deps - and t.pos_ in self.clause_root_pos + self.noun_pos + ("ADJ",) - ): + for referring_verb_ancestor in referring_inclusive_ancestors: + if ( + referring_verb_ancestor in referred_verb_ancestors or + referring_verb_ancestor.dep_ not in self.adverbial_clause_deps or + referring_verb_ancestor.pos_ not in self.clause_root_pos + self.noun_pos + ("ADJ",) + ): + continue # If one of the elements of the second list has one of the elements of the first list # within its ancestors, we have subordination and cataphora is permissible if ( @@ -1187,7 +1252,7 @@ def language_dependent_is_coreferring_noun_pair( ): return True # Other cases of apposition - if referring not in referred._.coref_chains.temp_dependent_siblings: + if referring not in referred._.coref_chains.temp_dependent_siblings and 0: referred_right_in_subtree = list(referred.subtree)[-1] referring_left_in_subtree = list(referring.subtree)[0] if ( @@ -1226,7 +1291,7 @@ def is_potential_coreferring_noun_pair( already returned *True* for both *referred* and *referring* and that *referred* precedes *referring* within the document. """ - if len(referred.text) == 1 and len(referring.text) == 1: + if len(referred.text) == 1 or len(referring.text) == 1: return False # get rid of copyright signs etc. if (referred.pos_ not in self.noun_pos and not self.has_det(referred)) or ( diff --git a/coreferee/training/loaders.py b/coreferee/training/loaders.py index 0f3592b..aef770c 100644 --- a/coreferee/training/loaders.py +++ b/coreferee/training/loaders.py @@ -423,28 +423,34 @@ def load_file( split_conll_lines = [ l.split() for l in conll_file.readlines() if len(l.split()) > 10 ] - part_ids = sorted(list({l[1] for l in split_conll_lines})) + part_ids = sorted({tuple(l[:2]) for l in split_conll_lines}, key=lambda k: (k[0], k[1])) docs = [] for part_id in part_ids: + print(part_id) this_part_split_conll_lines = [ - l for l in split_conll_lines if l[1] == part_id + l for l in split_conll_lines if tuple(l[:2]) == part_id ] - if nlp.meta["lang"] in ("fr"): - # Tokens ending an apostrophes have to be merged with following tokens in French, + if nlp.meta["lang"] in ("fr",): + # Tokens ending with apostrophes have to be merged with following tokens in French, # otherwise parsing errors will result corrected_this_part_split_conll_lines: List[List[str]] = [] index = 0 while index < len(this_part_split_conll_lines): - conll_token = this_part_split_conll_lines[index][3].lstrip("/") + conll_token = this_part_split_conll_lines[index][3] + if conll_token != "/": + conll_token = conll_token.lstrip("/") if ( index + 1 < len(this_part_split_conll_lines) and len(conll_token) > 0 and len(this_part_split_conll_lines[index + 1][3]) > 0 - and conll_token[-1] in ("'") + and conll_token[-1] in ("'",) ): + next_split_conll_line = this_part_split_conll_lines[index + 1][3] + if next_split_conll_line != "/": + next_split_conll_line = next_split_conll_line.lstrip("/") this_part_split_conll_lines[index][ 3 - ] += this_part_split_conll_lines[index + 1][3].lstrip("/") + ] += next_split_conll_line if this_part_split_conll_lines[index + 1][-1] not in ("-", "_"): if this_part_split_conll_lines[index][-1] not in ("-", "_"): this_part_split_conll_lines[index][-1] += ( @@ -464,7 +470,7 @@ def load_file( ) index += 1 this_part_split_conll_lines = corrected_this_part_split_conll_lines - conll_tokens = [l[3].lstrip("/") for l in this_part_split_conll_lines] + conll_tokens = [l[3].lstrip("/") if l[3] != "/" else l[3] for l in this_part_split_conll_lines] doc = nlp(" ".join(conll_tokens)) rules_analyzer.initialize(doc) conll_to_spacy_lookup = ( @@ -501,19 +507,32 @@ def load_file( for chain_marker in chain_markers.split("|"): chain_index = "".join([d for d in chain_marker if d.isdigit()]) if "(" in chain_marker: - working_spans[chain_index] = conll_to_spacy_lookup[ - conll_token_index - ][0] + spacy_token_index_list = conll_to_spacy_lookup[ + conll_token_index + ] + spacy_token_index = spacy_token_index_list[0] + + if chain_index in working_spans: + working_spans[chain_index].append(spacy_token_index) + else: + working_spans[chain_index] = [spacy_token_index] + if ( + ')' in chain_marker and '(' not in chain_marker and + (chain_index not in working_spans or not working_spans[chain_index]) + ): + print("Warning : faulty coreference annotation in Conll. Unopened mention", chain_index) if ( - ")" in chain_marker and chain_index in working_spans + ")" in chain_marker and chain_index in working_spans and working_spans[chain_index] ): # sometimes errors in OntoNotes -> not the case this_span = doc[ - working_spans[chain_index] : conll_to_spacy_lookup[ + working_spans[chain_index].pop(-1) : conll_to_spacy_lookup[ conll_token_index ][-1] + 1 ] - del working_spans[chain_index] + #print("this span", this_span) + if not working_spans[chain_index]: + del working_spans[chain_index] if rules_analyzer.is_independent_noun( this_span.root ) or rules_analyzer.is_potential_anaphor(this_span.root): @@ -521,6 +540,9 @@ def load_file( chains[chain_index].append(this_span) else: chains[chain_index] = [this_span] + + if working_spans: + print("Warning : faulty coreference annotation in Conll. Unclosed mentions :", working_spans) for chain in (c for c in chains.values() if len(c) > 1): chain.sort(key=lambda span: span[0]) # type: ignore[arg-type, return-value] for span_index, span in enumerate(chain): @@ -583,3 +605,4 @@ def load( docs.extend(self.load_file(conll_filename, nlp, rules_analyzer)) print() return docs +# python -m coreferee train --lang fr --loader ConllLoader --data ..\..\..\corpus\dem1921\train_dev\ --log logs diff --git a/tests/fr/test_rules_fr.py b/tests/fr/test_rules_fr.py index 151fb10..ceaf738 100644 --- a/tests/fr/test_rules_fr.py +++ b/tests/fr/test_rules_fr.py @@ -1,18 +1,13 @@ import unittest -from coreferee.errors import ModelNotSupportedError from coreferee.rules import RulesAnalyzerFactory from coreferee.test_utils import get_nlps from coreferee.data_model import Mention -try: - nlps = get_nlps("fr") -except ModelNotSupportedError: - raise unittest.SkipTest("Model version not supported.") class FrenchRulesTest(unittest.TestCase): def setUp(self): - self.nlps = get_nlps("fr") + self.nlps = get_nlps("fr", add_coreferee=False) self.rules_analyzers = [ RulesAnalyzerFactory.get_rules_analyzer(nlp) for nlp in self.nlps ] @@ -96,7 +91,7 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_comma_a self, ): self.compare_get_dependent_sibling_info( - "Carol, Richard et Ralf ont mangé un buffet", + "Carole, Richard et Ralf ont mangé un buffet", 0, "[Richard, Ralf]", None, @@ -108,7 +103,7 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_comma_o self, ): self.compare_get_dependent_sibling_info( - "Carol, Richard ou Ralf mangeaient un buffet", + "Carole, Richard ou Ralf mangeaient un buffet", 0, "[Richard, Ralf]", None, @@ -118,7 +113,7 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_comma_o def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_and(self): self.compare_get_dependent_sibling_info( - "Il y avait une réunion avec Carol et Ralf et Richard", + "Il y avait une réunion avec Carole et Ralf et Richard", 6, "[Ralf, Richard]", None, @@ -127,7 +122,7 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_and(sel def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_or(self): self.compare_get_dependent_sibling_info( - "Une réunion avec Carol ou Ralf ou Richard avait lieu", + "Une réunion avec Carole ou Ralf ou Richard avait lieu", 3, "[Ralf, Richard]", None, @@ -138,7 +133,7 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_and_and self, ): self.compare_get_dependent_sibling_info( - "Il y avait une réunion avec Carol ou Ralf et Richard", + "Il y avait une réunion avec Carole ou Ralf et Richard", 6, "[Ralf, Richard]", None, @@ -147,12 +142,12 @@ def test_get_dependent_sibling_info_three_member_conjunction_phrase_with_and_and def test_get_dependent_sibling_info_conjunction_itself(self): self.compare_get_dependent_sibling_info( - "Il y avait une réunion avec Carol et Ralf et Richard", 7, "[]", None, False + "Il y avait une réunion avec Carole et Ralf et Richard", 7, "[]", None, False ) def test_get_dependent_sibling_info_dependent_sibling(self): self.compare_get_dependent_sibling_info( - "Il y avait une réunion avec Carol et Ralf et Richard", 8, "[]", 6, False + "Il y avait une réunion avec Carole et Ralf et Richard", 8, "[]", 6, False ) def compare_independent_noun( @@ -177,7 +172,8 @@ def test_independent_noun_simple(self): def test_independent_noun_conjunction(self): self.compare_independent_noun( - "Ils ont regardé les grands lions, les tigres et les éléphants", [5, 8, 11] + "Ils ont regardé les grands lions, les tigres et les éléphants", [5, 8, 11], + excluded_nlps=["core_news_sm"] ) def test_multi_word_determiner(self): @@ -224,6 +220,38 @@ def test_substantive_adjective(self): "Les premiers ont pris un petit chat. Le petit est mignon.", [1, 6, 9] ) + def test_noun_with_amalgam_det(self): + self.compare_independent_noun( + "Le garçon va au cinéma ce soir puis demain il va à la montagne.", + [1, 4, 6, 13] + ) + + def test_noun_without_det(self): + self.compare_independent_noun( + "Il a répondu de nouveau à côté. Il apprend par coeur le poème en entier.", + [13] + ) + + def test_noun_without_det_control(self): + self.compare_independent_noun( + "Poèmes. Ils déchainent les passions.", + [0, 5], + excluded_nlps=["core_news_sm"] + ) + def test_noun_titles(self): + self.compare_independent_noun( + "Monsieur et Madame sont arrivés. Maitre Dupont accompagne Mademoiselle Perrat et Docteur Noreau", + [0, 2, 6, 9, 12], + excluded_nlps=["core_news_sm", "core_news_md"] + ) + + def test_noun_titles_abbrv(self): + self.compare_independent_noun( + "M. et Mme sont arrivés. Me Dupont accompagne Mlle Perrat et Dr Noreau", + [0, 2, 6, 9, 12], + excluded_nlps=["core_news_sm"] + ) + def compare_potential_anaphor( self, doc_text, expected_per_indexes, *, excluded_nlps=[] ): @@ -246,14 +274,16 @@ def test_third_person_pronouns(self): def test_first_and_second_person_pronouns(self): self.compare_potential_anaphor( - "Je sais que tu le connais", + "Je sais que tu le connaîs", [4], excluded_nlps=["core_news_md", "core_news_sm"], ) def test_pronouns(self): self.compare_potential_anaphor( - "On y va demain", [1], excluded_nlps=["core_news_md", "core_news_sm"] + "Vous y Allez demain. C'est là qu'on voit qui a raison.", + [1, 7], + excluded_nlps=["core_news_sm"] ) def test_demonstrative_pronouns(self): @@ -279,8 +309,8 @@ def test_location_proadverbs(self): def test_explicit_anaphor(self): self.compare_potential_anaphor( - "Ce dernier vient de rejoindre Camille. Cette dernière est en retard", - [1, 8], + "Ce dernier a rejoint Camille. Cette dernière est en retard", + [1, 7], excluded_nlps=["core_news_sm"], ) @@ -295,12 +325,12 @@ def test_pleonastic_il_1(self): self.compare_potential_anaphor( "Il pleuvait. Il faisait très beau. Il a fait froid. Il fit chaud. Il avait fait frais.", [], - excluded_nlps=["core_news_sm"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_pleonastic_il_2(self): self.compare_potential_anaphor( - "Il faut bien manger. Il vaut mieux y aller. Il y a deux fleurs. ", + "Il faut bien manger. Il vaut mieux y aller. Il y a deux fleurs.", [8], excluded_nlps=["core_news_md"], ) @@ -313,10 +343,11 @@ def test_pleonastic_il_3(self): ) def test_pleonastic_il_4(self): + # Rule to was removed since it excluded valid pronouns due to too many false positives in nlp model self.compare_potential_anaphor( - "Il est vrai que ce jeu est dur. Il en existe trois sortes. Il manque deux pièces.", + "Il est vrai que ce jeu est dur. Il en existe trois sortes. Il en manque.", [10], - excluded_nlps=["core_news_sm", "core_news_md"], + excluded_nlps=["core_news_sm", "core_news_md", "core_news_lg"], ) def test_possessive_determiners(self): @@ -440,6 +471,8 @@ def func(nlp): if nlp.meta["name"] in excluded_nlps: return doc = nlp(doc_text) + if "Sony" in doc.text: + print(nlp.meta["name"], doc, [(ent, ent.label_) for ent in doc.ents]) rules_analyzer = RulesAnalyzerFactory.get_rules_analyzer(nlp) rules_analyzer.initialize(doc) assert rules_analyzer.is_independent_noun( @@ -568,12 +601,14 @@ def test_potential_pair_trivial_plur_coordination_control_2(self): def test_potential_pair_trivial_plur_coordination_possessive(self): self.compare_potential_pair( - "Je voyais un homme et une femme. Leur chien dormait", 3, True, 8, 2 + "Je voyais un homme et une femme. Leur chien dormait", 3, True, 8, 2, + excluded_nlps=["core_news_md", "core_news_sm"] ) def test_potential_pair_trivial_plur_coordination_possessive_control(self): self.compare_potential_pair( - "Je voyais un homme et une femme. Son chien dormaient", 3, True, 8, 0 + "Je voyais un homme et une femme. Son chien dormait", 3, True, 8, 0, + excluded_nlps=["core_news_md", "core_news_sm"] ) def test_potential_pair_trivial_plur_coordination_elements_plural_1(self): @@ -708,12 +743,12 @@ def test_potential_pair_apposition(self): def test_potential_pair_apposition_2(self): self.compare_potential_pair( - "Alexandre, roi de Macédoine devient empereur. Il meurt à 33 ans.", - 2, + "Napoléon Bonaparte, empereur des Français est couronné en 1804. Il meurt en 1821", + 3, True, - 8, + 11, 2, - excluded_nlps=["core_news_md", "core_news_sm"], + excluded_nlps=["core_news_sm"], ) def test_potential_pair_male_name(self): @@ -731,7 +766,7 @@ def test_potential_pair_female_name(self): def test_potential_pair_female_name_control_1(self): self.compare_potential_pair("Je voyais Julie. Il dormait", 2, False, 4, 0) - def test_potential_pair_female_name_control_3(self): + def test_potential_pair_female_name_control_2(self): self.compare_potential_pair("Je voyais Julie. Ils dormaient", 2, False, 4, 0) def test_potential_pair_female_name_control_3(self): @@ -753,7 +788,7 @@ def test_potential_pair_male_female_name_control_2(self): def test_potential_pair_fem_acc_anaphor_1(self): self.compare_potential_pair( - "Je voyais une femme. Je la préviens", + "Je voyais une femme. Je la vois", 3, False, 6, @@ -800,7 +835,7 @@ def test_potential_pair_dislocation_left_cataphor(self): def test_potential_pair_dislocation_right_anaphor(self): self.compare_potential_pair( - "La valise, elle est bleue", + "La valise, elle est petite", 1, False, 3, @@ -984,22 +1019,22 @@ def test_potential_posessive_determiner_control(self): def test_potential_reflexive_doubled(self): self.compare_potential_pair( - "La panthère se chassait elle-même.", + "La panthère se chasse elle-même", 1, False, 4, 2, - excluded_nlps="core_news_sm", + excluded_nlps=["core_news_sm"], ) def test_potential_reflexive_emphatic(self): self.compare_potential_pair( - "La panthère chassait elle-même.", + "La panthère chasse elle-même.", 1, False, 3, 2, - excluded_nlps="core_news_sm", + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_potential_reflexive_doubled_control(self): @@ -1195,12 +1230,11 @@ def test_reflexive_in_wrong_situation_different_sentence(self): self.compare_potential_reflexive_pair( "Je voyais l'homme. L'Homme se voyait", 3, False, 7, 0, False, 2 ) - def test_reflexive_in_wrong_situation_different_sentence_control(self): self.compare_potential_reflexive_pair( - "Je voyais l'homme. L'autre homme le voyait", 3, False, 8, 2, False, 0 + "Je voyais l'homme. L'autre homme le voyait", 3, False, 8, 2, False, 0, + excluded_nlps=["core_news_sm"] ) - def test_reflexive_in_wrong_situation_same_sentence_1(self): self.compare_potential_reflexive_pair( "Je voyais l'homme pendant que l'autre homme se voyait lui-même.", @@ -1210,7 +1244,7 @@ def test_reflexive_in_wrong_situation_same_sentence_1(self): 0, False, 2, - ) # AJOUTER EXEMPLES lui-même + ) def test_reflexive_in_wrong_situation_same_sentence_control(self): self.compare_potential_reflexive_pair( @@ -1221,6 +1255,7 @@ def test_reflexive_in_wrong_situation_same_sentence_control(self): 2, False, 0, + excluded_nlps=["core_news_sm"] ) def test_reflexive_emphasis(self): @@ -1232,7 +1267,7 @@ def test_reflexive_emphasis(self): 2, True, 2, - ) # AJOUTER EXEMPLES lui-même + ) def test_reflexive_emphasis_control(self): self.compare_potential_reflexive_pair( @@ -1247,7 +1282,8 @@ def test_reflexive_emphasis_control(self): def test_non_reflexive_in_wrong_situation_same_sentence(self): self.compare_potential_reflexive_pair( - "L'homme le voyait.", 1, False, 2, 0, True, 0 + "L'homme le voyait.", 1, False, 2, 0, True, 0, + excluded_nlps=["core_news_sm"] ) def test_non_reflexive_in_wrong_situation_same_sentence_control(self): @@ -1358,7 +1394,8 @@ def test_reflexive_with_object_antecedent_and_coordination(self): def test_reflexive_with_verb_coordination_one_subject(self): self.compare_potential_reflexive_pair( - "L'homme le voyait et se félicitait", 1, False, 5, 2, True, 2 + "L'homme le voyait et se félicitait", 1, False, 5, 2, True, 2, + excluded_nlps=["core_news_sm"] ) def test_reflexive_with_verb_coordination_two_subjects(self): @@ -1464,36 +1501,38 @@ def test_reflexive_double_coordination_with_preposition(self): def test_reflexive_relative_clause_subject(self): self.compare_potential_reflexive_pair( - "L'homme qui le voyait, est rentré.", 1, False, 3, 0, True, 0 + "L'homme qui le voyait, est grand.", 1, False, 3, 0, True, 0, + excluded_nlps=["core_news_sm"] ) def test_reflexive_relative_clause_object_1(self): self.compare_potential_reflexive_pair( - "L'homme qu'il voyait, est rentré.", 1, False, 3, 0, True, 0 + "L'homme qu'il voyait, est grand.", 1, False, 3, 0, True, 0, + excluded_nlps=["core_news_md", "core_news_sm"] ) def test_reflexive_relative_clause_subject_with_conjunction(self): self.compare_potential_reflexive_pair( - "L'homme et la femme qui les voyaient, sont rentrés", + "L'homme et la femme qui les voyaient, sont très grands", 1, True, 6, 0, True, 0, - excluded_nlps=["core_news_sm", "core_news_md", "dep_news_trf"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_reflexive_relative_clause_object_with_conjunction(self): self.compare_potential_reflexive_pair( - "L'homme et la femme qu'ils voyaient, sont rentrés", + "L'homme et la femme qu'ils voyaient, sont très grands", 1, True, 6, 0, True, 0, - excluded_nlps=["core_news_sm", "core_news_md", "dep_news_trf"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def compare_potential_cataphoric_pair( @@ -1550,7 +1589,7 @@ def test_cataphora_with_conjunction(self): True, 2, True, - excluded_nlps=["core_news_sm"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_cataphora_with_conjunction_control(self): @@ -1632,7 +1671,7 @@ def test_cataphora_conjunction_at_verb_level(self): False, 2, False, - excluded_nlps=["core_news_sm"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_cataphora_referred_is_pronoun(self): @@ -1652,7 +1691,8 @@ def test_cataphora_referred_is_pronoun_control(self): def test_cataphora_not_advcl(self): self.compare_potential_cataphoric_pair( - "Il était libre ; il rentra à la maison", 4, False, 0, False + "Il était libre ; il rentra à la maison", 4, False, 0, False, + excluded_nlps=["core_news_sm"] ) def compare_potential_referreds( @@ -1710,6 +1750,7 @@ def test_potential_referreds_maximum_sentence_referential_distance(self): "Richard vint. Un homme. Un homme. Un homme. Un homme. Il parla.", 15, ["Richard(0)", "homme(4)", "homme(7)", "homme(10)", "homme(13)"], + excluded_nlps=["core_news_sm"] ) def test_potential_referreds_over_maximum_sentence_referential_distance(self): @@ -1717,14 +1758,15 @@ def test_potential_referreds_over_maximum_sentence_referential_distance(self): "Richard vint. Un homme. Un homme. Un homme. Un homme. Un homme. Il parla.", 18, ["homme(4)", "homme(7)", "homme(10)", "homme(13)", "homme(16)"], + excluded_nlps=["core_news_sm"] ) def test_potential_referreds_last_token(self): self.compare_potential_referreds( - "Richard entra et un homme le vit", - 5, - ["Richard(0)"], - excluded_nlps=["core_news_sm"], + "Lucas est entré et un homme l'a regardé", + 6, + ["Lucas(0)"], + excluded_nlps=["core_news_sm", "core_news_md"], ) def test_potential_referreds_cataphora_simple(self): @@ -1779,6 +1821,7 @@ def test_potential_noun_pair_apposition_same_lemma(self): 1, 8, True, + excluded_nlps=["core_news_sm"] ) def test_potential_noun_pair_proper_noun_noun(self): @@ -1834,8 +1877,45 @@ def test_potential_noun_pair_apposition_2(self): 0, 16, True, + excluded_nlps=["core_news_sm"] + ) + + def test_potential_pair_copula_propn_first(self): + self.compare_potential_noun_pair( + "Georges Marais est le contrôleur des finances.", + 0, + 4, + True, + excluded_nlps=[] ) + def test_potential_pair_copula_propn_second(self): + self.compare_potential_noun_pair( + "Le contrôleur des finances est Georges Marais.", + 1, + 5, + True, + excluded_nlps=["core_news_sm"], + ) + + def test_potential_pair_copula_propn_control(self): + self.compare_potential_noun_pair( + "Le garçon est au cinéma.", + 1, + 4, + False, + excluded_nlps=[], + ) + + def test_potential_pair_copula_propn_control_2(self): + self.compare_potential_noun_pair( + "Le garçon est le cinéma.", + 1, + 4, + True, + excluded_nlps=[], + ) + def test_potential_noun_pair_same_number(self): self.compare_potential_noun_pair( "Nicolas Sarkozy venait d'arriver. Le président portait un costume.", @@ -1935,23 +2015,33 @@ def test_potential_noun_pair_title_abbr_control(self): False, excluded_nlps=["core_news_sm"], ) - - def test_potential_noun_pair_mixed_title_mixed__noun(self): + ''' + def test_potential_noun_pair_nationality(self): + self.compare_potential_noun_pair( + "Parmi les bonnes pioches estivales du club rennais figure Lovro Majer. " + "Le Croate a été directement freiné par une mystérieuse blessure à la hanche ", + 9, + 13, + True, + excluded_nlps=["core_news_sm"], + ) + ''' + def test_potential_noun_pair_mixed_title_mixed_noun(self): self.compare_potential_noun_pair( "Docteur Jonas est là. Le médecin est habillé en blanc", 0, 6, True, - excluded_nlps=["core_news_sm"], + excluded_nlps=["core_news_sm","core_news_md"], ) - def test_potential_noun_pair_masc_title_mixed__noun(self): + def test_potential_noun_pair_masc_title_mixed_noun(self): self.compare_potential_noun_pair( "Docteur Jonas est là. Le médecin est habillé en blanc", 0, 6, True, - excluded_nlps=["core_news_sm"], + excluded_nlps=["core_news_sm","core_news_md"], ) def test_potential_noun_pair_mixed_title_fem_noun(self): @@ -1991,10 +2081,23 @@ def test_potential_noun_pair_no_gender(self): ) def test_potential_noun_pair_propn_appos_head(self): - test_text = "Vendredi dernier, 106 patients attendaient sur des civières, alors que la capacité d'accueil est de 32, selon Caroline , infirmière depuis quelques années à l'hôpital de Saint-Eustache, dans les Laurentides. La jeune femme souhaite elle aussi témoigner sous le couvert de l'anonymat, par peur de représailles de son employeur." + test_text = ( + "Vendredi dernier, 106 patients attendaient sur des civières" + + ", alors que la capacité d'accueil est de 32, selon Caroline ," + + " infirmière depuis quelques années à l'hôpital de Saint-Eustache, dans les Laurentides." + + "La jeune femme souhaite elle aussi témoigner sous le couvert de l'anonymat, par peur de représailles de son employeur." + ) self.compare_potential_noun_pair( test_text, 21, 39, True, ) + def test_potential_noun_pair_noun_sentence(self): + self.compare_potential_noun_pair( + "Les Poèmes. Les poèmes déchainent les passions.", + 1, + 4, + True, + ) + \ No newline at end of file diff --git a/tests/fr/test_smoke_tests_fr.py b/tests/fr/test_smoke_tests_fr.py index 76e2aed..0aaeee8 100644 --- a/tests/fr/test_smoke_tests_fr.py +++ b/tests/fr/test_smoke_tests_fr.py @@ -1,23 +1,18 @@ import unittest -from coreferee.errors import ModelNotSupportedError from coreferee.test_utils import get_nlps -try: - nlps = get_nlps("fr") -except ModelNotSupportedError: - raise unittest.SkipTest("Model version not supported.") - +nlps = get_nlps('fr') train_version_mismatch = False -train_version_mismatch_message = "Loaded model version does not match train model version" for nlp in nlps: if not nlp.meta["matches_train_version"]: train_version_mismatch = True - +train_version_mismatch_message = "Loaded model version does not match train model version" class FrenchSmokeTest(unittest.TestCase): def setUp(self): - self.nlps = get_nlps("fr") + + self.nlps = get_nlps('fr') def all_nlps(self, func): for nlp in self.nlps: @@ -116,7 +111,7 @@ def test_reflexive_doubled(self): def test_reflexive_coordination(self): self.compare_annotations( - 'La panthère et le léopard se chassaient', + 'Le léopard et la panthère se chassaient', '[0: [1, 4], [5]]', excluded_nlps=['core_news_md','core_news_sm']) @@ -128,7 +123,8 @@ def test_reflexive_excluded_mix_of_coordination_and_single_member_1(self): def test_reflexive_excluded_mix_of_coordination_and_single_member_2(self): self.compare_annotations( 'Jacques et Julie entrèrent. Ils les virent.', - '[0: [0, 2], [5]]') + '[0: [0, 2], [5]]', + excluded_nlps=["core_news_sm"]) def test_reflexive_anaphor_precedes_referent(self): @@ -138,18 +134,18 @@ def test_reflexive_anaphor_precedes_referent(self): def test_cataphora_simple(self): self.compare_annotations( - 'Bien qu\'il était enervé, Jacques rentra dans le métro', - '[0: [2], [6]]') + 'Même s\'il était nerveux, Jacques rentra dans le métro', + '[0: [2], [6]]', excluded_nlps=["core_news_sm", "core_news_md"]) def test_cataphora_with_coordination(self): self.compare_annotations( - 'Bien qu\'ils partaient, l\'homme et la femme étaient tristes', - '[0: [2], [6, 9]]', excluded_nlps=['core_news_sm']) + 'Même s\'ils paraissaient heureux, l\'homme et la femme étaient tristes', + '[0: [2], [7, 10]]', excluded_nlps=['core_news_sm', "core_news_md"]) def test_possessive_pronoun_within_threeway_coordination(self): self.compare_annotations( - 'Nous vîment Jacques, ses amis et son chien.', + 'Nous voyons Jacques, ses amis et son chien.', '[0: [2], [4], [7]]') def test_crossed_demonstrative_anaphors(self): @@ -161,7 +157,7 @@ def test_crossed_demonstrative_anaphors(self): def test_proadverb_location(self): self.compare_annotations( 'Claire a acheté une nouvelle maison. C\'est là qu\'on ira manger demain avec elle et son mari.', - '[0: [0], [16], [18], 1: [5], [9]]', excluded_nlps=["core_news_md"]) + '[0: [0], [16], [18], 1: [5], [9]]', excluded_nlps=["core_news_sm", "core_news_md"]) def test_reflexive_noun(self): self.compare_annotations( @@ -184,27 +180,27 @@ def test_masc_over_fem_coordination(self): def test_titles_noun_pair_titles(self): self.compare_annotations( - "M. Lauret et Madame Ferrière sont allés voir une pièce de théâtre. Le pompier a passé une excellente soirée mais la dame n'était pas ravie.", - '[0: [0], [14], 1: [3], [22]]', excluded_nlps=['core_news_sm', 'core_news_md'], + "Hier, Monsieur Lauret et Madame Ferrière sont allés voir une pièce de théâtre. Le pompier a passé une excellente soirée mais la dame n'était pas ravie.", + '[0: [2], [16], 1: [5], [24]]', excluded_nlps=['core_news_sm', 'core_news_md'], ) def test_titles_noun_pair_titles_abbrev(self): self.compare_annotations( - "M. Lauret et Mme Ferrière sont allés voir une pièce de théâtre. Le pompier a passé une excellente soirée mais la dame n'était pas ravie.", - '[0: [0], [14], 1: [3], [22]]', excluded_nlps=['core_news_sm', 'core_news_md'], + "Hier, M. Lauret et Mme Ferrière sont allés voir une pièce de théâtre. Le facteur a passé une excellente soirée mais la dame n'était pas ravie.", + '[0: [2], [16], 1: [5], [24]]', excluded_nlps=['core_news_sm', 'core_news_md'], ) - @unittest.skipIf(train_version_mismatch, train_version_mismatch_message) + #@unittest.skipIf(train_version_mismatch, train_version_mismatch_message) def test_documentation_example_1(self): self.compare_annotations( 'Même si elle était très occupée par son travail, Julie en avait marre. Alors, elle et son mari décidèrent qu\'ils avaient besoin de vacances. Ils allèrent en Espagne car ils adoraient le pays', '[0: [2], [7], [10], [17], [19], 1: [8], [11], 2: [17, 20], [23], [29], [34], 3: [32], [37]]', excluded_nlps = ['core_news_sm'] ) - + def test_documentation_example_2(self): self.compare_annotations( - 'La femme se leva et regarda Dominique. Elle se tourna et la salua', + 'La femme se leva et regarda Dominique. Elle se tourna pour la saluer', '[0: [1], [2], [12], 1: [6], [8], [9]]', excluded_nlps=['core_news_md', 'core_news_sm'], alternative_expected_coref_chains='[0: [1], [2], [8], [9], 1: [6], [12]]') @@ -218,7 +214,7 @@ def test_documentation_example_3(self): def test_documentation_example_4(self): self.compare_annotations( - 'Marc et Léa étaient en Espagne. Ils adorèrent le pays et prévoient d\'y retourner l\'an prochain avec leurs parents.', - '[0: [0, 2], [7], [20], 1: [5], [10], [14]]', + 'Marc et Léa étaient partis en Espagne. Ils adorèrent le pays et prévoient d\'y retourner l\'an prochain avec leurs parents.', + '[0: [0, 2], [8], [21], 1: [6], [11], [15]]', excluded_nlps=['core_news_md','core_news_sm'] )