From 2ea201cd58330238cd16988e48bd945bbd04ceb9 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 6 Sep 2024 20:35:39 +0800 Subject: [PATCH] rm more person entities < min number when "search person" disabled Person entities below the minimum count should be removed when the "search person" option is disabled, even if they have a Wikipedia page. --- data/deps.json | 2 +- x_ray.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/data/deps.json b/data/deps.json index f2ef1b2..86ae8e6 100644 --- a/data/deps.json +++ b/data/deps.json @@ -1,6 +1,6 @@ { "lxml": "5.3.0", - "rapidfuzz": "3.9.6", + "rapidfuzz": "3.9.7", "spacy": "3.7.6", "spacy_cpu_model": "3.7.0", "en_spacy_cpu_model": "3.7.1", diff --git a/x_ray.py b/x_ray.py index b843517..7f16d54 100644 --- a/x_ray.py +++ b/x_ray.py @@ -148,7 +148,7 @@ def add_entity( self.entity_occurrences[entity_id].append((start, entity_len)) - def merge_entities(self, minimal_count: int) -> None: + def merge_entities(self, prefs: Prefs) -> None: for entity_name, entity_data in self.entities.copy().items(): if entity_name in self.custom_x_ray: continue @@ -161,8 +161,15 @@ def merge_entities(self, minimal_count: int) -> None: del self.entity_occurrences[entity_data.id] del self.entities[entity_name] continue - entity_cache = self.mediawiki.get_cache(entity_name) - if entity_cache is None and entity_data.count < minimal_count: + has_cache = self.mediawiki.get_cache(entity_name) is not None + is_person = entity_data.label in PERSON_LABELS + if entity_data.count < prefs["minimal_x_ray_count"] and ( + (prefs["search_people"] and not has_cache) + or ( + not prefs["search_people"] + and (is_person or (not is_person and not has_cache)) + ) + ): del self.entity_occurrences[entity_data.id] del self.entities[entity_name] @@ -178,7 +185,7 @@ def finish( self.mediawiki.query(self.entities, prefs["search_people"]) if self.wikidata is not None: query_wikidata(self.entities, self.mediawiki, self.wikidata) - self.merge_entities(prefs["minimal_x_ray_count"]) 
+ self.merge_entities(prefs) insert_x_entities( self.conn,