From c038312f3c960a8fdb57bee6bdcdd6e99012a4c3 Mon Sep 17 00:00:00 2001 From: Xinyu Liu Date: Wed, 11 Nov 2020 23:06:22 -0500 Subject: [PATCH 1/2] Add entity linking script --- docs/working-with-spacy.md | 75 ++++++++++++++++++++++++++++++ scripts/entity_linking.py | 93 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 scripts/entity_linking.py diff --git a/docs/working-with-spacy.md b/docs/working-with-spacy.md index 71503246e..c8b2355dc 100644 --- a/docs/working-with-spacy.md +++ b/docs/working-with-spacy.md @@ -171,3 +171,78 @@ Then we have sentences: | 4 | If she wins, she will join Theresa May of Britain and Angela Merkel of Germany in the ranks of women who lead prominent Western democracies. | | ... | ... | +## Entity Linking + +Unfortunately, spaCy does not provide any pre-trained Entity Linking model currently. However, we found another great +Entity Linking package called [Radboud Entity Linker (REL)](https://github.com/informagi/REL#rel-radboud-entity-linker). + +In this section, we introduce an entity linking [script](../scripts/entity_linking.py) which links texts to both Wikipedia and Wikidata entities, using spaCy NER and +REL Entity Linker. The input should be a JSONL file which has one json object per line, like [this](https://github.com/castorini/pyserini/blob/master/integrations/resources/sample_collection_jsonl/documents.jsonl), +while the output is also a JSONL file, where each json object is of format: + +``` +{ + "id": ..., + "contents": ..., + "entities": [ + {"start_pos": ..., "end_pos": ..., "ent_text": ..., "wikipedia_id": ..., "wikidata_id": ..., "ent_type": ...}, + ... + ] +} +``` + +For example, given the input file +```json +{"id": "doc1", "contents": "The Manhattan Project and its atomic bomb helped bring an end to World War II. 
Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."} +``` + +, the output file would be + +```json +{ + "id": "doc1", + "contents": "The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.", + "entities": [ + {"start_pos": 0, "end_pos": 21, "ent_text": "The Manhattan Project", "wikipedia_id": "Manhattan_Project", "wikidata_id": "Q127050", "ent_type": "ORG"}, + {"start_pos": 65, "end_pos": 77, "ent_text": "World War II", "wikipedia_id": "World_War_II", "wikidata_id": "Q362", "ent_type": "EVENT"} + ] +} +``` + +### Input Prep + +Let us take MS MARCO passage dataset as an example. We need to download the MS MARCO passage dataset and convert the tsv collection into jsonl files by following the +detailed instruction [here](https://github.com/x389liu/pyserini/blob/master/docs/experiments-msmarco-passage.md#data-prep). +Now we should have 9 jsonl files in `collections/msmarco-passage/collection_jsonl`, and each file path can be considered as +`input_path` in our scripts. + +### REL + +First, we follow the github [instruction](https://github.com/informagi/REL#installation-from-source) to install REL and +download required generic file, appropriate wikipedia corpus as well as the corresponding ED model. Then we set up +variable `base_url` as explained in this [tutorial](https://github.com/informagi/REL/blob/master/tutorials/01_How_to_get_started.md#how-to-get-started). + +Note that the `base_url` and ED model path are required as `rel_base_url` and `rel_ed_model_path` in our script respectively. +Another parameter `rel_wiki_version` depends on the version of wikipedia corpus downloaded, e.g. +`wiki_2019` for 2019 Wikipedia corpus. + +### wikimapper + +REL Entity Linker only links texts to Wikipedia entities, but we need their Wikidata information as well. 
+[Wikimapper](https://pypi.org/project/wikimapper/) is a Python library mapping Wikipedia titles to Wikidata IDs. +In order to use the mapping functionality, we have to download its precomputed indices [here](https://public.ukp.informatik.tu-darmstadt.de/wikimapper/). +Note that the path storing precomputed indices is required as `wikimapper_index` in our script. + +### Run Script + +Finally, we are ready to run our entity linking script: + +```bash +python entity_linking.py --input_path [input_jsonl_file] --rel_base_url [base_url] --rel_ed_model_path [ED_model] \ +--rel_wiki_version [wikipedia_corpus_version] --wikimapper_index [precomputed_index] \ +--spacy_model [en_core_web_sm, en_core_web_lg, etc.] --output_path [output_jsonl_file] +``` + +It should take about 5 to 10 minutes to run entity linking on 5,000 MS MARCO passages on Compute Canada. See +[this](https://github.com/castorini/onboarding/blob/master/docs/cc-guide.md#compute-canada) for instructions about +running scripts on Compute Canada. 
diff --git a/scripts/entity_linking.py b/scripts/entity_linking.py new file mode 100644 index 000000000..8d0196fa4 --- /dev/null +++ b/scripts/entity_linking.py @@ -0,0 +1,93 @@ +import argparse +import jsonlines +import spacy +from REL.REL.mention_detection import MentionDetection +from REL.REL.utils import process_results +from REL.REL.entity_disambiguation import EntityDisambiguation +from REL.REL.ner import NERBase, Span +from wikimapper import WikiMapper + + +# Spacy Mention Detection class which overrides the NERBase class in the REL entity linking process +class NERSpacy(NERBase): + def __init__(self): + # we only want to link entities of specific types + self.ner_labels = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', + 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'MONEY', 'QUANTITY'] + + # mandatory function which overrides NERBase.predict() + def predict(self, doc): + mentions = [] + for ent in doc.ents: + if ent.label_ in self.ner_labels: + mentions.append(Span(ent.text, ent.start_char, ent.end_char, 0, ent.label_)) + return mentions + + +# run REL entity linking on processed doc +def rel_entity_linking(spacy_docs, rel_base_url, rel_wiki_version, rel_ed_model_path): + mention_detection = MentionDetection(rel_base_url, rel_wiki_version) + tagger_spacy = NERSpacy() + mentions_dataset, _ = mention_detection.find_mentions(spacy_docs, tagger_spacy) + config = { + 'mode': 'eval', + 'model_path': rel_ed_model_path, + } + ed_model = EntityDisambiguation(rel_base_url, rel_wiki_version, config) + predictions, _ = ed_model.predict(mentions_dataset) + + linked_entities = process_results(mentions_dataset, predictions, spacy_docs) + return linked_entities + + +# apply spaCy nlp processing pipeline on each doc +def apply_spacy_pipeline(input_path, spacy_model): + nlp = spacy.load(spacy_model) + spacy_docs = {} + with jsonlines.open(input_path) as reader: + for obj in reader: + spacy_docs[obj['id']] = nlp(obj['contents']) + return spacy_docs + + 
+# enrich REL entity linking results with entities' wikidata ids, and write final results as json objects +def enrich_el_results(rel_linked_entities, spacy_docs, wikimapper_index): + wikimapper = WikiMapper(wikimapper_index) + linked_entities_json = [] + for docid, ents in rel_linked_entities.items(): + linked_entities_info = [] + for start_pos, end_pos, ent_text, ent_wikipedia_id, ent_type in ents: + # find entities' wikidata ids using their REL results (i.e. linked wikipedia ids) + ent_wikipedia_id = ent_wikipedia_id.replace('&amp;', '&') + ent_wikidata_id = wikimapper.title_to_id(ent_wikipedia_id) + + # write results as json objects + linked_entities_info.append({'start_pos': start_pos, 'end_pos': end_pos, 'ent_text': ent_text, + 'wikipedia_id': ent_wikipedia_id, 'wikidata_id': ent_wikidata_id, + 'ent_type': ent_type}) + linked_entities_json.append({'id': docid, 'contents': spacy_docs[docid].text, + 'entities': linked_entities_info}) + return linked_entities_json + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--input_path', type=str, help='path to input texts') + parser.add_argument('-u', '--rel_base_url', type=str, help='directory containing all required REL data folders') + parser.add_argument('-m', '--rel_ed_model_path', type=str, help='path to the REL entity disambiguation model') + parser.add_argument('-v', '--rel_wiki_version', type=str, help='wikipedia corpus version used for REL') + parser.add_argument('-w', '--wikimapper_index', type=str, help='precomputed index used by Wikimapper') + parser.add_argument('-s', '--spacy_model', type=str, help='spacy model type') + parser.add_argument('-o', '--output_path', type=str, help='path to output json file') + args = parser.parse_args() + + spacy_docs = apply_spacy_pipeline(args.input_path, args.spacy_model) + rel_linked_entities = rel_entity_linking(spacy_docs, args.rel_base_url, args.rel_wiki_version, + args.rel_ed_model_path) + linked_entities_json = 
enrich_el_results(rel_linked_entities, spacy_docs, args.wikimapper_index) + with jsonlines.open(args.output_path, mode='w') as writer: + writer.write_all(linked_entities_json) + + +if __name__ == '__main__': + main() \ No newline at end of file From c8ee2b2b38e731a76d3bd5c66e6f695eb9b10cea Mon Sep 17 00:00:00 2001 From: Xinyu Liu Date: Thu, 12 Nov 2020 14:09:46 -0500 Subject: [PATCH 2/2] Fix typos and reformat sentences --- docs/working-with-spacy.md | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/working-with-spacy.md b/docs/working-with-spacy.md index c8b2355dc..566c30e9e 100644 --- a/docs/working-with-spacy.md +++ b/docs/working-with-spacy.md @@ -173,12 +173,11 @@ Then we have sentences: ## Entity Linking -Unfortunately, spaCy does not provide any pre-trained Entity Linking model currently. However, we found another great -Entity Linking package called [Radboud Entity Linker (REL)](https://github.com/informagi/REL#rel-radboud-entity-linker). +Unfortunately, spaCy does not provide any pre-trained entity linking model currently. +However, we found another great entity linking package called [Radboud Entity Linker (REL)](https://github.com/informagi/REL#rel-radboud-entity-linker). -In this section, we introduce an entity linking [script](../scripts/entity_linking.py) which links texts to both Wikipedia and Wikidata entities, using spaCy NER and -REL Entity Linker. The input should be a JSONL file which has one json object per line, like [this](https://github.com/castorini/pyserini/blob/master/integrations/resources/sample_collection_jsonl/documents.jsonl), -while the output is also a JSONL file, where each json object is of format: +In this section, we introduce an entity linking [script](../scripts/entity_linking.py) which links texts to both Wikipedia and Wikidata entities, using spaCy NER and REL Entity Linker. 
+The input should be a JSONL file which has one JSON object per line, like [this](https://github.com/castorini/pyserini/blob/master/integrations/resources/sample_collection_jsonl/documents.jsonl), while the output is also a JSONL file, where each JSON object has the following format: ``` { @@ -192,6 +191,7 @@ while the output is also a JSONL file, where each json object is of format: ``` For example, given the input file + ```json {"id": "doc1", "contents": "The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science."} ``` @@ -211,20 +211,17 @@ For example, given the input file ### Input Prep -Let us take MS MARCO passage dataset as an example. We need to download the MS MARCO passage dataset and convert the tsv collection into jsonl files by following the -detailed instruction [here](https://github.com/x389liu/pyserini/blob/master/docs/experiments-msmarco-passage.md#data-prep). -Now we should have 9 jsonl files in `collections/msmarco-passage/collection_jsonl`, and each file path can be considered as -`input_path` in our scripts. +Let us take the MS MARCO passage dataset as an example. +We need to download the MS MARCO passage dataset and convert the TSV collection into JSONL files by following the detailed instructions [here](https://github.com/castorini/pyserini/blob/master/docs/experiments-msmarco-passage.md#data-prep). +Now we should have 9 JSONL files in `collections/msmarco-passage/collection_jsonl`, and each file path can be used as `input_path` in our script. ### REL -First, we follow the github [instruction](https://github.com/informagi/REL#installation-from-source) to install REL and -download required generic file, appropriate wikipedia corpus as well as the corresponding ED model. Then we set up -variable `base_url` as explained in this [tutorial](https://github.com/informagi/REL/blob/master/tutorials/01_How_to_get_started.md#how-to-get-started). 
+First, we follow the GitHub [instructions](https://github.com/informagi/REL#installation-from-source) to install REL and download the required generic files, the appropriate Wikipedia corpus, and the corresponding ED model. +Then we set up the variable `base_url` as explained in this [tutorial](https://github.com/informagi/REL/blob/master/tutorials/01_How_to_get_started.md#how-to-get-started). Note that the `base_url` and ED model path are required as `rel_base_url` and `rel_ed_model_path` in our script respectively. -Another parameter `rel_wiki_version` depends on the version of wikipedia corpus downloaded, e.g. -`wiki_2019` for 2019 Wikipedia corpus. +Another parameter, `rel_wiki_version`, depends on the version of the Wikipedia corpus downloaded, e.g. `wiki_2019` for the 2019 Wikipedia corpus. ### wikimapper @@ -243,6 +240,5 @@ python entity_linking.py --input_path [input_jsonl_file] --rel_base_url [base_ur --spacy_model [en_core_web_sm, en_core_web_lg, etc.] --output_path [output_jsonl_file] ``` -It should take about 5 to 10 minutes to run entity linking on 5,000 MS MARCO passages on Compute Canada. See -[this](https://github.com/castorini/onboarding/blob/master/docs/cc-guide.md#compute-canada) for instructions about -running scripts on Compute Canada. +It should take about 5 to 10 minutes to run entity linking on 5,000 MS MARCO passages on Compute Canada. +See [this](https://github.com/castorini/onboarding/blob/master/docs/cc-guide.md#compute-canada) for instructions about running scripts on Compute Canada.