From cdf31525d98e5eb95e619958d821fa243724a13d Mon Sep 17 00:00:00 2001 From: Daniele Guido <1181642+danieleguido@users.noreply.github.com> Date: Wed, 16 Oct 2024 13:40:52 +0200 Subject: [PATCH] update notebooks --- src/components/NotebookCard.tsx | 7 +- ...detect-news-agency-with-impresso-model.mdx | 74 ++++-- src/content/notebooks/generic-entity-api.mdx | 102 ++++---- .../notebooks/impresso-py-collections.mdx | 16 +- src/content/notebooks/impresso-py-connect.mdx | 230 +++++++++++++----- src/content/notebooks/impresso-py-maps.mdx | 23 +- src/content/notebooks/impresso-py-network.mdx | 21 -- src/content/notebooks/impresso-py-search.mdx | 90 +------ 8 files changed, 277 insertions(+), 286 deletions(-) diff --git a/src/components/NotebookCard.tsx b/src/components/NotebookCard.tsx index 1cf40f8..e51acec 100644 --- a/src/components/NotebookCard.tsx +++ b/src/components/NotebookCard.tsx @@ -26,7 +26,12 @@ const NotebookCard: React.FC<{ }> = ({ notebook, children, className = "" }) => { const accessTime = notebook.date ?? new Date() const accessDateTime = DateTime.fromJSDate(accessTime) - + console.debug( + "[NotebookCard] - accessDateTime:", + accessDateTime, + "- title:", + notebook?.title, + ) return (
diff --git a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx index be7fffc..493ab8f 100644 --- a/src/content/notebooks/detect-news-agency-with-impresso-model.mdx +++ b/src/content/notebooks/detect-news-agency-with-impresso-model.mdx @@ -1,11 +1,11 @@ --- -githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_02_newsagencies.ipynb +githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_03_newsagencies.ipynb title: News Agencies Recognition and Linking with Impresso BERT models excerpt: This notebook provides a practical guide to setting up a workflow for entity recognition in historical texts. -sha: 7ebadeb1938720b0c660c24a525b2b72c1f1ec95 -date: 2024-09-18T10:11:47Z -googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_02_newsagencies.ipynb +sha: 510b71a19bac0e4aa1cab15bf42651bf26bc6dd4 +date: 2024-09-30T13:40:55Z +googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_03_newsagencies.ipynb authors: - impresso-team --- @@ -20,46 +20,68 @@ Install necessary libraries (if not already installed) and download the necessary NLTK data. {/* cell:2 cell_type:code */} - ```python -!pip install python-dotenv !pip install transformers -!pip install torch +!pip install spacy +!pip install pysbd ``` {/* cell:3 cell_type:markdown */} -_Note: This notebook requires `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable). 
We use [dotenv](https://pypi.org/project/python-dotenv/) library to load the HF_TOKEN value from a local .env file_ +Now the fun part, this function will download the required model and give you the keys to successfully detect news agencies in your text. {/* cell:4 cell_type:code */} - ```python -from dotenv import load_dotenv -load_dotenv() # take environment variables from .env. +from transformers import pipeline + +# Named Entity Recognition pipeline +newsagency_ner_pipeline = pipeline("newsagency-ner", model="impresso-project/ner-newsagency-bert-fr", trust_remote_code=True) ``` {/* cell:5 cell_type:markdown */} -Now the fun part, this function will download the requried model and gives you the keys to successfullly detect news agencies in your text. +Run the example below to see how it works. {/* cell:6 cell_type:code */} - ```python -from transformers import is_torch_available -from transformers import pipeline +# Example +text = """Apple est créée le 1er avril 1976 dans le garage de la maison + d'enfance de Steve Jobs à Los Altos en Californie par Steve Jobs, Steve Wozniak + et Ronald Wayne, puis constituée sous forme de société le 3 janvier 1977 à l'origine + sous le nom d'Apple Computer, mais pour ses 30 ans et pour refléter la diversification + de ses produits, le mot « computer » est retiré le 9 janvier 2015. (Reuter)""" -# Check if PyTorch is available -print(is_torch_available()) -# Named Entity Recognition pipeline -nlp = pipeline("newsagency-ner", model="impresso-project/bert-newsagency-ner-fr", trust_remote_code=True) +newsagency_ner_pipeline(text) + ``` {/* cell:7 cell_type:markdown */} -Run the example below to see how it works. 
-{/* cell:8 cell_type:code */} +## About Impresso + +### Impresso project + +[Impresso - Media Monitoring of the Past](https://impresso-project.ch) is an +interdisciplinary research project that aims to develop and consolidate tools for +processing and exploring large collections of media archives across modalities, time, +languages and national borders. The first project (2017-2021) was funded by the Swiss +National Science Foundation under grant +No. [CRSII5_173719](http://p3.snf.ch/project-173719) and the second project (2023-2027) +by the SNSF under grant No. [CRSII5_213585](https://data.snf.ch/grants/grant/213585) +and the Luxembourg National Research Fund under grant No. 17498891. + +### Copyright + +Copyright (C) 2024 The Impresso team. + +### License + +This program is provided as open source under +the [GNU Affero General Public License](https://github.com/impresso/impresso-pyindexation/blob/master/LICENSE) +v3 or later. + +--- + +

+ Impresso Project Logo +

-```python -# Example -text = "Mon nom est François et j'habite à Paris. (Reuter)" -nlp(text) -``` diff --git a/src/content/notebooks/generic-entity-api.mdx b/src/content/notebooks/generic-entity-api.mdx index e896632..40d805c 100644 --- a/src/content/notebooks/generic-entity-api.mdx +++ b/src/content/notebooks/generic-entity-api.mdx @@ -1,68 +1,29 @@ --- -githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/generic-entity-api.ipynb +githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_04_ner_nel_API.ipynb authors: - impresso-team -# - EmanuelaBoros title: Detect Entities and Link them to Wikipedia and Wikidata in a Text through the Impresso API -sha: 54802fcabc0e32a4a05a1b4f2761a54b9807b0c5 -date: 2024-09-18T09:47:53Z -googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/generic-entity-api.ipynb +sha: 087d08f44c299889d5635eece4f303344246a80c +date: 2024-09-30T13:34:13Z +googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/2-entity/NE_04_ner_nel_API.ipynb --- {/* cell:0 cell_type:markdown */} -Named entities such as organizations, locations, persons, and temporal expressions play a crucial role in the comprehension and analysis of both historical and contemporary texts. The HIPE-2022 project focuses on named entity recognition and classification (NERC) and entity linking (EL) in multilingual historical documents. +We refer to "named entity recognition" as NER, which is a tool that recognises entities such as persons and locations from text. A "named entity linker" (NEL) connects these entities to an existing one such as a real person that can be found on Wikipedia (with a unique id in Wikidata). Wikipedia is a free, user-edited encyclopedia with articles on a wide range of topics like historical events, famous people, or scientific concepts. 
Wikidata is a sister project of Wikipedia that stores structured data, like facts and relationships between entities, used for tasks where computers need to understand and process data, such as NER and NEL. -### About HIPE-2022 - -HIPE-2022 involves processing diverse datasets from historical newspapers and classical commentaries, spanning approximately 200 years and multiple languages. The primary goal is to confront systems with challenges related to multilinguality, domain-specific entities, and varying annotation tag sets. - -### Datasets - -The HIPE-2022 datasets are based on six primary datasets, but this model was only trained on **hipe2020** in French and German. - -- **ajmc**: Classical commentaries in German, French, and English. -- **hipe2020**: Historical newspapers in German, French, and English. -- **letemps**: Historical newspapers in French. -- **topres19th**: Historical newspapers in English. -- **newseye**: Historical newspapers in German, Finnish, French, and Swedish. -- **sonar**: Historical newspapers in German. - -### Annotation Types and Levels - -HIPE-2022 employs an IOB tagging scheme (inside-outside-beginning format) for entity annotations. The annotation levels include: - -1. **TOKEN**: The annotated token. -2. **NE-COARSE-LIT**: Coarse type of the entity (literal sense). -3. **NE-COARSE-METO**: Coarse type of the entity (metonymic sense). -4. **NE-FINE-LIT**: Fine-grained type of the entity (literal sense). -5. **NE-FINE-METO**: Fine-grained type of the entity (metonymic sense). -6. **NE-FINE-COMP**: Component type of the entity. -7. **NE-NESTED**: Coarse type of the nested entity. - -### Getting Started - -This notebook will guide you through setting up a workflow to identify named entities within your text using the HIPE-2022 trained pipeline. 
By leveraging this pipeline, you can detect mentions of people, places, organizations, and temporal expressions, enhancing your analysis and understanding of historical and contemporary documents. - ---- - -This updated description provides a clear overview of the HIPE-2022 project's goals, datasets, and annotation types, focusing on the identification of generic named entities in multilingual historical documents. -*Note: This notebook *might* require `HF_TOKEN` to be set in the environment variables. You can get your token by signing up on the [Hugging Face website](https://huggingface.co/join) and read more in the [official documentation](https://huggingface.co/docs/huggingface_hub/v0.20.2/en/quick-start#environment-variable)* {/* cell:1 cell_type:markdown */} -Install necessary libraries (if not already installed) and -download the necessary NLTK data. +In the context of _Impresso_, the NER tool was trained on the [HIPE 2020](https://github.com/hipe-eval/HIPE-2022-data/blob/main/documentation/README-hipe2020.md) dataset. It was trained to recognise coarse and fine grained entities such as persons and locations, but also their names, titles, and functions. Further, the _Impresso_ NEL tool links these entity mentions to unique referents in a knowledge base – here Wikipedia and Wikidata – or not if the mention's referent is not found. {/* cell:2 cell_type:code */} - ```python !pip install transformers -!pip install nltk -!pip install torch +!wget https://raw.githubusercontent.com/impresso/impresso-datalab-notebooks/3f7afc05caef3f527db8320cdf8c131aec41d7cd/2-entity/utils.py + ``` {/* cell:3 cell_type:code */} - ```python def print_nicely(results, text): # Print the timestamp and system ID @@ -111,63 +72,84 @@ def print_nicely(results, text): Now the fun part, this function will download the requried model and gives you the keys to successfullly detect entities in your text. 
{/* cell:5 cell_type:code */} - ```python -from utils import get_linked_entities -import requests - sentences = ["Apple est créée le 1er avril 1976 dans le garage de la maison d'enfance de Steve Jobs à Los Altos en Californie par Steve Jobs, Steve Wozniak et Ronald Wayne, puis constituée sous forme de société le 3 janvier 1977 à l'origine sous le nom d'Apple Computer, mais pour ses 30 ans et pour refléter la diversification de ses produits, le mot « computer » est retiré le 9 janvier 2015."] for sentence in sentences: results = get_linked_entities(sentence) + print(results) print_nicely(results, sentence) ``` {/* cell:6 cell_type:code */} - ```python ``` -{/* cell:7 cell_type:code */} +{/* cell:7 cell_type:markdown */} -```python +## About Impresso -``` +### Impresso project -{/* cell:8 cell_type:code */} +[Impresso - Media Monitoring of the Past](https://impresso-project.ch) is an +interdisciplinary research project that aims to develop and consolidate tools for +processing and exploring large collections of media archives across modalities, time, +languages and national borders. The first project (2017-2021) was funded by the Swiss +National Science Foundation under grant +No. [CRSII5_173719](http://p3.snf.ch/project-173719) and the second project (2023-2027) +by the SNSF under grant No. [CRSII5_213585](https://data.snf.ch/grants/grant/213585) +and the Luxembourg National Research Fund under grant No. 17498891. + +### Copyright + +Copyright (C) 2024 The Impresso team. + +### License + +This program is provided as open source under +the [GNU Affero General Public License](https://github.com/impresso/impresso-pyindexation/blob/master/LICENSE) +v3 or later. + +--- + +

+ Impresso Project Logo +

+ +{/* cell:8 cell_type:code */} ```python ``` {/* cell:9 cell_type:code */} - ```python ``` {/* cell:10 cell_type:code */} - ```python ``` {/* cell:11 cell_type:code */} - ```python ``` {/* cell:12 cell_type:code */} - ```python ``` {/* cell:13 cell_type:code */} +```python + +``` +{/* cell:14 cell_type:code */} ```python ``` diff --git a/src/content/notebooks/impresso-py-collections.mdx b/src/content/notebooks/impresso-py-collections.mdx index af4faf8..0c4f6e1 100644 --- a/src/content/notebooks/impresso-py-collections.mdx +++ b/src/content/notebooks/impresso-py-collections.mdx @@ -1,16 +1,14 @@ --- githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/collections.ipynb authors: - # - RomanKalyakin - impresso-team title: Search collections -sha: fbebc19629cfc008a085283e61c0669de326add9 -date: 2024-09-18T15:04:39Z +sha: 4a05f4772be7279de1908f46c93dc12de334d112 +date: 2024-10-11T07:37:06Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/blob/main/examples/notebooks/collections.ipynb --- {/* cell:0 cell_type:code */} - ```python from impresso import connect @@ -18,33 +16,28 @@ impresso = connect() ``` {/* cell:1 cell_type:code */} - ```python result = impresso.collections.find() result ``` {/* cell:2 cell_type:markdown */} - # Get collection Get metadata of a colection by its ID. {/* cell:3 cell_type:code */} - ```python result = impresso.collections.get("local-roka-tOrwrOG3") result ``` {/* cell:4 cell_type:markdown */} - ## Get collection items Get items from a collection by its ID. 
{/* cell:5 cell_type:code */} - ```python colection_id = result.raw["uid"] items = impresso.collections.items(colection_id) @@ -52,28 +45,23 @@ items ``` {/* cell:6 cell_type:markdown */} - ## Remove items from collection {/* cell:7 cell_type:code */} - ```python item_id = items.pydantic.data[0].uid item_id ``` {/* cell:8 cell_type:code */} - ```python impresso.collections.remove_items(colection_id, [item_id]) ``` {/* cell:9 cell_type:markdown */} - ## Add items to collection {/* cell:10 cell_type:code */} - ```python impresso.collections.add_items(colection_id, [item_id]) ``` diff --git a/src/content/notebooks/impresso-py-connect.mdx b/src/content/notebooks/impresso-py-connect.mdx index 1589af6..789a3de 100644 --- a/src/content/notebooks/impresso-py-connect.mdx +++ b/src/content/notebooks/impresso-py-connect.mdx @@ -1,5 +1,5 @@ --- -title: How to connect to the API +title: Basic interactions with the Impresso python library excerpt: This is the first notebook in the Enter Impresso series. githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/1-starter/ST_01_basics.ipynb tags: @@ -7,130 +7,236 @@ tags: binderUrl: https://mybinder.org/v2/gh/binder-examples/r/master?urlpath=rstudio authors: - impresso-team -date: 2024-09-18T15:04:39Z +date: 2024-10-16T11:20:14Z seealso: - impresso-py-search -sha: fbebc19629cfc008a085283e61c0669de326add9 -googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/blob/main/examples/notebooks/basic.ipynb +sha: 677cd139a86d3264d08ef8b0b22d48f7b204fc59 +googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/1-starter/ST_01_basics.ipynb --- -{/* cell:0 cell_type:code */} +{/* cell:0 cell_type:markdown */} +## Good to know before starting -```python -from impresso import connect +This notebook is a quick introduction to the Impresso python library. 
It is a good starting point to understand how to interact with the library and how to use it to access the Impresso dataset. An Impresso account is required to access the data. If you don't have an account, you can register on the [Impresso Datalab page](https://impresso-project.ch/datalab/). -impresso = connect() -``` {/* cell:1 cell_type:markdown */} +## Prerequisites -## Search articles - -In this notebook, we will search for articles that contain the term "European Union" in the text. The results are ordered by date. - -Below the result container is rendered as an overview of what it contains. +Install the `impresso` python library: {/* cell:2 cell_type:code */} - ```python -result = impresso.search.find( - q="European Union", - order_by="date", -) -result +%pip install -q impresso ``` {/* cell:3 cell_type:markdown */} -Below, we will search for a term "European Union" in the Impresso data. -Then we will use the `result` variable, to access and print the excerpts of the first three articles returned by the search query. +# Initialize Impresso Client -The `pydantic` property is a [Pydantic](https://docs.pydantic.dev/latest/) model representing the response of the Impresso API. It provides a way to ensure that the data conforms to specified types and constraints, making it easier to work with structured data in a reliable and consistent manner. -We use the `data` property of the response to iterate over the page of the results and return excerpts of the articles that contain the search term. +In this cell, we initialize the Impresso client and authenticate it with the Impresso API. +The `impresso` object allows us to interact with the API and perform various operations such as searching for articles, retrieving entities, and fetching facets. The following call will prompt you to enter your Impresso token if it has not been authenticated within the last few hours (the token has a short lifetime). 
{/* cell:4 cell_type:code */} - ```python -result = impresso.search.find( - q="European Union", - order_by="date", -) -for article in result.pydantic.data[:3]: - print(article.excerpt) +from impresso import connect + +impresso = connect() ``` {/* cell:5 cell_type:markdown */} -There are several useful properties on the result object that let us know the total nubmer of results found, the current page and its size. +## Making the first request -{/* cell:6 cell_type:code */} +Let's start by making a simple request to the Impresso API. We will search for articles that contain the word "Titanic" in the text and order the results by date in ascending order. +{/* cell:6 cell_type:code */} ```python -print("%i results were found for this term. The current result object contains %i items starting from the item number %i" % (result.total, result.size, result.offset)) +result = impresso.search.find( + q="Titanic", + order_by="date", +) +result ``` {/* cell:7 cell_type:markdown */} +The result of the search query is rendered as a notebook-friendly preview when running in a Jupyter notebook. The preview contains the total number of results, the number of results returned in the current page, a link back to the Impresso App and a table with the preview of the data as a Pandas DataFrame (only the first 3 items are returned in the preview). -### Pydantic +The result object has several properties that return the data in various formats: + * `df` returns the result as a Pandas DataFrame + * `raw` returns the result as a Python list of dicts (the raw JSON response from the API) + * `data` returns the result as a list of Pydantic objects -The full response from the Impresso API as a pydantic model. 
+Explore the results as a pandas DataFrame: {/* cell:8 cell_type:code */} - ```python -result.pydantic +result.df[:5] ``` {/* cell:9 cell_type:markdown */} +## Documentation + +Methods of the Impresso library are documented in the code and use Python [type hints](https://docs.python.org/3/library/typing.html) to further help with understanding what values various arguments accept. The hints are often activated on mouse hover, pressing tab or other IDE specific shortcuts. + +{/* cell:10 cell_type:markdown */} +This is what function documentation looks like in VSCode Jupyter notebook: + +Function documentation in VSCode Jupyter notebook + +{/* cell:11 cell_type:markdown */} +The same in Google Colab: + +Function documentation in Google Colab + +{/* cell:12 cell_type:markdown */} +Type hints in VSCode Jupyter -### Pandas +Type hints in VSCode Jupyter -We can also get the search results as a [Pandas](https://pandas.pydata.org/) DataFrame. -This allows us to easily manipulate and analyze the data using pandas' powerful data manipulation capabilities. +{/* cell:13 cell_type:markdown */} +Type hints in Google Colab: + +Type hints in Google Colab + + +{/* cell:14 cell_type:markdown */} +## Namespaces of the Impresso library -{/* cell:10 cell_type:code */} +The Impresso library functionality is split into several namespaces that loosely follow the pages of the Impresso App. Each namespace normally contains a `find` method to search for a list of items and/or a `get` method to retrieve a single item by ID. +{/* cell:15 cell_type:markdown */} +### Search and Article + +The `search.find` method has been demonstrated in the previous example. The `articles.get` method can be used to retrieve an article by its ID. 
+ +{/* cell:16 cell_type:code */} ```python -df = result.df -df.head(2) +result = impresso.articles.get("indeplux-1909-04-10-a-i0042") +result ``` -{/* cell:11 cell_type:markdown */} +{/* cell:17 cell_type:markdown */} +### Newspaper -## Get an article +This namespace deals with newspapers available in the Impresso library. -Below we will use the `articles` resource to get an article by its ID: +{/* cell:18 cell_type:code */} +```python +# get a single newspaper with the most recent publication year +result = impresso.newspapers.find(limit=1, order_by="-endYear") +result +``` + +{/* cell:19 cell_type:markdown */} +### Entities -{/* cell:12 cell_type:code */} +Search and retrieve metadata of entities. +{/* cell:20 cell_type:code */} ```python -article = impresso.articles.get("NZZ-1794-08-09-a-i0002") -article +result = impresso.entities.find("Titanic") +result ``` -{/* cell:13 cell_type:markdown */} -We can also get it as a Pydantic model or as a DataFrame. +{/* cell:21 cell_type:code */} +```python +result = impresso.entities.get("aida-0001-50-RMS_Titanic") +result +``` + +{/* cell:22 cell_type:markdown */} +### Collections -{/* cell:14 cell_type:code */} +Manage your own collections. +{/* cell:23 cell_type:code */} ```python -article.pydantic.excerpt +result = impresso.collections.find(order_by="-size") +if result.size == 0: + "No collections found" +else: + print("%s collections found" % result.size) + first_collection_id = result.df.index[0] + print("First collection contains %i items" % result.df["countItems"].iloc[0]) + + first_collection_items = impresso.search.find(collection_id=first_collection_id) + print( + "Got first %i items in the collection. 
The first item is: %s" % \ + (first_collection_items.size, first_collection_items.df["title"].iloc[0]) + ) + + # Find an article and add it to the collection + titanic_search_result = impresso.search.find(q="Titanic", limit=1) + titanic_search_first_item_id = titanic_search_result.df.index[0] + print("Adding article %s to the collection" % titanic_search_first_item_id) + + impresso.collections.add_items(first_collection_id, [titanic_search_first_item_id]) + print("Added article to collection") + + # Then remove the first article from the collection + first_article_id = first_collection_items.df.index[0] + impresso.collections.remove_items(first_collection_id, [first_article_id]) + print("Removed article %s from collection" % first_article_id) + ``` -{/* cell:15 cell_type:code */} +{/* cell:24 cell_type:markdown */} +### Text reuse + +Search for text reuse. +{/* cell:25 cell_type:code */} ```python -article.df[['uid', 'country', 'language']] +# get 5 text reuse clusters starting from the second one. +# The clusters are ordered by the number of passages they contain (descending). +clusters = impresso.text_reuse.clusters.find( + text="pandemie", + offset=2, + limit=5, + order_by="-passages-count", +) +clusters +``` +{/* cell:26 cell_type:code */} +```python +# get passages of the first cluster +impresso.text_reuse.passages.find( + cluster_id=clusters.df.index[0], +) ``` -{/* cell:16 cell_type:markdown */} +{/* cell:27 cell_type:markdown */} +### Facets -## Search facets +Some namespaces have a `facet` method that can be used to retrieve various facets for the namespace index. +Below are examples of how to get various facets for the `search` and `entities` namespaces. The `facet` call accepts the same filter arguments that can be passed to the `find` method. All facets come with a preview graph that can be used to visually assess the query and adjust it if necessary. -In this cell, we will search for facets related to the term "fromage" in the Impresso collection. 
This is a convenient way to see a breakdown of the search results by country. +{/* cell:28 cell_type:code */} +```python +impresso.search.facet(facet="daterange", q="titanic") +``` + +{/* cell:29 cell_type:code */} +```python +# get the number of Text Reuse clusters with a lexical overlap between 1 and 2 for every date +impresso.text_reuse.clusters.facet("daterange", lexical_overlap=(1, 2)) +``` + +{/* cell:30 cell_type:markdown */} +### Tools - Named entity recognition -{/* cell:17 cell_type:code */} +The Impresso API has its own named entity recognition (NER) service trained on the Impresso corpus. It lets you locate and classify named entities mentioned in unstructured text into pre-defined categories such as the names of persons, organizations, locations, expressions of time, quantities, monetary values, percentages, etc. +{/* cell:31 cell_type:code */} ```python -country_facet = impresso.search.facet("country", q="fromage") -country_facet.df +text = """ +Jean-Baptiste Nicolas Robert Schuman ( +29 June 1886 – 4 September 1963) was a Luxembourg-born French +statesman. Schuman was a Christian democratic (Popular +Republican Movement) political thinker and activist. 
+""" +result = impresso.tools.ner( + text=text +) +result ``` diff --git a/src/content/notebooks/impresso-py-maps.mdx b/src/content/notebooks/impresso-py-maps.mdx index d4b4286..59e2039 100644 --- a/src/content/notebooks/impresso-py-maps.mdx +++ b/src/content/notebooks/impresso-py-maps.mdx @@ -3,33 +3,28 @@ title: Exploring impresso with maps githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb authors: - impresso-team - # - RomanKalyakin -sha: 168c669246385a2ec6c3e088b0081364f129d11c -date: 2024-09-27T12:54:12Z +sha: 246afc17b57e972ce9321fedfc19967c0e8e61a0 +date: 2024-10-03T08:32:32Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/maps_explore.ipynb --- {/* cell:0 cell_type:markdown */} - ## Install dependencies We need the following packages: -- [impresso-py](https://impresso-project.ch/) -- [ipyleaflet](https://ipyleaflet.readthedocs.io/en/latest/index.html) + * [impresso-py](https://impresso-project.ch/) + * [ipyleaflet](https://ipyleaflet.readthedocs.io/en/latest/index.html) {/* cell:1 cell_type:code */} - ```python %pip install git+https://github.com/impresso/impresso-py.git ipyleaflet ``` {/* cell:2 cell_type:markdown */} - ## Connect to Impresso {/* cell:3 cell_type:code */} - ```python from impresso import connect, OR, DateRange @@ -37,13 +32,11 @@ impresso = connect(public_api_url="https://dev.impresso-project.ch/public-api") ``` {/* cell:4 cell_type:markdown */} - ## Search and collect entities Find top 100 location entities mentioned in articles that talk about nuclear power plants in the first three decades following the second world war. 
{/* cell:5 cell_type:code */} - ```python locations = impresso.search.facet( "location", @@ -59,7 +52,6 @@ locations Get entities details, including wikidata details {/* cell:7 cell_type:code */} - ```python entities_ids = locations.df.index.tolist() entities = impresso.entities.find(entity_id=OR(*entities_ids), load_wikidata=True, limit=len(entities_ids)) @@ -70,7 +62,6 @@ entities Filter out entities that have no coordinates and add a country tag. {/* cell:9 cell_type:code */} - ```python df = entities.df entities_with_coordinates = df[df['wikidata.coordinates.latitude'].notna() & df['wikidata.coordinates.longitude'].notna()] @@ -83,7 +74,6 @@ entities_with_coordinates Add counts of mentions to the entities dataframe. {/* cell:11 cell_type:code */} - ```python entities_with_coordinates['mentions_count'] = entities_with_coordinates.index.map(locations.df['count']) ``` @@ -92,7 +82,6 @@ entities_with_coordinates['mentions_count'] = entities_with_coordinates.index.ma Plot entities on the map. {/* cell:13 cell_type:markdown */} - ### Utility methods Functions used to calculate extra details needed to plot data on a map. @@ -101,7 +90,6 @@ Functions used to calculate extra details needed to plot data on a map. Find geo bounds of a group of items. {/* cell:15 cell_type:code */} - ```python def find_bounds(coordinates): """ @@ -136,7 +124,6 @@ def find_bounds(coordinates): Create an HTML used for rendering the hover pop-up. 
{/* cell:17 cell_type:code */} - ```python from ipywidgets import HTML from ipyleaflet import Popup @@ -163,11 +150,9 @@ def build_hover_popup(title: str, subtitle: str, mentions: int) -> Popup: ``` {/* cell:18 cell_type:markdown */} - ### Plot {/* cell:19 cell_type:code */} - ```python from ipyleaflet import Map, Marker, AwesomeIcon, CircleMarker diff --git a/src/content/notebooks/impresso-py-network.mdx b/src/content/notebooks/impresso-py-network.mdx index 07836df..db4b7d5 100644 --- a/src/content/notebooks/impresso-py-network.mdx +++ b/src/content/notebooks/impresso-py-network.mdx @@ -3,28 +3,23 @@ title: Network graph with Impresso Py githubUrl: https://github.com/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb authors: - impresso-team - # - RomanKalyakin sha: 168c669246385a2ec6c3e088b0081364f129d11c date: 2024-09-27T12:54:12Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-datalab-notebooks/blob/main/4-impresso-py/network_graph.ipynb --- {/* cell:0 cell_type:markdown */} - ## Install dependencies {/* cell:1 cell_type:code */} - ```python %pip install git+https://github.com/impresso/impresso-py.git ipysigma ``` {/* cell:2 cell_type:markdown */} - ## Connect to Impresso {/* cell:3 cell_type:code */} - ```python from impresso import connect, OR, AND @@ -32,19 +27,16 @@ impresso = connect(public_api_url="https://dev.impresso-project.ch/public-api") ``` {/* cell:4 cell_type:markdown */} - ## Part 1: Get entities and their co-occurrences Find all persons mentioned in all articles that talk about the [Prague Spring](https://en.wikipedia.org/wiki/Prague_Spring). {/* cell:5 cell_type:code */} - ```python query = OR("Prague Spring", "Prager Frühling", "Printemps de Prague") ``` {/* cell:6 cell_type:code */} - ```python persons = impresso.search.facet( facet="person", @@ -59,7 +51,6 @@ persons Get all combinations of all entities with a mention count higher than `N`. 
{/* cell:8 cell_type:code */} - ```python import itertools @@ -75,7 +66,6 @@ print(f"Total combinations: {len(persons_ids_combinations)}") ``` {/* cell:9 cell_type:code */} - ```python if len(persons_ids_combinations) > 500: msg = ( @@ -91,7 +81,6 @@ if len(persons_ids_combinations) > 500: Get timestamps and counts of all articles where persons pairs appear. {/* cell:11 cell_type:code */} - ```python from impresso.util.error import ImpressoError from time import sleep @@ -126,7 +115,6 @@ for idx, combo in enumerate(persons_ids_combinations): Put them all into a dataframe {/* cell:13 cell_type:code */} - ```python import pandas as pd @@ -144,17 +132,14 @@ connections_df ``` {/* cell:14 cell_type:code */} - ```python connections_df.to_csv("connections.csv") ``` {/* cell:15 cell_type:markdown */} - ## Part 2: visualise {/* cell:16 cell_type:code */} - ```python import pandas as pd @@ -163,7 +148,6 @@ connections_df ``` {/* cell:17 cell_type:code */} - ```python grouped_connections_df = connections_df.groupby(['node_a', 'node_b']) \ .agg({'timestamp': lambda x: ', '.join(list(x)), 'count': 'sum', 'url': lambda x: list(set(x))[0]}) \ @@ -172,7 +156,6 @@ grouped_connections_df ``` {/* cell:18 cell_type:code */} - ```python import networkx as nx @@ -189,14 +172,12 @@ G.nodes ``` {/* cell:19 cell_type:code */} - ```python filename = input("Enter the filename: ") filename = f"{filename.replace(' ', '_')}.gefx" ``` {/* cell:20 cell_type:code */} - ```python nx.write_gexf(G, filename) ``` @@ -205,7 +186,6 @@ nx.write_gexf(G, filename) If running in Colab - activate custom widgets to allow Sigma to render the graph. {/* cell:22 cell_type:code */} - ```python try: from google.colab import output @@ -218,7 +198,6 @@ except: Render the graph. 
{/* cell:24 cell_type:code */} - ```python import networkx as nx from ipysigma import Sigma diff --git a/src/content/notebooks/impresso-py-search.mdx b/src/content/notebooks/impresso-py-search.mdx index e750ff7..a5bb7fd 100644 --- a/src/content/notebooks/impresso-py-search.mdx +++ b/src/content/notebooks/impresso-py-search.mdx @@ -2,17 +2,15 @@ githubUrl: https://github.com/impresso/impresso-py/blob/main/examples/notebooks/search.ipynb authors: - impresso-team - # - RomanKalyakin seealso: - impresso-py-collections title: Search -sha: fbebc19629cfc008a085283e61c0669de326add9 -date: 2024-09-18T15:04:39Z +sha: 4a05f4772be7279de1908f46c93dc12de334d112 +date: 2024-10-11T07:37:06Z googleColabUrl: https://colab.research.google.com/github/impresso/impresso-py/blob/main/examples/notebooks/search.ipynb --- {/* cell:0 cell_type:code */} - ```python from impresso import connect @@ -20,49 +18,39 @@ impresso = connect() ``` {/* cell:1 cell_type:markdown */} - ## Term Find all items containing "impresso" keyword. {/* cell:2 cell_type:code */} - ```python impresso.search.find(q="impresso") ``` {/* cell:3 cell_type:markdown */} - ## With text content only Limit to articles that have text. {/* cell:4 cell_type:code */} - ```python impresso.search.find(q="impresso", with_text_contents=True) ``` {/* cell:5 cell_type:markdown */} - ## Title - Find items that have the keyword "impresso" in their title. {/* cell:6 cell_type:code */} - ```python impresso.search.find(title="impresso") ``` {/* cell:7 cell_type:markdown */} - ### Complex term requests - Find items that have both terms. {/* cell:8 cell_type:code */} - ```python from impresso import AND @@ -75,7 +63,6 @@ Find items that have either one term or the other. Here we find all articles that contain either "homme" or "femme" in the title. 
{/* cell:10 cell_type:code */} - ```python from impresso import OR @@ -83,13 +70,11 @@ impresso.search.find(title=OR("homme", "femme")) ``` {/* cell:11 cell_type:markdown */} - -## Inverted search (everything excluding term A **OR** term B). +## Inverted search (everything excluding term A __OR__ term B). We want to find all articles with the word "luddite" in the title that do not mention neither "textile" nor "machine" {/* cell:12 cell_type:code */} - ```python from impresso import OR @@ -97,17 +82,15 @@ impresso.search.find(title="luddite", q=~OR("textile", "machine")) ``` {/* cell:13 cell_type:markdown */} - ### Complex combintation of terms The following cell searches all articles with all of the the following condition: -- mentioning "hitler" and "stalin" -- also mentioning one of: "molotow" or "ribbentrop" -- and not mentioning "churchill" +* mentioning "hitler" and "stalin" +* also mentioning one of: "molotow" or "ribbentrop" +* and not mentioning "churchill" {/* cell:14 cell_type:code */} - ```python from impresso import AND, OR @@ -115,25 +98,21 @@ impresso.search.find(q=AND("hitler", "stalin") & OR("molotow", "ribbentrop") & ~ ``` {/* cell:15 cell_type:markdown */} - ## Front page Find articles published on the front page only {/* cell:16 cell_type:code */} - ```python impresso.search.find(q="impresso", front_page=True) ``` {/* cell:17 cell_type:markdown */} - ## Entity ID Search by entity ID {/* cell:18 cell_type:code */} - ```python impresso.search.find(entity_id="aida-0001-54-Switzerland") ``` @@ -142,7 +121,6 @@ impresso.search.find(entity_id="aida-0001-54-Switzerland") Find all articles that mention Switzerland and Albert Einstein. {/* cell:20 cell_type:code */} - ```python impresso.search.find(entity_id=AND("aida-0001-54-Switzerland", "aida-0001-50-Albert_Einstein")) ``` @@ -151,31 +129,26 @@ impresso.search.find(entity_id=AND("aida-0001-54-Switzerland", "aida-0001-50-Alb Find all articles that mention either Switzerland or Albert Einstein. 
{/* cell:22 cell_type:code */} - ```python impresso.search.find(entity_id=OR("aida-0001-54-Switzerland", "aida-0001-50-Albert_Einstein")) ``` {/* cell:23 cell_type:markdown */} - ## Newspaper Limit search to two newspapers {/* cell:24 cell_type:code */} - ```python impresso.search.find(q="independence", newspaper_id=OR("EXP", "GDL")) ``` {/* cell:25 cell_type:markdown */} - ## Date range Items published between dates {/* cell:26 cell_type:code */} - ```python from impresso import DateRange @@ -186,7 +159,6 @@ impresso.search.find(q="independence", date_range=DateRange("1921-05-21", "2001- Articles published at any time excluding the range (not the `~` that negates the range). {/* cell:28 cell_type:code */} - ```python from impresso import DateRange @@ -194,13 +166,11 @@ impresso.search.find(q="independence", date_range=~DateRange("1921-05-21", "2001 ``` {/* cell:29 cell_type:markdown */} - ## Language Search for the term "banana" in English or Italian. {/* cell:30 cell_type:code */} - ```python impresso.search.find(q="banana", language=OR("it", "en")) ``` @@ -209,281 +179,235 @@ impresso.search.find(q="banana", language=OR("it", "en")) And now search for the word "banana" in any language _except_ English or Italian. {/* cell:32 cell_type:code */} - ```python impresso.search.find(q="banana", language=~OR("it", "en")) ``` {/* cell:33 cell_type:markdown */} - ## Entity mention Find articles that mention two entities. {/* cell:34 cell_type:code */} - ```python impresso.search.find(mention=AND("Charlie Chaplin", "Switzerland")) ``` {/* cell:35 cell_type:markdown */} - ## Topic Find articles that match either of the two topics. {/* cell:36 cell_type:code */} - ```python impresso.search.find(topic_id=OR("tm-fr-all-v2.0_tp07_fr", "tm-fr-all-v2.0_tp48_fr")) ``` {/* cell:37 cell_type:markdown */} - ## Collection Find all articles in a collection. 
{/* cell:38 cell_type:code */} - ```python impresso.search.find(collection_id="REPLACEME") ``` {/* cell:39 cell_type:markdown */} - ## Country Find all articles published in either of the two specified countries. {/* cell:40 cell_type:code */} - ```python impresso.search.find(q="Schengen", country=OR("FR", "CH")) ``` {/* cell:41 cell_type:markdown */} - ## Access rights Limit search to articles with specific access rights. {/* cell:42 cell_type:code */} - ```python impresso.search.find(q="Schengen", access_rights="Closed") ``` {/* cell:43 cell_type:markdown */} - ## Partner Limit search to articles provided by a specific partner of the Impresso project. {/* cell:44 cell_type:code */} - ```python impresso.search.find(q="Schengen", partner_id="Migros") ``` {/* cell:45 cell_type:markdown */} - ## Text reuse cluster Find all articles that are part of a specific text reuse cluster. {/* cell:46 cell_type:code */} - ```python from impresso import OR impresso.search.find(text_reuse_cluster_id=OR("tr-nobp-all-v01-c29")) ``` {/* cell:47 cell_type:markdown */} - # Facets -Facets are a way to get a summary of the search results from the perspective of a specific field. In a facet search result the field values are grouped together and the number of items in each group is displayed. +Facets are a way to get a summary of the search results from the perspective of a specific field. In a facet search result the field values are grouped together and the number of items in each group is displayed. Facet search method has the same attributes as the search method. {/* cell:48 cell_type:markdown */} - ## Date range Get the number of articles that mention "Impresso", published on ever particular date. {/* cell:49 cell_type:code */} - ```python impresso.search.facet("daterange", q="impresso") ``` {/* cell:50 cell_type:markdown */} - ## Year Get the number of articles that mention "impresso", published during every particular year. 
{/* cell:51 cell_type:code */} - ```python impresso.search.facet("year", q="impresso") ``` {/* cell:52 cell_type:markdown */} - ## Content length Get the number of articles that mention "impresso", grouped by content length. {/* cell:53 cell_type:code */} - ```python impresso.search.facet("contentLength", q="impresso") ``` {/* cell:54 cell_type:markdown */} - ## Month Get the number of articles that mention "impresso", published during every particular month. {/* cell:55 cell_type:code */} - ```python impresso.search.facet("month", q="impresso") ``` {/* cell:56 cell_type:markdown */} - ## Country Get the number of articles that mention "impresso", grouped by country they were published in. {/* cell:57 cell_type:code */} - ```python impresso.search.facet("country", q="impresso") ``` {/* cell:58 cell_type:markdown */} - ## Type Get the number of items that mention "impresso", grouped by type of item. {/* cell:59 cell_type:code */} - ```python impresso.search.facet("type") ``` {/* cell:60 cell_type:markdown */} - ## Topic Find topics that the articles mentioning "impresso" are related to. {/* cell:61 cell_type:code */} - ```python impresso.search.facet("topic", q="pomme") ``` {/* cell:62 cell_type:markdown */} - ## Collection Find collections the articles mentioning "pomme" are part of. {/* cell:63 cell_type:code */} - ```python impresso.search.facet("collection", q="pomme") ``` {/* cell:64 cell_type:markdown */} - ## Newspaper Find newspapers that the articles mentioning "Schengen" were published in. {/* cell:65 cell_type:code */} - ```python impresso.search.facet("newspaper", q="Schengen") ``` {/* cell:66 cell_type:markdown */} - ## Language Find all languages the articles mentioning "impresso" were published in. {/* cell:67 cell_type:code */} - ```python impresso.search.facet("language", q="Schengen") ``` {/* cell:68 cell_type:markdown */} - ## Person Find all persons mentioned in articles that mention "Schengen". Get only the last page. 
{/* cell:69 cell_type:code */} - ```python impresso.search.facet("person", q="Schengen", offset=7140) ``` {/* cell:70 cell_type:markdown */} - ## Location Find all locations mentioned in articles that mention "Schengen". Get only the last page. {/* cell:71 cell_type:code */} - ```python impresso.search.facet("location", q="Schengen", offset=3310) ``` {/* cell:72 cell_type:markdown */} - ## NAG Find all entities without a known type mentioned in articles that mention "homme" and "femme". {/* cell:73 cell_type:code */} - ```python from impresso import AND impresso.search.facet("nag", title=AND("homme", "femme")) ``` {/* cell:74 cell_type:markdown */} - ## Access rights Get access rights of articles mentioning "pomme". {/* cell:75 cell_type:code */} - ```python impresso.search.facet("accessRight", q="pomme") ``` {/* cell:76 cell_type:markdown */} - ## Partner Get Impresso partners that provided articles mentioning "pomme". {/* cell:77 cell_type:code */} - ```python impresso.search.facet("partner", q="pomme") ```