From 02412a4507b8461c65531e8c927fc8be0bed82b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Tue, 20 Feb 2024 22:15:47 +0100 Subject: [PATCH] Add files via upload --- .../tutorial-entity-linking-in-progress.ipynb | 1027 +++++++++++++++++ 1 file changed, 1027 insertions(+) create mode 100644 tutorials/tutorial-entity-linking-in-progress.ipynb diff --git a/tutorials/tutorial-entity-linking-in-progress.ipynb b/tutorials/tutorial-entity-linking-in-progress.ipynb new file mode 100644 index 0000000..633b482 --- /dev/null +++ b/tutorials/tutorial-entity-linking-in-progress.ipynb @@ -0,0 +1,1027 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# A work-in-progress notebook for entity linking\n", + "\n", + "(The submission is still in progress: it looks like we have to lowercase all queries before linking the entities. We are currently discussing this with Marcel.)" + ], + "metadata": { + "id": "w8g9eAcFXPPh" + } + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GCPbVYynSBnZ", + "outputId": "4fd50167-8d5e-4064-d750-120902515118" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting python-terrier\n", + " Downloading python-terrier-0.10.0.tar.gz (107 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/107.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━\u001b[0m \u001b[32m102.4/107.6 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.6/107.6 
kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting tira\n", + " Downloading tira-0.0.103-py3-none-any.whl (46 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.4/46.4 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting ir-datasets\n", + " Downloading ir_datasets-0.5.6-py3-none-any.whl (335 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m335.2/335.2 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.25.2)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.5.3)\n", + "Collecting wget (from python-terrier)\n", + " Downloading wget-3.2.zip (10 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from python-terrier) (4.66.2)\n", + "Collecting pyjnius>=1.4.2 (from python-terrier)\n", + " Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting matchpy (from python-terrier)\n", + " Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.6/69.6 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.2)\n", + "Collecting deprecated (from python-terrier)\n", + " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", + "Collecting chest (from python-terrier)\n", + " Downloading chest-0.2.3.tar.gz (9.6 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from python-terrier) (2.31.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", + "Collecting nptyping==1.4.4 (from python-terrier)\n", + " Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)\n", + "Requirement already satisfied: more_itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.3)\n", + "Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.1)\n", + "Collecting ir_measures>=0.3.1 (from python-terrier)\n", + " Downloading ir_measures-0.3.3.tar.gz (48 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.8/48.8 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting dill (from python-terrier)\n", + " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pytrec_eval_terrier>=0.5.3 (from python-terrier)\n", + " Downloading pytrec_eval_terrier-0.5.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.4/287.4 kB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typish>=1.7.0 (from nptyping==1.4.4->python-terrier)\n", + " Downloading typish-1.9.3-py3-none-any.whl (45 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting docker==6.*,>=6.0.0 (from tira)\n", + " Downloading docker-6.1.3-py3-none-any.whl (148 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m148.1/148.1 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.0.7)\n", + "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (3.6)\n", + "Requirement already 
satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (2024.2.2)\n", + "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.3)\n", + "Collecting inscriptis>=2.2.0 (from ir-datasets)\n", + " Downloading inscriptis-2.4.0.1-py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.4)\n", + "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", + "Collecting trec-car-tools>=2.5.4 (from ir-datasets)\n", + " Downloading trec_car_tools-2.6-py3-none-any.whl (8.4 kB)\n", + "Collecting lz4>=3.1.10 (from ir-datasets)\n", + " Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting warc3-wet>=0.2.3 (from ir-datasets)\n", + " Downloading warc3_wet-0.2.3-py3-none-any.whl (13 kB)\n", + "Collecting warc3-wet-clueweb09>=0.2.5 (from ir-datasets)\n", + " Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting zlib-state>=0.1.3 (from ir-datasets)\n", + " Downloading zlib-state-0.1.6.tar.gz (9.5 kB)\n", + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting ijson>=3.1.3 (from ir-datasets)\n", + " Downloading ijson-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (111 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.8/111.8 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pyautocorpus>=0.1.1 (from ir-datasets)\n", + " Downloading pyautocorpus-0.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (379 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m379.9/379.9 kB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting unlzw3>=0.2.1 (from ir-datasets)\n", + " Downloading unlzw3-0.2.2-py3-none-any.whl (6.1 kB)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n", + "Collecting cwl-eval>=1.0.10 (from ir_measures>=0.3.1->python-terrier)\n", + " Downloading cwl-eval-1.0.12.tar.gz (31 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting cbor>=1.0.0 (from trec-car-tools>=2.5.4->ir-datasets)\n", + " Downloading cbor-1.0.0.tar.gz (20 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting heapdict (from chest->python-terrier)\n", + " Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.14.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.5)\n", + "Collecting multiset<3.0,>=2.0 (from matchpy->python-terrier)\n", + " Downloading multiset-2.1.1-py2.py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->python-terrier) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->python-terrier) (2023.4)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.3.0)\n", + "Requirement already satisfied: patsy>=0.5.4 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.6)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.4->statsmodels->python-terrier) (1.16.0)\n", + "Building wheels for collected packages: python-terrier, ir_measures, warc3-wet-clueweb09, zlib-state, chest, wget, cbor, cwl-eval\n", + " Building wheel for python-terrier (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for python-terrier: filename=python_terrier-0.10.0-py3-none-any.whl size=115532 sha256=50793be4120fd471c5b97dd411e04fecaca0bd6bc445b7077781da0951aaa10d\n", + " Stored in directory: /root/.cache/pip/wheels/79/7c/8f/679a982895c53af35178eceda648a4bc9a9af6af5542e31a0e\n", + " Building wheel for ir_measures (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for ir_measures: filename=ir_measures-0.3.3-py3-none-any.whl size=61182 sha256=fc2159e8a5a993fa86e25d38279d48605b80403802fedb08ca523007f1dea5cf\n", + " Stored in directory: /root/.cache/pip/wheels/9f/0e/22/718279f23fef1673a4c5e433881c25080a6afaa147e007183e\n", + " Building wheel for warc3-wet-clueweb09 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for warc3-wet-clueweb09: filename=warc3_wet_clueweb09-0.2.5-py3-none-any.whl size=18919 sha256=ca6cd3dab107bc599f7640bb45bab482193c185159e30978ca1ed158a6aa22c2\n", + " Stored in directory: /root/.cache/pip/wheels/1a/d7/91/7ffb991df87e62355d945745035470ba2616aa3d83a250b5f9\n", + " Building wheel for zlib-state (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for zlib-state: filename=zlib_state-0.1.6-cp310-cp310-linux_x86_64.whl size=21163 sha256=4a2c2c56a0ff4226645e6b46f778a32c59eae6e7483d541f71fe67bb0d605ad8\n", + " Stored in directory: /root/.cache/pip/wheels/32/72/7e/aff80f26e926b6e1fb08dfb52aba03c0e058f5e2258deb50a9\n", + " Building wheel for chest (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for chest: filename=chest-0.2.3-py3-none-any.whl size=7612 sha256=72b291ba41b17157da60c20e1910e9386b01d5bb710084bcf12d65f3779a98c3\n", + " Stored in directory: /root/.cache/pip/wheels/88/cf/99/4773b31f855f9ecedc32a0ae400f7a4a3001b37c439b6d1a73\n", + " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=af9e11fe4e7e1706ceb4cd568cc2387ca8b98f188a1293ad3f7b9e2f86030149\n", + " Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769\n", + " Building wheel for cbor (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for cbor: filename=cbor-1.0.0-cp310-cp310-linux_x86_64.whl size=53431 sha256=56d67264a9278d7fa7ec8ec4172bf07bea41fa4f2c98a58550557c1844818330\n", + " Stored in directory: /root/.cache/pip/wheels/85/df/c9/b39e40eccaf76dbd218556639a6dc81562226f4c6a64902c85\n", + " Building wheel for cwl-eval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for cwl-eval: filename=cwl_eval-1.0.12-py3-none-any.whl size=38068 sha256=d38c3a0712cb7d9108554b67a17b806d5765cfbb79e4e44e4ebde6dbd9d4c8af\n", + " Stored in directory: /root/.cache/pip/wheels/3d/c1/94/94a3e5379b1aa8fb7c7f1ad1956305d5edc98ef745b6067d87\n", + "Successfully built python-terrier ir_measures warc3-wet-clueweb09 zlib-state chest wget cbor cwl-eval\n", + "Installing collected packages: wget, warc3-wet-clueweb09, warc3-wet, typish, pyjnius, multiset, ijson, heapdict, cbor, zlib-state, unlzw3, trec-car-tools, pytrec_eval_terrier, pyautocorpus, nptyping, matchpy, lz4, dill, deprecated, cwl-eval, chest, ir_measures, inscriptis, docker, tira, ir-datasets, python-terrier\n", + "Successfully installed cbor-1.0.0 chest-0.2.3 cwl-eval-1.0.12 deprecated-1.2.14 dill-0.3.8 docker-6.1.3 heapdict-1.0.1 ijson-3.2.3 inscriptis-2.4.0.1 ir-datasets-0.5.6 ir_measures-0.3.3 lz4-4.3.3 matchpy-0.5.5 multiset-2.1.1 nptyping-1.4.4 pyautocorpus-0.1.12 pyjnius-1.6.1 python-terrier-0.10.0 pytrec_eval_terrier-0.5.6 tira-0.0.103 trec-car-tools-2.6 typish-1.9.3 unlzw3-0.2.2 warc3-wet-0.2.3 warc3-wet-clueweb09-0.2.5 wget-3.2 zlib-state-0.1.6\n" + ] + } + ], + "source": [ + "# Only needed in Colab (Codespaces has everything preinstalled); pinned to the versions recorded in this cell's saved install log.\n", + "%pip install python-terrier==0.10.0 tira==0.0.103 ir-datasets==0.5.6" + ] + }, + { + "cell_type": "code", + "source": [ + "import pyterrier as pt\n", + "\n", + "if not pt.started():\n", + " pt.init()\n", + "\n", + "from tira.rest_api_client import Client\n", + "tira = Client()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"kP6nwrlJSSUw", + "outputId": "14b6a45d-5b30-4a74-eb1a-ef60a47294e6" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "dataset = pt.get_dataset(\"irds:disks45/nocr/trec-robust-2004\")\n", + "topics = dataset.get_topics(variant='title')\ntopics.head(3)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 143 + }, + "id": "n8keQrBMVUR_", + "outputId": "c987dcbb-84fa-4fed-b78c-0ee9a911d302" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " qid query\n", + "0 301 international organized crime\n", + "1 302 poliomyelitis and post polio\n", + "2 303 hubble telescope achievements" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidquery
0301international organized crime
1302poliomyelitis and post polio
2303hubble telescope achievements
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "topics", + "summary": "{\n \"name\": \"topics\",\n \"rows\": 250,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"443\",\n \"307\",\n \"398\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 249,\n \"samples\": [\n \"inventions scientific discoveries\",\n \"new hydroelectric projects\",\n \"dismantling europe s arsenal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_entity_linking = tira.pt.transform_queries('ir-benchmarks/marcel-gohsen/courtly-vision', dataset)\n", + "query_entity_linking(topics)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "tqOFa0PJVcvW", + "outputId": "83838d44-ed55-4bf4-eeff-d2244b639652" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " qid query \\\n", + "0 301 International Organized Crime \n", + "1 302 Poliomyelitis and Post-Polio \n", + "2 303 Hubble Telescope Achievements \n", + "3 304 Endangered Species (Mammals) \n", + "4 305 Most Dangerous Vehicles \n", + ".. ... ... \n", + "245 696 safety plastic surgery \n", + "246 697 air traffic controller \n", + "247 698 literacy rates Africa \n", + "248 699 term limits \n", + "249 700 gasoline tax U.S. \n", + "\n", + " original_query \\\n", + "0 {'query_id': '301', 'title': 'International Or... \n", + "1 {'query_id': '302', 'title': 'Poliomyelitis an... \n", + "2 {'query_id': '303', 'title': 'Hubble Telescope... \n", + "3 {'query_id': '304', 'title': 'Endangered Speci... 
\n", + "4 {'query_id': '305', 'title': 'Most Dangerous V... \n", + ".. ... \n", + "245 {'query_id': '696', 'title': 'safety plastic s... \n", + "246 {'query_id': '697', 'title': 'air traffic cont... \n", + "247 {'query_id': '698', 'title': 'literacy rates A... \n", + "248 {'query_id': '699', 'title': 'term limits', 'd... \n", + "249 {'query_id': '700', 'title': 'gasoline tax U.S... \n", + "\n", + " entities \n", + "0 [] \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + ".. ... \n", + "245 [{'begin': 7, 'end': 22, 'mention': 'plastic s... \n", + "246 [{'begin': 0, 'end': 22, 'mention': 'air traff... \n", + "247 [{'begin': 0, 'end': 8, 'mention': 'literacy',... \n", + "248 [{'begin': 0, 'end': 4, 'mention': 'term', 'ur... \n", + "249 [{'begin': 0, 'end': 8, 'mention': 'gasoline',... \n", + "\n", + "[250 rows x 4 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidqueryoriginal_queryentities
0301International Organized Crime{'query_id': '301', 'title': 'International Or...[]
1302Poliomyelitis and Post-Polio{'query_id': '302', 'title': 'Poliomyelitis an...[]
2303Hubble Telescope Achievements{'query_id': '303', 'title': 'Hubble Telescope...[]
3304Endangered Species (Mammals){'query_id': '304', 'title': 'Endangered Speci...[]
4305Most Dangerous Vehicles{'query_id': '305', 'title': 'Most Dangerous V...[]
...............
245696safety plastic surgery{'query_id': '696', 'title': 'safety plastic s...[{'begin': 7, 'end': 22, 'mention': 'plastic s...
246697air traffic controller{'query_id': '697', 'title': 'air traffic cont...[{'begin': 0, 'end': 22, 'mention': 'air traff...
247698literacy rates Africa{'query_id': '698', 'title': 'literacy rates A...[{'begin': 0, 'end': 8, 'mention': 'literacy',...
248699term limits{'query_id': '699', 'title': 'term limits', 'd...[{'begin': 0, 'end': 4, 'mention': 'term', 'ur...
249700gasoline tax U.S.{'query_id': '700', 'title': 'gasoline tax U.S...[{'begin': 0, 'end': 8, 'mention': 'gasoline',...
\n", + "

250 rows × 4 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "topics", + "summary": "{\n \"name\": \"topics\",\n \"rows\": 250,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"443\",\n \"307\",\n \"398\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"U.S., investment, Africa\",\n \"New Hydroelectric Projects\",\n \"dismantling Europe's arsenal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"original_query\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entities\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 6 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_entity_linking(topics).iloc[247].to_dict()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "08x9c7tjWTAe", + "outputId": "10092391-6a28-4ecd-fe76-c9120e15af80" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'qid': '698',\n", + " 'query': 'literacy rates Africa',\n", + " 'original_query': {'query_id': '698',\n", + " 'title': 'literacy rates Africa',\n", + " 'description': 'What are literacy rates in African countries?',\n", + " 'narrative': 'A relevant document will contain information about the\\nliteracy rate in an African country.\\nGeneral education levels that do not specifically include literacy rates\\nare not relevant.'},\n", + " 'entities': [{'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'literacy',\n", + " 'url': 'https://en.wikipedia.org/wiki/Literacy',\n", + " 'score': 
0.710061993323795},\n", + " {'begin': 0,\n", + " 'end': 14,\n", + " 'mention': 'literacy rates',\n", + " 'url': 'https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate',\n", + " 'score': 0.09090909090909001},\n", + " {'begin': 9,\n", + " 'end': 14,\n", + " 'mention': 'rates',\n", + " 'url': 'https://en.wikipedia.org/wiki/Rates_(Póvoa_de_Varzim)',\n", + " 'score': 0.012711864406779001}]}" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_entity_linking(topics).iloc[249].to_dict()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KubJXyAsV8cX", + "outputId": "03ddd508-d0e2-4019-a0c8-71bad902f694" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'qid': '700',\n", + " 'query': 'gasoline tax U.S.',\n", + " 'original_query': {'query_id': '700',\n", + " 'title': 'gasoline tax U.S.',\n", + " 'description': 'What are the arguments for and against an increase in gasoline\\ntaxes in the U.S.?',\n", + " 'narrative': 'Relevant documents present reasons for or against raising gasoline taxes\\nin the U.S. 
Documents discussing rises or decreases in the price of\\ngasoline are not relevant.'},\n", + " 'entities': [{'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline',\n", + " 'score': 0.838221076377699},\n", + " {'begin': 0,\n", + " 'end': 12,\n", + " 'mention': 'gasoline tax',\n", + " 'url': 'https://en.wikipedia.org/wiki/Fuel_tax',\n", + " 'score': 0.106382978723404},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(Theory_of_a_Deadman_album)',\n", + " 'score': 0.014502094747019},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(Seether_song)',\n", + " 'score': 0.004834031582339001},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(film)',\n", + " 'score': 0.004834031582339001},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(Halsey_song)',\n", + " 'score': 0.003222687721559},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(band)',\n", + " 'score': 0.002255881405091},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(1913_song)',\n", + " 'score': 0.0006445375443119562},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(the_Hard_Lessons_album)',\n", + " 'score': 0.0006445375443119562},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 'https://en.wikipedia.org/wiki/Gasoline_(magazine)',\n", + " 'score': 0.0003222687721559781},\n", + " {'begin': 0,\n", + " 'end': 8,\n", + " 'mention': 'gasoline',\n", + " 'url': 
'https://en.wikipedia.org/wiki/Gasoline,_Texas',\n", + " 'score': 0.0003222687721559781}]}" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + } + ] +} \ No newline at end of file