From 712fac6d77e3e66910c773765b520371ea7e5a13 Mon Sep 17 00:00:00 2001 From: Andrian Yarotskyi <101662826+ayarotskyi@users.noreply.github.com> Date: Sun, 30 Jun 2024 18:53:11 +0000 Subject: [PATCH] temporarily got rid of workoshop-on-open-web-search --- .../tira-retrieval-system.ipynb | 66 +++++++++++-------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/baseline-retrieval-system/tira-retrieval-system.ipynb b/baseline-retrieval-system/tira-retrieval-system.ipynb index e227dbe..3a7bca0 100644 --- a/baseline-retrieval-system/tira-retrieval-system.ipynb +++ b/baseline-retrieval-system/tira-retrieval-system.ipynb @@ -12,44 +12,44 @@ "Requirement already satisfied: tira in /usr/local/lib/python3.10/dist-packages (0.0.130)\n", "Requirement already satisfied: ir-datasets in /usr/local/lib/python3.10/dist-packages (0.5.5)\n", "Requirement already satisfied: python-terrier in /usr/local/lib/python3.10/dist-packages (0.10.0)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n", "Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n", - "Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from tira) (4.66.1)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n", - "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", + "Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n", "Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.1.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n", + "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n", - "Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n", - "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n", + "Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n", + "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n", "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.6)\n", - "Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n", + "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", + "Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n", + "Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n", "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.3)\n", + "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n", "Requirement already satisfied: lz4>=3.1.10 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.3.2)\n", - "Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n", + "Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n", "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.3)\n", - "Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n", + "Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n", "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.5)\n", - "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", - "Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n", - "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n", - "Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n", - "Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n", - "Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n", "Requirement already satisfied: nptyping==1.4.4 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.4.4)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", "Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.0)\n", "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.7)\n", - "Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n", + "Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", + "Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", "Requirement already satisfied: chest in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.2.3)\n", - "Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n", "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", - "Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n", + "Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n", + "Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", + "Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n", + "Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n", "Requirement already satisfied: typish>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from nptyping==1.4.4->python-terrier) (1.9.3)\n", "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n", "Requirement already satisfied: cwl-eval>=1.0.10 in /usr/local/lib/python3.10/dist-packages (from ir-measures>=0.3.1->python-terrier) (1.0.12)\n", @@ -58,9 +58,9 @@ "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.16.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.3)\n", "Requirement already satisfied: multiset<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from matchpy->python-terrier) (2.1.1)\n", - "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3.post1)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.2.0)\n", "Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.4)\n", "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.2->statsmodels->python-terrier) (1.16.0)\n", @@ -136,16 +136,28 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'pyterrier' has no attribute 'transform_queries'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 6\u001b[0m\n\u001b[1;32m 3\u001b[0m queries \u001b[38;5;241m=\u001b[39m pt\u001b[38;5;241m.\u001b[39mio\u001b[38;5;241m.\u001b[39mread_topics(ir_datasets\u001b[38;5;241m.\u001b[39mtopics_file(dataset), \u001b[38;5;28mformat\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtrecxml\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m# llm expansions with gpt\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# gpt_cot = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot', dataset, prefix='llm_expansion_')\u001b[39;00m\n\u001b[0;32m----> 6\u001b[0m gpt_sq_fs \u001b[38;5;241m=\u001b[39m \u001b[43mpt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtransform_queries\u001b[49m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mworkshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs\u001b[39m\u001b[38;5;124m'\u001b[39m, dataset, prefix\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mllm_expansion_\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 7\u001b[0m gpt_sq_zs \u001b[38;5;241m=\u001b[39m tira\u001b[38;5;241m.\u001b[39mpt\u001b[38;5;241m.\u001b[39mtransform_queries(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs\u001b[39m\u001b[38;5;124m'\u001b[39m, dataset, prefix\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mllm_expansion_\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 9\u001b[0m \u001b[38;5;66;03m# llm expansions with llama\u001b[39;00m\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'pyterrier' has no attribute 'transform_queries'" + ] + } + ], "source": [ "# Load the expansions\n", "dataset_tira = ir_datasets.load(dataset)\n", "queries = pt.io.read_topics(ir_datasets.topics_file(dataset), format='trecxml')\n", "# llm expansions with gpt\n", "# gpt_cot = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-cot', dataset, prefix='llm_expansion_')\n", - "gpt_sq_fs = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs', dataset, prefix='llm_expansion_')\n", + "# gpt_sq_fs = tira.pt.transform_queries('workshop-on-open-web-search/tu-dresden-03/qe-gpt3.5-sq-fs', dataset, prefix='llm_expansion_')\n", "gpt_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-gpt3.5-sq-zs', dataset, prefix='llm_expansion_')\n", "\n", "# llm expansions with llama\n", @@ -156,7 +168,7 @@ "# llm expansions with flan-ul2\n", "# flan_cot = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-cot', dataset, prefix='llm_expansion_')\n", "flan_sq_fs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-fs', dataset, prefix='llm_expansion_')\n", - "# flan_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs', dataset, prefix='llm_expansion_')" + "# flan_sq_zs = tira.pt.transform_queries('ir-benchmarks/tu-dresden-03/qe-flan-ul2-sq-zs', dataset, prefix='llm_expansion_')\n" ] }, { @@ -183,7 +195,7 @@ "pt_expand_query = pt.apply.query(expand_query)\n", "\n", "\n", - "pipeline_gpt_sq_fs = (gpt_sq_fs >> pt_expand_query) >> bm25\n", + "pipeline_gpt_sq_fs = bm25\n", "pipeline_gpt_sq_zs = (gpt_sq_zs >> pt_expand_query) >> bm25\n", "\n", "pipeline_llama_cot = (llama_cot >> pt_expand_query) >> bm25\n",